Skip to main content

percent_encoding/
ascii_set.rs

1// Copyright 2013-2016 The rust-url developers.
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9use core::{mem, ops};
10
/// A set of bytes (equivalently, characters) drawn from the ASCII range.
///
/// Sets of this type are passed to [`percent_encode`] and [`utf8_percent_encode`].
/// They are similar to the
/// [percent-encode sets](https://url.spec.whatwg.org/#percent-encoded-bytes)
/// of the URL Standard.
///
/// New sets are defined by calling the `add` method on an existing set. For example:
///
/// ```
/// use percent_encoding::{AsciiSet, CONTROLS};
///
/// /// https://url.spec.whatwg.org/#fragment-percent-encode-set
/// const FRAGMENT: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>').add(b'`');
/// ```
///
/// [`percent_encode`]: crate::percent_encode
/// [`utf8_percent_encode`]: crate::utf8_percent_encode
#[derive(Debug, PartialEq, Eq)]
pub struct AsciiSet {
    // One bit per ASCII byte: byte `b` is a member when bit `b % BITS_PER_CHUNK`
    // of chunk `b / BITS_PER_CHUNK` is set.
    mask: [Chunk; ASCII_RANGE_LEN / BITS_PER_CHUNK],
}

/// Integer type used for each word of the bitmask.
type Chunk = u32;

/// Number of values in the ASCII range (`0x00..=0x7F`).
const ASCII_RANGE_LEN: usize = 0x80;

/// Number of bits, and therefore of set members, stored in one [`Chunk`].
const BITS_PER_CHUNK: usize = 8 * mem::size_of::<Chunk>();

impl AsciiSet {
    /// An empty set.
    pub const EMPTY: Self = Self {
        mask: [0; ASCII_RANGE_LEN / BITS_PER_CHUNK],
    };

    /// Index into `mask` of the chunk that stores the bit for `byte`.
    const fn chunk_index(byte: u8) -> usize {
        byte as usize / BITS_PER_CHUNK
    }

    /// Single-bit value selecting the bit for `byte` inside its chunk.
    const fn bit_in_chunk(byte: u8) -> Chunk {
        1 << (byte as usize % BITS_PER_CHUNK)
    }

    /// Returns whether `byte` is a member of the set.
    ///
    /// Callers pass individual UTF-8 bytes, not code points.
    ///
    /// # Panics
    ///
    /// Panics if `byte` is not ASCII (`0x80` or above), because the chunk index then lies
    /// outside `mask`. Non-ASCII bytes must not be passed here.
    pub(crate) const fn contains(&self, byte: u8) -> bool {
        let selected = self.mask[Self::chunk_index(byte)] & Self::bit_in_chunk(byte);
        selected != 0
    }

    /// Returns whether `byte` has to be percent-encoded under this set.
    ///
    /// Every non-ASCII byte is encoded; an ASCII byte is encoded when it is a member.
    pub(crate) fn should_percent_encode(&self, byte: u8) -> bool {
        // Answer for non-ASCII bytes first so that they never reach `contains`.
        if !byte.is_ascii() {
            return true;
        }
        self.contains(byte)
    }

    /// Returns a copy of this set that additionally contains `byte`.
    ///
    /// # Panics
    ///
    /// Panics if `byte` is not ASCII (`0x80` or above).
    pub const fn add(&self, byte: u8) -> Self {
        let mut updated = self.mask;
        updated[Self::chunk_index(byte)] |= Self::bit_in_chunk(byte);
        Self { mask: updated }
    }

    /// Returns a copy of this set that does not contain `byte`.
    ///
    /// # Panics
    ///
    /// Panics if `byte` is not ASCII (`0x80` or above).
    pub const fn remove(&self, byte: u8) -> Self {
        let mut updated = self.mask;
        updated[Self::chunk_index(byte)] &= !Self::bit_in_chunk(byte);
        Self { mask: updated }
    }

    /// Return the union of two sets.
    pub const fn union(&self, other: Self) -> Self {
        // The patterns name all four chunks, so a different chunk count fails to compile
        // instead of silently ignoring part of the mask.
        let [a0, a1, a2, a3] = self.mask;
        let [b0, b1, b2, b3] = other.mask;
        Self {
            mask: [a0 | b0, a1 | b1, a2 | b2, a3 | b3],
        }
    }

    /// Return the negation of the set.
    pub const fn complement(&self) -> Self {
        let [c0, c1, c2, c3] = self.mask;
        Self {
            mask: [!c0, !c1, !c2, !c3],
        }
    }
}
93
94impl ops::Add for AsciiSet {
95    type Output = Self;
96
97    fn add(self, other: Self) -> Self {
98        self.union(other)
99    }
100}
101
102impl ops::Not for AsciiSet {
103    type Output = Self;
104
105    fn not(self) -> Self {
106        self.complement()
107    }
108}
109
110/// The set of 0x00 to 0x1F (C0 controls), and 0x7F (DEL).
111///
112/// Note that this includes the newline and tab characters, but not the space 0x20.
113///
114/// <https://url.spec.whatwg.org/#c0-control-percent-encode-set>
115pub const CONTROLS: &AsciiSet = &AsciiSet {
116    mask: [
117        !0_u32, // C0: 0x00 to 0x1F (32 bits set)
118        0,
119        0,
120        1 << (0x7F_u32 % 32), // DEL: 0x7F (one bit set)
121    ],
122};
123
/// Compile-time assertions over `const` boolean expressions.
///
/// Each condition is cast to `usize` (`true` is 1, `false` is 0) and used as the length of a
/// `[u8; N]` that is named as the source type of a `transmute` to `u8`. The two types only
/// have the same size when `N` is 1, so a false condition is rejected at compile time.
///
/// Every condition must be followed by a comma. The expansion defines a function named
/// `_static_assert`, so the macro can be invoked at most once per scope.
macro_rules! static_assert {
    ($( $condition: expr, )+) => {
        fn _static_assert() {
            $(
                let _ = mem::transmute::<[u8; $condition as usize], u8>;
            )+
        }
    }
}
133
134static_assert! {
135    CONTROLS.contains(0x00),
136    CONTROLS.contains(0x1F),
137    !CONTROLS.contains(0x20),
138    !CONTROLS.contains(0x7E),
139    CONTROLS.contains(0x7F),
140}
141
/// Everything that is not an ASCII letter or digit.
///
/// This is probably more eager than necessary in any context.
///
/// The set is [`CONTROLS`] plus every printable ASCII character other than `0-9`, `A-Z`
/// and `a-z`.
pub const NON_ALPHANUMERIC: &AsciiSet = &CONTROLS
    // 0x20..=0x2F: space and the punctuation that precedes the digits.
    .add(b' ')
    .add(b'!')
    .add(b'"')
    .add(b'#')
    .add(b'$')
    .add(b'%')
    .add(b'&')
    .add(b'\'')
    .add(b'(')
    .add(b')')
    .add(b'*')
    .add(b'+')
    .add(b',')
    .add(b'-')
    .add(b'.')
    .add(b'/')
    // 0x3A..=0x40: punctuation between the digits and the uppercase letters.
    .add(b':')
    .add(b';')
    .add(b'<')
    .add(b'=')
    .add(b'>')
    .add(b'?')
    .add(b'@')
    // 0x5B..=0x60: punctuation between the uppercase and the lowercase letters.
    .add(b'[')
    .add(b'\\')
    .add(b']')
    .add(b'^')
    .add(b'_')
    .add(b'`')
    // 0x7B..=0x7E: punctuation after the lowercase letters.
    .add(b'{')
    .add(b'|')
    .add(b'}')
    .add(b'~');
179
#[cfg(test)]
mod tests {
    use super::*;

    /// The `+` operator yields the union of its operands.
    #[test]
    fn add_op() {
        let sum = AsciiSet::EMPTY.add(b'A') + AsciiSet::EMPTY.add(b'B');
        let both = AsciiSet::EMPTY.add(b'A').add(b'B');
        assert_eq!(sum, both);
    }

    /// The `!` operator drops the members and admits the non-members.
    #[test]
    fn not_op() {
        let inverted = !AsciiSet::EMPTY.add(b'A').add(b'B');
        assert!(!inverted.contains(b'A'));
        assert!(inverted.contains(b'C'));
    }

    /// `union` must be usable in a `const` initializer, so that sets can be assembled from
    /// smaller constant sets.
    #[test]
    fn union() {
        const ONLY_A: AsciiSet = AsciiSet::EMPTY.add(b'A');
        const ONLY_B: AsciiSet = AsciiSet::EMPTY.add(b'B');
        const MERGED: AsciiSet = ONLY_A.union(ONLY_B);
        const WANTED: AsciiSet = AsciiSet::EMPTY.add(b'A').add(b'B');
        assert_eq!(MERGED, WANTED);
    }

    /// `complement` must be usable in a `const` initializer, so that sets can be defined in
    /// terms of what they exclude.
    #[test]
    fn complement() {
        const PAIR: AsciiSet = AsciiSet::EMPTY.add(b'A').add(b'B');
        const INVERTED: AsciiSet = PAIR.complement();
        for &member in b"AB" {
            assert!(!INVERTED.contains(member));
        }
        assert!(INVERTED.contains(b'C'));
    }
}