ctoolbox/formats/
utf_8e_128.rs

1// FIXME: Can this be simplified by leaning on Rust's native UTF-8 en/decoding?
2
/// Encodes a Unicode code point or an extended (> U+10FFFF) 128‑bit integer
/// using the UTF‑8e‑128 scheme.
///
/// Values <= `0x10_FFFF` produce standard UTF‑8 (1–4 bytes). Surrogate code
/// points (U+D800..=U+DFFF) are not rejected; they are written as ordinary
/// 3‑byte sequences.
///
/// Larger values are emitted as:
///   0:  0xFF
///   1:  10LLLLLL   (L <= 22) number of payload continuation bytes; this
///       encoder never emits L < 4 because the value exceeds 0x10FFFF
///   2+: L payload continuation bytes 10bbbbbb ... (big‑endian 6‑bit groups)
///
/// Returns the number of bytes written to the front of `buf`.
///
/// # Panics
///
/// Panics if `buf` is shorter than the encoding (up to 24 bytes). The length
/// is checked before anything is written, so `buf` is left untouched when the
/// function panics.
pub fn encode_utf_8e_128_buf(buf: &mut [u8], codepoint: u128) -> usize {
    // Standard UTF-8 path for values within the Unicode range.
    if codepoint <= 0x10FFFF {
        let cp = u32::try_from(codepoint)
            .expect("Failed to create u32; range checked");
        let len: usize = match cp {
            0..=0x7F => 1,
            0x80..=0x7FF => 2,
            0x800..=0xFFFF => 3,
            _ => 4,
        };
        // Validate up front (as the extended path does) so that a short
        // buffer is never partially overwritten before the panic.
        assert!(buf.len() >= len, "Buffer too small for standard encoding");

        // Lead byte: length prefix OR-ed with the most significant bits.
        let lead = |prefix: u8, shift: u32| -> u8 {
            prefix | u8::try_from(cp >> shift).expect("Failed to create byte")
        };
        // Continuation byte: 10xxxxxx carrying one 6-bit group.
        let cont = |shift: u32| -> u8 {
            0x80 | u8::try_from((cp >> shift) & 0x3F)
                .expect("Failed to create byte")
        };

        match len {
            1 => buf[0] = lead(0x00, 0),
            2 => {
                buf[0] = lead(0xC0, 6);
                buf[1] = cont(0);
            }
            3 => {
                buf[0] = lead(0xE0, 12);
                buf[1] = cont(6);
                buf[2] = cont(0);
            }
            _ => {
                buf[0] = lead(0xF0, 18);
                buf[1] = cont(12);
                buf[2] = cont(6);
                buf[3] = cont(0);
            }
        }
        return len;
    }

    // Extended form. codepoint > 0x10FFFF, so bits >= 21 and l >= 4.
    let bits = 128
        - usize::try_from(codepoint.leading_zeros())
            .expect("Failed to create usize");
    let l = bits.div_ceil(6); // minimal number of 6-bit groups
    assert!(l <= 22, "Value requires more than 132 bits?");
    assert!(buf.len() >= 2 + l, "Buffer too small for extended encoding");

    buf[0] = 0xFF;
    buf[1] = 0x80 | u8::try_from(l).expect("Failed to create byte"); // length continuation byte

    // Payload groups, most significant first. The largest shift is
    // 6 * 21 = 126, which is always in range for a u128.
    for (i, slot) in buf[2..2 + l].iter_mut().enumerate() {
        let shift = 6 * (l - 1 - i);
        let group = u8::try_from((codepoint >> shift) & 0x3F)
            .expect("Failed to create byte");
        *slot = 0x80 | group;
    }

    // Canonical rule: because l is minimal, the first payload group is non-zero.
    debug_assert!(buf[2] & 0x3F != 0);
    // With 22 groups (132 bits) the top 4 bits of the first group are padding
    // beyond bit 127 and must be zero.
    debug_assert!(
        l != 22 || (buf[2] & 0x3C) == 0,
        "Non-zero padding bits in 22-byte encoding"
    );

    2 + l
}
102
/// Decodes one UTF‑8 / UTF‑8e‑128 codepoint from the start of `bytes`.
///
/// On success returns `Some((value, length_consumed))`; any bytes after the
/// first encoded codepoint are ignored. Returns `None` when the input is
/// empty, truncated, or malformed.
///
/// Only canonical encodings are accepted:
/// * standard UTF‑8 must not be overlong and must not exceed U+10FFFF
///   (surrogate code points are NOT rejected);
/// * the extended form (`0xFF`, length byte, payload) must have a non-zero
///   first payload group, zero padding bits when 22 groups are used, and a
///   value above U+10FFFF.
pub fn decode_utf_8e_128_buf(bytes: &[u8]) -> Option<(u128, usize)> {
    let (&first, rest) = bytes.split_first()?;

    if first == 0xFF {
        // Extended form: 0xFF, 10LLLLLL, then L continuation bytes.
        let (&header, tail) = rest.split_first()?;
        if (header & 0xC0) != 0x80 {
            return None;
        }
        let l = usize::from(header & 0x3F);
        if !(1..=22).contains(&l) {
            return None;
        }
        // `get` yields None when fewer than `l` payload bytes are present.
        let payload = tail.get(..l)?;

        // Accumulate the big-endian 6-bit groups. When l == 22 the top four
        // bits of the first group would be shifted out of the u128; that case
        // is rejected by the padding check below, so no accepted value ever
        // loses bits.
        let mut value: u128 = 0;
        for &b in payload {
            if (b & 0xC0) != 0x80 {
                return None;
            }
            value = (value << 6) | u128::from(b & 0x3F);
        }

        let lead_group = payload[0] & 0x3F;
        // Canonical: a zero first group means a shorter encoding exists.
        if lead_group == 0 {
            return None;
        }
        // 22 groups carry 132 bits; the 4 bits above bit 127 are padding and
        // must be zero.
        if l == 22 && (lead_group & 0x3C) != 0 {
            return None;
        }
        // Must not overlap with the standard UTF-8 range.
        if value <= 0x10FFFF {
            return None;
        }
        return Some((value, 2 + l));
    }

    // Standard UTF-8: classify the lead byte into
    // (sequence length, smallest non-overlong value, lead payload mask).
    let (len, min_val, lead_mask) = match first {
        0x00..=0x7F => return Some((u128::from(first), 1)),
        0xC0..=0xDF => (2usize, 0x80u32, 0x1Fu8), // 110xxxxx
        0xE0..=0xEF => (3, 0x800, 0x0F),          // 1110xxxx
        0xF0..=0xF7 => (4, 0x1_0000, 0x07),       // 11110xxx
        // Stray continuation byte (0x80..=0xBF) or invalid lead 0xF8..=0xFE.
        _ => return None,
    };

    let continuation = bytes.get(1..len)?;
    let mut val = u32::from(first & lead_mask);
    for &b in continuation {
        if (b & 0xC0) != 0x80 {
            return None;
        }
        val = (val << 6) | u32::from(b & 0x3F);
    }

    // Reject overlong forms (below min_val) and values past U+10FFFF.
    // Surrogates (U+D800..=U+DFFF) are deliberately accepted, mirroring the
    // encoder.
    if !(min_val..=0x10FFFF).contains(&val) {
        return None;
    }

    Some((u128::from(val), len))
}
221
222/// Generalized UTF-8 encoding for u128.
223/// Returns a `Vec<u8>` containing the encoded bytes.
224pub fn encode_utf_8e_128(codepoint: u128) -> Vec<u8> {
225    let mut buf = [0u8; 24];
226    let encoded_len = encode_utf_8e_128_buf(&mut buf, codepoint);
227    buf[..encoded_len].to_vec()
228}
229
230/// Decodes one generalized UTF-8 codepoint from bytes.
231/// Returns Some((value, `length_consumed`)), or the replacement character on error.
232pub fn decode_utf_8e_128(bytes: &[u8]) -> Option<(u128, usize)> {
233    if bytes.is_empty() {
234        return None;
235    }
236
237    let mut buf = [0u8; 24];
238    let used_len = bytes.len().min(24);
239    buf[..used_len].copy_from_slice(&bytes[..used_len]);
240
241    if let Some(x) = decode_utf_8e_128_buf(&buf) {
242        Some(x)
243    } else {
244        // Overwrite buffer with replacement character [0xEF, 0xBF, 0xBD]
245        buf[0] = 0xEF;
246        buf[1] = 0xBF;
247        buf[2] = 0xBD;
248        for b in &mut buf[3..] {
249            *b = 0;
250        }
251        // Return replacement character and length
252        Some((0xFFFD, 3))
253    }
254}
255
#[cfg(test)]
mod tests {
    use super::*;

    /// Single-byte (ASCII) values round-trip through the buffer API.
    #[crate::ctb_test]
    fn test_standard_ascii() {
        let mut buf = [0u8; 24];
        for ch in [0x00u128, 0x41, 0x7F] {
            let n = encode_utf_8e_128_buf(&mut buf, ch);
            let (v, m) = decode_utf_8e_128_buf(&buf[..n]).unwrap();
            assert_eq!(v, ch);
            assert_eq!(n, m);
        }
    }

    /// Values at the boundaries of the 2-, 3- and 4-byte forms round-trip.
    #[crate::ctb_test]
    fn test_standard_multibyte() {
        let samples = [
            0x80u128, 0x7FF, 0x800, 0x1234, 0x20AC, 0xFFFF, 0x10000, 0x10FFFF,
        ];
        let mut buf = [0u8; 24];
        for cp in samples {
            let n = encode_utf_8e_128_buf(&mut buf, cp);
            let (v, m) = decode_utf_8e_128_buf(&buf[..n]).unwrap();
            assert_eq!(v, cp);
            assert_eq!(n, m);
        }
    }

    /// The first value above U+10FFFF uses the extended (0xFF-prefixed) form.
    #[crate::ctb_test]
    fn test_extended_simple() {
        let mut buf = [0u8; 24];
        let cp = 0x10FFFFu128 + 1;
        let n = encode_utf_8e_128_buf(&mut buf, cp);
        assert!(n >= 3);
        assert_eq!(buf[0], 0xFF);
        let (v, m) = decode_utf_8e_128_buf(&buf[..n]).unwrap();
        assert_eq!(v, cp);
        assert_eq!(n, m);
    }

    /// The largest representable value round-trips.
    #[crate::ctb_test]
    fn test_extended_large() {
        let mut buf = [0u8; 24];
        let cp = u128::MAX;
        let n = encode_utf_8e_128_buf(&mut buf, cp);
        assert_eq!(buf[0], 0xFF);
        let (v, m) = decode_utf_8e_128_buf(&buf[..n]).unwrap();
        assert_eq!(v, cp);
        assert_eq!(n, m);
    }

    /// Empty, stray-continuation and truncated inputs are rejected.
    #[crate::ctb_test]
    fn test_malformed() {
        assert!(decode_utf_8e_128_buf(&[]).is_none());
        assert!(decode_utf_8e_128_buf(&[0x80]).is_none()); // continuation as start
        assert!(decode_utf_8e_128_buf(&[0xFF]).is_none()); // incomplete extended
    }

    /// Values <= U+10FFFF must use the standard form; a well-formed extended
    /// encoding of such a value is rejected.
    #[crate::ctb_test]
    fn test_overlaps_rejected() {
        // Extended form with length=1 and payload group 0x01 -> value 1.
        // The payload byte must be a real continuation byte (10xxxxxx);
        // otherwise the decoder bails out on the continuation check and the
        // overlap rule is never exercised.
        let bytes = [0xFF, 0x81, 0x81];
        assert!(decode_utf_8e_128_buf(&bytes).is_none());

        // Extended encoding of 0x10FFFF, the largest value that must use the
        // standard form, built with the minimal number of 6-bit groups.
        let val = 0x10FFFFu128;
        let bits = 128
            - usize::try_from(val.leading_zeros())
                .expect("Failed to create usize");
        let l = bits.div_ceil(6);
        let mut bytes =
            vec![0xFF, 0x80 | u8::try_from(l).expect("Failed to create byte")];
        bytes.extend((0..l).rev().map(|i| {
            0x80 | u8::try_from((val >> (6 * i)) & 0x3F)
                .expect("Failed to create byte")
        }));
        assert_eq!(bytes, [0xFF, 0x84, 0x84, 0x8F, 0xBF, 0xBF]);
        assert!(decode_utf_8e_128_buf(&bytes).is_none());
    }

    /// Exact byte output for one value of each encoded length.
    #[crate::ctb_test]
    fn test_encode_utf_8e_128_buf_basic() {
        let mut buf = [0u8; 24];
        // ASCII
        let n = encode_utf_8e_128_buf(&mut buf, 0x41);
        assert_eq!(&buf[..n], &[0x41]);
        // 2-byte
        let n = encode_utf_8e_128_buf(&mut buf, 0x80);
        assert_eq!(&buf[..n], &[0xC2, 0x80]);
        // 3-byte
        let n = encode_utf_8e_128_buf(&mut buf, 0x800);
        assert_eq!(&buf[..n], &[0xE0, 0xA0, 0x80]);
        // 4-byte
        let n = encode_utf_8e_128_buf(&mut buf, 0x10000);
        assert_eq!(&buf[..n], &[0xF0, 0x90, 0x80, 0x80]);
        // Extended: 2^32 needs 33 bits -> 6 payload groups, first group 0b000100.
        let n = encode_utf_8e_128_buf(&mut buf, 0x1_0000_0000);
        assert_eq!(
            &buf[..n],
            &[0xFF, 0x86, 0x84, 0x80, 0x80, 0x80, 0x80, 0x80]
        );
    }

    /// Exact decoded value and consumed length for each standard form.
    #[crate::ctb_test]
    fn test_decode_utf_8e_128_buf_basic() {
        // ASCII
        let res = decode_utf_8e_128_buf(&[0x41]);
        assert_eq!(res, Some((0x41, 1)));
        // 2-byte
        let res = decode_utf_8e_128_buf(&[0xC2, 0x80]);
        assert_eq!(res, Some((0x80, 2)));
        // 3-byte
        let res = decode_utf_8e_128_buf(&[0xE0, 0xA0, 0x80]);
        assert_eq!(res, Some((0x800, 3)));
        // 4-byte
        let res = decode_utf_8e_128_buf(&[0xF0, 0x90, 0x80, 0x80]);
        assert_eq!(res, Some((0x10000, 4)));
    }

    /// The allocating wrappers round-trip standard and extended values.
    #[crate::ctb_test]
    fn test_encode_decode_utf_8e_128() {
        for &cp in &[
            0x41u128,
            0x80,
            0x800,
            0x10000,
            0x10FFFF,
            0x1_0000_0000,
            u128::MAX,
        ] {
            let encoded = encode_utf_8e_128(cp);
            let decoded = decode_utf_8e_128(&encoded).unwrap();
            assert_eq!(decoded.0, cp);
        }
    }

    /// Invalid input yields the replacement character, reported with the
    /// length of U+FFFD's UTF-8 encoding.
    #[crate::ctb_test]
    fn test_decode_utf_8e_128_replacement() {
        let res = decode_utf_8e_128(&[0xFF]);
        assert_eq!(res, Some((0xFFFD, 3)));
    }
}