/// Encodes `codepoint` into the front of `buf` and returns the number of
/// bytes written.
///
/// Values up to `0x10FFFF` use the ordinary 1–4 byte UTF-8 layout (surrogate
/// values are not filtered out). Larger values use an extended layout:
///
/// * byte 0: the marker `0xFF`;
/// * byte 1: `0x80 | l`, where `l` is the number of payload bytes (at most 22);
/// * bytes `2..2 + l`: the value as big-endian 6-bit groups, each stored as
///   `0x80 | group`, using the fewest groups that can hold the value.
///
/// The longest possible encoding is therefore 24 bytes.
///
/// # Panics
///
/// Panics if `buf` is shorter than the encoding. The length is checked before
/// anything is written, so `buf` is left untouched in that case.
pub fn encode_utf_8e_128_buf(buf: &mut [u8], codepoint: u128) -> usize {
    if codepoint <= 0x10FFFF {
        let cp = u32::try_from(codepoint)
            .expect("Failed to create u32; range checked");

        // Sequence length and the marker bits of the lead byte.
        let (len, lead_marker): (usize, u8) = match cp {
            0..=0x7F => (1, 0x00),
            0x80..=0x7FF => (2, 0xC0),
            0x800..=0xFFFF => (3, 0xE0),
            _ => (4, 0xF0),
        };
        // Check once, up front, so a short buffer is never partially written.
        assert!(buf.len() >= len, "Buffer too small for UTF-8 encoding");

        // The lead byte carries whatever is left above the 6-bit groups.
        buf[0] = lead_marker
            | u8::try_from(cp >> (6 * (len - 1)))
                .expect("Failed to create byte");
        // Continuation bytes carry 6 bits each, most significant group first.
        for (i, slot) in buf[1..len].iter_mut().enumerate() {
            let shift = 6 * (len - 2 - i);
            *slot = 0x80
                | u8::try_from((cp >> shift) & 0x3F)
                    .expect("Failed to create byte");
        }
        return len;
    }

    // Extended form. `codepoint > 0x10FFFF` here, so at least 21 bits are
    // significant and `l` can never be zero.
    let bits = 128
        - usize::try_from(codepoint.leading_zeros())
            .expect("Failed to create usize");
    let l = bits.div_ceil(6);
    assert!(l <= 22, "Value requires more than 132 bits?");
    assert!(buf.len() >= 2 + l, "Buffer too small for extended encoding");

    buf[0] = 0xFF;
    buf[1] = 0x80 | u8::try_from(l).expect("Failed to create byte");

    // Fill the payload from the last byte backwards so the least significant
    // group lands at the end (big-endian group order).
    let mut remaining = codepoint;
    for slot in buf[2..2 + l].iter_mut().rev() {
        *slot = 0x80
            | u8::try_from(remaining & 0x3F).expect("Failed to create byte");
        remaining >>= 6;
    }
    debug_assert!(remaining == 0);
    // Minimal length: the most significant group is never empty.
    debug_assert!((buf[2] & 0x3F) != 0);
    // 22 groups hold 132 bits; the top four bits must stay zero for a u128.
    debug_assert!(
        l < 22 || (buf[2] & 0x3C) == 0,
        "Non-zero padding bits in 22-byte encoding"
    );

    2 + l
}
102
/// Decodes one value from the front of `bytes`.
///
/// On success returns the decoded value together with the number of bytes it
/// occupied; any bytes after that are ignored.
///
/// Two layouts are recognised:
///
/// * ordinary 1–4 byte UTF-8 for values up to `0x10FFFF` (overlong forms are
///   rejected; surrogate values are not filtered out);
/// * the extended layout `0xFF`, `0x80 | l`, then `l` bytes of the form
///   `0x80 | group` holding the value as big-endian 6-bit groups, `1 <= l <= 22`.
///
/// Returns `None` when `bytes` is empty, truncated or malformed, when an
/// extended sequence is not minimal (leading group is zero), when it would
/// not fit in 128 bits, or when it encodes a value that the standard layout
/// already covers (`<= 0x10FFFF`).
pub fn decode_utf_8e_128_buf(bytes: &[u8]) -> Option<(u128, usize)> {
    let (&lead, tail) = bytes.split_first()?;

    if lead == 0xFF {
        // Extended layout: a length header followed by `count` payload bytes.
        let (&header, after_header) = tail.split_first()?;
        if (header & 0xC0) != 0x80 {
            return None;
        }
        let count = usize::from(header & 0x3F);
        if !(1..=22).contains(&count) {
            return None;
        }
        let payload = after_header.get(..count)?;

        // The most significant group must be non-zero (minimal length), and
        // with 22 groups (132 bits) only its low two bits may be set.
        let top = *payload.first()? & 0x3F;
        if top == 0 {
            return None;
        }
        if count == 22 && (top & 0x3C) != 0 {
            return None;
        }

        // Every payload byte must look like a continuation byte.
        let value = payload.iter().try_fold(0u128, |acc, &byte| {
            ((byte & 0xC0) == 0x80)
                .then(|| (acc << 6) | u128::from(byte & 0x3F))
        })?;

        // Values the standard layout can express must not use this form.
        return (value > 0x10FFFF).then_some((value, 2 + count));
    }

    // Sequence length, smallest value allowed at that length, and the mask
    // selecting the payload bits of the lead byte.
    let (len, floor, lead_mask) = match lead {
        0x00..=0x7F => return Some((u128::from(lead), 1)),
        0xC0..=0xDF => (2usize, 0x80u32, 0x1Fu8),
        0xE0..=0xEF => (3, 0x800, 0x0F),
        0xF0..=0xF7 => (4, 0x1_0000, 0x07),
        // Stray continuation bytes and unsupported lead bytes.
        _ => return None,
    };

    let continuation = tail.get(..len - 1)?;
    let scalar = continuation.iter().try_fold(
        u32::from(lead & lead_mask),
        |acc, &byte| {
            ((byte & 0xC0) == 0x80)
                .then(|| (acc << 6) | u32::from(byte & 0x3F))
        },
    )?;

    // Reject overlong encodings and anything above the Unicode range.
    (floor..=0x10FFFF)
        .contains(&scalar)
        .then_some((u128::from(scalar), len))
}
221
222pub fn encode_utf_8e_128(codepoint: u128) -> Vec<u8> {
225 let mut buf = [0u8; 24];
226 let encoded_len = encode_utf_8e_128_buf(&mut buf, codepoint);
227 buf[..encoded_len].to_vec()
228}
229
230pub fn decode_utf_8e_128(bytes: &[u8]) -> Option<(u128, usize)> {
233 if bytes.is_empty() {
234 return None;
235 }
236
237 let mut buf = [0u8; 24];
238 let used_len = bytes.len().min(24);
239 buf[..used_len].copy_from_slice(&bytes[..used_len]);
240
241 if let Some(x) = decode_utf_8e_128_buf(&buf) {
242 Some(x)
243 } else {
244 buf[0] = 0xEF;
246 buf[1] = 0xBF;
247 buf[2] = 0xBD;
248 for b in &mut buf[3..] {
249 *b = 0;
250 }
251 Some((0xFFFD, 3))
253 }
254}
255
#[cfg(test)]
mod tests {
    use super::*;

    /// Encodes `cp`, decodes the result, checks that value and length agree,
    /// and returns the encoded bytes for further inspection.
    fn round_trip(cp: u128) -> Vec<u8> {
        let mut buf = [0u8; 24];
        let n = encode_utf_8e_128_buf(&mut buf, cp);
        let (value, consumed) = decode_utf_8e_128_buf(&buf[..n]).unwrap();
        assert_eq!(value, cp);
        assert_eq!(consumed, n);
        buf[..n].to_vec()
    }

    #[crate::ctb_test]
    fn test_standard_ascii() {
        for ch in [0x00u128, 0x41, 0x7F] {
            round_trip(ch);
        }
    }

    #[crate::ctb_test]
    fn test_standard_multibyte() {
        let samples = [
            0x80u128, 0x7FF, 0x800, 0x1234, 0x20AC, 0xFFFF, 0x10000, 0x10FFFF,
        ];
        for cp in samples {
            round_trip(cp);
        }
    }

    #[crate::ctb_test]
    fn test_extended_simple() {
        // First value past the Unicode range must switch to the 0xFF form.
        let encoded = round_trip(0x10FFFFu128 + 1);
        assert!(encoded.len() >= 3);
        assert_eq!(encoded[0], 0xFF);
    }

    #[crate::ctb_test]
    fn test_extended_large() {
        let encoded = round_trip(u128::MAX);
        assert_eq!(encoded[0], 0xFF);
    }

    #[crate::ctb_test]
    fn test_malformed() {
        // Empty input, a stray continuation byte, and a bare extended marker.
        assert!(decode_utf_8e_128_buf(&[]).is_none());
        assert!(decode_utf_8e_128_buf(&[0x80]).is_none());
        assert!(decode_utf_8e_128_buf(&[0xFF]).is_none());
    }

    #[crate::ctb_test]
    fn test_overlaps_rejected() {
        // Payload byte without the continuation marker.
        assert!(decode_utf_8e_128_buf(&[0xFF, 0x81, 0xC1]).is_none());
        // Well-formed extended sequence for the value 1, which the standard
        // layout already covers.
        assert!(decode_utf_8e_128_buf(&[0xFF, 0x81, 0x81]).is_none());

        // Hand-build the extended encoding of 0x10FFFF (the encoder itself
        // never produces it) and check that the decoder refuses it.
        let val = 0x10FFFFu128;
        let bits = 128
            - usize::try_from(val.leading_zeros())
                .expect("Failed to create usize");
        let l = bits.div_ceil(6);

        let mut groups = [0u8; 22];
        let mut tmp = val;
        for slot in groups[..l].iter_mut().rev() {
            *slot = u8::try_from(tmp & 0x3F).expect("Failed to create byte");
            tmp >>= 6;
        }

        let mut bytes = vec![
            0xFF,
            0x80 | u8::try_from(l).expect("Failed to create byte"),
        ];
        bytes.extend(groups[..l].iter().map(|g| 0x80 | g));
        assert!(decode_utf_8e_128_buf(&bytes).is_none());
    }

    #[crate::ctb_test]
    fn test_encode_utf_8e_128_buf_basic() {
        let mut buf = [0u8; 24];

        let n = encode_utf_8e_128_buf(&mut buf, 0x41);
        assert_eq!(&buf[..n], &[0x41]);

        let n = encode_utf_8e_128_buf(&mut buf, 0x80);
        assert_eq!(&buf[..n], &[0xC2, 0x80]);

        let n = encode_utf_8e_128_buf(&mut buf, 0x800);
        assert_eq!(&buf[..n], &[0xE0, 0xA0, 0x80]);

        let n = encode_utf_8e_128_buf(&mut buf, 0x10000);
        assert_eq!(&buf[..n], &[0xF0, 0x90, 0x80, 0x80]);

        // 2^32 needs 33 bits, i.e. six 6-bit groups after the two-byte header.
        let n = encode_utf_8e_128_buf(&mut buf, 0x1_0000_0000);
        assert_eq!(
            &buf[..n],
            &[0xFF, 0x86, 0x84, 0x80, 0x80, 0x80, 0x80, 0x80]
        );
    }

    #[crate::ctb_test]
    fn test_decode_utf_8e_128_buf_basic() {
        assert_eq!(decode_utf_8e_128_buf(&[0x41]), Some((0x41, 1)));
        assert_eq!(decode_utf_8e_128_buf(&[0xC2, 0x80]), Some((0x80, 2)));
        assert_eq!(
            decode_utf_8e_128_buf(&[0xE0, 0xA0, 0x80]),
            Some((0x800, 3))
        );
        assert_eq!(
            decode_utf_8e_128_buf(&[0xF0, 0x90, 0x80, 0x80]),
            Some((0x10000, 4))
        );
    }

    #[crate::ctb_test]
    fn test_encode_decode_utf_8e_128() {
        let samples = [
            0x41u128,
            0x80,
            0x800,
            0x10000,
            0x10FFFF,
            0x1_0000_0000,
            u128::MAX,
        ];
        for cp in samples {
            let encoded = encode_utf_8e_128(cp);
            let decoded = decode_utf_8e_128(&encoded).unwrap();
            assert_eq!(decoded.0, cp);
        }
    }

    #[crate::ctb_test]
    fn test_decode_utf_8e_128_replacement() {
        // Undecodable input yields U+FFFD; empty input yields nothing at all.
        assert_eq!(decode_utf_8e_128(&[0xFF]), Some((0xFFFD, 3)));
        assert_eq!(decode_utf_8e_128(&[]), None);
    }
}