1use crate::formats::unicode::{
14 ucs2decode, ucs2encode, unpaired_surrogates_to_scalars,
15};
16use anyhow::{Context, Result, ensure};
17
18pub fn encode_wtf8_single(int_value: u32) -> Result<Vec<u8>> {
48 ensure!(
49 int_value <= 0x10FFFF,
50 format!("WTF-8: Codepoint {int_value} out of range")
51 );
52 fn create_byte(int_value: u32, shift: u32) -> u8 {
53 u8::try_from(((int_value >> shift) & 0x3F) | 0x80)
54 .expect("Failed to create byte")
55 }
56
57 let mut symbol = Vec::new();
58 if (int_value & 0xFFFFFF80) == 0 {
59 symbol.push(u8::try_from(int_value).expect("Failed to create byte"));
61 } else {
62 if (int_value & 0xFFFFF800) == 0 {
63 symbol.push(
65 u8::try_from((int_value >> 6) & 0x1F | 0xC0)
66 .expect("Failed to create byte"),
67 );
68 } else if (int_value & 0xFFFF0000) == 0 {
69 symbol.push(
71 u8::try_from((int_value >> 12) & 0x0F | 0xE0)
72 .expect("Failed to create byte"),
73 );
74 symbol.push(create_byte(int_value, 6));
75 } else if (int_value & 0xFFE00000) == 0 {
76 symbol.push(
78 u8::try_from((int_value >> 18) & 0x07 | 0xF0)
79 .expect("Failed to create byte"),
80 );
81 symbol.push(create_byte(int_value, 12));
82 symbol.push(create_byte(int_value, 6));
83 }
84 symbol.push(
85 u8::try_from((int_value & 0x3F) | 0x80)
86 .expect("Failed to create byte"),
87 );
88 }
89 Ok(symbol)
90}
91
92pub fn encode_wtf8_from_scalars(codepoints: &[u32]) -> Result<Vec<u8>> {
98 let mut out = Vec::new();
99 let mut i = 0;
100 while i < codepoints.len() {
101 let cp = codepoints[i];
102 if (0xD800..=0xDBFF).contains(&cp) && i + 1 < codepoints.len() {
104 let next = codepoints[i + 1];
105 if (0xDC00..=0xDFFF).contains(&next) {
106 let high = cp;
108 let low = next;
109 let full = 0x10000 + (((high - 0xD800) << 10) | (low - 0xDC00));
110 out.extend(encode_wtf8_single(full)?);
111 i += 2;
112 continue;
113 }
114 }
115 out.extend(encode_wtf8_single(cp)?);
117 i += 1;
118 }
119 Ok(out)
120}
121
122pub fn encode_wtf8_from_ucs2(ucs2: &[u16]) -> Vec<u8> {
125 let codepoints = ucs2decode(ucs2);
126 encode_wtf8_from_scalars(codepoints.as_slice())
127 .expect("It should not be possible to fail here")
128}
129
130fn read_continuation_byte(input: &[u8], byte_index: &mut usize) -> Result<u8> {
131 if *byte_index >= input.len() {
132 return Err(anyhow::anyhow!("WTF-8: Invalid byte index"));
133 }
134 let continuation_byte = input[*byte_index];
135 *byte_index += 1;
136 if (continuation_byte & 0xC0) == 0x80 {
137 Ok(continuation_byte & 0x3F)
138 } else {
139 Err(anyhow::anyhow!("WTF-8: Invalid continuation byte"))
141 }
142}
143
144pub fn decode_wtf8_single(
147 byte_array_input: &[u8],
148) -> anyhow::Result<(u32, usize), anyhow::Error> {
149 let mut byte_index = 0;
150 let byte_count = byte_array_input.len();
151
152 if byte_index > byte_count {
153 return Err(anyhow::anyhow!("Invalid WTF-8 sequence"));
154 }
155 if byte_index == byte_count {
156 return Err(anyhow::anyhow!(
157 "The original WTF-8 returned false here, not sure why"
158 ));
159 }
160
161 let byte1 = byte_array_input[byte_index];
162 byte_index += 1;
163
164 if (byte1 & 0x80) == 0 {
166 return Ok((u32::from(byte1), 1));
167 }
168
169 if (byte1 & 0xE0) == 0xC0 {
171 let byte2 = read_continuation_byte(byte_array_input, &mut byte_index)?;
172 let int_value = (u32::from(byte1 & 0x1F) << 6) | u32::from(byte2);
173 if int_value >= 0x80 {
174 return Ok((int_value, 2));
175 }
176 return Err(anyhow::anyhow!("WTF-8: Invalid 2-byte sequence"));
177 }
178
179 if (byte1 & 0xF0) == 0xE0 {
181 let byte2 = read_continuation_byte(byte_array_input, &mut byte_index)?;
182 let byte3 = read_continuation_byte(byte_array_input, &mut byte_index)?;
183 let int_value = (u32::from(byte1 & 0x0F) << 12)
184 | (u32::from(byte2) << 6)
185 | u32::from(byte3);
186 if int_value >= 0x0800 {
187 return Ok((int_value, 3));
188 }
189 return Err(anyhow::anyhow!("WTF-8: Invalid 3-byte sequence"));
190 }
191
192 if (byte1 & 0xF8) == 0xF0 {
194 let byte2 = read_continuation_byte(byte_array_input, &mut byte_index)?;
195 let byte3 = read_continuation_byte(byte_array_input, &mut byte_index)?;
196 let byte4 = read_continuation_byte(byte_array_input, &mut byte_index)?;
197 let int_value = (u32::from(byte1 & 0x0F)) << 18
198 | (u32::from(byte2)) << 12
199 | (u32::from(byte3)) << 6
200 | u32::from(byte4);
201 if (0x010000..=0x10FFFF).contains(&int_value) {
202 return Ok((int_value, 4));
203 }
204 }
205
206 Err(anyhow::anyhow!("WTF-8: Invalid 4-byte sequence"))
207}
208
209pub fn decode_wtf8_to_ucs2(byte_array_input: &[u8]) -> Result<Vec<u16>> {
211 let mut codepoints_with_unpaired_surrogates: Vec<u16> = Vec::new();
212 let mut byte_index = 0;
213 while byte_index < byte_array_input.len() {
214 let (cp, len) = decode_wtf8_single(&byte_array_input[byte_index..])
215 .context(format!(
216 "WTF-8: Invalid byte sequence at index {byte_index}"
217 ))?;
218 if cp > 0xFFFF {
219 let surrogates = ucs2encode(&[cp]).context(format!(
220 "WTF-8: Failed to convert codepoint to unpaired surrogates: {cp:?}"
221 ))?;
222 codepoints_with_unpaired_surrogates.extend(surrogates);
223 } else {
224 codepoints_with_unpaired_surrogates
225 .push(u16::try_from(cp).expect("Failed to create u16"));
226 }
227 byte_index += len;
228 }
229
230 Ok(codepoints_with_unpaired_surrogates)
231}
232
233pub fn decode_wtf8_to_scalars(byte_array_input: &[u8]) -> Result<Vec<u32>> {
235 let codepoints_with_unpaired_surrogates =
236 decode_wtf8_to_ucs2(byte_array_input)
237 .context("WTF-8: Failed to decode to UCS-2")?;
238 let codepoints_as_u32: Vec<u32> = codepoints_with_unpaired_surrogates
239 .iter()
240 .map(|&cp| u32::from(cp))
241 .collect();
242 Ok(unpaired_surrogates_to_scalars(&codepoints_as_u32))
243}
244
245pub fn is_unpackable_wtf8(byte_array_input: &[u8]) -> bool {
247 decode_wtf8_single(byte_array_input).is_ok()
248}
249
#[cfg(test)]
mod tests {
    use crate::formats::unicode::{UNICODE_HISTORIC_MAX, UNICODE_MAX};

    use super::*;

    // One round-trip fixture: `decoded` must encode to `encoded`, and
    // `encoded` must decode back to `decoded`.
    struct TestCase {
        // 16-bit code units fed to `encode_wtf8_from_ucs2`.
        decoded: &'static [u16],
        // Expected WTF-8 bytes.
        encoded: &'static [u8],
        // Label shown in assertion failure messages.
        description: &'static str,
    }

    // Round-trips every fixture through the UCS-2 encode and decode entry
    // points.
    #[crate::ctb_test]
    fn test_wtf8_encode_decode() {
        let cases = [
            // 1-byte sequences
            TestCase {
                decoded: &[0x0000],
                encoded: b"\0",
                description: "U+0000",
            },
            TestCase {
                decoded: &[0x005C],
                encoded: b"\x5C",
                description: "U+005C",
            },
            TestCase {
                decoded: &[0x007F],
                encoded: b"\x7F",
                description: "U+007F",
            },
            // 2-byte sequences
            TestCase {
                decoded: &[0x0080],
                encoded: &[0xC2, 0x80],
                description: "U+0080",
            },
            TestCase {
                decoded: &[0x05CA],
                encoded: &[0xD7, 0x8A],
                description: "U+05CA",
            },
            TestCase {
                decoded: &[0x07FF],
                encoded: &[0xDF, 0xBF],
                description: "U+07FF",
            },
            // 3-byte sequences
            TestCase {
                decoded: &[0x0800],
                encoded: &[0xE0, 0xA0, 0x80],
                description: "U+0800",
            },
            TestCase {
                decoded: &[0x2C3C],
                encoded: &[0xE2, 0xB0, 0xBC],
                description: "U+2C3C",
            },
            TestCase {
                decoded: &[0xFFFF],
                encoded: &[0xEF, 0xBF, 0xBF],
                description: "U+FFFF",
            },
            // Unmatched high surrogates: each is encoded on its own as a
            // 3-byte sequence.
            TestCase {
                decoded: &[0xD800],
                encoded: &[0xED, 0xA0, 0x80],
                description: "U+D800",
            },
            TestCase {
                decoded: &[0xD800, 0xD800],
                encoded: &[0xED, 0xA0, 0x80, 0xED, 0xA0, 0x80],
                description: "High surrogate followed by another high surrogate",
            },
            TestCase {
                decoded: &[0xD800, 0x41_u16],
                encoded: &[0xED, 0xA0, 0x80, b'A'],
                description: "High surrogate followed by a symbol that is not a surrogate",
            },
            TestCase {
                decoded: &[0xD800, 0xD834, 0xDF06, 0xD800],
                encoded: &[
                    0xED, 0xA0, 0x80, 0xF0, 0x9D, 0x8C, 0x86, 0xED, 0xA0, 0x80,
                ],
                description: "Unmatched high surrogate, followed by a surrogate pair, followed by an unmatched high surrogate",
            },
            TestCase {
                decoded: &[0xD9AF],
                encoded: &[0xED, 0xA6, 0xAF],
                description: "U+D9AF",
            },
            TestCase {
                decoded: &[0xDBFF],
                encoded: &[0xED, 0xAF, 0xBF],
                description: "U+DBFF",
            },
            // Unmatched low surrogates
            TestCase {
                decoded: &[0xDC00],
                encoded: &[0xED, 0xB0, 0x80],
                description: "U+DC00",
            },
            TestCase {
                decoded: &[0xDC00, 0xDC00],
                encoded: &[0xED, 0xB0, 0x80, 0xED, 0xB0, 0x80],
                description: "Low surrogate followed by another low surrogate",
            },
            TestCase {
                decoded: &[0xDC00, 0x41_u16],
                encoded: &[0xED, 0xB0, 0x80, b'A'],
                description: "Low surrogate followed by a symbol that is not a surrogate",
            },
            TestCase {
                decoded: &[0xDC00, 0xD834, 0xDF06, 0xDC00],
                encoded: &[
                    0xED, 0xB0, 0x80, 0xF0, 0x9D, 0x8C, 0x86, 0xED, 0xB0, 0x80,
                ],
                description: "Unmatched low surrogate, followed by a surrogate pair, followed by an unmatched low surrogate",
            },
            TestCase {
                decoded: &[0xDEEE],
                encoded: &[0xED, 0xBB, 0xAE],
                description: "U+DEEE",
            },
            TestCase {
                decoded: &[0xDFFF],
                encoded: &[0xED, 0xBF, 0xBF],
                description: "U+DFFF",
            },
            // 4-byte sequences: a matched surrogate pair on the decoded side
            // corresponds to a single 4-byte symbol on the encoded side.
            TestCase {
                decoded: &[0xD800, 0xDC00],
                encoded: &[0xF0, 0x90, 0x80, 0x80],
                description: "U+10000",
            },
            TestCase {
                decoded: &[0xD834, 0xDF06],
                encoded: &[0xF0, 0x9D, 0x8C, 0x86],
                description: "U+1D306",
            },
            TestCase {
                decoded: &[0xDBFF, 0xDFFF],
                encoded: &[0xF4, 0x8F, 0xBF, 0xBF],
                description: "U+10FFFF",
            },
        ];
        for case in cases.iter() {
            // Encoding direction.
            let encoded = encode_wtf8_from_ucs2(case.decoded);
            assert_eq!(encoded, case.encoded, "Encoding: {}", case.description);
            // Decoding direction.
            let decoded = decode_wtf8_to_ucs2(case.encoded);
            if let Ok(decoded) = &decoded {
                assert_eq!(
                    decoded.as_slice(),
                    case.decoded,
                    "Decoding: {}, encoded: {:?}, decoded: {:?}, expected: {:?}",
                    case.description,
                    case.encoded,
                    decoded,
                    case.decoded
                );
            } else {
                panic!(
                    "Decoding error for case '{}':\n error: {:?}\n encoded: {:?}\n expected decoded: {:?}",
                    case.description,
                    decoded.err(),
                    case.encoded,
                    case.decoded,
                );
            }
        }
    }

    // Malformed inputs must be rejected by the decoders.
    #[crate::ctb_test]
    fn test_wtf8_decode_errors() {
        // 0xFF matches none of the lead-byte patterns.
        assert!(decode_wtf8_single(&[0xFF]).is_err());
        // 3-byte lead followed by bytes that are not continuation bytes.
        assert!(decode_wtf8_single(&[0xE9, 0x00, 0x00]).is_err());
        // 2-byte lead followed by 0xFF, which is not a continuation byte.
        assert!(decode_wtf8_single(&[0xC2, 0xFF, 0xFF]).is_err());
        // 2-byte lead followed by 0xEF, which is not a continuation byte.
        assert!(decode_wtf8_single(&[0xC2, 0xEF, 0xBF, 0xBF]).is_err());
        // 4-byte lead with only one of its three continuation bytes.
        assert!(decode_wtf8_single(&[0xF0, 0x9D]).is_err());
        // 0xFD is not accepted as a lead byte.
        assert!(
            decode_wtf8_to_scalars(&[0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF])
                .is_err()
        );
    }

    // Boundary values for each encoded length of `encode_wtf8_single`.
    #[crate::ctb_test]
    fn test_encode_wtf8_single_basic() {
        // 1 byte
        assert_eq!(encode_wtf8_single(0x00).unwrap(), vec![0x00]);
        assert_eq!(encode_wtf8_single(0x7F).unwrap(), vec![0x7F]);
        // 2 bytes
        assert_eq!(encode_wtf8_single(0x80).unwrap(), vec![0xC2, 0x80]);
        assert_eq!(encode_wtf8_single(0x7FF).unwrap(), vec![0xDF, 0xBF]);
        // 3 bytes
        assert_eq!(encode_wtf8_single(0x800).unwrap(), vec![0xE0, 0xA0, 0x80]);
        assert_eq!(encode_wtf8_single(0xFFFF).unwrap(), vec![0xEF, 0xBF, 0xBF]);
        // 4 bytes
        assert_eq!(
            encode_wtf8_single(0x10000).unwrap(),
            vec![0xF0, 0x90, 0x80, 0x80]
        );
        assert_eq!(
            encode_wtf8_single(UNICODE_MAX).unwrap(),
            vec![0xF4, 0x8F, 0xBF, 0xBF]
        );
        // Presumably UNICODE_HISTORIC_MAX is above 0x10FFFF, the encoder's
        // upper limit -- TODO confirm against its definition.
        assert!(encode_wtf8_single(UNICODE_HISTORIC_MAX).is_err());
    }

    // Surrogate handling when encoding from 32-bit values.
    #[crate::ctb_test]
    fn test_encode_wtf8_from_scalars_surrogates() {
        // A lone high surrogate is encoded as a 3-byte sequence.
        assert_eq!(
            encode_wtf8_from_scalars(&[0xD800]).unwrap(),
            vec![0xED, 0xA0, 0x80]
        );
        // A high/low pair is merged into one 4-byte sequence.
        assert_eq!(
            encode_wtf8_from_scalars(&[0xD834, 0xDF06]).unwrap(),
            vec![0xF0, 0x9D, 0x8C, 0x86]
        );
        // Lone surrogates on either side of a pair stay separate.
        assert_eq!(
            encode_wtf8_from_scalars(&[0xD800, 0xD834, 0xDF06, 0xD800])
                .unwrap(),
            vec![0xED, 0xA0, 0x80, 0xF0, 0x9D, 0x8C, 0x86, 0xED, 0xA0, 0x80]
        );
    }

    // One sample per encoded length for `decode_wtf8_to_scalars`, plus one
    // rejected input.
    #[crate::ctb_test]
    fn test_decode_wtf8_to_scalars_basic() {
        // 1 byte
        assert_eq!(decode_wtf8_to_scalars(&[0x00]).unwrap(), vec![0x00]);
        // 2 bytes
        assert_eq!(decode_wtf8_to_scalars(&[0xC2, 0x80]).unwrap(), vec![0x80]);
        // 3 bytes
        assert_eq!(
            decode_wtf8_to_scalars(&[0xE0, 0xA0, 0x80]).unwrap(),
            vec![0x800]
        );
        // 4 bytes
        assert_eq!(
            decode_wtf8_to_scalars(&[0xF0, 0x90, 0x80, 0x80]).unwrap(),
            vec![0x10000]
        );
        assert_eq!(
            decode_wtf8_to_scalars(&[0xF4, 0x8F, 0xBF, 0xBF]).unwrap(),
            vec![UNICODE_MAX]
        );
        // 0xFD is not accepted as a lead byte.
        assert!(
            decode_wtf8_to_scalars(&[0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF])
                .is_err()
        );
    }

    // `is_unpackable_wtf8` accepts inputs whose first symbol decodes and
    // rejects truncated or invalid starts.
    #[crate::ctb_test]
    fn test_is_unpackable_wtf8() {
        assert!(is_unpackable_wtf8(&[0x00]));
        assert!(is_unpackable_wtf8(b"AAAA"));
        assert!(is_unpackable_wtf8(&[b'A', b'A', b'A', 0xE0, 0xA0, 0x80]));
        assert!(is_unpackable_wtf8(&[0xE0, 0xA0, 0x80, b'A', b'A', b'A']));
        assert!(is_unpackable_wtf8(&[0xC2, 0x80]));
        assert!(is_unpackable_wtf8(&[0xE0, 0xA0, 0x80]));
        assert!(is_unpackable_wtf8(&[0xF0, 0x90, 0x80, 0x80]));
        // 2-byte lead with no continuation byte.
        assert!(!is_unpackable_wtf8(&[0xC2]));
        // 4-byte lead with only one continuation byte.
        assert!(!is_unpackable_wtf8(&[0xF0, 0x90]));
        // 0xFF cannot start a sequence.
        assert!(!is_unpackable_wtf8(&[0xFF]));
    }
}