ctoolbox/formats/
wtf8.rs

1/* SPDX-License-Identifier: MIT */
2/* Copyright © Mathias Bynens <https://mathiasbynens.be/> */
3
4/// Utilities for WTF-8 encoding and decoding.
/// Reference: <https://simonsapin.github.io/wtf-8/>
6/// See also the standard library version:
7/// <https://doc.rust-lang.org/src/std/sys_common/wtf8.rs.html>
8/// TODO?: This could have some more optional features added for other uses:
9/// - Allow encoding of values above U+10FFFF (up to U+7FFFFFFF) as 6-byte
10///   sequences.
/// - Allow retaining surrogates unpaired when encoding, in which case it would
///   be a CESU-8 implementation.
13use crate::formats::unicode::{
14    ucs2decode, ucs2encode, unpaired_surrogates_to_scalars,
15};
16use anyhow::{Context, Result, ensure};
17
18// Based on https://web.archive.org/web/20190305073920/https://github.com/mathiasbynens/wtf-8/blob/58c6b976c6678144d180b2307bee5615457e2cc7/wtf-8.js
19// This code for wtf8 is included under the following license (from https://web.archive.org/web/20190305074047/https://github.com/mathiasbynens/wtf-8/blob/58c6b976c6678144d180b2307bee5615457e2cc7/LICENSE-MIT.txt):
20/*
21Copyright Mathias Bynens <https://mathiasbynens.be/>
22
23Permission is hereby granted, free of charge, to any person obtaining
24a copy of this software and associated documentation files (the
25"Software"), to deal in the Software without restriction, including
26without limitation the rights to use, copy, modify, merge, publish,
27distribute, sublicense, and/or sell copies of the Software, and to
28permit persons to whom the Software is furnished to do so, subject to
29the following conditions:
30
31The above copyright notice and this permission notice shall be
32included in all copies or substantial portions of the Software.
33
34THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
35EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
36MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
37NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
38LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
39OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
40WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
41*/
42
43/// Encode a single Unicode codepoint as WTF-8 byte array.
44/// Returns a `Vec<u8>` containing the WTF-8 encoding of the codepoint.
45/// Surrogates and non-scalar values are encoded as WTF-8 allows.
46/// Values higher than 0x10FFFF are not supported.
47pub fn encode_wtf8_single(int_value: u32) -> Result<Vec<u8>> {
48    ensure!(
49        int_value <= 0x10FFFF,
50        format!("WTF-8: Codepoint {int_value} out of range")
51    );
52    fn create_byte(int_value: u32, shift: u32) -> u8 {
53        u8::try_from(((int_value >> shift) & 0x3F) | 0x80)
54            .expect("Failed to create byte")
55    }
56
57    let mut symbol = Vec::new();
58    if (int_value & 0xFFFFFF80) == 0 {
59        // 1-byte sequence
60        symbol.push(u8::try_from(int_value).expect("Failed to create byte"));
61    } else {
62        if (int_value & 0xFFFFF800) == 0 {
63            // 2-byte sequence
64            symbol.push(
65                u8::try_from((int_value >> 6) & 0x1F | 0xC0)
66                    .expect("Failed to create byte"),
67            );
68        } else if (int_value & 0xFFFF0000) == 0 {
69            // 3-byte sequence
70            symbol.push(
71                u8::try_from((int_value >> 12) & 0x0F | 0xE0)
72                    .expect("Failed to create byte"),
73            );
74            symbol.push(create_byte(int_value, 6));
75        } else if (int_value & 0xFFE00000) == 0 {
76            // 4-byte sequence
77            symbol.push(
78                u8::try_from((int_value >> 18) & 0x07 | 0xF0)
79                    .expect("Failed to create byte"),
80            );
81            symbol.push(create_byte(int_value, 12));
82            symbol.push(create_byte(int_value, 6));
83        }
84        symbol.push(
85            u8::try_from((int_value & 0x3F) | 0x80)
86                .expect("Failed to create byte"),
87        );
88    }
89    Ok(symbol)
90}
91
92/// More concise `encode_wtf8` using `ucs2decode/encode_wtf8_single` logic.
93/// Encodes a slice of Unicode scalar codepoints to WTF-8 bytes.
94/// Surrogate pairs are combined and encoded as a single codepoint.
95/// Unpaired surrogates are encoded as 3-byte WTF-8 sequences.
96/// Values higher than 0x10FFFF are not supported.
97pub fn encode_wtf8_from_scalars(codepoints: &[u32]) -> Result<Vec<u8>> {
98    let mut out = Vec::new();
99    let mut i = 0;
100    while i < codepoints.len() {
101        let cp = codepoints[i];
102        // Check for surrogate pair
103        if (0xD800..=0xDBFF).contains(&cp) && i + 1 < codepoints.len() {
104            let next = codepoints[i + 1];
105            if (0xDC00..=0xDFFF).contains(&next) {
106                // Combine surrogate pair
107                let high = cp;
108                let low = next;
109                let full = 0x10000 + (((high - 0xD800) << 10) | (low - 0xDC00));
110                out.extend(encode_wtf8_single(full)?);
111                i += 2;
112                continue;
113            }
114        }
115        // Otherwise, encode as single codepoint
116        out.extend(encode_wtf8_single(cp)?);
117        i += 1;
118    }
119    Ok(out)
120}
121
122/// WTF-8 encode a UTF-16/UCS-2/JS string (as &[u16]) into `Vec<u8>`.
123/// Uses ucs2decode to convert to codepoints, then encodes as WTF-8.
124pub fn encode_wtf8_from_ucs2(ucs2: &[u16]) -> Vec<u8> {
125    let codepoints = ucs2decode(ucs2);
126    encode_wtf8_from_scalars(codepoints.as_slice())
127        .expect("It should not be possible to fail here")
128}
129
130fn read_continuation_byte(input: &[u8], byte_index: &mut usize) -> Result<u8> {
131    if *byte_index >= input.len() {
132        return Err(anyhow::anyhow!("WTF-8: Invalid byte index"));
133    }
134    let continuation_byte = input[*byte_index];
135    *byte_index += 1;
136    if (continuation_byte & 0xC0) == 0x80 {
137        Ok(continuation_byte & 0x3F)
138    } else {
139        // If we end up here, it’s not a continuation byte.
140        Err(anyhow::anyhow!("WTF-8: Invalid continuation byte"))
141    }
142}
143
144/// Decode one WTF-8 codepoint from a byte array slice.
145/// Returns Ok((codepoint, length)) or Err on error.
146pub fn decode_wtf8_single(
147    byte_array_input: &[u8],
148) -> anyhow::Result<(u32, usize), anyhow::Error> {
149    let mut byte_index = 0;
150    let byte_count = byte_array_input.len();
151
152    if byte_index > byte_count {
153        return Err(anyhow::anyhow!("Invalid WTF-8 sequence"));
154    }
155    if byte_index == byte_count {
156        return Err(anyhow::anyhow!(
157            "The original WTF-8 returned false here, not sure why"
158        ));
159    }
160
161    let byte1 = byte_array_input[byte_index];
162    byte_index += 1;
163
164    // 1-byte sequence (no continuation bytes)
165    if (byte1 & 0x80) == 0 {
166        return Ok((u32::from(byte1), 1));
167    }
168
169    // 2-byte sequence
170    if (byte1 & 0xE0) == 0xC0 {
171        let byte2 = read_continuation_byte(byte_array_input, &mut byte_index)?;
172        let int_value = (u32::from(byte1 & 0x1F) << 6) | u32::from(byte2);
173        if int_value >= 0x80 {
174            return Ok((int_value, 2));
175        }
176        return Err(anyhow::anyhow!("WTF-8: Invalid 2-byte sequence"));
177    }
178
179    // 3-byte sequence (may include unpaired surrogates)
180    if (byte1 & 0xF0) == 0xE0 {
181        let byte2 = read_continuation_byte(byte_array_input, &mut byte_index)?;
182        let byte3 = read_continuation_byte(byte_array_input, &mut byte_index)?;
183        let int_value = (u32::from(byte1 & 0x0F) << 12)
184            | (u32::from(byte2) << 6)
185            | u32::from(byte3);
186        if int_value >= 0x0800 {
187            return Ok((int_value, 3));
188        }
189        return Err(anyhow::anyhow!("WTF-8: Invalid 3-byte sequence"));
190    }
191
192    // 4-byte sequence
193    if (byte1 & 0xF8) == 0xF0 {
194        let byte2 = read_continuation_byte(byte_array_input, &mut byte_index)?;
195        let byte3 = read_continuation_byte(byte_array_input, &mut byte_index)?;
196        let byte4 = read_continuation_byte(byte_array_input, &mut byte_index)?;
197        let int_value = (u32::from(byte1 & 0x0F)) << 18
198            | (u32::from(byte2)) << 12
199            | (u32::from(byte3)) << 6
200            | u32::from(byte4);
201        if (0x010000..=0x10FFFF).contains(&int_value) {
202            return Ok((int_value, 4));
203        }
204    }
205
206    Err(anyhow::anyhow!("WTF-8: Invalid 4-byte sequence"))
207}
208
209/// Decode a WTF-8 byte array into a `Vec<u16>` (UCS-2).
210pub fn decode_wtf8_to_ucs2(byte_array_input: &[u8]) -> Result<Vec<u16>> {
211    let mut codepoints_with_unpaired_surrogates: Vec<u16> = Vec::new();
212    let mut byte_index = 0;
213    while byte_index < byte_array_input.len() {
214        let (cp, len) = decode_wtf8_single(&byte_array_input[byte_index..])
215            .context(format!(
216                "WTF-8: Invalid byte sequence at index {byte_index}"
217            ))?;
218        if cp > 0xFFFF {
219            let surrogates = ucs2encode(&[cp]).context(format!(
220                "WTF-8: Failed to convert codepoint to unpaired surrogates: {cp:?}"
221            ))?;
222            codepoints_with_unpaired_surrogates.extend(surrogates);
223        } else {
224            codepoints_with_unpaired_surrogates
225                .push(u16::try_from(cp).expect("Failed to create u16"));
226        }
227        byte_index += len;
228    }
229
230    Ok(codepoints_with_unpaired_surrogates)
231}
232
233/// Decode a WTF-8 byte array into a `Vec<u16>` (UCS-2).
234pub fn decode_wtf8_to_scalars(byte_array_input: &[u8]) -> Result<Vec<u32>> {
235    let codepoints_with_unpaired_surrogates =
236        decode_wtf8_to_ucs2(byte_array_input)
237            .context("WTF-8: Failed to decode to UCS-2")?;
238    let codepoints_as_u32: Vec<u32> = codepoints_with_unpaired_surrogates
239        .iter()
240        .map(|&cp| u32::from(cp))
241        .collect();
242    Ok(unpaired_surrogates_to_scalars(&codepoints_as_u32))
243}
244
245/// Returns true if the given slice is a valid WTF-8 single codepoint sequence.
246pub fn is_unpackable_wtf8(byte_array_input: &[u8]) -> bool {
247    decode_wtf8_single(byte_array_input).is_ok()
248}
249
#[cfg(test)]
mod tests {
    use crate::formats::unicode::{UNICODE_HISTORIC_MAX, UNICODE_MAX};

    use super::*;

    // Based on https://github.com/mathiasbynens/wtf-8/blob/bdab8ed45a2446eddffae28d27b353bb817189c5/tests/tests.js

    /// One round-trip fixture: UTF-16 code units and their WTF-8 bytes.
    struct TestCase {
        decoded: &'static [u16],
        encoded: &'static [u8],
        description: &'static str,
    }

    /// Positional constructor that keeps the fixture table compact.
    fn fixture(
        description: &'static str,
        decoded: &'static [u16],
        encoded: &'static [u8],
    ) -> TestCase {
        TestCase {
            decoded,
            encoded,
            description,
        }
    }

    /// Every fixture must encode to the expected bytes and decode back to
    /// the original code units.
    #[crate::ctb_test]
    fn test_wtf8_encode_decode() {
        let cases = [
            // 1-byte
            fixture("U+0000", &[0x0000], b"\0"),
            fixture("U+005C", &[0x005C], b"\x5C"),
            fixture("U+007F", &[0x007F], b"\x7F"),
            // 2-byte
            fixture("U+0080", &[0x0080], &[0xC2, 0x80]),
            fixture("U+05CA", &[0x05CA], &[0xD7, 0x8A]),
            fixture("U+07FF", &[0x07FF], &[0xDF, 0xBF]),
            // 3-byte
            fixture("U+0800", &[0x0800], &[0xE0, 0xA0, 0x80]),
            fixture("U+2C3C", &[0x2C3C], &[0xE2, 0xB0, 0xBC]),
            fixture("U+FFFF", &[0xFFFF], &[0xEF, 0xBF, 0xBF]),
            // Unmatched surrogate halves, high range (0xD800 to 0xDBFF)
            fixture("U+D800", &[0xD800], &[0xED, 0xA0, 0x80]),
            fixture(
                "High surrogate followed by another high surrogate",
                &[0xD800, 0xD800],
                &[0xED, 0xA0, 0x80, 0xED, 0xA0, 0x80],
            ),
            fixture(
                "High surrogate followed by a symbol that is not a surrogate",
                &[0xD800, 0x41_u16],
                &[0xED, 0xA0, 0x80, b'A'],
            ),
            fixture(
                "Unmatched high surrogate, followed by a surrogate pair, followed by an unmatched high surrogate",
                &[0xD800, 0xD834, 0xDF06, 0xD800],
                &[0xED, 0xA0, 0x80, 0xF0, 0x9D, 0x8C, 0x86, 0xED, 0xA0, 0x80],
            ),
            fixture("U+D9AF", &[0xD9AF], &[0xED, 0xA6, 0xAF]),
            fixture("U+DBFF", &[0xDBFF], &[0xED, 0xAF, 0xBF]),
            // Unmatched surrogate halves, low range (0xDC00 to 0xDFFF)
            fixture("U+DC00", &[0xDC00], &[0xED, 0xB0, 0x80]),
            fixture(
                "Low surrogate followed by another low surrogate",
                &[0xDC00, 0xDC00],
                &[0xED, 0xB0, 0x80, 0xED, 0xB0, 0x80],
            ),
            fixture(
                "Low surrogate followed by a symbol that is not a surrogate",
                &[0xDC00, 0x41_u16],
                &[0xED, 0xB0, 0x80, b'A'],
            ),
            fixture(
                "Unmatched low surrogate, followed by a surrogate pair, followed by an unmatched low surrogate",
                &[0xDC00, 0xD834, 0xDF06, 0xDC00],
                &[0xED, 0xB0, 0x80, 0xF0, 0x9D, 0x8C, 0x86, 0xED, 0xB0, 0x80],
            ),
            fixture("U+DEEE", &[0xDEEE], &[0xED, 0xBB, 0xAE]),
            fixture("U+DFFF", &[0xDFFF], &[0xED, 0xBF, 0xBF]),
            // 4-byte; the decoded side is the surrogate pair for the
            // code point named in the description.
            fixture("U+10000", &[0xD800, 0xDC00], &[0xF0, 0x90, 0x80, 0x80]),
            fixture("U+1D306", &[0xD834, 0xDF06], &[0xF0, 0x9D, 0x8C, 0x86]),
            fixture("U+10FFFF", &[0xDBFF, 0xDFFF], &[0xF4, 0x8F, 0xBF, 0xBF]),
        ];

        for case in &cases {
            // Code units -> bytes
            let encoded = encode_wtf8_from_ucs2(case.decoded);
            assert_eq!(encoded, case.encoded, "Encoding: {}", case.description);

            // Bytes -> code units
            match decode_wtf8_to_ucs2(case.encoded) {
                Ok(decoded) => assert_eq!(
                    decoded.as_slice(),
                    case.decoded,
                    "Decoding: {}, encoded: {:?}, decoded: {:?}, expected: {:?}",
                    case.description,
                    case.encoded,
                    decoded,
                    case.decoded
                ),
                Err(error) => panic!(
                    "Decoding error for case '{}':\n  error: {:?}\n  encoded: {:?}\n  expected decoded: {:?}",
                    case.description,
                    Some(&error),
                    case.encoded,
                    case.decoded,
                ),
            }
        }
    }

    /// Malformed inputs must be rejected rather than decoded.
    #[crate::ctb_test]
    fn test_wtf8_decode_errors() {
        let malformed: &[&[u8]] = &[
            // 0xFF cannot start any sequence
            &[0xFF],
            // 3-byte lead followed by bytes that are not continuation bytes
            &[0xE9, 0x00, 0x00],
            // 2-byte lead followed by a byte that is not a continuation byte
            &[0xC2, 0xFF, 0xFF],
            &[0xC2, 0xEF, 0xBF, 0xBF],
            // Input ends before the 4-byte sequence is complete
            &[0xF0, 0x9D],
        ];
        for &bytes in malformed {
            assert!(
                decode_wtf8_single(bytes).is_err(),
                "accepted malformed input {bytes:02X?}"
            );
        }

        // Six-byte legacy form (noted as UNICODE_HISTORIC_MAX in the
        // original test)
        let six_byte_form = [0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF];
        assert!(decode_wtf8_to_scalars(&six_byte_form).is_err());
    }

    /// Boundary code points of each sequence length encode as expected.
    #[crate::ctb_test]
    fn test_encode_wtf8_single_basic() {
        let expectations: &[(u32, &[u8])] = &[
            // 1-byte
            (0x00, &[0x00]),
            (0x7F, &[0x7F]),
            // 2-byte
            (0x80, &[0xC2, 0x80]),
            (0x7FF, &[0xDF, 0xBF]),
            // 3-byte
            (0x800, &[0xE0, 0xA0, 0x80]),
            (0xFFFF, &[0xEF, 0xBF, 0xBF]),
            // 4-byte
            (0x10000, &[0xF0, 0x90, 0x80, 0x80]),
            (UNICODE_MAX, &[0xF4, 0x8F, 0xBF, 0xBF]),
        ];
        for &(codepoint, expected) in expectations {
            assert_eq!(
                encode_wtf8_single(codepoint).unwrap(),
                expected,
                "U+{codepoint:04X}"
            );
        }
        assert!(encode_wtf8_single(UNICODE_HISTORIC_MAX).is_err());
    }

    /// Surrogates are merged when paired and kept as 3-byte sequences when
    /// not.
    #[crate::ctb_test]
    fn test_encode_wtf8_from_scalars_surrogates() {
        let expectations: &[(&[u32], &[u8])] = &[
            // Unpaired high surrogate
            (&[0xD800], &[0xED, 0xA0, 0x80]),
            // Surrogate pair
            (&[0xD834, 0xDF06], &[0xF0, 0x9D, 0x8C, 0x86]),
            // Mixed
            (
                &[0xD800, 0xD834, 0xDF06, 0xD800],
                &[0xED, 0xA0, 0x80, 0xF0, 0x9D, 0x8C, 0x86, 0xED, 0xA0, 0x80],
            ),
        ];
        for &(input, expected) in expectations {
            assert_eq!(encode_wtf8_from_scalars(input).unwrap(), expected);
        }
    }

    /// One sequence of each length decodes to the expected code point.
    #[crate::ctb_test]
    fn test_decode_wtf8_to_scalars_basic() {
        let expectations: &[(&[u8], &[u32])] = &[
            // 1-byte
            (&[0x00], &[0x00]),
            // 2-byte
            (&[0xC2, 0x80], &[0x80]),
            // 3-byte
            (&[0xE0, 0xA0, 0x80], &[0x800]),
            // 4-byte
            (&[0xF0, 0x90, 0x80, 0x80], &[0x10000]),
            (&[0xF4, 0x8F, 0xBF, 0xBF], &[UNICODE_MAX]),
        ];
        for &(bytes, expected) in expectations {
            assert_eq!(decode_wtf8_to_scalars(bytes).unwrap(), expected);
        }

        let six_byte_form = [0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF];
        assert!(decode_wtf8_to_scalars(&six_byte_form).is_err());
    }

    /// Only the leading sequence decides the outcome.
    #[crate::ctb_test]
    fn test_is_unpackable_wtf8() {
        let accepted: &[&[u8]] = &[
            &[0x00],
            b"AAAA",
            &[b'A', b'A', b'A', 0xE0, 0xA0, 0x80],
            &[0xE0, 0xA0, 0x80, b'A', b'A', b'A'],
            &[0xC2, 0x80],
            &[0xE0, 0xA0, 0x80],
            &[0xF0, 0x90, 0x80, 0x80],
        ];
        for &bytes in accepted {
            assert!(is_unpackable_wtf8(bytes), "rejected {bytes:02X?}");
        }

        let rejected: &[&[u8]] = &[
            &[0xC2],       // incomplete
            &[0xF0, 0x90], // incomplete
            &[0xFF],       // invalid
        ];
        for &bytes in rejected {
            assert!(!is_unpackable_wtf8(bytes), "accepted {bytes:02X?}");
        }
    }
}