ctoolbox/formats/
unicode.rs

1/* SPDX-License-Identifier: MIT */
2/* Copyright © Mathias Bynens <https://mathiasbynens.be/> */
3
//! Utilities for Unicode, including:
//! - Conversion of scalars to surrogates and vice versa
//! - UCS-2/UTF-16 encoding of scalars and decoding back to scalars
7
8use anyhow::Result;
9
10// Based on https://web.archive.org/web/20190305073920/https://github.com/mathiasbynens/wtf-8/blob/58c6b976c6678144d180b2307bee5615457e2cc7/wtf-8.js
11// This code for wtf8 is included under the following license (from https://web.archive.org/web/20190305074047/https://github.com/mathiasbynens/wtf-8/blob/58c6b976c6678144d180b2307bee5615457e2cc7/LICENSE-MIT.txt):
12/*
13Copyright Mathias Bynens <https://mathiasbynens.be/>
14
15Permission is hereby granted, free of charge, to any person obtaining
16a copy of this software and associated documentation files (the
17"Software"), to deal in the Software without restriction, including
18without limitation the rights to use, copy, modify, merge, publish,
19distribute, sublicense, and/or sell copies of the Software, and to
20permit persons to whom the Software is furnished to do so, subject to
21the following conditions:
22
23The above copyright notice and this permission notice shall be
24included in all copies or substantial portions of the Software.
25
26THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
27EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
28MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
29NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
30LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
31OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
32WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
33*/
34
/// Highest code point of the Unicode code space (U+10FFFF).
pub const UNICODE_MAX: u32 = 0x0010_FFFF;
/// Largest value of the historic 31-bit code space; its UTF-8 form is
/// noted as 0xFDBFBFBFBFBF.
pub const UNICODE_HISTORIC_MAX: u32 = 0x7FFF_FFFF;
37
/// Walks a sequence of code units and merges every high surrogate that is
/// immediately followed by a low surrogate into the single code point the
/// pair encodes.
///
/// Any other unit (including a surrogate without a matching partner) is
/// copied to the output unchanged.
fn combine_surrogates<I>(input: I) -> Vec<u32>
where
    I: IntoIterator,
    I::Item: Into<u32> + Copy,
{
    let mut units = input.into_iter().peekable();
    let mut scalars = Vec::new();
    while let Some(raw) = units.next() {
        let unit: u32 = raw.into();
        // Only look ahead when the current unit could start a pair.
        let partner = if matches!(unit, 0xD800..=0xDBFF) {
            units.next_if(|&following| {
                let following: u32 = following.into();
                matches!(following, 0xDC00..=0xDFFF)
            })
        } else {
            None
        };
        let scalar = match partner {
            // `next_if` already consumed the low surrogate.
            Some(low) => {
                let low: u32 = low.into();
                0x10000 + (((unit - 0xD800) << 10) | (low - 0xDC00))
            }
            None => unit,
        };
        scalars.push(scalar);
    }
    scalars
}
65
66/// Decodes a slice of UCS-2/UTF-16 code units (`u16`) into Unicode scalar values (`u32`).
67/// Surrogate pairs are combined into scalars; unpaired surrogates are left as-is.
68pub fn ucs2decode(ucs2: &[u16]) -> Vec<u32> {
69    combine_surrogates(ucs2.iter().copied())
70}
71
72/// Decodes a slice of code units (`u32`), combining surrogate pairs into scalars.
73/// Unpaired surrogates are left as-is.
74pub fn unpaired_surrogates_to_scalars(surrogates: &[u32]) -> Vec<u32> {
75    combine_surrogates(surrogates.iter().copied())
76}
77
78/// Encodes an array of Unicode scalar values (`u32`) to UTF-16/UCS-2 code units (`Vec<u16>`).
79/// Surrogate pairs are generated for codepoints above U+FFFF.
80/// Returns an error for invalid codepoints.
81pub fn ucs2encode(array: &[u32]) -> Result<Vec<u16>> {
82    let mut output = Vec::new();
83    for &value in array {
84        if value > 0x10FFFF {
85            return Err(anyhow::anyhow!("Invalid codepoint: {value}"));
86        } else if value > 0xFFFF {
87            let mut v = value - 0x10000;
88            output.push(
89                u16::try_from((v >> 10) & 0x3FF | 0xD800)
90                    .expect("Failed to create u16"),
91            );
92            v = 0xDC00 | (v & 0x3FF);
93            output.push(u16::try_from(v).expect("Failed to create u16"));
94        } else if value <= 0x10FFFF {
95            output.push(u16::try_from(value).expect("Failed to create u16"));
96        }
97    }
98    Ok(output)
99}
100
101/// Converts a slice of Unicode scalar values (`u32`) to unpaired surrogates.
102/// Codepoints above U+FFFF are split into surrogate pairs.
103/// Returns an error for invalid codepoints.
104pub fn scalars_to_unpaired_surrogates(codepoints: &[u32]) -> Result<Vec<u32>> {
105    let mut result = Vec::new();
106    for &cp in codepoints {
107        if cp <= 0xFFFF {
108            result.push(cp);
109        } else if cp <= 0x10FFFF {
110            let scalar = cp - 0x10000;
111            let high = 0xD800 + ((scalar >> 10) & 0x3FF);
112            let low = 0xDC00 + (scalar & 0x3FF);
113            result.push(high);
114            result.push(low);
115        } else {
116            return Err(anyhow::anyhow!("Invalid codepoint: {cp}"));
117        }
118    }
119    Ok(result)
120}
121
122pub fn scalars_to_string_lossy(scalars: &[u32]) -> String {
123    combine_surrogates(scalars.to_vec())
124        .iter()
125        .map(|&cp| char::from_u32(cp).unwrap_or('\u{FFFD}'))
126        .collect()
127}
128
/// Returns the numeric value of every `char` in `s`, in order.
pub fn string_to_scalars(s: &str) -> Vec<u32> {
    let mut scalars = Vec::new();
    for ch in s.chars() {
        scalars.push(u32::from(ch));
    }
    scalars
}
132
/// Slices a string by UTF-16 code unit, the way JavaScript strings are
/// indexed.
///
/// Returns up to `len` code units of `input`'s UTF-16 encoding, starting at
/// code unit index `start`. Because the cut is made on code units, it may
/// fall between the two halves of a surrogate pair. A `start` at or past
/// the end yields an empty vector, and a range running past the end is
/// truncated.
pub fn js_like_slice_utf16(input: &str, start: usize, len: usize) -> Vec<u16> {
    input.encode_utf16().skip(start).take(len).collect()
}
146
/// Serializes a slice of `u16` values as bytes in little-endian order.
///
/// Each input value contributes two bytes, low byte first, so the result
/// is exactly `2 * v.len()` bytes long.
pub fn u16_vec_to_le_bytes(v: &[u16]) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(v.len() * 2);
    for unit in v {
        // `to_le_bytes` yields [low, high] without any fallible conversion.
        bytes.extend_from_slice(&unit.to_le_bytes());
    }
    bytes
}
156
#[cfg(test)]
mod tests {
    use super::*;

    #[crate::ctb_test]
    fn test_unpaired_surrogates_to_scalars() {
        // Two back-to-back surrogate pairs collapse into two code points.
        let combined =
            unpaired_surrogates_to_scalars(&[0xD800, 0xDC00, 0xD800, 0xDC01]);
        assert_eq!(combined, vec![0x10000, 0x10001]);
    }

    #[crate::ctb_test]
    fn test_scalars_to_unpaired_surrogates() {
        // Each supplementary code point expands to a high/low pair.
        let expanded = scalars_to_unpaired_surrogates(&[0x10000, 0x10001])
            .expect("Failed to convert scalars to unpaired surrogates");
        assert_eq!(expanded, vec![0xD800, 0xDC00, 0xD800, 0xDC01]);
    }

    #[crate::ctb_test]
    fn test_ucs2decode_basic() {
        // 'A', the surrogate pair for U+10000, 'B'.
        let decoded = ucs2decode(&[0x0041, 0xD800, 0xDC00, 0x0042]);
        assert_eq!(decoded, vec![0x0041, 0x10000, 0x0042]);
    }

    #[crate::ctb_test]
    fn test_ucs2encode_basic() {
        // 'A', U+10000, 'B'.
        let encoded = ucs2encode(&[0x0041, 0x10000, 0x0042]).unwrap();
        assert_eq!(encoded, vec![0x0041, 0xD800, 0xDC00, 0x0042]);
    }

    #[crate::ctb_test]
    fn test_ucs2encode_invalid() {
        // One past U+10FFFF must be rejected.
        assert!(ucs2encode(&[0x110000]).is_err());
    }

    #[crate::ctb_test]
    fn test_scalars_to_unpaired_surrogates_invalid() {
        // One past U+10FFFF must be rejected.
        assert!(scalars_to_unpaired_surrogates(&[0x110000]).is_err());
    }

    #[crate::ctb_test]
    fn test_js_like_slice_utf16() {
        // Taking two code units cuts the supplementary character in half,
        // leaving only its high surrogate.
        let sliced = js_like_slice_utf16("a𠜎", 0, 2);
        assert_eq!(sliced, vec![0x61, 0xD841]);
    }

    #[crate::ctb_test]
    fn test_u16_vec_to_le_bytes() {
        // Low byte comes first for every code unit.
        let bytes = u16_vec_to_le_bytes(&[0x61, 0xD841]);
        assert_eq!(bytes, vec![0x61, 0x00, 0x41, 0xD8]);
    }
}