ctoolbox/formats/
unicode.rs1use anyhow::Result;
9
/// Highest code point in the Unicode code space (U+10FFFF).
pub const UNICODE_MAX: u32 = 0x10_FFFF;
/// Largest value representable in 31 bits (`0x7FFFFFFF`).
///
/// NOTE(review): presumably the upper bound of the historic 31-bit UCS-4
/// code space, as opposed to the modern [`UNICODE_MAX`] — confirm against
/// callers.
pub const UNICODE_HISTORIC_MAX: u32 = 0x7FFF_FFFF;

/// Merges UTF-16 surrogate pairs found in `input` into single code points.
///
/// Every item is widened to `u32`. A high surrogate (`0xD800..=0xDBFF`) that
/// is immediately followed by a low surrogate (`0xDC00..=0xDFFF`) is replaced
/// by the supplementary-plane code point the pair encodes. Every other value,
/// including surrogates that have no partner, is copied through unchanged.
fn combine_surrogates<I>(input: I) -> Vec<u32>
where
    I: IntoIterator,
    I::Item: Into<u32> + Copy,
{
    let mut units = input.into_iter().peekable();
    let mut scalars = Vec::new();

    while let Some(unit) = units.next() {
        let lead: u32 = unit.into();

        // Only look ahead when the current unit could start a pair.
        let trail: Option<u32> = if (0xD800..=0xDBFF).contains(&lead) {
            units
                .peek()
                .map(|&peeked| -> u32 { peeked.into() })
                .filter(|trail| (0xDC00..=0xDFFF).contains(trail))
        } else {
            None
        };

        match trail {
            Some(trail) => {
                // The peeked unit belongs to this pair, so consume it.
                units.next();
                scalars.push(0x10000 + (((lead - 0xD800) << 10) | (trail - 0xDC00)));
            }
            None => scalars.push(lead),
        }
    }

    scalars
}
65
66pub fn ucs2decode(ucs2: &[u16]) -> Vec<u32> {
69 combine_surrogates(ucs2.iter().copied())
70}
71
72pub fn unpaired_surrogates_to_scalars(surrogates: &[u32]) -> Vec<u32> {
75 combine_surrogates(surrogates.iter().copied())
76}
77
78pub fn ucs2encode(array: &[u32]) -> Result<Vec<u16>> {
82 let mut output = Vec::new();
83 for &value in array {
84 if value > 0x10FFFF {
85 return Err(anyhow::anyhow!("Invalid codepoint: {value}"));
86 } else if value > 0xFFFF {
87 let mut v = value - 0x10000;
88 output.push(
89 u16::try_from((v >> 10) & 0x3FF | 0xD800)
90 .expect("Failed to create u16"),
91 );
92 v = 0xDC00 | (v & 0x3FF);
93 output.push(u16::try_from(v).expect("Failed to create u16"));
94 } else if value <= 0x10FFFF {
95 output.push(u16::try_from(value).expect("Failed to create u16"));
96 }
97 }
98 Ok(output)
99}
100
101pub fn scalars_to_unpaired_surrogates(codepoints: &[u32]) -> Result<Vec<u32>> {
105 let mut result = Vec::new();
106 for &cp in codepoints {
107 if cp <= 0xFFFF {
108 result.push(cp);
109 } else if cp <= 0x10FFFF {
110 let scalar = cp - 0x10000;
111 let high = 0xD800 + ((scalar >> 10) & 0x3FF);
112 let low = 0xDC00 + (scalar & 0x3FF);
113 result.push(high);
114 result.push(low);
115 } else {
116 return Err(anyhow::anyhow!("Invalid codepoint: {cp}"));
117 }
118 }
119 Ok(result)
120}
121
122pub fn scalars_to_string_lossy(scalars: &[u32]) -> String {
123 combine_surrogates(scalars.to_vec())
124 .iter()
125 .map(|&cp| char::from_u32(cp).unwrap_or('\u{FFFD}'))
126 .collect()
127}
128
/// Returns the Unicode scalar value of every `char` in `s`, in order.
pub fn string_to_scalars(s: &str) -> Vec<u32> {
    let mut scalars = Vec::new();
    scalars.extend(s.chars().map(u32::from));
    scalars
}
132
/// Returns up to `len` UTF-16 code units of `input`, starting at unit index
/// `start`.
///
/// Both `start` and `len` are counted in UTF-16 code units, not in `char`s or
/// bytes, so the result may begin or end in the middle of a surrogate pair.
/// A `start` at or beyond the end of the encoded input yields an empty
/// vector, and a `len` that runs past the end is clamped.
pub fn js_like_slice_utf16(input: &str, start: usize, len: usize) -> Vec<u16> {
    // Slice lazily on the encoder instead of materialising the whole
    // encoding and copying a sub-range out of it.
    input.encode_utf16().skip(start).take(len).collect()
}
146
/// Serialises 16-bit units as little-endian bytes.
///
/// Each input unit contributes two bytes to the output: low byte first,
/// then high byte.
pub fn u16_vec_to_le_bytes(v: &[u16]) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(v.len() * 2);
    for unit in v {
        // The standard library does the split infallibly, so no manual
        // masking/shifting or panicking conversions are needed.
        bytes.extend_from_slice(&unit.to_le_bytes());
    }
    bytes
}
156
#[cfg(test)]
mod tests {
    use super::*;

    /// Surrogate halves stored as separate `u32` values are merged pairwise.
    #[crate::ctb_test]
    fn test_unpaired_surrogates_to_scalars() {
        assert_eq!(
            unpaired_surrogates_to_scalars(&[0xD800, 0xDC00, 0xD800, 0xDC01]),
            vec![0x10000, 0x10001]
        );
    }

    /// Supplementary-plane scalars expand to high/low surrogate values.
    #[crate::ctb_test]
    fn test_scalars_to_unpaired_surrogates() {
        let expanded = scalars_to_unpaired_surrogates(&[0x10000, 0x10001])
            .expect("Failed to convert scalars to unpaired surrogates");
        assert_eq!(expanded, vec![0xD800, 0xDC00, 0xD800, 0xDC01]);
    }

    /// A surrogate pair between two BMP units decodes to one code point.
    #[crate::ctb_test]
    fn test_ucs2decode_basic() {
        let decoded = ucs2decode(&[0x0041, 0xD800, 0xDC00, 0x0042]);
        assert_eq!(decoded, vec![0x0041, 0x10000, 0x0042]);
    }

    /// A supplementary-plane code point encodes to a surrogate pair.
    #[crate::ctb_test]
    fn test_ucs2encode_basic() {
        let encoded = ucs2encode(&[0x0041, 0x10000, 0x0042]).unwrap();
        assert_eq!(encoded, vec![0x0041, 0xD800, 0xDC00, 0x0042]);
    }

    /// Values above U+10FFFF are rejected by the encoder.
    #[crate::ctb_test]
    fn test_ucs2encode_invalid() {
        assert!(ucs2encode(&[0x110000]).is_err());
    }

    /// Values above U+10FFFF are rejected when expanding to surrogates.
    #[crate::ctb_test]
    fn test_scalars_to_unpaired_surrogates_invalid() {
        assert!(scalars_to_unpaired_surrogates(&[0x110000]).is_err());
    }

    /// Slicing counts UTF-16 units, so it can cut a surrogate pair in half.
    #[crate::ctb_test]
    fn test_js_like_slice_utf16() {
        let sliced = js_like_slice_utf16("a𠜎", 0, 2);
        assert_eq!(sliced, vec![0x61, 0xD841]);
    }

    /// Each unit is written low byte first.
    #[crate::ctb_test]
    fn test_u16_vec_to_le_bytes() {
        let bytes = u16_vec_to_le_bytes(&[0x61, 0xD841]);
        assert_eq!(bytes, vec![0x61, 0x00, 0x41, 0xD8]);
    }
}