1use anyhow::{Result, anyhow, bail};
2
/// UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER (bytes `EF BF BD`).
pub const UTF8_REPLACEMENT_CHARACTER: &[u8; 3] = b"\xEF\xBF\xBD";
4
5pub fn first_char_of_utf8_string(bytes: &[u8]) -> Result<(Vec<u8>, usize)> {
10 let (bytes, (consumed, valid)) = _first_char_of_utf8_string(bytes, true)?;
11 Ok((bytes, consumed))
12}
13
14pub fn first_char_of_utf8_string_lossless(
17 bytes: &[u8],
18) -> Result<(Vec<u8>, (usize, bool))> {
19 _first_char_of_utf8_string(bytes, false)
20}
21
22fn _first_char_of_utf8_string(
23 bytes: &[u8],
24 replace_invalid: bool,
25) -> Result<(Vec<u8>, (usize, bool))> {
26 if bytes.is_empty() {
33 return Err(anyhow!("Empty input in first_char_of_utf8_string"));
34 }
35 let mut iter = bytes.utf8_chunks();
36 let chunk = iter.next().ok_or_else(|| {
37 anyhow!("At least some chunk should be found for non-empty string")
38 })?;
39
40 let valid = chunk.valid();
41 if valid.is_empty() {
42 let invalid = chunk.invalid();
43 if !invalid.is_empty() {
44 if replace_invalid {
45 return Ok((vec![0xEF, 0xBF, 0xBD], (invalid.len(), false)));
47 }
48 return Ok((invalid.to_vec(), (invalid.len(), false)));
49 }
50 } else {
51 let out = &mut [0u8; 4];
52 let first_char_len =
53 valid.chars().next().unwrap().encode_utf8(out).len();
54
55 return Ok((out[..first_char_len].to_vec(), (first_char_len, true)));
56 }
57
58 bail!("Chunk contained neither valid nor invalid data")
59}
60
61pub fn utf8_from_scalar(cp: u32) -> Result<Vec<u8>> {
62 if cp > 0x10FFFF {
63 bail!("Invalid Unicode codepoint U+{cp:X}");
64 }
65 let mut buf = [0u8; 4];
66 let s = char::from_u32(cp)
67 .ok_or_else(|| anyhow!("Invalid Unicode codepoint U+{cp:X}"))?
68 .encode_utf8(&mut buf);
69 Ok(s.as_bytes().to_vec())
70}
71
#[cfg(test)]
mod tests {
    //! Unit tests for the first-character helpers, the replacement-character
    //! constant and `utf8_from_scalar`.

    use super::*;

    // --- first_char_of_utf8_string (replacing variant) ---

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_ascii() {
        let input = b"hello";
        let (ch, consumed) = first_char_of_utf8_string(input).unwrap();
        assert_eq!(ch, vec![b'h']);
        assert_eq!(consumed, 1);
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_multibyte() {
        let input = "éclair".as_bytes();
        let (ch, consumed) = first_char_of_utf8_string(input).unwrap();
        assert_eq!(ch, "é".as_bytes());
        assert_eq!(consumed, "é".len());
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_astral() {
        let input = "🥴test".as_bytes();
        let (ch, consumed) = first_char_of_utf8_string(input).unwrap();
        assert_eq!(ch, "🥴".as_bytes());
        assert_eq!(consumed, "🥴".len());
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_invalid() {
        let input = &[0xFF, 0x61, 0x62];
        let (ch, consumed) = first_char_of_utf8_string(input).unwrap();
        assert_eq!(ch, vec![0xEF, 0xBF, 0xBD]);
        assert_eq!(consumed, 1);
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_overlong() {
        // 0xC1 can never start a well-formed sequence, so only that single
        // byte is reported as the invalid run.
        let input = &[0xC1, 0x81];
        let (ch, consumed) = first_char_of_utf8_string(input).unwrap();
        assert_eq!(ch, vec![0xEF, 0xBF, 0xBD]);
        assert_eq!(consumed, 1);
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_truncated() {
        // A three-byte sequence cut short by end of input is consumed as one
        // invalid run of two bytes.
        let input = &[0xE2, 0x80];
        let (ch, consumed) = first_char_of_utf8_string(input).unwrap();
        assert_eq!(ch, vec![0xEF, 0xBF, 0xBD]);
        assert_eq!(consumed, 2);
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_partly_invalid() {
        let input = &[0xE2, 0x80, 0xA9, 0xFF, 0x61, 0x62];
        let (ch, consumed) = first_char_of_utf8_string(input).unwrap();
        assert_eq!(ch, vec![0xE2, 0x80, 0xA9]);
        assert_eq!(consumed, 3);
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_empty() {
        let input = b"";
        let result = first_char_of_utf8_string(input);
        assert!(result.is_err());
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_only_invalid() {
        let input = &[0xFF, 0xFE];
        let (ch, consumed) = first_char_of_utf8_string(input).unwrap();
        assert_eq!(ch, vec![0xEF, 0xBF, 0xBD]);
        assert_eq!(consumed, 1);
    }

    // --- first_char_of_utf8_string_lossless ---

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_lossless_valid() {
        let input = &[0x61, 0x62];
        let (ch, (consumed, valid)) =
            first_char_of_utf8_string_lossless(input).unwrap();
        assert_eq!(ch, vec![0x61]);
        assert_eq!(consumed, 1);
        assert!(valid);
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_lossless_multibyte() {
        let input = "🥴test".as_bytes();
        let (ch, (consumed, valid)) =
            first_char_of_utf8_string_lossless(input).unwrap();
        assert_eq!(ch, "🥴".as_bytes());
        assert_eq!(consumed, "🥴".len());
        assert!(valid);
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_lossless_invalid() {
        let input = &[0xFF, 0x61, 0x62];
        let (ch, (consumed, valid)) =
            first_char_of_utf8_string_lossless(input).unwrap();
        assert_eq!(ch, vec![0xFF]);
        assert_eq!(consumed, 1);
        assert!(!valid);
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_lossless_only_invalid() {
        let input = &[0xFF, 0xFE];
        let (ch, (consumed, valid)) =
            first_char_of_utf8_string_lossless(input).unwrap();
        assert_eq!(ch, vec![0xFF]);
        assert_eq!(consumed, 1);
        assert!(!valid);
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_lossless_truncated() {
        let input = &[0xE2, 0x80];
        let (ch, (consumed, valid)) =
            first_char_of_utf8_string_lossless(input).unwrap();
        assert_eq!(ch, vec![0xE2, 0x80]);
        assert_eq!(consumed, 2);
        assert!(!valid);
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_lossless_empty() {
        let input = b"";
        assert!(first_char_of_utf8_string_lossless(input).is_err());
    }

    // --- UTF8_REPLACEMENT_CHARACTER ---

    #[crate::ctb_test]
    fn test_utf8_replacement_character_is_u_fffd() {
        assert_eq!(
            std::str::from_utf8(UTF8_REPLACEMENT_CHARACTER).unwrap(),
            "\u{FFFD}"
        );
    }

    // --- utf8_from_scalar ---

    #[crate::ctb_test]
    fn test_utf8_from_scalar_ascii() {
        assert_eq!(utf8_from_scalar(0x41).unwrap(), vec![0x41]);
    }

    #[crate::ctb_test]
    fn test_utf8_from_scalar_multibyte() {
        assert_eq!(utf8_from_scalar(0xE9).unwrap(), vec![0xC3, 0xA9]);
        assert_eq!(utf8_from_scalar(0x20AC).unwrap(), vec![0xE2, 0x82, 0xAC]);
        assert_eq!(
            utf8_from_scalar(0x10FFFF).unwrap(),
            vec![0xF4, 0x8F, 0xBF, 0xBF]
        );
    }

    #[crate::ctb_test]
    fn test_utf8_from_scalar_surrogate_rejected() {
        assert!(utf8_from_scalar(0xD800).is_err());
        assert!(utf8_from_scalar(0xDFFF).is_err());
    }

    #[crate::ctb_test]
    fn test_utf8_from_scalar_out_of_range_rejected() {
        assert!(utf8_from_scalar(0x110000).is_err());
        assert!(utf8_from_scalar(u32::MAX).is_err());
    }
}