ctoolbox/formats/
utf8.rs

1use anyhow::{Result, anyhow, bail};
2
/// UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER, the character substituted
/// for ill-formed input during lossy decoding.
pub const UTF8_REPLACEMENT_CHARACTER: &[u8; 3] = b"\xEF\xBF\xBD";
4
5/// Return a character (or for invalid part of the string, the replacement
6/// character) of a UTF-8 string as bytes, as well as the number of input bytes
7/// consumed. For valid UTF-8 input, the number of input bytes will always match
8/// the number of output bytes.
9pub fn first_char_of_utf8_string(bytes: &[u8]) -> Result<(Vec<u8>, usize)> {
10    let (bytes, (consumed, valid)) = _first_char_of_utf8_string(bytes, true)?;
11    Ok((bytes, consumed))
12}
13
14/// Return a character or invalid part of a UTF-8 string as bytes, as well as
15/// the number of input bytes consumed.
16pub fn first_char_of_utf8_string_lossless(
17    bytes: &[u8],
18) -> Result<(Vec<u8>, (usize, bool))> {
19    _first_char_of_utf8_string(bytes, false)
20}
21
22fn _first_char_of_utf8_string(
23    bytes: &[u8],
24    replace_invalid: bool,
25) -> Result<(Vec<u8>, (usize, bool))> {
26    // This is inefficient because it operates on the WHOLE string, meaning it
27    // will decode all valid characters in a single chunk, then all but the
28    // first is discarded. Could possibly trim the string to the maximum length
29    // of a valid character to make it faster, but I don't know if that might
30    // have side effects on how many replacement characters would be returned
31    // compared to using String from_utf8_lossy(). It could also be made more efficient by returning a Cow slice into the input buffer instead of a Vec.
32    if bytes.is_empty() {
33        return Err(anyhow!("Empty input in first_char_of_utf8_string"));
34    }
35    let mut iter = bytes.utf8_chunks();
36    let chunk = iter.next().ok_or_else(|| {
37        anyhow!("At least some chunk should be found for non-empty string")
38    })?;
39
40    let valid = chunk.valid();
41    if valid.is_empty() {
42        let invalid = chunk.invalid();
43        if !invalid.is_empty() {
44            if replace_invalid {
45                // Return replacement character for invalid sequence
46                return Ok((vec![0xEF, 0xBF, 0xBD], (invalid.len(), false)));
47            }
48            return Ok((invalid.to_vec(), (invalid.len(), false)));
49        }
50    } else {
51        let out = &mut [0u8; 4];
52        let first_char_len =
53            valid.chars().next().unwrap().encode_utf8(out).len();
54
55        return Ok((out[..first_char_len].to_vec(), (first_char_len, true)));
56    }
57
58    bail!("Chunk contained neither valid nor invalid data")
59}
60
61pub fn utf8_from_scalar(cp: u32) -> Result<Vec<u8>> {
62    if cp > 0x10FFFF {
63        bail!("Invalid Unicode codepoint U+{cp:X}");
64    }
65    let mut buf = [0u8; 4];
66    let s = char::from_u32(cp)
67        .ok_or_else(|| anyhow!("Invalid Unicode codepoint U+{cp:X}"))?
68        .encode_utf8(&mut buf);
69    Ok(s.as_bytes().to_vec())
70}
71
#[cfg(test)]
mod tests {
    use super::*;

    // --- UTF8_REPLACEMENT_CHARACTER ---

    #[crate::ctb_test]
    fn test_replacement_character_constant() {
        assert_eq!(
            std::str::from_utf8(UTF8_REPLACEMENT_CHARACTER).unwrap(),
            "\u{FFFD}"
        );
    }

    // --- first_char_of_utf8_string (lossy) ---

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_ascii() {
        let input = b"hello";
        let (ch, consumed) = first_char_of_utf8_string(input).unwrap();
        assert_eq!(ch, vec![b'h']);
        assert_eq!(consumed, 1);
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_multibyte() {
        let input = "éclair".as_bytes();
        let (ch, consumed) = first_char_of_utf8_string(input).unwrap();
        assert_eq!(ch, "é".as_bytes());
        assert_eq!(consumed, "é".len());
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_astral() {
        let input = "🥴test".as_bytes();
        let (ch, consumed) = first_char_of_utf8_string(input).unwrap();
        assert_eq!(ch, "🥴".as_bytes());
        assert_eq!(consumed, "🥴".len());
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_long_input() {
        // The result must not depend on how much input follows the character.
        let input = "é".repeat(1000);
        let (ch, consumed) =
            first_char_of_utf8_string(input.as_bytes()).unwrap();
        assert_eq!(ch, "é".as_bytes());
        assert_eq!(consumed, "é".len());
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_invalid() {
        let input = &[0xFF, 0x61, 0x62];
        let (ch, consumed) = first_char_of_utf8_string(input).unwrap();
        assert_eq!(ch, vec![0xEF, 0xBF, 0xBD]); // UTF-8 replacement character
        assert_eq!(consumed, 1);
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_overlong() {
        let input = &[0xC1, 0x81];
        let (ch, consumed) = first_char_of_utf8_string(input).unwrap();
        assert_eq!(ch, vec![0xEF, 0xBF, 0xBD]); // UTF-8 replacement character
        assert_eq!(consumed, 1);
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_truncated() {
        // A three-byte sequence cut off by the end of input is consumed as a
        // single ill-formed unit.
        let input = &[0xE2, 0x80];
        let (ch, consumed) = first_char_of_utf8_string(input).unwrap();
        assert_eq!(ch, vec![0xEF, 0xBF, 0xBD]); // UTF-8 replacement character
        assert_eq!(consumed, 2);
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_truncated_astral() {
        // First three bytes of a four-byte character, then end of input.
        let input = &[0xF0, 0x9F, 0xA5];
        let (ch, consumed) = first_char_of_utf8_string(input).unwrap();
        assert_eq!(ch, vec![0xEF, 0xBF, 0xBD]); // UTF-8 replacement character
        assert_eq!(consumed, 3);
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_partly_invalid() {
        let input = &[0xE2, 0x80, 0xA9, 0xFF, 0x61, 0x62];
        let (ch, consumed) = first_char_of_utf8_string(input).unwrap();
        assert_eq!(ch, vec![0xE2, 0x80, 0xA9]);
        assert_eq!(consumed, 3);
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_empty() {
        let input = b"";
        let result = first_char_of_utf8_string(input);
        assert!(result.is_err());
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_only_invalid() {
        let input = &[0xFF, 0xFE];
        let (ch, consumed) = first_char_of_utf8_string(input).unwrap();
        assert_eq!(ch, vec![0xEF, 0xBF, 0xBD]); // UTF-8 replacement character
        assert_eq!(consumed, 1);
    }

    // --- first_char_of_utf8_string_lossless ---

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_lossless_valid() {
        let input = &[0x61, 0x62];
        let (ch, (consumed, valid)) =
            first_char_of_utf8_string_lossless(input).unwrap();
        assert_eq!(ch, vec![0x61]);
        assert_eq!(consumed, 1);
        assert!(valid);
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_lossless_multibyte() {
        let input = "🥴test".as_bytes();
        let (ch, (consumed, valid)) =
            first_char_of_utf8_string_lossless(input).unwrap();
        assert_eq!(ch, "🥴".as_bytes());
        assert_eq!(consumed, "🥴".len());
        assert!(valid);
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_lossless_invalid() {
        let input = &[0xFF, 0x61, 0x62];
        let (ch, (consumed, valid)) =
            first_char_of_utf8_string_lossless(input).unwrap();
        // Lossless mode hands back the offending byte itself, not U+FFFD.
        assert_eq!(ch, vec![0xFF]);
        assert_eq!(consumed, 1);
        assert!(!valid);
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_lossless_only_invalid() {
        let input = &[0xFF, 0xFE];
        let (ch, (consumed, valid)) =
            first_char_of_utf8_string_lossless(input).unwrap();
        assert_eq!(ch, vec![0xFF]);
        assert_eq!(consumed, 1);
        assert!(!valid);
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_lossless_truncated() {
        let input = &[0xE2, 0x80];
        let (ch, (consumed, valid)) =
            first_char_of_utf8_string_lossless(input).unwrap();
        assert_eq!(ch, vec![0xE2, 0x80]);
        assert_eq!(consumed, 2);
        assert!(!valid);
    }

    #[crate::ctb_test]
    fn test_first_char_of_utf8_string_lossless_empty() {
        let input = b"";
        let result = first_char_of_utf8_string_lossless(input);
        assert!(result.is_err());
    }

    // --- utf8_from_scalar ---

    #[crate::ctb_test]
    fn test_utf8_from_scalar_ascii() {
        assert_eq!(utf8_from_scalar(0x41).unwrap(), vec![0x41]);
    }

    #[crate::ctb_test]
    fn test_utf8_from_scalar_multibyte() {
        // U+20AC EURO SIGN
        assert_eq!(utf8_from_scalar(0x20AC).unwrap(), vec![0xE2, 0x82, 0xAC]);
    }

    #[crate::ctb_test]
    fn test_utf8_from_scalar_max() {
        assert_eq!(
            utf8_from_scalar(0x10FFFF).unwrap(),
            vec![0xF4, 0x8F, 0xBF, 0xBF]
        );
    }

    #[crate::ctb_test]
    fn test_utf8_from_scalar_surrogate() {
        assert!(utf8_from_scalar(0xD800).is_err());
        assert!(utf8_from_scalar(0xDFFF).is_err());
    }

    #[crate::ctb_test]
    fn test_utf8_from_scalar_out_of_range() {
        assert!(utf8_from_scalar(0x110000).is_err());
        assert!(utf8_from_scalar(u32::MAX).is_err());
    }
}