ctoolbox/formats/eite/encoding/
utf8.rs

1use anyhow::{Result, anyhow, bail};
2
3use crate::formats::utf8::first_char_of_utf8_string;
4
5/// Convert a UTF-8 byte slice into a vector of Unicode scalar codepoints (as i32),
6/// analogous to utf8CharArrayFromByteArray in the original code.
7pub fn utf8_char_array_from_byte_array(bytes: &[u8]) -> Result<Vec<u32>> {
8    let mut out: Vec<u8> = Vec::new();
9    let mut i = 0;
10    while i < bytes.len() {
11        let (mut temp, consumed) = first_char_of_utf8_string(&bytes[i..])?;
12        out.append(&mut temp);
13        i += consumed;
14    }
15    Ok(String::from_utf8(out)
16        .expect("first_char_of_utf8_string should produce valid UTF-8")
17        .chars()
18        .map(u32::from)
19        .collect())
20}
21
22/// Encode an array of Unicode codepoints (as u32) into UTF-8 bytes.
23/// (byteArrayFromUtf8CharArray in original.)
24pub fn byte_array_from_utf8_char_array(codepoints: &[u32]) -> Result<Vec<u8>> {
25    let mut out = Vec::new();
26    for &cp in codepoints {
27        if let Some(ch) = std::char::from_u32(cp) {
28            let mut buf = [0u8; 4];
29            let encoded = ch.encode_utf8(&mut buf);
30            out.extend_from_slice(encoded.as_bytes());
31        } else {
32            return Err(anyhow!("Invalid Unicode scalar value: {cp}"));
33        }
34    }
35    Ok(out)
36}
37
38pub fn unicode_scalar_from_utf8(bytes: &[u8]) -> Result<u32> {
39    let (codepoint, len) = first_utf8_codepoint(bytes)?;
40
41    if len > bytes.len() {
42        Err(anyhow!("This function is for a single character"))
43    } else {
44        Ok(codepoint)
45    }
46}
47
48/// Helper: decode first UTF-8 codepoint (or raw byte) returning (codepoint, `byte_len`).
49pub fn first_utf8_codepoint(bytes: &[u8]) -> Result<(u32, usize)> {
50    if bytes.is_empty() {
51        return Ok((0, 0));
52    }
53    // Try valid UTF-8 for at least the first char.
54    for end in 1..=bytes.len().min(4) {
55        if let Ok(s) = std::str::from_utf8(&bytes[..end]) {
56            if let Some(ch) = s.chars().next() {
57                return Ok((u32::from(ch), ch.len_utf8()));
58            }
59        }
60    }
61    // Fallback: treat first byte as standalone.
62    bail!("Invalid UTF-8 sequence")
63}
64
/// Helper: decode the final UTF-8 codepoint of `bytes`, returning
/// (codepoint, `byte_len`).
///
/// Empty input yields `(0, 0)`. When none of the trailing one to four bytes
/// form valid UTF-8, the last byte itself is returned with a length of 1.
pub fn last_utf8_codepoint(bytes: &[u8]) -> (u32, usize) {
    let final_byte = match bytes.last() {
        Some(&byte) => byte,
        None => return (0, 0),
    };
    let total = bytes.len();
    // Try suffixes from shortest (1 byte) to longest (4 bytes); the shortest
    // suffix that validates holds exactly one character.
    let earliest_start = total.saturating_sub(4);
    (earliest_start..total)
        .rev()
        .find_map(|start| std::str::from_utf8(&bytes[start..]).ok()?.chars().next())
        .map_or((u32::from(final_byte), 1), |ch| {
            (u32::from(ch), ch.len_utf8())
        })
}
82
#[cfg(test)]
mod tests {
    use const_default::ConstDefault;

    use crate::formats::eite::formats::utf8::{
        UTF8FormatSettings, dca_from_utf8, dca_to_utf8,
    };
    use crate::formats::{
        assert_vec_u8_ok_eq_no_warnings, assert_vec_u32_ok_eq_no_warnings,
    };
    use crate::utilities::{assert_vec_u8_ok_eq, assert_vec_u32_ok_eq};

    use super::*;

    // Default UTF-8 format settings used by the format conversion test.
    const SETTINGS: UTF8FormatSettings =
        <UTF8FormatSettings as ConstDefault>::DEFAULT;

    // Decoding then re-encoding text with 1-, 2- and 4-byte characters must
    // reproduce the original bytes.
    #[crate::ctb_test]
    fn test_utf8_char_array_conversion() {
        let original = "hé🙂".as_bytes();
        let codepoints =
            utf8_char_array_from_byte_array(original).expect("decode cps");
        let round_tripped =
            byte_array_from_utf8_char_array(&codepoints).expect("encode bytes");
        assert_eq!(round_tripped, original);
    }

    // FIXME (carried over from the original code): update tests for the new
    // remainder character format.
    #[crate::ctb_test]
    fn test_format_utf8_conversions() {
        // dcaFromUtf8([ 49, 32, 50 ]) -> [ 35, 18, 36 ]
        let dcs = dca_from_utf8(&[49, 32, 50], &SETTINGS);
        assert_vec_u32_ok_eq_no_warnings(&[35, 18, 36], dcs);

        // dcaToUtf8([ 35, 18, 36 ]) -> [ 49, 32, 50 ]
        let utf8 = dca_to_utf8(&[35, 18, 36], &SETTINGS);
        assert_vec_u8_ok_eq_no_warnings(&[49, 32, 50], utf8);
    }

    // The two fixtures differ only where the three bytes 226, 154, 189
    // correspond to the single codepoint 9917.
    #[crate::ctb_test]
    fn test_utf8_byte_array_conversions_work() {
        let encoded = [
            50, 53, 54, 32, 50, 53, 56, 32, 50, 54, 48, 32, 50, 54, 50, 32, 50,
            54, 52, 32, 50, 54, 51, 32, 53, 55, 32, 56, 54, 32, 57, 51, 32, 57,
            51, 32, 57, 54, 32, 51, 48, 32, 49, 56, 32, 50, 56, 54, 32, 55, 50,
            32, 57, 54, 32, 57, 57, 32, 57, 51, 32, 56, 53, 32, 50, 56, 55, 32,
            49, 57, 32, 49, 56, 32, 50, 56, 52, 32, 50, 54, 49, 32, 50, 53, 57,
            32, 35, 32, 115, 97, 121, 32, 34, 72, 101, 108, 108, 111, 44, 32,
            47, 87, 111, 114, 108, 100, 47, 33, 32, 226, 154, 189, 34, 10, 49,
            32, 50, 32, 35, 32, 226, 154, 189, 10,
        ];
        let decoded = [
            50, 53, 54, 32, 50, 53, 56, 32, 50, 54, 48, 32, 50, 54, 50, 32, 50,
            54, 52, 32, 50, 54, 51, 32, 53, 55, 32, 56, 54, 32, 57, 51, 32, 57,
            51, 32, 57, 54, 32, 51, 48, 32, 49, 56, 32, 50, 56, 54, 32, 55, 50,
            32, 57, 54, 32, 57, 57, 32, 57, 51, 32, 56, 53, 32, 50, 56, 55, 32,
            49, 57, 32, 49, 56, 32, 50, 56, 52, 32, 50, 54, 49, 32, 50, 53, 57,
            32, 35, 32, 115, 97, 121, 32, 34, 72, 101, 108, 108, 111, 44, 32,
            47, 87, 111, 114, 108, 100, 47, 33, 32, 9917, 34, 10, 49, 32, 50,
            32, 35, 32, 9917, 10,
        ];

        // utf8CharArrayFromByteArray
        let actual_codepoints = utf8_char_array_from_byte_array(&encoded);
        assert_vec_u32_ok_eq(&decoded, actual_codepoints);

        // byteArrayFromUtf8CharArray (round trip)
        let actual_bytes = byte_array_from_utf8_char_array(&decoded);
        assert_vec_u8_ok_eq(&encoded, actual_bytes);
    }
}