1use anyhow::{Result, anyhow, bail};
2
3use crate::formats::utf8::first_char_of_utf8_string;
4
5pub fn utf8_char_array_from_byte_array(bytes: &[u8]) -> Result<Vec<u32>> {
8 let mut out: Vec<u8> = Vec::new();
9 let mut i = 0;
10 while i < bytes.len() {
11 let (mut temp, consumed) = first_char_of_utf8_string(&bytes[i..])?;
12 out.append(&mut temp);
13 i += consumed;
14 }
15 Ok(String::from_utf8(out)
16 .expect("first_char_of_utf8_string should produce valid UTF-8")
17 .chars()
18 .map(u32::from)
19 .collect())
20}
21
22pub fn byte_array_from_utf8_char_array(codepoints: &[u32]) -> Result<Vec<u8>> {
25 let mut out = Vec::new();
26 for &cp in codepoints {
27 if let Some(ch) = std::char::from_u32(cp) {
28 let mut buf = [0u8; 4];
29 let encoded = ch.encode_utf8(&mut buf);
30 out.extend_from_slice(encoded.as_bytes());
31 } else {
32 return Err(anyhow!("Invalid Unicode scalar value: {cp}"));
33 }
34 }
35 Ok(out)
36}
37
38pub fn unicode_scalar_from_utf8(bytes: &[u8]) -> Result<u32> {
39 let (codepoint, len) = first_utf8_codepoint(bytes)?;
40
41 if len > bytes.len() {
42 Err(anyhow!("This function is for a single character"))
43 } else {
44 Ok(codepoint)
45 }
46}
47
48pub fn first_utf8_codepoint(bytes: &[u8]) -> Result<(u32, usize)> {
50 if bytes.is_empty() {
51 return Ok((0, 0));
52 }
53 for end in 1..=bytes.len().min(4) {
55 if let Ok(s) = std::str::from_utf8(&bytes[..end]) {
56 if let Some(ch) = s.chars().next() {
57 return Ok((u32::from(ch), ch.len_utf8()));
58 }
59 }
60 }
61 bail!("Invalid UTF-8 sequence")
63}
64
/// Decodes the last UTF-8 character of `bytes`.
///
/// Returns the character's Unicode scalar value together with the number of
/// bytes its UTF-8 encoding occupies. An empty slice yields `(0, 0)`.
///
/// This function never fails: when none of the trailing suffixes of up to
/// four bytes is valid UTF-8, the raw final byte is returned as the value
/// with a length of 1.
pub fn last_utf8_codepoint(bytes: &[u8]) -> (u32, usize) {
    let final_byte = match bytes.last() {
        Some(&byte) => byte,
        None => return (0, 0),
    };

    let total = bytes.len();
    // A character is at most four bytes long, so only the last four bytes can
    // belong to the final character.
    let earliest_start = total.saturating_sub(4);

    // Walk backwards from the shortest suffix; the first suffix that
    // validates starts exactly at the final character.
    (earliest_start..total)
        .rev()
        .find_map(|start| {
            let text = std::str::from_utf8(&bytes[start..]).ok()?;
            let ch = text.chars().next()?;
            Some((u32::from(ch), ch.len_utf8()))
        })
        .unwrap_or((u32::from(final_byte), 1))
}
82
#[cfg(test)]
mod tests {
    // Unit tests for the UTF-8 byte <-> code point helpers in this module and
    // for the `dca_from_utf8` / `dca_to_utf8` conversions they sit next to.
    use const_default::ConstDefault;

    use crate::formats::eite::formats::utf8::UTF8FormatSettings;
    use crate::formats::eite::formats::utf8::{dca_from_utf8, dca_to_utf8};
    use crate::formats::{
        assert_vec_u8_ok_eq_no_warnings, assert_vec_u32_ok_eq_no_warnings,
    };
    use crate::utilities::{assert_vec_u8_ok_eq, assert_vec_u32_ok_eq};

    use super::*;

    // Default format settings, shared by every conversion under test.
    const SETTINGS: UTF8FormatSettings =
        <UTF8FormatSettings as ConstDefault>::DEFAULT;

    // Round trip: decoding a string's bytes to code points and encoding them
    // again must reproduce the original bytes. The sample mixes one-, two-
    // and four-byte characters.
    #[crate::ctb_test]
    fn test_utf8_char_array_conversion() {
        let s = "hé🙂";
        let bytes = s.as_bytes();
        let cps = utf8_char_array_from_byte_array(bytes).expect("decode cps");
        let re = byte_array_from_utf8_char_array(&cps).expect("encode bytes");
        assert_eq!(re, bytes);
    }

    // Checks both directions of the document conversion on a tiny input.
    #[crate::ctb_test]
    fn test_format_utf8_conversions() {
        assert_vec_u32_ok_eq_no_warnings(
            // The input bytes 49, 32, 50 are the ASCII text `1 2`.
            // NOTE(review): 35, 18, 36 are presumably the matching Dc IDs;
            // confirm against the Dc table.
            &[35, 18, 36],
            dca_from_utf8(&[49, 32, 50], &SETTINGS),
        );

        assert_vec_u8_ok_eq_no_warnings(
            // Reverse direction: the same values must map back to `1 2`.
            &[49, 32, 50],
            dca_to_utf8(&[35, 18, 36], &SETTINGS),
        );
    }

    // Decodes and re-encodes a longer fixture: ASCII digits, spaces and text
    // plus two occurrences of the three-byte sequence 226, 154, 189, which is
    // the UTF-8 encoding of U+26BD (decimal 9917).
    #[crate::ctb_test]
    fn test_utf8_byte_array_conversions_work() {
        let utf8_bytes = [
            // Raw UTF-8 bytes; everything except the two 226, 154, 189
            // triples is a single-byte ASCII character.
            50, 53, 54, 32, 50, 53, 56, 32, 50, 54, 48, 32, 50, 54, 50, 32, 50,
            54, 52, 32, 50, 54, 51, 32, 53, 55, 32, 56, 54, 32, 57, 51, 32, 57,
            51, 32, 57, 54, 32, 51, 48, 32, 49, 56, 32, 50, 56, 54, 32, 55, 50,
            32, 57, 54, 32, 57, 57, 32, 57, 51, 32, 56, 53, 32, 50, 56, 55, 32,
            49, 57, 32, 49, 56, 32, 50, 56, 52, 32, 50, 54, 49, 32, 50, 53, 57,
            32, 35, 32, 115, 97, 121, 32, 34, 72, 101, 108, 108, 111, 44, 32,
            47, 87, 111, 114, 108, 100, 47, 33, 32, 226, 154, 189, 34, 10, 49,
            32, 50, 32, 35, 32, 226, 154, 189, 10,
        ];
        // Same content as code points: each three-byte triple above collapses
        // into the single value 9917; ASCII values are unchanged.
        let expected_codepoints = [
            50, 53, 54, 32, 50, 53, 56, 32, 50, 54, 48, 32, 50, 54, 50, 32, 50,
            54, 52, 32, 50, 54, 51, 32, 53, 55, 32, 56, 54, 32, 57, 51, 32, 57,
            51, 32, 57, 54, 32, 51, 48, 32, 49, 56, 32, 50, 56, 54, 32, 55, 50,
            32, 57, 54, 32, 57, 57, 32, 57, 51, 32, 56, 53, 32, 50, 56, 55, 32,
            49, 57, 32, 49, 56, 32, 50, 56, 52, 32, 50, 54, 49, 32, 50, 53, 57,
            32, 35, 32, 115, 97, 121, 32, 34, 72, 101, 108, 108, 111, 44, 32,
            47, 87, 111, 114, 108, 100, 47, 33, 32, 9917, 34, 10, 49, 32, 50,
            32, 35, 32, 9917, 10,
        ];
        // Bytes -> code points.
        assert_vec_u32_ok_eq(
            &expected_codepoints,
            utf8_char_array_from_byte_array(&utf8_bytes),
        );

        // Code points -> bytes.
        assert_vec_u8_ok_eq(
            &utf8_bytes,
            byte_array_from_utf8_char_array(&expected_codepoints),
        );
    }
}