ctoolbox/formats/eite/
dc.rs

1pub mod data;
2use anyhow::{Result, anyhow, ensure};
3
4use crate::formats::base64::{
5    bytes_to_standard_base64, decimal_to_standard_base64,
6    standard_base64_to_bytes, standard_base64_to_decimal,
7};
8use crate::formats::eite::dc::data::{
9    DCDATA_BIDI_CLASS_COL, DCDATA_CASING_COL, DCDATA_COMBINING_CLASS_COL,
10    DCDATA_COMPLEX_TRAITS_COL, DCDATA_DESCRIPTION_COL, DCDATA_NAME_COL,
11    DCDATA_SCRIPT_COL, DCDATA_TYPE_COL, dc_data_lookup_by_id,
12    dc_dataset_length, is_dc_dataset,
13};
14use crate::formats::eite::util::string::substring_bug_compatible;
15
/// Replacement for incoming character with value not mapped to a Dc
pub const DC_REPLACEMENT_UNAVAIL_DC: u32 = 207;

/// Replacement for incoming character with value unknown or unrepresentable in Unicode
pub const DC_REPLACEMENT_UNAVAIL_UNICODE: u32 = 206;

// NOTE(review): judging by the name, this Dc marks the following Dc as
// escaped; nothing in this file uses it, so confirm against the Dc dataset.
pub const DC_ESCAPE_NEXT: u32 = 255;

/// Dc that opens a UTF-8 encapsulation; same value that
/// `bytes_as_dc_encapsulated_utf8` emits before the encoded payload.
pub const DC_START_ENCAPSULATION_UTF8: u32 = 191;
/// Dc that closes a UTF-8 encapsulation; same value that
/// `bytes_as_dc_encapsulated_utf8` emits after the encoded payload.
pub const DC_END_ENCAPSULATION_UTF8: u32 = 192;

/// Dc that opens a binary encapsulation; same value that
/// `bytes_to_dc_encapsulated_binary` emits before the encoded payload.
pub const DC_START_ENCAPSULATION_BINARY: u32 = 203;
/// Dc that closes a binary encapsulation; same value that
/// `bytes_to_dc_encapsulated_binary` emits after the encoded payload.
pub const DC_END_ENCAPSULATION_BINARY: u32 = 204;
29
30/* ===== Dc classification & queries ===== */
31
32pub fn is_known_dc(v: u32) -> bool {
33    v <= u32::try_from(maximum_known_dc())
34        .expect("Failed to convert maximum_known_dc to u32")
35}
36
37pub fn maximum_known_dc() -> usize {
38    // JS: dcDatasetLength('DcData')
39    dc_dataset_length("DcData")
40        .checked_sub(1)
41        .expect("Failed to get maximum known Dc")
42}
43
/// Reports whether `dc` should be treated as a newline.
///
/// This is a coarse heuristic: membership in a fixed list of Dc ids copied
/// verbatim from the original implementation. No dataset lookup is involved.
pub fn dc_is_newline(dc: u32) -> bool {
    // Literal list from the original: [119,120,121,240,294,295]
    const NEWLINE_DCS: [u32; 6] = [119, 120, 121, 240, 294, 295];
    NEWLINE_DCS.contains(&dc)
}
49
50/// True if general category 'Zs'.
51pub fn dc_is_space(dc: u32) -> Result<bool> {
52    ensure!(is_known_dc(dc), "Unknown Dc {dc}");
53    Ok(dc_get_type(dc)? == "Zs")
54}
55
56/// True if printable (excludes line/para separators, categories starting with
57/// '!' or 'C').
58pub fn dc_is_printable(dc: u32) -> Result<bool> {
59    ensure!(is_known_dc(dc), "Unknown Dc {dc}");
60    let t = dc_get_type(dc)?;
61    if t == "Zl" || t == "Zp" {
62        return Ok(false);
63    }
64    let general = t.chars().next().unwrap_or(' ');
65    if general == '!' || general == 'C' {
66        return Ok(false);
67    }
68    Ok(true)
69}
70
71pub fn dc_is_el_code(dc: u32) -> Result<bool> {
72    ensure!(is_known_dc(dc), "Unknown Dc {dc}");
73    let script = dc_get_script(dc)?;
74    Ok(script.get(0..3) == Some("EL "))
75}
76
77pub fn dc_get_el_class(dc: u32) -> Result<String> {
78    ensure!(is_known_dc(dc), "Unknown Dc {dc}");
79    let script = dc_get_script(dc)?;
80    Ok(substring_bug_compatible(&script, 3, -1))
81}
82
83// ---------------------------------------------------------------------------
84// Field access
85// ---------------------------------------------------------------------------
86
87/// Generic field fetch (dataset “`DcData`”, by numeric Dc id and original JS field number).
88pub fn dc_get_field(dc: u32, field_number: usize) -> Result<String> {
89    // Pass through field_number. If storage uses 0-based, change to field_number - 1 (with checks).
90    dc_data_lookup_by_id(
91        "DcData",
92        usize::try_from(dc).expect("Could not get usize from Dc"),
93        field_number,
94    )
95    .map_err(|e| anyhow!("dc_get_field: {e}"))
96}
97
/// Name (field 1).
///
/// Reads the `DCDATA_NAME_COL` column of the `DcData` row for `dc` by
/// delegating to [`dc_get_field`].
///
/// # Errors
///
/// Propagates any error returned by [`dc_get_field`].
pub fn dc_get_name(dc: u32) -> Result<String> {
    dc_get_field(dc, DCDATA_NAME_COL)
}
102
/// Combining class (field 2).
///
/// Reads the `DCDATA_COMBINING_CLASS_COL` column of the `DcData` row for
/// `dc` by delegating to [`dc_get_field`].
///
/// # Errors
///
/// Propagates any error returned by [`dc_get_field`].
pub fn dc_get_combining_class(dc: u32) -> Result<String> {
    dc_get_field(dc, DCDATA_COMBINING_CLASS_COL)
}
107
/// Bidi class (field 3).
///
/// Reads the `DCDATA_BIDI_CLASS_COL` column of the `DcData` row for `dc` by
/// delegating to [`dc_get_field`].
///
/// # Errors
///
/// Propagates any error returned by [`dc_get_field`].
pub fn dc_get_bidi_class(dc: u32) -> Result<String> {
    dc_get_field(dc, DCDATA_BIDI_CLASS_COL)
}
112
/// Casing (field 4).
///
/// Reads the `DCDATA_CASING_COL` column of the `DcData` row for `dc` by
/// delegating to [`dc_get_field`].
///
/// # Errors
///
/// Propagates any error returned by [`dc_get_field`].
pub fn dc_get_casing(dc: u32) -> Result<String> {
    dc_get_field(dc, DCDATA_CASING_COL)
}
117
/// Type (field 5).
///
/// Reads the `DCDATA_TYPE_COL` column of the `DcData` row for `dc` by
/// delegating to [`dc_get_field`]. Other predicates in this file compare the
/// returned string against values such as `"Zs"`, `"Zl"` and `"Zp"`.
///
/// # Errors
///
/// Propagates any error returned by [`dc_get_field`].
pub fn dc_get_type(dc: u32) -> Result<String> {
    dc_get_field(dc, DCDATA_TYPE_COL)
}
122
/// Script (field 6).
///
/// Reads the `DCDATA_SCRIPT_COL` column of the `DcData` row for `dc` by
/// delegating to [`dc_get_field`]. `dc_is_el_code` tests this value for an
/// `"EL "` prefix.
///
/// # Errors
///
/// Propagates any error returned by [`dc_get_field`].
pub fn dc_get_script(dc: u32) -> Result<String> {
    dc_get_field(dc, DCDATA_SCRIPT_COL)
}
127
/// Complex traits (field 7).
///
/// Reads the `DCDATA_COMPLEX_TRAITS_COL` column of the `DcData` row for `dc`
/// by delegating to [`dc_get_field`].
///
/// # Errors
///
/// Propagates any error returned by [`dc_get_field`].
pub fn dc_get_complex_traits(dc: u32) -> Result<String> {
    dc_get_field(dc, DCDATA_COMPLEX_TRAITS_COL)
}
132
/// Description (field 8).
///
/// Reads the `DCDATA_DESCRIPTION_COL` column of the `DcData` row for `dc` by
/// delegating to [`dc_get_field`].
///
/// # Errors
///
/// Propagates any error returned by [`dc_get_field`].
pub fn dc_get_description(dc: u32) -> Result<String> {
    dc_get_field(dc, DCDATA_DESCRIPTION_COL)
}
137
/// Return length of the primary '`DcData`' dataset.
///
/// This is the raw value of `dc_dataset_length("DcData")`; `maximum_known_dc`
/// is derived from the same length by subtracting one.
pub fn get_dc_count() -> usize {
    dc_dataset_length("DcData")
}
142
143/// Extract an entire column (by field number) from a dataset.
144pub fn dc_get_column(
145    dataset: &str,
146    field_number: usize,
147) -> Result<Vec<String>> {
148    if !is_dc_dataset(dataset) {
149        return Err(anyhow!("dc_get_column: unknown dataset '{dataset}'"));
150    }
151    let len = dc_dataset_length(dataset);
152    let mut out = Vec::with_capacity(len);
153    for row in 0..len {
154        let v = dc_data_lookup_by_id(dataset, row, field_number)
155            .map_err(|e| anyhow!("dc_get_column: {e}"))?;
156        out.push(v);
157    }
158    Ok(out)
159}
160
161/// Look up a Dc (document character) mapping into a specific output format.
162///
163/// Equivalent of dcGetMappingToFormat(intDc, strFormat) in the original.
164/// Uses dataset path "mappings/to/{format}" and retrieves field 1 (second column)
165/// of the row number equal to the Dc value.
166///
167/// Returns an empty string if lookup fails (mimicking loosely the JS behavior),
168/// but logs an error via Result if the underlying dataset access errors.
169pub fn dc_get_mapping_to_format(dc: u32, format: &str) -> Result<String> {
170    let dataset = format!("mappings/to/{format}");
171    // Underlying call may error if dataset/indices are invalid:
172    match dc_data_lookup_by_id(
173        &dataset,
174        usize::try_from(dc).expect("Could not get usize from Dc"),
175        1,
176    ) {
177        Ok(s) => Ok(s),
178        Err(e) => Err(anyhow!("dc_get_mapping_to_format failed: {e}")),
179    }
180}
181
/// Reports whether `dc` may appear inside a base64-style encapsulated
/// payload: any Dc in `127..=190`, or the padding Dc `195`.
pub fn is_dc_base64_encapsulation_character(dc: u32) -> bool {
    matches!(dc, 127..=190 | 195)
}
185
/// Encodes a string as a UTF-8 encapsulated Dc sequence.
///
/// The string's UTF-8 bytes (`str::as_bytes`) are handed unchanged to
/// `bytes_as_dc_encapsulated_utf8`, whose result is returned as is.
pub fn string_to_dc_encapsulated_utf8(input: &str) -> Vec<u32> {
    bytes_as_dc_encapsulated_utf8(input.as_bytes())
}
189
190pub fn bytes_as_dc_encapsulated_utf8(input: &[u8]) -> Vec<u32> {
191    let mut out: Vec<u32> = Vec::new();
192
193    out.push(191); // Dc UTF-8 encapsulation start
194    out.append(&mut bytes_to_dc_encapsulated_raw(input));
195    out.push(192); // Dc UTF-8 encapsulation end
196
197    out
198}
199
200pub fn bytes_to_dc_encapsulated_binary(input: &[u8]) -> Vec<u32> {
201    let mut out: Vec<u32> = Vec::new();
202
203    out.push(203); // Dc binary encapsulation start
204    out.append(&mut bytes_to_dc_encapsulated_raw(input));
205    out.push(204); // Dc binary encapsulation end
206
207    out
208}
209
210pub fn bytes_to_dc_encapsulated_raw(bytes: &[u8]) -> Vec<u32> {
211    let decimal = standard_base64_to_decimal(bytes_to_standard_base64(bytes))
212        .expect("Failed to encode base64");
213
214    let mut dc_encoded: Vec<u32> = Vec::new();
215    for b64 in decimal {
216        if b64 == 64 {
217            // Padding
218            dc_encoded.push(195_u32);
219        } else {
220            dc_encoded.push((b64 + 127).into());
221        }
222    }
223
224    dc_encoded
225}
226
227pub fn dc_encapsulated_raw_to_bytes(input: &[u32]) -> Result<Vec<u8>> {
228    let mut out: Vec<u8> = Vec::new();
229
230    // let input_as_u8: Vec<u8> = input.iter().map(|&x| x as u8).collect();
231    let mut dc_decoded: Vec<u8> = Vec::new();
232    for dc in input {
233        if *dc == 195 {
234            dc_decoded.push(64);
235            continue;
236        }
237        if !is_dc_base64_encapsulation_character(*dc) {
238            return Err(anyhow!(
239                "Invalid Dc {dc} in encapsulated raw sequence"
240            ));
241        }
242        dc_decoded.push(u8::try_from(dc - 127)?);
243    }
244
245    let base64 = decimal_to_standard_base64(dc_decoded)
246        .expect("Failed to translate Dcs to base64");
247
248    out.extend_from_slice(&standard_base64_to_bytes(base64)?);
249
250    Ok(out)
251}
252
// Unit tests for the Dc classification predicates and for the
// encapsulation encode/decode helpers defined in this file.
#[cfg(test)]
mod tests {

    use crate::utilities::{assert_vec_u8_ok_eq, assert_vec_u32_eq};

    use super::*;

    // Every id in the hard-coded newline list is accepted; a neighbouring id
    // (118) is not.
    #[crate::ctb_test]
    fn test_dc_newline_list() {
        for dc in [119, 120, 121, 240, 294, 295] {
            assert!(dc_is_newline(dc));
        }
        assert!(!dc_is_newline(118));
    }

    // Dataset-driven: the bidi class field of Dc 120 is expected to be "B".
    #[crate::ctb_test]
    fn test_dc_bidi_class_120() {
        assert_eq!(
            dc_get_bidi_class(120).expect("Bidi class was incorrect"),
            "B"
        );
    }

    // Dataset-driven: Dc 18 is known, has type "Zs", and therefore counts as
    // a space.
    #[crate::ctb_test]
    fn test_dc_is_space() {
        assert!(is_known_dc(18));
        assert_eq!(dc_get_type(18).expect("Dc type was incorrect"), "Zs");
        assert!(dc_is_space(18).expect("Dc 18 is a space"));
    }

    #[crate::ctb_test]
    fn test_format_dc_predicates() {
        // These tests rely on dataset-driven predicates. If datasets are not
        // loaded in the test harness, fail early and print a clear message.

        // dc_is_printable(21) expected true
        match dc_is_printable(21) {
            Ok(v) => assert!(v, "Expected dc 21 printable"),
            Err(e) => panic!("Failed to run dc_is_printable(21): {e}"),
        }

        // dc_is_printable(231) expected false (Not(dcIsPrintable(231)))
        match dc_is_printable(231) {
            Ok(v) => assert!(!v, "Expected dc 231 NOT printable"),
            Err(e) => panic!("Failed to run dc_is_printable(231): {e}"),
        }

        // dc_is_newline(120) expected true
        assert!(
            dc_is_newline(120),
            "Expected dc 120 to be recognized as newline"
        );
    }

    // Encoding: each expected Dc is the base64 decimal value plus 127, with
    // the two padding digits (64) encoded as 195.
    #[crate::ctb_test]
    fn test_bytes_to_dc_encapsulated_raw() {
        let input = b"Hello, world!";
        // Base64: SGVsbG8sIHdvcmxkIQ==
        // Decimal: 18 6 21 44 27 6
        //          60 44 8 7 29 47
        //          28 38 49 36 8 16
        //          64 64
        let expected = vec![
            145, 133, 148, 171, 154, 133, // comment to assuage rustfmt
            187, 171, 135, 134, 156, 174, //
            155, 165, 176, 163, 135, 143, //
            195, 195,
        ];
        let result = bytes_to_dc_encapsulated_raw(input);
        assert_vec_u32_eq(&expected, &result);
    }

    // Decoding: the inverse of the raw encoding test above, using the same
    // Dc sequence as input.
    #[crate::ctb_test]
    fn test_dc_encapsulated_raw_to_bytes() {
        let input = vec![
            145, 133, 148, 171, 154, 133, // comment to assuage rustfmt
            187, 171, 135, 134, 156, 174, //
            155, 165, 176, 163, 135, 143, //
            195, 195,
        ];
        let expected = b"Hello, world!";
        let result = dc_encapsulated_raw_to_bytes(&input);
        assert_vec_u8_ok_eq(expected, result);
    }

    // UTF-8 encapsulation: the raw payload framed by 191 ... 192, checked
    // through both the string and the byte-slice entry points.
    #[crate::ctb_test]
    fn test_bytes_to_dc_encapsulated_utf8() {
        let input = "Hello, world!";
        // Base64: SGVsbG8sIHdvcmxkIQ==
        // Decimal: 18 6 21 44 27 6
        //          60 44 8 7 29 47
        //          28 38 49 36 8 16
        //          64 64
        let expected = vec![
            191, //
            145, 133, 148, 171, 154, 133, //
            187, 171, 135, 134, 156, 174, //
            155, 165, 176, 163, 135, 143, //
            195, 195, //
            192,
        ];
        let result = string_to_dc_encapsulated_utf8(input);
        assert_eq!(result, expected);
        let result = bytes_as_dc_encapsulated_utf8(input.as_bytes());
        assert_eq!(result, expected);
    }

    // Binary encapsulation: the raw payload framed by 203 ... 204.
    #[crate::ctb_test]
    fn test_bytes_to_dc_encapsulated_binary() {
        let input = b"Hello, world!";
        // Base64: SGVsbG8sIHdvcmxkIQ==
        // Decimal: 18 6 21 44 27 6
        //          60 44 8 7 29 47
        //          28 38 49 36 8 16
        //          64 64
        let expected = vec![
            203, //
            145, 133, 148, 171, 154, 133, //
            187, 171, 135, 134, 156, 174, //
            155, 165, 176, 163, 135, 143, //
            195, 195, //
            204,
        ];
        let result = bytes_to_dc_encapsulated_binary(input);
        assert_eq!(result, expected);
    }
}