ctoolbox/formats/eite/dc/
data.rs

1use std::{collections::HashMap, sync::LazyLock};
2
3use anyhow::{Context, Result, anyhow};
4
5use crate::formats::eite::exceptions::DC_DATA_NO_RESULT_EXCEPTION;
6use crate::json;
7
// Column indices for rows of the Dc data table.
// NOTE(review): presumably these index rows of the "DcData" dataset — confirm
// against the CSV header.
/// ID column index.
pub const DCDATA_ID_COL: usize = 0;
/// Name column index.
pub const DCDATA_NAME_COL: usize = 1;
/// Combining class column index.
pub const DCDATA_COMBINING_CLASS_COL: usize = 2;
/// Bidi class column index.
pub const DCDATA_BIDI_CLASS_COL: usize = 3;
/// Casing column index.
pub const DCDATA_CASING_COL: usize = 4;
/// General category column index
pub const DCDATA_TYPE_COL: usize = 5;
/// Script column index.
pub const DCDATA_SCRIPT_COL: usize = 6;
/// Complex traits column index.
pub const DCDATA_COMPLEX_TRAITS_COL: usize = 7;
/// Description column index.
pub const DCDATA_DESCRIPTION_COL: usize = 8;

20
// Column indices for rows of the formats table.
// NOTE(review): presumably these index rows of the "formats" dataset — confirm
// against the CSV header.
/// ID column index.
pub const DC_FORMATS_ID_COL: usize = 0;
/// Name column index.
pub const DC_FORMATS_NAME_COL: usize = 1;
/// File extension column index.
pub const DC_FORMATS_EXTENSION_COL: usize = 2;
/// Import support column index.
pub const DC_FORMATS_IMPORT_SUPPORT_COL: usize = 3;
/// Export support column index.
pub const DC_FORMATS_EXPORT_SUPPORT_COL: usize = 4;
/// Test coverage column index.
pub const DC_FORMATS_TEST_COVERAGE_COL: usize = 5;
/// Type column index.
pub const DC_FORMATS_TYPE_COL: usize = 6;
/// Label column index.
pub const DC_FORMATS_LABEL_COL: usize = 7;
/// Variant types column index.
pub const DC_FORMATS_VARIANT_TYPES_COL: usize = 8;
/// Comments column index.
pub const DC_FORMATS_COMMENTS_COL: usize = 9;
31
/// In-memory store of the parsed Dc datasets.
///
/// Each dataset is kept as a list of rows, and each row as a list of cell
/// strings, keyed by the dataset name.
#[derive(Debug, Clone)]
pub struct EiteData {
    /// Names of the datasets that were loaded, in load order.
    datasets: Vec<String>,
    /// Set to `true` once every dataset has been loaded.
    datasets_loaded: bool,
    /// Dataset name -> rows of cells, as produced by the CSV reader.
    ///
    /// NOTE(review): an earlier comment stated that the rows include the
    /// header and a trailing newline row. A `csv::Reader` with default
    /// settings treats the first record as a header and does not yield it
    /// from `records()` — confirm which rows are actually stored here.
    pub data: HashMap<String, Vec<Vec<String>>>,
}
38
39impl EiteData {
40    pub fn new() -> Result<Self> {
41        let mut data = HashMap::new();
42        let mut loaded_datasets = Vec::new();
43
44        for dataset_name in list_dc_datasets() {
45            let path = format!("resources/data/eite/{dataset_name}.csv");
46            // Try to load the asset
47            let dataset_bytes = crate::storage::get_asset(&path)
48                .with_context(|| format!("asset not found: {path}"))?;
49            let mut rdr =
50                csv::Reader::from_reader(std::io::Cursor::new(dataset_bytes));
51            let mut rows = Vec::new();
52            for result in rdr.records() {
53                match result {
54                    Ok(record) => {
55                        // record is a csv::StringRecord, convert to Vec<String>
56                        rows.push(
57                            record
58                                .iter()
59                                .map(std::string::ToString::to_string)
60                                .collect(),
61                        );
62                    }
63                    Err(e) => {
64                        anyhow::bail!(
65                            "Error parsing row in dataset {dataset_name}: {e}"
66                        );
67                    }
68                }
69            }
70            data.insert(dataset_name.to_string(), rows);
71            loaded_datasets.push(dataset_name.to_string());
72        }
73
74        Ok(Self {
75            datasets: loaded_datasets,
76            datasets_loaded: true,
77            data,
78        })
79    }
80
81    pub fn json(&self) -> String {
82        json!(self.data).to_string()
83    }
84
85    /// Returns total rows excluding header.
86    fn dc_dataset_length(&self, dataset: &str) -> usize {
87        let rows = self
88            .data
89            .get(dataset)
90            .expect("dataset not loaded: {dataset}");
91        rows.len().saturating_sub(2)
92    }
93
94    fn dc_data_get_column(&self, dataset: &str, col_num: usize) -> Vec<String> {
95        let rows = self
96            .data
97            .get(dataset)
98            .expect("dataset not loaded: {dataset}");
99        let mut out = Vec::new();
100        for row in rows.iter().take(rows.len()) {
101            if let Some(v) = row.get(col_num) {
102                out.push(v.clone());
103            }
104        }
105        out
106    }
107
108    /// rowNum is zero-based for content rows (header skipped).
109    /// If out of range (beyond trailing sentinel) returns UUID sentinel constant.
110    fn dc_data_lookup_by_id(
111        &self,
112        dataset: &str,
113        row_num: usize,
114        field_num: usize,
115    ) -> Result<String> {
116        let rows = self
117            .data
118            .get(dataset)
119            .ok_or_else(|| anyhow!("dataset not loaded: {dataset}"))?;
120
121        if row_num >= rows.len() {
122            return Err(anyhow!(DC_DATA_NO_RESULT_EXCEPTION.to_string()));
123        }
124
125        let row = rows
126            .get(row_num)
127            .ok_or_else(|| anyhow!("index out of bounds"))?;
128        let value = row
129            .get(field_num)
130            .cloned()
131            .ok_or_else(|| anyhow!(DC_DATA_NO_RESULT_EXCEPTION.to_string()))?;
132
133        Ok(value)
134    }
135
136    /// Returns first match or sentinel UUID if none.
137    fn dc_data_lookup_by_value(
138        &self,
139        dataset: &str,
140        filter_field: usize,
141        filter_value: &str,
142        desired_field: usize,
143    ) -> Result<String> {
144        let rows = self
145            .data
146            .get(dataset)
147            .ok_or_else(|| anyhow!("dataset not loaded: {dataset}"))?;
148
149        for row in rows {
150            if row.get(filter_field).is_some_and(|s| s == filter_value) {
151                let value =
152                    row.get(desired_field).cloned().ok_or_else(|| {
153                        anyhow!(DC_DATA_NO_RESULT_EXCEPTION.to_string())
154                    })?;
155                return Ok(value);
156            }
157        }
158        Err(anyhow!(DC_DATA_NO_RESULT_EXCEPTION.to_string()))
159    }
160
161    /// All matches.
162    fn dc_data_filter_by_value(
163        &self,
164        dataset: &str,
165        filter_field: usize,
166        filter_value: &str,
167        desired_field: usize,
168    ) -> Vec<String> {
169        let rows = self.data.get(dataset).expect("dataset not loaded");
170        let mut out = Vec::new();
171        for row in rows.iter().take(rows.len()) {
172            if row.get(filter_field).is_some_and(|s| s == filter_value) {
173                if let Some(v) = row.get(desired_field) {
174                    out.push(v.clone());
175                }
176            }
177        }
178        out
179    }
180
181    fn dc_data_filter_by_value_greater(
182        &self,
183        dataset: &str,
184        filter_field: usize,
185        filter_value: i32,
186        desired_field: usize,
187    ) -> Vec<String> {
188        let rows = self.data.get(dataset).expect("dataset not loaded");
189        let mut out = Vec::new();
190        for row in rows.iter().take(rows.len()) {
191            if let Some(cell) = row.get(filter_field) {
192                if let Ok(v) = cell.parse::<i32>() {
193                    if v > filter_value {
194                        if let Some(d) = row.get(desired_field) {
195                            out.push(d.clone());
196                        }
197                    }
198                }
199            }
200        }
201        out
202    }
203}
204
205// Lazy static instance
206static EITE_DATA: LazyLock<EiteData> =
207    LazyLock::new(|| EiteData::new().expect("Failed to initialize EITE_DATA"));
208
/// Returns the names of the known Dc datasets, in a fixed order
/// (mirrors the original JS array).
pub fn list_dc_datasets() -> Vec<&'static str> {
    const KNOWN_DATASETS: [&str; 7] = [
        "DcData",
        "formats",
        "mappings/from/ascii",
        "mappings/from/unicode",
        "mappings/to/html",
        "mappings/to/lang_en",
        "mappings/to/unicode",
    ];
    KNOWN_DATASETS.to_vec()
}
221
222pub fn json() -> String {
223    EITE_DATA.json()
224}
225
226/// Returns true if the provided dataset name is one of the known Dc datasets.
227pub fn is_dc_dataset(name: &str) -> bool {
228    list_dc_datasets().iter().any(|s| s == &name)
229}
230
231pub fn dc_dataset_length(dataset: &str) -> usize {
232    EITE_DATA.dc_dataset_length(dataset)
233}
234
235pub fn dc_data_get_column(dataset: &str, col_num: usize) -> Vec<String> {
236    EITE_DATA.dc_data_get_column(dataset, col_num)
237}
238
239pub fn dc_data_lookup_by_id(
240    dataset: &str,
241    row_num: usize,
242    field_num: usize,
243) -> Result<String> {
244    EITE_DATA.dc_data_lookup_by_id(dataset, row_num, field_num)
245}
246
247pub fn dc_data_lookup_by_value(
248    dataset: &str,
249    filter_field: usize,
250    filter_value: &str,
251    desired_field: usize,
252) -> Result<String> {
253    EITE_DATA.dc_data_lookup_by_value(
254        dataset,
255        filter_field,
256        filter_value,
257        desired_field,
258    )
259}
260
261pub fn dc_data_lookup_by_dc_in_col_0(
262    dataset: &str,
263    dc: u32,
264    desired_field: usize,
265) -> Result<String> {
266    dc_data_lookup_by_value(dataset, 0, &dc.to_string(), desired_field)
267}
268
269pub fn dc_data_filter_by_value(
270    dataset: &str,
271    filter_field: usize,
272    filter_value: &str,
273    desired_field: usize,
274) -> Vec<String> {
275    EITE_DATA.dc_data_filter_by_value(
276        dataset,
277        filter_field,
278        filter_value,
279        desired_field,
280    )
281}
282
283pub fn dc_data_filter_by_value_greater(
284    dataset: &str,
285    filter_field: usize,
286    filter_value: i32,
287    desired_field: usize,
288) -> Vec<String> {
289    EITE_DATA.dc_data_filter_by_value_greater(
290        dataset,
291        filter_field,
292        filter_value,
293        desired_field,
294    )
295}
296
#[cfg(test)]
mod tests {
    use super::*;
    use crate::formats::eite::dc::{get_dc_count, maximum_known_dc};
    use crate::formats::eite::formats::is_format;

    /// Checks the sizes derived from the loaded "DcData" dataset and that
    /// two expected format names are recognized.
    #[crate::ctb_test]
    fn test_data_loaded() {
        assert_eq!(dc_dataset_length("DcData"), 299);
        assert_eq!(get_dc_count(), 299);
        assert_eq!(maximum_known_dc(), 298);
        for format in ["unicode", "utf8"] {
            assert!(is_format(format));
        }
    }

    /// Checks that the dataset list holds known names and rejects an
    /// unknown one.
    #[crate::ctb_test]
    fn test_list_dc_datasets_contains_expected() {
        let known = list_dc_datasets();
        for expected in ["DcData", "mappings/to/html"] {
            assert!(known.contains(&expected));
        }
        assert!(!known.contains(&"nonexistent_dataset_xyz"));
    }
}