ctoolbox/formats/eite/dc/
data.rs1use std::{collections::HashMap, sync::LazyLock};
2
3use anyhow::{Context, Result, anyhow};
4
5use crate::formats::eite::exceptions::DC_DATA_NO_RESULT_EXCEPTION;
6use crate::json;
7
// Zero-based column indices for rows of the `DcData` dataset.
// NOTE(review): the meaning of each column is inferred from the constant
// names only; confirm against the header row of `DcData.csv`.

/// Index of the ID column.
pub const DCDATA_ID_COL: usize = 0;
/// Index of the name column.
pub const DCDATA_NAME_COL: usize = 1;
/// Index of the combining-class column.
pub const DCDATA_COMBINING_CLASS_COL: usize = 2;
/// Index of the bidi-class column.
pub const DCDATA_BIDI_CLASS_COL: usize = 3;
/// Index of the casing column.
pub const DCDATA_CASING_COL: usize = 4;
/// Index of the type column.
pub const DCDATA_TYPE_COL: usize = 5;
/// Index of the script column.
pub const DCDATA_SCRIPT_COL: usize = 6;
/// Index of the complex-traits column.
pub const DCDATA_COMPLEX_TRAITS_COL: usize = 7;
/// Index of the description column.
pub const DCDATA_DESCRIPTION_COL: usize = 8;
20
// Zero-based column indices for rows of the `formats` dataset.
// NOTE(review): the meaning of each column is inferred from the constant
// names only; confirm against the header row of `formats.csv`.

/// Index of the ID column.
pub const DC_FORMATS_ID_COL: usize = 0;
/// Index of the name column.
pub const DC_FORMATS_NAME_COL: usize = 1;
/// Index of the file-extension column.
pub const DC_FORMATS_EXTENSION_COL: usize = 2;
/// Index of the import-support column.
pub const DC_FORMATS_IMPORT_SUPPORT_COL: usize = 3;
/// Index of the export-support column.
pub const DC_FORMATS_EXPORT_SUPPORT_COL: usize = 4;
/// Index of the test-coverage column.
pub const DC_FORMATS_TEST_COVERAGE_COL: usize = 5;
/// Index of the type column.
pub const DC_FORMATS_TYPE_COL: usize = 6;
/// Index of the label column.
pub const DC_FORMATS_LABEL_COL: usize = 7;
/// Index of the variant-types column.
pub const DC_FORMATS_VARIANT_TYPES_COL: usize = 8;
/// Index of the comments column.
pub const DC_FORMATS_COMMENTS_COL: usize = 9;
31
/// In-memory store of the EITE datasets, keyed by dataset name.
///
/// Derives `Debug` so the store can be inspected in logs, assertions and
/// error output like any other public type.
#[derive(Debug)]
pub struct EiteData {
    /// Names of the datasets that were inserted into `data`, in load order.
    datasets: Vec<String>,
    /// Set to `true` by the constructor once every dataset has been read.
    datasets_loaded: bool,
    /// Parsed contents: dataset name -> rows -> cell strings.
    pub data: HashMap<String, Vec<Vec<String>>>,
}
38
39impl EiteData {
40 pub fn new() -> Result<Self> {
41 let mut data = HashMap::new();
42 let mut loaded_datasets = Vec::new();
43
44 for dataset_name in list_dc_datasets() {
45 let path = format!("resources/data/eite/{dataset_name}.csv");
46 let dataset_bytes = crate::storage::get_asset(&path)
48 .with_context(|| format!("asset not found: {path}"))?;
49 let mut rdr =
50 csv::Reader::from_reader(std::io::Cursor::new(dataset_bytes));
51 let mut rows = Vec::new();
52 for result in rdr.records() {
53 match result {
54 Ok(record) => {
55 rows.push(
57 record
58 .iter()
59 .map(std::string::ToString::to_string)
60 .collect(),
61 );
62 }
63 Err(e) => {
64 anyhow::bail!(
65 "Error parsing row in dataset {dataset_name}: {e}"
66 );
67 }
68 }
69 }
70 data.insert(dataset_name.to_string(), rows);
71 loaded_datasets.push(dataset_name.to_string());
72 }
73
74 Ok(Self {
75 datasets: loaded_datasets,
76 datasets_loaded: true,
77 data,
78 })
79 }
80
81 pub fn json(&self) -> String {
82 json!(self.data).to_string()
83 }
84
85 fn dc_dataset_length(&self, dataset: &str) -> usize {
87 let rows = self
88 .data
89 .get(dataset)
90 .expect("dataset not loaded: {dataset}");
91 rows.len().saturating_sub(2)
92 }
93
94 fn dc_data_get_column(&self, dataset: &str, col_num: usize) -> Vec<String> {
95 let rows = self
96 .data
97 .get(dataset)
98 .expect("dataset not loaded: {dataset}");
99 let mut out = Vec::new();
100 for row in rows.iter().take(rows.len()) {
101 if let Some(v) = row.get(col_num) {
102 out.push(v.clone());
103 }
104 }
105 out
106 }
107
108 fn dc_data_lookup_by_id(
111 &self,
112 dataset: &str,
113 row_num: usize,
114 field_num: usize,
115 ) -> Result<String> {
116 let rows = self
117 .data
118 .get(dataset)
119 .ok_or_else(|| anyhow!("dataset not loaded: {dataset}"))?;
120
121 if row_num >= rows.len() {
122 return Err(anyhow!(DC_DATA_NO_RESULT_EXCEPTION.to_string()));
123 }
124
125 let row = rows
126 .get(row_num)
127 .ok_or_else(|| anyhow!("index out of bounds"))?;
128 let value = row
129 .get(field_num)
130 .cloned()
131 .ok_or_else(|| anyhow!(DC_DATA_NO_RESULT_EXCEPTION.to_string()))?;
132
133 Ok(value)
134 }
135
136 fn dc_data_lookup_by_value(
138 &self,
139 dataset: &str,
140 filter_field: usize,
141 filter_value: &str,
142 desired_field: usize,
143 ) -> Result<String> {
144 let rows = self
145 .data
146 .get(dataset)
147 .ok_or_else(|| anyhow!("dataset not loaded: {dataset}"))?;
148
149 for row in rows {
150 if row.get(filter_field).is_some_and(|s| s == filter_value) {
151 let value =
152 row.get(desired_field).cloned().ok_or_else(|| {
153 anyhow!(DC_DATA_NO_RESULT_EXCEPTION.to_string())
154 })?;
155 return Ok(value);
156 }
157 }
158 Err(anyhow!(DC_DATA_NO_RESULT_EXCEPTION.to_string()))
159 }
160
161 fn dc_data_filter_by_value(
163 &self,
164 dataset: &str,
165 filter_field: usize,
166 filter_value: &str,
167 desired_field: usize,
168 ) -> Vec<String> {
169 let rows = self.data.get(dataset).expect("dataset not loaded");
170 let mut out = Vec::new();
171 for row in rows.iter().take(rows.len()) {
172 if row.get(filter_field).is_some_and(|s| s == filter_value) {
173 if let Some(v) = row.get(desired_field) {
174 out.push(v.clone());
175 }
176 }
177 }
178 out
179 }
180
181 fn dc_data_filter_by_value_greater(
182 &self,
183 dataset: &str,
184 filter_field: usize,
185 filter_value: i32,
186 desired_field: usize,
187 ) -> Vec<String> {
188 let rows = self.data.get(dataset).expect("dataset not loaded");
189 let mut out = Vec::new();
190 for row in rows.iter().take(rows.len()) {
191 if let Some(cell) = row.get(filter_field) {
192 if let Ok(v) = cell.parse::<i32>() {
193 if v > filter_value {
194 if let Some(d) = row.get(desired_field) {
195 out.push(d.clone());
196 }
197 }
198 }
199 }
200 }
201 out
202 }
203}
204
/// Process-wide dataset store, built by `EiteData::new()` the first time it
/// is accessed.
///
/// Initialization panics with "Failed to initialize EITE_DATA" if loading
/// fails, so every function that touches this static can panic on first use.
static EITE_DATA: LazyLock<EiteData> =
    LazyLock::new(|| EiteData::new().expect("Failed to initialize EITE_DATA"));
208
/// Returns the names of all known datasets, in a fixed order.
///
/// `EiteData::new` maps each name to `resources/data/eite/{name}.csv`, so a
/// name may contain `/` to address a subdirectory.
pub fn list_dc_datasets() -> Vec<&'static str> {
    const DATASET_NAMES: [&str; 7] = [
        "DcData",
        "formats",
        "mappings/from/ascii",
        "mappings/from/unicode",
        "mappings/to/html",
        "mappings/to/lang_en",
        "mappings/to/unicode",
    ];
    DATASET_NAMES.to_vec()
}
221
/// Returns all globally loaded datasets serialized as a JSON string.
///
/// Delegates to `EiteData::json` on the lazily initialized `EITE_DATA`.
///
/// # Panics
///
/// Panics if this is the first access to `EITE_DATA` and loading the
/// datasets fails.
pub fn json() -> String {
    EITE_DATA.json()
}
225
226pub fn is_dc_dataset(name: &str) -> bool {
228 list_dc_datasets().iter().any(|s| s == &name)
229}
230
/// Returns the length of `dataset` in the global store: its stored row count
/// minus two, saturating at zero.
///
/// NOTE(review): the reason for the offset of two is not visible in this
/// file; presumably the data files carry two non-data rows. Confirm.
///
/// # Panics
///
/// Panics if `dataset` is not loaded, or if the global store fails to
/// initialize on first access.
pub fn dc_dataset_length(dataset: &str) -> usize {
    EITE_DATA.dc_dataset_length(dataset)
}
234
/// Returns the cell at column `col_num` from every row of `dataset` in the
/// global store.
///
/// Rows that have no such column are skipped, so the result may be shorter
/// than the dataset.
///
/// # Panics
///
/// Panics if `dataset` is not loaded, or if the global store fails to
/// initialize on first access.
pub fn dc_data_get_column(dataset: &str, col_num: usize) -> Vec<String> {
    EITE_DATA.dc_data_get_column(dataset, col_num)
}
238
/// Returns the cell at row `row_num`, column `field_num` of `dataset` in the
/// global store.
///
/// # Errors
///
/// Returns an error if `dataset` is not loaded, and a
/// `DC_DATA_NO_RESULT_EXCEPTION` error if the row or the field does not
/// exist.
///
/// # Panics
///
/// Panics if the global store fails to initialize on first access.
pub fn dc_data_lookup_by_id(
    dataset: &str,
    row_num: usize,
    field_num: usize,
) -> Result<String> {
    EITE_DATA.dc_data_lookup_by_id(dataset, row_num, field_num)
}
246
/// Returns column `desired_field` of the first row of `dataset` whose
/// `filter_field` cell equals `filter_value`.
///
/// Only the first matching row is consulted.
///
/// # Errors
///
/// Returns an error if `dataset` is not loaded, and a
/// `DC_DATA_NO_RESULT_EXCEPTION` error if no row matches or the matching row
/// has no `desired_field` column.
///
/// # Panics
///
/// Panics if the global store fails to initialize on first access.
pub fn dc_data_lookup_by_value(
    dataset: &str,
    filter_field: usize,
    filter_value: &str,
    desired_field: usize,
) -> Result<String> {
    EITE_DATA.dc_data_lookup_by_value(
        dataset,
        filter_field,
        filter_value,
        desired_field,
    )
}
260
261pub fn dc_data_lookup_by_dc_in_col_0(
262 dataset: &str,
263 dc: u32,
264 desired_field: usize,
265) -> Result<String> {
266 dc_data_lookup_by_value(dataset, 0, &dc.to_string(), desired_field)
267}
268
/// Returns column `desired_field` of every row of `dataset` whose
/// `filter_field` cell equals `filter_value`, in dataset order.
///
/// Matching rows that have no `desired_field` column are skipped.
///
/// # Panics
///
/// Panics if `dataset` is not loaded, or if the global store fails to
/// initialize on first access.
pub fn dc_data_filter_by_value(
    dataset: &str,
    filter_field: usize,
    filter_value: &str,
    desired_field: usize,
) -> Vec<String> {
    EITE_DATA.dc_data_filter_by_value(
        dataset,
        filter_field,
        filter_value,
        desired_field,
    )
}
282
/// Returns column `desired_field` of every row of `dataset` whose
/// `filter_field` cell parses as an `i32` strictly greater than
/// `filter_value`, in dataset order.
///
/// Rows whose filter cell is missing or is not a valid `i32`, and matching
/// rows that have no `desired_field` column, are skipped.
///
/// # Panics
///
/// Panics if `dataset` is not loaded, or if the global store fails to
/// initialize on first access.
pub fn dc_data_filter_by_value_greater(
    dataset: &str,
    filter_field: usize,
    filter_value: i32,
    desired_field: usize,
) -> Vec<String> {
    EITE_DATA.dc_data_filter_by_value_greater(
        dataset,
        filter_field,
        filter_value,
        desired_field,
    )
}
296
#[cfg(test)]
mod tests {
    use crate::formats::eite::{
        dc::{get_dc_count, maximum_known_dc},
        formats::is_format,
    };

    use super::*;

    /// Smoke test for the bundled data: `DcData` reports 299 entries, the
    /// highest known Dc is 298, and `unicode` and `utf8` are recognized
    /// formats.
    #[crate::ctb_test]
    fn test_data_loaded() {
        assert_eq!(dc_dataset_length("DcData"), 299);
        assert_eq!(get_dc_count(), 299);
        assert_eq!(maximum_known_dc(), 298);
        assert!(is_format("unicode"));
        assert!(is_format("utf8"));
    }

    /// The dataset list contains known names and rejects an unknown one.
    #[crate::ctb_test]
    fn test_list_dc_datasets_contains_expected() {
        let list = list_dc_datasets();
        assert!(list.contains(&"DcData"));
        assert!(list.contains(&"mappings/to/html"));
        assert!(!list.contains(&"nonexistent_dataset_xyz"));
    }
}