ctoolbox/formats/eite/
formats.rs

1//! EITE format compatibility module.
2//! Provides conversions and metadata for document character (Dc) arrays and external formats.
3//! This module supports parsing, converting, and serializing Dc arrays to and from various external formats,
4//! as well as providing metadata and handling warnings for import and export operations.
5//!
6//! Import/export/tests support columns meaning:
7//!   -1=N/A
8//!   blank/0=none
9//!   1=WIP
10//!   2=mostly, or fully for at least one version of a format with options
11//!   3=fully implemented for semantic content (and tests for variation in non-semantic details of the structure of the doc being imported)
12//!   4=lossless and roundtrippable (with enough info for unambiguous bit-for-bit reconstruction of any given input)
13//!   5=4+optional strict validation.
14
15use anyhow::{Context, Result, bail, ensure};
16
17use crate::formats::FormatLog;
18use crate::formats::eite::dc::data::{
19    dc_data_filter_by_value, dc_data_filter_by_value_greater,
20    dc_data_get_column, dc_data_lookup_by_dc_in_col_0, dc_data_lookup_by_id,
21    dc_data_lookup_by_value,
22};
23use crate::formats::eite::dc::{DC_REPLACEMENT_UNAVAIL_DC, is_known_dc};
24use crate::formats::eite::eite_state::EiteState;
25use crate::formats::eite::encoding::base::{
26    dec_to_hex_single, hex_to_dec_single,
27};
28use crate::formats::eite::encoding::is_supported_char_encoding;
29use crate::formats::eite::encoding::utf8::unicode_scalar_from_utf8;
30use crate::formats::eite::exceptions::{exc_or_empty, excep};
31use crate::formats::eite::formats::ascii::{
32    AsciiSafeSubsetFormatSettings, dca_from_ascii, dca_from_ascii_safe_subset,
33    dca_to_ascii, dca_to_ascii_safe_subset,
34};
35use crate::formats::eite::formats::html::{
36    dc_to_colorcoded, dca_to_colorcoded, dca_to_html, dca_to_html_fragment,
37};
38use crate::formats::eite::formats::integer_list::{
39    dca_from_integer_list, dca_to_integer_list,
40};
41use crate::formats::eite::formats::sems::{SEMSFormatSettings, dca_from_sems};
42use crate::formats::eite::formats::utf8::{
43    UTF8FormatSettings, dca_from_utf8, dca_to_utf8,
44};
45use crate::formats::eite::transform::{
46    DocumentTransformation, apply_prefilters,
47};
48use crate::formats::utf8::utf8_from_scalar;
49use crate::{debug, json};
50
51pub mod ascii;
52pub mod dcbasenb;
53pub mod elad;
54pub mod html;
55pub mod integer_list;
56pub mod sems;
57pub mod utf8;
58
59pub enum Format {
60    SEMS {
61        settings: SEMSFormatSettings,
62    },
63    UTF8 {
64        settings: UTF8FormatSettings,
65    },
66    IntegerList,
67    ASCII,
68    ASCIISafeSubset {
69        settings: AsciiSafeSubsetFormatSettings,
70    },
71    HTML,
72    HTMLFragment,
73    ColorCoded,
74}
75
76impl Format {
77    pub fn utf8_default() -> Self {
78        Format::UTF8 {
79            settings: UTF8FormatSettings::default(),
80        }
81    }
82    pub fn sems_default() -> Self {
83        Format::SEMS {
84            settings: SEMSFormatSettings::default(),
85        }
86    }
87    pub fn ascii_safe_subset_default() -> Self {
88        Format::ASCIISafeSubset {
89            settings: AsciiSafeSubsetFormatSettings::default(),
90        }
91    }
92
93    pub fn from_string(format: &str) -> Result<Self> {
94        match format {
95            "sems" => Ok(Format::sems_default()),
96            "utf8" => Ok(Format::utf8_default()),
97            "integerList" => Ok(Format::IntegerList),
98            "ascii" => Ok(Format::ASCII),
99            "asciiSafeSubset" => Ok(Format::ascii_safe_subset_default()),
100            "html" => Ok(Format::HTML),
101            "htmlFragment" => Ok(Format::HTMLFragment),
102            "colorcoded" => Ok(Format::ColorCoded),
103            _ => Err(anyhow::anyhow!("Unknown EITE format: {format}")),
104        }
105    }
106}
107
108impl std::fmt::Display for Format {
109    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
110        write!(
111            f,
112            "{}",
113            match self {
114                Format::SEMS { settings: _ } => "sems",
115                Format::UTF8 { settings: _ } => "utf8",
116                Format::IntegerList => "integerList",
117                Format::ASCII => "ascii",
118                Format::ASCIISafeSubset { settings: _ } => "asciiSafeSubset",
119                Format::HTML => "html",
120                Format::HTMLFragment => "htmlFragment",
121                Format::ColorCoded => "colorcoded",
122            }
123        )
124    }
125}
126
127/* ===== Conversions between full Dc arrays and external formats ===== */
128
129pub fn with_default_log<T, E>(r: Result<T, E>) -> Result<(T, FormatLog), E> {
130    r.map(|val| (val, FormatLog::default()))
131}
132
133/// Parse input bytes (in a given "format") into a vector of Dc integers.
134/// Many formats are still TODO.
135pub fn dca_from_format(
136    state: &mut EiteState,
137    in_format: &Format,
138    content_bytes: &[u8],
139) -> Result<(Vec<u32>, FormatLog)> {
140    let format_string = in_format.to_string();
141    ensure!(
142        is_supported_input_format(format_string.as_str()),
143        "Unsupported input format {format_string}"
144    );
145    match in_format {
146        Format::SEMS { settings } => dca_from_sems(content_bytes, settings),
147        Format::IntegerList => {
148            dca_from_integer_list(content_bytes, &Default::default())
149        }
150        Format::ASCII => dca_from_ascii(content_bytes),
151        Format::ASCIISafeSubset { settings } => {
152            dca_from_ascii_safe_subset(content_bytes, settings)
153        }
154        Format::UTF8 { settings } => dca_from_utf8(content_bytes, settings),
155        _other => {
156            bail!("Unimplemented document parsing format: {format_string}")
157        }
158    }
159}
160
161/// Convert a Dc array into a target output format (serialized bytes).
162/// Returns a vector of bytes representing the Dc array in the specified output format.
163pub fn dca_to_format(
164    state: &mut EiteState,
165    out_format: &Format,
166    dc_array: &[u32],
167    prefilter_settings: &PrefilterSettings,
168) -> Result<(Vec<u8>, FormatLog)> {
169    let format_string = out_format.to_string();
170    ensure!(
171        is_supported_output_format(format_string.as_str()),
172        "Unsupported output format {format_string}"
173    );
174    let dc_array = &apply_prefilters(dc_array, prefilter_settings)?;
175    match out_format {
176        Format::IntegerList => {
177            with_default_log(Ok(dca_to_integer_list(dc_array)))
178        }
179        Format::ASCII => dca_to_ascii(dc_array),
180        Format::ASCIISafeSubset { settings: _ } => {
181            dca_to_ascii_safe_subset(dc_array)
182        }
183        Format::ColorCoded => with_default_log(dca_to_colorcoded(dc_array)),
184        Format::UTF8 { settings } => dca_to_utf8(dc_array, settings),
185        Format::HTML => dca_to_html(dc_array),
186        Format::HTMLFragment => dca_to_html_fragment(dc_array),
187        _other => bail!(
188            "Unimplemented document render output format: {format_string}"
189        ),
190    }
191}
192
193/// Wrapper performing in -> internal Dc -> out.
194/// Converts input document from one format to another via internal Dc representation.
195pub fn convert_formats(
196    state: &mut EiteState,
197    in_format: &Format,
198    out_format: &Format,
199    input: &[u8],
200    prefilter_settings: &PrefilterSettings,
201) -> Result<(Vec<u8>, FormatLog)> {
202    let (dc_array, mut log) = dca_from_format(state, in_format, input)?;
203
204    let (out, out_log) =
205        dca_to_format(state, out_format, &dc_array, prefilter_settings)?;
206    log.merge(&out_log);
207
208    Ok((out, log))
209}
210
211/// Produce an export filename extension for a format (mirrors JS).
212/// Returns the recommended file extension for the given format.
213pub fn get_export_extension(format: &str) -> Result<String> {
214    if is_supported_char_encoding(format) {
215        return Ok(format!("{}.txt", get_format_extension(format)?));
216    }
217    get_format_extension(format)
218}
219
220/* ===== Single-Dc conversions ===== */
221
222/// Convert a single Dc to output bytes (for the subset of formats that support
223/// a per-Dc conversion).
224pub fn dc_to_format(out_format: &str, dc: u32) -> Result<(Vec<u8>, FormatLog)> {
225    ensure!(
226        is_supported_output_format(out_format),
227        "Unsupported output format {out_format}"
228    );
229    ensure!(is_known_dc(dc), "Unknown Dc {dc}");
230    let mut log = FormatLog::default();
231
232    match out_format {
233        "utf8" => {
234            // Look up Unicode mapping (mappings/to/unicode row=dc field=1)
235            let hex_str = dc_data_lookup_by_value(
236                "mappings/to/unicode",
237                0,
238                dc.to_string().as_str(),
239                1,
240            );
241            if !exc_or_empty(&hex_str) {
242                if let Err(err) = hex_str {
243                    return Err(err)
244                        .context(format!("Failed lookup Dc {dc} unicode",));
245                }
246                let hex_str = hex_str.expect("checked above");
247                let cp = hex_to_dec_single(&hex_str)?;
248                return Ok((utf8_from_scalar(cp)?, log));
249            }
250
251            // Fallback attempt (mirrors structure)
252            let row_str = dc_data_lookup_by_value(
253                "mappings/from/unicode",
254                1,
255                &dc.to_string(),
256                0,
257            );
258            if !exc_or_empty(&row_str) {
259                if let Err(err) = row_str {
260                    return Err(err).context(format!(
261                        "Failed lookup Dc {dc} unicode fallback"
262                    ));
263                }
264                let row_str = row_str.expect("checked above");
265
266                let cp = hex_to_dec_single(&row_str)?;
267                return Ok((utf8_from_scalar(cp)?, log));
268            }
269
270            log.warn(format!("Could not convert Dc {dc} to UTF-8").as_str());
271
272            Ok((Vec::new(), log))
273        }
274        "colorcoded" => Ok((dc_to_colorcoded(dc)?, log)),
275        "html" => {
276            let html_map =
277                dc_data_lookup_by_dc_in_col_0("mappings/to/html", dc, 1);
278            if !exc_or_empty(&html_map) {
279                if let Err(err) = html_map {
280                    return Err(err).context(format!(
281                        "Failed lookup HTML mapping for Dc {dc}"
282                    ));
283                }
284                let html_map = html_map.expect("checked above");
285
286                return Ok((html_map.as_bytes().to_vec(), log));
287            }
288            dc_to_format("utf8", dc)
289        }
290        other => bail!("Unimplemented character output format: {other}"),
291    }
292}
293
294/// Attempt to map a single external-format chunk to a Dc.
295pub fn dc_from_format(
296    in_format: &str,
297    content: &[u8],
298) -> Result<(Vec<u32>, FormatLog)> {
299    ensure!(
300        is_supported_internal_format(in_format),
301        "Unsupported internal/source format {in_format}"
302    );
303    let mut res: Vec<u32> = Vec::new();
304    let mut log = FormatLog::default();
305    match in_format {
306        "ascii" | "unicode" => {
307            if content.is_empty() {
308                return Ok((res, log));
309            }
310            let c = unicode_scalar_from_utf8(content)?;
311            if in_format == "ascii" && c > 0x7F {
312                bail!("Not a 7-bit ASCII char: {c}");
313            }
314            let hex = dec_to_hex_single(c)?;
315            // dataset: 'mappings/from/unicode', filter_field=0 (hex), desired_field=1 (Dc id)
316            let dc_str =
317                dc_data_lookup_by_value("mappings/from/unicode", 0, &hex, 1);
318            if excep(&dc_str)
319                || (dc_str.is_ok() && dc_str.as_ref().unwrap() == "")
320            {
321                // FIXME: Add an option to save unmapped Unicode characters
322                // using Dcs 127-192 (individually)?
323                log.warn(&format!("Unmapped Unicode character U+{hex}"));
324                res.push(DC_REPLACEMENT_UNAVAIL_DC);
325                return Ok((res, log));
326            } else if dc_str.is_err() {
327                return Err(dc_str.err().unwrap()).context(format!(
328                    "Unexpected error looking up Unicode char U+{hex}"
329                ));
330            }
331            if let Ok(dc_id) = dc_str?.parse::<u32>() {
332                res.push(dc_id);
333            }
334        }
335        other => bail!("Unimplemented character source format: {other}"),
336    }
337    Ok((res, log))
338}
339
340/// Add an import warning for a specific character index and problem description.
341pub fn import_warning(state: &mut EiteState, index: usize, problem: &str) {
342    let warn = format!(
343        "A problem was encountered while importing at character {index}: {problem}"
344    );
345    state.import_deferred_settings_stack.push(warn.clone());
346    debug!("{}, {}, {}", json!(state), 1, &warn);
347}
348
349/// Add an export warning for a specific character index and problem description.
350pub fn export_warning(state: &mut EiteState, index: usize, problem: &str) {
351    let warn = format!(
352        "A problem was encountered while exporting at character {index}: {problem}"
353    );
354    state.export_deferred_settings_stack.push(warn.clone());
355    debug!("{}, {}, {}", json!(state), 1, &warn);
356}
357
358/// Retrieve and clear all import warnings from the state.
359/// Returns a vector of warning messages that were collected.
360pub fn get_import_warnings(state: &mut EiteState) -> Vec<String> {
361    let res = state.import_deferred_settings_stack.clone();
362    state.import_deferred_settings_stack.clear();
363    res
364}
365
366/// Retrieve and clear all export warnings from the state.
367/// Returns a vector of warning messages that were collected.
368pub fn get_export_warnings(state: &mut EiteState) -> Vec<String> {
369    let res = state.export_deferred_settings_stack.clone();
370    state.export_deferred_settings_stack.clear();
371    res
372}
373
374/// Add an export warning for an unmappable Dc value at a specific index.
375pub fn export_warning_unmappable(state: &mut EiteState, index: usize, dc: u32) {
376    export_warning(
377        state,
378        index,
379        &format!(
380            "The character {dc} could not be represented in the chosen export format."
381        ),
382    );
383}
384
385/* ===== Format enumeration / metadata ===== */
386
387/// List all supported formats.
388/// Returns a vector of format names that are supported for conversion.
389pub fn list_formats() -> Vec<String> {
390    dc_data_get_column("formats", 1) // The original simply pulled column 1 for all rows.
391}
392
393/// Check if a format is known.
394/// Returns true if the format is recognized, false otherwise.
395pub fn is_format(format: &str) -> bool {
396    list_formats().iter().any(|f| f == format)
397}
398
399/// List all supported input formats.
400/// Returns a vector of format names that can be used as input for conversion.
401pub fn list_input_formats() -> Vec<String> {
402    dc_data_filter_by_value_greater("formats", 3, 0, 1)
403}
404
405/// Check if a format is a supported input format.
406/// Returns true if the format can be used as input for conversion, false otherwise.
407pub fn is_supported_input_format(fmt: &str) -> bool {
408    list_input_formats().iter().any(|f| f == fmt)
409}
410
411/// List all supported internal formats.
412/// Returns a vector of format names that are supported for internal processing.
413pub fn list_internal_formats() -> Vec<String> {
414    dc_data_filter_by_value("formats", 6, "internal", 1)
415}
416
417/// Check if a format is a supported internal format.
418/// Returns true if the format is supported for internal processing, false otherwise.
419pub fn is_supported_internal_format(fmt: &str) -> bool {
420    let inf = list_input_formats();
421    let intern = list_internal_formats();
422    inf.iter().any(|f| f == fmt) || intern.iter().any(|f| f == fmt)
423}
424
425/// List all supported output formats.
426/// Returns a vector of format names that can be used as output for conversion.
427pub fn list_output_formats() -> Vec<String> {
428    dc_data_filter_by_value_greater("formats", 4, 0, 1)
429}
430
431/// Check if a format is a supported output format.
432/// Returns true if the format can be used as output for conversion, false otherwise.
433pub fn is_supported_output_format(fmt: &str) -> bool {
434    list_output_formats().iter().any(|f| f == fmt)
435}
436
437/// List formats used for storing arbitrary data (currently only basenb)
438pub fn list_data_types() -> Vec<String> {
439    dc_data_filter_by_value("formats", 6, "data", 1)
440}
441
442/// List all variants that are available for the given format. Does NOT return v: prefix for variants.
443pub fn list_variants_for_format(format: &str) -> Result<Vec<String>> {
444    let normalized = normalize_format(format)?;
445    let all = list_formats();
446    let mut res = Vec::new();
447    for f in all {
448        let ftype = get_format_type(&f)?;
449        if ftype.starts_with("v:") {
450            let mut variant_type = ftype[2..].to_string();
451            // Normalize
452            if variant_type == "unicodePua" {
453                variant_type = "unicode".to_string();
454            }
455            if variant_type == normalized {
456                res.push(f);
457            }
458        }
459    }
460    Ok(res)
461}
462
463/// Returns the internal ID used to reference the format.
464pub fn get_format_id(format: &str) -> Result<usize> {
465    ensure!(is_format(format), "Unknown format {format}");
466    let id_str = dc_data_lookup_by_value("formats", 1, format, 0)?;
467    let id = id_str
468        .parse::<usize>()
469        .with_context(|| format!("Invalid format id value: {id_str}"))?;
470    Ok(id)
471}
472
473/// Normalize a format name to its canonical representation.
474/// Currently, this converts "utf8" to "unicode" as they are functionally equivalent in this context.
475pub fn normalize_format(format: &str) -> Result<String> {
476    ensure!(is_format(format), "Unknown format {format}");
477    if format == "utf8" {
478        return Ok("unicode".to_string());
479    }
480    Ok(format.to_string())
481}
482
483/// Get the display name for a format.
484/// Returns the user-friendly name of the format, suitable for display in a UI.
485pub fn get_format_name(format: &str) -> Result<String> {
486    let id = get_format_id(format)?;
487    dc_data_lookup_by_id("formats", id, 1)
488}
489
490/// Get the file extension for a format.
491/// Returns the default file extension associated with the format, if any.
492pub fn get_format_extension(format: &str) -> Result<String> {
493    let id = get_format_id(format)?;
494    dc_data_lookup_by_id("formats", id, 2)
495}
496
497/// Get the import support value for a format.
498/// Returns an integer indicating the level of import support for the format.
499pub fn get_format_import_support(format: &str) -> Result<i32> {
500    let id = get_format_id(format)?;
501    let v = dc_data_lookup_by_id("formats", id, 3)?;
502    Ok(v.parse::<i32>().unwrap_or(0))
503}
504
505/// Get the export support value for a format.
506/// Returns an integer indicating the level of export support for the format.
507pub fn get_format_export_support(format: &str) -> Result<i32> {
508    let id = get_format_id(format)?;
509    let v = dc_data_lookup_by_id("formats", id, 4)?;
510    Ok(v.parse::<i32>().unwrap_or(0))
511}
512
513/// Get the test status value for a format.
514/// Returns an integer indicating the test coverage status of the format.
515pub fn get_format_tests_status(format: &str) -> Result<i32> {
516    let id = get_format_id(format)?;
517    let v = dc_data_lookup_by_id("formats", id, 5)?;
518    Ok(v.parse::<i32>().unwrap_or(0))
519}
520
521/// Get the type string for a format.
522/// Returns a string indicating the type/category of the format (e.g., text, encoding, internal).
523pub fn get_format_type(format: &str) -> Result<String> {
524    let id = get_format_id(format)?;
525    dc_data_lookup_by_id("formats", id, 6)
526}
527
528/// Get the label for a format.
529/// Returns a string label that summarizes the format, suitable for display in a UI.
530pub fn get_format_label(format: &str) -> Result<String> {
531    let id = get_format_id(format)?;
532    dc_data_lookup_by_id("formats", id, 7)
533}
534
535/// Get the variant types for a format.
536/// Returns a list of variant types (e.g., "unicodePua", or "encoding") that the
537/// format supports. Will NOT include the v: prefix that appears to denote that
538/// a given type is exclusively a variant type in the Type column.
539pub fn get_format_variant_types(format: &str) -> Result<Vec<String>> {
540    let id = get_format_id(format)?;
541    let raw = dc_data_lookup_by_id("formats", id, 8)?;
542    Ok(raw
543        .split(',')
544        .filter(|s| !s.is_empty())
545        .map(std::string::ToString::to_string)
546        .collect())
547}
548
549/// Get comments for a format.
550/// Returns any additional comments or notes associated with the format.
551pub fn get_format_comments(format: &str) -> Result<String> {
552    let id = get_format_id(format)?;
553    dc_data_lookup_by_id("formats", id, 9)
554}
555
556/// Returns true if the format is a variant (as opposed to a base format), false
557/// otherwise. Note that this seems to only return true for formats that are
558/// *only* variants, as opposed to things like character encodings that are
559/// variants in regard to the html format, but are also formats on their own
560/// terms.
561pub fn format_is_variant(format: &str) -> Result<bool> {
562    ensure!(is_format(format), "Unknown format {format}");
563    let t = get_format_type(format)?;
564    Ok(t.starts_with("v:"))
565}
566
567/// Is the format a language, either human or programming?
568pub fn format_is_any_language(format: &str) -> Result<bool> {
569    ensure!(is_format(format), "Unknown format {format}");
570    let t = get_format_type(format)?;
571    Ok(t.eq("language") || t.eq("programming"))
572}
573
574/// Is the format a human language?
575pub fn format_is_human_language(format: &str) -> Result<bool> {
576    ensure!(is_format(format), "Unknown format {format}");
577    let t = get_format_type(format)?;
578    Ok(t.eq("language"))
579}
580
581/// Is the format a programming language?
582pub fn format_is_programming_language(format: &str) -> Result<bool> {
583    ensure!(is_format(format), "Unknown format {format}");
584    let t = get_format_type(format)?;
585    Ok(t.eq("programming"))
586}
587
/// Reports whether `variant_type` is one of the recognized variant types,
/// `encoding` or `unicodePua`.
///
/// The comparison is exact and case-sensitive, and the value must be
/// given without the `v:` prefix.
pub fn is_variant_type(variant_type: &str) -> bool {
    const KNOWN_VARIANT_TYPES: [&str; 2] = ["encoding", "unicodePua"];
    KNOWN_VARIANT_TYPES.contains(&variant_type)
}
594
595/// Get the variant type string for a variant format.
596/// Returns the underlying variant type (e.g., "unicodePua") for a format that
597/// is a variant (e.g. "dcBasenb").
598pub fn format_get_variant_type(variant: &str) -> Result<String> {
599    ensure!(
600        format_is_variant(variant)?,
601        "Format {variant} is not a variant"
602    );
603    let t = get_format_type(variant)?;
604    // Remove "v:" prefix
605    Ok(t[2..].to_string())
606}
607
608/// Check if a format supports a given variant type.
609/// Returns true if the format can handle the specified variant type, false
610/// otherwise. Does NOT expect (or accept) v: prefix that in the type column,
611/// denotes a variant type.
612pub fn format_supports_variant_type(
613    format: &str,
614    variant_type: &str,
615) -> Result<bool> {
616    Ok(get_format_variant_types(format)?
617        .iter()
618        .any(|t| t == variant_type))
619}
620
621/// Check if a format supports a specific variant.
622/// Returns true if the format can handle the specified variant, false
623/// otherwise.
624pub fn format_supports_variant(format: &str, variant: &str) -> Result<bool> {
625    let vt = format_get_variant_type(variant)?;
626    format_supports_variant_type(format, &vt)
627}
628
629/// Get the metrics type for a format.
630/// Returns a string indicating the metrics type (e.g., "character", "internal-unicode", "complex-dcBasenb").
631pub fn get_format_metrics_type(format: &str) -> Result<String> {
632    ensure!(is_format(format), "Unknown format {format}");
633    let t = get_format_type(format)?;
634    let res = if t == "text" || matches!(t.as_str(), "encoding" | "terminal") {
635        "character".to_string()
636    } else if t == "internal" {
637        format!("internal-{format}")
638    } else {
639        format!("complex-{format}")
640    };
641    Ok(res)
642}
643
644pub struct DcOutputLanguage {
645    language: String,
646}
647
648impl DcOutputLanguage {
649    pub fn new(language: &str) -> Result<Self> {
650        ensure!(
651            format_is_any_language(language)?,
652            format!("Unknown language {language}")
653        );
654        ensure!(
655            is_supported_output_format(language),
656            format!("Output support for language {language}")
657        );
658        Ok(DcOutputLanguage {
659            language: language.to_string(),
660        })
661    }
662}
663
664impl std::fmt::Display for DcOutputLanguage {
665    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
666        write!(f, "{}", self.language)
667    }
668}
669
670impl Default for DcOutputLanguage {
671    fn default() -> Self {
672        DcOutputLanguage {
673            language: "lang_en".to_string(),
674        }
675    }
676}
677
678#[derive(Default)]
679/// Supported "dct" / document transformations
680pub struct PrefilterSettings {
681    pub enabled_prefilters: Vec<DocumentTransformation>,
682}
683
684impl PrefilterSettings {
685    pub fn apply(&self, dc_array_in: &[u32]) -> Result<Vec<u32>> {
686        let mut out = dc_array_in.to_vec();
687
688        for prefilter in &self.enabled_prefilters {
689            out = prefilter.apply(&out)?;
690        }
691
692        Ok(out)
693    }
694}
695
696#[cfg(test)]
697mod tests {
698    use crate::formats::assert_vec_u8_ok_eq_no_warnings;
699
700    use super::*;
701
702    #[crate::ctb_test]
703    fn test_import_export_warning_buffers() {
704        let mut state = EiteState::new();
705        import_warning(&mut state, 2, "Example problem");
706        export_warning(&mut state, 5, "Another issue");
707        let iw = get_import_warnings(&mut state);
708        let ew = get_export_warnings(&mut state);
709        assert_eq!(iw.len(), 1);
710        assert_eq!(ew.len(), 1);
711        assert!(get_import_warnings(&mut state).is_empty());
712        assert!(get_export_warnings(&mut state).is_empty());
713    }
714
715    #[crate::ctb_test]
716    fn test_dca_from_format_ascii() {
717        let mut state = EiteState::new();
718        let input = b"ABC";
719        let (res, log) =
720            dca_from_format(&mut state, &Format::ASCII, input).unwrap();
721        assert_eq!(res, vec![50, 51, 52]);
722    }
723    #[crate::ctb_test]
724    fn test_dca_to_format_ascii() {
725        let mut state = EiteState::new();
726        let input = vec![50, 51, 52];
727        assert_vec_u8_ok_eq_no_warnings(
728            b"ABC",
729            dca_to_format(
730                &mut state,
731                &Format::ASCII,
732                &input,
733                &PrefilterSettings::default(),
734            ),
735        );
736    }
737    #[crate::ctb_test]
738    fn test_convert_formats_ascii_to_utf8() {
739        let mut state = EiteState::new();
740        let input = b"ABC";
741        assert_vec_u8_ok_eq_no_warnings(
742            b"ABC",
743            convert_formats(
744                &mut state,
745                &Format::ASCII,
746                &Format::utf8_default(),
747                input,
748                &PrefilterSettings::default(),
749            ),
750        );
751    }
752    #[crate::ctb_test]
753    fn test_get_export_extension_ascii() {
754        let ext = get_export_extension("ascii").unwrap();
755        assert!(ext.ends_with(".txt"));
756    }
757    #[crate::ctb_test]
758    fn test_dc_to_format_utf8() {
759        let (out, log) = dc_to_format("utf8", 65).unwrap();
760        assert_eq!(out, b"P");
761        assert!(log.has_no_warnings_or_errors());
762    }
763    #[crate::ctb_test]
764    fn test_dc_from_format_ascii() {
765        let (res, log) = dc_from_format("ascii", b"A").unwrap();
766        assert_eq!(1, res.len());
767    }
768    #[crate::ctb_test]
769    fn test_import_warning_add() {
770        let mut state = EiteState::new();
771        import_warning(&mut state, 0, "test");
772        assert_eq!(state.import_deferred_settings_stack.len(), 1);
773    }
774    #[crate::ctb_test]
775    fn test_export_warning_add() {
776        let mut state = EiteState::new();
777        export_warning(&mut state, 0, "test");
778        assert_eq!(state.export_deferred_settings_stack.len(), 1);
779    }
780    #[crate::ctb_test]
781    fn test_get_import_warnings_clear() {
782        let mut state = EiteState::new();
783        import_warning(&mut state, 0, "test");
784        let _ = get_import_warnings(&mut state);
785        assert!(state.import_deferred_settings_stack.is_empty());
786    }
787    #[crate::ctb_test]
788    fn test_get_export_warnings_clear() {
789        let mut state = EiteState::new();
790        export_warning(&mut state, 0, "test");
791        let _ = get_export_warnings(&mut state);
792        assert!(state.export_deferred_settings_stack.is_empty());
793    }
794    #[crate::ctb_test]
795    fn test_export_warning_unmappable() {
796        let mut state = EiteState::new();
797        export_warning_unmappable(&mut state, 0, 999);
798        assert_eq!(state.export_deferred_settings_stack.len(), 1);
799    }
800    #[crate::ctb_test]
801    fn test_list_formats() {
802        let formats = list_formats();
803        assert!(formats.contains(&"utf8".to_string()));
804        assert!(formats.contains(&"ascii".to_string()));
805        assert!(formats.contains(&"vt100".to_string()));
806    }
807    #[crate::ctb_test]
808    fn test_is_format_true_false() {
809        assert!(is_format("utf8"));
810        assert!(!is_format("not_a_format"));
811    }
812    #[crate::ctb_test]
813    fn test_list_input_formats() {
814        let formats = list_input_formats();
815        assert!(formats.contains(&"semanticToText".to_string()));
816    }
817    #[crate::ctb_test]
818    fn test_is_supported_input_format_true_false() {
819        assert!(is_supported_input_format("utf8"));
820        assert!(!is_supported_input_format("not_a_format"));
821    }
822    #[crate::ctb_test]
823    fn test_list_internal_formats() {
824        let formats = list_internal_formats();
825        assert!(formats.contains(&"dc".to_string()));
826        assert!(!formats.contains(&"utf8".to_string()));
827    }
828    #[crate::ctb_test]
829    fn test_is_supported_internal_format_true_false() {
830        assert!(is_supported_internal_format("unicode"));
831        assert!(!is_supported_internal_format("not_a_format"));
832    }
833    #[crate::ctb_test]
834    fn test_list_output_formats() {
835        let formats = list_output_formats();
836        assert!(formats.contains(&"asciiSafeSubset".to_string()));
837    }
838    #[crate::ctb_test]
839    fn test_is_supported_output_format() {
840        assert!(is_supported_output_format("utf8"));
841        assert!(is_supported_output_format("asciiSafeSubset"));
842        assert!(!is_supported_output_format("sems")); // not sufficient support
843        assert!(!is_supported_output_format("not_a_format")); // not known
844    }
845    #[crate::ctb_test]
846    fn test_list_data_types() {
847        let types = list_data_types();
848        assert!(types.contains(&"basenb".to_string()));
849    }
850    #[crate::ctb_test]
851    fn test_list_variants_for_format() {
852        let res = list_variants_for_format("utf8");
853        assert!(res.unwrap().contains(&"dcBasenb".to_string()));
854    }
855    #[crate::ctb_test]
856    fn test_get_format_id() {
857        let id = get_format_id("utf8").unwrap();
858        assert_eq!(id, 0);
859        let id = get_format_id("sems").unwrap();
860        assert_eq!(id, 6);
861    }
862    #[crate::ctb_test]
863    fn test_normalize_format() {
864        let norm = normalize_format("utf8").unwrap();
865        assert_eq!(norm, "unicode");
866        let norm = normalize_format("sems").unwrap();
867        assert_eq!(norm, "sems");
868        assert!(normalize_format("not_a_format").is_err());
869    }
870    #[crate::ctb_test]
871    fn test_get_format_name_utf8() {
872        let name = get_format_name("utf8").unwrap();
873        assert_eq!(name, "utf8");
874    }
875    #[crate::ctb_test]
876    fn test_get_format_extension_utf8() {
877        let ext = get_format_extension("utf8").unwrap();
878        assert_eq!(ext, "utf8");
879    }
880    #[crate::ctb_test]
881    fn test_get_format_import_support_utf8() {
882        let val = get_format_import_support("utf8").unwrap();
883        assert_eq!(val, 1);
884    }
885    #[crate::ctb_test]
886    fn test_get_format_export_support_utf8() {
887        let val = get_format_export_support("utf8").unwrap();
888        assert_eq!(val, 2);
889    }
890    #[crate::ctb_test]
891    fn test_get_format_tests_status_utf8() {
892        let val = get_format_tests_status("utf8").unwrap();
893        assert_eq!(val, 1);
894    }
895    #[crate::ctb_test]
896    fn test_get_format_type_utf8() {
897        let typ = get_format_type("utf8").unwrap();
898        assert_eq!(typ, "encoding");
899    }
900    #[crate::ctb_test]
901    fn test_get_format_label_utf8() {
902        let label = get_format_label("utf8").unwrap();
903        assert_eq!(label, "UTF-8");
904    }
905
906    #[crate::ctb_test]
907    fn test_get_format_variant_types_utf8() {
908        let types = get_format_variant_types("utf8").unwrap();
909        assert_eq!(types, vec!["unicodePua"]);
910    }
911
912    #[crate::ctb_test]
913    fn test_get_format_variant_types_empty() {
914        let types = get_format_variant_types("unknown_format");
915        assert!(types.is_err()); // should error for unknown format
916    }
917
918    #[crate::ctb_test]
919    fn test_format_is_variant() {
920        assert!(format_is_variant("dcBasenb").unwrap());
921        assert!(!format_is_variant("utf8").unwrap());
922    }
923
924    #[crate::ctb_test]
925    fn test_is_variant_type_known_and_unknown() {
926        assert!(is_variant_type("encoding"));
927        assert!(is_variant_type("unicodePua"));
928        assert!(!is_variant_type("v:unicodePua")); // I guess
929        assert!(!is_variant_type("foobar"));
930    }
931
932    #[crate::ctb_test]
933    fn test_format_get_variant_type_success_and_failure() {
934        // Success
935        assert_eq!(format_get_variant_type("dcBasenb").unwrap(), "unicodePua");
936        // Failure
937        assert!(format_get_variant_type("utf8").is_err());
938        assert!(format_get_variant_type("unicodePua").is_err());
939        assert!(format_get_variant_type("v:unicodePua").is_err());
940    }
941
942    #[crate::ctb_test]
943    fn test_format_supports_variant_type_true_and_false() {
944        assert!(format_supports_variant_type("utf8", "unicodePua").unwrap());
945        assert!(!format_supports_variant_type("utf8", "v:unicodePua").unwrap()); // I guess
946        assert!(!format_supports_variant_type("utf8", "encoding").unwrap());
947    }
948
949    #[crate::ctb_test]
950    fn test_format_supports_variant_utf8() {
951        let res = format_supports_variant("utf8", "utf8");
952        assert!(res.is_err(), "utf8 is not a variant of utf8");
953        let res = format_supports_variant("html", "utf8");
954        assert!(res.is_err(), "utf8 is a variant of html");
955        let res = format_supports_variant("utf8", "v:dcBasenb");
956        assert!(res.is_err()); // false, I guess
957        let res = format_supports_variant("utf8", "dcBasenb");
958        assert!(res.unwrap()); // true
959    }
960
961    #[crate::ctb_test]
962    fn test_get_format_comments_utf8() {
963        let comments = get_format_comments("ascii").unwrap();
964        assert_eq!(comments, "7-bit ASCII. Variants: Line ending variants");
965    }
966
967    #[crate::ctb_test]
968    fn test_get_format_metrics_type_utf8() {
969        let typ = get_format_metrics_type("utf8").unwrap();
970        assert_eq!(typ, "character");
971        let typ = get_format_metrics_type("unicode").unwrap();
972        assert_eq!(typ, "internal-unicode");
973        let typ = get_format_metrics_type("dcBasenb").unwrap();
974        assert_eq!(typ, "complex-dcBasenb");
975    }
976}