ctoolbox/
formats.rs

1use crate::formats::eite::formats::utf8::UTF8FormatSettings;
2use crate::formats::eite::formats::utf8::dca_to_utf8;
3use crate::formats::utf_8e_128::decode_utf_8e_128;
4use crate::formats::utf_8e_128::encode_utf_8e_128_buf;
5use crate::utilities::*;
6use std::collections::HashMap;
7use std::iter;
8use uuid::Uuid;
9
10pub mod base16b;
11pub mod base64;
12pub mod eite;
13pub mod html;
14pub mod ip;
15pub mod markdown;
16pub mod multipart;
17pub mod troff;
18pub mod unicode;
19pub mod utf8;
20pub mod utf_8e_128;
21pub mod wtf8;
/// Collects the errors, warnings, and debug messages produced while a
/// document is being processed, and remembers the order they were logged in.
#[derive(Default, Debug)]
pub struct FormatLog {
    /// Messages recorded through `error`, in the order they were added.
    errors: Vec<String>,
    /// Messages recorded through `warn`, in the order they were added.
    warnings: Vec<String>,
    /// Messages recorded through `debug`, in the order they were added.
    debug_messages: Vec<String>,
    /// Stores the order and type of all log entries, so that formatting can preserve log order and type.
    /// Each entry pairs a `LogType` with an index into the matching vector above.
    log_order: Vec<(LogType, usize)>,
}
30
/// Tracks the type of log entry.
///
/// Used in `FormatLog::log_order` to say which message vector an index
/// refers to.
#[derive(Copy, Clone, Debug)]
enum LogType {
    /// The index refers to `FormatLog::errors`.
    Error,
    /// The index refers to `FormatLog::warnings`.
    Warning,
    /// The index refers to `FormatLog::debug_messages`.
    Debug,
}
38
39impl FormatLog {
40    /// Record a serious error that may indicate the document could not be fully processed.
41    pub fn error(&mut self, message: &str) {
42        #[cfg(debug_assertions)]
43        crate::debug!("FormatLog error: {}", message);
44        self.errors.push(message.to_string());
45        self.log_order.push((LogType::Error, self.errors.len() - 1));
46    }
47
48    pub fn warn(&mut self, message: &str) {
49        #[cfg(debug_assertions)]
50        crate::debug!("FormatLog warn: {}", message);
51        self.warnings.push(message.to_string());
52        self.log_order
53            .push((LogType::Warning, self.warnings.len() - 1));
54    }
55
56    #[cfg(not(debug_assertions))]
57    pub fn debug(&mut self, message: &str) {}
58
59    #[cfg(debug_assertions)]
60    pub fn debug(&mut self, message: &str) {
61        #[cfg(debug_assertions)]
62        crate::debug!("FormatLog debug: {}", message);
63        self.debug_messages.push(message.to_string());
64        self.log_order
65            .push((LogType::Debug, self.debug_messages.len() - 1));
66    }
67
68    pub fn get_errors(&self) -> Vec<String> {
69        self.errors.clone()
70    }
71
72    pub fn get_warnings(&self) -> Vec<String> {
73        self.warnings.clone()
74    }
75
76    pub fn has_errors(&self) -> bool {
77        !self.errors.is_empty()
78    }
79
80    pub fn has_no_errors(&self) -> bool {
81        !self.has_errors()
82    }
83
84    pub fn has_warnings(&self) -> bool {
85        !self.warnings.is_empty()
86    }
87
88    pub fn has_debug_messages(&self) -> bool {
89        !self.debug_messages.is_empty()
90    }
91
92    pub fn has_any(&self) -> bool {
93        self.has_errors() || self.has_warnings() || self.has_debug_messages()
94    }
95
96    pub fn has_warnings_or_errors(&self) -> bool {
97        self.has_warnings() || self.has_errors()
98    }
99
100    pub fn has_no_warnings_or_errors(&self) -> bool {
101        !self.has_warnings_or_errors()
102    }
103
104    /// Add an import error at a specific character index and problem description.
105    pub fn import_error(&mut self, index: u64, problem: &str) {
106        let error = format!(
107            "An unrecoverable problem was encountered while importing at character {index}: {problem}"
108        );
109        self.error(error.as_str());
110    }
111
112    /// Add an import warning for a specific character index and problem description.
113    pub fn import_warning(&mut self, index: u64, problem: &str) {
114        let warn = format!(
115            "A problem was encountered while importing at character {index}: {problem}"
116        );
117        self.warn(warn.as_str());
118    }
119
120    /// Add an export error at a specific character index and problem description.
121    pub fn export_error(&mut self, index: u64, problem: &str) {
122        let error = format!(
123            "An unrecoverable problem was encountered while exporting at character {index}: {problem}"
124        );
125        self.error(error.as_str());
126    }
127
128    /// Add an export warning for a specific character index and problem description.
129    pub fn export_warning(&mut self, index: u64, problem: &str) {
130        let warn = format!(
131            "A problem was encountered while exporting at character {index}: {problem}"
132        );
133        self.warn(warn.as_str());
134    }
135
136    pub fn export_warning_unmappable(
137        &mut self,
138        index: u64,
139        problem_dc: u32,
140        format: &str,
141    ) {
142        self.export_warning(index, format!("The character {problem_dc} could not be represented in the chosen export format ({format}).").as_str());
143    }
144
145    pub fn merge(&mut self, other: &FormatLog) {
146        let error_offset = self.errors.len();
147        let warning_offset = self.warnings.len();
148        let debug_offset = self.debug_messages.len();
149
150        self.errors.extend(other.errors.clone());
151        self.warnings.extend(other.warnings.clone());
152        self.debug_messages.extend(other.debug_messages.clone());
153
154        for &(typ, idx) in &other.log_order {
155            let adjusted_idx = match typ {
156                LogType::Error => idx + error_offset,
157                LogType::Warning => idx + warning_offset,
158                LogType::Debug => idx + debug_offset,
159            };
160            self.log_order.push((typ, adjusted_idx));
161        }
162    }
163
164    /// Formats all log messages in the order they were logged, with proper prefixing.
165    pub fn format_all(&self) -> String {
166        if !self.has_any() {
167            return String::new();
168        }
169        let mut output = String::new();
170        output.push_str("Messages during format processing:\n");
171        // idx are not consecutive if printed, this uses them to encode the sort
172        // order, I think. They're consecutive w/i each message type.
173        for &(typ, idx) in &self.log_order {
174            match typ {
175                LogType::Error => {
176                    output.push_str("* [ERROR] ");
177                    output.push_str(&self.errors[idx]);
178                }
179                LogType::Warning => {
180                    output.push_str("- [WARNING] ");
181                    output.push_str(&self.warnings[idx]);
182                }
183                LogType::Debug => {
184                    output.push_str("- [DEBUG] ");
185                    output.push_str(&self.debug_messages[idx]);
186                }
187            }
188            output.push('\n');
189        }
190        output
191    }
192
193    pub fn format_errors(&self) -> String {
194        let mut errors = String::new();
195        if self.has_errors() {
196            for e in &self.errors {
197                errors.push_str("- ");
198                errors.push_str(e);
199                errors.push('\n');
200            }
201            format!("Errors during format processing:\n{errors}")
202        } else {
203            String::new()
204        }
205    }
206
207    pub fn format_warnings(&self) -> String {
208        let mut warnings = String::new();
209        if self.has_warnings() {
210            for w in &self.warnings {
211                warnings.push_str("- ");
212                warnings.push_str(w);
213                warnings.push('\n');
214            }
215            format!("Warnings during format processing:\n{warnings}")
216        } else {
217            String::new()
218        }
219    }
220
221    pub fn format_debug(&self) -> String {
222        let mut debug = String::new();
223        if self.debug_messages.is_empty() {
224            String::new()
225        } else {
226            for d in &self.debug_messages {
227                debug.push_str("- ");
228                debug.push_str(d);
229                debug.push('\n');
230            }
231            format!("Debug messages during format processing:\n{debug}")
232        }
233    }
234
235    pub fn auto_log(&self) {
236        if self.has_any() {
237            error!(self.format_errors());
238            warn!(self.format_warnings());
239            debug!(self.format_debug());
240        } else {
241            info!("No errors or warnings during format processing.");
242        }
243    }
244}
245
246impl std::fmt::Display for FormatLog {
247    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
248        if self.has_any() {
249            writeln!(f, "{}", self.format_all())?;
250        } else {
251            writeln!(f, "No errors or warnings during format processing.")?;
252        }
253        Ok(())
254    }
255}
256
257pub fn string_result_with_log_to_vec(
258    result: Result<(String, FormatLog)>,
259) -> Result<(Vec<u8>, FormatLog)> {
260    result.map(|res| {
261        let result_bytes = res.0.into_bytes();
262        (result_bytes, res.1)
263    })
264}
265
266pub fn get_format_uuids<'a>() -> HashMap<Vec<u8>, Vec<u8>> {
267    HashMap::from([(
268        strtovec("9ba60c52-9cf8-41a7-b3ea-7a1e14f6c5d7"),
269        strtovec("html"),
270    )])
271}
272
273pub fn get_format_from_uuid(document: Vec<u8>) -> Option<Vec<u8>> {
274    let head: Vec<u8>;
275    if document.len() < 36 {
276        head = document;
277    } else {
278        head = document[..36].to_vec();
279    }
280    let uuid = get_uuid_from_document(head);
281    uuid.as_ref()?;
282
283    Some(get_format_uuids()[&uuid.expect("checked earlier")].clone())
284}
285
286pub fn get_uuid_from_document(document: Vec<u8>) -> Option<Vec<u8>> {
287    if document.len() < 16 {
288        return None;
289    }
290
291    let uuid_binary = Uuid::from_slice(&document[..16])
292        .expect("The length should be 16")
293        .hyphenated()
294        .to_string()
295        .into_bytes();
296    // let uuid_utf8=String::from_utf8_lossy(&document);
297    let uuid_string = String::from_utf8_lossy(&document[..36])
298        .to_string()
299        .into_bytes();
300
301    let formats = get_format_uuids();
302
303    if formats.contains_key(&uuid_binary) {
304        return Some(uuid_binary);
305    } else if formats.contains_key(&uuid_string) {
306        return Some(uuid_string);
307    }
308
309    None
310}
311
/// Placeholder for format conversion: currently returns `document` unchanged.
pub fn convert_if_needed(document: Vec<u8>) -> Vec<u8> {
    // TODO: detect the document's format and convert it when required.
    document
}
317
/// Placeholder for converting `document` from the format named by
/// `filetype`: currently returns `document` unchanged and ignores `filetype`.
pub fn convert_from(document: Vec<u8>, filetype: Vec<u8>) -> Vec<u8> {
    // TODO: dispatch on `filetype`; until then the argument is unused.
    let _ = filetype;
    document
}
323
324pub fn sanitize_html(document: Vec<u8>) -> Vec<u8> {
325    let mut builder = ammonia::Builder::default();
326
327    builder.add_generic_attributes(iter::once("class").chain(iter::once("id")));
328
329    builder
330        .clean(&String::from_utf8_lossy(&document))
331        .to_string()
332        .into_bytes()
333}
334
335// Converts dctext to dcutf
336pub fn dctext_to_dcutf(document: Vec<u8>) -> Vec<u8> {
337    // Format looks like (w/o backticks): `Unicode (UTF-8) text @123@miesu@214748364@@L662@`
338    // where `Unicode text` is actual unicode text, and between each pair of @ signs, is a DcId. A DcId can be any int 128 bits (u128) in decimal, and it may have an `L` prefix.
339    // Output format is sort of UTF-8 text. For normal Unicode input characters, the output character is the same. For DcIds less than or equal to 1114111 (the largest Unicode character, I believe), the output character is the corresponding "generalized UTF-8", the numeric value encoded in the same underlying algorithm as UTF-8. For DcIds greater than 1114111 and not prefixed with an L, the output character is the decimal DcId represented by extending the usual algorithm of UTF-8 encoding, but for those larger numbers. For DcIds prefixed with an L, the output is equivalent to @1114408@ followed by the Dc that followed the L (the L is just a shorthand for that 1114408 Dc). That is to say, it's not a true Unicode encoding, it's simply using an extension of the algorithm underlying UTF-8 as a convenient encoding of ints.
340    let document = String::from_utf8(document).unwrap();
341    let mut output = Vec::new();
342    for line in document.lines() {
343        let mut rest = line;
344        while let Some(start) = rest.find('@') {
345            // Output text before @ as plain UTF-8
346            output.extend_from_slice(&rest.as_bytes()[..start]);
347            rest = &rest[start + 1..];
348            if let Some(end) = rest.find('@') {
349                let token = &rest[..end];
350                let mut dcid_str = token;
351                let mut l_prefix = false;
352                if dcid_str.is_empty() {
353                    dcid_str = "64"; // @ sign
354                }
355                if dcid_str.starts_with('L') {
356                    l_prefix = true;
357                    dcid_str = &dcid_str[1..];
358                }
359                if let Ok(dcid) = dcid_str.parse::<u128>() {
360                    let mut buf = [0u8; 24];
361                    if l_prefix {
362                        // Output as UTF-8 codepoint 1114408, then the dcid
363                        let n1 = encode_utf_8e_128_buf(&mut buf, 1114408);
364                        output.extend_from_slice(&buf[..n1]);
365                        let n2 = encode_utf_8e_128_buf(&mut buf, dcid);
366                        output.extend_from_slice(&buf[..n2]);
367                    } else {
368                        let n = encode_utf_8e_128_buf(&mut buf, dcid);
369                        output.extend_from_slice(&buf[..n]);
370                    }
371                }
372                rest = &rest[end + 1..];
373            } else {
374                // No matching @, output the rest and break
375                output.extend_from_slice(rest.as_bytes());
376                break;
377            }
378        }
379        // Output any remaining text
380        output.extend_from_slice(rest.as_bytes());
381    }
382
383    output
384}
385
386// Converts dcutf to dctext
387pub fn dcutf_to_dctext(document: Vec<u8>) -> Vec<u8> {
388    let mut output = String::new();
389    let mut i = 0;
390    while i < document.len() {
391        if let Some((codepoint, size)) = decode_utf_8e_128(&document[i..]) {
392            if codepoint == 64 {
393                output.push_str("@@");
394            } else if codepoint <= 0x10FFFF {
395                // Normal Unicode character
396                if let Some(ch) = std::char::from_u32(
397                    u32::try_from(codepoint).expect("Checked range already"),
398                ) {
399                    output.push(ch);
400                } else {
401                    output.push_str(format!("@{codepoint}@").as_str());
402                }
403            } else if codepoint == 1114408 {
404                // L-prefixed DcId follows
405                i += size;
406                if let Some((next_dc, next_size)) =
407                    decode_utf_8e_128(&document[i..])
408                {
409                    output.push_str(format!("@L{next_dc}@").as_str());
410                    i += next_size;
411                    continue;
412                }
413                output.push_str("@1114408@");
414            } else {
415                // Generalized DcId
416                output.push_str(format!("@{codepoint}@").as_str());
417            }
418            i += size;
419        } else {
420            // Not valid UTF-8, output as is
421            output.push(char::from(document[i]));
422            i += 1;
423        }
424    }
425    output.into_bytes()
426}
427
/// Extension trait that gives `char` an `as_utf8_bytes()` convenience method.
pub trait CharUtfBytesExt {
    /// Returns the UTF-8 encoding of `self` as an owned byte vector.
    fn as_utf8_bytes(&self) -> Vec<u8>;
}

impl CharUtfBytesExt for char {
    /// Like `encode_utf8`, but returns an owned `Vec<u8>`: more convenient,
    /// at the cost of an allocation and a copy.
    fn as_utf8_bytes(&self) -> Vec<u8> {
        // Four bytes is the longest UTF-8 encoding of a single `char`.
        let mut scratch = [0u8; 4];
        let encoded: &str = self.encode_utf8(&mut scratch);
        Vec::from(encoded.as_bytes())
    }
}
441
442// Test helpers
443pub fn assert_vec_u8_ok_eq_no_warnings(
444    expected: &[u8],
445    actual: Result<(Vec<u8>, FormatLog)>,
446) -> Vec<u8> {
447    let (actual_bytes, log) = actual.unwrap();
448    assert!(
449        log.has_no_warnings_or_errors(),
450        "Warnings or errors found:\n{}",
451        log.format_all()
452    );
453    assert_vec_u8_eq_log(expected, &actual_bytes, &log);
454    actual_bytes
455}
456
457pub fn assert_vec_u8_ok_eq_no_errors(
458    expected: &[u8],
459    actual: Result<(Vec<u8>, FormatLog)>,
460) -> (Vec<u8>, FormatLog) {
461    let (actual_bytes, log) = actual.unwrap();
462
463    assert!(log.has_no_errors(), "Errors found:\n{}", log.format_all());
464    assert_vec_u8_eq_log(expected, &actual_bytes, &log);
465    (actual_bytes, log)
466}
467
468fn _format_dcs_for_log(expected: &[u32], actual: &[u32]) -> String {
469    let (utf8_expected, _) =
470        dca_to_utf8(expected, &UTF8FormatSettings::default())
471            .unwrap_or_default();
472    let (utf8_actual, _) =
473        dca_to_utf8(actual, &UTF8FormatSettings::default()).unwrap_or_default();
474
475    format!(
476        "Expected formatted Dcs: {}\nActual formatted Dcs: {}",
477        String::from_utf8_lossy(&utf8_expected),
478        String::from_utf8_lossy(&utf8_actual)
479    )
480}
481
482fn _assert_vec_u32_ok_eq_log(
483    expected: &[u32],
484    actual: Result<(Vec<u32>, FormatLog)>,
485    print_dcs: bool,
486    disallow_warnings: bool,
487) -> (Vec<u32>, FormatLog) {
488    let (actual_vec, log) = actual.unwrap();
489
490    let mut log_problem_type = "Errors";
491    if disallow_warnings {
492        log_problem_type = "Warnings or errors";
493    }
494    let mut message =
495        format!("{log_problem_type} found:\n{}", log.format_all());
496
497    if print_dcs {
498        message.push_str(&_format_dcs_for_log(expected, &actual_vec));
499    }
500
501    if disallow_warnings {
502        assert!(log.has_no_warnings_or_errors(), "{message}");
503    } else {
504        assert!(log.has_no_errors(), "{message}");
505    }
506
507    if print_dcs {
508        assert_vec_dc_eq_log(expected, &actual_vec, &log);
509    } else {
510        assert_vec_u32_eq_log(expected, &actual_vec, &log);
511    }
512    (actual_vec, log)
513}
514
515pub fn assert_vec_u32_ok_eq_no_warnings(
516    expected: &[u32],
517    actual: Result<(Vec<u32>, FormatLog)>,
518) -> (Vec<u32>, FormatLog) {
519    _assert_vec_u32_ok_eq_log(expected, actual, false, true)
520}
521
522pub fn assert_vec_dc_ok_eq_no_warnings(
523    expected: &[u32],
524    actual: Result<(Vec<u32>, FormatLog)>,
525) -> (Vec<u32>, FormatLog) {
526    _assert_vec_u32_ok_eq_log(expected, actual, true, true)
527}
528
529pub fn assert_vec_u32_ok_eq_no_errors(
530    expected: &[u32],
531    actual: Result<(Vec<u32>, FormatLog)>,
532) -> (Vec<u32>, FormatLog) {
533    _assert_vec_u32_ok_eq_log(expected, actual, false, false)
534}
535
536pub fn assert_vec_dc_ok_eq_no_errors(
537    expected: &[u32],
538    actual: Result<(Vec<u32>, FormatLog)>,
539) -> (Vec<u32>, FormatLog) {
540    _assert_vec_u32_ok_eq_log(expected, actual, true, false)
541}
542
543/// Equivalent to `assert_vec_u32_eq`, but prints the provided log on failure
544pub fn assert_vec_u32_eq_log(
545    expected: &[u32],
546    actual: &[u32],
547    log: &FormatLog,
548) {
549    _assert_vec_u32_eq_log(expected, actual, log, false);
550}
551
552pub fn assert_vec_dc_eq_log(expected: &[u32], actual: &[u32], log: &FormatLog) {
553    _assert_vec_u32_eq_log(expected, actual, log, true);
554}
555
556fn _assert_vec_u32_eq_log(
557    expected: &[u32],
558    actual: &[u32],
559    log: &FormatLog,
560    print_dcs: bool,
561) {
562    let mut message = format!(
563        "Vectors (u32) differ.\n{}\nLog:      {}",
564        fmt_mismatch_vec_u32(expected, actual),
565        log.format_all()
566    );
567
568    if print_dcs {
569        message.push_str(&_format_dcs_for_log(expected, actual));
570    }
571
572    assert_eq!(expected, actual, "{message}");
573}
574
/// Equivalent to `assert_vec_u8_eq`, but prints the provided log on failure.
///
/// The failure message contains the output of `fmt_mismatch_vec_u8` for the
/// two slices followed by `log.format_all()`. Both are only evaluated when
/// the comparison fails, because they are `assert_eq!` format arguments.
pub fn assert_vec_u8_eq_log(expected: &[u8], actual: &[u8], log: &FormatLog) {
    assert_eq!(
        expected,
        actual,
        "Vectors (u8) differ.\n{}\nLog:      {}",
        fmt_mismatch_vec_u8(expected, actual),
        log.format_all()
    );
}
585
586pub fn assert_string_eq(expected: &str, actual: String) -> String {
587    let actual_string = actual;
588
589    assert_eq!(
590        expected,
591        &actual_string,
592        "Strings differ.\n{}",
593        fmt_mismatch_string(expected, &actual_string),
594    );
595    actual_string
596}
597
598pub fn assert_string_ok_eq(expected: &str, actual: Result<String>) -> String {
599    let actual_string = actual.unwrap();
600
601    assert_eq!(
602        expected,
603        &actual_string,
604        "Strings differ.\n{}",
605        fmt_mismatch_string(expected, &actual_string),
606    );
607    actual_string
608}
609
610pub fn assert_string_ok_eq_no_warnings(
611    expected: &str,
612    actual: Result<(String, FormatLog)>,
613) -> (String, FormatLog) {
614    let (actual_string, log) = actual.unwrap();
615
616    assert!(
617        log.has_no_warnings_or_errors(),
618        "Warnings or errors found:\n{}",
619        log.format_all()
620    );
621    assert_eq!(
622        expected,
623        &actual_string,
624        "Strings differ.\n{}\nLog:      {}",
625        fmt_mismatch_string(expected, &actual_string),
626        log.format_all()
627    );
628    (actual_string, log)
629}
630
631pub fn assert_string_ok_eq_no_errors(
632    expected: &str,
633    actual: Result<(String, FormatLog)>,
634) -> (String, FormatLog) {
635    let (actual_string, log) = actual.unwrap();
636
637    assert!(log.has_no_errors(), "Errors found:\n{}", log.format_all());
638    assert_eq!(
639        expected,
640        &actual_string,
641        "Strings differ.\n{}\nLog:      {}",
642        fmt_mismatch_string(expected, &actual_string),
643        log.format_all()
644    );
645    (actual_string, log)
646}
647
#[cfg(test)]
mod tests {
    use super::*;

    /// Converts a dctext sample to dcutf, checks the exact bytes, then
    /// converts back and checks the normalized dctext.
    #[crate::ctb_test]
    fn test_dctext_to_dcutf() {
        let text = "hi @64@ @@ @65@ @128@ there 🥴 @L42@ noncharacter @1114111@ surrogate @56191@ unicode null @0@ dc null @1114112@ @2147483648@ 2^128-1 @340282366920938463463374607431768211455@";
        let dcutf = dctext_to_dcutf(text.as_bytes().to_vec());
        assert_eq!(
            "686920402040204120c28020746865726520f09fa5b420ff84849084a82a206e6f6e63686172616374657220f48fbfbf20737572726f6761746520edadbf20756e69636f6465206e756c6c2000206463206e756c6c20ff848490808020ff8682808080808020325e3132382d3120ff9683bfbfbfbfbfbfbfbfbfbfbfbfbfbfbfbfbfbfbfbfbf",
            vectohex(&dcutf)
        );

        let roundtrip = dcutf_to_dctext(dcutf);
        let roundtrip_str = String::from_utf8(roundtrip).unwrap();

        // The round trip is not byte-identical to the input: DcIds that are
        // valid Unicode scalar values come back as literal characters, and
        // `@64@` comes back as the `@@` escape.
        let expected_roundtrip = "hi @@ @@ A \u{80} there 🥴 @L42@ noncharacter \u{10FFFF} surrogate @56191@ unicode null \u{0} dc null @1114112@ @2147483648@ 2^128-1 @340282366920938463463374607431768211455@";
        // `assert_eq!` prints both strings on failure.
        assert_eq!(expected_roundtrip, roundtrip_str);
    }

    /// A document starting with the textual UUID yields that UUID.
    #[crate::ctb_test]
    fn can_get_uuid_from_document() {
        assert_eq!(
            strtovec("9ba60c52-9cf8-41a7-b3ea-7a1e14f6c5d7"),
            get_uuid_from_document(strtovec(
                "9ba60c52-9cf8-41a7-b3ea-7a1e14f6c5d7<html>"
            ))
            .unwrap()
        );
    }

    /// A document starting with the html UUID is identified as `html`.
    #[crate::ctb_test]
    fn can_get_format_from_uuid() {
        assert_eq!(
            strtovec("html"),
            get_format_from_uuid(strtovec(
                "9ba60c52-9cf8-41a7-b3ea-7a1e14f6c5d7<html>"
            ))
            .unwrap()
        );
    }
}