1use crate::formats::eite::formats::utf8::UTF8FormatSettings;
2use crate::formats::eite::formats::utf8::dca_to_utf8;
3use crate::formats::utf_8e_128::decode_utf_8e_128;
4use crate::formats::utf_8e_128::encode_utf_8e_128_buf;
5use crate::utilities::*;
6use std::collections::HashMap;
7use std::iter;
8use uuid::Uuid;
9
10pub mod base16b;
11pub mod base64;
12pub mod eite;
13pub mod html;
14pub mod ip;
15pub mod markdown;
16pub mod multipart;
17pub mod troff;
18pub mod unicode;
19pub mod utf8;
20pub mod utf_8e_128;
21pub mod wtf8;
22#[derive(Default, Debug)]
23pub struct FormatLog {
24 errors: Vec<String>,
25 warnings: Vec<String>,
26 debug_messages: Vec<String>,
27 log_order: Vec<(LogType, usize)>,
29}
30
/// Category of a message stored in a `FormatLog`.
#[derive(Debug, Clone, Copy)]
enum LogType {
    /// The message lives in the error list.
    Error,
    /// The message lives in the warning list.
    Warning,
    /// The message lives in the debug-message list.
    Debug,
}
38
39impl FormatLog {
40 pub fn error(&mut self, message: &str) {
42 #[cfg(debug_assertions)]
43 crate::debug!("FormatLog error: {}", message);
44 self.errors.push(message.to_string());
45 self.log_order.push((LogType::Error, self.errors.len() - 1));
46 }
47
48 pub fn warn(&mut self, message: &str) {
49 #[cfg(debug_assertions)]
50 crate::debug!("FormatLog warn: {}", message);
51 self.warnings.push(message.to_string());
52 self.log_order
53 .push((LogType::Warning, self.warnings.len() - 1));
54 }
55
56 #[cfg(not(debug_assertions))]
57 pub fn debug(&mut self, message: &str) {}
58
59 #[cfg(debug_assertions)]
60 pub fn debug(&mut self, message: &str) {
61 #[cfg(debug_assertions)]
62 crate::debug!("FormatLog debug: {}", message);
63 self.debug_messages.push(message.to_string());
64 self.log_order
65 .push((LogType::Debug, self.debug_messages.len() - 1));
66 }
67
68 pub fn get_errors(&self) -> Vec<String> {
69 self.errors.clone()
70 }
71
72 pub fn get_warnings(&self) -> Vec<String> {
73 self.warnings.clone()
74 }
75
76 pub fn has_errors(&self) -> bool {
77 !self.errors.is_empty()
78 }
79
80 pub fn has_no_errors(&self) -> bool {
81 !self.has_errors()
82 }
83
84 pub fn has_warnings(&self) -> bool {
85 !self.warnings.is_empty()
86 }
87
88 pub fn has_debug_messages(&self) -> bool {
89 !self.debug_messages.is_empty()
90 }
91
92 pub fn has_any(&self) -> bool {
93 self.has_errors() || self.has_warnings() || self.has_debug_messages()
94 }
95
96 pub fn has_warnings_or_errors(&self) -> bool {
97 self.has_warnings() || self.has_errors()
98 }
99
100 pub fn has_no_warnings_or_errors(&self) -> bool {
101 !self.has_warnings_or_errors()
102 }
103
104 pub fn import_error(&mut self, index: u64, problem: &str) {
106 let error = format!(
107 "An unrecoverable problem was encountered while importing at character {index}: {problem}"
108 );
109 self.error(error.as_str());
110 }
111
112 pub fn import_warning(&mut self, index: u64, problem: &str) {
114 let warn = format!(
115 "A problem was encountered while importing at character {index}: {problem}"
116 );
117 self.warn(warn.as_str());
118 }
119
120 pub fn export_error(&mut self, index: u64, problem: &str) {
122 let error = format!(
123 "An unrecoverable problem was encountered while exporting at character {index}: {problem}"
124 );
125 self.error(error.as_str());
126 }
127
128 pub fn export_warning(&mut self, index: u64, problem: &str) {
130 let warn = format!(
131 "A problem was encountered while exporting at character {index}: {problem}"
132 );
133 self.warn(warn.as_str());
134 }
135
136 pub fn export_warning_unmappable(
137 &mut self,
138 index: u64,
139 problem_dc: u32,
140 format: &str,
141 ) {
142 self.export_warning(index, format!("The character {problem_dc} could not be represented in the chosen export format ({format}).").as_str());
143 }
144
145 pub fn merge(&mut self, other: &FormatLog) {
146 let error_offset = self.errors.len();
147 let warning_offset = self.warnings.len();
148 let debug_offset = self.debug_messages.len();
149
150 self.errors.extend(other.errors.clone());
151 self.warnings.extend(other.warnings.clone());
152 self.debug_messages.extend(other.debug_messages.clone());
153
154 for &(typ, idx) in &other.log_order {
155 let adjusted_idx = match typ {
156 LogType::Error => idx + error_offset,
157 LogType::Warning => idx + warning_offset,
158 LogType::Debug => idx + debug_offset,
159 };
160 self.log_order.push((typ, adjusted_idx));
161 }
162 }
163
164 pub fn format_all(&self) -> String {
166 if !self.has_any() {
167 return String::new();
168 }
169 let mut output = String::new();
170 output.push_str("Messages during format processing:\n");
171 for &(typ, idx) in &self.log_order {
174 match typ {
175 LogType::Error => {
176 output.push_str("* [ERROR] ");
177 output.push_str(&self.errors[idx]);
178 }
179 LogType::Warning => {
180 output.push_str("- [WARNING] ");
181 output.push_str(&self.warnings[idx]);
182 }
183 LogType::Debug => {
184 output.push_str("- [DEBUG] ");
185 output.push_str(&self.debug_messages[idx]);
186 }
187 }
188 output.push('\n');
189 }
190 output
191 }
192
193 pub fn format_errors(&self) -> String {
194 let mut errors = String::new();
195 if self.has_errors() {
196 for e in &self.errors {
197 errors.push_str("- ");
198 errors.push_str(e);
199 errors.push('\n');
200 }
201 format!("Errors during format processing:\n{errors}")
202 } else {
203 String::new()
204 }
205 }
206
207 pub fn format_warnings(&self) -> String {
208 let mut warnings = String::new();
209 if self.has_warnings() {
210 for w in &self.warnings {
211 warnings.push_str("- ");
212 warnings.push_str(w);
213 warnings.push('\n');
214 }
215 format!("Warnings during format processing:\n{warnings}")
216 } else {
217 String::new()
218 }
219 }
220
221 pub fn format_debug(&self) -> String {
222 let mut debug = String::new();
223 if self.debug_messages.is_empty() {
224 String::new()
225 } else {
226 for d in &self.debug_messages {
227 debug.push_str("- ");
228 debug.push_str(d);
229 debug.push('\n');
230 }
231 format!("Debug messages during format processing:\n{debug}")
232 }
233 }
234
235 pub fn auto_log(&self) {
236 if self.has_any() {
237 error!(self.format_errors());
238 warn!(self.format_warnings());
239 debug!(self.format_debug());
240 } else {
241 info!("No errors or warnings during format processing.");
242 }
243 }
244}
245
246impl std::fmt::Display for FormatLog {
247 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
248 if self.has_any() {
249 writeln!(f, "{}", self.format_all())?;
250 } else {
251 writeln!(f, "No errors or warnings during format processing.")?;
252 }
253 Ok(())
254 }
255}
256
257pub fn string_result_with_log_to_vec(
258 result: Result<(String, FormatLog)>,
259) -> Result<(Vec<u8>, FormatLog)> {
260 result.map(|res| {
261 let result_bytes = res.0.into_bytes();
262 (result_bytes, res.1)
263 })
264}
265
266pub fn get_format_uuids<'a>() -> HashMap<Vec<u8>, Vec<u8>> {
267 HashMap::from([(
268 strtovec("9ba60c52-9cf8-41a7-b3ea-7a1e14f6c5d7"),
269 strtovec("html"),
270 )])
271}
272
273pub fn get_format_from_uuid(document: Vec<u8>) -> Option<Vec<u8>> {
274 let head: Vec<u8>;
275 if document.len() < 36 {
276 head = document;
277 } else {
278 head = document[..36].to_vec();
279 }
280 let uuid = get_uuid_from_document(head);
281 uuid.as_ref()?;
282
283 Some(get_format_uuids()[&uuid.expect("checked earlier")].clone())
284}
285
286pub fn get_uuid_from_document(document: Vec<u8>) -> Option<Vec<u8>> {
287 if document.len() < 16 {
288 return None;
289 }
290
291 let uuid_binary = Uuid::from_slice(&document[..16])
292 .expect("The length should be 16")
293 .hyphenated()
294 .to_string()
295 .into_bytes();
296 let uuid_string = String::from_utf8_lossy(&document[..36])
298 .to_string()
299 .into_bytes();
300
301 let formats = get_format_uuids();
302
303 if formats.contains_key(&uuid_binary) {
304 return Some(uuid_binary);
305 } else if formats.contains_key(&uuid_string) {
306 return Some(uuid_string);
307 }
308
309 None
310}
311
/// Conversion hook that currently performs no conversion: the document is
/// returned exactly as it was received.
pub fn convert_if_needed(document: Vec<u8>) -> Vec<u8> {
    document
}
317
/// Conversion hook that currently performs no conversion: the document is
/// returned exactly as it was received, whatever `filetype` says.
pub fn convert_from(document: Vec<u8>, filetype: Vec<u8>) -> Vec<u8> {
    // `filetype` is accepted for the interface but not consulted yet.
    let _ = &filetype;
    document
}
323
324pub fn sanitize_html(document: Vec<u8>) -> Vec<u8> {
325 let mut builder = ammonia::Builder::default();
326
327 builder.add_generic_attributes(iter::once("class").chain(iter::once("id")));
328
329 builder
330 .clean(&String::from_utf8_lossy(&document))
331 .to_string()
332 .into_bytes()
333}
334
335pub fn dctext_to_dcutf(document: Vec<u8>) -> Vec<u8> {
337 let document = String::from_utf8(document).unwrap();
341 let mut output = Vec::new();
342 for line in document.lines() {
343 let mut rest = line;
344 while let Some(start) = rest.find('@') {
345 output.extend_from_slice(&rest.as_bytes()[..start]);
347 rest = &rest[start + 1..];
348 if let Some(end) = rest.find('@') {
349 let token = &rest[..end];
350 let mut dcid_str = token;
351 let mut l_prefix = false;
352 if dcid_str.is_empty() {
353 dcid_str = "64"; }
355 if dcid_str.starts_with('L') {
356 l_prefix = true;
357 dcid_str = &dcid_str[1..];
358 }
359 if let Ok(dcid) = dcid_str.parse::<u128>() {
360 let mut buf = [0u8; 24];
361 if l_prefix {
362 let n1 = encode_utf_8e_128_buf(&mut buf, 1114408);
364 output.extend_from_slice(&buf[..n1]);
365 let n2 = encode_utf_8e_128_buf(&mut buf, dcid);
366 output.extend_from_slice(&buf[..n2]);
367 } else {
368 let n = encode_utf_8e_128_buf(&mut buf, dcid);
369 output.extend_from_slice(&buf[..n]);
370 }
371 }
372 rest = &rest[end + 1..];
373 } else {
374 output.extend_from_slice(rest.as_bytes());
376 break;
377 }
378 }
379 output.extend_from_slice(rest.as_bytes());
381 }
382
383 output
384}
385
386pub fn dcutf_to_dctext(document: Vec<u8>) -> Vec<u8> {
388 let mut output = String::new();
389 let mut i = 0;
390 while i < document.len() {
391 if let Some((codepoint, size)) = decode_utf_8e_128(&document[i..]) {
392 if codepoint == 64 {
393 output.push_str("@@");
394 } else if codepoint <= 0x10FFFF {
395 if let Some(ch) = std::char::from_u32(
397 u32::try_from(codepoint).expect("Checked range already"),
398 ) {
399 output.push(ch);
400 } else {
401 output.push_str(format!("@{codepoint}@").as_str());
402 }
403 } else if codepoint == 1114408 {
404 i += size;
406 if let Some((next_dc, next_size)) =
407 decode_utf_8e_128(&document[i..])
408 {
409 output.push_str(format!("@L{next_dc}@").as_str());
410 i += next_size;
411 continue;
412 }
413 output.push_str("@1114408@");
414 } else {
415 output.push_str(format!("@{codepoint}@").as_str());
417 }
418 i += size;
419 } else {
420 output.push(char::from(document[i]));
422 i += 1;
423 }
424 }
425 output.into_bytes()
426}
427
/// Extension trait for obtaining the UTF-8 encoding of a value as an owned
/// byte vector.
pub trait CharUtfBytesExt {
    /// Returns the UTF-8 encoding of `self`.
    fn as_utf8_bytes(&self) -> Vec<u8>;
}

impl CharUtfBytesExt for char {
    /// Returns the one to four bytes that encode this character in UTF-8.
    fn as_utf8_bytes(&self) -> Vec<u8> {
        // Size the vector exactly, then encode straight into it.
        let mut bytes = vec![0u8; self.len_utf8()];
        self.encode_utf8(&mut bytes);
        bytes
    }
}
441
442pub fn assert_vec_u8_ok_eq_no_warnings(
444 expected: &[u8],
445 actual: Result<(Vec<u8>, FormatLog)>,
446) -> Vec<u8> {
447 let (actual_bytes, log) = actual.unwrap();
448 assert!(
449 log.has_no_warnings_or_errors(),
450 "Warnings or errors found:\n{}",
451 log.format_all()
452 );
453 assert_vec_u8_eq_log(expected, &actual_bytes, &log);
454 actual_bytes
455}
456
457pub fn assert_vec_u8_ok_eq_no_errors(
458 expected: &[u8],
459 actual: Result<(Vec<u8>, FormatLog)>,
460) -> (Vec<u8>, FormatLog) {
461 let (actual_bytes, log) = actual.unwrap();
462
463 assert!(log.has_no_errors(), "Errors found:\n{}", log.format_all());
464 assert_vec_u8_eq_log(expected, &actual_bytes, &log);
465 (actual_bytes, log)
466}
467
468fn _format_dcs_for_log(expected: &[u32], actual: &[u32]) -> String {
469 let (utf8_expected, _) =
470 dca_to_utf8(expected, &UTF8FormatSettings::default())
471 .unwrap_or_default();
472 let (utf8_actual, _) =
473 dca_to_utf8(actual, &UTF8FormatSettings::default()).unwrap_or_default();
474
475 format!(
476 "Expected formatted Dcs: {}\nActual formatted Dcs: {}",
477 String::from_utf8_lossy(&utf8_expected),
478 String::from_utf8_lossy(&utf8_actual)
479 )
480}
481
482fn _assert_vec_u32_ok_eq_log(
483 expected: &[u32],
484 actual: Result<(Vec<u32>, FormatLog)>,
485 print_dcs: bool,
486 disallow_warnings: bool,
487) -> (Vec<u32>, FormatLog) {
488 let (actual_vec, log) = actual.unwrap();
489
490 let mut log_problem_type = "Errors";
491 if disallow_warnings {
492 log_problem_type = "Warnings or errors";
493 }
494 let mut message =
495 format!("{log_problem_type} found:\n{}", log.format_all());
496
497 if print_dcs {
498 message.push_str(&_format_dcs_for_log(expected, &actual_vec));
499 }
500
501 if disallow_warnings {
502 assert!(log.has_no_warnings_or_errors(), "{message}");
503 } else {
504 assert!(log.has_no_errors(), "{message}");
505 }
506
507 if print_dcs {
508 assert_vec_dc_eq_log(expected, &actual_vec, &log);
509 } else {
510 assert_vec_u32_eq_log(expected, &actual_vec, &log);
511 }
512 (actual_vec, log)
513}
514
515pub fn assert_vec_u32_ok_eq_no_warnings(
516 expected: &[u32],
517 actual: Result<(Vec<u32>, FormatLog)>,
518) -> (Vec<u32>, FormatLog) {
519 _assert_vec_u32_ok_eq_log(expected, actual, false, true)
520}
521
522pub fn assert_vec_dc_ok_eq_no_warnings(
523 expected: &[u32],
524 actual: Result<(Vec<u32>, FormatLog)>,
525) -> (Vec<u32>, FormatLog) {
526 _assert_vec_u32_ok_eq_log(expected, actual, true, true)
527}
528
529pub fn assert_vec_u32_ok_eq_no_errors(
530 expected: &[u32],
531 actual: Result<(Vec<u32>, FormatLog)>,
532) -> (Vec<u32>, FormatLog) {
533 _assert_vec_u32_ok_eq_log(expected, actual, false, false)
534}
535
536pub fn assert_vec_dc_ok_eq_no_errors(
537 expected: &[u32],
538 actual: Result<(Vec<u32>, FormatLog)>,
539) -> (Vec<u32>, FormatLog) {
540 _assert_vec_u32_ok_eq_log(expected, actual, true, false)
541}
542
543pub fn assert_vec_u32_eq_log(
545 expected: &[u32],
546 actual: &[u32],
547 log: &FormatLog,
548) {
549 _assert_vec_u32_eq_log(expected, actual, log, false);
550}
551
552pub fn assert_vec_dc_eq_log(expected: &[u32], actual: &[u32], log: &FormatLog) {
553 _assert_vec_u32_eq_log(expected, actual, log, true);
554}
555
556fn _assert_vec_u32_eq_log(
557 expected: &[u32],
558 actual: &[u32],
559 log: &FormatLog,
560 print_dcs: bool,
561) {
562 let mut message = format!(
563 "Vectors (u32) differ.\n{}\nLog: {}",
564 fmt_mismatch_vec_u32(expected, actual),
565 log.format_all()
566 );
567
568 if print_dcs {
569 message.push_str(&_format_dcs_for_log(expected, actual));
570 }
571
572 assert_eq!(expected, actual, "{message}");
573}
574
575pub fn assert_vec_u8_eq_log(expected: &[u8], actual: &[u8], log: &FormatLog) {
577 assert_eq!(
578 expected,
579 actual,
580 "Vectors (u8) differ.\n{}\nLog: {}",
581 fmt_mismatch_vec_u8(expected, actual),
582 log.format_all()
583 );
584}
585
586pub fn assert_string_eq(expected: &str, actual: String) -> String {
587 let actual_string = actual;
588
589 assert_eq!(
590 expected,
591 &actual_string,
592 "Strings differ.\n{}",
593 fmt_mismatch_string(expected, &actual_string),
594 );
595 actual_string
596}
597
598pub fn assert_string_ok_eq(expected: &str, actual: Result<String>) -> String {
599 let actual_string = actual.unwrap();
600
601 assert_eq!(
602 expected,
603 &actual_string,
604 "Strings differ.\n{}",
605 fmt_mismatch_string(expected, &actual_string),
606 );
607 actual_string
608}
609
610pub fn assert_string_ok_eq_no_warnings(
611 expected: &str,
612 actual: Result<(String, FormatLog)>,
613) -> (String, FormatLog) {
614 let (actual_string, log) = actual.unwrap();
615
616 assert!(
617 log.has_no_warnings_or_errors(),
618 "Warnings or errors found:\n{}",
619 log.format_all()
620 );
621 assert_eq!(
622 expected,
623 &actual_string,
624 "Strings differ.\n{}\nLog: {}",
625 fmt_mismatch_string(expected, &actual_string),
626 log.format_all()
627 );
628 (actual_string, log)
629}
630
631pub fn assert_string_ok_eq_no_errors(
632 expected: &str,
633 actual: Result<(String, FormatLog)>,
634) -> (String, FormatLog) {
635 let (actual_string, log) = actual.unwrap();
636
637 assert!(log.has_no_errors(), "Errors found:\n{}", log.format_all());
638 assert_eq!(
639 expected,
640 &actual_string,
641 "Strings differ.\n{}\nLog: {}",
642 fmt_mismatch_string(expected, &actual_string),
643 log.format_all()
644 );
645 (actual_string, log)
646}
647
#[cfg(test)]
mod tests {
    use super::*;

    /// Converts a dctext sample to dcutf, checks the exact bytes, then
    /// converts back and checks the resulting dctext.
    #[crate::ctb_test]
    fn test_dctext_to_dcutf() {
        let text = "hi @64@ @@ @65@ @128@ there 🥴 @L42@ noncharacter @1114111@ surrogate @56191@ unicode null @0@ dc null @1114112@ @2147483648@ 2^128-1 @340282366920938463463374607431768211455@";
        let dcutf = dctext_to_dcutf(text.as_bytes().to_vec());
        assert_eq!(
            "686920402040204120c28020746865726520f09fa5b420ff84849084a82a206e6f6e63686172616374657220f48fbfbf20737572726f6761746520edadbf20756e69636f6465206e756c6c2000206463206e756c6c20ff848490808020ff8682808080808020325e3132382d3120ff9683bfbfbfbfbfbfbfbfbfbfbfbfbfbfbfbfbfbfbfbfbf",
            vectohex(&dcutf)
        );

        let roundtrip = dcutf_to_dctext(dcutf);
        let roundtrip_str = String::from_utf8(roundtrip).unwrap();

        // Dcs that are valid `char`s come back as literal characters
        // (written as escapes here so they stay visible); the rest come
        // back as `@N@` escapes.
        let expected_roundtrip = "hi @@ @@ A \u{80} there 🥴 @L42@ noncharacter \u{10FFFF} surrogate @56191@ unicode null \u{0} dc null @1114112@ @2147483648@ 2^128-1 @340282366920938463463374607431768211455@";
        // `assert_eq!` prints both strings when the round trip diverges.
        assert_eq!(expected_roundtrip, roundtrip_str);
    }

    /// A document starting with a known textual UUID yields that UUID.
    #[crate::ctb_test]
    fn can_get_uuid_from_document() {
        let document = strtovec("9ba60c52-9cf8-41a7-b3ea-7a1e14f6c5d7<html>");
        assert_eq!(
            strtovec("9ba60c52-9cf8-41a7-b3ea-7a1e14f6c5d7"),
            get_uuid_from_document(document).unwrap()
        );
    }

    /// A document starting with the HTML format UUID is reported as `html`.
    #[crate::ctb_test]
    fn can_get_format_from_uuid() {
        let document = strtovec("9ba60c52-9cf8-41a7-b3ea-7a1e14f6c5d7<html>");
        assert_eq!(
            strtovec("html"),
            get_format_from_uuid(document).unwrap()
        );
    }
}