ctoolbox/formats/eite/encoding/
basenb.rs

1//! Basenb encoding/decoding: pack binary data into Unicode PUA.
2//!
3//! Basenb is a way of encoding arbitrary binary data into a compact string
4//! representation that can be embedded in Unicode text, as runs of Unicode
5//! private-use characters.
6//!
7//! It's a modified version of Base16b that additionally encodes a "remainder"
8//! length in a trailing character, which seems to be needed to reliably
9//! round-trip values.
10//!
11//! Actually using it requires some sort of protocol for when to switch between
12//! basenb and regular PUA characters. dcBasenb addresses that by encoding UUIDs
13//! and using them as in-band start/end markers. (Round-tripping a UTF-8 file
14//! that included dcBasenb UUIDs, for instance in reference to them, would
15//! probably only be possible by encoding the UUIDs as UTF-8 encapsulated within
16//! the Dcs, and being careful with the encode/decode settings.)
17//!
18//! dcBasenb is a way of encoding arbitrary Dcs into runs of Unicode private-use
19//! characters; see dcbasenb.rs
20
21use anyhow::{Result, anyhow, bail};
22
23use crate::formats::base16b;
24use crate::formats::eite::encoding::pack32::{
25    is_pack32_char, pack32, unpack32,
26};
27use crate::formats::eite::util::array::subset;
28use crate::formats::eite::util::bitwise::{
29    byte_array_from_int_bit_array, byte_array_to_int_bit_array,
30};
31use crate::formats::eite::util::math::int_is_between_u32;
32use crate::{bail_if_none, log};
33
/// Marker bytes placed in front of an armored Base17b UTF-8 run.
///
/// FIXME UNIMPLEMENTED: this is currently a 32-byte all-zero placeholder, not
/// a real marker value.
pub const ARMORED_BASE17B_UTF8_START_UUID_BYTES: [u8; 32] = [0x00; 32];
40
/// Marker bytes placed after an armored Base17b UTF-8 run.
///
/// FIXME UNIMPLEMENTED: this is currently a 32-byte all-zero placeholder, not
/// a real marker value.
pub const ARMORED_BASE17B_UTF8_END_UUID_BYTES: [u8; 32] = [0x00; 32];
47
48/// internalIntBitArrayToBasenbString(intBase, bytes)
49/// Returns UTF-8 bytes of the encoded string (mirroring JS returning a byte array).
50pub fn int_bit_array_to_basenb_no_remainder_marker(
51    base: u32,
52    input: &[u8],
53) -> Result<Vec<u8>> {
54    let encoded = base16b::encode(input, base)?;
55
56    Ok(encoded.into_bytes())
57}
58
59/// internalIntBitArrayFromBasenbString(byteArrayInput, intRemainder)
60/// JS passes a `Uint8Array` of UTF-8 bytes and an int remainder
61pub fn int_bit_array_from_basenb_string(
62    input_bytes: &[u8],
63    remainder_len: Option<u32>,
64) -> Result<Vec<u8>> {
65    let s = std::str::from_utf8(input_bytes).map_err(|e| {
66        anyhow!(
67            "utf8 error on input {:?}, from_lossy {:?}: {e}",
68            input_bytes,
69            String::from_utf8_lossy(input_bytes)
70        )
71    })?;
72    log!(
73        "Decoding with input str {:?}, remainder {:?}",
74        s,
75        remainder_len
76    );
77    base16b::decode(s, remainder_len)
78}
79
/// Reports whether `base` is usable with Basenb.
///
/// Accepted bases are 7 through 17, both ends included (as in the original
/// implementation).
pub fn is_basenb_base(base: u32) -> bool {
    matches!(base, 7..=17)
}
84
85/// True if the pack32 character represents a Basenb character codepoint.
86///
87/// The Basenb character ranges):
88///   - 983040 ..= 1048573: U+F0000 to U+FFFFD
89///   - 1048576 ..= 1114109: U+100000 to U+10FFFD
90///   - 63481  ..= 63501   (special / remainder markers): U+F7F9 to U+F80D
91///
92/// Distinct remainder markers subset: 63481 ..= 63497: U+F7F9 to U+F809
93pub fn is_basenb_char(packed_char: &[u8]) -> bool {
94    if !is_pack32_char(packed_char) {
95        return false;
96    }
97    if let Ok(cp) = unpack32(packed_char) {
98        // U+F0000 to U+FFFFD
99        if int_is_between_u32(cp, 983_040, 1_048_573) {
100            return true;
101        }
102        // U+100000 to U+10FFFD
103        if int_is_between_u32(cp, 1_048_576, 1_114_109) {
104            return true;
105        }
106        // U+F7F9 to U+F80D. Remainders are U+F7F9 to U+F809; U+F80A to U+F80D
107        // are used for Base16b and Base17b.
108        if int_is_between_u32(cp, 63_481, 63_501) {
109            return true;
110        }
111    }
112    false
113}
114
115/// True if the pack32 character is one of the distinct remainder markers
116/// (63481..=63497).
117pub fn is_basenb_distinct_remainder_char(packed_char: &[u8]) -> bool {
118    if !is_pack32_char(packed_char) {
119        return false;
120    }
121    if let Ok(cp) = unpack32(packed_char) {
122        // U+F7F9 to U+F809
123        return int_is_between_u32(cp, 63_481, 63_497);
124    }
125    false
126}
127
128pub fn byte_array_to_basenb_no_remainder_marker(
129    base: u32,
130    input: &[u8],
131) -> Result<Vec<u8>> {
132    if !is_basenb_base(base) {
133        return Err(anyhow!(
134            "byte_array_to_basenb_no_remainder_marker: invalid base {base}, expected 7..=17"
135        ));
136    }
137    let bit_array = byte_array_to_int_bit_array(input);
138    let encoded =
139        int_bit_array_to_basenb_no_remainder_marker(base, &bit_array)?;
140    Ok(encoded)
141}
142
143/// Encode a raw byte array into Basenb (UTF-8 sequence of pack32 codepoints).
144///
145/// Steps:
146/// 1. Convert bytes to bit array.
147/// 2. Encode bit array via `int_bit_array_to_basenb_string`.
148/// 3. Append remainder length marker: pack32(63497 - (`bit_len` % 17)).
149///    (Matches the original implementation’s workaround re: remainder storage.)
150///
151/// Returns the full UTF-8 (actually just raw bytes containing appended pack32
152/// code units) representing the Basenb encoding.
153pub fn byte_array_to_basenb_utf8(base: u32, input: &[u8]) -> Result<Vec<u8>> {
154    let mut encoded = byte_array_to_basenb_no_remainder_marker(base, input)?;
155    /* Remainder marker. The remainder length also needs to be stored, to be able to decode successfully. We'll calculate, encode, and append it. It's always 4 bytes, 1 UTF-8 character, and 2 UTF-16 characters long, after encoding (it has 2 added to it to make it always be the same byte length and UTF-16 length; this must be subtracted before passing it to the Base16b.decode function). */
156    let remainder = (input.len() * 8) % usize::try_from(base)?;
157    // Start with U+F809, and subtract the remainder to find the codepoint.
158    let codepoint = 63_497 - (u32::try_from(remainder)?);
159    encoded.extend(pack32(codepoint)?);
160    Ok(encoded)
161}
162
/// Sentinel returned by `byte_array_from_basenb_utf8` in place of decoded data
/// when the input holds nothing beyond the remainder character, mirroring the
/// legacy JS implementation.
///
/// These are the 16 bytes of UUID 3362daa3-1705-40ec-9a97-59d052fd4037.
pub const BYTE_ARRAY_FROM_BASENB_UTF8_INVALID_INPUT_EXCEPTION_BYTES: [u8; 16] = [
    0x33, 0x62, 0xda, 0xa3, // 3362daa3
    0x17, 0x05, // 1705
    0x40, 0xec, // 40ec
    0x9a, 0x97, // 9a97
    0x59, 0xd0, 0x52, 0xfd, 0x40, 0x37, // 59d052fd4037
];
169
170/// Decode a Basenb UTF-8 byte sequence into the original byte array.
171///
172/// This replicates the JS `byteArrayFromBasenbUtf8(intArrayIn)` logic:
173/// - Determines the encoded remainder-length indicator (either a distinct
174///   3‑byte remainder char or a generic 4‑byte packed char).
175/// - For a distinct remainder char (3 bytes): remainder = 63497 - unpack32
176///   (last3)
177/// - For a generic (4 bytes) remainder char: decode that char as an 8‑bit
178///   value, then `remainder = decoded_byte - 2`
179/// - If the full input length is exactly (or smaller than) the remainder char
180///   length, returns the sentinel UUID bytes to indicate invalid input (legacy
181///   behavior).
182///
183/// Remainder length (in bits) is passed to `int_bit_array_from_basenb_string`
184/// which reconstructs the concatenated bit array; that is then converted back
185/// to bytes.
186///
187/// Returns:
188/// - `Ok(Vec<u8>)` with decoded bytes (or sentinel bytes if invalid input).
189pub fn byte_array_from_basenb_utf8(input: &[u8]) -> Result<Vec<u8>> {
190    /* Extract remainder length */
191    let mut remainder: u32;
192    /* last 3 bytes (1 character), which represent the remainder */
193    let mut remainder_arr: Vec<u8>;
194    remainder_arr = bail_if_none!(subset(input, -3, -1));
195    if is_basenb_distinct_remainder_char(&remainder_arr) {
196        remainder = unpack32(&remainder_arr)?;
197        remainder = 63497_u32.checked_sub(remainder).unwrap();
198    } else {
199        /* last 4 bytes (1 character), which represent the remainder */
200        remainder_arr = bail_if_none!(subset(input, -4, -1));
201        let remainder_decoded: Vec<u8> = byte_array_from_int_bit_array(
202            &int_bit_array_from_basenb_string(&remainder_arr, Some(8))?,
203        )?;
204        let temp: &u8 = bail_if_none!(remainder_decoded.first());
205        let temp: i16 = bail_if_none!(i16::from(*temp).checked_add(-2));
206        remainder = u32::try_from(temp)?;
207    }
208    if input.len() <= remainder_arr.len() {
209        // Mirrors legacy path: only a (missing) remainder => sentinel.
210        return Ok(
211            BYTE_ARRAY_FROM_BASENB_UTF8_INVALID_INPUT_EXCEPTION_BYTES.to_vec()
212        );
213    }
214    let mut subset_end = i64::try_from(remainder_arr.len())?;
215    subset_end *= -1;
216    subset_end += -1;
217
218    let subset = bail_if_none!(subset(input, 0, subset_end));
219
220    log!("Getting bits from subset: {:?}, {:?}", &subset, remainder);
221
222    let bits = &int_bit_array_from_basenb_string(&subset, Some(remainder))?;
223    log!("Bits from decoder: {:?}", &bits);
224    byte_array_from_int_bit_array(bits)
225}
226
227/// Convenience wrapper (encode bytes to Basenb 17 UTF-8 representation).
228///
229/// JS original: `byteArrayToBase17bUtf8(intArrayIn)` calling `byteArrayToBasenbUtf8(17, ...)`.
230/// Renamed to try to make it clearer that this isn't the original base17b
231/// format.
232pub fn byte_array_to_basenb_17_utf8(input: &[u8]) -> Result<Vec<u8>> {
233    byte_array_to_basenb_utf8(17, input)
234}
235
236/// Convenience wrapper (decode Basenb  17 UTF-8 representation).
237///
238/// JS original simply forwarded to `byteArrayFromBasenbUtf8`.
239pub fn byte_array_from_basenb_17_utf8(input: &[u8]) -> Result<Vec<u8>> {
240    byte_array_from_basenb_utf8(input)
241}
242
243// “Armored” Base17b UTF-8 helpers.
244
245/// Produce an “armored” Base17b UTF-8 run, encoding arbitrary binary data:
246///   armored = `start_uuid` || `base17b_encode(bytes)` || `end_uuid`
247///
248/// This mirrors the original JS `byteArrayToArmoredBase17bUtf8`.
249/// FIXME Unimplemented/untested!
250pub fn byte_array_to_armored_base17b_utf8(input: &[u8]) -> Result<Vec<u8>> {
251    // Encode payload to Base17b UTF-8 (already provided in earlier translation).
252    let encoded = byte_array_to_basenb_17_utf8(input)?;
253    let mut out = ARMORED_BASE17B_UTF8_START_UUID_BYTES.to_vec();
254    out.extend(encoded);
255    out.extend(ARMORED_BASE17B_UTF8_END_UUID_BYTES);
256    Ok(out)
257}
258
259/// Decode an armored Base17b UTF-8 run to a byte array. FIXME untested!
260pub fn byte_array_from_armored_base17b_utf8(input: &[u8]) -> Result<Vec<u8>> {
261    let start = ARMORED_BASE17B_UTF8_START_UUID_BYTES;
262    let end = ARMORED_BASE17B_UTF8_END_UUID_BYTES;
263
264    let min_len = start.len() + end.len();
265    if input.len() < min_len {
266        bail!(
267            "Armored Base17b input too short: {} < required framing {}",
268            input.len(),
269            min_len
270        );
271    }
272
273    if !input.starts_with(&start) {
274        bail!("Armored Base17b input missing or corrupt start UUID marker");
275    }
276    if !input.ends_with(&end) {
277        bail!("Armored Base17b input missing or corrupt end UUID marker");
278    }
279
280    let inner_len = input.len() - start.len() - end.len();
281    let inner = &input[start.len()..start.len() + inner_len];
282    // Decode the inner Base17b UTF-8 segment back to raw bytes.
283    let decoded = byte_array_from_basenb_17_utf8(inner)?;
284    Ok(decoded)
285}
286
#[cfg(test)]
mod tests {

    use crate::formats::eite::formats::dcbasenb::DC_BASENB_EMBEDDED_START_BYTES;
    use crate::formats::eite::{
        encoding::pack32::pack32, util::bitwise::byte_array_to_int_bit_array,
    };
    use crate::utilities::{assert_vec_u8_eq, assert_vec_u8_ok_eq};

    use super::*;

    // Only bases 7 through 17 (inclusive) are accepted.
    #[crate::ctb_test]
    fn test_is_basenb_base() {
        for candidate in 0..=30 {
            let accepted = is_basenb_base(candidate);
            let expected = (7..=17).contains(&candidate);
            assert!(accepted == expected);
        }
    }

    // Distinct remainder chars are exactly 63481..=63497; probe one code point
    // below the range and a few above it as well.
    #[crate::ctb_test]
    fn test_basenb_remainder_marker_range() {
        for cp in 63_480..=63_500 {
            let packed = pack32(cp).unwrap();
            let is_marker = is_basenb_distinct_remainder_char(&packed);
            let expected = (63_481..=63_497).contains(&cp);
            assert!(is_marker == expected);
        }
    }

    // Spot-check code points at and around the edges of the Basenb ranges.
    #[crate::ctb_test]
    fn test_basenb_char_ranges() {
        let sample_points = [
            63_480, 63_481, 63_495, 63_501, 63_502, 983_040, 983_100,
            1_048_573, 1_048_574,
        ];
        for cp in sample_points {
            let expected = (63_481..=63_501).contains(&cp)
                || (983_040..=1_048_573).contains(&cp)
                || (1_048_576..=1_114_109).contains(&cp);
            let packed = pack32(cp).unwrap();
            let is_char = is_basenb_char(&packed);
            assert_eq!(
                is_char, expected,
                "cp={} expected {} got {}",
                cp, expected, is_char
            );
        }
    }

    // The encoder appends one remainder character after the payload; its code
    // point is 63497 - (bit_len % base), which for valid bases is always a
    // distinct remainder marker.
    #[crate::ctb_test]
    fn test_byte_array_to_basenb_utf8_remainder_marker() {
        let data = b"\xAB\xCD"; // 16 bits
        let base = 10;
        let bit_count = byte_array_to_int_bit_array(data).len();
        assert_eq!(bit_count, 16);

        let with_marker = byte_array_to_basenb_utf8(base, data).unwrap();
        let without_marker =
            byte_array_to_basenb_no_remainder_marker(base, data).unwrap();
        assert!(
            with_marker.len() >= 4,
            "Expected at least one codepoint + remainder marker"
        );

        // Whatever follows the marker-less encoding is the marker itself.
        let marker_len = with_marker.len() - without_marker.len();
        let marker = &with_marker[with_marker.len() - marker_len..];
        assert!(is_basenb_distinct_remainder_char(marker));

        let expected_cp = 63_497
            - (u32::try_from(bit_count).expect("Could not fit length in u32")
                % base);
        assert_eq!(unpack32(marker).unwrap(), expected_cp);

        assert_vec_u8_ok_eq(data, byte_array_from_basenb_utf8(&with_marker));
    }

    // Bases just outside 7..=17 must be rejected.
    #[crate::ctb_test]
    fn test_byte_array_to_basenb_utf8_invalid_base() {
        let data = b"abc";
        for bad_base in [6, 18] {
            assert!(byte_array_to_basenb_utf8(bad_base, data).is_err());
        }
    }

    // True when `bytes` is the "invalid input" sentinel UUID.
    fn is_sentinel(bytes: &[u8]) -> bool {
        bytes == BYTE_ARRAY_FROM_BASENB_UTF8_INVALID_INPUT_EXCEPTION_BYTES
    }

    // Empty input encodes to U+F80D followed by the zero-remainder marker
    // U+F809, and decodes back to nothing.
    #[crate::ctb_test]
    fn test_decode_empty_input() {
        let encoded = assert_vec_u8_ok_eq(
            "\u{f80d}\u{f809}".as_bytes(),
            byte_array_to_basenb_utf8(17, &[]),
        );
        assert_vec_u8_ok_eq(&[], byte_array_from_basenb_utf8(&encoded));
    }

    // A lone remainder marker with no payload decodes to the sentinel.
    #[crate::ctb_test]
    fn test_decode_invalid_only_remainder() {
        let marker_only = "\u{f809}".as_bytes();
        let dec = byte_array_from_basenb_utf8(marker_only).unwrap();
        assert!(
            is_sentinel(&dec),
            "Expected sentinel for empty input decode; got {:?}",
            dec
        );
    }

    // Encode-then-decode must return the original bytes for small inputs.
    #[crate::ctb_test]
    fn test_round_trip_base17_small_samples() {
        let samples: Vec<Vec<u8>> = vec![
            vec![0u8],
            vec![1, 2, 3],
            vec![255],
            b"hello".to_vec(),
            b"\x00\x01\x02\x03\xFE\xFF".to_vec(),
            (0u8..32u8).collect(),
        ];

        for sample in &samples {
            let enc = byte_array_to_basenb_17_utf8(sample).unwrap();
            crate::log!(
                "Sample {:?} encoded to Basenb 17 UTF-8 bytes: {:?}",
                sample,
                &enc
            );
            let dec = byte_array_from_basenb_17_utf8(&enc).unwrap();
            if sample.is_empty() {
                continue;
            }
            assert!(
                !is_sentinel(&dec),
                "Unexpected sentinel for non-empty sample {:?} (encoded {:?})",
                sample,
                enc
            );
            assert_vec_u8_eq(sample, &dec);
        }
    }

    // Encoding a known UUID must yield the dcBasenb embedded-start bytes
    // followed by the expected remainder marker, and must round-trip.
    #[crate::ctb_test]
    fn test_basenb_encode_uuid() {
        // e82eef60-19bc-4a00-a44a-763a3445c16f
        let input: Vec<u8> = vec![
            0xe8, 0x2e, 0xef, 0x60, //
            0x19, 0xbc, 0x4a, 0x00, //
            0xa4, 0x4a, 0x76, 0x3a, //
            0x34, 0x45, 0xc1, 0x6f,
        ];

        // Working out the remainder marker by hand:
        // 16 bytes * 8 bits = 128 bits
        // 128 % 17 = 9
        // The last remainder codepoint is U+F809, - 9 = U+F800.
        let mut expected = DC_BASENB_EMBEDDED_START_BYTES.to_vec();
        expected.extend_from_slice("\u{F800}".as_bytes());

        let enc = assert_vec_u8_ok_eq(
            &expected,
            byte_array_to_basenb_17_utf8(&input),
        );

        let dec = byte_array_from_basenb_17_utf8(&enc).unwrap();
        assert!(
            input.is_empty() || !is_sentinel(&dec),
            "Unexpected exception UUID for {:?} (encoded {:?})",
            input,
            enc
        );

        assert_vec_u8_eq(&input, &dec);
    }

    // Armoring frames the payload with both markers and is reversible.
    #[crate::ctb_test]
    fn test_armored_round_trip() {
        let payload = b"Hello Base17b Armored!";
        let armored = byte_array_to_armored_base17b_utf8(payload).unwrap();

        // Basic framing checks
        assert!(armored.starts_with(&ARMORED_BASE17B_UTF8_START_UUID_BYTES));
        assert!(armored.ends_with(&ARMORED_BASE17B_UTF8_END_UUID_BYTES));

        assert_vec_u8_ok_eq(
            payload,
            byte_array_from_armored_base17b_utf8(&armored),
        );
    }

    // Damaging the first byte must be reported as a bad start marker.
    #[crate::ctb_test]
    fn test_armored_invalid_prefix() {
        let mut armored = byte_array_to_armored_base17b_utf8(b"xyz").unwrap();
        // Corrupt first byte
        armored[0] ^= 0xFF;
        let err = byte_array_from_armored_base17b_utf8(&armored).unwrap_err();
        let message = err.to_string();
        assert!(
            message.contains("missing or corrupt start UUID"),
            "Unexpected error: {err}"
        );
    }

    // Damaging the last byte must be reported as a bad end marker.
    #[crate::ctb_test]
    fn test_armored_invalid_suffix() {
        let mut armored = byte_array_to_armored_base17b_utf8(b"xyz").unwrap();
        // Corrupt last byte
        let last_index = armored.len() - 1;
        armored[last_index] ^= 0xAA;
        let err = byte_array_from_armored_base17b_utf8(&armored).unwrap_err();
        let message = err.to_string();
        assert!(
            message.contains("missing or corrupt end UUID"),
            "Unexpected error: {err}"
        );
    }

    // Input shorter than the combined markers is rejected up front.
    #[crate::ctb_test]
    fn test_armored_too_short() {
        let data: Vec<u8> = vec![1, 2, 3, 4]; // shorter than any plausible framing
        let err = byte_array_from_armored_base17b_utf8(&data).unwrap_err();
        let message = err.to_string();
        assert!(message.contains("too short"), "Unexpected error: {err}");
    }
}