Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

encoding/base32: Add RFC-compliant error handling and improve reliability #4641

Merged
merged 19 commits into from
Jan 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
8c76162
encoding/base32: Replace assertions with error returns
zolk3ri Dec 24, 2024
b933877
encoding/base32: Fix buffer allocation and bounds checking
zolk3ri Dec 24, 2024
7672ac9
encoding/base32: Add RFC 4648 test suite
zolk3ri Dec 24, 2024
f1f2ed3
encoding/base32: Fix decode implementation per RFC 4648
zolk3ri Dec 24, 2024
93238db
encoding/base32: Use consistent allocator and add proper cleanup
zolk3ri Dec 24, 2024
e75a49f
encoding/base32: Set optimization mode for decode()
zolk3ri Dec 24, 2024
8211a91
encoding/base32: Replace padding map with switch statement
zolk3ri Dec 24, 2024
e7fb02a
encoding/base32: Add custom validation support
zolk3ri Dec 25, 2024
88c0e62
encoding/base32: Use `ENC_TBL` parameter consistently in encode()
zolk3ri Dec 26, 2024
490f527
encoding/base32: Expand `DEC_TABLE` to full 256 bytes
zolk3ri Dec 26, 2024
c9c59ed
encoding/base32: Move tests to base32_test.odin
zolk3ri Dec 29, 2024
0d4c006
encoding/base32: Add encode->decode roundtrip test
zolk3ri Dec 30, 2024
591dd87
encoding/base32: Remove incorrect defer delete in encode()
zolk3ri Dec 30, 2024
8292509
encoding/base32: Add custom alphabet test case
zolk3ri Dec 30, 2024
5ce6990
encoding/base32: Add proper cleanup for encoded strings in tests
zolk3ri Dec 30, 2024
3d25128
encoding/base32: Convert files to UTF-8 with Unix line endings
zolk3ri Dec 30, 2024
d6f4412
encoding/base32: Fix style issues for CI
zolk3ri Dec 31, 2024
fe88c22
encoding/base32: Fix RFC 4648 references and add RFC reference URL
zolk3ri Dec 31, 2024
a4a1562
encoding/base32: Add `@(rodata)` attribute to default tables
zolk3ri Jan 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
378 changes: 230 additions & 148 deletions core/encoding/base32/base32.odin
Original file line number Diff line number Diff line change
@@ -1,148 +1,230 @@
package encoding_base32

// @note(zh): Encoding utility for Base32
// A secondary param can be used to supply a custom alphabet to
// @link(encode) and a matching decoding table to @link(decode).
// If none is supplied it just uses the standard Base32 alphabet.
// In case your specific version does not use padding, you may
// truncate it from the encoded output.

// Default Base32 alphabet used by encode/_encode: the element at index v is
// the character emitted for the 5-bit value v ('A'..'Z' for 0..25, then
// '2'..'7' for 26..31).
ENC_TABLE := [32]byte {
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
'Y', 'Z', '2', '3', '4', '5', '6', '7',
}

PADDING :: '='

// Default decoding table, indexed by the input byte. 'A'..'Z' and 'a'..'z'
// map to 0..25 and '2'..'7' map to 26..31; every other listed entry is 0.
// NOTE(review): 0 is both the value of 'A' and the filler for unsupported
// bytes, so the table alone cannot distinguish invalid input from 'A'.
// NOTE(review): the literal lists 14 rows of 16 values (224 entries), so
// input bytes 224..255 have no entry - confirm how decode handles them.
DEC_TABLE := [?]u8 {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 26, 27, 28, 29, 30, 31, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 0,
0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}

// encode returns the padded Base32 encoding of `data` as a newly allocated string.
//
// Inputs:
// - data:      bytes to encode; an empty slice yields an empty string.
// - ENC_TBL:   32-entry alphabet, forwarded to _encode.
// - allocator: allocator used for the output buffer.
//
// Returns:
// - the encoded text; its length is always a multiple of 8.
encode :: proc(data: []byte, ENC_TBL := ENC_TABLE, allocator := context.allocator) -> string {
	// Every started group of 5 input bytes becomes one full 8-character block.
	out_length := (len(data) + 4) / 5 * 8
	// Allocate from the caller-supplied allocator instead of the implicit context one.
	out := make([]byte, out_length, allocator)
	// Forward the alphabet so that a custom table is not silently dropped.
	_encode(out, data, ENC_TBL)
	return string(out)
}

// _encode writes the Base32 encoding of `data` into `out`: 8 output characters
// for every group of up to 5 input bytes, with the unused tail of a final
// partial group filled with PADDING.
//
// Inputs:
// - out:       destination buffer. Up to 8 bytes are written per group, so the
//              caller must size it accordingly (encode uses (len(data) + 4) / 5 * 8).
// - data:      bytes to encode.
// - ENC_TBL:   32-entry alphabet indexed by the 5-bit value being emitted.
// - allocator: not referenced by this procedure.
@private
_encode :: proc(out, data: []byte, ENC_TBL := ENC_TABLE, allocator := context.allocator) {
	out := out
	data := data

	for len(data) > 0 {
		// `carry` holds the high bits of the current byte that belong to the
		// preceding output character; cases fall through from the last input
		// byte of the group down to the first.
		carry: byte
		switch len(data) {
		case:
			// 5 or more bytes remaining: a complete group.
			out[7] = ENC_TBL[data[4] & 0x1f]
			carry = data[4] >> 5
			fallthrough
		case 4:
			out[6] = ENC_TBL[carry | (data[3] << 3) & 0x1f]
			out[5] = ENC_TBL[(data[3] >> 2) & 0x1f]
			carry = data[3] >> 7
			fallthrough
		case 3:
			out[4] = ENC_TBL[carry | (data[2] << 1) & 0x1f]
			carry = (data[2] >> 4) & 0x1f
			fallthrough
		case 2:
			out[3] = ENC_TBL[carry | (data[1] << 4) & 0x1f]
			out[2] = ENC_TBL[(data[1] >> 1) & 0x1f]
			carry = (data[1] >> 6) & 0x1f
			fallthrough
		case 1:
			out[1] = ENC_TBL[carry | (data[0] << 2) & 0x1f]
			out[0] = ENC_TBL[data[0] >> 3]
		}

		// A short final group: pad the positions that carry no data, then stop.
		if len(data) < 5 {
			out[7] = byte(PADDING)
			if len(data) < 4 {
				out[6] = byte(PADDING)
				out[5] = byte(PADDING)
				if len(data) < 3 {
					out[4] = byte(PADDING)
					if len(data) < 2 {
						out[3] = byte(PADDING)
						out[2] = byte(PADDING)
					}
				}
			}
			break
		}
		data = data[5:]
		out = out[8:]
	}
}

// decode converts Base32 text into bytes, processing the input in groups of
// up to 8 characters.
//
// Inputs:
// - data:      encoded text; an empty string yields nil.
// - DEC_TBL:   table indexed by input byte giving its 5-bit value. An entry
//              of 0xff marks the byte as invalid and trips an assertion.
// - allocator: allocator used for the output buffer.
//
// Returns:
// - a buffer of len(data) / 8 * 5 bytes allocated with `allocator`.
//
// Malformed input is reported through assertions ("Corrupted input"), not
// through a return value.
// NOTE(review): bounds checks are disabled for this procedure and the output
// size is derived from len(data) / 8, so input whose length is not a multiple
// of 8 looks like it can write past `out` - confirm callers always pass
// padded input.
decode :: proc(data: string, DEC_TBL := DEC_TABLE, allocator := context.allocator) -> []byte #no_bounds_check {
	if len(data) == 0 {
		return nil
	}

	outi := 0
	data := data

	out := make([]byte, len(data) / 8 * 5, allocator)
	end := false
	for len(data) > 0 && !end {
		// 5-bit values of the current group and how many of them are valid.
		dbuf : [8]byte
		dlen := 8

		for j := 0; j < 8; {
			if len(data) == 0 {
				dlen, end = j, true
				break
			}
			input := data[0]
			data = data[1:]
			// Padding is only recognised from the third character of a group
			// onwards and only when fewer than 8 characters remain.
			if input == byte(PADDING) && j >= 2 && len(data) < 8 {
				// The rest of the group must be present and consist of padding only.
				assert(!(len(data) + j < 8 - 1), "Corrupted input")
				for k := 0; k < 8-1-j; k +=1 {
					assert(len(data) < k || data[k] == byte(PADDING), "Corrupted input")
				}
				dlen, end = j, true
				// 1, 3 and 6 data characters do not correspond to a whole number of bytes.
				assert(dlen != 1 && dlen != 3 && dlen != 6, "Corrupted input")
				break
			}
			// Use the table supplied by the caller so custom alphabets decode correctly.
			dbuf[j] = DEC_TBL[input]
			assert(dbuf[j] != 0xff, "Corrupted input")
			j += 1
		}

		// Reassemble bytes from the 5-bit values; each case falls through to
		// the shorter ones so a partial group emits only the bytes it holds.
		switch dlen {
		case 8:
			out[outi + 4] = dbuf[6] << 5 | dbuf[7]
			fallthrough
		case 7:
			out[outi + 3] = dbuf[4] << 7 | dbuf[5] << 2 | dbuf[6] >> 3
			fallthrough
		case 5:
			out[outi + 2] = dbuf[3] << 4 | dbuf[4] >> 1
			fallthrough
		case 4:
			out[outi + 1] = dbuf[1] << 6 | dbuf[2] << 1 | dbuf[3] >> 4
			fallthrough
		case 2:
			out[outi + 0] = dbuf[0] << 3 | dbuf[1] >> 2
		}
		outi += 5
	}
	return out
}
// Base32 encoding/decoding implementation as specified in RFC 4648.
// [[ More; https://www.rfc-editor.org/rfc/rfc4648.html ]]
package encoding_base32

// @note(zh): Encoding utility for Base32
// A secondary param can be used to supply a custom alphabet to
// @link(encode) and a matching decoding table to @link(decode).
// If none is supplied it just uses the standard Base32 alphabet.
// In case your specific version does not use padding, you may
// truncate it from the encoded output.

// Error represents errors that can occur during base32 decoding operations.
// As per RFC 4648:
// - Section 3.3: Invalid character handling
// - Section 3.2: Padding requirements
// - Section 6: Base32 encoding specifics (including block size requirements)
Error :: enum {
None,
Invalid_Character, // Input contains a character rejected by the validation procedure passed to decode
Invalid_Length, // Input is non-empty but shorter than 2 characters, the minimum that can encode one byte
Malformed_Input, // Input has improper structure (padding in the wrong position or of the wrong length, or an unpadded length that is not a multiple of 8)
}

// Validate_Proc reports whether the byte `c` is acceptable encoded input.
// decode calls it for each input character up to the first padding character
// and returns .Invalid_Character when it reports false.
Validate_Proc :: #type proc(c: byte) -> bool

// Default validator: accepts only the upper-case letters 'A'..'Z' and the
// digits '2'..'7'; every other byte is rejected.
@private
_validate_default :: proc(c: byte) -> bool {
	is_letter := 'A' <= c && c <= 'Z'
	is_digit  := '2' <= c && c <= '7'
	return is_letter || is_digit
}

// Default Base32 alphabet: the element at index v is the character emitted
// for the 5-bit value v ('A'..'Z' for 0..25, then '2'..'7' for 26..31).
@(rodata)
ENC_TABLE := [32]byte {
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
zolk3ri marked this conversation as resolved.
Show resolved Hide resolved
'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
'Y', 'Z', '2', '3', '4', '5', '6', '7',
}

PADDING :: '='

// Default decoding table with one entry for every possible input byte.
// 'A'..'Z' and 'a'..'z' map to 0..25 and '2'..'7' map to 26..31; all other
// entries are 0, the same value as 'A', so the table by itself cannot flag
// an invalid character.
@(rodata)
DEC_TABLE := [256]u8 {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 26, 27, 28, 29, 30, 31, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 0,
0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}

// encode returns the padded Base32 encoding of `data`.
//
// Inputs:
// - data:      bytes to encode.
// - ENC_TBL:   32-entry alphabet handed on to _encode.
// - allocator: allocator used for the returned string's backing buffer.
//
// Returns:
// - the encoded text, 8 characters for every started group of 5 input bytes.
encode :: proc(data: []byte, ENC_TBL := ENC_TABLE, allocator := context.allocator) -> string {
	block_count := (len(data) + 4) / 5
	encoded := make([]byte, block_count * 8, allocator)
	_encode(encoded, data, ENC_TBL)
	return string(encoded)
}

// _encode writes the Base32 encoding of `data` into `out`: 8 output characters
// for every group of up to 5 input bytes, looked up in ENC_TBL, with the
// unused tail of a final partial group filled with PADDING.
// `out` is indexed up to 8 bytes per group, so the caller must size it
// accordingly. The `allocator` parameter is not referenced in the body.
@private
_encode :: proc(out, data: []byte, ENC_TBL := ENC_TABLE, allocator := context.allocator) {
out := out
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can #no_bounds_check can be used for this proc?

data := data

for len(data) > 0 {
// `carry` holds the high bits of the current byte that belong to the
// preceding output character; cases fall through from the last input byte
// of the group down to the first.
carry: byte
switch len(data) {
case:
// 5 or more bytes remaining: a complete group.
out[7] = ENC_TBL[data[4] & 0x1f]
carry = data[4] >> 5
fallthrough
case 4:
out[6] = ENC_TBL[carry | (data[3] << 3) & 0x1f]
out[5] = ENC_TBL[(data[3] >> 2) & 0x1f]
carry = data[3] >> 7
fallthrough
case 3:
out[4] = ENC_TBL[carry | (data[2] << 1) & 0x1f]
carry = (data[2] >> 4) & 0x1f
fallthrough
case 2:
out[3] = ENC_TBL[carry | (data[1] << 4) & 0x1f]
out[2] = ENC_TBL[(data[1] >> 1) & 0x1f]
carry = (data[1] >> 6) & 0x1f
fallthrough
case 1:
out[1] = ENC_TBL[carry | (data[0] << 2) & 0x1f]
out[0] = ENC_TBL[data[0] >> 3]
}

// A short final group: pad the positions that carry no data, then stop.
if len(data) < 5 {
out[7] = byte(PADDING)
if len(data) < 4 {
out[6] = byte(PADDING)
out[5] = byte(PADDING)
if len(data) < 3 {
out[4] = byte(PADDING)
if len(data) < 2 {
out[3] = byte(PADDING)
out[2] = byte(PADDING)
}
}
}
break
}
data = data[5:]
out = out[8:]
}
}

// decode converts Base32 text into bytes, validating it first.
//
// Inputs:
// - data:      encoded text; an empty string yields (nil, .None).
// - DEC_TBL:   table indexed by input byte giving its 5-bit value.
// - validate:  called for each character before the first padding character.
// - allocator: allocator used for the returned buffer.
//
// Returns:
// - out: decoded bytes, (number of non-padding characters) * 5 / 8 long.
// - err: .Invalid_Length for a non-empty input shorter than 2 characters,
//        .Invalid_Character when `validate` rejects a character,
//        .Malformed_Input for padding in the middle, a wrong padding count,
//        or an unpadded length that is not a multiple of 8; otherwise .None.
@(optimization_mode="favor_size")
decode :: proc(
data: string,
DEC_TBL := DEC_TABLE,
validate: Validate_Proc = _validate_default,
allocator := context.allocator) -> (out: []byte, err: Error) {
if len(data) == 0 {
return nil, .None
}

// Check minimum length requirement first
if len(data) < 2 {
return nil, .Invalid_Length
}

// Validate characters using provided validation function
// The scan stops at the first padding character; anything after it is
// covered by the padding checks below.
for i := 0; i < len(data); i += 1 {
c := data[i]
zolk3ri marked this conversation as resolved.
Show resolved Hide resolved
if c == byte(PADDING) {
break
}
if !validate(c) {
return nil, .Invalid_Character
}
}

// Validate padding and length
// Count the run of padding characters at the end of the input.
data_len := len(data)
padding_count := 0
for i := data_len - 1; i >= 0; i -= 1 {
if data[i] != byte(PADDING) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#no_bounds_check can be used here

break
}
padding_count += 1
}

// Check for proper padding and length combinations
if padding_count > 0 {
// Verify no padding in the middle
for i := 0; i < data_len - padding_count; i += 1 {
if data[i] == byte(PADDING) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#no_bounds_check can be used here

return nil, .Malformed_Input
}
}

// Each value below equals 8 - mod8: the final group is padded up to 8 characters.
content_len := data_len - padding_count
mod8 := content_len % 8
required_padding: int
switch mod8 {
case 2: required_padding = 6 // 2 chars need 6 padding chars
Copy link
Member

@Kelimion Kelimion Jan 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose this works fine, but I might be a bit more explicit about the values here:

// Remainders of 2, 4, 5, and 7 need to be padded to 8 bytes.
switch mod8 {
case 2, 4, 5, 7: required_padding = 8 - mod8
case: required_padding = 0
}

Alternatively, you could use a LUT:

// Remainders of 2, 4, 5, and 7 need to be padded to 8 bytes.
PAD_LENGTH := [8]int{0, 0, 6, 0, 4, 3, 0, 1}
required_padding := PAD_LENGTH[mod8]

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! Should we not keep the explicit mapping as it directly documents the relationship between character count and required padding (e.g., "2 chars need 6 padding chars"). Using 8 - mod8 or a LUT would obscure this relationship by hiding the actual padding requirements behind a formula or indirect mapping. The current form makes the RFC requirements more apparent.

Copy link
Member

@Kelimion Kelimion Jan 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't read this particular RFC, so I'll defer to you on that. But to me at least, 6, 4, 3 and 1 felt like magic numbers, even though they were clearly derived from 8 - mod8, so the 8 - mod8 felt more explicit than "these are the magic numbers". If you follow my train of thought.

But that could be added to the comments instead?

// Requirement is for 2, 4, 5 and 7 remaining characters to be padded to 8. So the values here are `8 - mod8`.
switch mod8 {
case 2: required_padding = 6 // 2 chars need 6 padding chars
case 4: required_padding = 4 // 4 chars need 4 padding chars
case 5: required_padding = 3 // 5 chars need 3 padding chars
case 7: required_padding = 1 // 7 chars need 1 padding char
case:   required_padding = 0
}

case 4: required_padding = 4 // 4 chars need 4 padding chars
case 5: required_padding = 3 // 5 chars need 3 padding chars
case 7: required_padding = 1 // 7 chars need 1 padding char
case: required_padding = 0
}

// NOTE(review): when mod8 == 0 neither branch below rejects the input, so
// trailing padding after a whole number of 8-character blocks (including an
// input made only of padding) is accepted - confirm whether that is intended.
if required_padding > 0 {
if padding_count != required_padding {
Kelimion marked this conversation as resolved.
Show resolved Hide resolved
return nil, .Malformed_Input
}
} else if mod8 != 0 {
return nil, .Malformed_Input
}
} else {
// No padding - must be multiple of 8
if data_len % 8 != 0 {
return nil, .Malformed_Input
}
}

// Calculate decoded length: 5 bytes for every 8 input chars
input_chars := data_len - padding_count
out_len := input_chars * 5 / 8
out = make([]byte, out_len, allocator)
// NOTE(review): no statement after this point assigns `err`, so this cleanup
// does not appear to run; it also calls delete without passing `allocator` -
// confirm before relying on it.
defer if err != .None {
delete(out)
}

// Process input in 8-byte blocks
outi := 0
for i := 0; i < input_chars; i += 8 {
buf: [8]byte
// The last block may be shorter; the checks above leave 2, 4, 5 or 7 characters in that case.
block_size := min(8, input_chars - i)
zolk3ri marked this conversation as resolved.
Show resolved Hide resolved

// Decode block
for j := 0; j < block_size; j += 1 {
buf[j] = DEC_TBL[data[i + j]]
zolk3ri marked this conversation as resolved.
Show resolved Hide resolved
}

// Convert to output bytes based on block size
// Each case falls through to the shorter ones, so a partial block writes
// only the bytes it fully contains.
bytes_to_write := block_size * 5 / 8
switch block_size {
case 8:
out[outi + 4] = (buf[6] << 5) | buf[7]
fallthrough
case 7:
out[outi + 3] = (buf[4] << 7) | (buf[5] << 2) | (buf[6] >> 3)
fallthrough
case 5:
out[outi + 2] = (buf[3] << 4) | (buf[4] >> 1)
fallthrough
case 4:
out[outi + 1] = (buf[1] << 6) | (buf[2] << 1) | (buf[3] >> 4)
fallthrough
case 2:
out[outi] = (buf[0] << 3) | (buf[1] >> 2)
}
outi += bytes_to_write
}

return
}
Loading
Loading