Skip to content

Commit

Permalink
Process all non-ASCII bytes with the UTF-8 parser.
Browse files Browse the repository at this point in the history
The UTF-8 parser knows how to handle invalid byte sequences, so don't
preprocess the input in the main parser; just hand any non-ASCII byte to
the UTF-8 parser to handle.

This includes what were previously interpreted as 8-bit C1 control codes;
they are now interpreted as UTF-8 continuation bytes.
  • Loading branch information
sunfishcode committed Jun 15, 2020
1 parent 2a92abe commit f36bebf
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 10 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
CHANGELOG
=========

## 0.9.0

- Remove 8-bit C1 support. 8-bit C1 codes are now interpreted as UTF-8
continuation bytes.

## 0.8.0

- Remove C1 ST support in OSCs, fixing OSCs with ST in the payload
Expand Down
44 changes: 43 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ pub trait Perform {
/// Draw a character to the screen and update states.
fn print(&mut self, _: char);

/// Execute a C0 or C1 control function.
/// Execute a C0 control function.
fn execute(&mut self, byte: u8);

/// Invoked when a final character arrives in first part of device control string.
Expand Down Expand Up @@ -846,6 +846,48 @@ mod tests {
#[cfg(feature = "no_std")]
assert_eq!(dispatcher.params[1].len(), MAX_OSC_RAW - dispatcher.params[0].len());
}

/// Test-only `Perform` implementation used by `parse_invalid_utf8_byte`.
///
/// It counts how many times `print` is called, so the test can check that
/// each invalid input byte produced exactly one printed character.
#[derive(Default)]
struct InvalidUtf8ByteDispatcher {
/// Number of `print` calls seen so far. Starts at 0 via `Default`; a `u8`
/// is wide enough because the test feeds only 77 bytes in total.
num_invalid: u8,
}

/// `Perform` implementation that only cares about printed characters; every
/// other callback is accepted and ignored.
impl Perform for InvalidUtf8ByteDispatcher {
    /// Checks that the printed character is the replacement character and
    /// bumps the counter of invalid bytes seen.
    fn print(&mut self, printed: char) {
        assert_eq!(printed, '�');
        self.num_invalid += 1;
    }

    // None of the remaining callbacks matter for this test, so they are
    // deliberately empty.
    fn execute(&mut self, _: u8) {}
    fn hook(&mut self, _: &[i64], _: &[u8], _: bool, _: char) {}
    fn put(&mut self, _: u8) {}
    fn unhook(&mut self) {}
    fn osc_dispatch(&mut self, _: &[&[u8]], _: bool) {}
    fn csi_dispatch(&mut self, _: &[i64], _: &[u8], _: bool, _: char) {}
    fn esc_dispatch(&mut self, _: &[u8], _: bool, _: u8) {}
}

/// Feeds every byte that is invalid on its own as the start of a UTF-8
/// sequence and expects one printed replacement character per byte.
#[test]
fn parse_invalid_utf8_byte() {
    let mut parser = Parser::new();
    let mut dispatcher = InvalidUtf8ByteDispatcher::default();

    // Two ranges are covered, in order:
    //   0x80..=0xc1 - 64 continuation bytes followed by 2 overlong lead bytes
    //   0xf5..=0xff - 9 invalid code point bytes followed by 2 invalid code units
    for byte in (0x80..0xc2).chain(0xf5..=0xff) {
        parser.advance(&mut dispatcher, byte);
    }

    // Continuation bytes, overlong bytes, invalid code points, invalid code units.
    let expected = 64 + 2 + 9 + 2;
    assert_eq!(dispatcher.num_invalid, expected);
}
}

#[cfg(all(feature = "nightly", test))]
Expand Down
12 changes: 3 additions & 9 deletions src/table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,9 @@ generate_state_changes!(state_changes, {
0x19 => (Anywhere, Execute),
0x1c..=0x1f => (Anywhere, Execute),
0x20..=0x7f => (Anywhere, Print),
0x80..=0x8f => (Anywhere, Execute),
0x91..=0x9a => (Anywhere, Execute),
0x9c => (Anywhere, Execute),
// Beginning of UTF-8 2 byte sequence
0xc2..=0xdf => (Utf8, BeginUtf8),
// Beginning of UTF-8 3 byte sequence
0xe0..=0xef => (Utf8, BeginUtf8),
// Beginning of UTF-8 4 byte sequence
0xf0..=0xf4 => (Utf8, BeginUtf8),
// Hand all non-ASCII bytes to the UTF-8 parser to figure out. This
// includes 8-bit C1 codes, since we don't recognize them as such.
0x80..=0xff => (Utf8, BeginUtf8),
},

Escape {
Expand Down

0 comments on commit f36bebf

Please sign in to comment.