Skip to content

Commit

Permalink
Allow transition functions to modify the main parser state.
Browse files Browse the repository at this point in the history
This way if a transition function detects an invalid UTF-8 sequence, it
can reset the main state to Ground.
  • Loading branch information
sunfishcode committed Jun 15, 2020
1 parent f7e4af1 commit 476ba17
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 4 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ CHANGELOG

## 0.9.0

- Invalid UTF-8 sequences are now translated into replacement characters
in a manner consistent with `String::from_utf8_lossy` and the resolution to
["How many replacement characters?"](https://hsivonen.fi/broken-utf-8/).
- Add a `Parser::end` function allowing users to mark the end of a stream,
so that an incomplete UTF-8 encoding at the end of the stream can be
reported.
Expand Down
19 changes: 15 additions & 4 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,14 +197,14 @@ impl Parser {
let exit_action = self.state.exit_action();
maybe_action!(exit_action, byte);

// Assume the new state
self.state = state;

// Transition action
maybe_action!(action, byte);

// Entry action for new state
maybe_action!(state.entry_action(), byte);

// Assume the new state
self.state = state;
maybe_action!(self.state.entry_action(), byte);
},
}
}
Expand Down Expand Up @@ -929,10 +929,12 @@ mod tests {

impl Perform for PrintDispatcher {
/// `Perform::print` callback for the test dispatcher: checks the character
/// and appends it to `self.printed`.
///
/// # Panics
///
/// Panics if `c` is an ASCII control character that is neither whitespace
/// nor DEL (U+007F).
fn print(&mut self, c: char) {
// Accept whitespace, DEL, and anything that is not an ASCII control.
// NOTE(review): presumably the remaining ASCII controls are expected to
// arrive through `execute` instead — confirm against the parser.
assert!(c.is_whitespace() || !c.is_ascii_control() || c == '\u{007f}');
self.printed.push(c);
}

/// `Perform::execute` callback for the test dispatcher: checks the byte and
/// appends it, converted to `char`, to `self.printed`.
///
/// # Panics
///
/// Panics unless `b` is an ASCII control byte other than DEL (0x7f).
fn execute(&mut self, b: u8) {
assert!(b.is_ascii_control() && b != b'\x7f');
// Goes into the same `printed` buffer that `print` writes to, so the
// recorded output keeps printed and executed items in arrival order.
self.printed.push(b as char)
}

Expand Down Expand Up @@ -978,6 +980,15 @@ mod tests {
b"\x22\x6e\x35\x3d\x84\x34\x25\xe5\x2d\x49\xf6\x4e\xce\xfa\x06\xb3",
"\"n5=�4%�-I�N��\u{6}�",
);
test_print(b"\xfe\x19\xdb\xf5", "�\u{19}��");
test_print(b"\xfe\x19\xdb\xf5", "�\u{19}��");
test_print(b"\x80\xc2\x80\x9f\xc2\x9f", "�\u{80}\u{9f}");
test_print(b"\xc2\x18\xc2\x00\xc2\x1a", "�\u{18}\u{0}\u{1a}");
test_print(b"\xdd\xdd\xfa\x2a\x47\xd9\xd8\x9b\x9c\x97\xa1\x9a\x9b", "���*G�؛�����");
test_print(
b"\x9b\x9c\x97\xa1\x9a\x9b\xfd\x44\xaa\x14\x52\x5f\x33\x5f\x22\x6e\x35\x3d\x84\x34",
"�������D�\u{14}R_3_\"n5=�4",
);
}

// Tests derived from https://hsivonen.fi/broken-utf-8/test.html
Expand Down

0 comments on commit 476ba17

Please sign in to comment.