From 2b4dbcb9f38a3d3a2069097c826dde06c1234562 Mon Sep 17 00:00:00 2001 From: StrongestNumber9 <16169054+StrongestNumber9@users.noreply.github.com> Date: Wed, 11 Oct 2023 09:23:51 +0300 Subject: [PATCH] Adds token count limit to tokenizer; entanglement is skipped when the minor token count exceeds it --- .../java/com/teragrep/blf_01/Tokenizer.java | 14 +++++++++++-- .../com/teragrep/blf_01/TokenizerTest.java | 20 +++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/teragrep/blf_01/Tokenizer.java b/src/main/java/com/teragrep/blf_01/Tokenizer.java index 112e878..64e30ea 100644 --- a/src/main/java/com/teragrep/blf_01/Tokenizer.java +++ b/src/main/java/com/teragrep/blf_01/Tokenizer.java @@ -59,8 +59,12 @@ public class Tokenizer { final Entanglement entanglement; final TokenScan majorTokenScan; final TokenScan minorTokenScan; + final long maxTokenCount; public Tokenizer() { + this(Long.MAX_VALUE); + } + public Tokenizer(long maxTokenCount) { final MajorDelimiters majorDelimiters = new MajorDelimiters(); final MinorDelimiters minorDelimiters = new MinorDelimiters(); @@ -69,7 +73,7 @@ public Tokenizer() { this.entanglement = new Entanglement(); this.majorTokenScan = new TokenScan(majorDelimiters); this.minorTokenScan = new TokenScan(minorDelimiters); - + this.maxTokenCount = maxTokenCount; } /** @@ -94,8 +98,14 @@ public List tokenize(InputStream is) { ArrayList minorTokens = minorTokenScan.findBy(stream); + ArrayList tokens; + if (minorTokens.size() > maxTokenCount) { + tokens = minorTokens; + } else { + tokens = entanglement.entangle(minorTokens); + } - allTokens.addAll(entanglement.entangle(minorTokens)); + allTokens.addAll(tokens); } return allTokens; diff --git a/src/test/java/com/teragrep/blf_01/TokenizerTest.java b/src/test/java/com/teragrep/blf_01/TokenizerTest.java index 2b009c3..9ebff9e 100644 --- a/src/test/java/com/teragrep/blf_01/TokenizerTest.java +++ b/src/test/java/com/teragrep/blf_01/TokenizerTest.java @@ -90,6 +90,26 @@ public void testTokenization() { } + + @Test + public 
void testTokenizerSizeLimit() { + Tokenizer tokenizer = new Tokenizer(3); + String input = "Abc#####Xyz"; + ByteArrayInputStream bais = new ByteArrayInputStream(input.getBytes(StandardCharsets.UTF_8)); + List result = tokenizer.tokenize(bais); + + List expected = + Arrays.asList( + "Abc#####Xyz", "Abc", "#", "#", "#", "#", "#", "Xyz" + ); + + assertTrue(result.stream() + .map(Token::toString) + .collect(Collectors.toList()) + .containsAll(expected)); + + } + @Test @Benchmark public void tokenizeFileInput() throws FileNotFoundException {