diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java index 238e64cee..93a584663 100644 --- a/src/main/java/org/apache/commons/csv/Lexer.java +++ b/src/main/java/org/apache/commons/csv/Lexer.java @@ -153,6 +153,7 @@ boolean isDelimiter(final int ch) throws IOException { isLastTokenDelimiter = true; return true; } + Arrays.fill(delimiterBuf, '\0'); reader.peek(delimiterBuf); for (int i = 0; i < delimiterBuf.length; i++) { if (delimiterBuf[i] != delimiter[i + 1]) { @@ -274,7 +275,6 @@ Token nextToken(final Token token) throws IOException { token.type = Token.Type.COMMENT; return token; } - Arrays.fill(delimiterBuf, '\0'); // Important: make sure a new char gets consumed in each iteration while (token.type == Token.Type.INVALID) { // ignore whitespaces at beginning of a token diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index dca37fc5a..c1ca3d7a4 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -1696,6 +1696,21 @@ void testPartialMultiCharacterDelimiterAtEOF() throws IOException { } } + /** + * A truncated multi-character delimiter at EOF must not be completed from the look-ahead buffer left dirty by an + * earlier non-matching peek in the same token. + */ + @Test + void testPartialMultiCharacterDelimiterAtEOFAfterMismatch() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get(); + // The "[a]" peek leaves ']' in the look-ahead buffer; the trailing "[|" must not match "[|]". + try (CSVParser parser = format.parse(new StringReader("x[a][|"))) { + final CSVRecord record = parser.nextRecord(); + assertEquals("x[a][|", record.get(0)); + assertEquals(1, record.size()); + } + } + @Test void testProvidedHeader() throws Exception { final Reader in = new StringReader("a,b,c\n1,2,3\nx,y,z"); diff --git a/src/test/java/org/apache/commons/csv/LexerTest.java b/src/test/java/org/apache/commons/csv/LexerTest.java index 244079df6..e5f831fb2 100644 --- a/src/test/java/org/apache/commons/csv/LexerTest.java +++ b/src/test/java/org/apache/commons/csv/LexerTest.java @@ -433,6 +433,19 @@ void testPartialMultiCharacterDelimiterAtEOF() throws IOException { } } + /** + * A truncated multi-character delimiter at EOF must not be accepted by reusing the look-ahead buffer left dirty by an + * earlier non-matching peek in the same token (CSV-324 only cleared the buffer once per token). + */ + @Test + void testPartialMultiCharacterDelimiterAtEOFAfterMismatch() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get(); + // The "[a]" peek leaves ']' in the look-ahead buffer; the trailing "[|" must not match "[|]". + try (Lexer lexer = createLexer("x[a][|", format)) { + assertNextToken(EOF, "x[a][|", lexer); + } + } + @Test void testReadEscapeBackspace() throws IOException { try (Lexer lexer = createLexer("b", CSVFormat.DEFAULT.withEscape('\b'))) {