From 1d89cd5f0aa454ef3853dfc7528242399ef26b74 Mon Sep 17 00:00:00 2001 From: OldTruckDriver Date: Fri, 19 Jun 2026 02:05:55 +1000 Subject: [PATCH] [CSV-329] Fix byte tracking for supplementary delimiters ExtendedBufferedReader.read(char[], int, int) updated lastChar before computing the encoded byte length, so a surrogate pair in the delimiter lookahead buffer was paired against the post-update lastChar and threw CharacterCodingException. Count bytes before updating lastChar, and pair each char against the char that precedes it in the buffer (seeding the first comparison from lastChar), so surrogate pairs split across reads are still counted correctly. Add parser and ExtendedBufferedReader regression tests. Reviewed-by: OpenAI Codex Reviewed-by: Anthropic Claude Code --- src/changes/changes.xml | 1 + .../commons/csv/ExtendedBufferedReader.java | 20 +++++++++------ .../org/apache/commons/csv/CSVParserTest.java | 25 +++++++++++++++++++ .../csv/ExtendedBufferedReaderTest.java | 14 +++++++++++ 4 files changed, 53 insertions(+), 7 deletions(-) diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 66073c9dd..f6a474dbf 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -53,6 +53,7 @@ CSVParser applies characterOffset to bytePosition (#604). CSVPrinter Reader printing with quote and escape can emit CSV that its parser cannot read back. CSVParser applies maxRows to record numbers instead of rows produced when setRecordNumber(...) is used. + CSVParser with trackBytes enabled throws on multi-character delimiters containing supplementary Unicode characters. Escape Reader values with quote and escape (#606). Clear escape delimiter buffer before peek in Lexer.isEscapeDelimiter() (#608, #611). Escape quote char in printWithEscapes when QuoteMode is NONE (#609). 
diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index 889b58edc..5b519a08c 100644 --- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -108,9 +108,11 @@ long getBytesRead() { } private long getEncodedCharLength(final char[] buf, final int offset, final int length) throws CharacterCodingException { - int len = 0; - for (int i = offset; i < length; i++) { - len += getEncodedCharLength(buf[i]); + long len = 0; + int previous = lastChar; + for (int i = offset; i < offset + length; i++) { + len += getEncodedCharLength(previous, buf[i]); + previous = buf[i]; } return len; } @@ -141,8 +143,12 @@ private long getEncodedCharLength(final char[] buf, final int offset, final int * @throws CharacterCodingException if the character cannot be encoded. */ private int getEncodedCharLength(final int current) throws CharacterCodingException { + return getEncodedCharLength(lastChar, current); + } + + private int getEncodedCharLength(final int previous, final int current) throws CharacterCodingException { final char cChar = (char) current; - final char lChar = (char) lastChar; + final char lChar = (char) previous; if (!Character.isSurrogate(cChar)) { return encoder.encode(CharBuffer.wrap(new char[] { cChar })).limit(); } @@ -218,6 +224,9 @@ public int read(final char[] buf, final int offset, final int length) throws IOE return 0; } final int len = super.read(buf, offset, length); + if (encoder != null && len > 0) { + this.bytesRead += getEncodedCharLength(buf, offset, len); + } if (len > 0) { for (int i = offset; i < offset + len; i++) { final char ch = buf[i]; @@ -233,9 +242,6 @@ public int read(final char[] buf, final int offset, final int length) throws IOE } else if (len == EOF) { lastChar = EOF; } - if (encoder != null) { - this.bytesRead += getEncodedCharLength(buf, offset, len); - } position 
+= len; return len; } diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index 5bece571f..29ca0cf1f 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -666,6 +666,31 @@ void testGetBytePositionMultiCharacterDelimiter() throws IOException { } } + /** + * Tests CSV-329. + */ + @Test + void testGetBytePositionMultiCharacterDelimiterWithSupplementaryCharacter() throws IOException { + final String delimiter = "x😀"; + final String code = "ax😀b\ncx😀d\n"; + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(delimiter).get(); + try (CSVParser parser = CSVParser.builder() + .setReader(new StringReader(code)) + .setFormat(format) + .setCharset(UTF_8) + .setTrackBytes(true) + .get()) { + final CSVRecord first = parser.nextRecord(); + final CSVRecord second = parser.nextRecord(); + assertNotNull(first); + assertNotNull(second); + assertValuesEquals(new String[] { "a", "b" }, first); + assertValuesEquals(new String[] { "c", "d" }, second); + assertEquals(0, first.getBytePosition()); + assertEquals("ax😀b\n".getBytes(UTF_8).length, second.getBytePosition()); + } + } + @Test void testGetBytePositionWithCharacterOffsetAndMultiBytePrefix() throws Exception { final String row0 = "é,x\n"; diff --git a/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java b/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java index 056b8a9c9..b8d9b9f19 100644 --- a/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java +++ b/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java @@ -26,6 +26,7 @@ import static org.junit.jupiter.api.Assertions.assertNull; import java.io.StringReader; +import java.nio.charset.StandardCharsets; import org.junit.jupiter.api.Test; @@ -104,6 +105,19 @@ void testReadingInDifferentBuffer() throws Exception { } } + @Test + void 
testReadingSupplementaryCharacterTracksBytes() throws Exception { + final String input = "😀"; + final char[] buffer = new char[input.length()]; + try (ExtendedBufferedReader reader = new ExtendedBufferedReader(new StringReader(input), StandardCharsets.UTF_8, true)) { + assertEquals(input.length(), reader.read(buffer, 0, buffer.length)); + assertArrayEquals(input.toCharArray(), buffer); + assertEquals(input.getBytes(StandardCharsets.UTF_8).length, reader.getBytesRead()); + assertEquals(input.length(), reader.getPosition()); + assertEquals(input.charAt(input.length() - 1), reader.getLastChar()); + } + } + @Test void testReadLine() throws Exception { try (ExtendedBufferedReader br = createBufferedReader("")) {