diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 64f936554..006de7711 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -53,6 +53,7 @@ CSVParser applies characterOffset to bytePosition (#604). CSVPrinter Reader printing with quote and escape can emit CSV that its parser cannot read back. CSVParser applies maxRows to record numbers instead of rows produced when setRecordNumber(...) is used. + CSVParser with trackBytes enabled throws on multi-character delimiters containing supplementary Unicode characters. CSVFormat.Builder.setNullString(String) can build an invalid quoted null string after setQuote(null). Escape Reader values with quote and escape (#606). Clear escape delimiter buffer before peek in Lexer.isEscapeDelimiter() (#608, #611). diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index 889b58edc..5b519a08c 100644 --- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -108,9 +108,11 @@ long getBytesRead() { } private long getEncodedCharLength(final char[] buf, final int offset, final int length) throws CharacterCodingException { - int len = 0; - for (int i = offset; i < length; i++) { - len += getEncodedCharLength(buf[i]); + long len = 0; + int previous = lastChar; + for (int i = offset; i < offset + length; i++) { + len += getEncodedCharLength(previous, buf[i]); + previous = buf[i]; } return len; } @@ -141,8 +143,12 @@ private long getEncodedCharLength(final char[] buf, final int offset, final int * @throws CharacterCodingException if the character cannot be encoded. */ private int getEncodedCharLength(final int current) throws CharacterCodingException { + return getEncodedCharLength(lastChar, current); + } + + private int getEncodedCharLength(final int previous, final int current) throws CharacterCodingException { final char cChar = (char) current; - final char lChar = (char) lastChar; + final char lChar = (char) previous; if (!Character.isSurrogate(cChar)) { return encoder.encode(CharBuffer.wrap(new char[] { cChar })).limit(); } @@ -218,6 +224,9 @@ public int read(final char[] buf, final int offset, final int length) throws IOE return 0; } final int len = super.read(buf, offset, length); + if (encoder != null && len > 0) { + this.bytesRead += getEncodedCharLength(buf, offset, len); + } if (len > 0) { for (int i = offset; i < offset + len; i++) { final char ch = buf[i]; @@ -233,9 +242,6 @@ public int read(final char[] buf, final int offset, final int length) throws IOE } else if (len == EOF) { lastChar = EOF; } - if (encoder != null) { - this.bytesRead += getEncodedCharLength(buf, offset, len); - } position += len; return len; } diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index 5bece571f..29ca0cf1f 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -666,6 +666,31 @@ void testGetBytePositionMultiCharacterDelimiter() throws IOException { } } + /** + * Tests CSV-329. + */ + @Test + void testGetBytePositionMultiCharacterDelimiterWithSupplementaryCharacter() throws IOException { + final String delimiter = "x😀"; + final String code = "ax😀b\ncx😀d\n"; + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(delimiter).get(); + try (CSVParser parser = CSVParser.builder() + .setReader(new StringReader(code)) + .setFormat(format) + .setCharset(UTF_8) + .setTrackBytes(true) + .get()) { + final CSVRecord first = parser.nextRecord(); + final CSVRecord second = parser.nextRecord(); + assertNotNull(first); + assertNotNull(second); + assertValuesEquals(new String[] { "a", "b" }, first); + assertValuesEquals(new String[] { "c", "d" }, second); + assertEquals(0, first.getBytePosition()); + assertEquals("ax😀b\n".getBytes(UTF_8).length, second.getBytePosition()); + } + } + @Test void testGetBytePositionWithCharacterOffsetAndMultiBytePrefix() throws Exception { final String row0 = "é,x\n"; diff --git a/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java b/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java index 056b8a9c9..b8d9b9f19 100644 --- a/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java +++ b/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java @@ -26,6 +26,7 @@ import static org.junit.jupiter.api.Assertions.assertNull; import java.io.StringReader; +import java.nio.charset.StandardCharsets; import org.junit.jupiter.api.Test; @@ -104,6 +105,19 @@ void testReadingInDifferentBuffer() throws Exception { } } + @Test + void testReadingSupplementaryCharacterTracksBytes() throws Exception { + final String input = "😀"; + final char[] buffer = new char[input.length()]; + try (ExtendedBufferedReader reader = new ExtendedBufferedReader(new StringReader(input), StandardCharsets.UTF_8, true)) { + assertEquals(input.length(), reader.read(buffer, 0, buffer.length)); + assertArrayEquals(input.toCharArray(), buffer); + assertEquals(input.getBytes(StandardCharsets.UTF_8).length, reader.getBytesRead()); + assertEquals(input.length(), reader.getPosition()); + assertEquals(input.charAt(input.length() - 1), reader.getLastChar()); + } + } + @Test void testReadLine() throws Exception { try (ExtendedBufferedReader br = createBufferedReader("")) {