diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 64f936554..006de7711 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -53,6 +53,7 @@
CSVParser applies characterOffset to bytePosition (#604).
CSVPrinter Reader printing with quote and escape can emit CSV that its parser cannot read back.
CSVParser applies maxRows to record numbers instead of rows produced when setRecordNumber(...) is used.
+ CSVParser with trackBytes enabled throws on multi-character delimiters containing supplementary Unicode characters.
CSVFormat.Builder.setNullString(String) can build an invalid quoted null string after setQuote(null).
Escape Reader values with quote and escape (#606).
Clear escape delimiter buffer before peek in Lexer.isEscapeDelimiter() (#608, #611).
diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
index 889b58edc..5b519a08c 100644
--- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
+++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
@@ -108,9 +108,11 @@ long getBytesRead() {
}
private long getEncodedCharLength(final char[] buf, final int offset, final int length) throws CharacterCodingException {
- int len = 0;
- for (int i = offset; i < length; i++) {
- len += getEncodedCharLength(buf[i]);
+ long len = 0; // accumulate as long to match the declared return type
+ int previous = lastChar; // the first element is paired against the reader's current lastChar
+ for (int i = offset; i < offset + length; i++) { // length is a count from offset, not an end index
+ len += getEncodedCharLength(previous, buf[i]);
+ previous = buf[i]; // each later element is paired against the one before it in buf
}
return len;
}
@@ -141,8 +143,12 @@ private long getEncodedCharLength(final char[] buf, final int offset, final int
* @throws CharacterCodingException if the character cannot be encoded.
*/
private int getEncodedCharLength(final int current) throws CharacterCodingException {
+ return getEncodedCharLength(lastChar, current); // single-char form: the preceding char is the lastChar field
+ }
+
+ private int getEncodedCharLength(final int previous, final int current) throws CharacterCodingException {
final char cChar = (char) current;
- final char lChar = (char) lastChar;
+ final char lChar = (char) previous;
if (!Character.isSurrogate(cChar)) {
return encoder.encode(CharBuffer.wrap(new char[] { cChar })).limit();
}
@@ -218,6 +224,9 @@ public int read(final char[] buf, final int offset, final int length) throws IOE
return 0;
}
final int len = super.read(buf, offset, length);
+ if (encoder != null && len > 0) {
+ this.bytesRead += getEncodedCharLength(buf, offset, len);
+ }
if (len > 0) {
for (int i = offset; i < offset + len; i++) {
final char ch = buf[i];
@@ -233,9 +242,6 @@ public int read(final char[] buf, final int offset, final int length) throws IOE
} else if (len == EOF) {
lastChar = EOF;
}
- if (encoder != null) {
- this.bytesRead += getEncodedCharLength(buf, offset, len);
- }
position += len;
return len;
}
diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java
index 5bece571f..29ca0cf1f 100644
--- a/src/test/java/org/apache/commons/csv/CSVParserTest.java
+++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java
@@ -666,6 +666,31 @@ void testGetBytePositionMultiCharacterDelimiter() throws IOException {
}
}
+ /**
+ * Tests CSV-329: byte positions when a multi-character delimiter contains a supplementary character.
+ */
+ @Test
+ void testGetBytePositionMultiCharacterDelimiterWithSupplementaryCharacter() throws IOException {
+ final String delimiter = "x😀"; // the emoji is outside the BMP, so it is a surrogate pair in UTF-16
+ final String code = "ax😀b\ncx😀d\n";
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(delimiter).get();
+ try (CSVParser parser = CSVParser.builder()
+ .setReader(new StringReader(code))
+ .setFormat(format)
+ .setCharset(UTF_8)
+ .setTrackBytes(true)
+ .get()) {
+ final CSVRecord first = parser.nextRecord();
+ final CSVRecord second = parser.nextRecord();
+ assertNotNull(first);
+ assertNotNull(second);
+ assertValuesEquals(new String[] { "a", "b" }, first);
+ assertValuesEquals(new String[] { "c", "d" }, second);
+ assertEquals(0, first.getBytePosition());
+ assertEquals("ax😀b\n".getBytes(UTF_8).length, second.getBytePosition()); // expected start = UTF-8 size of the whole first line
+ }
+ }
+
@Test
void testGetBytePositionWithCharacterOffsetAndMultiBytePrefix() throws Exception {
final String row0 = "é,x\n";
diff --git a/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java b/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java
index 056b8a9c9..b8d9b9f19 100644
--- a/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java
+++ b/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java
@@ -26,6 +26,7 @@
import static org.junit.jupiter.api.Assertions.assertNull;
import java.io.StringReader;
+import java.nio.charset.StandardCharsets;
import org.junit.jupiter.api.Test;
@@ -104,6 +105,19 @@ void testReadingInDifferentBuffer() throws Exception {
}
}
+ @Test
+ void testReadingSupplementaryCharacterTracksBytes() throws Exception {
+ final String input = "😀"; // one supplementary code point, i.e. two UTF-16 chars
+ final char[] buffer = new char[input.length()];
+ try (ExtendedBufferedReader reader = new ExtendedBufferedReader(new StringReader(input), StandardCharsets.UTF_8, true)) {
+ assertEquals(input.length(), reader.read(buffer, 0, buffer.length)); // expects both surrogates from one bulk read
+ assertArrayEquals(input.toCharArray(), buffer);
+ assertEquals(input.getBytes(StandardCharsets.UTF_8).length, reader.getBytesRead()); // expected bytes = UTF-8 size of the input
+ assertEquals(input.length(), reader.getPosition()); // position is compared against the char count, not the byte count
+ assertEquals(input.charAt(input.length() - 1), reader.getLastChar()); // last char is the trailing (low) surrogate
+ }
+ }
+
@Test
void testReadLine() throws Exception {
try (ExtendedBufferedReader br = createBufferedReader("")) {