Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/changes/changes.xml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
<action type="fix" dev="ggregory" due-to="Ruiqi Dong, Gary Gregory" issue="CSV-325">CSVParser applies characterOffset to bytePosition (#604).</action>
<action type="fix" dev="ggregory" due-to="Ruiqi Dong, Gary Gregory" issue="CSV-326">CSVPrinter Reader printing with quote and escape can emit CSV that its parser cannot read back.</action>
<action type="fix" dev="ggregory" due-to="Ruiqi Dong, Gary Gregory" issue="CSV-327">CSVParser applies maxRows to record numbers instead of rows produced when setRecordNumber(...) is used.</action>
<action type="fix" dev="ggregory" due-to="Ruiqi Dong, Gary Gregory" issue="CSV-329">CSVParser with trackBytes enabled throws on multi-character delimiters containing supplementary Unicode characters.</action>
<action type="fix" dev="ggregory" due-to="Ruiqi Dong, Gary Gregory" issue="CSV-328">CSVFormat.Builder.setNullString(String) can build an invalid quoted null string after setQuote(null).</action>
<action type="fix" dev="ggregory" due-to="OldTruckDriver, Gary Gregory" issue="CSV-326">Escape Reader values with quote and escape (#606).</action>
<action type="fix" dev="ggregory" due-to="Dexter.k, Gary Gregory">Clear escape delimiter buffer before peek in Lexer.isEscapeDelimiter() (#608, #611).</action>
Expand Down
20 changes: 13 additions & 7 deletions src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,11 @@ long getBytesRead() {
}

private long getEncodedCharLength(final char[] buf, final int offset, final int length) throws CharacterCodingException {
int len = 0;
for (int i = offset; i < length; i++) {
len += getEncodedCharLength(buf[i]);
long len = 0;
int previous = lastChar;
for (int i = offset; i < offset + length; i++) {
len += getEncodedCharLength(previous, buf[i]);
previous = buf[i];
}
return len;
}
Expand Down Expand Up @@ -141,8 +143,12 @@ private long getEncodedCharLength(final char[] buf, final int offset, final int
* @throws CharacterCodingException if the character cannot be encoded.
*/
private int getEncodedCharLength(final int current) throws CharacterCodingException {
    // Use the most recently read character as the "previous" one and let the
    // two-argument overload compute the encoded length.
    final int previous = lastChar;
    return getEncodedCharLength(previous, current);
}

private int getEncodedCharLength(final int previous, final int current) throws CharacterCodingException {
final char cChar = (char) current;
final char lChar = (char) lastChar;
final char lChar = (char) previous;
if (!Character.isSurrogate(cChar)) {
return encoder.encode(CharBuffer.wrap(new char[] { cChar })).limit();
}
Expand Down Expand Up @@ -218,6 +224,9 @@ public int read(final char[] buf, final int offset, final int length) throws IOE
return 0;
}
final int len = super.read(buf, offset, length);
if (encoder != null && len > 0) {
this.bytesRead += getEncodedCharLength(buf, offset, len);
}
if (len > 0) {
for (int i = offset; i < offset + len; i++) {
final char ch = buf[i];
Expand All @@ -233,9 +242,6 @@ public int read(final char[] buf, final int offset, final int length) throws IOE
} else if (len == EOF) {
lastChar = EOF;
}
if (encoder != null) {
this.bytesRead += getEncodedCharLength(buf, offset, len);
}
position += len;
return len;
}
Expand Down
25 changes: 25 additions & 0 deletions src/test/java/org/apache/commons/csv/CSVParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -666,6 +666,31 @@ void testGetBytePositionMultiCharacterDelimiter() throws IOException {
}
}

/**
 * Tests <a href="https://issues.apache.org/jira/browse/CSV-329">CSV-329</a>: parsing with byte tracking enabled
 * and a multi-character delimiter that contains a supplementary Unicode character.
 */
@Test
void testGetBytePositionMultiCharacterDelimiterWithSupplementaryCharacter() throws IOException {
    // The delimiter is 'x' followed by a supplementary character (a surrogate pair in a Java String).
    final String separator = "x😀";
    final String input = "ax😀b\ncx😀d\n";
    // The second record is expected to start right after the UTF-8 bytes of the first line.
    final int expectedSecondOffset = "ax😀b\n".getBytes(UTF_8).length;
    final String[] expectedFirstValues = { "a", "b" };
    final String[] expectedSecondValues = { "c", "d" };
    final CSVFormat csvFormat = CSVFormat.DEFAULT.builder().setDelimiter(separator).get();
    try (CSVParser csvParser = CSVParser.builder()
            .setReader(new StringReader(input))
            .setFormat(csvFormat)
            .setCharset(UTF_8)
            .setTrackBytes(true)
            .get()) {
        // Read both records before asserting anything about either of them.
        final CSVRecord record1 = csvParser.nextRecord();
        final CSVRecord record2 = csvParser.nextRecord();
        assertNotNull(record1);
        assertNotNull(record2);
        assertValuesEquals(expectedFirstValues, record1);
        assertValuesEquals(expectedSecondValues, record2);
        assertEquals(0, record1.getBytePosition());
        assertEquals(expectedSecondOffset, record2.getBytePosition());
    }
}

@Test
void testGetBytePositionWithCharacterOffsetAndMultiBytePrefix() throws Exception {
final String row0 = "é,x\n";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import static org.junit.jupiter.api.Assertions.assertNull;

import java.io.StringReader;
import java.nio.charset.StandardCharsets;

import org.junit.jupiter.api.Test;

Expand Down Expand Up @@ -104,6 +105,19 @@ void testReadingInDifferentBuffer() throws Exception {
}
}

@Test
void testReadingSupplementaryCharacterTracksBytes() throws Exception {
    // A single supplementary character; its String length is the number of chars the bulk read should return.
    final String emoji = "😀";
    final int charCount = emoji.length();
    final char[] chars = new char[charCount];
    try (ExtendedBufferedReader in = new ExtendedBufferedReader(new StringReader(emoji), StandardCharsets.UTF_8, true)) {
        final int charsRead = in.read(chars, 0, chars.length);
        assertEquals(charCount, charsRead);
        assertArrayEquals(emoji.toCharArray(), chars);
        // The byte counter must match the UTF-8 encoded size of the input.
        assertEquals(emoji.getBytes(StandardCharsets.UTF_8).length, in.getBytesRead());
        assertEquals(charCount, in.getPosition());
        assertEquals(emoji.charAt(charCount - 1), in.getLastChar());
    }
}

@Test
void testReadLine() throws Exception {
try (ExtendedBufferedReader br = createBufferedReader("")) {
Expand Down
Loading