Skip to content

Commit a19e985

Browse files
authored
GH-3133: Fix SizeStatistics to handle omitted histogram (#3135)
1 parent de3c2d0 commit a19e985

File tree

5 files changed

+31
-6
lines changed

5 files changed

+31
-6
lines changed

parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java

+4 -2
Original file line number | Diff line number | Diff line change
@@ -136,8 +136,10 @@ public SizeStatistics(
136136
List<Long> definitionLevelHistogram) {
137137
this.type = type;
138138
this.unencodedByteArrayDataBytes = unencodedByteArrayDataBytes;
139-
this.repetitionLevelHistogram = repetitionLevelHistogram;
140-
this.definitionLevelHistogram = definitionLevelHistogram;
139+
this.repetitionLevelHistogram =
140+
repetitionLevelHistogram == null ? Collections.emptyList() : repetitionLevelHistogram;
141+
this.definitionLevelHistogram =
142+
definitionLevelHistogram == null ? Collections.emptyList() : definitionLevelHistogram;
141143
}
142144

143145
/**

parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java

+17
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
package org.apache.parquet.column.statistics;
2020

2121
import java.util.Arrays;
22+
import java.util.Collections;
2223
import java.util.Optional;
2324
import org.apache.parquet.io.api.Binary;
2425
import org.apache.parquet.schema.LogicalTypeAnnotation;
@@ -124,4 +125,20 @@ public void testCopyStatistics() {
124125
Assert.assertEquals(Arrays.asList(1L, 1L, 1L), copy.getRepetitionLevelHistogram());
125126
Assert.assertEquals(Arrays.asList(1L, 1L, 1L), copy.getDefinitionLevelHistogram());
126127
}
128+
129+
@Test
130+
public void testOmittedHistogram() {
131+
PrimitiveType type = Types.optional(PrimitiveType.PrimitiveTypeName.BINARY)
132+
.as(LogicalTypeAnnotation.stringType())
133+
.named("a");
134+
SizeStatistics statistics = new SizeStatistics(type, 1024L, null, null);
135+
Assert.assertEquals(Optional.of(1024L), statistics.getUnencodedByteArrayDataBytes());
136+
Assert.assertEquals(Collections.emptyList(), statistics.getRepetitionLevelHistogram());
137+
Assert.assertEquals(Collections.emptyList(), statistics.getDefinitionLevelHistogram());
138+
139+
SizeStatistics copy = statistics.copy();
140+
Assert.assertEquals(Optional.of(1024L), copy.getUnencodedByteArrayDataBytes());
141+
Assert.assertEquals(Collections.emptyList(), copy.getRepetitionLevelHistogram());
142+
Assert.assertEquals(Collections.emptyList(), copy.getDefinitionLevelHistogram());
143+
}
127144
}

parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java

+8 -2
Original file line number | Diff line number | Diff line change
@@ -2382,8 +2382,14 @@ public static SizeStatistics toParquetSizeStatistics(org.apache.parquet.column.s
23822382
formatStats.setUnencoded_byte_array_data_bytes(
23832383
stats.getUnencodedByteArrayDataBytes().get());
23842384
}
2385-
formatStats.setRepetition_level_histogram(stats.getRepetitionLevelHistogram());
2386-
formatStats.setDefinition_level_histogram(stats.getDefinitionLevelHistogram());
2385+
List<Long> repLevelHistogram = stats.getRepetitionLevelHistogram();
2386+
if (repLevelHistogram != null && !repLevelHistogram.isEmpty()) {
2387+
formatStats.setRepetition_level_histogram(repLevelHistogram);
2388+
}
2389+
List<Long> defLevelHistogram = stats.getDefinitionLevelHistogram();
2390+
if (defLevelHistogram != null && !defLevelHistogram.isEmpty()) {
2391+
formatStats.setDefinition_level_histogram(defLevelHistogram);
2392+
}
23872393
return formatStats;
23882394
}
23892395
}

parquet-plugins/parquet-encoding-vector/pom.xml

+1 -1
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@
2222
<parent>
2323
<groupId>org.apache.parquet</groupId>
2424
<artifactId>parquet</artifactId>
25-
<version>1.15.0-SNAPSHOT</version>
25+
<version>1.15.1-SNAPSHOT</version>
2626
<relativePath>../../pom.xml</relativePath>
2727
</parent>
2828

parquet-plugins/parquet-plugins-benchmarks/pom.xml

+1 -1
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@
2222
<parent>
2323
<groupId>org.apache.parquet</groupId>
2424
<artifactId>parquet</artifactId>
25-
<version>1.15.0-SNAPSHOT</version>
25+
<version>1.15.1-SNAPSHOT</version>
2626
<relativePath>../../pom.xml</relativePath>
2727
</parent>
2828

0 commit comments

Comments
 (0)