@@ -54,17 +54,27 @@ use crate::schema::types::{
54
54
Type as SchemaType ,
55
55
} ;
56
56
57
- /// [`Index`] for each row group of each column.
57
+ /// Page level statistics for each column chunk of each row group.
58
+ ///
59
+ /// This structure is an in-memory representation of multiple [`ColumnIndex`]
60
+ /// structures in a parquet file footer, as described in the Parquet [PageIndex
61
+ /// documentation]. Each [`Index`] holds statistics about all the pages in a
62
+ /// particular column chunk.
58
63
///
59
64
/// `column_index[row_group_number][column_number]` holds the
60
65
/// [`Index`] corresponding to column `column_number` of row group
61
66
/// `row_group_number`.
62
67
///
63
68
/// For example `column_index[2][3]` holds the [`Index`] for the fourth
64
69
/// column in the third row group of the parquet file.
70
+ ///
71
+ /// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
65
72
pub type ParquetColumnIndex = Vec < Vec < Index > > ;
66
73
67
- /// [`PageLocation`] for each data page of each row group of each column.
74
+ /// [`PageLocation`] for each data page of each row group of each column.
75
+ ///
76
+ /// This structure is the parsed representation of the [`OffsetIndex`] from the
77
+ /// Parquet file footer, as described in the Parquet [PageIndex documentation].
68
78
///
69
79
/// `offset_index[row_group_number][column_number][page_number]` holds
70
80
/// the [`PageLocation`] corresponding to page `page_number` of column
@@ -73,6 +83,8 @@ pub type ParquetColumnIndex = Vec<Vec<Index>>;
73
83
/// For example `offset_index[2][3][4]` holds the [`PageLocation`] for
74
84
/// the fifth page of the fourth column in the third row group of the
75
85
/// parquet file.
86
+ ///
87
+ /// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
76
88
pub type ParquetOffsetIndex = Vec < Vec < Vec < PageLocation > > > ;
77
89
78
90
/// Parsed metadata for a single Parquet file
@@ -946,14 +958,22 @@ impl ColumnChunkMetaDataBuilder {
946
958
}
947
959
}
948
960
949
- /// Builder for column index
961
+ /// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex].
962
+ ///
963
+ /// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
950
964
pub struct ColumnIndexBuilder {
951
965
null_pages : Vec < bool > ,
952
966
min_values : Vec < Vec < u8 > > ,
953
967
max_values : Vec < Vec < u8 > > ,
954
968
null_counts : Vec < i64 > ,
955
969
boundary_order : BoundaryOrder ,
956
- // If one page can't get build index, need to ignore all index in this column
970
+ /// Is the information in the builder valid?
971
+ ///
972
+ /// Set to `false` if any page in the column chunk doesn't have statistics for
973
+ /// some reason, in which case no column index is written for this column chunk.
974
+ /// This might happen if the page is entirely null, or
975
+ /// is a floating point column without any non-nan values
976
+ /// e.g. <https://github.com/apache/parquet-format/pull/196>
957
977
valid : bool ,
958
978
}
959
979
@@ -975,6 +995,7 @@ impl ColumnIndexBuilder {
975
995
}
976
996
}
977
997
998
+ /// Append statistics for the next page
978
999
pub fn append (
979
1000
& mut self ,
980
1001
null_page : bool ,
@@ -992,15 +1013,19 @@ impl ColumnIndexBuilder {
992
1013
self . boundary_order = boundary_order;
993
1014
}
994
1015
1016
+ /// Mark this column index as invalid
995
1017
pub fn to_invalid ( & mut self ) {
996
1018
self . valid = false ;
997
1019
}
998
1020
1021
+ /// Is the information in the builder valid?
999
1022
pub fn valid ( & self ) -> bool {
1000
1023
self . valid
1001
1024
}
1002
1025
1003
1026
/// Build and get the thrift metadata of column index
1027
+ ///
1028
+ /// Note: callers should check [`Self::valid`] before calling this method
1004
1029
pub fn build_to_thrift ( self ) -> ColumnIndex {
1005
1030
ColumnIndex :: new (
1006
1031
self . null_pages ,
@@ -1012,7 +1037,9 @@ impl ColumnIndexBuilder {
1012
1037
}
1013
1038
}
1014
1039
1015
- /// Builder for offset index
1040
+ /// Builder for offset index, part of the Parquet [PageIndex].
1041
+ ///
1042
+ /// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
1016
1043
pub struct OffsetIndexBuilder {
1017
1044
offset_array : Vec < i64 > ,
1018
1045
compressed_page_size_array : Vec < i32 > ,
0 commit comments