Skip to content

Commit 9be0eb5

Browse files
alambprogval
andauthored
Minor: Improve parquet PageIndex documentation (#6042)
* Minor: Improve parquet PageIndex documentation * More improvements * Add reasons for data page being without null * Apply suggestions from code review Co-authored-by: Val Lorentz <[email protected]> * Update parquet/src/file/page_index/index.rs --------- Co-authored-by: Val Lorentz <[email protected]>
1 parent b44497e commit 9be0eb5

File tree

3 files changed

+67
-20
lines changed

3 files changed

+67
-20
lines changed

parquet/src/file/metadata/mod.rs

+32-5
Original file line numberDiff line numberDiff line change
@@ -54,17 +54,27 @@ use crate::schema::types::{
5454
Type as SchemaType,
5555
};
5656

57-
/// [`Index`] for each row group of each column.
57+
/// Page level statistics for each column chunk of each row group.
58+
///
59+
/// This structure is an in-memory representation of multiple [`ColumnIndex`]
60+
/// structures in a parquet file footer, as described in the Parquet [PageIndex
61+
/// documentation]. Each [`Index`] holds statistics about all the pages in a
62+
/// particular column chunk.
5863
///
5964
/// `column_index[row_group_number][column_number]` holds the
6065
/// [`Index`] corresponding to column `column_number` of row group
6166
/// `row_group_number`.
6267
///
6368
/// For example `column_index[2][3]` holds the [`Index`] for the fourth
6469
/// column in the third row group of the parquet file.
70+
///
71+
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
6572
pub type ParquetColumnIndex = Vec<Vec<Index>>;
6673

67-
/// [`PageLocation`] for each data page of each row group of each column.
74+
/// [`PageLocation`] for each data page of each row group of each column
75+
///
76+
/// This structure is the parsed representation of the [`OffsetIndex`] from the
77+
/// Parquet file footer, as described in the Parquet [PageIndex documentation].
6878
///
6979
/// `offset_index[row_group_number][column_number][page_number]` holds
7080
/// the [`PageLocation`] corresponding to page `page_number` of column
@@ -73,6 +83,8 @@ pub type ParquetColumnIndex = Vec<Vec<Index>>;
7383
/// For example `offset_index[2][3][4]` holds the [`PageLocation`] for
7484
/// the fifth page of the fourth column in the third row group of the
7585
/// parquet file.
86+
///
87+
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
7688
pub type ParquetOffsetIndex = Vec<Vec<Vec<PageLocation>>>;
7789

7890
/// Parsed metadata for a single Parquet file
@@ -946,14 +958,22 @@ impl ColumnChunkMetaDataBuilder {
946958
}
947959
}
948960

949-
/// Builder for column index
961+
/// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex]
962+
///
963+
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
950964
pub struct ColumnIndexBuilder {
951965
null_pages: Vec<bool>,
952966
min_values: Vec<Vec<u8>>,
953967
max_values: Vec<Vec<u8>>,
954968
null_counts: Vec<i64>,
955969
boundary_order: BoundaryOrder,
956-
// If one page can't get build index, need to ignore all index in this column
970+
/// Is the information in the builder valid?
971+
///
972+
/// Set to `false` if any entry in the page doesn't have statistics for
973+
/// some reason, so statistics for that page won't be written to the file.
974+
/// This might happen if the page is entirely null, or
975+
/// is a floating point column without any non-NaN values
976+
/// e.g. <https://github.com/apache/parquet-format/pull/196>
957977
valid: bool,
958978
}
959979

@@ -975,6 +995,7 @@ impl ColumnIndexBuilder {
975995
}
976996
}
977997

998+
/// Append statistics for the next page
978999
pub fn append(
9791000
&mut self,
9801001
null_page: bool,
@@ -992,15 +1013,19 @@ impl ColumnIndexBuilder {
9921013
self.boundary_order = boundary_order;
9931014
}
9941015

1016+
/// Mark this column index as invalid
9951017
pub fn to_invalid(&mut self) {
9961018
self.valid = false;
9971019
}
9981020

1021+
/// Is the information in the builder valid?
9991022
pub fn valid(&self) -> bool {
10001023
self.valid
10011024
}
10021025

10031026
/// Build and get the thrift metadata of column index
1027+
///
1028+
/// Note: callers should check [`Self::valid`] before calling this method
10041029
pub fn build_to_thrift(self) -> ColumnIndex {
10051030
ColumnIndex::new(
10061031
self.null_pages,
@@ -1012,7 +1037,9 @@ impl ColumnIndexBuilder {
10121037
}
10131038
}
10141039

1015-
/// Builder for offset index
1040+
/// Builder for offset index, part of the Parquet [PageIndex].
1041+
///
1042+
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
10161043
pub struct OffsetIndexBuilder {
10171044
offset_array: Vec<i64>,
10181045
compressed_page_size_array: Vec<i32>,

parquet/src/file/page_index/index.rs

+21-13
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,9 @@ use crate::format::{BoundaryOrder, ColumnIndex};
2525
use crate::util::bit_util::from_le_slice;
2626
use std::fmt::Debug;
2727

28-
/// PageIndex Statistics for one data page, as described in [Column Index].
28+
/// Typed statistics for one data page
2929
///
30-
/// One significant difference from the row group level
31-
/// [`Statistics`](crate::format::Statistics) is that page level
32-
/// statistics may not store actual column values as min and max
33-
/// (e.g. they may store truncated strings to save space)
34-
///
35-
/// [Column Index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
30+
/// See [`NativeIndex`] for more details
3631
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
3732
pub struct PageIndex<T> {
3833
/// The minimum value. It is `None` when all values are null
@@ -70,11 +65,9 @@ where
7065

7166
#[derive(Debug, Clone, PartialEq)]
7267
#[allow(non_camel_case_types)]
73-
/// Typed statistics for a data page in a column chunk.
68+
/// Statistics for data pages in a column chunk.
7469
///
75-
/// This structure is part of the "Page Index" and is optionally part of
76-
/// [ColumnIndex] in the parquet file and can be used to skip decoding pages
77-
/// while reading the file data.
70+
/// See [`NativeIndex`] for more information
7871
pub enum Index {
7972
/// Sometimes reading page index from parquet file
8073
/// will only return pageLocations without min_max index,
@@ -117,10 +110,25 @@ impl Index {
117110
}
118111
}
119112

120-
/// Stores the [`PageIndex`] for each page of a column
113+
/// Strongly typed statistics for data pages in a column chunk.
114+
///
115+
/// This structure is a natively typed, in memory representation of the
116+
/// [`ColumnIndex`] structure in a parquet file footer, as described in the
117+
/// Parquet [PageIndex documentation]. The statistics stored in this structure
118+
/// can be used by query engines to skip decoding pages while reading parquet
119+
/// data.
120+
///
121+
/// # Differences with Row Group Level Statistics
122+
///
123+
/// One significant difference between `NativeIndex` and row group level
124+
/// [`Statistics`] is that page level statistics may not store actual column
125+
/// values as min and max (e.g. they may store truncated strings to save space)
126+
///
127+
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
128+
/// [`Statistics`]: crate::file::statistics::Statistics
121129
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
122130
pub struct NativeIndex<T: ParquetValueType> {
123-
/// The indexes, one item per page
131+
/// The actual column indexes, one item per page
124132
pub indexes: Vec<PageIndex<T>>,
125133
/// If the min/max elements are ordered, and if so in which
126134
/// direction. See [source] for details.

parquet/src/file/statistics.rs

+14-2
Original file line numberDiff line numberDiff line change
@@ -287,7 +287,17 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option<TStatistics> {
287287
Some(thrift_stats)
288288
}
289289

290-
/// Statistics for a column chunk and data page.
290+
/// Strongly typed statistics for a column chunk within a row group.
291+
///
292+
/// This structure is a natively typed, in memory representation of the
293+
/// [`Statistics`] structure in a parquet file footer. The statistics stored in
294+
/// this structure can be used by query engines to skip decoding pages while
295+
/// reading parquet data.
296+
///
297+
/// Page level statistics are stored separately, in [NativeIndex].
298+
///
299+
/// [`Statistics`]: crate::format::Statistics
300+
/// [NativeIndex]: crate::file::page_index::index::NativeIndex
291301
#[derive(Debug, Clone, PartialEq)]
292302
pub enum Statistics {
293303
Boolean(ValueStatistics<bool>),
@@ -445,7 +455,9 @@ impl fmt::Display for Statistics {
445455
/// Typed implementation for [`Statistics`].
446456
pub type TypedStatistics<T> = ValueStatistics<<T as DataType>::T>;
447457

448-
/// Statistics for a particular `ParquetValueType`
458+
/// Typed statistics for one column chunk
459+
///
460+
/// See [`Statistics`] for more details
449461
#[derive(Clone, Eq, PartialEq)]
450462
pub struct ValueStatistics<T> {
451463
min: Option<T>,

0 commit comments

Comments
 (0)