@@ -24,14 +24,14 @@ use ordered_float::OrderedFloat;
24
24
use parquet:: arrow:: arrow_reader:: { RowSelection , RowSelector } ;
25
25
use parquet:: file:: metadata:: RowGroupMetaData ;
26
26
use parquet:: file:: page_index:: index:: Index ;
27
- use parquet:: format :: PageLocation ;
27
+ use parquet:: file :: page_index :: offset_index :: OffsetIndexMetaData ;
28
28
29
29
use crate :: expr:: visitors:: bound_predicate_visitor:: { visit, BoundPredicateVisitor } ;
30
30
use crate :: expr:: { BoundPredicate , BoundReference } ;
31
31
use crate :: spec:: { Datum , PrimitiveLiteral , PrimitiveType , Schema } ;
32
32
use crate :: { Error , ErrorKind , Result } ;
33
33
34
- type OffsetIndex = Vec < Vec < PageLocation > > ;
34
+ type OffsetIndex = Vec < OffsetIndexMetaData > ;
35
35
36
36
const IN_PREDICATE_LIMIT : usize = 200 ;
37
37
@@ -206,13 +206,14 @@ impl<'a> PageIndexEvaluator<'a> {
206
206
}
207
207
208
208
/// returns a list of row counts per page
209
- fn calc_row_counts ( & self , offset_index : & [ PageLocation ] ) -> Vec < usize > {
209
+ fn calc_row_counts ( & self , offset_index : & OffsetIndexMetaData ) -> Vec < usize > {
210
210
let mut remaining_rows = self . row_group_metadata . num_rows ( ) as usize ;
211
211
let mut row_counts = Vec :: with_capacity ( self . offset_index . len ( ) ) ;
212
212
213
- for ( idx, page_location) in offset_index. iter ( ) . enumerate ( ) {
214
- let row_count = if idx < offset_index. len ( ) - 1 {
215
- let row_count = ( offset_index[ idx + 1 ] . first_row_index
213
+ let page_locations = offset_index. page_locations ( ) ;
214
+ for ( idx, page_location) in page_locations. iter ( ) . enumerate ( ) {
215
+ let row_count = if idx < page_locations. len ( ) - 1 {
216
+ let row_count = ( page_locations[ idx + 1 ] . first_row_index
216
217
- page_location. first_row_index ) as usize ;
217
218
remaining_rows -= row_count;
218
219
row_count
@@ -868,6 +869,7 @@ mod tests {
868
869
use parquet:: data_type:: ByteArray ;
869
870
use parquet:: file:: metadata:: { ColumnChunkMetaData , RowGroupMetaData } ;
870
871
use parquet:: file:: page_index:: index:: { Index , NativeIndex , PageIndex } ;
872
+ use parquet:: file:: page_index:: offset_index:: OffsetIndexMetaData ;
871
873
use parquet:: file:: statistics:: Statistics ;
872
874
use parquet:: format:: { BoundaryOrder , PageLocation } ;
873
875
use parquet:: schema:: types:: {
@@ -1417,28 +1419,36 @@ mod tests {
1417
1419
Ok ( row_group_metadata?)
1418
1420
}
1419
1421
1420
- fn create_page_index ( ) -> Result < ( Vec < Index > , Vec < Vec < PageLocation > > ) > {
1422
+ fn create_page_index ( ) -> Result < ( Vec < Index > , Vec < OffsetIndexMetaData > ) > {
1421
1423
let idx_float = Index :: FLOAT ( NativeIndex :: < f32 > {
1422
1424
indexes : vec ! [
1423
1425
PageIndex {
1424
1426
min: None ,
1425
1427
max: None ,
1426
1428
null_count: Some ( 1024 ) ,
1429
+ repetition_level_histogram: None ,
1430
+ definition_level_histogram: None ,
1427
1431
} ,
1428
1432
PageIndex {
1429
1433
min: Some ( 0.0 ) ,
1430
1434
max: Some ( 10.0 ) ,
1431
1435
null_count: Some ( 0 ) ,
1436
+ repetition_level_histogram: None ,
1437
+ definition_level_histogram: None ,
1432
1438
} ,
1433
1439
PageIndex {
1434
1440
min: Some ( 10.0 ) ,
1435
1441
max: Some ( 20.0 ) ,
1436
1442
null_count: Some ( 1 ) ,
1443
+ repetition_level_histogram: None ,
1444
+ definition_level_histogram: None ,
1437
1445
} ,
1438
1446
PageIndex {
1439
1447
min: None ,
1440
1448
max: None ,
1441
1449
null_count: None ,
1450
+ repetition_level_histogram: None ,
1451
+ definition_level_histogram: None ,
1442
1452
} ,
1443
1453
] ,
1444
1454
boundary_order : BoundaryOrder ( 0 ) , // UNORDERED
@@ -1450,26 +1460,36 @@ mod tests {
1450
1460
min: Some ( "AA" . into( ) ) ,
1451
1461
max: Some ( "DD" . into( ) ) ,
1452
1462
null_count: Some ( 0 ) ,
1463
+ repetition_level_histogram: None ,
1464
+ definition_level_histogram: None ,
1453
1465
} ,
1454
1466
PageIndex {
1455
1467
min: Some ( "DE" . into( ) ) ,
1456
1468
max: Some ( "DE" . into( ) ) ,
1457
1469
null_count: Some ( 0 ) ,
1470
+ repetition_level_histogram: None ,
1471
+ definition_level_histogram: None ,
1458
1472
} ,
1459
1473
PageIndex {
1460
1474
min: Some ( "DF" . into( ) ) ,
1461
1475
max: Some ( "UJ" . into( ) ) ,
1462
1476
null_count: Some ( 1 ) ,
1477
+ repetition_level_histogram: None ,
1478
+ definition_level_histogram: None ,
1463
1479
} ,
1464
1480
PageIndex {
1465
1481
min: None ,
1466
1482
max: None ,
1467
1483
null_count: Some ( 48 ) ,
1484
+ repetition_level_histogram: None ,
1485
+ definition_level_histogram: None ,
1468
1486
} ,
1469
1487
PageIndex {
1470
1488
min: None ,
1471
1489
max: None ,
1472
1490
null_count: None ,
1491
+ repetition_level_histogram: None ,
1492
+ definition_level_histogram: None ,
1473
1493
} ,
1474
1494
] ,
1475
1495
boundary_order : BoundaryOrder ( 0 ) , // UNORDERED
@@ -1491,8 +1511,14 @@ mod tests {
1491
1511
] ;
1492
1512
1493
1513
Ok ( ( vec ! [ idx_float, idx_string] , vec ! [
1494
- page_locs_float,
1495
- page_locs_string,
1514
+ OffsetIndexMetaData {
1515
+ page_locations: page_locs_float,
1516
+ unencoded_byte_array_data_bytes: None ,
1517
+ } ,
1518
+ OffsetIndexMetaData {
1519
+ page_locations: page_locs_string,
1520
+ unencoded_byte_array_data_bytes: None ,
1521
+ } ,
1496
1522
] ) )
1497
1523
}
1498
1524
}
0 commit comments