@@ -36,8 +36,8 @@ use crate::io::object_cache::ObjectCache;
36
36
use crate :: io:: FileIO ;
37
37
use crate :: runtime:: spawn;
38
38
use crate :: spec:: {
39
- DataContentType , ManifestContentType , ManifestEntryRef , ManifestFile , ManifestList , Schema ,
40
- SchemaRef , SnapshotRef , TableMetadataRef ,
39
+ DataContentType , DataFileFormat , ManifestContentType , ManifestEntryRef , ManifestFile ,
40
+ ManifestList , Schema , SchemaRef , SnapshotRef , TableMetadataRef ,
41
41
} ;
42
42
use crate :: table:: Table ;
43
43
use crate :: utils:: available_parallelism;
@@ -529,14 +529,19 @@ impl ManifestEntryContext {
529
529
/// created from it
530
530
fn into_file_scan_task ( self ) -> FileScanTask {
531
531
FileScanTask {
532
- data_file_path : self . manifest_entry . file_path ( ) . to_string ( ) ,
533
532
// The task spans the whole data file: offset 0 through the size recorded in the manifest entry.
start : 0 ,
534
533
length : self . manifest_entry . file_size_in_bytes ( ) ,
534
// Always `Some` here; the value is the manifest entry's record count.
+ record_count : Some ( self . manifest_entry . record_count ( ) ) ,
535
+
536
+ data_file_path : self . manifest_entry . file_path ( ) . to_string ( ) ,
537
+ data_file_content : self . manifest_entry . content_type ( ) ,
538
+ data_file_format : self . manifest_entry . file_format ( ) ,
539
+
540
+ schema : self . snapshot_schema ,
535
541
project_field_ids : self . field_ids . to_vec ( ) ,
536
542
// Clone out the snapshot-bound predicate when bound predicates are present; otherwise `None`.
predicate : self
537
543
. bound_predicates
538
544
. map ( |x| x. as_ref ( ) . snapshot_bound_predicate . clone ( ) ) ,
539
- schema : self . snapshot_schema ,
540
545
}
541
546
}
542
547
}
@@ -854,35 +859,30 @@ impl ExpressionEvaluatorCache {
854
859
/// A task to scan part of a file.
855
860
#[ derive( Debug , Clone , Serialize , Deserialize ) ]
856
861
pub struct FileScanTask {
857
- data_file_path : String ,
858
- start : u64 ,
859
- length : u64 ,
860
- project_field_ids : Vec < i32 > ,
862
+ /// The offset within the data file at which the scan starts.
863
+ pub start : u64 ,
864
+ /// The length of the portion of the data file to scan.
865
+ pub length : u64 ,
866
+ /// The number of records in the file to scan.
867
+ ///
868
+ /// This is an optional field, and only available if we are
869
+ /// reading the entire data file.
870
+ pub record_count : Option < u64 > ,
871
+
872
+ /// The data file path corresponding to the task.
873
+ pub data_file_path : String ,
874
+ /// The content type of the file to scan.
875
+ pub data_file_content : DataContentType ,
876
+ /// The format of the file to scan.
877
+ pub data_file_format : DataFileFormat ,
878
+
879
+ /// The schema of the file to scan.
880
+ pub schema : SchemaRef ,
881
+ /// The field ids to project.
882
+ pub project_field_ids : Vec < i32 > ,
883
+ /// The bound predicate to filter with, if any; skipped when serializing if it is `None`.
861
884
#[ serde( skip_serializing_if = "Option::is_none" ) ]
862
- predicate : Option < BoundPredicate > ,
863
- schema : SchemaRef ,
864
- }
865
-
866
- impl FileScanTask {
867
- /// Returns the data file path of this file scan task.
868
- pub fn data_file_path ( & self ) -> & str {
869
- & self . data_file_path
870
- }
871
-
872
- /// Returns the project field id of this file scan task.
873
- pub fn project_field_ids ( & self ) -> & [ i32 ] {
874
- & self . project_field_ids
875
- }
876
-
877
- /// Returns the predicate of this file scan task.
878
- pub fn predicate ( & self ) -> Option < & BoundPredicate > {
879
- self . predicate . as_ref ( )
880
- }
881
-
882
- /// Returns the schema id of this file scan task.
883
- pub fn schema ( & self ) -> & Schema {
884
- & self . schema
885
- }
885
+ pub predicate : Option < BoundPredicate > ,
886
886
}
887
887
888
888
#[ cfg( test) ]
@@ -1219,17 +1219,17 @@ mod tests {
1219
1219
1220
1220
assert_eq ! ( tasks. len( ) , 2 ) ;
1221
1221
1222
- tasks. sort_by_key ( |t| t. data_file_path ( ) . to_string ( ) ) ;
1222
+ tasks. sort_by_key ( |t| t. data_file_path . to_string ( ) ) ;
1223
1223
1224
1224
// Check first task is added data file
1225
1225
assert_eq ! (
1226
- tasks[ 0 ] . data_file_path( ) ,
1226
+ tasks[ 0 ] . data_file_path,
1227
1227
format!( "{}/1.parquet" , & fixture. table_location)
1228
1228
) ;
1229
1229
1230
1230
// Check second task is existing data file
1231
1231
assert_eq ! (
1232
- tasks[ 1 ] . data_file_path( ) ,
1232
+ tasks[ 1 ] . data_file_path,
1233
1233
format!( "{}/3.parquet" , & fixture. table_location)
1234
1234
) ;
1235
1235
}
@@ -1582,22 +1582,28 @@ mod tests {
1582
1582
) ;
1583
1583
let task = FileScanTask {
1584
1584
data_file_path : "data_file_path" . to_string ( ) ,
1585
+ data_file_content : DataContentType :: Data ,
1585
1586
start : 0 ,
1586
1587
length : 100 ,
1587
1588
project_field_ids : vec ! [ 1 , 2 , 3 ] ,
1588
1589
predicate : None ,
1589
1590
schema : schema. clone ( ) ,
1591
+ record_count : Some ( 100 ) ,
1592
+ data_file_format : DataFileFormat :: Parquet ,
1590
1593
} ;
1591
1594
test_fn ( task) ;
1592
1595
1593
1596
// with predicate
1594
1597
let task = FileScanTask {
1595
1598
data_file_path : "data_file_path" . to_string ( ) ,
1599
+ data_file_content : DataContentType :: Data ,
1596
1600
start : 0 ,
1597
1601
length : 100 ,
1598
1602
project_field_ids : vec ! [ 1 , 2 , 3 ] ,
1599
1603
predicate : Some ( BoundPredicate :: AlwaysTrue ) ,
1600
1604
schema,
1605
+ record_count : None ,
1606
+ data_file_format : DataFileFormat :: Avro ,
1601
1607
} ;
1602
1608
test_fn ( task) ;
1603
1609
}
0 commit comments