Skip to content

Commit d03c4f8

Browse files
authored
Migrate to arrow-* v53 (apache#626)
* chore: migrate to arrow-* v53
* chore: update datafusion to 42
* test: fix incorrect test assertion
* chore: update python bindings to arrow 53
1 parent 88e5e4a commit d03c4f8

File tree

5 files changed: +188 −97 lines changed

Cargo.toml

+7 −7
Original file line number | Diff line number | Diff line change
@@ -39,12 +39,12 @@ rust-version = "1.77.1"
3939
anyhow = "1.0.72"
4040
apache-avro = "0.17"
4141
array-init = "2"
42-
arrow-arith = { version = "52" }
43-
arrow-array = { version = "52" }
44-
arrow-ord = { version = "52" }
45-
arrow-schema = { version = "52" }
46-
arrow-select = { version = "52" }
47-
arrow-string = { version = "52" }
42+
arrow-arith = { version = "53" }
43+
arrow-array = { version = "53" }
44+
arrow-ord = { version = "53" }
45+
arrow-schema = { version = "53" }
46+
arrow-select = { version = "53" }
47+
arrow-string = { version = "53" }
4848
async-stream = "0.3.5"
4949
async-trait = "0.1"
5050
async-std = "1.12"
@@ -72,7 +72,7 @@ murmur3 = "0.5.2"
7272
once_cell = "1"
7373
opendal = "0.50"
7474
ordered-float = "4"
75-
parquet = "52"
75+
parquet = "53"
7676
paste = "1"
7777
pilota = "0.11.2"
7878
pretty_assertions = "1.4"

bindings/python/Cargo.toml

+2 −2
Original file line number | Diff line number | Diff line change
@@ -32,5 +32,5 @@ crate-type = ["cdylib"]
3232

3333
[dependencies]
3434
iceberg = { path = "../../crates/iceberg" }
35-
pyo3 = { version = "0.21", features = ["extension-module"] }
36-
arrow = { version = "52", features = ["pyarrow"] }
35+
pyo3 = { version = "0.22.3", features = ["extension-module"] }
36+
arrow = { version = "53", features = ["pyarrow"] }

crates/iceberg/src/arrow/schema.rs

+57 −35
Original file line number | Diff line number | Diff line change
@@ -665,54 +665,70 @@ pub(crate) fn get_arrow_datum(datum: &Datum) -> Result<Box<dyn ArrowDatum + Send
665665
}
666666

667667
macro_rules! get_parquet_stat_as_datum {
668-
($limit_type:ident) => {
668+
($limit_type:tt) => {
669669
paste::paste! {
670670
/// Gets the $limit_type value from a parquet Statistics struct, as a Datum
671671
pub(crate) fn [<get_parquet_stat_ $limit_type _as_datum>](
672672
primitive_type: &PrimitiveType, stats: &Statistics
673673
) -> Result<Option<Datum>> {
674-
Ok(Some(match (primitive_type, stats) {
675-
(PrimitiveType::Boolean, Statistics::Boolean(stats)) => Datum::bool(*stats.$limit_type()),
676-
(PrimitiveType::Int, Statistics::Int32(stats)) => Datum::int(*stats.$limit_type()),
677-
(PrimitiveType::Date, Statistics::Int32(stats)) => Datum::date(*stats.$limit_type()),
678-
(PrimitiveType::Long, Statistics::Int64(stats)) => Datum::long(*stats.$limit_type()),
679-
(PrimitiveType::Time, Statistics::Int64(stats)) => Datum::time_micros(*stats.$limit_type())?,
674+
Ok(match (primitive_type, stats) {
675+
(PrimitiveType::Boolean, Statistics::Boolean(stats)) => stats.[<$limit_type _opt>]().map(|val|Datum::bool(*val)),
676+
(PrimitiveType::Int, Statistics::Int32(stats)) => stats.[<$limit_type _opt>]().map(|val|Datum::int(*val)),
677+
(PrimitiveType::Date, Statistics::Int32(stats)) => stats.[<$limit_type _opt>]().map(|val|Datum::date(*val)),
678+
(PrimitiveType::Long, Statistics::Int64(stats)) => stats.[<$limit_type _opt>]().map(|val|Datum::long(*val)),
679+
(PrimitiveType::Time, Statistics::Int64(stats)) => {
680+
let Some(val) = stats.[<$limit_type _opt>]() else {
681+
return Ok(None);
682+
};
683+
684+
Some(Datum::time_micros(*val)?)
685+
}
680686
(PrimitiveType::Timestamp, Statistics::Int64(stats)) => {
681-
Datum::timestamp_micros(*stats.$limit_type())
687+
stats.[<$limit_type _opt>]().map(|val|Datum::timestamp_micros(*val))
682688
}
683689
(PrimitiveType::Timestamptz, Statistics::Int64(stats)) => {
684-
Datum::timestamptz_micros(*stats.$limit_type())
690+
stats.[<$limit_type _opt>]().map(|val|Datum::timestamptz_micros(*val))
685691
}
686692
(PrimitiveType::TimestampNs, Statistics::Int64(stats)) => {
687-
Datum::timestamp_nanos(*stats.$limit_type())
693+
stats.[<$limit_type _opt>]().map(|val|Datum::timestamp_nanos(*val))
688694
}
689695
(PrimitiveType::TimestamptzNs, Statistics::Int64(stats)) => {
690-
Datum::timestamptz_nanos(*stats.$limit_type())
696+
stats.[<$limit_type _opt>]().map(|val|Datum::timestamptz_nanos(*val))
691697
}
692-
(PrimitiveType::Float, Statistics::Float(stats)) => Datum::float(*stats.$limit_type()),
693-
(PrimitiveType::Double, Statistics::Double(stats)) => Datum::double(*stats.$limit_type()),
698+
(PrimitiveType::Float, Statistics::Float(stats)) => stats.[<$limit_type _opt>]().map(|val|Datum::float(*val)),
699+
(PrimitiveType::Double, Statistics::Double(stats)) => stats.[<$limit_type _opt>]().map(|val|Datum::double(*val)),
694700
(PrimitiveType::String, Statistics::ByteArray(stats)) => {
695-
Datum::string(stats.$limit_type().as_utf8()?)
701+
let Some(val) = stats.[<$limit_type _opt>]() else {
702+
return Ok(None);
703+
};
704+
705+
Some(Datum::string(val.as_utf8()?))
696706
}
697707
(PrimitiveType::Decimal {
698708
precision: _,
699709
scale: _,
700710
}, Statistics::ByteArray(stats)) => {
701-
Datum::new(
711+
let Some(bytes) = stats.[<$limit_type _bytes_opt>]() else {
712+
return Ok(None);
713+
};
714+
715+
Some(Datum::new(
702716
primitive_type.clone(),
703-
PrimitiveLiteral::Int128(i128::from_le_bytes(stats.[<$limit_type _bytes>]().try_into()?)),
704-
)
717+
PrimitiveLiteral::Int128(i128::from_le_bytes(bytes.try_into()?)),
718+
))
705719
}
706720
(
707721
PrimitiveType::Decimal {
708722
precision: _,
709723
scale: _,
710724
},
711725
Statistics::Int32(stats)) => {
712-
Datum::new(
713-
primitive_type.clone(),
714-
PrimitiveLiteral::Int128(i128::from(*stats.$limit_type())),
715-
)
726+
stats.[<$limit_type _opt>]().map(|val| {
727+
Datum::new(
728+
primitive_type.clone(),
729+
PrimitiveLiteral::Int128(i128::from(*val)),
730+
)
731+
})
716732
}
717733

718734
(
@@ -722,40 +738,46 @@ macro_rules! get_parquet_stat_as_datum {
722738
},
723739
Statistics::Int64(stats),
724740
) => {
725-
Datum::new(
726-
primitive_type.clone(),
727-
PrimitiveLiteral::Int128(i128::from(*stats.$limit_type())),
728-
)
741+
stats.[<$limit_type _opt>]().map(|val| {
742+
Datum::new(
743+
primitive_type.clone(),
744+
PrimitiveLiteral::Int128(i128::from(*val)),
745+
)
746+
})
729747
}
730748
(PrimitiveType::Uuid, Statistics::FixedLenByteArray(stats)) => {
731-
let raw = stats.[<$limit_type _bytes>]();
732-
if raw.len() != 16 {
749+
let Some(bytes) = stats.[<$limit_type _bytes_opt>]() else {
750+
return Ok(None);
751+
};
752+
if bytes.len() != 16 {
733753
return Err(Error::new(
734754
ErrorKind::Unexpected,
735755
"Invalid length of uuid bytes.",
736756
));
737757
}
738-
Datum::uuid(Uuid::from_bytes(
739-
raw[..16].try_into().unwrap(),
740-
))
758+
Some(Datum::uuid(Uuid::from_bytes(
759+
bytes[..16].try_into().unwrap(),
760+
)))
741761
}
742762
(PrimitiveType::Fixed(len), Statistics::FixedLenByteArray(stat)) => {
743-
let raw = stat.[<$limit_type _bytes>]();
744-
if raw.len() != *len as usize {
763+
let Some(bytes) = stat.[<$limit_type _bytes_opt>]() else {
764+
return Ok(None);
765+
};
766+
if bytes.len() != *len as usize {
745767
return Err(Error::new(
746768
ErrorKind::Unexpected,
747769
"Invalid length of fixed bytes.",
748770
));
749771
}
750-
Datum::fixed(raw.to_vec())
772+
Some(Datum::fixed(bytes.to_vec()))
751773
}
752774
(PrimitiveType::Binary, Statistics::ByteArray(stat)) => {
753-
Datum::binary(stat.[<$limit_type _bytes>]().to_vec())
775+
return Ok(stat.[<$limit_type _bytes_opt>]().map(|bytes|Datum::binary(bytes.to_vec())))
754776
}
755777
_ => {
756778
return Ok(None);
757779
}
758-
}))
780+
})
759781
}
760782
}
761783
}

0 commit comments

Comments
 (0)