From 9229a8879893925aff6f0d40a59c0da9ffc509ec Mon Sep 17 00:00:00 2001 From: "Sung Yun (CODE SIGNING KEY)" Date: Fri, 16 Aug 2024 10:34:57 -0400 Subject: [PATCH 01/17] bucket transform rust binding --- bindings/python/Cargo.toml | 2 + bindings/python/pyproject.toml | 2 +- bindings/python/src/lib.rs | 130 +++++++++++++++++++++++- bindings/python/tests/test_transform.py | 41 ++++++++ 4 files changed, 172 insertions(+), 3 deletions(-) create mode 100644 bindings/python/tests/test_transform.py diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index c2c1007b7..8cf1d7667 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -33,3 +33,5 @@ crate-type = ["cdylib"] [dependencies] iceberg = { path = "../../crates/iceberg" } pyo3 = { version = "0.22", features = ["extension-module"] } +arrow = { version = "52.2.0", features = ["ffi"] } +libc = "0.2" \ No newline at end of file diff --git a/bindings/python/pyproject.toml b/bindings/python/pyproject.toml index 4a489adde..f0f331b1f 100644 --- a/bindings/python/pyproject.toml +++ b/bindings/python/pyproject.toml @@ -32,7 +32,7 @@ classifiers = [ ] [project.optional-dependencies] -test = ["pytest"] +test = ["pytest", "pyarrow"] [tool.maturin] features = ["pyo3/extension-module"] diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs index f0d5d1935..53c33ea9c 100644 --- a/bindings/python/src/lib.rs +++ b/bindings/python/src/lib.rs @@ -15,8 +15,25 @@ // specific language governing permissions and limitations // under the License. +use std::{error, fmt, sync::Arc}; +use iceberg::spec::Transform; +use iceberg::transform::create_transform_function; +use iceberg::Error; use iceberg::io::FileIOBuilder; -use pyo3::prelude::*; + +use arrow::{ + array::{make_array, Array, ArrayData, ArrayRef}, + error::ArrowError, + ffi::{from_ffi, to_ffi}, +}; +use libc::uintptr_t; +use pyo3::wrap_pyfunction; +use pyo3::{exceptions::PyOSError, exceptions::PyValueError, prelude::*}; + +#[derive(Debug)] +enum PyO3ArrowError { + ArrowError(ArrowError), +} #[pyfunction] fn hello_world() -> PyResult { @@ -24,8 +41,117 @@ fn hello_world() -> PyResult { Ok("Hello, world!".to_string()) } +impl fmt::Display for PyO3ArrowError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + PyO3ArrowError::ArrowError(ref e) => e.fmt(f), + } + } +} + +impl error::Error for PyO3ArrowError { + fn source(&self) -> Option<&(dyn error::Error + 'static)> { + match *self { + // The cause is the underlying implementation error type. Is implicitly + // cast to the trait object `&error::Error`. This works because the + // underlying type already implements the `Error` trait. + PyO3ArrowError::ArrowError(ref e) => Some(e), + } + } +} + +impl From for PyO3ArrowError { + fn from(err: ArrowError) -> PyO3ArrowError { + PyO3ArrowError::ArrowError(err) + } +} + +impl From for PyErr { + fn from(err: PyO3ArrowError) -> PyErr { + PyOSError::new_err(err.to_string()) + } +} + +#[derive(Debug)] +enum PyO3IcebergError { + Error(Error), +} + +impl fmt::Display for PyO3IcebergError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + PyO3IcebergError::Error(ref e) => e.fmt(f), + } + } +} + +impl error::Error for PyO3IcebergError { + fn source(&self) -> Option<&(dyn error::Error + 'static)> { + match *self { + // The cause is the underlying implementation error type. Is implicitly + // cast to the trait object `&error::Error`. This works because the + // underlying type already implements the `Error` trait. + PyO3IcebergError::Error(ref e) => Some(e), + } + } +} + +impl From for PyO3IcebergError { + fn from(err: Error) -> PyO3IcebergError { + PyO3IcebergError::Error(err) + } +} + +impl From for PyErr { + fn from(err: PyO3IcebergError) -> PyErr { + PyValueError::new_err(err.to_string()) + } +} + +fn to_rust(ob: PyObject, py: Python) -> PyResult { + // prepare a pointer to receive the Array struct + let (array, schema) = to_ffi(&ArrayData::new_empty(&arrow::datatypes::DataType::Null)) + .map_err(PyO3ArrowError::from)?; + let array_pointer = &array as *const _ as uintptr_t; + let schema_pointer = &schema as *const _ as uintptr_t; + + // make the conversion through PyArrow's private API + // this changes the pointer's memory and is thus unsafe. In particular, `_export_to_c` can go out of bounds + ob.call_method1(py, "_export_to_c", (array_pointer, schema_pointer))?; + + let array = unsafe { from_ffi(array, &schema) }.map_err(PyO3ArrowError::from)?; + let array = make_array(array); + Ok(array) +} + +fn to_py(array: ArrayRef, py: Python) -> PyResult { + let (array, schema) = to_ffi(&array.to_data()).map_err(PyO3ArrowError::from)?; + let array_pointer = &array as *const _ as uintptr_t; + let schema_pointer = &schema as *const _ as uintptr_t; + + let pa = py.import_bound("pyarrow")?; + + let array = pa.getattr("Array")?.call_method1( + "_import_from_c", + (array_pointer as uintptr_t, schema_pointer as uintptr_t), + )?; + Ok(array.to_object(py)) +} + +#[pyfunction] +fn bucket_transform(array: PyObject, num_buckets: u32, py: Python) -> PyResult { + // import + let array = to_rust(array, py)?; + let bucket = create_transform_function(&Transform::Bucket(num_buckets)).map_err(PyO3IcebergError::from)?; + let array = bucket.transform(array).map_err(PyO3IcebergError::from)?; + let array = Arc::new(array); + // export + to_py(array, py) +} + #[pymodule] fn pyiceberg_core_rust(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(hello_world, m)?)?; + m.add_function(wrap_pyfunction!(bucket_transform, m)?)?; Ok(()) -} +} \ No newline at end of file diff --git a/bindings/python/tests/test_transform.py b/bindings/python/tests/test_transform.py new file mode 100644 index 000000000..16f00a94b --- /dev/null +++ b/bindings/python/tests/test_transform.py @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyiceberg_core import bucket_transform + +import pytest +import pyarrow as pa + +def test_bucket_pyarrow_array(): + arr = pa.array([1, 2]) + result = bucket_transform(arr, 10) + expected = pa.array([6, 2], type=pa.int32()) + assert result == expected + +def test_bucket_pyarrow_array_list_type_fails(): + arr = pa.array([[1, 2], [3, 4]]) + with pytest.raises(ValueError, match=r"FeatureUnsupported => Unsupported data type for bucket transform"): + bucket_transform(arr, 10) + +def test_bucket_chunked_array(): + chunked = pa.chunked_array([pa.array([1, 2]), pa.array([3, 4])]) + result_chunks = [] + for arr in chunked.iterchunks(): + result_chunks.append(bucket_transform(arr, 10)) + + expected = pa.chunked_array([pa.array([6, 2], type=pa.int32()), pa.array([5, 0], type=pa.int32())]) + assert pa.chunked_array(result_chunks).equals(expected) \ No newline at end of file From 3106792fc4ed0a81a145e1424eca43e482fb16dc Mon Sep 17 00:00:00 2001 From: "Sung Yun (CODE SIGNING KEY)" Date: Fri, 16 Aug 2024 11:05:31 -0400 Subject: [PATCH 02/17] format --- bindings/python/tests/test_transform.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/bindings/python/tests/test_transform.py b/bindings/python/tests/test_transform.py index 16f00a94b..2b6d277af 100644 --- a/bindings/python/tests/test_transform.py +++ b/bindings/python/tests/test_transform.py @@ -20,22 +20,30 @@ import pytest import pyarrow as pa + def test_bucket_pyarrow_array(): arr = pa.array([1, 2]) result = bucket_transform(arr, 10) expected = pa.array([6, 2], type=pa.int32()) assert result == expected - + + def test_bucket_pyarrow_array_list_type_fails(): arr = pa.array([[1, 2], [3, 4]]) - with pytest.raises(ValueError, match=r"FeatureUnsupported => Unsupported data type for bucket transform"): + with pytest.raises( + ValueError, + match=r"FeatureUnsupported => Unsupported data type for bucket transform", + ): bucket_transform(arr, 10) + def test_bucket_chunked_array(): chunked = pa.chunked_array([pa.array([1, 2]), pa.array([3, 4])]) result_chunks = [] for arr in chunked.iterchunks(): result_chunks.append(bucket_transform(arr, 10)) - - expected = pa.chunked_array([pa.array([6, 2], type=pa.int32()), pa.array([5, 0], type=pa.int32())]) - assert pa.chunked_array(result_chunks).equals(expected) \ No newline at end of file + + expected = pa.chunked_array( + [pa.array([6, 2], type=pa.int32()), pa.array([5, 0], type=pa.int32())] + ) + assert pa.chunked_array(result_chunks).equals(expected) From 1930df2e0b123bfcae999958ec248a4103071e96 Mon Sep 17 00:00:00 2001 From: "Sung Yun (CODE SIGNING KEY)" Date: Fri, 16 Aug 2024 13:04:50 -0400 Subject: [PATCH 03/17] poetry x maturin --- bindings/python/Makefile | 31 +++++ bindings/python/README.md | 10 +- bindings/python/poetry.lock | 218 +++++++++++++++++++++++++++++++++ bindings/python/pyproject.toml | 15 ++- 4 files changed, 264 insertions(+), 10 deletions(-) create mode 100644 bindings/python/Makefile create mode 100644 bindings/python/poetry.lock diff --git a/bindings/python/Makefile b/bindings/python/Makefile new file mode 100644 index 000000000..aae52da60 --- /dev/null +++ b/bindings/python/Makefile @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +install-poetry: + pip install poetry==1.8.3 + +install: | install-poetry + poetry install --no-root + +develop: + poetry run maturin develop + +build: + poetry run maturin build + +test: + poetry run pytest \ No newline at end of file diff --git a/bindings/python/README.md b/bindings/python/README.md index 566a7bcb8..0a64507f8 100644 --- a/bindings/python/README.md +++ b/bindings/python/README.md @@ -24,21 +24,17 @@ This project is used to build an iceberg-rust powered core for pyiceberg. ## Setup ```shell -python -m venv venv -source ./venv/bin/activate - -pip install maturin +make install ``` ## Build ```shell -maturin develop +make develop ``` ## Test ```shell -maturin develop -E test -pytest -v +make test ``` \ No newline at end of file diff --git a/bindings/python/poetry.lock b/bindings/python/poetry.lock new file mode 100644 index 000000000..02e0b2c22 --- /dev/null +++ b/bindings/python/poetry.lock @@ -0,0 +1,218 @@ +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "exceptiongroup" +version = "1.2.2" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, + {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, +] + +[package.extras] +test = ["pytest (>=6)"] + +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + +[[package]] +name = "maturin" +version = "1.7.0" +description = "Build and publish crates with pyo3, cffi and uniffi bindings as well as rust binaries as python packages" +optional = false +python-versions = ">=3.7" +files = [ + {file = "maturin-1.7.0-py3-none-linux_armv6l.whl", hash = "sha256:15fe7920391a128897714f6ed38ebbc771150410b795a55cefca73f089d5aecb"}, + {file = "maturin-1.7.0-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:87a1fae70f1a6ad694832c735abf9f010edc4971c5cf89d2e7a54651a1a3792a"}, + {file = "maturin-1.7.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6fd312c56846d3cafa7c45e362d96b526170e79b9adb5b8ea02a10c88906069c"}, + {file = "maturin-1.7.0-py3-none-manylinux_2_12_i686.manylinux2010_i686.musllinux_1_1_i686.whl", hash = "sha256:928b82ceba924b1642c53f6684271e814b5ce5049cb4d35ff36bed078837eb83"}, + {file = "maturin-1.7.0-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.musllinux_1_1_x86_64.whl", hash = "sha256:7460122333971b2492154c102d2981ae337ae0486dde7f4df7e645d724de59a5"}, + {file = "maturin-1.7.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:1f521ebe0344db8260df0d12779aefc06c1f763cd654151cf4a238fe14f65dc1"}, + {file = "maturin-1.7.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:0af4f2a4cfb99206d414dec138dd3aac3f506eb8928b7e38dfac570461b393d6"}, + {file = "maturin-1.7.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.musllinux_1_1_ppc64le.whl", hash = "sha256:29187d5c3e1e166c14eaadc63a8adc25b6bbb3e5b055d1bc87f6ca92b4b6e331"}, + {file = "maturin-1.7.0-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e9cd5b992b6c131c5f47c85e7bc266bf5bf94f29720856678431ce6c91b726df"}, + {file = "maturin-1.7.0-py3-none-win32.whl", hash = "sha256:c1ae0b4162fb1152aea83098bf1b66a7bf6dd73fd1b108e6c4e22160118a997c"}, + {file = "maturin-1.7.0-py3-none-win_amd64.whl", hash = "sha256:2bd8227e020a9308c076253f29224c53b08b2a4ed41fcd94b4eb9349684fcfe7"}, + {file = "maturin-1.7.0-py3-none-win_arm64.whl", hash = "sha256:7c05226547778f31b73d48a19d11f57792bcc44f4047b84c73ea66cae2e62473"}, + {file = "maturin-1.7.0.tar.gz", hash = "sha256:1ba5277dd7832dc6181d69a005182b97b3520945825058484ffd9296f2efb59c"}, +] + +[package.dependencies] +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} + +[package.extras] +patchelf = ["patchelf"] +zig = ["ziglang (>=0.10.0,<0.13.0)"] + +[[package]] +name = "numpy" +version = "1.24.4" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64"}, + {file = "numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1"}, + {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4"}, + {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ffe43c74893dbf38c2b0a1f5428760a1a9c98285553c89e12d70a96a7f3a4d6"}, + {file = "numpy-1.24.4-cp310-cp310-win32.whl", hash = "sha256:4c21decb6ea94057331e111a5bed9a79d335658c27ce2adb580fb4d54f2ad9bc"}, + {file = "numpy-1.24.4-cp310-cp310-win_amd64.whl", hash = "sha256:b4bea75e47d9586d31e892a7401f76e909712a0fd510f58f5337bea9572c571e"}, + {file = "numpy-1.24.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f136bab9c2cfd8da131132c2cf6cc27331dd6fae65f95f69dcd4ae3c3639c810"}, + {file = "numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2926dac25b313635e4d6cf4dc4e51c8c0ebfed60b801c799ffc4c32bf3d1254"}, + {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:222e40d0e2548690405b0b3c7b21d1169117391c2e82c378467ef9ab4c8f0da7"}, + {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7215847ce88a85ce39baf9e89070cb860c98fdddacbaa6c0da3ffb31b3350bd5"}, + {file = "numpy-1.24.4-cp311-cp311-win32.whl", hash = "sha256:4979217d7de511a8d57f4b4b5b2b965f707768440c17cb70fbf254c4b225238d"}, + {file = "numpy-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:b7b1fc9864d7d39e28f41d089bfd6353cb5f27ecd9905348c24187a768c79694"}, + {file = "numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1452241c290f3e2a312c137a9999cdbf63f78864d63c79039bda65ee86943f61"}, + {file = "numpy-1.24.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:04640dab83f7c6c85abf9cd729c5b65f1ebd0ccf9de90b270cd61935eef0197f"}, + {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5425b114831d1e77e4b5d812b69d11d962e104095a5b9c3b641a218abcc050e"}, + {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd80e219fd4c71fc3699fc1dadac5dcf4fd882bfc6f7ec53d30fa197b8ee22dc"}, + {file = "numpy-1.24.4-cp38-cp38-win32.whl", hash = "sha256:4602244f345453db537be5314d3983dbf5834a9701b7723ec28923e2889e0bb2"}, + {file = "numpy-1.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:692f2e0f55794943c5bfff12b3f56f99af76f902fc47487bdfe97856de51a706"}, + {file = "numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2541312fbf09977f3b3ad449c4e5f4bb55d0dbf79226d7724211acc905049400"}, + {file = "numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9667575fb6d13c95f1b36aca12c5ee3356bf001b714fc354eb5465ce1609e62f"}, + {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a86ed21e4f87050382c7bc96571755193c4c1392490744ac73d660e8f564a9"}, + {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d11efb4dbecbdf22508d55e48d9c8384db795e1b7b51ea735289ff96613ff74d"}, + {file = "numpy-1.24.4-cp39-cp39-win32.whl", hash = "sha256:6620c0acd41dbcb368610bb2f4d83145674040025e5536954782467100aa8835"}, + {file = "numpy-1.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:befe2bf740fd8373cf56149a5c23a0f601e82869598d41f8e188a0e9869926f8"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:31f13e25b4e304632a4619d0e0777662c2ffea99fcae2029556b17d8ff958aef"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2"}, + {file = "numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463"}, +] + +[[package]] +name = "packaging" +version = "24.1" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, + {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, +] + +[[package]] +name = "pluggy" +version = "1.5.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, + {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "pyarrow" +version = "17.0.0" +description = "Python library for Apache Arrow" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07"}, + {file = "pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047"}, + {file = "pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087"}, + {file = "pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977"}, + {file = "pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4"}, + {file = "pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03"}, + {file = "pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22"}, + {file = "pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b"}, + {file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"}, + {file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"}, + {file = "pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c"}, + {file = "pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca"}, + {file = "pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb"}, + {file = "pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda"}, + {file = "pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204"}, + {file = "pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28"}, +] + +[package.dependencies] +numpy = ">=1.16.6" + +[package.extras] +test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] + +[[package]] +name = "pytest" +version = "8.3.2" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pytest-8.3.2-py3-none-any.whl", hash = "sha256:4ba08f9ae7dcf84ded419494d229b48d0903ea6407b030eaec46df5e6a73bba5"}, + {file = "pytest-8.3.2.tar.gz", hash = "sha256:c132345d12ce551242c87269de812483f5bcc87cdbb4722e48487ba194f9fdce"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=1.5,<2" +tomli = {version = ">=1", markers = "python_version < \"3.11\""} + +[package.extras] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] + +[metadata] +lock-version = "2.0" +python-versions = "^3.8" +content-hash = "2934e145a3b88bf1ccfa896630fc288c515106985fd8806dc61d43575a640637" diff --git a/bindings/python/pyproject.toml b/bindings/python/pyproject.toml index f0f331b1f..3d6b79a97 100644 --- a/bindings/python/pyproject.toml +++ b/bindings/python/pyproject.toml @@ -19,9 +19,13 @@ requires = ["maturin>=1.0,<2.0"] build-backend = "maturin" -[project] +[tool.poetry] name = "pyiceberg_core" version = "0.0.1" +repository = "https://github.com/apache/iceberg-rust" +description = "Apache Iceberg is an open table format for huge analytic datasets" +authors = ["Apache Software Foundation "] +license = "Apache License 2.0" classifiers = [ "Development Status :: 4 - Beta", "Intended Audience :: Developers", @@ -31,8 +35,13 @@ classifiers = [ "Programming Language :: Python :: 3.12", ] -[project.optional-dependencies] -test = ["pytest", "pyarrow"] +[tool.poetry.dependencies] +python = "^3.8" + +[tool.poetry.dev-dependencies] +maturin = "^1.0" +pytest = "^8.3.2" +pyarrow = "^17.0.0" [tool.maturin] features = ["pyo3/extension-module"] From 7ebcf5c565ef64e80eb883bdb1712383dc47206c Mon Sep 17 00:00:00 2001 From: "Sung Yun (CODE SIGNING KEY)" Date: Fri, 16 Aug 2024 13:08:36 -0400 Subject: [PATCH 04/17] ignore poetry.lock in license check --- .licenserc.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.licenserc.yaml b/.licenserc.yaml index 38aa58402..c02cee2c8 100644 --- a/.licenserc.yaml +++ b/.licenserc.yaml @@ -31,4 +31,6 @@ header: - '**/DEPENDENCIES.*.tsv' # Release distributions - 'dist/*' + # Generated content by poetry + - poetry.lock comment: on-failure From 683a9cc5d05c168f5ef7a985676faa17350d53f7 Mon Sep 17 00:00:00 2001 From: "Sung Yun (CODE SIGNING KEY)" Date: Fri, 16 Aug 2024 13:08:56 -0400 Subject: [PATCH 05/17] update bindings_python_ci to use makefile --- .github/workflows/bindings_python_ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/bindings_python_ci.yml b/.github/workflows/bindings_python_ci.yml index 82f13e752..8e161bbd6 100644 --- a/.github/workflows/bindings_python_ci.yml +++ b/.github/workflows/bindings_python_ci.yml @@ -78,6 +78,6 @@ jobs: shell: bash run: | set -e - pip install dist/pyiceberg_core-*.whl --force-reinstall - pip install pytest - pytest -v + make install + poetry run pip install dist/pyiceberg_core-*.whl --force-reinstall + make test From 5a5bff5e0933b39dc2e69d61c9c17fe03353b95e Mon Sep 17 00:00:00 2001 From: "Sung Yun (CODE SIGNING KEY)" Date: Fri, 16 Aug 2024 13:21:05 -0400 Subject: [PATCH 06/17] newline --- bindings/python/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs index 53c33ea9c..56f77c183 100644 --- a/bindings/python/src/lib.rs +++ b/bindings/python/src/lib.rs @@ -154,4 +154,4 @@ fn pyiceberg_core_rust(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(hello_world, m)?)?; m.add_function(wrap_pyfunction!(bucket_transform, m)?)?; Ok(()) -} \ No newline at end of file +} From 7f6c65393674451894fd1122e1426ab4bd5af74f Mon Sep 17 00:00:00 2001 From: "Sung Yun (CODE SIGNING KEY)" Date: Fri, 16 Aug 2024 13:42:28 -0400 Subject: [PATCH 07/17] https://github.com/python-poetry/poetry/pull/9135 --- .gitignore | 2 ++ bindings/python/pyproject.toml | 13 +++++++++++++ 2 files changed, 15 insertions(+) diff --git a/.gitignore b/.gitignore index 05c11eda6..809115d66 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,5 @@ dist/* **/venv *.so *.pyc +*.whl +*.tar.gz \ No newline at end of file diff --git a/bindings/python/pyproject.toml b/bindings/python/pyproject.toml index 3d6b79a97..d5e13f886 100644 --- a/bindings/python/pyproject.toml +++ b/bindings/python/pyproject.toml @@ -19,6 +19,19 @@ requires = ["maturin>=1.0,<2.0"] build-backend = "maturin" +[project] +name = "pyiceberg_core" +version = "0.0.1" +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] + +# Duplicate section required: https://github.com/python-poetry/poetry/pull/9135 [tool.poetry] name = "pyiceberg_core" version = "0.0.1" From 2ccbffefe0d0dc83376497bcc4b8486bb06ffd66 Mon Sep 17 00:00:00 2001 From: "Sung Yun (CODE SIGNING KEY)" Date: Fri, 16 Aug 2024 16:20:47 -0400 Subject: [PATCH 08/17] use hatch instead of poetry --- .github/workflows/bindings_python_ci.yml | 6 +- bindings/python/Makefile | 31 ---- bindings/python/README.md | 6 +- bindings/python/poetry.lock | 218 ----------------------- bindings/python/pyproject.toml | 37 ++-- 5 files changed, 18 insertions(+), 280 deletions(-) delete mode 100644 bindings/python/Makefile delete mode 100644 bindings/python/poetry.lock diff --git a/.github/workflows/bindings_python_ci.yml b/.github/workflows/bindings_python_ci.yml index 8e161bbd6..d4b1aa922 100644 --- a/.github/workflows/bindings_python_ci.yml +++ b/.github/workflows/bindings_python_ci.yml @@ -78,6 +78,6 @@ jobs: shell: bash run: | set -e - make install - poetry run pip install dist/pyiceberg_core-*.whl --force-reinstall - make test + pip install hatch==1.12.0 + hatch run dev:pip install dist/pyiceberg_core-*.whl --force-reinstall + hatch run dev:test diff --git a/bindings/python/Makefile b/bindings/python/Makefile deleted file mode 100644 index aae52da60..000000000 --- a/bindings/python/Makefile +++ /dev/null @@ -1,31 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -install-poetry: - pip install poetry==1.8.3 - -install: | install-poetry - poetry install --no-root - -develop: - poetry run maturin develop - -build: - poetry run maturin build - -test: - poetry run pytest \ No newline at end of file diff --git a/bindings/python/README.md b/bindings/python/README.md index 0a64507f8..fe4300e1f 100644 --- a/bindings/python/README.md +++ b/bindings/python/README.md @@ -24,17 +24,17 @@ This project is used to build an iceberg-rust powered core for pyiceberg. ## Setup ```shell -make install +pip install hatch==1.12.0 ``` ## Build ```shell -make develop +hatch run dev:develop ``` ## Test ```shell -make test +hatch run dev:test ``` \ No newline at end of file diff --git a/bindings/python/poetry.lock b/bindings/python/poetry.lock deleted file mode 100644 index 02e0b2c22..000000000 --- a/bindings/python/poetry.lock +++ /dev/null @@ -1,218 +0,0 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. - -[[package]] -name = "colorama" -version = "0.4.6" -description = "Cross-platform colored terminal text." -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" -files = [ - {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, - {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, -] - -[[package]] -name = "exceptiongroup" -version = "1.2.2" -description = "Backport of PEP 654 (exception groups)" -optional = false -python-versions = ">=3.7" -files = [ - {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, - {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, -] - -[package.extras] -test = ["pytest (>=6)"] - -[[package]] -name = "iniconfig" -version = "2.0.0" -description = "brain-dead simple config-ini parsing" -optional = false -python-versions = ">=3.7" -files = [ - {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, - {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, -] - -[[package]] -name = "maturin" -version = "1.7.0" -description = "Build and publish crates with pyo3, cffi and uniffi bindings as well as rust binaries as python packages" -optional = false -python-versions = ">=3.7" -files = [ - {file = "maturin-1.7.0-py3-none-linux_armv6l.whl", hash = "sha256:15fe7920391a128897714f6ed38ebbc771150410b795a55cefca73f089d5aecb"}, - {file = "maturin-1.7.0-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:87a1fae70f1a6ad694832c735abf9f010edc4971c5cf89d2e7a54651a1a3792a"}, - {file = "maturin-1.7.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6fd312c56846d3cafa7c45e362d96b526170e79b9adb5b8ea02a10c88906069c"}, - {file = "maturin-1.7.0-py3-none-manylinux_2_12_i686.manylinux2010_i686.musllinux_1_1_i686.whl", hash = "sha256:928b82ceba924b1642c53f6684271e814b5ce5049cb4d35ff36bed078837eb83"}, - {file = "maturin-1.7.0-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.musllinux_1_1_x86_64.whl", hash = "sha256:7460122333971b2492154c102d2981ae337ae0486dde7f4df7e645d724de59a5"}, - {file = "maturin-1.7.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:1f521ebe0344db8260df0d12779aefc06c1f763cd654151cf4a238fe14f65dc1"}, - {file = "maturin-1.7.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:0af4f2a4cfb99206d414dec138dd3aac3f506eb8928b7e38dfac570461b393d6"}, - {file = "maturin-1.7.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.musllinux_1_1_ppc64le.whl", hash = "sha256:29187d5c3e1e166c14eaadc63a8adc25b6bbb3e5b055d1bc87f6ca92b4b6e331"}, - {file = "maturin-1.7.0-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e9cd5b992b6c131c5f47c85e7bc266bf5bf94f29720856678431ce6c91b726df"}, - {file = "maturin-1.7.0-py3-none-win32.whl", hash = "sha256:c1ae0b4162fb1152aea83098bf1b66a7bf6dd73fd1b108e6c4e22160118a997c"}, - {file = "maturin-1.7.0-py3-none-win_amd64.whl", hash = "sha256:2bd8227e020a9308c076253f29224c53b08b2a4ed41fcd94b4eb9349684fcfe7"}, - {file = "maturin-1.7.0-py3-none-win_arm64.whl", hash = "sha256:7c05226547778f31b73d48a19d11f57792bcc44f4047b84c73ea66cae2e62473"}, - {file = "maturin-1.7.0.tar.gz", hash = "sha256:1ba5277dd7832dc6181d69a005182b97b3520945825058484ffd9296f2efb59c"}, -] - -[package.dependencies] -tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} - -[package.extras] -patchelf = ["patchelf"] -zig = ["ziglang (>=0.10.0,<0.13.0)"] - -[[package]] -name = "numpy" -version = "1.24.4" -description = "Fundamental package for array computing in Python" -optional = false -python-versions = ">=3.8" -files = [ - {file = "numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64"}, - {file = "numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1"}, - {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4"}, - {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ffe43c74893dbf38c2b0a1f5428760a1a9c98285553c89e12d70a96a7f3a4d6"}, - {file = "numpy-1.24.4-cp310-cp310-win32.whl", hash = "sha256:4c21decb6ea94057331e111a5bed9a79d335658c27ce2adb580fb4d54f2ad9bc"}, - {file = "numpy-1.24.4-cp310-cp310-win_amd64.whl", hash = "sha256:b4bea75e47d9586d31e892a7401f76e909712a0fd510f58f5337bea9572c571e"}, - {file = "numpy-1.24.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f136bab9c2cfd8da131132c2cf6cc27331dd6fae65f95f69dcd4ae3c3639c810"}, - {file = "numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2926dac25b313635e4d6cf4dc4e51c8c0ebfed60b801c799ffc4c32bf3d1254"}, - {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:222e40d0e2548690405b0b3c7b21d1169117391c2e82c378467ef9ab4c8f0da7"}, - {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7215847ce88a85ce39baf9e89070cb860c98fdddacbaa6c0da3ffb31b3350bd5"}, - {file = "numpy-1.24.4-cp311-cp311-win32.whl", hash = "sha256:4979217d7de511a8d57f4b4b5b2b965f707768440c17cb70fbf254c4b225238d"}, - {file = "numpy-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:b7b1fc9864d7d39e28f41d089bfd6353cb5f27ecd9905348c24187a768c79694"}, - {file = "numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1452241c290f3e2a312c137a9999cdbf63f78864d63c79039bda65ee86943f61"}, - {file = "numpy-1.24.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:04640dab83f7c6c85abf9cd729c5b65f1ebd0ccf9de90b270cd61935eef0197f"}, - {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5425b114831d1e77e4b5d812b69d11d962e104095a5b9c3b641a218abcc050e"}, - {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd80e219fd4c71fc3699fc1dadac5dcf4fd882bfc6f7ec53d30fa197b8ee22dc"}, - {file = "numpy-1.24.4-cp38-cp38-win32.whl", hash = "sha256:4602244f345453db537be5314d3983dbf5834a9701b7723ec28923e2889e0bb2"}, - {file = "numpy-1.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:692f2e0f55794943c5bfff12b3f56f99af76f902fc47487bdfe97856de51a706"}, - {file = "numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2541312fbf09977f3b3ad449c4e5f4bb55d0dbf79226d7724211acc905049400"}, - {file = "numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9667575fb6d13c95f1b36aca12c5ee3356bf001b714fc354eb5465ce1609e62f"}, - {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a86ed21e4f87050382c7bc96571755193c4c1392490744ac73d660e8f564a9"}, - {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d11efb4dbecbdf22508d55e48d9c8384db795e1b7b51ea735289ff96613ff74d"}, - {file = "numpy-1.24.4-cp39-cp39-win32.whl", hash = "sha256:6620c0acd41dbcb368610bb2f4d83145674040025e5536954782467100aa8835"}, - {file = "numpy-1.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:befe2bf740fd8373cf56149a5c23a0f601e82869598d41f8e188a0e9869926f8"}, - {file = "numpy-1.24.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:31f13e25b4e304632a4619d0e0777662c2ffea99fcae2029556b17d8ff958aef"}, - {file = "numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a"}, - {file = "numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2"}, - {file = "numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463"}, -] - -[[package]] -name = "packaging" -version = "24.1" -description = "Core utilities for Python packages" -optional = false -python-versions = ">=3.8" -files = [ - {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, - {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, -] - -[[package]] -name = "pluggy" -version = "1.5.0" -description = "plugin and hook calling mechanisms for python" -optional = false -python-versions = ">=3.8" -files = [ - {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, - {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, -] - -[package.extras] -dev = ["pre-commit", "tox"] -testing = ["pytest", "pytest-benchmark"] - -[[package]] -name = "pyarrow" -version = "17.0.0" -description = "Python library for Apache Arrow" -optional = false -python-versions = ">=3.8" -files = [ - {file = "pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07"}, - {file = "pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655"}, - {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545"}, - {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2"}, - {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8"}, - {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047"}, - {file = "pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087"}, - {file = "pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977"}, - {file = "pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3"}, - {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15"}, - {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597"}, - {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420"}, - {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4"}, - {file = "pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03"}, - {file = "pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22"}, - {file = "pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053"}, - {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a"}, - {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc"}, - {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a"}, - {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b"}, - {file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"}, - {file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"}, - {file = "pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c"}, - {file = "pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca"}, - {file = "pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb"}, - {file = "pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda"}, - {file = "pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204"}, - {file = "pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28"}, -] - -[package.dependencies] -numpy = ">=1.16.6" - -[package.extras] -test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] - -[[package]] -name = "pytest" -version = "8.3.2" -description = "pytest: simple powerful testing with Python" -optional = false -python-versions = ">=3.8" -files = [ - {file = "pytest-8.3.2-py3-none-any.whl", hash = "sha256:4ba08f9ae7dcf84ded419494d229b48d0903ea6407b030eaec46df5e6a73bba5"}, - {file = "pytest-8.3.2.tar.gz", hash = "sha256:c132345d12ce551242c87269de812483f5bcc87cdbb4722e48487ba194f9fdce"}, -] - -[package.dependencies] -colorama = {version = "*", markers = "sys_platform == \"win32\""} -exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} -iniconfig = "*" -packaging = "*" -pluggy = ">=1.5,<2" -tomli = {version = ">=1", markers = "python_version < \"3.11\""} - -[package.extras] -dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] - -[[package]] -name = "tomli" -version = "2.0.1" -description = "A lil' TOML parser" -optional = false -python-versions = ">=3.7" -files = [ - {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, - {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, -] - -[metadata] -lock-version = "2.0" -python-versions = "^3.8" -content-hash = "2934e145a3b88bf1ccfa896630fc288c515106985fd8806dc61d43575a640637" diff --git a/bindings/python/pyproject.toml b/bindings/python/pyproject.toml index d5e13f886..389fd6e6c 100644 --- a/bindings/python/pyproject.toml +++ b/bindings/python/pyproject.toml @@ -31,31 +31,6 @@ classifiers = [ "Programming Language :: Python :: 3.12", ] -# Duplicate section required: https://github.com/python-poetry/poetry/pull/9135 -[tool.poetry] -name = "pyiceberg_core" -version = "0.0.1" -repository = "https://github.com/apache/iceberg-rust" -description = "Apache Iceberg is an open table format for huge analytic datasets" -authors = ["Apache Software Foundation "] -license = "Apache License 2.0" -classifiers = [ - "Development Status :: 4 - Beta", - "Intended Audience :: Developers", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", -] - -[tool.poetry.dependencies] -python = "^3.8" - -[tool.poetry.dev-dependencies] -maturin = "^1.0" -pytest = "^8.3.2" -pyarrow = "^17.0.0" - [tool.maturin] features = ["pyo3/extension-module"] python-source = "python" @@ -63,3 +38,15 @@ module-name = "pyiceberg_core.pyiceberg_core_rust" [tool.ruff.lint] ignore = ["F403", "F405"] + +[tool.hatch.envs.dev] +dependencies = [ + "maturin>=1.0,<2.0", + "pytest>=8.3.2", + "pyarrow>=17.0.0", +] + +[tool.hatch.envs.dev.scripts] +develop = "maturin develop" +build = "maturin build --out dist --sdist" +test = "pytest" \ No newline at end of file From fce09ae3148051b6bc0156e0d0f1e0d1ef5ed03d Mon Sep 17 00:00:00 2001 From: "Sung Yun (CODE SIGNING KEY)" Date: Sun, 18 Aug 2024 21:21:19 -0400 Subject: [PATCH 09/17] refactor --- bindings/python/src/lib.rs | 127 +-------------------------- bindings/python/src/transform.rs | 142 +++++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+), 124 deletions(-) create mode 100644 bindings/python/src/transform.rs diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs index 56f77c183..c3363e1be 100644 --- a/bindings/python/src/lib.rs +++ b/bindings/python/src/lib.rs @@ -15,25 +15,11 @@ // specific language governing permissions and limitations // under the License. -use std::{error, fmt, sync::Arc}; -use iceberg::spec::Transform; -use iceberg::transform::create_transform_function; -use iceberg::Error; use iceberg::io::FileIOBuilder; - -use arrow::{ - array::{make_array, Array, ArrayData, ArrayRef}, - error::ArrowError, - ffi::{from_ffi, to_ffi}, -}; -use libc::uintptr_t; +use pyo3::prelude::*; use pyo3::wrap_pyfunction; -use pyo3::{exceptions::PyOSError, exceptions::PyValueError, prelude::*}; -#[derive(Debug)] -enum PyO3ArrowError { - ArrowError(ArrowError), -} +mod transform; #[pyfunction] fn hello_world() -> PyResult { @@ -41,116 +27,9 @@ fn hello_world() -> PyResult { Ok("Hello, world!".to_string()) } -impl fmt::Display for PyO3ArrowError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match *self { - PyO3ArrowError::ArrowError(ref e) => e.fmt(f), - } - } -} - -impl error::Error for PyO3ArrowError { - fn source(&self) -> Option<&(dyn error::Error + 'static)> { - match *self { - // The cause is the underlying implementation error type. Is implicitly - // cast to the trait object `&error::Error`. This works because the - // underlying type already implements the `Error` trait. - PyO3ArrowError::ArrowError(ref e) => Some(e), - } - } -} - -impl From for PyO3ArrowError { - fn from(err: ArrowError) -> PyO3ArrowError { - PyO3ArrowError::ArrowError(err) - } -} - -impl From for PyErr { - fn from(err: PyO3ArrowError) -> PyErr { - PyOSError::new_err(err.to_string()) - } -} - -#[derive(Debug)] -enum PyO3IcebergError { - Error(Error), -} - -impl fmt::Display for PyO3IcebergError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match *self { - PyO3IcebergError::Error(ref e) => e.fmt(f), - } - } -} - -impl error::Error for PyO3IcebergError { - fn source(&self) -> Option<&(dyn error::Error + 'static)> { - match *self { - // The cause is the underlying implementation error type. Is implicitly - // cast to the trait object `&error::Error`. This works because the - // underlying type already implements the `Error` trait. - PyO3IcebergError::Error(ref e) => Some(e), - } - } -} - -impl From for PyO3IcebergError { - fn from(err: Error) -> PyO3IcebergError { - PyO3IcebergError::Error(err) - } -} - -impl From for PyErr { - fn from(err: PyO3IcebergError) -> PyErr { - PyValueError::new_err(err.to_string()) - } -} - -fn to_rust(ob: PyObject, py: Python) -> PyResult { - // prepare a pointer to receive the Array struct - let (array, schema) = to_ffi(&ArrayData::new_empty(&arrow::datatypes::DataType::Null)) - .map_err(PyO3ArrowError::from)?; - let array_pointer = &array as *const _ as uintptr_t; - let schema_pointer = &schema as *const _ as uintptr_t; - - // make the conversion through PyArrow's private API - // this changes the pointer's memory and is thus unsafe. In particular, `_export_to_c` can go out of bounds - ob.call_method1(py, "_export_to_c", (array_pointer, schema_pointer))?; - - let array = unsafe { from_ffi(array, &schema) }.map_err(PyO3ArrowError::from)?; - let array = make_array(array); - Ok(array) -} - -fn to_py(array: ArrayRef, py: Python) -> PyResult { - let (array, schema) = to_ffi(&array.to_data()).map_err(PyO3ArrowError::from)?; - let array_pointer = &array as *const _ as uintptr_t; - let schema_pointer = &schema as *const _ as uintptr_t; - - let pa = py.import_bound("pyarrow")?; - - let array = pa.getattr("Array")?.call_method1( - "_import_from_c", - (array_pointer as uintptr_t, schema_pointer as uintptr_t), - )?; - Ok(array.to_object(py)) -} - -#[pyfunction] -fn bucket_transform(array: PyObject, num_buckets: u32, py: Python) -> PyResult { - // import - let array = to_rust(array, py)?; - let bucket = create_transform_function(&Transform::Bucket(num_buckets)).map_err(PyO3IcebergError::from)?; - let array = bucket.transform(array).map_err(PyO3IcebergError::from)?; - let array = Arc::new(array); - // export - to_py(array, py) -} - #[pymodule] fn pyiceberg_core_rust(m: &Bound<'_, PyModule>) -> PyResult<()> { + use transform::bucket_transform; m.add_function(wrap_pyfunction!(hello_world, m)?)?; m.add_function(wrap_pyfunction!(bucket_transform, m)?)?; Ok(()) diff --git a/bindings/python/src/transform.rs b/bindings/python/src/transform.rs new file mode 100644 index 000000000..610e1aa8d --- /dev/null +++ b/bindings/python/src/transform.rs @@ -0,0 +1,142 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{error, fmt, sync::Arc}; +use iceberg::spec::Transform; +use iceberg::transform::create_transform_function; +use iceberg::Error; + +use arrow::{ + array::{make_array, Array, ArrayData, ArrayRef}, + error::ArrowError, + ffi::{from_ffi, to_ffi}, +}; +use libc::uintptr_t; +use pyo3::{exceptions::PyOSError, exceptions::PyValueError, prelude::*}; + +#[derive(Debug)] +enum PyO3ArrowError { + ArrowError(ArrowError), +} + +impl fmt::Display for PyO3ArrowError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + PyO3ArrowError::ArrowError(ref e) => e.fmt(f), + } + } +} + +impl error::Error for PyO3ArrowError { + fn source(&self) -> Option<&(dyn error::Error + 'static)> { + match *self { + // The cause is the underlying implementation error type. Is implicitly + // cast to the trait object `&error::Error`. This works because the + // underlying type already implements the `Error` trait. + PyO3ArrowError::ArrowError(ref e) => Some(e), + } + } +} + +impl From for PyO3ArrowError { + fn from(err: ArrowError) -> PyO3ArrowError { + PyO3ArrowError::ArrowError(err) + } +} + +impl From for PyErr { + fn from(err: PyO3ArrowError) -> PyErr { + PyOSError::new_err(err.to_string()) + } +} + +#[derive(Debug)] +enum PyO3IcebergError { + Error(Error), +} + +impl fmt::Display for PyO3IcebergError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + PyO3IcebergError::Error(ref e) => e.fmt(f), + } + } +} + +impl error::Error for PyO3IcebergError { + fn source(&self) -> Option<&(dyn error::Error + 'static)> { + match *self { + // The cause is the underlying implementation error type. Is implicitly + // cast to the trait object `&error::Error`. This works because the + // underlying type already implements the `Error` trait. + PyO3IcebergError::Error(ref e) => Some(e), + } + } +} + +impl From for PyO3IcebergError { + fn from(err: Error) -> PyO3IcebergError { + PyO3IcebergError::Error(err) + } +} + +impl From for PyErr { + fn from(err: PyO3IcebergError) -> PyErr { + PyValueError::new_err(err.to_string()) + } +} + +fn to_rust_arrow_array(ob: PyObject, py: Python) -> PyResult { + // prepare a pointer to receive the Array struct + let (array, schema) = to_ffi(&ArrayData::new_empty(&arrow::datatypes::DataType::Null)) + .map_err(PyO3ArrowError::from)?; + let array_pointer = &array as *const _ as uintptr_t; + let schema_pointer = &schema as *const _ as uintptr_t; + + // make the conversion through PyArrow's private API + // this changes the pointer's memory and is thus unsafe. In particular, `_export_to_c` can go out of bounds + ob.call_method1(py, "_export_to_c", (array_pointer, schema_pointer))?; + + let array = unsafe { from_ffi(array, &schema) }.map_err(PyO3ArrowError::from)?; + let array = make_array(array); + Ok(array) +} + +fn to_pyarrow_array(array: ArrayRef, py: Python) -> PyResult { + let (array, schema) = to_ffi(&array.to_data()).map_err(PyO3ArrowError::from)?; + let array_pointer = &array as *const _ as uintptr_t; + let schema_pointer = &schema as *const _ as uintptr_t; + + let pa = py.import_bound("pyarrow")?; + + let array = pa.getattr("Array")?.call_method1( + "_import_from_c", + (array_pointer as uintptr_t, schema_pointer as uintptr_t), + )?; + Ok(array.to_object(py)) +} + +#[pyfunction] +pub fn bucket_transform(array: PyObject, num_buckets: u32, py: Python) -> PyResult { + // import + let array = to_rust_arrow_array(array, py)?; + let bucket = create_transform_function(&Transform::Bucket(num_buckets)).map_err(PyO3IcebergError::from)?; + let array = bucket.transform(array).map_err(PyO3IcebergError::from)?; + let array = Arc::new(array); + // export + to_pyarrow_array(array, py) +} From 2a3bdb32d350e51694dd6346043252b00812807c Mon Sep 17 00:00:00 2001 From: "Sung Yun (CODE SIGNING KEY)" Date: Mon, 19 Aug 2024 12:36:40 -0400 Subject: [PATCH 10/17] revert licenserc change --- .licenserc.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.licenserc.yaml b/.licenserc.yaml index c02cee2c8..38aa58402 100644 --- a/.licenserc.yaml +++ b/.licenserc.yaml @@ -31,6 +31,4 @@ header: - '**/DEPENDENCIES.*.tsv' # Release distributions - 'dist/*' - # Generated content by poetry - - poetry.lock comment: on-failure From 3308626e98ae3a3ce2af3653a17ad540ce103850 Mon Sep 17 00:00:00 2001 From: "Sung Yun (CODE SIGNING KEY)" Date: Mon, 19 Aug 2024 14:01:08 -0400 Subject: [PATCH 11/17] adopt review feedback --- bindings/python/Cargo.toml | 4 +- bindings/python/src/lib.rs | 16 ++++- bindings/python/src/transform.rs | 83 +++---------------------- bindings/python/tests/test_transform.py | 2 +- 4 files changed, 25 insertions(+), 80 deletions(-) diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index 8cf1d7667..a8bed6757 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -32,6 +32,6 @@ crate-type = ["cdylib"] [dependencies] iceberg = { path = "../../crates/iceberg" } -pyo3 = { version = "0.22", features = ["extension-module"] } -arrow = { version = "52.2.0", features = ["ffi"] } +pyo3 = { version = "0.21.1", features = ["extension-module"] } +arrow = { version = "52.2.0", features = ["pyarrow"] } libc = "0.2" \ No newline at end of file diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs index c3363e1be..d257760fd 100644 --- a/bindings/python/src/lib.rs +++ b/bindings/python/src/lib.rs @@ -28,9 +28,21 @@ fn hello_world() -> PyResult { } #[pymodule] -fn pyiceberg_core_rust(m: &Bound<'_, PyModule>) -> PyResult<()> { +fn submodule(_py: Python, module: &Bound<'_, PyModule>) -> PyResult<()> { use transform::bucket_transform; + + module.add_wrapped(wrap_pyfunction!(bucket_transform))?; + Ok(()) +} + +#[pymodule] +fn pyiceberg_core_rust(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(hello_world, m)?)?; - m.add_function(wrap_pyfunction!(bucket_transform, m)?)?; + + // https://github.com/PyO3/pyo3/issues/759 + let child_module = PyModule::new_bound(py, "pyiceberg_core.transform")?; + submodule(py, &child_module)?; + m.add("transform", child_module.clone())?; + py.import_bound("sys")?.getattr("modules")?.set_item("pyiceberg_core.transform", child_module)?; Ok(()) } diff --git a/bindings/python/src/transform.rs b/bindings/python/src/transform.rs index 610e1aa8d..58cdf0d17 100644 --- a/bindings/python/src/transform.rs +++ b/bindings/python/src/transform.rs @@ -15,54 +15,16 @@ // specific language governing permissions and limitations // under the License. -use std::{error, fmt, sync::Arc}; +use std::{error, fmt}; use iceberg::spec::Transform; use iceberg::transform::create_transform_function; use iceberg::Error; use arrow::{ - array::{make_array, Array, ArrayData, ArrayRef}, - error::ArrowError, - ffi::{from_ffi, to_ffi}, + array::{make_array, Array, ArrayData}, }; -use libc::uintptr_t; -use pyo3::{exceptions::PyOSError, exceptions::PyValueError, prelude::*}; - -#[derive(Debug)] -enum PyO3ArrowError { - ArrowError(ArrowError), -} - -impl fmt::Display for PyO3ArrowError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match *self { - PyO3ArrowError::ArrowError(ref e) => e.fmt(f), - } - } -} - -impl error::Error for PyO3ArrowError { - fn source(&self) -> Option<&(dyn error::Error + 'static)> { - match *self { - // The cause is the underlying implementation error type. Is implicitly - // cast to the trait object `&error::Error`. This works because the - // underlying type already implements the `Error` trait. - PyO3ArrowError::ArrowError(ref e) => Some(e), - } - } -} - -impl From for PyO3ArrowError { - fn from(err: ArrowError) -> PyO3ArrowError { - PyO3ArrowError::ArrowError(err) - } -} - -impl From for PyErr { - fn from(err: PyO3ArrowError) -> PyErr { - PyOSError::new_err(err.to_string()) - } -} +use arrow::pyarrow::{FromPyArrow, ToPyArrow}; +use pyo3::{exceptions::PyValueError, prelude::*}; #[derive(Debug)] enum PyO3IcebergError { @@ -100,43 +62,14 @@ impl From for PyErr { } } -fn to_rust_arrow_array(ob: PyObject, py: Python) -> PyResult { - // prepare a pointer to receive the Array struct - let (array, schema) = to_ffi(&ArrayData::new_empty(&arrow::datatypes::DataType::Null)) - .map_err(PyO3ArrowError::from)?; - let array_pointer = &array as *const _ as uintptr_t; - let schema_pointer = &schema as *const _ as uintptr_t; - - // make the conversion through PyArrow's private API - // this changes the pointer's memory and is thus unsafe. In particular, `_export_to_c` can go out of bounds - ob.call_method1(py, "_export_to_c", (array_pointer, schema_pointer))?; - - let array = unsafe { from_ffi(array, &schema) }.map_err(PyO3ArrowError::from)?; - let array = make_array(array); - Ok(array) -} - -fn to_pyarrow_array(array: ArrayRef, py: Python) -> PyResult { - let (array, schema) = to_ffi(&array.to_data()).map_err(PyO3ArrowError::from)?; - let array_pointer = &array as *const _ as uintptr_t; - let schema_pointer = &schema as *const _ as uintptr_t; - - let pa = py.import_bound("pyarrow")?; - - let array = pa.getattr("Array")?.call_method1( - "_import_from_c", - (array_pointer as uintptr_t, schema_pointer as uintptr_t), - )?; - Ok(array.to_object(py)) -} - #[pyfunction] pub fn bucket_transform(array: PyObject, num_buckets: u32, py: Python) -> PyResult { // import - let array = to_rust_arrow_array(array, py)?; + let array = ArrayData::from_pyarrow_bound(array.bind(py))?; + let array = make_array(array); let bucket = create_transform_function(&Transform::Bucket(num_buckets)).map_err(PyO3IcebergError::from)?; let array = bucket.transform(array).map_err(PyO3IcebergError::from)?; - let array = Arc::new(array); // export - to_pyarrow_array(array, py) + let array = array.into_data(); + array.to_pyarrow(py) } diff --git a/bindings/python/tests/test_transform.py b/bindings/python/tests/test_transform.py index 2b6d277af..8a160ce12 100644 --- a/bindings/python/tests/test_transform.py +++ b/bindings/python/tests/test_transform.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -from pyiceberg_core import bucket_transform +from pyiceberg_core.transform import bucket_transform import pytest import pyarrow as pa From 342527741b859afbe79b5cab0a030d57c1226d0e Mon Sep 17 00:00:00 2001 From: "Sung Yun (CODE SIGNING KEY)" Date: Mon, 19 Aug 2024 14:41:33 -0400 Subject: [PATCH 12/17] comments --- bindings/python/src/lib.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs index d257760fd..761b8c1a1 100644 --- a/bindings/python/src/lib.rs +++ b/bindings/python/src/lib.rs @@ -40,6 +40,10 @@ fn pyiceberg_core_rust(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(hello_world, m)?)?; // https://github.com/PyO3/pyo3/issues/759 + // Submodules added through PyO3 cannot be imported in Python using + // the syntax: 'from parent.child import function'. + // We need to add the submodule in sys.modules manually so that + // Python can find it. let child_module = PyModule::new_bound(py, "pyiceberg_core.transform")?; submodule(py, &child_module)?; m.add("transform", child_module.clone())?; From fdff7e93b3ec48e50e95947bd2b283f8ffad95a7 Mon Sep 17 00:00:00 2001 From: "Sung Yun (CODE SIGNING KEY)" Date: Mon, 19 Aug 2024 15:22:59 -0400 Subject: [PATCH 13/17] unused dependency --- bindings/python/Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index a8bed6757..0260f788b 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -34,4 +34,3 @@ crate-type = ["cdylib"] iceberg = { path = "../../crates/iceberg" } pyo3 = { version = "0.21.1", features = ["extension-module"] } arrow = { version = "52.2.0", features = ["pyarrow"] } -libc = "0.2" \ No newline at end of file From 6e1b411eb8f891a08cc65520dbbbda64457eb4f7 Mon Sep 17 00:00:00 2001 From: sungwy Date: Wed, 21 Aug 2024 13:58:26 -0400 Subject: [PATCH 14/17] adopt review comment --- bindings/python/src/transform.rs | 42 +++----------------------------- 1 file changed, 4 insertions(+), 38 deletions(-) diff --git a/bindings/python/src/transform.rs b/bindings/python/src/transform.rs index 58cdf0d17..87115fbce 100644 --- a/bindings/python/src/transform.rs +++ b/bindings/python/src/transform.rs @@ -15,10 +15,8 @@ // specific language governing permissions and limitations // under the License. -use std::{error, fmt}; use iceberg::spec::Transform; use iceberg::transform::create_transform_function; -use iceberg::Error; use arrow::{ array::{make_array, Array, ArrayData}, @@ -26,40 +24,8 @@ use arrow::{ use arrow::pyarrow::{FromPyArrow, ToPyArrow}; use pyo3::{exceptions::PyValueError, prelude::*}; -#[derive(Debug)] -enum PyO3IcebergError { - Error(Error), -} - -impl fmt::Display for PyO3IcebergError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match *self { - PyO3IcebergError::Error(ref e) => e.fmt(f), - } - } -} - -impl error::Error for PyO3IcebergError { - fn source(&self) -> Option<&(dyn error::Error + 'static)> { - match *self { - // The cause is the underlying implementation error type. Is implicitly - // cast to the trait object `&error::Error`. This works because the - // underlying type already implements the `Error` trait. - PyO3IcebergError::Error(ref e) => Some(e), - } - } -} - -impl From for PyO3IcebergError { - fn from(err: Error) -> PyO3IcebergError { - PyO3IcebergError::Error(err) - } -} - -impl From for PyErr { - fn from(err: PyO3IcebergError) -> PyErr { - PyValueError::new_err(err.to_string()) - } +fn to_py_err(err: iceberg::Error) -> PyErr { + PyValueError::new_err(err.to_string()) } #[pyfunction] @@ -67,8 +33,8 @@ pub fn bucket_transform(array: PyObject, num_buckets: u32, py: Python) -> PyResu // import let array = ArrayData::from_pyarrow_bound(array.bind(py))?; let array = make_array(array); - let bucket = create_transform_function(&Transform::Bucket(num_buckets)).map_err(PyO3IcebergError::from)?; - let array = bucket.transform(array).map_err(PyO3IcebergError::from)?; + let bucket = create_transform_function(&Transform::Bucket(num_buckets)).map_err(to_py_err)?; + let array = bucket.transform(array).map_err(to_py_err)?; // export let array = array.into_data(); array.to_pyarrow(py) From 1f8eebc78d1601f9f1d52b5ddd34f82b4426b1dd Mon Sep 17 00:00:00 2001 From: sungwy Date: Wed, 21 Aug 2024 14:00:46 -0400 Subject: [PATCH 15/17] newline --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 809115d66..a3f05e817 100644 --- a/.gitignore +++ b/.gitignore @@ -25,4 +25,4 @@ dist/* *.so *.pyc *.whl -*.tar.gz \ No newline at end of file +*.tar.gz From ac4c42d68f255ac5c2ae06a3a56a153429890193 Mon Sep 17 00:00:00 2001 From: sungwy Date: Wed, 21 Aug 2024 16:20:54 -0400 Subject: [PATCH 16/17] I like this approach a lot better --- bindings/python/src/lib.rs | 19 +-------- bindings/python/src/transform.rs | 54 +++++++++++++++++++++++-- bindings/python/tests/test_transform.py | 8 ++-- 3 files changed, 56 insertions(+), 25 deletions(-) diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs index 761b8c1a1..5c3f77ff7 100644 --- a/bindings/python/src/lib.rs +++ b/bindings/python/src/lib.rs @@ -27,26 +27,11 @@ fn hello_world() -> PyResult { Ok("Hello, world!".to_string()) } -#[pymodule] -fn submodule(_py: Python, module: &Bound<'_, PyModule>) -> PyResult<()> { - use transform::bucket_transform; - - module.add_wrapped(wrap_pyfunction!(bucket_transform))?; - Ok(()) -} #[pymodule] -fn pyiceberg_core_rust(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { +fn pyiceberg_core_rust(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(hello_world, m)?)?; - // https://github.com/PyO3/pyo3/issues/759 - // Submodules added through PyO3 cannot be imported in Python using - // the syntax: 'from parent.child import function'. - // We need to add the submodule in sys.modules manually so that - // Python can find it. - let child_module = PyModule::new_bound(py, "pyiceberg_core.transform")?; - submodule(py, &child_module)?; - m.add("transform", child_module.clone())?; - py.import_bound("sys")?.getattr("modules")?.set_item("pyiceberg_core.transform", child_module)?; + m.add_class::()?; Ok(()) } diff --git a/bindings/python/src/transform.rs b/bindings/python/src/transform.rs index 87115fbce..8f4585b2a 100644 --- a/bindings/python/src/transform.rs +++ b/bindings/python/src/transform.rs @@ -28,14 +28,60 @@ fn to_py_err(err: iceberg::Error) -> PyErr { PyValueError::new_err(err.to_string()) } -#[pyfunction] -pub fn bucket_transform(array: PyObject, num_buckets: u32, py: Python) -> PyResult { +#[pyclass] +pub struct ArrowArrayTransform { +} + +fn apply(array: PyObject, transform: Transform, py: Python) -> PyResult { // import let array = ArrayData::from_pyarrow_bound(array.bind(py))?; let array = make_array(array); - let bucket = create_transform_function(&Transform::Bucket(num_buckets)).map_err(to_py_err)?; - let array = bucket.transform(array).map_err(to_py_err)?; + let transform_function = create_transform_function(&transform).map_err(to_py_err)?; + let array = transform_function.transform(array).map_err(to_py_err)?; // export let array = array.into_data(); array.to_pyarrow(py) } + +#[pymethods] +impl ArrowArrayTransform { + #[staticmethod] + pub fn identity(array: PyObject, py: Python) -> PyResult { + apply(array, Transform::Identity, py) + } + + #[staticmethod] + pub fn void(array: PyObject, py: Python) -> PyResult { + apply(array, Transform::Void, py) + } + + #[staticmethod] + pub fn year(array: PyObject, py: Python) -> PyResult { + apply(array, Transform::Year, py) + } + + #[staticmethod] + pub fn month(array: PyObject, py: Python) -> PyResult { + apply(array, Transform::Month, py) + } + + #[staticmethod] + pub fn day(array: PyObject, py: Python) -> PyResult { + apply(array, Transform::Day, py) + } + + #[staticmethod] + pub fn hour(array: PyObject, py: Python) -> PyResult { + apply(array, Transform::Hour, py) + } + + #[staticmethod] + pub fn bucket(array: PyObject, num_buckets: u32, py: Python) -> PyResult { + apply(array, Transform::Bucket(num_buckets), py) + } + + #[staticmethod] + pub fn truncate(array: PyObject, width: u32, py: Python) -> PyResult { + apply(array, Transform::Truncate(width), py) + } +} diff --git a/bindings/python/tests/test_transform.py b/bindings/python/tests/test_transform.py index 8a160ce12..6b24476f2 100644 --- a/bindings/python/tests/test_transform.py +++ b/bindings/python/tests/test_transform.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -from pyiceberg_core.transform import bucket_transform +from pyiceberg_core import ArrowArrayTransform import pytest import pyarrow as pa @@ -23,7 +23,7 @@ def test_bucket_pyarrow_array(): arr = pa.array([1, 2]) - result = bucket_transform(arr, 10) + result = ArrowArrayTransform.bucket(arr, 10) expected = pa.array([6, 2], type=pa.int32()) assert result == expected @@ -34,14 +34,14 @@ def test_bucket_pyarrow_array_list_type_fails(): ValueError, match=r"FeatureUnsupported => Unsupported data type for bucket transform", ): - bucket_transform(arr, 10) + ArrowArrayTransform.bucket(arr, 10) def test_bucket_chunked_array(): chunked = pa.chunked_array([pa.array([1, 2]), pa.array([3, 4])]) result_chunks = [] for arr in chunked.iterchunks(): - result_chunks.append(bucket_transform(arr, 10)) + result_chunks.append(ArrowArrayTransform.bucket(arr, 10)) expected = pa.chunked_array( [pa.array([6, 2], type=pa.int32()), pa.array([5, 0], type=pa.int32())] From 743f16128f63de4c69a7e742488ec14cbe44738b Mon Sep 17 00:00:00 2001 From: sungwy Date: Wed, 21 Aug 2024 16:38:27 -0400 Subject: [PATCH 17/17] more tests --- bindings/python/tests/test_transform.py | 50 +++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/bindings/python/tests/test_transform.py b/bindings/python/tests/test_transform.py index 6b24476f2..1fa2d577a 100644 --- a/bindings/python/tests/test_transform.py +++ b/bindings/python/tests/test_transform.py @@ -15,20 +15,27 @@ # specific language governing permissions and limitations # under the License. -from pyiceberg_core import ArrowArrayTransform +from datetime import date, datetime -import pytest import pyarrow as pa +import pytest +from pyiceberg_core import ArrowArrayTransform -def test_bucket_pyarrow_array(): +def test_identity_transform(): + arr = pa.array([1, 2]) + result = ArrowArrayTransform.identity(arr) + assert result == arr + + +def test_bucket_transform(): arr = pa.array([1, 2]) result = ArrowArrayTransform.bucket(arr, 10) expected = pa.array([6, 2], type=pa.int32()) assert result == expected -def test_bucket_pyarrow_array_list_type_fails(): +def test_bucket_transform_fails_for_list_type_input(): arr = pa.array([[1, 2], [3, 4]]) with pytest.raises( ValueError, @@ -47,3 +54,38 @@ def test_bucket_chunked_array(): [pa.array([6, 2], type=pa.int32()), pa.array([5, 0], type=pa.int32())] ) assert pa.chunked_array(result_chunks).equals(expected) + + +def test_year_transform(): + arr = pa.array([date(1970, 1, 1), date(2000, 1, 1)]) + result = ArrowArrayTransform.year(arr) + expected = pa.array([0, 30], type=pa.int32()) + assert result == expected + + +def test_month_transform(): + arr = pa.array([date(1970, 1, 1), date(2000, 4, 1)]) + result = ArrowArrayTransform.month(arr) + expected = pa.array([0, 30 * 12 + 3], type=pa.int32()) + assert result == expected + + +def test_day_transform(): + arr = pa.array([date(1970, 1, 1), date(2000, 4, 1)]) + result = ArrowArrayTransform.day(arr) + expected = pa.array([0, 11048], type=pa.int32()) + assert result == expected + + +def test_hour_transform(): + arr = pa.array([datetime(1970, 1, 1, 19, 1, 23), datetime(2000, 3, 1, 12, 1, 23)]) + result = ArrowArrayTransform.hour(arr) + expected = pa.array([19, 264420], type=pa.int32()) + assert result == expected + + +def test_truncate_transform(): + arr = pa.array(["this is a long string", "hi my name is sung"]) + result = ArrowArrayTransform.truncate(arr, 5) + expected = pa.array(["this ", "hi my"]) + assert result == expected