Mise à jour de Monitor.py et autres scripts
This commit is contained in:
@@ -0,0 +1,24 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import pytest
|
||||
|
||||
# Every test in this module carries the ``parquet`` marker.
# Deselect them with: pytest ... -m 'not parquet'
pytestmark = [pytest.mark.parquet]
|
||||
@@ -0,0 +1,171 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import io
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
except ImportError:
|
||||
np = None
|
||||
|
||||
import pyarrow as pa
|
||||
from pyarrow.tests import util
|
||||
|
||||
|
||||
def _write_table(table, path, **kwargs):
    """Write ``table`` to ``path`` as Parquet and return the Arrow table.

    When ``_pandas_api.is_data_frame`` reports that ``table`` is a pandas
    DataFrame, it is first converted with ``pa.Table.from_pandas``. Extra
    keyword arguments are passed straight through to ``pq.write_table``.
    """
    # Deferred imports: a missing parquet module then shows up as an
    # ImportError at the call site.
    import pyarrow.parquet as pq
    from pyarrow.pandas_compat import _pandas_api

    arrow_table = table
    if _pandas_api.is_data_frame(arrow_table):
        arrow_table = pa.Table.from_pandas(arrow_table)

    pq.write_table(arrow_table, path, **kwargs)
    return arrow_table
|
||||
|
||||
|
||||
def _read_table(*args, **kwargs):
    """Read a table with ``pq.read_table`` and run a full validation on it.

    All positional and keyword arguments are forwarded unchanged.
    """
    import pyarrow.parquet as pq

    loaded = pq.read_table(*args, **kwargs)
    # Raises if the table that was read back is not internally consistent.
    loaded.validate(full=True)
    return loaded
|
||||
|
||||
|
||||
def _roundtrip_table(table, read_table_kwargs=None, write_table_kwargs=None):
    """Write ``table`` to an in-memory buffer and read it back.

    ``write_table_kwargs`` are forwarded to ``_write_table`` and
    ``read_table_kwargs`` to ``_read_table``; a falsy value means "no extra
    options". Returns the table read back from the buffer.
    """
    if not read_table_kwargs:
        read_table_kwargs = {}
    if not write_table_kwargs:
        write_table_kwargs = {}

    sink = pa.BufferOutputStream()
    _write_table(table, sink, **write_table_kwargs)
    source = pa.BufferReader(sink.getvalue())
    return _read_table(source, **read_table_kwargs)
|
||||
|
||||
|
||||
def _check_roundtrip(table, expected=None, read_table_kwargs=None,
                     **write_table_kwargs):
    """Assert that ``table`` survives two consecutive Parquet roundtrips.

    After each roundtrip both the schema and the contents must equal
    ``expected`` (which defaults to ``table`` itself). Remaining keyword
    arguments are used as write options.
    """
    reference = table if expected is None else expected
    read_opts = read_table_kwargs or {}

    # Intentionally checked twice: the second pass re-writes the table that
    # the first pass read back.
    current = table
    for _ in range(2):
        current = _roundtrip_table(current, read_table_kwargs=read_opts,
                                   write_table_kwargs=write_table_kwargs)
        assert current.schema == reference.schema
        assert current.equals(reference)
|
||||
|
||||
|
||||
def _roundtrip_pandas_dataframe(df, write_kwargs):
    """Roundtrip a pandas DataFrame through Parquet and return a DataFrame.

    ``write_kwargs`` are passed on as the write options of
    ``_roundtrip_table``.
    """
    arrow_table = pa.Table.from_pandas(df)
    roundtripped = _roundtrip_table(arrow_table,
                                    write_table_kwargs=write_kwargs)
    return roundtripped.to_pandas()
|
||||
|
||||
|
||||
def _random_integers(size, dtype):
|
||||
# We do not generate integers outside the int64 range
|
||||
platform_int_info = np.iinfo('int_')
|
||||
iinfo = np.iinfo(dtype)
|
||||
return np.random.randint(max(iinfo.min, platform_int_info.min),
|
||||
min(iinfo.max, platform_int_info.max),
|
||||
size=size, dtype=dtype)
|
||||
|
||||
|
||||
def _range_integers(size, dtype):
    """Return an Arrow array holding ``0 .. size - 1`` with NumPy ``dtype``."""
    values = np.arange(size, dtype=dtype)
    return pa.array(values)
|
||||
|
||||
|
||||
def _test_dataframe(size=10000, seed=0):
    """Build a pandas DataFrame of ``size`` rows with mixed column types.

    NumPy's global random state is seeded with ``seed`` first. The columns
    are generated in a fixed order so that the random draws stay the same
    for a given seed.
    """
    import pandas as pd

    np.random.seed(seed)

    # Integer columns come first, one per dtype, named after the dtype.
    integer_names = ['uint8', 'uint16', 'uint32', 'uint64',
                     'int8', 'int16', 'int32', 'int64']
    columns = {
        name: _random_integers(size, getattr(np, name))
        for name in integer_names
    }
    columns['float32'] = np.random.randn(size).astype(np.float32)
    columns['float64'] = np.arange(size, dtype=np.float64)
    columns['bool'] = np.random.randn(size) > 0
    columns['strings'] = [util.rands(10) for _ in range(size)]
    columns['all_none'] = [None] * size
    columns['all_none_category'] = [None] * size

    df = pd.DataFrame(columns)

    # TODO(PARQUET-1015)
    # df['all_none_category'] = df['all_none_category'].astype('category')
    return df
|
||||
|
||||
|
||||
def make_sample_file(table_or_df):
    """Return a ``pq.ParquetFile`` over an in-memory copy of the input.

    ``table_or_df`` may be a ``pa.Table``; anything else is converted with
    ``pa.Table.from_pandas``. The data is written with SNAPPY compression
    and format version '2.6' into a ``BytesIO`` buffer.
    """
    import pyarrow.parquet as pq

    is_arrow_table = isinstance(table_or_df, pa.Table)
    source_table = (table_or_df if is_arrow_table
                    else pa.Table.from_pandas(table_or_df))

    stream = io.BytesIO()
    _write_table(source_table, stream, compression='SNAPPY', version='2.6')

    # Rewind so the ParquetFile reads from the start of the buffer.
    stream.seek(0)
    return pq.ParquetFile(stream)
|
||||
|
||||
|
||||
def alltypes_sample(size=10000, seed=0, categorical=False):
    """Build a pandas DataFrame with one column per commonly used type.

    NumPy's global random state is seeded with ``seed``; only the 'bool'
    column is random. When ``categorical`` is true, an extra
    'str_category' column (the 'str' column cast to ``category``) is added.
    """
    import pandas as pd

    np.random.seed(seed)

    # NOTE(review): 'int8' is generated with an int16 dtype - confirm whether
    # that is intentional before changing it.
    numeric_dtypes = [
        ('uint8', np.uint8),
        ('uint16', np.uint16),
        ('uint32', np.uint32),
        ('uint64', np.uint64),
        ('int8', np.int16),
        ('int16', np.int16),
        ('int32', np.int32),
        ('int64', np.int64),
        ('float16', np.float16),
        ('float32', np.float32),
        ('float64', np.float64),
    ]
    columns = {}
    for column_name, dtype in numeric_dtypes:
        columns[column_name] = np.arange(size, dtype=dtype)

    columns['bool'] = np.random.randn(size) > 0

    datetime_specs = [
        ('datetime_ms', "2016-01-01T00:00:00.001", 'datetime64[ms]'),
        ('datetime_us', "2016-01-01T00:00:00.000001", 'datetime64[us]'),
        ('datetime_ns', "2016-01-01T00:00:00.000000001", 'datetime64[ns]'),
    ]
    for column_name, start, dtype in datetime_specs:
        columns[column_name] = np.arange(start, size, dtype=dtype)

    columns['timedelta'] = np.arange(0, size, dtype="timedelta64[s]")
    columns['str'] = pd.Series([str(x) for x in range(size)])
    columns['empty_str'] = [''] * size
    # First and last entries are null, the rest are stringified integers.
    columns['str_with_nulls'] = (
        [None] + [str(x) for x in range(size - 2)] + [None])
    columns['null'] = [None] * size
    # Two leading nulls followed by lists of 0 to 3 nulls.
    columns['null_list'] = (
        [None] * 2 + [[None] * (x % 4) for x in range(size - 2)])

    if categorical:
        columns['str_category'] = columns['str'].astype('category')
    return pd.DataFrame(columns)
|
||||
@@ -0,0 +1,105 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import os
|
||||
import pathlib
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
|
||||
from pyarrow.util import guid
|
||||
|
||||
|
||||
@pytest.fixture(scope='module')
def datadir(base_datadir):
    """Return the 'parquet' subdirectory of ``base_datadir``."""
    parquet_dir = base_datadir / 'parquet'
    return parquet_dir
|
||||
|
||||
|
||||
@pytest.fixture(scope='module')
def parquet_test_datadir():
    """Return the directory named by the PARQUET_TEST_DATA variable.

    The test is skipped on the 'emscripten' platform. A RuntimeError is
    raised when the environment variable is unset or empty.
    """
    if sys.platform == 'emscripten':
        pytest.skip("needs PARQUET_TEST_DATA files access")

    configured = os.environ.get('PARQUET_TEST_DATA')
    if configured:
        return pathlib.Path(configured)

    raise RuntimeError('Please point the PARQUET_TEST_DATA environment '
                       'variable to the test data directory')
|
||||
|
||||
|
||||
@pytest.fixture
def s3_bucket(s3_server):
    """Create the 'test-s3fs' bucket on the test S3 server; return its name.

    The test is skipped when ``boto3`` or ``botocore`` cannot be imported.
    Bucket creation is best effort: any error it raises is ignored.
    """
    boto3 = pytest.importorskip('boto3')
    botocore = pytest.importorskip('botocore')
    s3_bucket_name = 'test-s3fs'

    host, port, access_key, secret_key = s3_server['connection']
    client_options = dict(
        endpoint_url=f'http://{host}:{port}',
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
        config=botocore.client.Config(signature_version='s3v4'),
        region_name='us-east-1',
    )
    client = boto3.client('s3', **client_options)

    try:
        client.create_bucket(Bucket=s3_bucket_name)
    except Exception:
        # we get BucketAlreadyOwnedByYou error with fsspec handler
        pass
    finally:
        client.close()

    return s3_bucket_name
|
||||
|
||||
|
||||
@pytest.fixture
def s3_example_s3fs(s3_server, s3_bucket):
    """Yield ``(fs, test_path)``: an s3fs filesystem and a fresh directory.

    The directory is created under ``s3_bucket`` with a unique name and is
    removed recursively on teardown; a directory that is already gone is
    tolerated. The test is skipped when ``s3fs`` cannot be imported.
    """
    s3fs = pytest.importorskip('s3fs')

    host, port, access_key, secret_key = s3_server['connection']
    endpoint = f'http://{host}:{port}'
    filesystem = s3fs.S3FileSystem(
        key=access_key,
        secret=secret_key,
        client_kwargs={'endpoint_url': endpoint},
    )

    test_path = f'{s3_bucket}/{guid()}'
    filesystem.mkdir(test_path)

    yield filesystem, test_path

    # Teardown
    try:
        filesystem.rm(test_path, recursive=True)
    except FileNotFoundError:
        pass
|
||||
|
||||
|
||||
@pytest.fixture
def s3_example_fs(s3_server):
    """Yield ``(fs, uri, path)`` for a file in 'mybucket' on the test server.

    The filesystem and path are obtained from ``FileSystem.from_uri`` and
    the 'mybucket' directory is created before yielding.
    """
    from pyarrow.fs import FileSystem

    host, port, access_key, secret_key = s3_server['connection']
    uri = (
        f"s3://{access_key}:{secret_key}@mybucket/data.parquet?scheme=http"
        f"&endpoint_override={host}:{port}&allow_bucket_creation=True"
    )
    filesystem, path = FileSystem.from_uri(uri)
    filesystem.create_dir("mybucket")

    yield filesystem, uri, path
|
||||
@@ -0,0 +1,61 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
import base64
|
||||
|
||||
import pyarrow.parquet.encryption as pe
|
||||
|
||||
|
||||
class InMemoryKmsClient(pe.KmsClient):
    """Mock implementation of KmsClient, built for testing only.

    "Wrapping" a key just prepends the master key and base64-encodes the
    result, so this offers no security at all.
    """

    def __init__(self, config):
        """Create an InMemoryKmsClient instance.

        The master keys are taken from ``config.custom_kms_conf``, which is
        indexed by master key identifier; its values must be strings, since
        they are UTF-8 encoded when wrapping and unwrapping.
        """
        pe.KmsClient.__init__(self)
        self.master_keys_map = config.custom_kms_conf

    def wrap_key(self, key_bytes, master_key_identifier):
        """Return the base64 of the master key concatenated with the key.

        Not a secure cipher. Raises KeyError if ``master_key_identifier``
        is not present in the master keys map.
        """
        master_key_bytes = self.master_keys_map[master_key_identifier].encode(
            'utf-8')
        wrapped_key = b"".join([master_key_bytes, key_bytes])
        return base64.b64encode(wrapped_key)

    def unwrap_key(self, wrapped_key, master_key_identifier):
        """Extract the key from ``wrapped_key`` after checking its prefix.

        Not a secure cipher. The prefix is as long as the expected master
        key, rather than a fixed 16 bytes, so master keys of any length
        roundtrip through ``wrap_key``.

        Raises ValueError if the prefix does not match the expected master
        key, and KeyError if ``master_key_identifier`` is unknown.
        """
        expected_master_key = self.master_keys_map[
            master_key_identifier].encode('utf-8')
        decoded_wrapped_key = base64.b64decode(wrapped_key)
        prefix_len = len(expected_master_key)
        master_key_bytes = decoded_wrapped_key[:prefix_len]
        decrypted_key = decoded_wrapped_key[prefix_len:]
        # Compare as bytes: the prefix of a key wrapped with another master
        # key is not guaranteed to be valid UTF-8.
        if master_key_bytes == expected_master_key:
            return decrypted_key
        raise ValueError("Incorrect master key used",
                         master_key_bytes, decrypted_key)
|
||||
|
||||
|
||||
def verify_file_encrypted(path):
    """Assert that the file at ``path`` starts with the magic bytes PARE.

    That 4-byte magic string marks a Parquet file with an encrypted footer.
    """
    with open(path, "rb") as handle:
        header = handle.read(4)
    # Magic string for parquet with encrypted footer is PARE
    assert header == b'PARE'
|
||||
@@ -0,0 +1,997 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import os
|
||||
from collections import OrderedDict
|
||||
import io
|
||||
import warnings
|
||||
from shutil import copytree
|
||||
from decimal import Decimal
|
||||
|
||||
import pytest
|
||||
|
||||
import pyarrow as pa
|
||||
from pyarrow import fs
|
||||
from pyarrow.tests import util
|
||||
from pyarrow.tests.parquet.common import (_check_roundtrip, _roundtrip_table,
|
||||
_test_dataframe)
|
||||
|
||||
try:
|
||||
import pyarrow.parquet as pq
|
||||
from pyarrow.tests.parquet.common import _read_table, _write_table
|
||||
except ImportError:
|
||||
pq = None
|
||||
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
import pandas.testing as tm
|
||||
|
||||
from pyarrow.tests.pandas_examples import dataframe_with_lists
|
||||
from pyarrow.tests.parquet.common import alltypes_sample
|
||||
except ImportError:
|
||||
pd = tm = None
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
except ImportError:
|
||||
np = None
|
||||
|
||||
# Apply the ``parquet`` marker to every test in this module.
# Deselect them with: pytest ... -m 'not parquet'
pytestmark = pytest.mark.parquet
|
||||
|
||||
|
||||
def test_parquet_invalid_version(tempdir):
    """Unsupported format / data page versions must raise ValueError."""
    table = pa.table({'a': [1, 2, 3]})
    target = tempdir / 'test_version.parquet'

    format_error = "Unsupported Parquet format version"
    with pytest.raises(ValueError, match=format_error):
        _write_table(table, target, version="2.2")

    data_page_error = "Unsupported Parquet data page version"
    with pytest.raises(ValueError, match=data_page_error):
        _write_table(table, target, data_page_version="2.2")
|
||||
|
||||
|
||||
def test_set_data_page_size():
    """Roundtrip a single-column table with two data page sizes."""
    values = pa.array([1, 2, 3] * 100000)
    table = pa.Table.from_arrays([values], names=['f0'])

    # 128K, 512K
    for target_page_size in (2 << 16, 2 << 18):
        _check_roundtrip(table, data_page_size=target_page_size)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_set_write_batch_size():
    """Roundtrip with a tiny data page size and a write batch size of 1."""
    table = pa.Table.from_pandas(_test_dataframe(100), preserve_index=False)

    write_options = dict(data_page_size=10, write_batch_size=1,
                         version='2.4')
    _check_roundtrip(table, **write_options)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_set_dictionary_pagesize_limit():
    """A numeric dictionary page size limit works; a string raises."""
    table = pa.Table.from_pandas(_test_dataframe(100), preserve_index=False)
    shared_options = {'data_page_size': 10, 'version': '2.4'}

    _check_roundtrip(table, dictionary_pagesize_limit=1, **shared_options)

    with pytest.raises(TypeError):
        _check_roundtrip(table, dictionary_pagesize_limit="a",
                         **shared_options)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_chunked_table_write():
    """Roundtrip multi-chunk tables for each page version / dictionary mode."""
    # ARROW-232
    def three_chunk_table(frame):
        # The same record batch is repeated three times.
        batch = pa.RecordBatch.from_pandas(frame)
        return pa.Table.from_batches([batch] * 3)

    tables = [three_chunk_table(alltypes_sample(size=10))]
    lists_df, _ = dataframe_with_lists()
    tables.append(three_chunk_table(lists_df))

    combinations = [(page_version, dictionary)
                    for page_version in ['1.0', '2.0']
                    for dictionary in [True, False]]
    for data_page_version, use_dictionary in combinations:
        for table in tables:
            _check_roundtrip(
                table, version='2.6',
                data_page_version=data_page_version,
                use_dictionary=use_dictionary)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_memory_map(tempdir):
    """Reading with ``memory_map=True`` gives back the written table."""
    table = pa.Table.from_pandas(alltypes_sample(size=10))
    _check_roundtrip(table, version='2.6',
                     read_table_kwargs={'memory_map': True})

    filename = str(tempdir / 'tmp_file')
    with open(filename, 'wb') as sink:
        _write_table(table, sink, version='2.6')

    reloaded = pq.read_pandas(filename, memory_map=True)
    assert reloaded.equals(table)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_enable_buffered_stream(tempdir):
    """Reading with an explicit ``buffer_size`` gives the written table."""
    table = pa.Table.from_pandas(alltypes_sample(size=10))
    _check_roundtrip(table, version='2.6',
                     read_table_kwargs={'buffer_size': 1025})

    filename = str(tempdir / 'tmp_file')
    with open(filename, 'wb') as sink:
        _write_table(table, sink, version='2.6')

    reloaded = pq.read_pandas(filename, buffer_size=4096)
    assert reloaded.equals(table)
|
||||
|
||||
|
||||
def test_special_chars_filename(tempdir):
    """A file name containing spaces and '#' can be written and read."""
    table = pa.Table.from_arrays([pa.array([42])], ["ints"])
    path = tempdir / "foo # bar"
    assert not path.exists()

    path_str = str(path)
    _write_table(table, path_str)
    assert path.exists()

    reloaded = _read_table(path_str)
    assert reloaded.equals(table)
|
||||
|
||||
|
||||
def test_invalid_source():
    """Passing None as a Parquet source raises a TypeError naming None."""
    # Test that we provide a helpful error message pointing out
    # that None wasn't expected when trying to open a Parquet file.
    for opener in (pq.read_table, pq.ParquetFile):
        with pytest.raises(TypeError, match="None"):
            opener(None)
|
||||
|
||||
|
||||
def test_read_table_without_dataset(tempdir):
    """Check ``pq.read_table`` when ParquetDataset raises ImportError.

    Dataset-only options must raise ValueError, reading a directory must
    raise OSError, and reading a single file must still work.
    """
    from unittest import mock

    class MockParquetDataset:
        def __init__(self, *args, **kwargs):
            raise ImportError("MockParquetDataset")

    path = tempdir / "test.parquet"
    table = pa.table({"a": [1, 2, 3]})
    _write_table(table, path)

    dataset_only_cases = [
        ("the 'filters' keyword", {'filters': [('integer', '=', 1)]}),
        ("the 'partitioning' keyword", {'partitioning': ['week', 'color']}),
        ("the 'schema' argument", {'schema': table.schema}),
    ]

    patched = mock.patch('pyarrow.parquet.core.ParquetDataset',
                         new=MockParquetDataset)
    with patched:
        for message, options in dataset_only_cases:
            with pytest.raises(ValueError, match=message):
                pq.read_table(path, **options)

        # Error message varies depending on OS
        with pytest.raises(OSError):
            pq.read_table(tempdir)

        result = pq.read_table(path)
        assert result == table
|
||||
|
||||
|
||||
@pytest.mark.slow
def test_file_with_over_int16_max_row_groups():
    """Roundtrip an unencrypted file with more than INT16_MAX row groups."""
    # PARQUET-1857: Parquet encryption support introduced an INT16_MAX upper
    # limit on the number of row groups. That limit only affects files with
    # encrypted row group metadata, because of the int16 row group ordinal
    # used in the Parquet Thrift metadata. Unencrypted files are not
    # affected, so this checks that they work (even if it isn't a good idea).
    row_count = 40000
    table = pa.table([list(range(row_count))], names=['f0'])
    _check_roundtrip(table, row_group_size=1)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_empty_table_roundtrip():
    """Roundtrip a zero-row table that keeps the all-types schema."""
    # Create a non-empty table to infer the types correctly, then slice to 0
    populated = pa.Table.from_pandas(alltypes_sample(size=10))
    empty_columns = [column.chunk(0)[:0]
                     for column in populated.itercolumns()]
    table = pa.Table.from_arrays(empty_columns,
                                 names=populated.schema.names)

    schema = table.schema
    assert schema.field('null').type == pa.null()
    assert schema.field('null_list').type == pa.list_(pa.null())
    _check_roundtrip(table, version='2.6')
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_empty_table_no_columns():
    """Roundtrip a table built from an empty DataFrame."""
    empty = pa.Table.from_pandas(pd.DataFrame(), preserve_index=False)
    _check_roundtrip(empty)
|
||||
|
||||
|
||||
def test_write_nested_zero_length_array_chunk_failure():
    """Roundtrip a table whose first chunk of a list column is empty."""
    # Bug report in ARROW-3792
    fields = OrderedDict(
        int32=pa.int32(),
        list_string=pa.list_(pa.string())
    )
    struct_type = pa.struct(fields)
    schema = pa.schema(fields)
    rows_per_batch = [[], [OrderedDict(int32=1, list_string=('G',)), ]]

    # This produces a table with a column like
    # <Column name='list_string' type=ListType(list<item: string>)>
    # [
    #   [],
    #   [
    #     [
    #       "G"
    #     ]
    #   ]
    # ]
    #
    # Each column is a ChunkedArray with 2 elements
    batches = []
    for rows in rows_per_batch:
        flattened = pa.array(rows, type=struct_type).flatten()
        batches.append(pa.RecordBatch.from_arrays(flattened, schema=schema))

    tbl = pa.Table.from_batches(batches, schema)
    _check_roundtrip(tbl)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_multiple_path_types(tempdir):
    """Write and read back through a path-like object and a plain string."""
    # Both targets are built with the ``/`` operator so they live inside
    # ``tempdir``. Concatenating ``str(tempdir)`` with the file name has no
    # path separator and would create a sibling of the fixture directory
    # that is never cleaned up.
    targets = [
        # Test compatibility with PEP 519 path-like objects
        tempdir / 'zzz.parquet',
        # Test compatibility with plain string paths
        str(tempdir / 'zzz_str.parquet'),
    ]
    for path in targets:
        df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)})
        _write_table(df, path)
        table_read = _read_table(path)
        df_read = table_read.to_pandas()
        tm.assert_frame_equal(df, df_read)
|
||||
|
||||
|
||||
def test_fspath(tempdir):
    """Objects implementing ``__fspath__`` are accepted as a read source."""
    # ARROW-12472 support __fspath__ objects without using str()
    parquet_path = tempdir / "test.parquet"
    table = pa.table({"a": [1, 2, 3]})
    _write_table(table, parquet_path)

    fs_protocol_obj = util.FSProtocolClass(parquet_path)
    assert _read_table(fs_protocol_obj).equals(table)

    # combined with non-local filesystem raises
    with pytest.raises(TypeError):
        _read_table(fs_protocol_obj, filesystem=fs.FileSystem())
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filesystem", [
    None, fs.LocalFileSystem()
])
@pytest.mark.parametrize("name", ("data.parquet", "例.parquet"))
def test_relative_paths(tempdir, filesystem, name):
    """Read and write with paths relative to the current directory."""
    table = pa.table({"a": [1, 2, 3]})
    absolute_path = tempdir / name

    # reading: write with an absolute path, read back with a relative one
    pq.write_table(table, str(absolute_path))
    with util.change_cwd(tempdir):
        read_relative = pq.read_table(name, filesystem=filesystem)
    assert read_relative.equals(table)

    # Remove the file so the next step really has to create it.
    absolute_path.unlink()
    assert not absolute_path.exists()

    # writing: write with a relative path, read back with the absolute one
    with util.change_cwd(tempdir):
        pq.write_table(table, name, filesystem=filesystem)
    read_absolute = pq.read_table(absolute_path)
    assert read_absolute.equals(table)
|
||||
|
||||
|
||||
def test_read_non_existing_file():
    """Reading a missing file raises FileNotFoundError."""
    # ensure we have a proper error message
    missing = 'i-am-not-existing.parquet'
    with pytest.raises(FileNotFoundError):
        pq.read_table(missing)
|
||||
|
||||
|
||||
def test_file_error_python_exception():
    """An exception raised by a Python file object reaches the caller."""
    class BogusFile(io.BytesIO):
        # Both read and seek fail with the same Python exception.
        def _fail(self, *args):
            raise ZeroDivisionError("zorglub")

        read = _fail
        seek = _fail

    # ensure the Python exception is restored
    broken_source = BogusFile(b"")
    with pytest.raises(ZeroDivisionError, match="zorglub"):
        pq.read_table(broken_source)
|
||||
|
||||
|
||||
def test_parquet_read_from_buffer(tempdir):
    """Read from a plain Python file object and from ``pa.PythonFile``."""
    # reading from a buffer from python's open()
    table = pa.table({"a": [1, 2, 3]})
    data_path = str(tempdir / "data.parquet")
    pq.write_table(table, data_path)

    with open(data_path, "rb") as raw:
        from_raw = pq.read_table(raw)
    assert from_raw.equals(table)

    with open(data_path, "rb") as raw:
        from_wrapped = pq.read_table(pa.PythonFile(raw))
    assert from_wrapped.equals(table)
|
||||
|
||||
|
||||
def test_byte_stream_split():
    """Smoke test for the ``use_byte_stream_split`` write option."""
    # This is only a smoke test.
    floats = pa.array(list(map(float, range(100))))
    ints = pa.array(list(map(int, range(100))))
    bools = pa.array([True, False] * 50)
    table = pa.Table.from_arrays([floats, floats], names=['a', 'b'])

    gzip_cases = [
        # byte_stream_split for both columns.
        (False, True),
        # byte_stream_split for column 'b' and dictionary for column 'a'.
        (['a'], ['b']),
        # A collision for both columns.
        (['a', 'b'], ['a', 'b']),
    ]
    for dictionary_columns, split_columns in gzip_cases:
        _check_roundtrip(table, expected=table, compression="gzip",
                         use_dictionary=dictionary_columns,
                         use_byte_stream_split=split_columns)

    # Check with mixed column types.
    mixed_table = pa.Table.from_arrays([floats, floats, ints, ints],
                                       names=['a', 'b', 'c', 'd'])
    _check_roundtrip(mixed_table, expected=mixed_table,
                     use_dictionary=['b', 'd'],
                     use_byte_stream_split=['a', 'c'])

    # Try to use the wrong data type with the byte_stream_split encoding.
    # This should throw an exception.
    bool_table = pa.Table.from_arrays([bools], names=['tmp'])
    with pytest.raises(IOError, match='BYTE_STREAM_SPLIT only supports'):
        _check_roundtrip(bool_table, expected=bool_table,
                         use_byte_stream_split=True,
                         use_dictionary=False)
|
||||
|
||||
|
||||
def test_store_decimal_as_integer(tempdir):
    """Check ``store_decimal_as_integer`` roundtrips and physical types."""
    def decimal_array(precision, scale):
        # 0..99 as Decimal values with the requested decimal128 type.
        return pa.array(list(map(Decimal, range(100))),
                        type=pa.decimal128(precision, scale))

    arr_decimal_1_9 = decimal_array(5, 2)
    arr_decimal_10_18 = decimal_array(16, 9)
    arr_decimal_gt18 = decimal_array(22, 2)
    arr_bool = pa.array([True, False] * 50)
    table = pa.Table.from_arrays(
        [arr_decimal_1_9, arr_decimal_10_18, arr_decimal_gt18],
        names=['a', 'b', 'c'])

    gzip_options = dict(compression="gzip",
                        use_dictionary=False,
                        store_decimal_as_integer=True)

    # Check with store_decimal_as_integer.
    _check_roundtrip(table, expected=table, **gzip_options)

    # Check physical type in parquet schema
    pqtestfile_path = os.path.join(tempdir, 'test.parquet')
    pq.write_table(table, pqtestfile_path, **gzip_options)

    parquet_schema = pq.ParquetFile(pqtestfile_path).schema
    assert parquet_schema.column(0).physical_type == 'INT32'
    assert parquet_schema.column(1).physical_type == 'INT64'

    # Check with store_decimal_as_integer and delta-int encoding.
    # DELTA_BINARY_PACKED requires parquet physical type to be INT64 or INT32
    delta_encoding = {
        'a': 'DELTA_BINARY_PACKED',
        'b': 'DELTA_BINARY_PACKED'
    }
    _check_roundtrip(table, expected=table,
                     column_encoding=delta_encoding, **gzip_options)

    # Check with mixed column types.
    mixed_table = pa.Table.from_arrays(
        [arr_decimal_1_9, arr_decimal_10_18, arr_decimal_gt18, arr_bool],
        names=['a', 'b', 'c', 'd'])
    _check_roundtrip(mixed_table,
                     expected=mixed_table,
                     use_dictionary=False,
                     store_decimal_as_integer=True)
|
||||
|
||||
|
||||
def test_column_encoding():
    """Check the ``column_encoding`` write option.

    First verifies a series of encodings that must round-trip unchanged,
    then verifies that invalid encodings or option combinations raise.
    """
    float_col = pa.array(list(map(float, range(100))))
    int_col = pa.array(list(map(int, range(100))))
    binary_col = pa.array([str(x) for x in range(100)], type=pa.binary())
    fixed_binary_col = pa.array(
        [str(x).zfill(10) for x in range(100)], type=pa.binary(10))
    bool_col = pa.array([False, True, False, False] * 25)
    mixed_table = pa.Table.from_arrays(
        [float_col, int_col, binary_col, fixed_binary_col, bool_col],
        names=['a', 'b', 'c', 'd', 'e'])

    # Encodings expected to round-trip, checked in this order.
    accepted_encodings = [
        # "BYTE_STREAM_SPLIT" for columns 'a', 'b', 'd'
        # and "PLAIN" column_encoding for column 'c'.
        {'a': "BYTE_STREAM_SPLIT",
         'b': "BYTE_STREAM_SPLIT",
         'c': "PLAIN",
         'd': "BYTE_STREAM_SPLIT"},
        # "PLAIN" for all columns.
        "PLAIN",
        # "DELTA_BINARY_PACKED" for integer columns.
        {'a': "PLAIN",
         'b': "DELTA_BINARY_PACKED",
         'c': "PLAIN"},
        # "DELTA_LENGTH_BYTE_ARRAY" for byte columns.
        {'a': "PLAIN",
         'b': "DELTA_BINARY_PACKED",
         'c': "DELTA_LENGTH_BYTE_ARRAY"},
        # "DELTA_BYTE_ARRAY" for byte columns.
        {'a': "PLAIN",
         'b': "DELTA_BINARY_PACKED",
         'c': "DELTA_BYTE_ARRAY",
         'd': "DELTA_BYTE_ARRAY"},
        # "RLE" for boolean columns.
        {'e': "RLE"},
    ]
    for encoding in accepted_encodings:
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=False,
                         column_encoding=encoding)

    # "BYTE_STREAM_SPLIT" on boolean column 'e' must raise because the
    # encoding does not support BOOLEAN.
    with pytest.raises(OSError, match="BYTE_STREAM_SPLIT only supports"):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=False,
                         column_encoding={'a': "PLAIN",
                                          'c': "PLAIN",
                                          'e': "BYTE_STREAM_SPLIT"})

    # "DELTA_BINARY_PACKED" on the float column must raise because only
    # integers are supported.
    with pytest.raises(OSError,
                       match="DELTA_BINARY_PACKED encoder only supports"):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=False,
                         column_encoding={'a': "DELTA_BINARY_PACKED",
                                          'b': "PLAIN",
                                          'c': "PLAIN"})

    # "RLE_DICTIONARY" must raise: dictionary encoding is already used by
    # default and cannot be specified as the "fallback" encoding.
    with pytest.raises(ValueError,
                       match="'RLE_DICTIONARY' is already used by default"):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=False,
                         column_encoding="RLE_DICTIONARY")

    # An unknown encoding name must raise.
    with pytest.raises(ValueError,
                       match="Unsupported column encoding: 'MADE_UP_ENCODING'"):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=False,
                         column_encoding={'a': "MADE_UP_ENCODING"})

    # column_encoding together with use_dictionary on a column must raise.
    with pytest.raises(ValueError):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=['b'],
                         column_encoding={'b': "PLAIN"})

    # column_encoding with use_dictionary left at its default (True)
    # must raise.
    with pytest.raises(ValueError):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         column_encoding={'b': "PLAIN"})

    # column_encoding and use_byte_stream_split on the same column
    # must raise.
    with pytest.raises(ValueError):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=False,
                         use_byte_stream_split=['a'],
                         column_encoding={'a': "RLE",
                                          'b': "BYTE_STREAM_SPLIT",
                                          'c': "PLAIN"})

    # column_encoding together with use_byte_stream_split=True must raise.
    with pytest.raises(ValueError):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=False,
                         use_byte_stream_split=True,
                         column_encoding={'a': "RLE",
                                          'b': "BYTE_STREAM_SPLIT",
                                          'c': "PLAIN"})

    # column_encoding=True is not a valid type and must raise.
    with pytest.raises(TypeError):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=False,
                         column_encoding=True)
|
||||
|
||||
|
||||
def test_compression_level():
    """Round-trip with various compression levels; reject invalid ones."""
    values = pa.array(list(map(int, range(1000))))
    table = pa.Table.from_arrays([values, values], names=['a', 'b'])

    # One compression level.
    _check_roundtrip(table, expected=table, compression="gzip",
                     compression_level=1)

    # A second level, so that compression_level=1 cannot merely coincide
    # with the default one in Arrow.
    _check_roundtrip(table, expected=table, compression="gzip",
                     compression_level=5)

    # A compression codec can be given per column.
    _check_roundtrip(table, expected=table,
                     compression={'a': "gzip", 'b': "snappy"})

    # A compression level can be given per column.
    _check_roundtrip(table, expected=table, compression="gzip",
                     compression_level={'a': 2, 'b': 3})

    # Both LZ4 compressors (level < 3 -> fast, level >= 3 -> HC).
    _check_roundtrip(table, expected=table, compression="lz4",
                     compression_level=1)
    _check_roundtrip(table, expected=table, compression="lz4",
                     compression_level=9)

    # Specifying a compression level for a codec which does not allow
    # one, or a level outside the codec's range, results in an error.
    # Uncompressed, snappy and lzo do not support specifying a compression
    # level. GZIP (zlib) allows one, but as of up to version 1.2.11 the
    # valid range is [-1, 9].
    rejected = [("snappy", 4), ("gzip", -1337),
                ("None", 444), ("lzo", 14)]
    sink = io.BytesIO()
    for codec, level in rejected:
        with pytest.raises((ValueError, OSError)):
            _write_table(table, sink, compression=codec,
                         compression_level=level)
|
||||
|
||||
|
||||
def test_sanitized_spark_field_names():
    """Writing with flavor='spark' rewrites special characters in a name.

    Each character following 'prohib' in the original field name is
    expected to come back as an underscore.
    """
    raw_name = 'prohib; ,\t{}'
    column = pa.array([0, 1, 2, 3, 4])
    table = pa.Table.from_arrays([column], [raw_name])

    roundtripped = _roundtrip_table(
        table, write_table_kwargs={'flavor': 'spark'})

    assert roundtripped.schema[0].name == 'prohib______'
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_multithreaded_read():
    """Reading with and without threads must yield equal tables."""
    table = pa.Table.from_pandas(alltypes_sample(size=10000))

    sink = io.BytesIO()
    _write_table(table, sink, compression='SNAPPY', version='2.6')

    # Rewind before each read: both reads consume the same buffer.
    sink.seek(0)
    threaded = _read_table(sink, use_threads=True)

    sink.seek(0)
    single_threaded = _read_table(sink, use_threads=False)

    assert threaded.equals(single_threaded)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_min_chunksize():
    """chunk_size=-1 round-trips; chunk_size=0 raises ValueError."""
    frame = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D'])
    table = pa.Table.from_pandas(frame.reset_index())

    sink = io.BytesIO()
    _write_table(table, sink, chunk_size=-1)

    sink.seek(0)
    roundtripped = _read_table(sink)
    assert roundtripped.equals(table)

    with pytest.raises(ValueError):
        _write_table(table, sink, chunk_size=0)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_write_error_deletes_incomplete_file(tempdir):
    """A failed write must not leave a partial file behind (ARROW-1285)."""
    columns = {
        'a': list('abc'),
        'b': list(range(1, 4)),
        'c': np.arange(3, 6).astype('u1'),
        'd': np.arange(4.0, 7.0, dtype='float64'),
        'e': [True, False, True],
        'f': pd.Categorical(list('abc')),
        'g': pd.date_range('20130101', periods=3),
        'h': pd.date_range('20130101', periods=3, tz='US/Eastern'),
        'i': pd.date_range('20130101', periods=3, freq='ns'),
    }
    table = pa.Table.from_pandas(pd.DataFrame(columns))

    target = tempdir / 'tmp_file'
    try:
        # The test relies on writing nanoseconds raising an error, which
        # is true for Parquet 2.4.
        _write_table(table, target, version="2.4")
    except pa.ArrowException:
        pass

    assert not target.exists()
|
||||
|
||||
|
||||
def test_read_non_existent_file(tempdir):
    """Reading a missing file must raise an error that names the path.

    ``tempdir`` is not used by the body; it is kept so the test's
    signature (and fixture setup) stays unchanged.
    """
    path = 'nonexistent-file.parquet'
    # pytest.raises fails the test when no exception is raised, so an
    # unexpected successful read cannot pass unnoticed.
    with pytest.raises(Exception) as exc_info:
        pq.read_table(path)
    assert path in exc_info.value.args[0]
|
||||
|
||||
|
||||
def test_read_table_doesnt_warn(datadir):
    """Reading the v0.7.1 sample file must not emit any warning."""
    with warnings.catch_warnings():
        # Promote every warning to an exception so any warning fails the test.
        warnings.simplefilter(action="error")
        legacy_file = datadir / 'v0.7.1.parquet'
        pq.read_table(legacy_file)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_zlib_compression_bug():
    """Gzip round-trip of a tiny string table (ARROW-3514).

    ARROW-3514: "zlib deflate failed, output buffer too small"
    """
    strings = pa.array(['abc', 'def'])
    table = pa.Table.from_arrays([strings], ['some_col'])

    sink = io.BytesIO()
    pq.write_table(table, sink, compression='gzip')

    sink.seek(0)
    restored = pq.read_table(sink)
    tm.assert_frame_equal(restored.to_pandas(), table.to_pandas())
|
||||
|
||||
|
||||
def test_parquet_file_too_small(tempdir):
    """Files of 0 and 4 bytes must be rejected with a size message."""
    path = str(tempdir / "test.parquet")
    cases = [
        (b'', 'size is 0 bytes'),
        (b'ffff', 'size is 4 bytes'),
    ]
    for payload, message in cases:
        # TODO(dataset) with datasets API it raises OSError instead
        with pytest.raises((pa.ArrowInvalid, OSError), match=message):
            with open(path, 'wb') as f:
                f.write(payload)
            pq.read_table(path)
|
||||
|
||||
|
||||
@pytest.mark.pandas
@pytest.mark.fastparquet
@pytest.mark.filterwarnings("ignore:RangeIndex:FutureWarning")
@pytest.mark.filterwarnings("ignore:tostring:DeprecationWarning:fastparquet")
def test_fastparquet_cross_compatibility(tempdir):
    """Files written by Arrow and fastparquet can be read by the other."""
    fp = pytest.importorskip('fastparquet')

    columns = {
        "a": list("abc"),
        "b": list(range(1, 4)),
        "c": np.arange(4.0, 7.0, dtype="float64"),
        "d": [True, False, True],
        "e": pd.date_range("20130101", periods=3),
        "f": pd.Categorical(["a", "b", "a"]),
        # fastparquet writes list as BYTE_ARRAY JSON, so no roundtrip
        # "g": [[1, 2], None, [1, 2, 3]],
    }
    df = pd.DataFrame(columns)
    table = pa.table(df)

    # Arrow -> fastparquet
    arrow_path = str(tempdir / "cross_compat_arrow.parquet")
    pq.write_table(table, arrow_path, compression=None)

    read_by_fastparquet = fp.ParquetFile(arrow_path).to_pandas()
    tm.assert_frame_equal(df, read_by_fastparquet)

    # Fastparquet -> arrow
    fastparquet_path = str(tempdir / "cross_compat_fastparquet.parquet")
    fp.write(fastparquet_path, df)

    read_by_arrow = pq.read_pandas(fastparquet_path)
    # for fastparquet written file, categoricals comes back as strings
    # (no arrow schema in parquet metadata)
    df['f'] = df['f'].astype(object)
    tm.assert_frame_equal(read_by_arrow.to_pandas(), df)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('array_factory', [
    lambda: pa.array([0, None] * 10),
    lambda: pa.array([0, None] * 10).dictionary_encode(),
    lambda: pa.array(["", None] * 10),
    lambda: pa.array(["", None] * 10).dictionary_encode(),
])
@pytest.mark.parametrize('read_dictionary', [False, True])
def test_buffer_contents(array_factory, read_dictionary):
    """Null slots must read back as zero bytes after a Parquet round-trip.

    Test that null values are deterministically initialized to zero.
    See ARROW-8006 and ARROW-8011.
    """
    source = pa.Table.from_pydict({"col": array_factory()})
    sink = io.BytesIO()
    pq.write_table(source, sink, use_dictionary=True)
    sink.seek(0)

    # The boolean parameter selects whether 'col' is read as a dictionary.
    dictionary_columns = ['col'] if read_dictionary else None
    table = pq.read_table(sink, use_threads=False,
                          read_dictionary=dictionary_columns)

    for column in table.columns:
        [chunk] = column.chunks
        second_buffer = chunk.buffers()[1]
        assert second_buffer.to_pybytes() == second_buffer.size * b"\0"
|
||||
|
||||
|
||||
def test_parquet_compression_roundtrip(tempdir):
    """Round-trip a GZIP-compressed file whose name ends in '.gz'.

    ARROW-10480: ensure even with nonstandard Parquet file naming
    conventions, writing and then reading a file works. In particular,
    ensure that we don't automatically double-compress the stream due to
    auto-detecting the extension in the filename.
    """
    path = tempdir / "arrow-10480.pyarrow.gz"
    table = pa.table([pa.array(range(4))], names=["ints"])

    pq.write_table(table, path, compression="GZIP")

    assert pq.read_table(path).equals(table)
|
||||
|
||||
|
||||
def test_empty_row_groups(tempdir):
    """Empty tables written repeatedly yield empty row groups (ARROW-3020)."""
    empty_table = pa.Table.from_arrays([pa.array([], type='int32')], ['f0'])
    path = tempdir / 'empty_row_groups.parquet'
    num_groups = 3

    with pq.ParquetWriter(path, empty_table.schema) as writer:
        for _ in range(num_groups):
            writer.write_table(empty_table)

    reader = pq.ParquetFile(path)
    assert reader.metadata.num_row_groups == num_groups

    for group_index in range(num_groups):
        assert reader.read_row_group(group_index).equals(empty_table)
|
||||
|
||||
|
||||
def test_reads_over_batch(tempdir):
    """Round-trip a large, mostly-null list column.

    Large list<int64> with mostly nones and one final value. This should
    force batched reads when reading back.
    """
    values = [None] * (1 << 20)
    values.append([1])
    table = pa.Table.from_arrays([values], ['column'])

    path = tempdir / 'arrow-11607.parquet'
    pq.write_table(table, path)

    restored = pq.read_table(path)
    assert table == restored
|
||||
|
||||
|
||||
def test_permutation_of_column_order(tempdir):
    """Read a directory whose files list columns in different orders.

    See ARROW-2366.
    """
    dataset_dir = tempdir / "dataset_column_order_permutation"
    dataset_dir.mkdir(exist_ok=True)

    # First file: columns in order ('a', 'b').
    first = pa.table([[1, 2, 3], [.1, .2, .3]], names=['a', 'b'])
    pq.write_table(first, dataset_dir / "data1.parquet")

    # Second file: same columns, order swapped to ('b', 'a').
    second = pa.table([[.4, .5, .6], [4, 5, 6]], names=['b', 'a'])
    pq.write_table(second, dataset_dir / "data2.parquet")

    combined = pq.read_table(str(dataset_dir))
    expected = pa.table([[1, 2, 3, 4, 5, 6],
                         [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]],
                        names=['a', 'b'])

    assert combined == expected
|
||||
|
||||
|
||||
def test_thrift_size_limits(tempdir):
    """Too-small thrift size limits raise; larger or default ones succeed."""
    path = tempdir / 'largethrift.parquet'

    num_cols = 1000
    column = pa.array(list(range(10)))
    table = pa.table(
        [column] * num_cols,
        names=[f'some_long_column_name_{i}' for i in range(num_cols)])
    pq.write_table(table, path)

    # Limits for which the read is expected to fail.
    too_small = [
        {'thrift_string_size_limit': 50 * num_cols},
        {'thrift_container_size_limit': num_cols},
    ]
    for limits in too_small:
        with pytest.raises(
                OSError,
                match="Couldn't deserialize thrift:.*Exceeded size limit"):
            pq.read_table(path, **limits)

    # Limits (or no explicit limit) for which the read must succeed.
    sufficient = [
        {'thrift_string_size_limit': 100 * num_cols},
        {'thrift_container_size_limit': 2 * num_cols},
        {},
    ]
    for limits in sufficient:
        got = pq.read_table(path, **limits)
        assert got == table
|
||||
|
||||
|
||||
def test_page_checksum_verification_write_table(tempdir):
    """Check page checksum verification on a file from pq.write_table().

    A file is written with page checksums, two of its bytes are swapped
    to emulate corruption, and the corrupted copy is read with and
    without verification through both read_table() and ParquetFile.
    """
    # Write some sample data into a parquet file with page checksum enabled
    table_orig = pa.table({'a': [1, 2, 3, 4]})
    original_path = tempdir / 'correct.parquet'
    pq.write_table(table_orig, original_path, write_page_checksum=True)

    # Read file and verify that the data is correct
    table_check = pq.read_table(original_path, page_checksum_verification=True)
    assert table_orig == table_check

    # Read the original file as binary and swap the 31-th and 36-th bytes.
    # This should be equivalent to storing the following data:
    #    pa.table({'a': [1, 3, 2, 4]})
    expected_corrupt = pa.table({'a': [1, 3, 2, 4]})
    raw = bytearray(original_path.read_bytes())

    # The two bytes must differ, otherwise swapping them corrupts nothing.
    assert raw[31] != raw[36]
    raw[31], raw[36] = raw[36], raw[31]

    # Write the corrupted data to another parquet file
    corrupted_path = tempdir / 'corrupted.parquet'
    corrupted_path.write_bytes(raw)

    # Case 1: read_table() without page checksum verification succeeds
    # but yields content that differs from the original file.
    table_corrupt = pq.read_table(corrupted_path,
                                  page_checksum_verification=False)
    assert table_corrupt != table_orig
    assert table_corrupt == expected_corrupt

    # Case 2: read_table() with page checksum verification raises.
    with pytest.raises(OSError, match="CRC checksum verification"):
        _ = pq.read_table(corrupted_path, page_checksum_verification=True)

    # Case 3: ParquetFile.read() without page checksum verification
    # succeeds but yields corrupted data.
    unverified_file = pq.ParquetFile(corrupted_path,
                                     page_checksum_verification=False)
    table_corrupt2 = unverified_file.read()
    assert table_corrupt2 != table_orig
    assert table_corrupt2 == expected_corrupt

    # Case 4: ParquetFile.read() with page checksum verification raises
    # when the data is accessed.
    verified_file = pq.ParquetFile(corrupted_path,
                                   page_checksum_verification=True)
    with pytest.raises(OSError, match="CRC checksum verification"):
        _ = verified_file.read()
|
||||
|
||||
|
||||
@pytest.mark.dataset
def test_checksum_write_to_dataset(tempdir):
    """Check page checksum verification on a pq.write_to_dataset() output.

    The single file of the written dataset is copied, two of its bytes
    are swapped to emulate corruption, and the corrupted copy is read
    with and without verification.
    """
    table_orig = pa.table({'a': [1, 2, 3, 4]})

    # Write a sample dataset with page checksum enabled
    original_dir_path = tempdir / 'correct_dir'
    pq.write_to_dataset(table_orig,
                        original_dir_path,
                        write_page_checksum=True)

    # The dataset must consist of exactly one file; read it back and
    # verify that the data is correct.
    dataset_files = list(original_dir_path.iterdir())
    assert len(dataset_files) == 1
    original_path = dataset_files[0]
    table_check = pq.read_table(original_path, page_checksum_verification=True)
    assert table_orig == table_check

    # Read the original file as binary and swap the 31-th and 36-th bytes.
    # This should be equivalent to storing the following data:
    #    pa.table({'a': [1, 3, 2, 4]})
    raw = bytearray(original_path.read_bytes())

    # The two bytes must differ, otherwise swapping them corrupts nothing.
    assert raw[31] != raw[36]
    raw[31], raw[36] = raw[36], raw[31]

    # Copy the dataset dir (which should be just one file), then overwrite
    # that one file in the copy with the corrupted bytes.
    corrupted_dir_path = tempdir / 'corrupted_dir'
    copytree(original_dir_path, corrupted_dir_path)
    corrupted_file_path = corrupted_dir_path / original_path.name
    corrupted_file_path.write_bytes(raw)

    # Case 1: read_table() without page checksum verification succeeds
    # but yields content that differs from the original file.
    table_corrupt = pq.read_table(corrupted_file_path,
                                  page_checksum_verification=False)
    assert table_corrupt != table_orig
    assert table_corrupt == pa.table({'a': [1, 3, 2, 4]})

    # Case 2: read_table() with page checksum verification raises.
    with pytest.raises(OSError, match="CRC checksum verification"):
        _ = pq.read_table(corrupted_file_path,
                          page_checksum_verification=True)
|
||||
@@ -0,0 +1,109 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import pytest
|
||||
|
||||
import pyarrow as pa
|
||||
|
||||
try:
|
||||
import pyarrow.parquet as pq
|
||||
from pyarrow.tests.parquet.common import (_read_table,
|
||||
_check_roundtrip)
|
||||
except ImportError:
|
||||
pq = None
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
import pandas.testing as tm
|
||||
|
||||
from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe
|
||||
except ImportError:
|
||||
pd = tm = None
|
||||
|
||||
|
||||
# Marks all of the tests in this module
|
||||
# Ignore these with pytest ... -m 'not parquet'
|
||||
pytestmark = pytest.mark.parquet
|
||||
|
||||
|
||||
# Tests for ARROW-11497

# Rows whose 'items' value is a list of plain integers.
_test_data_simple = [
    {'items': [1, 2]},
    {'items': [0]},
]

# Rows whose 'items' value is a list of name/value dicts.
_test_data_complex = [
    {
        'items': [
            {'name': 'elem1', 'value': '1'},
            {'name': 'elem2', 'value': '2'},
        ],
    },
    {
        'items': [
            {'name': 'elem1', 'value': '0'},
        ],
    },
]

# Decorator that runs a test once per data set defined above.
parametrize_test_data = pytest.mark.parametrize(
    "test_data", [_test_data_simple, _test_data_complex])
|
||||
|
||||
|
||||
@pytest.mark.pandas
@parametrize_test_data
def test_write_compliant_nested_type_enable(tempdir, test_data):
    """With default writer options the list value field is named 'element'."""
    df = pd.DataFrame(data=test_data)

    # The pandas frame must round-trip with the default behaviour.
    _roundtrip_pandas_dataframe(df, write_kwargs={})

    # Write to a parquet file with compliant nested type
    table = pa.Table.from_pandas(df, preserve_index=False)
    path = str(tempdir / 'data.parquet')
    with pq.ParquetWriter(path, table.schema, version='2.6') as writer:
        writer.write_table(table)

    # Read back and validate that the "items" column is compliant with the
    # Parquet nested format.
    # Should be like this: list<element: struct<name: string, value: string>>
    new_table = _read_table(path)
    items_type = new_table.schema.types[0]
    assert isinstance(items_type, pa.ListType)
    assert items_type.value_field.name == 'element'

    # Verify that the new table can be read/written correctly
    _check_roundtrip(new_table)
|
||||
|
||||
|
||||
@pytest.mark.pandas
@parametrize_test_data
def test_write_compliant_nested_type_disable(tempdir, test_data):
    """With use_compliant_nested_type=False the value field is named 'item'."""
    df = pd.DataFrame(data=test_data)

    # The pandas frame must round-trip with the flag disabled.
    _roundtrip_pandas_dataframe(
        df, write_kwargs={'use_compliant_nested_type': False})

    # Write to a parquet file while disabling compliant nested type
    table = pa.Table.from_pandas(df, preserve_index=False)
    path = str(tempdir / 'data.parquet')
    with pq.ParquetWriter(path, table.schema, version='2.6',
                          use_compliant_nested_type=False) as writer:
        writer.write_table(table)

    # Read back and validate that the "items" column is not compliant with
    # the Parquet nested format.
    # Should be like this: list<item: struct<name: string, value: string>>
    new_table = _read_table(path)
    items_type = new_table.schema.types[0]
    assert isinstance(items_type, pa.ListType)
    assert items_type.value_field.name == 'item'

    # Verify that the new table can be read/written correctly
    _check_roundtrip(new_table,
                     use_compliant_nested_type=False)
|
||||
@@ -0,0 +1,616 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import decimal
|
||||
import io
|
||||
import random
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
except ImportError:
|
||||
np = None
|
||||
import pytest
|
||||
|
||||
import pyarrow as pa
|
||||
from pyarrow.tests import util
|
||||
from pyarrow.tests.parquet.common import _check_roundtrip, _roundtrip_table
|
||||
|
||||
try:
|
||||
import pyarrow.parquet as pq
|
||||
from pyarrow.tests.parquet.common import _read_table, _write_table
|
||||
except ImportError:
|
||||
pq = None
|
||||
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
import pandas.testing as tm
|
||||
|
||||
from pyarrow.tests.pandas_examples import (dataframe_with_arrays,
|
||||
dataframe_with_lists)
|
||||
from pyarrow.tests.parquet.common import alltypes_sample
|
||||
except ImportError:
|
||||
pd = tm = None
|
||||
|
||||
|
||||
# Marks all of the tests in this module
|
||||
# Ignore these with pytest ... -m 'not parquet'
|
||||
pytestmark = pytest.mark.parquet
|
||||
|
||||
|
||||
# General roundtrip of data types
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.pandas
@pytest.mark.parametrize('chunk_size', [None, 1000])
def test_parquet_2_6_roundtrip(tempdir, chunk_size):
    """Round-trip a pandas frame through a Parquet version '2.6' file.

    Both the schema metadata (including pandas metadata) and the frame
    contents must survive the round-trip.
    """
    df = alltypes_sample(size=10000, categorical=True)
    path = tempdir / 'pandas_roundtrip.parquet'

    written = pa.Table.from_pandas(df)
    assert written.schema.pandas_metadata is not None

    _write_table(written, path, version='2.6', chunk_size=chunk_size)

    restored = pq.read_pandas(path)
    assert restored.schema.pandas_metadata is not None
    assert written.schema.metadata == restored.schema.metadata

    tm.assert_frame_equal(df, restored.to_pandas())
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_parquet_1_0_roundtrip(tempdir):
    """Round-trip a pandas frame through a Parquet version '1.0' file.

    Covers unsigned and signed integers, floats, booleans and strings.
    The 'uint32' column is expected to come back as int64, so the
    expected frame is adjusted before the comparison.
    """
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        # Build a genuine int8 column so the label matches the dtype under
        # test; astype wraps the values into the int8 range.
        'int8': np.arange(size).astype(np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'str': [str(x) for x in range(size)],
        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
        'empty_str': [''] * size
    })
    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    _write_table(arrow_table, filename, version='1.0')
    table_read = _read_table(filename)
    df_read = table_read.to_pandas()

    # We pass uint32_t as int64_t if we write Parquet version 1.0
    df['uint32'] = df['uint32'].values.astype(np.int64)

    tm.assert_frame_equal(df, df_read)
|
||||
|
||||
|
||||
# Dictionary
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _simple_table_write_read(table):
    """Write *table* to an in-memory Parquet buffer and read it back."""
    sink = pa.BufferOutputStream()
    pq.write_table(table, sink)
    reader = pa.BufferReader(sink.getvalue())
    return pq.read_table(reader)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_direct_read_dictionary():
    """Reading with read_dictionary=['f0'] yields a dictionary column.

    See ARROW-3325.
    """
    repeats = 10
    nunique = 5

    # One column: `nunique` random strings repeated `repeats` times.
    strings = [util.rands(10) for i in range(nunique)] * repeats
    table = pa.table([strings], names=['f0'])

    sink = pa.BufferOutputStream()
    pq.write_table(table, sink)
    contents = sink.getvalue()

    result = pq.read_table(pa.BufferReader(contents),
                           read_dictionary=['f0'])

    # The expected table holds the dictionary-encoded form of the column.
    expected = pa.table([table[0].dictionary_encode()], names=['f0'])
    assert result.equals(expected)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_direct_read_dictionary_subfield():
    """Read the element field of a list column as a dictionary."""
    repeats = 10
    nunique = 5

    # One list column: each row is a single-element list of one string.
    rows = [[util.rands(10)] for i in range(nunique)] * repeats
    table = pa.table([rows], names=['f0'])

    sink = pa.BufferOutputStream()
    pq.write_table(table, sink)
    contents = sink.getvalue()
    result = pq.read_table(pa.BufferReader(contents),
                           read_dictionary=['f0.list.element'])

    # Build the expected list array: dictionary-encode the flattened
    # values and use int32 dictionary indices.
    encoded_values = pa.array(rows).values.dictionary_encode()
    int32_indices = encoded_values.indices.cast('int32')
    dict_values = pa.DictionaryArray.from_arrays(int32_indices,
                                                 encoded_values.dictionary)

    # 51 offsets delimit the 50 one-element lists.
    offsets = pa.array(range(51), type='int32')
    expected_list = pa.ListArray.from_arrays(offsets, dict_values)
    expected = pa.table([expected_list], names=['f0'])

    assert result.equals(expected)
    assert result[0].num_chunks == 1
|
||||
|
||||
|
||||
@pytest.mark.numpy
def test_dictionary_array_automatically_read():
    """A chunked dictionary column round-trips without leftover metadata."""
    # ARROW-3246

    # Make a large dictionary, a little over 4MB of data
    dict_length = 4000
    dictionary = pa.array(
        ['x' * 1000 + f'_{i}' for i in range(dict_length)])

    num_chunks = 10
    chunk_size = 100

    def random_chunk():
        # Random int32 indices pointing into the shared dictionary
        raw = np.random.randint(0, dict_length, size=chunk_size)
        return pa.DictionaryArray.from_arrays(
            pa.array(raw.astype(np.int32)), dictionary)

    chunks = [random_chunk() for _ in range(num_chunks)]

    table = pa.table([pa.chunked_array(chunks)], names=['f0'])
    result = _simple_table_write_read(table)

    assert result.equals(table)

    # The only key in the metadata was the Arrow schema key
    assert result.schema.metadata is None
|
||||
|
||||
|
||||
# Decimal
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_decimal_roundtrip(tempdir):
    """Round-trip decimal columns for every precision 1..38 and valid scale."""
    num_values = 10

    columns = {}
    for precision in range(1, 39):
        for scale in range(precision + 1):
            # Same seed for every column, as in the reference data
            with util.random_seed(0):
                values = [util.randdecimal(precision, scale)
                          for _ in range(num_values)]
            columns[f'dec_precision_{precision}_scale_{scale}'] = values

    expected = pd.DataFrame(columns)
    path = str(tempdir / 'decimals.parquet')

    _write_table(pa.Table.from_pandas(expected), path)
    result = _read_table(path).to_pandas()

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.pandas
@pytest.mark.xfail(
    raises=OSError, reason='Parquet does not support negative scale'
)
def test_decimal_roundtrip_negative_scale(tempdir):
    """Round-trip ``Decimal('1.23E4')``; marked xfail with ``OSError``."""
    expected = pd.DataFrame({'decimal_num': [decimal.Decimal('1.23E4')]})
    path = str(tempdir / 'decimals.parquet')

    _write_table(pa.Table.from_pandas(expected), path)
    result = _read_table(path).to_pandas()

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
# List types
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize('dtype', [int, float])
def test_single_pylist_column_roundtrip(tempdir, dtype):
    """A single column built from a Python list survives a file round-trip."""
    path = tempdir / f'single_{dtype.__name__}_column.parquet'
    values = pa.array([dtype(v) for v in range(5)])
    written = pa.Table.from_arrays([values], names=['a'])

    _write_table(written, path)
    restored = _read_table(path)

    for idx in range(written.num_columns):
        written_col = written[idx]
        restored_col = restored[idx]
        assert written.field(idx).name == restored.field(idx).name
        assert restored_col.num_chunks == 1
        assert written_col.chunk(0).equals(restored_col.chunk(0))
|
||||
|
||||
|
||||
def test_empty_lists_table_roundtrip():
    """Round-trip a table whose only column is two empty int32 lists."""
    # ARROW-2744: Shouldn't crash when writing an array of empty lists
    empty_lists = pa.array([[], []], type=pa.list_(pa.int32()))
    _check_roundtrip(pa.Table.from_arrays([empty_lists], ["A"]))
|
||||
|
||||
|
||||
def test_nested_list_nonnullable_roundtrip_bug():
    """Round-trip lists with a non-nullable float32 item field."""
    # Reproduce failure in ARROW-5630
    list_type = pa.list_(pa.field("item", pa.float32(), False))
    num_rows = 10000

    # Ten lists of varying length (0..9 zeros), repeated to fill num_rows
    pattern = [[0] * ((i + 5) % 10) for i in range(10)]
    column = pa.array(pattern * (num_rows // 10), type=list_type)

    _check_roundtrip(pa.table([column], ['a']), data_page_size=4096)
|
||||
|
||||
|
||||
def test_nested_list_struct_multiple_batches_roundtrip(tempdir):
    """Round-trip struct data written across several row groups."""
    # Reproduce failure in ARROW-11024
    first_half = [[{'x': 'abc', 'y': 'abc'}]] * 100
    second_half = [[{'x': 'abc', 'y': 'gcb'}]] * 100
    nested = pa.table([pa.array(first_half + second_half)], names=['column'])
    _check_roundtrip(nested, row_group_size=20)

    # Reproduce failure in ARROW-11069 (plain non-nested structs with strings)
    records = [{'a': '1', 'b': '2'}, {'a': '3', 'b': '4'}, {'a': '5', 'b': '6'}]
    structs = pa.array(records * 10)
    _check_roundtrip(pa.table({'column': structs}), row_group_size=10)
|
||||
|
||||
|
||||
def test_writing_empty_lists():
    """Writing a column of empty int32 lists must round-trip cleanly."""
    # ARROW-2591: [Python] Segmentation fault issue in pq.write_table
    empty_lists = pa.array([[], []], pa.list_(pa.int32()))
    _check_roundtrip(pa.Table.from_arrays([empty_lists], ['list(int32)']))
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_column_of_arrays(tempdir):
    """Round-trip the ``dataframe_with_arrays`` fixture through a file."""
    df, schema = dataframe_with_arrays()
    path = tempdir / 'pandas_roundtrip.parquet'

    _write_table(pa.Table.from_pandas(df, schema=schema), path,
                 version='2.6', coerce_timestamps='ms')
    df_read = _read_table(path).to_pandas()

    tm.assert_frame_equal(df, df_read)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_column_of_lists(tempdir):
    """Round-trip the Parquet-compatible ``dataframe_with_lists`` fixture."""
    df, schema = dataframe_with_lists(parquet_compatible=True)
    path = tempdir / 'pandas_roundtrip.parquet'

    _write_table(pa.Table.from_pandas(df, schema=schema), path,
                 version='2.6')
    df_read = _read_table(path).to_pandas()

    tm.assert_frame_equal(df, df_read)
|
||||
|
||||
|
||||
def test_large_list_records():
    """Round-trip random-length int lists mixed with empty and null rows."""
    # This was fixed in PARQUET-1100

    lengths = [random.randint(0, 500) for _ in range(50)]
    # Force every tenth list to be empty
    lengths[::10] = [0, 0, 0, 0, 0]

    rows = []
    for position, length in enumerate(lengths):
        if position % 8:
            rows.append(
                [int(random.randint(0, 100)) for _ in range(length)])
        else:
            # Every eighth row is a null list
            rows.append(None)

    table = pa.Table.from_arrays([pa.array(rows)], ['int_lists'])
    _check_roundtrip(table)
|
||||
|
||||
|
||||
# (Arrow list type class, factory building that type from a value type)
list_types = [
    (pa.ListType, pa.list_),
    (pa.LargeListType, pa.large_list),
]


def test_list_types():
    """Cross-check every written list type against every ``list_type`` read.

    With the Arrow schema stored, the table is expected back unchanged;
    without it, the expected column uses the list type requested on read.
    """
    data = [[1, 2, None]] * 50
    for _, make_in_type in list_types:
        source = pa.Table.from_arrays(
            [pa.array(data, type=make_in_type(pa.int32()))], ['lists'])
        for out_type, make_out_type in list_types:
            for store_schema in (True, False):
                if not store_schema:
                    expected_table = pa.Table.from_arrays(
                        [pa.array(data, type=make_out_type(pa.int32()))],
                        ['lists'])
                else:
                    expected_table = source
                result = _roundtrip_table(
                    source,
                    write_table_kwargs={'store_schema': store_schema},
                    read_table_kwargs={'list_type': out_type})
                assert result == expected_table
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_parquet_nested_convenience(tempdir):
    """List columns can be selected by their top-level name on read."""
    # ARROW-1684
    df = pd.DataFrame({
        'a': [[1, 2, 3], None, [4, 5], []],
        'b': [[1.], None, None, [6., 7.]],
    })
    path = str(tempdir / 'nested_convenience.parquet')

    _write_table(pa.Table.from_pandas(df, preserve_index=False), path)

    # First a single column, then both columns
    selections = [
        (['a'], df[['a']]),
        (['a', 'b'], df),
    ]
    for columns, expected_df in selections:
        read = pq.read_table(path, columns=columns)
        tm.assert_frame_equal(read.to_pandas(), expected_df)
|
||||
|
||||
|
||||
# Binary
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_fixed_size_binary():
    """Round-trip a 10-byte fixed-size binary column containing a null."""
    values = [b'fooooooooo', None, b'barooooooo', b'quxooooooo']
    column = pa.array(values, type=pa.binary(10))

    _check_roundtrip(pa.Table.from_arrays([column], ['binary[10]']))
|
||||
|
||||
|
||||
def test_binary_types():
    """Cross-check every written binary type against every ``binary_type``.

    With the Arrow schema stored, the table is expected back unchanged;
    without it, the expected column uses the binary type requested on read.
    """
    binary_types = [pa.binary(), pa.large_binary(), pa.binary_view()]
    data = [b'abc', None, b'defg', b'x' * 30]
    for in_type in binary_types:
        source = pa.Table.from_arrays([pa.array(data, in_type)], ['binary'])
        for out_type in binary_types:
            for store_schema in (False, True):
                result = _roundtrip_table(
                    source,
                    write_table_kwargs={'store_schema': store_schema},
                    read_table_kwargs={'binary_type': out_type})
                if not store_schema:
                    expected_table = pa.Table.from_arrays(
                        [pa.array(data, out_type)], ['binary'])
                else:
                    expected_table = source
                assert result == expected_table
|
||||
|
||||
|
||||
# Large types
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.slow
@pytest.mark.large_memory
def test_large_table_int32_overflow():
    """Write a uint8 column one element longer than the int32 maximum."""
    num_rows = np.iinfo('int32').max + 1

    ones = pa.array(np.ones(num_rows, dtype='uint8'), type=pa.uint8())
    table = pa.Table.from_arrays([ones], names=['one'])

    # Only the write itself is exercised; nothing is read back
    _write_table(table, io.BytesIO())
|
||||
|
||||
|
||||
def _simple_table_roundtrip(table, **write_kwargs):
    """Write ``table`` to an in-memory buffer and return it read back.

    ``write_kwargs`` are forwarded to ``_write_table``.
    """
    sink = pa.BufferOutputStream()
    _write_table(table, sink, **write_kwargs)
    return _read_table(sink.getvalue())
|
||||
|
||||
|
||||
@pytest.mark.slow
@pytest.mark.large_memory
def test_byte_array_exactly_2gb():
    """Round-trip binary columns totalling 2^31 - 1, 2^31 and 2^31 + 1 bytes."""
    # Test edge case reported in ARROW-3762
    kilobyte = b'x' * (1 << 10)

    # (2^21 - 1) values of 1 KiB each; the final value sets the total size
    base = pa.array([kilobyte] * ((1 << 21) - 1))
    last_value_sizes = (
        1023,  # 2^31 - 1
        1024,  # 2^31
        1025,  # 2^31 + 1
    )
    for size in last_value_sizes:
        tail = pa.array([b'x' * size])
        table = pa.table([pa.chunked_array([base, tail])], names=['f0'])
        result = _simple_table_roundtrip(table, use_dictionary=False)
        assert table.equals(result)
|
||||
|
||||
|
||||
@pytest.mark.slow
@pytest.mark.pandas
@pytest.mark.large_memory
def test_binary_array_overflow_to_chunked():
    """A binary column of 2^31 + 1 bytes is read back as two chunks."""
    # ARROW-3762

    # 2^31 + 1 bytes
    megabyte = b'x' * (1 << 20)
    values = [b'x'] + [megabyte] * (2 * (1 << 10))
    df = pd.DataFrame({'byte_col': values})

    written = pa.Table.from_pandas(df, preserve_index=False)
    restored = _simple_table_roundtrip(written)

    first_column = restored[0]
    assert isinstance(first_column, pa.ChunkedArray)

    # Split up into 2GB chunks
    assert first_column.num_chunks == 2

    assert written.equals(restored)
|
||||
|
||||
|
||||
@pytest.mark.slow
@pytest.mark.pandas
@pytest.mark.large_memory
def test_list_of_binary_large_cell():
    """Round-trip a list-of-binary column holding a little under 2GB."""
    # ARROW-4688

    # TODO(wesm): handle chunked children
    # 2^31 - 1 bytes in a single cell
    # data.append([b'x' * (1 << 20)] * 2047 + [b'x' * ((1 << 20) - 1)])

    # A little under 2GB in cell each containing approximately 10MB each
    cells = [[b'x' * 1000000] * 10] * 214

    table = pa.Table.from_arrays([pa.array(cells)], ['chunky_cells'])
    restored = _simple_table_roundtrip(table)
    assert table.equals(restored)
|
||||
|
||||
|
||||
def test_large_binary_and_binary_view():
    """Round-trip ``large_binary`` and ``binary_view`` columns.

    Each type is checked both with and without dictionary encoding.
    """
    data = [b'foo', b'bar'] * 50
    # Named ``binary_type`` rather than ``type`` so the builtin is not shadowed
    for binary_type in [pa.large_binary(), pa.binary_view()]:
        arr = pa.array(data, type=binary_type)
        table = pa.Table.from_arrays([arr], names=['strs'])
        for use_dictionary in [False, True]:
            _check_roundtrip(table, use_dictionary=use_dictionary)
|
||||
|
||||
|
||||
@pytest.mark.slow
@pytest.mark.large_memory
def test_large_binary_and_binary_view_huge():
    """Round-trip roughly 2^33 bytes of ``large_binary`` / ``binary_view`` data.

    Each type is checked both with and without dictionary encoding.
    """
    s = b'xy' * 997
    data = [s] * ((1 << 33) // len(s))
    # Named ``binary_type`` rather than ``type`` so the builtin is not shadowed
    for binary_type in [pa.large_binary(), pa.binary_view()]:
        arr = pa.array(data, type=binary_type)
        table = pa.Table.from_arrays([arr], names=['strs'])
        for use_dictionary in [False, True]:
            _check_roundtrip(table, use_dictionary=use_dictionary)
        # Drop the references before the next type's array is built
        del arr, table
|
||||
|
||||
|
||||
@pytest.mark.large_memory
def test_large_binary_overflow():
    """Writing a single 2^31-byte value must raise ``ArrowInvalid``."""
    huge_value = b'x' * (1 << 31)
    column = pa.array([huge_value], type=pa.large_binary())
    table = pa.Table.from_arrays([column], names=['strs'])

    expected_message = "Parquet cannot store strings with size 2GB or more"
    for use_dictionary in (False, True):
        sink = pa.BufferOutputStream()
        with pytest.raises(pa.ArrowInvalid, match=expected_message):
            _write_table(table, sink, use_dictionary=use_dictionary)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("storage_type", (
    pa.string(), pa.large_string()))
def test_json_extension_type(storage_type):
    """Round-trip the JSON extension type under several read/write options."""
    data = ['{"a": 1}', '{"b": 2}', None]
    json_table = pa.table([pa.array(data, type=pa.json_(storage_type))],
                          names=["ext"])

    # With defaults, this should roundtrip (because store_schema=True)
    _check_roundtrip(json_table, json_table)

    # When store_schema is False, we get a string back by default
    as_plain_string = pa.table({"ext": pa.array(data, pa.string())})
    _check_roundtrip(
        json_table,
        as_plain_string,
        {"arrow_extensions_enabled": False},
        store_schema=False)

    # With arrow_extensions_enabled=True on read, we get a arrow.json back
    # (but with string() storage)
    as_string_json = pa.table(
        {"ext": pa.array(data, pa.json_(pa.string()))})
    _check_roundtrip(
        json_table,
        as_string_json,
        {"arrow_extensions_enabled": True},
        store_schema=False)
|
||||
|
||||
|
||||
def test_uuid_extension_type():
    """Round-trip the UUID extension type under several read/write options."""
    data = [
        b'\xe4`\xf9p\x83QGN\xac\x7f\xa4g>K\xa8\xcb',
        b'\x1et\x14\x95\xee\xd5C\xea\x9b\xd7s\xdc\x91BK\xaf',
        None
    ]
    uuid_table = pa.table([pa.array(data, type=pa.uuid())], names=["ext"])

    # Default options: the extension type comes back unchanged
    _check_roundtrip(uuid_table, uuid_table)

    # No stored schema, extensions disabled on read: 16-byte binary expected
    as_fixed_binary = pa.table({"ext": pa.array(data, pa.binary(16))})
    _check_roundtrip(
        uuid_table,
        as_fixed_binary,
        {"arrow_extensions_enabled": False},
        store_schema=False)

    # No stored schema, extensions enabled on read: UUID type expected again
    _check_roundtrip(
        uuid_table,
        uuid_table,
        {"arrow_extensions_enabled": True},
        store_schema=False)
|
||||
|
||||
|
||||
def test_undefined_logical_type(parquet_test_datadir):
    """A column with an unknown logical type is read back as raw bytes."""
    path = f"{parquet_test_datadir}/unknown-logical-type.parquet"
    table = _read_table(path)

    expected_names = ["column with known type", "column with unknown type"]
    assert table.column_names == expected_names

    expected_values = [
        b"unknown string 1",
        b"unknown string 2",
        b"unknown string 3"
    ]
    assert table["column with unknown type"].to_pylist() == expected_values
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,461 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import datetime
|
||||
import io
|
||||
import warnings
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
except ImportError:
|
||||
np = None
|
||||
import pytest
|
||||
|
||||
import pyarrow as pa
|
||||
from pyarrow.tests.parquet.common import _check_roundtrip
|
||||
|
||||
try:
|
||||
import pyarrow.parquet as pq
|
||||
from pyarrow.tests.parquet.common import _read_table, _write_table
|
||||
except ImportError:
|
||||
pq = None
|
||||
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
import pandas.testing as tm
|
||||
|
||||
from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe
|
||||
except ImportError:
|
||||
pd = tm = None
|
||||
|
||||
|
||||
# Marks all of the tests in this module
|
||||
# Ignore these with pytest ... -m 'not parquet'
|
||||
pytestmark = pytest.mark.parquet
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_pandas_parquet_datetime_tz():
    """Timezone-aware columns and index survive a pandas round-trip."""
    # Pandas v2 defaults to [ns], but Arrow defaults to [us] time units
    # so we need to cast the pandas dtype. Pandas v1 will always silently
    # coerce to [ns] due to lack of non-[ns] support.
    naive = pd.Series([datetime.datetime(2017, 9, 6)], dtype='datetime64[us]')
    utc = naive.dt.tz_localize('utc')
    utc.index = utc

    # Both a column and an index to hit both use cases
    df = pd.DataFrame(
        {
            'tz_aware': utc,
            'tz_eastern': utc.dt.tz_convert('US/Eastern'),
        },
        index=utc)

    buffer = io.BytesIO()
    _write_table(pa.Table.from_pandas(df), buffer)
    buffer.seek(0)

    df_read = pq.read_pandas(buffer).to_pandas()
    tm.assert_frame_equal(df, df_read)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_datetime_timezone_tzinfo():
    """Round-trip a datetime carrying ``datetime.timezone.utc`` tzinfo."""
    utc = datetime.timezone.utc
    moment = datetime.datetime(2018, 1, 1, 1, 23, 45, tzinfo=utc)
    frame = pd.DataFrame({'foo': [moment]})

    _roundtrip_pandas_dataframe(frame, write_kwargs={})
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_coerce_timestamps(tempdir):
    """Coerce list-of-timestamp[ms] data to 'us' and reject unknown units."""
    # ARROW-622
    first = np.array(['2007-07-13T01:23:34.123456789',
                      None,
                      '2010-08-13T05:46:57.437699912'],
                     dtype='datetime64[ms]')
    last = np.array(['2007-07-13T02',
                     None,
                     '2010-08-13T05:46:57.437699912'],
                    dtype='datetime64[ms]')
    df = pd.DataFrame({'datetime64': [first, None, None, last]})
    schema = pa.schema(
        [pa.field('datetime64', pa.list_(pa.timestamp('ms')))])

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df, schema=schema)

    _write_table(arrow_table, filename, version='2.6', coerce_timestamps='us')
    df_read = _read_table(filename).to_pandas()

    # The expected frame holds the same arrays cast to microseconds
    df_expected = df.copy()
    for row, cell in enumerate(df_expected['datetime64']):
        if isinstance(cell, np.ndarray):
            df_expected.loc[row, 'datetime64'] = cell.astype('M8[us]')

    tm.assert_frame_equal(df_expected, df_read)

    with pytest.raises(ValueError):
        _write_table(arrow_table, filename, version='2.6',
                     coerce_timestamps='unknown')
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_coerce_timestamps_truncated(tempdir):
    """
    ARROW-2555: Test that we can truncate timestamps when coercing if
    explicitly allowed.
    """
    dt_us = datetime.datetime(2017, 1, 1, 1, 1, 1, 1)
    # Same instant with the microsecond component dropped
    dt_ms = datetime.datetime(2017, 1, 1, 1, 1, 1)

    df_us = pd.DataFrame({'datetime64': [dt_us, dt_ms]})
    schema_us = pa.schema([pa.field('datetime64', pa.timestamp('us'))])

    filename = tempdir / 'pandas_truncated.parquet'
    table_us = pa.Table.from_pandas(df_us, schema=schema_us)

    _write_table(table_us, filename, version='2.6', coerce_timestamps='ms',
                 allow_truncated_timestamps=True)
    df_ms = _read_table(filename).to_pandas()

    df_expected = pd.DataFrame({'datetime64': [dt_ms, dt_ms]},
                               dtype='datetime64[ms]')
    tm.assert_frame_equal(df_expected, df_ms)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_date_time_types(tempdir):
    """Round-trip date, time and timestamp columns, including INT96 storage.

    The first part checks a mixed table through ``_check_roundtrip``. The
    second part writes ms/us/ns timestamps three ways (default, explicit
    INT96, Spark flavor) and checks both the Parquet physical type and the
    table read back.
    """
    days = np.array([17259, 17260, 17261], dtype='int32')
    date32_arr = pa.array(days, type=pa.date32())
    date64_arr = pa.array(days.astype('int64') * 86400000, type=pa.date64())

    start = pd.Timestamp('2001-01-01').value / 1000
    ts_us_arr = pa.array(
        np.array([start, start + 1, start + 2], dtype='int64'),
        type=pa.timestamp('us'))

    ticks = np.arange(3, dtype='i4')
    time32_ms_arr = pa.array(ticks, type=pa.time32('ms'))
    time64_us_arr = pa.array(ticks.astype('int64'), type=pa.time64('us'))
    time32_s_arr = pa.array(ticks, type=pa.time32('s'))
    # Expected form of the time32[s] column after reading: time32[ms]
    time32_s_as_ms_arr = pa.array(ticks * 1000, type=pa.time32('ms'))

    start = pd.Timestamp('2001-01-01').value
    ts_ns_arr = pa.array(
        np.array([start, start + 1000, start + 2000], dtype='int64'),
        type=pa.timestamp('ns'))

    names = ['date32', 'date64', 'timestamp[us]',
             'time32[s]', 'time64[us]',
             'time32_from64[s]',
             'timestamp[ns]']
    table = pa.Table.from_arrays(
        [date32_arr, date64_arr, ts_us_arr, time32_ms_arr, time64_us_arr,
         time32_s_arr, ts_ns_arr],
        names)

    # date64 as date32
    # time32[s] to time32[ms]
    expected = pa.Table.from_arrays(
        [date32_arr, date32_arr, ts_us_arr, time32_ms_arr, time64_us_arr,
         time32_s_as_ms_arr, ts_ns_arr],
        names)

    _check_roundtrip(table, expected=expected, version='2.6')

    # Second part: the same small integers as ms, us and ns timestamps
    ms_values = np.arange(4, dtype='int64')
    us_values = np.arange(4, dtype='int64')
    ns_values = np.arange(4, dtype='int64')
    ts_ms = pa.array(ms_values, type=pa.timestamp('ms'))
    ts_us = pa.array(us_values, type=pa.timestamp('us'))
    ts_ns = pa.array(ns_values, type=pa.timestamp('ns'))

    ts_names = ['ts[ms]', 'ts[us]', 'ts[ns]']
    table = pa.Table.from_arrays([ts_ms, ts_us, ts_ns], ts_names)
    expected = pa.Table.from_arrays([ts_ms, ts_us, ts_ns], ts_names)

    def check_written_as(path, physical_type, expected_table, **write_kwargs):
        # Write `table`, check each column's physical type, then the data
        _write_table(table, path, version='2.6', **write_kwargs)
        parquet_schema = pq.ParquetFile(path).schema
        for i in range(3):
            assert parquet_schema.column(i).physical_type == physical_type
        assert _read_table(path).equals(expected_table)

    # int64 for all timestamps supported by default
    check_written_as(tempdir / 'int64_timestamps.parquet', 'INT64', expected)

    # The INT96 cases below expect every column back as nanoseconds
    ms_as_ns = pa.array(np.array(ms_values * 1000000, dtype='int64'),
                        type=pa.timestamp('ns'))
    us_as_ns = pa.array(np.array(us_values * 1000, dtype='int64'),
                        type=pa.timestamp('ns'))
    expected = pa.Table.from_arrays([ms_as_ns, us_as_ns, ts_ns], ts_names)

    # int96 nanosecond timestamps produced upon request
    check_written_as(tempdir / 'explicit_int96_timestamps.parquet', 'INT96',
                     expected, use_deprecated_int96_timestamps=True)

    # int96 nanosecond timestamps implied by flavor 'spark'
    check_written_as(tempdir / 'spark_int96_timestamps.parquet', 'INT96',
                     expected, flavor='spark')
|
||||
|
||||
|
||||
@pytest.mark.pandas
@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
def test_coerce_int96_timestamp_unit(unit):
    """INT96 timestamps are read back in the unit requested on read."""
    i_s = pd.Timestamp('2010-01-01').value / 1000000000  # := 1262304000
    seconds = np.arange(i_s, i_s + 10, 1, dtype='int64')

    # The same ten instants expressed in each unit
    units = ('s', 'ms', 'us', 'ns')
    arrays = {
        name: pa.array(seconds * 1000 ** power, type=pa.timestamp(name))
        for power, name in enumerate(units)
    }
    names = ['ts_s', 'ts_ms', 'ts_us', 'ts_ns']
    table = pa.Table.from_arrays([arrays[name] for name in units], names)

    # For either Parquet version, coercing to nanoseconds is allowed
    # if Int96 storage is used
    expected = pa.Table.from_arrays([arrays.get(unit)] * 4, names)
    read_table_kwargs = {"coerce_int96_timestamp_unit": unit}
    _check_roundtrip(table, expected,
                     read_table_kwargs=read_table_kwargs,
                     use_deprecated_int96_timestamps=True)
    _check_roundtrip(table, expected, version='2.6',
                     read_table_kwargs=read_table_kwargs,
                     use_deprecated_int96_timestamps=True)
|
||||
|
||||
|
||||
@pytest.mark.pandas
@pytest.mark.parametrize('pq_reader_method', ['ParquetFile', 'read_table'])
def test_coerce_int96_timestamp_overflow(pq_reader_method, tempdir):
    """Out-of-range INT96 timestamps read correctly when coerced to seconds."""

    def load(path, **kwargs):
        # Read the file through the API selected by the parametrization
        if pq_reader_method == "ParquetFile":
            return pq.ParquetFile(path, **kwargs).read()
        if pq_reader_method == "read_table":
            return pq.read_table(path, **kwargs)
        return None

    # Recreating the initial JIRA issue referenced in ARROW-12096
    oob_dts = [
        datetime.datetime(1000, 1, 1),
        datetime.datetime(2000, 1, 1),
        datetime.datetime(3000, 1, 1)
    ]
    df = pd.DataFrame({"a": oob_dts})
    table = pa.table(df)

    filename = tempdir / "test_round_trip_overflow.parquet"
    pq.write_table(table, filename, use_deprecated_int96_timestamps=True,
                   version="1.0")

    # with the default resolution of ns, we get wrong values for INT96
    # that are out of bounds for nanosecond range
    tab_error = load(filename)
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            "Discarding nonzero nanoseconds in conversion",
            UserWarning)
        assert tab_error["a"].to_pylist() != oob_dts

    # avoid this overflow by specifying the resolution to use for INT96 values
    tab_correct = load(filename, coerce_int96_timestamp_unit="s")
    df_correct = tab_correct.to_pandas(timestamp_as_object=True)
    df["a"] = df["a"].astype(object)
    tm.assert_frame_equal(df, df_correct)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('unit', ['ms', 'us', 'ns'])
def test_timestamp_restore_timezone(unit):
    """A timezone-aware timestamp column round-trips for each unit."""
    # ARROW-5888, restore timezone from serialized metadata
    tz_type = pa.timestamp(unit, tz='America/New_York')
    column = pa.array([1, 2, 3], type=tz_type)
    _check_roundtrip(pa.table([column], names=['f0']))
|
||||
|
||||
|
||||
def test_timestamp_restore_timezone_nanosecond():
    """With version '2.4', tz-aware ns data is expected back as tz-aware us."""
    # ARROW-9634, also restore timezone for nanosecond data that get stored
    # as microseconds in the parquet file for Parquet ver 2.4 and less
    tz = 'America/New_York'
    ns_column = pa.array([1000, 2000, 3000], type=pa.timestamp('ns', tz=tz))
    written = pa.table([ns_column], names=['f0'])

    us_column = ns_column.cast(pa.timestamp('us', tz=tz))
    expected = pa.table([us_column], names=['f0'])

    _check_roundtrip(written, expected=expected, version='2.4')
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_list_of_datetime_time_roundtrip():
    """Round-trip a frame whose single cell holds ``datetime.time`` values."""
    # ARROW-4135
    clock_strings = ['09:00', '09:30', '10:00', '10:30', '11:00',
                     '11:30', '12:00']
    parsed = pd.to_datetime(clock_strings, format="%H:%M")
    frame = pd.DataFrame({'time': [parsed.time]})
    _roundtrip_pandas_dataframe(frame, write_kwargs={})
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_parquet_version_timestamp_differences():
    """Check which timestamp units come back per Parquet format version."""
    i_s = pd.Timestamp('2010-01-01').value / 1000000000  # := 1262304000
    seconds = np.arange(i_s, i_s + 10, 1, dtype='int64')

    # The same ten instants expressed in seconds, milli-, micro-, nanoseconds
    a_s = pa.array(seconds, type=pa.timestamp('s'))
    a_ms = pa.array(seconds * 1000, type=pa.timestamp('ms'))
    a_us = pa.array(seconds * 1000 * 1000, type=pa.timestamp('us'))
    a_ns = pa.array(seconds * 1000 * 1000 * 1000, type=pa.timestamp('ns'))

    all_versions = ['1.0', '2.4', '2.6']

    names = ['ts:s', 'ts:ms', 'ts:us', 'ts:ns']
    table = pa.Table.from_arrays([a_s, a_ms, a_us, a_ns], names)

    def expect(*columns):
        # Expected table with the given four columns under the same names
        return pa.Table.from_arrays(list(columns), names)

    # Using Parquet version 1.0 and 2.4, seconds should be coerced to milliseconds
    # and nanoseconds should be coerced to microseconds by default
    expected = expect(a_ms, a_ms, a_us, a_us)
    for ver in ('1.0', '2.4'):
        _check_roundtrip(table, expected, version=ver)

    # Using Parquet version 2.6, seconds should be coerced to milliseconds
    # and nanoseconds should be retained by default
    expected = expect(a_ms, a_ms, a_us, a_ns)
    _check_roundtrip(table, expected, version='2.6')

    # For either Parquet version coercing to milliseconds or microseconds
    # is allowed
    expected = expect(a_ms, a_ms, a_ms, a_ms)
    for ver in all_versions:
        _check_roundtrip(table, expected, coerce_timestamps='ms', version=ver)

    expected = expect(a_us, a_us, a_us, a_us)
    for ver in all_versions:
        _check_roundtrip(table, expected, version=ver, coerce_timestamps='us')

    # TODO: after pyarrow allows coerce_timestamps='ns', tests like the
    # following should pass ...

    # Using Parquet version 1.0, coercing to nanoseconds is not allowed
    # expected = None
    # with pytest.raises(NotImplementedError):
    #     _roundtrip_table(table, coerce_timestamps='ns')

    # Using Parquet version 2.0, coercing to nanoseconds is allowed
    # expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names)
    # _check_roundtrip(table, expected, version='2.6', coerce_timestamps='ns')

    # For either Parquet version, coercing to nanoseconds is allowed
    # if Int96 storage is used
    expected = expect(a_ns, a_ns, a_ns, a_ns)
    for ver in all_versions:
        _check_roundtrip(table, expected, version=ver,
                         use_deprecated_int96_timestamps=True)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_noncoerced_nanoseconds_written_without_exception(tempdir):
    """Nanosecond timestamps are written as-is with version '2.6'.

    The table is written without coercion and must read back equal.
    Coercing the same data to milliseconds is then expected to raise
    ``ValueError``.
    """
    # ARROW-1957: the Parquet version 2.0 writer preserves Arrow
    # nanosecond timestamps by default
    n = 9
    df = pd.DataFrame({'x': range(n)},
                      index=pd.date_range('2017-01-01', freq='ns', periods=n))
    tb = pa.Table.from_pandas(df)

    filename = tempdir / 'written.parquet'
    # Let a writer error propagate: swallowing it here would hide the very
    # exception this test is named for, leaving only an indirect failure on
    # the checks below.
    pq.write_table(tb, filename, version='2.6')
    assert filename.exists()

    recovered_table = pq.read_table(filename)
    assert tb.equals(recovered_table)

    # Loss of data through coercion (without explicit override) still an error
    filename = tempdir / 'not_written.parquet'
    with pytest.raises(ValueError):
        pq.write_table(tb, filename, coerce_timestamps='ms', version='2.6')
|
||||
|
||||
|
||||
def test_duration_type():
    """Roundtrip a table with one duration column per time unit (ARROW-6780)."""
    columns = []
    for unit in ["s", "ms", "us", "ns"]:
        columns.append(pa.array([0, 1, 2, 3], type=pa.duration(unit)))

    names = ["d[s]", "d[ms]", "d[us]", "d[ns]"]
    _check_roundtrip(pa.Table.from_arrays(columns, names))
|
||||
@@ -0,0 +1,620 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
import pytest
|
||||
from datetime import timedelta
|
||||
|
||||
import pyarrow as pa
|
||||
try:
|
||||
import pyarrow.parquet as pq
|
||||
import pyarrow.parquet.encryption as pe
|
||||
except ImportError:
|
||||
pq = None
|
||||
pe = None
|
||||
else:
|
||||
from pyarrow.tests.parquet.encryption import (
|
||||
InMemoryKmsClient, verify_file_encrypted)
|
||||
|
||||
|
||||
# File name used by most tests in this module (created under `tempdir`)
PARQUET_NAME = 'encrypted_table.in_mem.parquet'

# Master keys (16 bytes each) and the names they are registered under
FOOTER_KEY_NAME = "footer_key"
FOOTER_KEY = b"0123456789112345"
COL_KEY_NAME = "col_key"
COL_KEY = b"1234567890123450"


# Marks all of the tests in this module
# Ignore these with pytest ... -m 'not parquet_encryption'
# Ignore these with pytest ... -m 'not parquet'
pytestmark = [pytest.mark.parquet_encryption, pytest.mark.parquet]
|
||||
|
||||
|
||||
@pytest.fixture(scope='module')
def data_table():
    """Module-scoped sample table with columns `a` (ints), `b` and `c` (strings)."""
    columns = {
        'a': pa.array([1, 2, 3]),
        'b': pa.array(['a', 'b', 'c']),
        'c': pa.array(['x', 'y', 'z']),
    }
    return pa.Table.from_pydict(columns)
|
||||
|
||||
|
||||
@pytest.fixture(scope='module')
def basic_encryption_config():
    """Module-scoped encryption configuration.

    The footer uses the footer key and columns `a` and `b` use the
    column key; every other setting is left at its default.
    """
    column_keys = {COL_KEY_NAME: ["a", "b"]}
    return pe.EncryptionConfiguration(
        footer_key=FOOTER_KEY_NAME, column_keys=column_keys)
|
||||
|
||||
|
||||
def setup_encryption_environment(custom_kms_conf):
    """
    Build the KMS connection configuration and the crypto factory for the
    given custom KMS configuration.

    Returns a ``(kms_connection_config, crypto_factory)`` tuple; the factory
    creates an InMemoryKmsClient for each connection configuration it is
    handed.
    """
    def make_kms_client(kms_connection_configuration):
        return InMemoryKmsClient(kms_connection_configuration)

    connection_config = pe.KmsConnectionConfig(custom_kms_conf=custom_kms_conf)
    return connection_config, pe.CryptoFactory(make_kms_client)
|
||||
|
||||
|
||||
def write_encrypted_file(path, data_table, footer_key_name, col_key_name,
                         footer_key, col_key, encryption_config):
    """
    Write `data_table` to `path` as an encrypted parquet file.

    `footer_key` and `col_key` are bytes; they are decoded as UTF-8 and
    registered under `footer_key_name` / `col_key_name` in the custom KMS
    configuration. Returns the ``(kms_connection_config, crypto_factory)``
    tuple used for writing so that callers can read the file back.
    """
    # Custom KMS configuration: master key name -> key material
    master_keys = {}
    master_keys[footer_key_name] = footer_key.decode("UTF-8")
    master_keys[col_key_name] = col_key.decode("UTF-8")

    environment = setup_encryption_environment(master_keys)
    write_encrypted_parquet(path, data_table, encryption_config, *environment)
    return environment
|
||||
|
||||
|
||||
def test_encrypted_parquet_write_read(tempdir, data_table):
    """Roundtrip a table through an encrypted parquet file."""
    path = tempdir / PARQUET_NAME

    # Footer -> footer key; columns `a` and `b` -> column key;
    # column `c` stays plaintext
    settings = dict(
        footer_key=FOOTER_KEY_NAME,
        column_keys={COL_KEY_NAME: ["a", "b"]},
        encryption_algorithm="AES_GCM_V1",
        cache_lifetime=timedelta(minutes=5.0),
        data_key_length_bits=256,
    )
    encryption_config = pe.EncryptionConfiguration(**settings)
    assert encryption_config.uniform_encryption is False

    kms_connection_config, crypto_factory = write_encrypted_file(
        path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY,
        encryption_config)
    verify_file_encrypted(path)

    # Read the file back with decryption properties
    lifetime = timedelta(minutes=5.0)
    decryption_config = pe.DecryptionConfiguration(cache_lifetime=lifetime)
    result_table = read_encrypted_parquet(
        path, decryption_config, kms_connection_config, crypto_factory)
    assert data_table.equals(result_table)
|
||||
|
||||
|
||||
def test_uniform_encrypted_parquet_write_read(tempdir, data_table):
    """Roundtrip a table through a uniformly encrypted parquet file."""
    path = tempdir / PARQUET_NAME

    # Footer and all columns are encrypted with the footer key
    settings = dict(
        footer_key=FOOTER_KEY_NAME,
        uniform_encryption=True,
        encryption_algorithm="AES_GCM_V1",
        cache_lifetime=timedelta(minutes=5.0),
        data_key_length_bits=256,
    )
    encryption_config = pe.EncryptionConfiguration(**settings)
    assert encryption_config.uniform_encryption is True

    # An empty column key is registered because no column key is configured
    kms_connection_config, crypto_factory = write_encrypted_file(
        path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, b"",
        encryption_config)
    verify_file_encrypted(path)

    # Read the file back with decryption properties
    lifetime = timedelta(minutes=5.0)
    decryption_config = pe.DecryptionConfiguration(cache_lifetime=lifetime)
    result_table = read_encrypted_parquet(
        path, decryption_config, kms_connection_config, crypto_factory)
    assert data_table.equals(result_table)
|
||||
|
||||
|
||||
def write_encrypted_parquet(path, table, encryption_config,
                            kms_connection_config, crypto_factory):
    """Write `table` to `path` with encryption properties from `crypto_factory`."""
    encryption_props = crypto_factory.file_encryption_properties(
        kms_connection_config, encryption_config)
    assert encryption_props is not None

    writer_kwargs = {'encryption_properties': encryption_props}
    with pq.ParquetWriter(path, table.schema, **writer_kwargs) as writer:
        writer.write_table(table)
|
||||
|
||||
|
||||
def read_encrypted_parquet(path, decryption_config,
                           kms_connection_config, crypto_factory):
    """Read the encrypted parquet file at `path` and return it as a table.

    Also asserts that both the file metadata and the schema can be read
    with the decryption properties and that each reports three columns.
    """
    props = crypto_factory.file_decryption_properties(
        kms_connection_config, decryption_config)
    assert props is not None

    assert pq.read_metadata(path, decryption_properties=props).num_columns == 3
    assert len(pq.read_schema(path, decryption_properties=props).names) == 3

    parquet_file = pq.ParquetFile(path, decryption_properties=props)
    return parquet_file.read(use_threads=True)
|
||||
|
||||
|
||||
def test_encrypted_parquet_write_read_wrong_key(tempdir, data_table):
    """Reading an encrypted parquet file with the wrong master keys
    must fail."""
    path = tempdir / PARQUET_NAME

    # Footer -> footer key; columns `a` and `b` -> column key;
    # column `c` stays plaintext
    settings = dict(
        footer_key=FOOTER_KEY_NAME,
        column_keys={COL_KEY_NAME: ["a", "b"]},
        encryption_algorithm="AES_GCM_V1",
        cache_lifetime=timedelta(minutes=5.0),
        data_key_length_bits=256,
    )
    encryption_config = pe.EncryptionConfiguration(**settings)

    write_encrypted_file(path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME,
                         FOOTER_KEY, COL_KEY, encryption_config)
    verify_file_encrypted(path)

    # Intentionally wrong: the two master keys are swapped
    swapped_keys = {
        FOOTER_KEY_NAME: COL_KEY.decode("UTF-8"),
        COL_KEY_NAME: FOOTER_KEY.decode("UTF-8"),
    }
    wrong_kms_connection_config, wrong_crypto_factory = (
        setup_encryption_environment(swapped_keys))

    decryption_config = pe.DecryptionConfiguration(
        cache_lifetime=timedelta(minutes=5.0))
    with pytest.raises(ValueError, match=r"Incorrect master key used"):
        read_encrypted_parquet(
            path, decryption_config, wrong_kms_connection_config,
            wrong_crypto_factory)
|
||||
|
||||
|
||||
def test_encrypted_parquet_read_no_decryption_config(tempdir, data_table):
    """Reading an encrypted parquet file without decryption properties
    must fail."""
    # Reuse the write/read test to create the encrypted file
    test_encrypted_parquet_write_read(tempdir, data_table)

    encrypted_path = tempdir / PARQUET_NAME
    with pytest.raises(IOError, match=r"no decryption"):
        pq.ParquetFile(encrypted_path).read()
|
||||
|
||||
|
||||
def test_encrypted_parquet_read_metadata_no_decryption_config(
        tempdir, data_table):
    """Reading the metadata of an encrypted parquet file without
    decryption properties must fail."""
    # Reuse the write/read test to create the encrypted file
    test_encrypted_parquet_write_read(tempdir, data_table)

    encrypted_path = tempdir / PARQUET_NAME
    with pytest.raises(IOError, match=r"no decryption"):
        pq.read_metadata(encrypted_path)
|
||||
|
||||
|
||||
def test_encrypted_parquet_read_schema_no_decryption_config(
        tempdir, data_table):
    """Reading the schema of an encrypted parquet file without
    decryption properties must fail."""
    # Reuse the write/read test to create the encrypted file
    test_encrypted_parquet_write_read(tempdir, data_table)

    encrypted_path = tempdir / PARQUET_NAME
    with pytest.raises(IOError, match=r"no decryption"):
        pq.read_schema(encrypted_path)
|
||||
|
||||
|
||||
def test_encrypted_parquet_write_no_col_key(tempdir, data_table):
    """Writing with only a footer key (no column keys and no uniform
    encryption) must fail."""
    path = tempdir / 'encrypted_table_no_col_key.in_mem.parquet'

    # Only the footer key is configured
    footer_only_config = pe.EncryptionConfiguration(footer_key=FOOTER_KEY_NAME)

    expected_error = pytest.raises(
        OSError,
        match="Either column_keys or uniform_encryption "
        "must be set")
    with expected_error:
        # Write with encryption properties
        write_encrypted_file(path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME,
                             FOOTER_KEY, b"", footer_only_config)
|
||||
|
||||
|
||||
def test_encrypted_parquet_write_col_key_and_uniform_encryption(tempdir, data_table):
    """Writing with both column keys and uniform encryption configured
    must fail."""
    path = tempdir / 'encrypted_table_col_key_and_uniform_encryption.in_mem.parquet'

    # Conflicting settings: per-column keys together with uniform encryption
    conflicting_config = pe.EncryptionConfiguration(
        footer_key=FOOTER_KEY_NAME,
        column_keys={COL_KEY_NAME: ["a", "b"]},
        uniform_encryption=True)

    expected_error = pytest.raises(
        OSError, match=r"Cannot set both column_keys and uniform_encryption")
    with expected_error:
        # Write with encryption properties
        write_encrypted_file(path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME,
                             FOOTER_KEY, b"", conflicting_config)
|
||||
|
||||
|
||||
def test_encrypted_parquet_write_kms_error(tempdir, data_table,
                                           basic_encryption_config):
    """A KeyError raised inside the KmsClient must propagate to the writer."""
    path = tempdir / 'encrypted_table_kms_error.in_mem.parquet'

    def make_kms_client(kms_connection_configuration):
        # Empty master keys map will cause KeyError to be raised
        # on wrap/unwrap calls
        return InMemoryKmsClient(kms_connection_configuration)

    # No custom KMS configuration, hence an empty master_keys_map
    empty_connection_config = pe.KmsConnectionConfig()
    crypto_factory = pe.CryptoFactory(make_kms_client)

    with pytest.raises(KeyError, match="footer_key"):
        # Write with encryption properties
        write_encrypted_parquet(path, data_table, basic_encryption_config,
                                empty_connection_config, crypto_factory)
|
||||
|
||||
|
||||
def test_encrypted_parquet_write_kms_specific_error(tempdir, data_table,
                                                    basic_encryption_config):
    """A ValueError raised by KmsClient.wrap_key must propagate to
    the writer."""
    path = tempdir / 'encrypted_table_kms_error.in_mem.parquet'

    class ThrowingKmsClient(pe.KmsClient):
        """A KmsClient implementation that raises in wrap/unwrap calls."""

        def __init__(self, config):
            """Create a ThrowingKmsClient instance."""
            pe.KmsClient.__init__(self)
            self.config = config

        def wrap_key(self, key_bytes, master_key_identifier):
            raise ValueError("Cannot Wrap Key")

        def unwrap_key(self, wrapped_key, master_key_identifier):
            raise ValueError("Cannot Unwrap Key")

    def make_kms_client(kms_connection_configuration):
        # Exception thrown in wrap/unwrap calls
        return ThrowingKmsClient(kms_connection_configuration)

    # No custom KMS configuration, hence an empty master_keys_map
    empty_connection_config = pe.KmsConnectionConfig()
    crypto_factory = pe.CryptoFactory(make_kms_client)

    with pytest.raises(ValueError, match="Cannot Wrap Key"):
        # Write with encryption properties
        write_encrypted_parquet(path, data_table, basic_encryption_config,
                                empty_connection_config, crypto_factory)
|
||||
|
||||
|
||||
def test_encrypted_parquet_write_kms_factory_error(tempdir, data_table,
                                                   basic_encryption_config):
    """A ValueError raised by the kms_factory callback must propagate to
    the writer."""
    path = tempdir / 'encrypted_table_kms_factory_error.in_mem.parquet'

    def failing_kms_factory(kms_connection_configuration):
        raise ValueError('Cannot create KmsClient')

    # No custom KMS configuration, hence an empty master_keys_map
    empty_connection_config = pe.KmsConnectionConfig()
    crypto_factory = pe.CryptoFactory(failing_kms_factory)

    with pytest.raises(ValueError, match="Cannot create KmsClient"):
        # Write with encryption properties
        write_encrypted_parquet(path, data_table, basic_encryption_config,
                                empty_connection_config, crypto_factory)
|
||||
|
||||
|
||||
def test_encrypted_parquet_write_kms_factory_type_error(
        tempdir, data_table, basic_encryption_config):
    """A kms_factory that returns an object which is not a KmsClient
    must make the write fail with TypeError."""
    path = tempdir / 'encrypted_table_kms_factory_error.in_mem.parquet'

    class WrongTypeKmsClient:
        """Has the wrap/unwrap methods but does not derive from KmsClient."""

        def __init__(self, config):
            self.master_keys_map = config.custom_kms_conf

        def wrap_key(self, key_bytes, master_key_identifier):
            return None

        def unwrap_key(self, wrapped_key, master_key_identifier):
            return None

    def make_kms_client(kms_connection_configuration):
        return WrongTypeKmsClient(kms_connection_configuration)

    # No custom KMS configuration, hence an empty master_keys_map
    empty_connection_config = pe.KmsConnectionConfig()
    crypto_factory = pe.CryptoFactory(make_kms_client)

    with pytest.raises(TypeError):
        # Write with encryption properties
        write_encrypted_parquet(path, data_table, basic_encryption_config,
                                empty_connection_config, crypto_factory)
|
||||
|
||||
|
||||
def test_encrypted_parquet_encryption_configuration():
    """EncryptionConfiguration exposes the same values whether they are
    passed to the constructor or assigned as attributes afterwards."""
    def check(config):
        assert config.footer_key == FOOTER_KEY_NAME
        assert config.column_keys[COL_KEY_NAME] == ["a", "b"]
        assert config.encryption_algorithm == "AES_GCM_CTR_V1"
        assert config.plaintext_footer
        assert not config.double_wrapping
        assert config.cache_lifetime == timedelta(minutes=10.0)
        assert not config.internal_key_material
        assert config.data_key_length_bits == 192

    # Variant 1: everything passed to the constructor
    from_constructor = pe.EncryptionConfiguration(
        footer_key=FOOTER_KEY_NAME,
        column_keys={COL_KEY_NAME: ["a", "b"]},
        encryption_algorithm="AES_GCM_CTR_V1",
        plaintext_footer=True,
        double_wrapping=False,
        cache_lifetime=timedelta(minutes=10.0),
        internal_key_material=False,
        data_key_length_bits=192,
    )
    check(from_constructor)

    # Variant 2: only the footer key passed, the rest assigned afterwards
    from_attributes = pe.EncryptionConfiguration(footer_key=FOOTER_KEY_NAME)
    assignments = [
        ('column_keys', {COL_KEY_NAME: ["a", "b"]}),
        ('encryption_algorithm', "AES_GCM_CTR_V1"),
        ('plaintext_footer', True),
        ('double_wrapping', False),
        ('cache_lifetime', timedelta(minutes=10.0)),
        ('internal_key_material', False),
        ('data_key_length_bits', 192),
    ]
    for attribute, value in assignments:
        setattr(from_attributes, attribute, value)
    check(from_attributes)
|
||||
|
||||
|
||||
def test_encrypted_parquet_decryption_configuration():
    """DecryptionConfiguration.cache_lifetime can be set through the
    constructor or by attribute assignment."""
    ten_minutes = timedelta(minutes=10.0)

    from_constructor = pe.DecryptionConfiguration(
        cache_lifetime=timedelta(minutes=10.0))
    assert from_constructor.cache_lifetime == ten_minutes

    from_attribute = pe.DecryptionConfiguration()
    from_attribute.cache_lifetime = timedelta(minutes=10.0)
    assert from_attribute.cache_lifetime == ten_minutes
|
||||
|
||||
|
||||
def test_encrypted_parquet_kms_configuration():
    """KmsConnectionConfig exposes the same values whether they are passed
    to the constructor or assigned as attributes afterwards."""
    def check(config):
        assert config.kms_instance_id == "Instance1"
        assert config.kms_instance_url == "URL1"
        assert config.key_access_token == "MyToken"
        assert config.custom_kms_conf == {
            "key1": "key_material_1", "key2": "key_material_2"}

    # Variant 1: everything passed to the constructor
    from_constructor = pe.KmsConnectionConfig(
        kms_instance_id="Instance1",
        kms_instance_url="URL1",
        key_access_token="MyToken",
        custom_kms_conf={
            "key1": "key_material_1",
            "key2": "key_material_2",
        })
    check(from_constructor)

    # Variant 2: default construction, values assigned afterwards
    from_attributes = pe.KmsConnectionConfig()
    assignments = [
        ('kms_instance_id', "Instance1"),
        ('kms_instance_url', "URL1"),
        ('key_access_token', "MyToken"),
        ('custom_kms_conf', {
            "key1": "key_material_1",
            "key2": "key_material_2",
        }),
    ]
    for attribute, value in assignments:
        setattr(from_attributes, attribute, value)
    check(from_attributes)
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="Plaintext footer - reading plaintext column subset"
                   " reads encrypted columns too")
def test_encrypted_parquet_write_read_plain_footer_single_wrapping(
        tempdir, data_table):
    """Write an encrypted parquet file with a plaintext footer and single
    wrapping.

    Reading back only the plaintext column is still disabled (see the
    commented-out steps at the end of the test).
    """
    path = tempdir / PARQUET_NAME

    # Footer key for the (plaintext) footer; columns `a` and `b` -> column
    # key; column `c` stays plaintext
    encryption_config = pe.EncryptionConfiguration(
        footer_key=FOOTER_KEY_NAME,
        column_keys={COL_KEY_NAME: ["a", "b"]},
        plaintext_footer=True,
        double_wrapping=False)

    master_keys = {
        FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"),
        COL_KEY_NAME: COL_KEY.decode("UTF-8"),
    }
    kms_connection_config = pe.KmsConnectionConfig(custom_kms_conf=master_keys)

    def make_kms_client(kms_connection_configuration):
        return InMemoryKmsClient(kms_connection_configuration)

    crypto_factory = pe.CryptoFactory(make_kms_client)

    # Write with encryption properties
    write_encrypted_parquet(path, data_table, encryption_config,
                            kms_connection_config, crypto_factory)

    # # Read without decryption properties only the plaintext column
    # result = pq.ParquetFile(path)
    # result_table = result.read(columns='c', use_threads=False)
    # assert table.num_rows == result_table.num_rows
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="External key material not supported yet")
def test_encrypted_parquet_write_external(tempdir, data_table):
    """Write an encrypted parquet file with external key material.

    This is currently not implemented, so the write is expected to raise.
    """
    path = tempdir / PARQUET_NAME

    # Footer key only, with key material kept outside the file
    encryption_config = pe.EncryptionConfiguration(
        footer_key=FOOTER_KEY_NAME,
        column_keys={},
        internal_key_material=False)

    master_keys = {FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8")}
    kms_connection_config = pe.KmsConnectionConfig(custom_kms_conf=master_keys)

    def make_kms_client(kms_connection_configuration):
        return InMemoryKmsClient(kms_connection_configuration)

    crypto_factory = pe.CryptoFactory(make_kms_client)

    # Write with encryption properties
    write_encrypted_parquet(path, data_table, encryption_config,
                            kms_connection_config, crypto_factory)
|
||||
|
||||
|
||||
def test_encrypted_parquet_loop(tempdir, data_table, basic_encryption_config):
    """Write an encrypted parquet file once, then read it back with
    threads enabled 50 times in a row."""
    path = tempdir / PARQUET_NAME

    # Encryption layout (footer key, column key for `a` and `b`, plaintext
    # `c`) comes from basic_encryption_config
    kms_connection_config, crypto_factory = write_encrypted_file(
        path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY,
        basic_encryption_config)
    verify_file_encrypted(path)

    decryption_config = pe.DecryptionConfiguration(
        cache_lifetime=timedelta(minutes=5.0))

    for _ in range(50):
        # Fresh decryption properties on every iteration
        props = crypto_factory.file_decryption_properties(
            kms_connection_config, decryption_config)
        assert props is not None

        parquet_file = pq.ParquetFile(path, decryption_properties=props)
        assert data_table.equals(parquet_file.read(use_threads=True))
|
||||
|
||||
|
||||
def test_read_with_deleted_crypto_factory(tempdir, data_table, basic_encryption_config):
    """
    Decryption properties must stay usable after the crypto factory that
    created them has been deleted.
    """
    path = tempdir / PARQUET_NAME
    kms_connection_config, crypto_factory = write_encrypted_file(
        path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY,
        basic_encryption_config)
    verify_file_encrypted(path)

    # Create the decryption properties first, then drop the factory
    # that produced them.
    lifetime = timedelta(minutes=5.0)
    decryption_config = pe.DecryptionConfiguration(cache_lifetime=lifetime)
    props = crypto_factory.file_decryption_properties(
        kms_connection_config, decryption_config)
    del crypto_factory

    parquet_file = pq.ParquetFile(path, decryption_properties=props)
    assert data_table.equals(parquet_file.read(use_threads=True))
|
||||
|
||||
|
||||
def test_encrypted_parquet_read_table(tempdir, data_table, basic_encryption_config):
    """Write an encrypted parquet file, then read it back with read_table,
    both by file path and by directory."""
    path = tempdir / PARQUET_NAME

    # Write the encrypted file through the shared helper
    kms_connection_config, crypto_factory = write_encrypted_file(
        path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY,
        basic_encryption_config)

    lifetime = timedelta(minutes=5.0)
    decryption_config = pe.DecryptionConfiguration(cache_lifetime=lifetime)
    props = crypto_factory.file_decryption_properties(
        kms_connection_config, decryption_config)

    # Single file, then the directory containing it
    from_file = pq.read_table(path, decryption_properties=props)
    assert data_table.equals(from_file)

    from_directory = pq.read_table(tempdir, decryption_properties=props)
    assert data_table.equals(from_directory)
|
||||
@@ -0,0 +1,816 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import datetime
|
||||
import decimal
|
||||
from collections import OrderedDict
|
||||
import io
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
except ImportError:
|
||||
np = None
|
||||
import pytest
|
||||
|
||||
import pyarrow as pa
|
||||
from pyarrow.tests.parquet.common import _check_roundtrip, make_sample_file
|
||||
from pyarrow.fs import LocalFileSystem
|
||||
from pyarrow.tests import util
|
||||
|
||||
try:
|
||||
import pyarrow.parquet as pq
|
||||
from pyarrow.tests.parquet.common import _write_table
|
||||
except ImportError:
|
||||
pq = None
|
||||
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
import pandas.testing as tm
|
||||
|
||||
from pyarrow.tests.parquet.common import alltypes_sample
|
||||
except ImportError:
|
||||
pd = tm = None
|
||||
|
||||
|
||||
# Marks all of the tests in this module
|
||||
# Ignore these with pytest ... -m 'not parquet'
|
||||
pytestmark = pytest.mark.parquet
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_parquet_metadata_api():
    """Sniff-test the file, schema, row group and column chunk metadata
    objects of a freshly written sample file."""
    df = alltypes_sample(size=10000)
    df = df.reindex(columns=sorted(df.columns))
    df.index = np.random.randint(0, 1000000, size=len(df))

    fileh = make_sample_file(df)
    ncols = len(df.columns)

    # Series of sniff tests
    meta = fileh.metadata
    repr(meta)
    assert meta.num_rows == len(df)
    assert meta.num_columns == ncols + 1  # +1 for index
    assert meta.num_row_groups == 1
    assert meta.format_version == '2.6'
    assert 'parquet-cpp' in meta.created_by
    assert isinstance(meta.serialized_size, int)
    assert isinstance(meta.metadata, dict)

    # Schema
    schema = fileh.schema
    assert meta.schema is schema
    assert len(schema) == ncols + 1  # +1 for index
    repr(schema)

    col = schema[0]
    repr(col)
    assert col.name == df.columns[0]
    assert col.max_definition_level == 1
    assert col.max_repetition_level == 0
    assert col.physical_type == 'BOOLEAN'
    assert col.converted_type == 'NONE'

    # Position 5 relies on the alphabetical column order set up above
    col_float16 = schema[5]
    assert col_float16.logical_type.type == 'FLOAT16'

    with pytest.raises(IndexError):
        schema[ncols + 1]  # +1 for index

    # Negative indices are rejected rather than counted from the end
    with pytest.raises(IndexError):
        schema[-1]

    # Row group
    for rg in range(meta.num_row_groups):
        rg_meta = meta.row_group(rg)
        assert isinstance(rg_meta, pq.RowGroupMetaData)
        repr(rg_meta)

        for col_index in range(rg_meta.num_columns):
            col_meta = rg_meta.column(col_index)
            assert isinstance(col_meta, pq.ColumnChunkMetaData)
            repr(col_meta)

    with pytest.raises(IndexError):
        meta.row_group(-1)

    with pytest.raises(IndexError):
        meta.row_group(meta.num_row_groups + 1)

    rg_meta = meta.row_group(0)
    assert rg_meta.num_rows == len(df)
    assert rg_meta.num_columns == ncols + 1  # +1 for index
    assert rg_meta.total_byte_size > 0

    with pytest.raises(IndexError):
        rg_meta.column(-1)

    with pytest.raises(IndexError):
        rg_meta.column(ncols + 2)

    col_meta = rg_meta.column(0)
    assert col_meta.file_offset == 0
    assert col_meta.file_path == ''  # created from BytesIO
    assert col_meta.physical_type == 'BOOLEAN'
    assert col_meta.num_values == 10000
    assert col_meta.path_in_schema == 'bool'
    assert col_meta.is_stats_set is True
    assert isinstance(col_meta.statistics, pq.Statistics)
    assert col_meta.compression == 'SNAPPY'
    assert set(col_meta.encodings) == {'PLAIN', 'RLE'}
    assert col_meta.has_dictionary_page is False
    assert col_meta.dictionary_page_offset is None
    assert col_meta.data_page_offset > 0
    assert col_meta.total_compressed_size > 0
    assert col_meta.total_uncompressed_size > 0
    with pytest.raises(NotImplementedError):
        col_meta.has_index_page
    with pytest.raises(NotImplementedError):
        col_meta.index_page_offset
|
||||
|
||||
|
||||
def test_parquet_metadata_lifetime(tempdir):
    """Chained metadata access must keep the parent objects alive
    (ARROW-6642)."""
    path = tempdir / 'test_metadata_segfault.parquet'
    pq.write_table(pa.table({'a': [1, 2, 3]}), path)

    parquet_file = pq.ParquetFile(tempdir / 'test_metadata_segfault.parquet')
    # The chain is evaluated only to check that it does not crash
    parquet_file.metadata.row_group(0).column(0).statistics
|
||||
|
||||
|
||||
@pytest.mark.pandas
@pytest.mark.parametrize(
    (
        'data',
        'type',
        'physical_type',
        'min_value',
        'max_value',
        'null_count',
        'num_values',
        'distinct_count'
    ),
    [
        # Unsigned integers
        ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, None),
        ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, None),
        ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, None),
        ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, None),
        # Signed integers
        ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, None),
        ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, None),
        ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, None),
        ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, None),
        # Floating point
        ([-1.1, 2.2, 2.3, None, 4.4], pa.float32(),
         'FLOAT', -1.1, 4.4, 1, 4, None),
        ([-1.1, 2.2, 2.3, None, 4.4], pa.float64(),
         'DOUBLE', -1.1, 4.4, 1, 4, None),
        # Strings stored as binary
        (['', 'b', chr(1000), None, 'aaa'], pa.binary(),
         'BYTE_ARRAY', b'', chr(1000).encode('utf-8'), 1, 4, None),
        # Booleans (no nulls)
        ([True, False, False, True, True], pa.bool_(),
         'BOOLEAN', False, True, 0, 5, None),
        # Raw bytes
        ([b'\x00', b'b', b'12', None, b'aaa'], pa.binary(),
         'BYTE_ARRAY', b'\x00', b'b', 1, 4, None),
    ]
)
def test_parquet_column_statistics_api(data, type, physical_type, min_value,
                                       max_value, null_count, num_values,
                                       distinct_count):
    """Check the statistics of a single-column file against expected values."""
    frame = pd.DataFrame({'data': data})
    column_schema = pa.schema([pa.field('data', type)])
    table = pa.Table.from_pandas(frame, schema=column_schema, safe=False)

    stat = make_sample_file(table).metadata.row_group(0).column(0).statistics

    assert stat.has_min_max
    assert _close(type, stat.min, min_value)
    assert _close(type, stat.max, max_value)
    assert stat.null_count == null_count
    assert stat.num_values == num_values
    # TODO(kszucs): parquet-cpp does not expose a HasDistinctCount method;
    # every case above expects None for a missing distinct_count
    assert stat.distinct_count == distinct_count
    assert stat.physical_type == physical_type
|
||||
|
||||
|
||||
def _close(type, left, right):
    """Compare two statistics values, tolerating float rounding.

    For float32 and float64 Arrow types the values match when their absolute
    difference is below a fixed tolerance (1E-7 and 1E-13 respectively); for
    any other type plain equality is used.
    """
    if type == pa.float32():
        tolerance = 1E-7
    elif type == pa.float64():
        tolerance = 1E-13
    else:
        return left == right
    return abs(left - right) < tolerance
|
||||
|
||||
|
||||
# ARROW-6339
@pytest.mark.pandas
def test_parquet_raise_on_unset_statistics():
    """A column holding only NaT reports no min/max and a max of None."""
    nat_frame = pd.DataFrame({"t": pd.Series([pd.NaT], dtype="datetime64[ns]")})
    table = pa.Table.from_pandas(nat_frame)
    file_meta = make_sample_file(table).metadata

    # The statistics object is fetched afresh for each check.
    assert not file_meta.row_group(0).column(0).statistics.has_min_max
    assert file_meta.row_group(0).column(0).statistics.max is None
|
||||
|
||||
|
||||
def test_statistics_convert_logical_types(tempdir):
    """Statistics min/max compare equal to the Python values written.

    Covers ARROW-5166 and ARROW-4139. Each case writes a two-element column
    (min, max) of the given Arrow type with format version 2.6 and reads the
    bounds back from the column chunk statistics.
    """
    time_lo = datetime.time(10, 30, 0, 1000)
    time_hi = datetime.time(15, 30, 0, 1000)
    stamp_lo = datetime.datetime(2019, 6, 24, 0, 0, 0, 1000)
    stamp_hi = datetime.datetime(2019, 6, 25, 0, 0, 0, 1000)

    # (min, max, type)
    cases = [
        (10, 11164359321221007157, pa.uint64()),
        (10, 4294967295, pa.uint32()),
        ("ähnlich", "öffentlich", pa.utf8()),
        (time_lo, time_hi, pa.time32('ms')),
        (time_lo, time_hi, pa.time64('us')),
        (stamp_lo, stamp_hi, pa.timestamp('ms')),
        (stamp_lo, stamp_hi, pa.timestamp('us')),
        (datetime.date(2019, 6, 24), datetime.date(2019, 6, 25), pa.date32()),
        (
            decimal.Decimal("20.123"), decimal.Decimal("20.124"),
            pa.decimal128(12, 5)
        ),
    ]

    for idx, (lower, upper, arrow_type) in enumerate(cases):
        column = pa.array([lower, upper], type=arrow_type)
        table = pa.Table.from_arrays([column], ['col'])
        # One file per case so the cases do not overwrite each other.
        path = str(tempdir / f'example{idx}.parquet')
        pq.write_table(table, path, version='2.6')

        parquet_file = pq.ParquetFile(path)
        stats = parquet_file.metadata.row_group(0).column(0).statistics
        assert stats.min == lower
        assert stats.max == upper
|
||||
|
||||
|
||||
def test_parquet_write_disable_statistics(tempdir):
    """Exercise write_statistics left at its default, False and a column list."""
    columns = OrderedDict([
        ('a', pa.array([1, 2, 3])),
        ('b', pa.array(['a', 'b', 'c']))
    ])
    table = pa.Table.from_pydict(columns)

    # No write_statistics argument: both columns carry statistics.
    default_path = tempdir / 'data.parquet'
    _write_table(table, default_path)
    file_meta = pq.read_metadata(default_path)
    for index in range(2):
        chunk = file_meta.row_group(0).column(index)
        assert chunk.is_stats_set is True
        assert chunk.statistics is not None

    # write_statistics=False: neither column carries statistics.
    disabled_path = tempdir / 'data2.parquet'
    _write_table(table, disabled_path, write_statistics=False)
    file_meta = pq.read_metadata(disabled_path)
    for index in range(2):
        chunk = file_meta.row_group(0).column(index)
        assert chunk.is_stats_set is False
        assert chunk.statistics is None

    # write_statistics=['a']: only column 'a' carries statistics.
    selective_path = tempdir / 'data3.parquet'
    _write_table(table, selective_path, write_statistics=['a'])
    row_group = pq.read_metadata(selective_path).row_group(0)
    chunk_a = row_group.column(0)
    chunk_b = row_group.column(1)
    assert chunk_a.is_stats_set is True
    assert chunk_b.is_stats_set is False
    assert chunk_a.statistics is not None
    assert chunk_b.statistics is None
|
||||
|
||||
|
||||
def test_parquet_sorting_column():
    """Exercise SortingColumn.to_dict and the to_ordering/from_ordering pair.

    Covers the default and explicit constructor arguments, a round trip
    between sorting columns and (sort_order, null_placement), empty inputs,
    and the two error cases (invalid sort order string, inconsistent null
    placement).
    """
    sorting_col = pq.SortingColumn(10)
    assert sorting_col.to_dict() == {
        'column_index': 10,
        'descending': False,
        'nulls_first': False
    }

    sorting_col = pq.SortingColumn(0, descending=True, nulls_first=True)
    assert sorting_col.to_dict() == {
        'column_index': 0,
        'descending': True,
        'nulls_first': True
    }

    schema = pa.schema([('a', pa.int64()), ('b', pa.int64())])
    sorting_cols = (
        pq.SortingColumn(1, descending=True),
        pq.SortingColumn(0, descending=False),
    )
    sort_order, null_placement = pq.SortingColumn.to_ordering(schema, sorting_cols)
    assert sort_order == (('b', "descending"), ('a', "ascending"))
    assert null_placement == "at_end"

    sorting_cols_roundtripped = pq.SortingColumn.from_ordering(
        schema, sort_order, null_placement)
    assert sorting_cols_roundtripped == sorting_cols

    # A bare column name and a (name, order) pair can be mixed.
    sorting_cols = pq.SortingColumn.from_ordering(
        schema, ('a', ('b', "descending")), null_placement="at_start")
    expected = (
        pq.SortingColumn(0, descending=False, nulls_first=True),
        pq.SortingColumn(1, descending=True, nulls_first=True),
    )
    assert sorting_cols == expected

    # Conversions handle empty tuples
    empty_sorting_cols = pq.SortingColumn.from_ordering(schema, ())
    assert empty_sorting_cols == ()

    assert pq.SortingColumn.to_ordering(schema, ()) == ((), "at_end")

    # The trailing comma matters: without it the outer parentheses are
    # redundant and a flat 2-tuple of strings is passed instead of a
    # one-element tuple holding a (name, order) pair, so the call would
    # presumably be rejected for a different reason than the invalid order.
    invalid_order = (("a", "not a valid sort order"),)
    with pytest.raises(ValueError):
        pq.SortingColumn.from_ordering(schema, invalid_order)

    # Build the input outside the raises block so only the call under test
    # can satisfy the expectation.
    mixed_null_placement = (
        pq.SortingColumn(1, nulls_first=True),
        pq.SortingColumn(0, nulls_first=False),
    )
    with pytest.raises(ValueError, match="inconsistent null placement"):
        pq.SortingColumn.to_ordering(schema, mixed_null_placement)
|
||||
|
||||
|
||||
def test_parquet_sorting_column_nested():
    """to_ordering resolves column indices to dotted names for nested fields."""
    nested_schema = pa.schema({
        'a': pa.struct([('x', pa.int64()), ('y', pa.int64())]),
        'b': pa.int64()
    })

    by_index = [
        pq.SortingColumn(0, descending=True),  # a.x
        pq.SortingColumn(2, descending=False)  # b
    ]

    ordering, placement = pq.SortingColumn.to_ordering(nested_schema, by_index)

    assert placement == "at_end"
    assert len(ordering) == 2
    first_key, second_key = ordering[0], ordering[1]
    assert first_key == ("a.x", "descending")
    assert second_key == ("b", "ascending")
|
||||
|
||||
|
||||
def test_parquet_file_sorting_columns():
    """Sorting columns passed to the writer are readable from the metadata."""
    table = pa.table({'a': [1, 2, 3], 'b': ['a', 'b', 'c']})
    requested = (
        pq.SortingColumn(column_index=0, descending=True, nulls_first=True),
        pq.SortingColumn(column_index=1, descending=False),
    )

    # Write to and read back from an in-memory buffer.
    sink = pa.BufferOutputStream()
    _write_table(table, sink, sorting_columns=requested)
    source = pa.BufferReader(sink.getvalue())

    file_meta = pq.read_metadata(source)
    assert requested == file_meta.row_group(0).sorting_columns

    # The dict form still reports the basic file shape.
    as_dict = file_meta.to_dict()
    assert as_dict.get('num_columns') == 2
    assert as_dict.get('num_rows') == 3
    assert as_dict.get('num_row_groups') == 1
|
||||
|
||||
|
||||
def test_field_id_metadata():
    """Field metadata, including PARQUET:field_id, survives a round trip.

    Regression test for ARROW-7080. Covers top-level, list item and nested
    struct fields, a field without metadata, and field ids that are not
    non-negative integers.
    """
    id_key = b'PARQUET:field_id'

    innermost = pa.field('inner', pa.int32(), metadata={id_key: b'100'})
    middle = pa.field('middle', pa.struct([innermost]), metadata={id_key: b'101'})
    list_item = pa.field('list-inner', pa.int32(), metadata={id_key: b'10'})
    fields = [
        pa.field('basic', pa.int32(), metadata={b'other': b'abc', id_key: b'1'}),
        pa.field('list', pa.list_(list_item), metadata={id_key: b'11'}),
        pa.field('struct', pa.struct([middle]), metadata={id_key: b'102'}),
        pa.field('no-metadata', pa.int32()),
        pa.field('non-integral-field-id', pa.int32(),
                 metadata={id_key: b'xyz'}),
        pa.field('negative-field-id', pa.int32(),
                 metadata={id_key: b'-1000'})
    ]
    # An empty table is enough: only the schema is under test.
    empty_columns = [[] for _ in fields]
    table = pa.table(empty_columns, schema=pa.schema(fields))

    sink = pa.BufferOutputStream()
    pq.write_table(table, sink)
    parquet_file = pq.ParquetFile(pa.BufferReader(sink.getvalue()))
    read_schema = parquet_file.schema_arrow

    basic = read_schema[0]
    assert basic.metadata[id_key] == b'1'
    assert basic.metadata[b'other'] == b'abc'

    read_list = read_schema[1]
    assert read_list.metadata[id_key] == b'11'
    assert read_list.type.value_field.metadata[id_key] == b'10'

    read_struct = read_schema[2]
    assert read_struct.metadata[id_key] == b'102'

    read_middle = read_struct.type[0]
    assert read_middle.metadata[id_key] == b'101'

    read_inner = read_middle.type[0]
    assert read_inner.metadata[id_key] == b'100'

    assert read_schema[3].metadata is None
    # Invalid input is passed through (ok) but does not
    # have field_id in parquet (not tested)
    assert read_schema[4].metadata[id_key] == b'xyz'
    assert read_schema[5].metadata[id_key] == b'-1000'
|
||||
|
||||
|
||||
def test_parquet_file_page_index():
    """has_offset_index/has_column_index mirror the write_page_index flag."""
    for page_index_enabled in (False, True):
        table = pa.table({'a': [1, 2, 3]})

        sink = pa.BufferOutputStream()
        _write_table(table, sink, write_page_index=page_index_enabled)
        source = pa.BufferReader(sink.getvalue())

        # Inspect the only column chunk of the only row group.
        chunk = pq.read_metadata(source).row_group(0).column(0)
        assert chunk.has_offset_index is page_index_enabled
        assert chunk.has_column_index is page_index_enabled
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_multi_dataset_metadata(tempdir):
    """Merge the metadata of two written files into one `_metadata` file.

    The same table is written twice; the collected FileMetaData objects are
    merged with append_row_groups, written out as a metadata-only file and
    read back for comparison.
    """
    filenames = ["ARROW-1983-dataset.0", "ARROW-1983-dataset.1"]
    metapath = str(tempdir / "_metadata")

    # create a test dataset
    frame = pd.DataFrame({
        'one': [1, 2, 3],
        'two': [-1, -2, -3],
        'three': [[1, 2], [2, 3], [3, 4]],
    })
    table = pa.Table.from_pandas(frame)

    # write dataset twice and collect/merge metadata
    merged = None
    for filename in filenames:
        collected = []
        pq.write_table(table, str(tempdir / filename),
                       metadata_collector=collected)
        file_meta = collected[0]
        file_meta.set_file_path(filename)
        if merged is not None:
            merged.append_row_groups(file_meta)
        else:
            merged = file_meta

    # Write merged metadata-only file
    with open(metapath, "wb") as sink:
        merged.write_metadata_file(sink)

    # Read back the metadata
    read_back = pq.read_metadata(metapath).to_dict()
    in_memory = merged.to_dict()

    # Everything but the serialized size must agree between the in-memory
    # object and the one read back from disk.
    for key, value in in_memory.items():
        if key == 'serialized_size':
            continue
        assert value == read_back[key]

    assert in_memory['num_columns'] == 3
    assert in_memory['num_rows'] == 6
    assert in_memory['num_row_groups'] == 2
    assert in_memory['serialized_size'] == 0
    assert read_back['serialized_size'] > 0
|
||||
|
||||
|
||||
def test_metadata_hashing(tempdir):
    """hash() of file metadata is stable and reflects the schema."""

    def written_metadata(name, fields):
        # Write a metadata-only file for the schema and read it back.
        location = str(tempdir / name)
        pq.write_metadata(pa.schema(fields), location)
        return pq.read_metadata(location)

    first = written_metadata("metadata1", [("a", "int64"), ("b", "float64")])
    # Same as the first, just a different path
    same_schema = written_metadata("metadata2", [("a", "int64"), ("b", "float64")])
    # different schema
    other_schema = written_metadata("metadata3", [("a", "int64"), ("b", "float32")])

    # Deterministic
    assert hash(first) == hash(first)  # equal w/ same instance
    assert hash(first) == hash(same_schema)  # equal w/ different instance

    # Not the same as other metadata with different schema
    assert hash(first) != hash(other_schema)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:Parquet format:FutureWarning")
def test_write_metadata(tempdir):
    """Exercise pq.write_metadata with a schema, writer kwargs and a collector."""
    path = str(tempdir / "metadata")
    schema = pa.schema([("a", "int64"), ("b", "float64")])

    # write a pyarrow schema
    pq.write_metadata(schema, path)
    roundtripped_schema = pq.read_metadata(path).schema.to_arrow_schema()
    assert roundtripped_schema.equals(schema)

    # ARROW-8980: Check that the ARROW:schema metadata key was removed
    if roundtripped_schema.metadata:
        assert b'ARROW:schema' not in roundtripped_schema.metadata

    # pass through writer keyword arguments
    # The version is stored as a single integer in the Parquet metadata,
    # so it cannot correctly express dotted format versions
    expected_by_version = {"1.0": "1.0", "2.4": "2.6", "2.6": "2.6"}
    for version, expected_version in expected_by_version.items():
        pq.write_metadata(schema, path, version=version)
        assert pq.read_metadata(path).format_version == expected_version

    # metadata_collector: list of FileMetaData objects
    data_path = tempdir / "data.parquet"
    table = pa.table({'a': [1, 2], 'b': [0.1, 0.2]}, schema=schema)
    pq.write_table(table, data_path)
    data_meta = pq.read_metadata(str(data_path))
    collector = [data_meta, data_meta]
    pq.write_metadata(schema, path, metadata_collector=collector)
    assert pq.read_metadata(path).num_row_groups == 2

    # append metadata with different schema raises an error
    msg = ("AppendRowGroups requires equal schemas.\n"
           "The two columns with index 0 differ.")
    mismatched_schema = pa.schema([("a", "int32"), ("b", "null")])
    with pytest.raises(RuntimeError, match=msg):
        pq.write_metadata(
            mismatched_schema, path, metadata_collector=[data_meta, data_meta]
        )
|
||||
|
||||
|
||||
def test_table_large_metadata():
    """Round-trip a table whose schema metadata value is 10,000,000 chars.

    Regression test for ARROW-8694.
    """
    large_value = 'x' * 10000000
    schema_with_large_metadata = pa.schema(
        [pa.field('f0', 'double')], metadata={'large': large_value})

    _check_roundtrip(pa.table([range(10)], schema=schema_with_large_metadata))
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_compare_schemas():
    """Check equals(), == and != on ParquetSchema and ColumnSchema."""
    frame = alltypes_sample(size=10000)

    base = make_sample_file(frame)
    twin = make_sample_file(frame)
    # Every second column only, so this schema differs from the others.
    subset = make_sample_file(frame[frame.columns[::2]])

    # ParquetSchema
    assert isinstance(base.schema, pq.ParquetSchema)
    assert base.schema.equals(base.schema)
    assert base.schema == base.schema
    assert base.schema.equals(twin.schema)
    assert base.schema == twin.schema
    assert base.schema != 'arbitrary object'
    assert not base.schema.equals(subset.schema)
    assert base.schema != subset.schema

    # ColumnSchema
    assert isinstance(base.schema[0], pq.ColumnSchema)
    assert base.schema[0].equals(base.schema[0])
    assert base.schema[0] == base.schema[0]
    assert not base.schema[0].equals(base.schema[1])
    assert base.schema[0] != base.schema[1]
    assert base.schema[0] != 'arbitrary object'
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_read_schema(tempdir):
    """pq.read_schema returns the written schema, with and without memory_map."""
    num_rows = 100
    frame = pd.DataFrame(
        {'index': np.arange(num_rows), 'values': np.random.randn(num_rows)},
        columns=['index', 'values'],
    )
    table = pa.Table.from_pandas(frame)

    data_path = tempdir / 'test.parquet'
    _write_table(table, data_path)

    plain_read = pq.read_schema(data_path)
    mapped_read = pq.read_schema(data_path, memory_map=True)
    assert table.schema.equals(plain_read)
    assert table.schema.equals(mapped_read)

    # The pandas metadata blob is preserved as well.
    assert table.schema.metadata[b'pandas'] == plain_read.metadata[b'pandas']
|
||||
|
||||
|
||||
def test_parquet_metadata_empty_to_dict(tempdir):
    """to_dict() works for an empty table and reports statistics as None.

    See https://issues.apache.org/jira/browse/ARROW-10146
    """
    data_path = tempdir / "data.parquet"
    empty_table = pa.table({"a": pa.array([], type="int64")})
    pq.write_table(empty_table, data_path)

    # ensure this doesn't error / statistics set to None
    as_dict = pq.read_metadata(data_path).to_dict()

    row_groups = as_dict["row_groups"]
    assert len(row_groups) == 1
    assert len(as_dict["row_groups"][0]["columns"]) == 1
    assert as_dict["row_groups"][0]["columns"][0]["statistics"] is None
|
||||
|
||||
|
||||
@pytest.mark.slow
@pytest.mark.large_memory
def test_metadata_exceeds_message_size():
    """Write and re-read a very large metadata-only file.

    ARROW-13655: Thrift may enable a default message size that limits
    the size of Parquet metadata that can be written.
    """
    NCOLS = 1000
    NREPEATS = 4000

    wide_table = pa.table({str(i): np.random.randn(10) for i in range(NCOLS)})

    with pa.BufferOutputStream() as sink:
        pq.write_table(wide_table, sink)
        file_bytes = sink.getvalue()

    # Two independent metadata objects: one grows, the other is the unit
    # that gets appended on every iteration.
    single = pq.read_metadata(pa.BufferReader(file_bytes))
    grown = pq.read_metadata(pa.BufferReader(file_bytes))
    for _ in range(NREPEATS):
        grown.append_row_groups(single)

    with pa.BufferOutputStream() as sink:
        grown.write_metadata_file(sink)
        metadata_bytes = sink.getvalue()

    # Reading the result back completing without error is the whole check.
    grown = pq.read_metadata(pa.BufferReader(metadata_bytes))
|
||||
|
||||
|
||||
def test_metadata_schema_filesystem(tempdir):
    """read_metadata/read_schema accept URIs and explicit filesystems."""
    table = pa.table({"a": [1, 2, 3]})

    # URI writing to local file.
    fname = "data.parquet"
    file_path = str(tempdir / fname)
    file_uri = 'file:///' + file_path
    directory_uri = f'file:///{tempdir}'

    pq.write_table(table, file_path)

    # Get expected `metadata` from path.
    expected_metadata = pq.read_metadata(tempdir / fname)
    expected_schema = table.schema

    # Metadata: file URI, explicit filesystem object, filesystem given as URI.
    assert pq.read_metadata(file_uri).equals(expected_metadata)
    local_read = pq.read_metadata(file_path, filesystem=LocalFileSystem())
    assert local_read.equals(expected_metadata)
    uri_fs_read = pq.read_metadata(fname, filesystem=directory_uri)
    assert uri_fs_read.equals(expected_metadata)

    # Schema: the same three ways.
    assert pq.read_schema(file_uri).equals(expected_schema)
    local_read = pq.read_schema(file_path, filesystem=LocalFileSystem())
    assert local_read.equals(expected_schema)
    uri_fs_read = pq.read_schema(fname, filesystem=directory_uri)
    assert uri_fs_read.equals(expected_schema)

    with util.change_cwd(tempdir):
        # Pass `filesystem` arg
        relative_read = pq.read_metadata(fname, filesystem=LocalFileSystem())
        assert relative_read.equals(expected_metadata)

        relative_read = pq.read_schema(fname, filesystem=LocalFileSystem())
        assert relative_read.equals(expected_schema)
|
||||
|
||||
|
||||
def test_metadata_equals():
    """FileMetaData.equals(None) raises TypeError."""
    with pa.BufferOutputStream() as sink:
        pq.write_table(pa.table({"a": [1, 2, 3]}), sink)
        file_bytes = sink.getvalue()

    file_meta = pq.read_metadata(pa.BufferReader(file_bytes))
    expected_message = "Argument 'other' has incorrect type"
    with pytest.raises(TypeError, match=expected_message):
        file_meta.equals(None)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("t1,t2,expected_error", (
    ({'col1': range(10)}, {'col1': range(10)}, None),
    ({'col1': range(10)}, {'col2': range(10)},
     "The two columns with index 0 differ."),
    ({'col1': range(10), 'col2': range(10)}, {'col3': range(10)},
     "This schema has 2 columns, other has 1")
))
def test_metadata_append_row_groups_diff(t1, t2, expected_error):
    """append_row_groups succeeds for equal schemas and explains mismatches.

    ``expected_error`` is None when the append must succeed; otherwise it is
    the part of the RuntimeError message that follows the common prefix.
    """
    left_sink = io.BytesIO()
    right_sink = io.BytesIO()
    pq.write_table(pa.table(t1), left_sink)
    pq.write_table(pa.table(t2), right_sink)
    # Rewind so the files can be read back from the same buffers.
    left_sink.seek(0)
    right_sink.seek(0)

    left_meta = pq.ParquetFile(left_sink).metadata
    right_meta = pq.ParquetFile(right_sink).metadata

    if not expected_error:
        left_meta.append_row_groups(right_meta)
        return

    # Error clearly defines it's happening at append row groups call
    prefix = "AppendRowGroups requires equal schemas.\n"
    with pytest.raises(RuntimeError, match=prefix + expected_error):
        left_meta.append_row_groups(right_meta)
|
||||
|
||||
|
||||
@pytest.mark.s3
def test_write_metadata_fs_file_combinations(tempdir, s3_example_s3fs):
    """write_metadata yields identical bytes for every kind of destination."""
    s3_fs, s3_path = s3_example_s3fs

    meta1 = tempdir / "meta1"
    meta2 = tempdir / "meta2"
    meta3 = tempdir / "meta3"
    meta4 = tempdir / "meta4"
    meta5 = f"{s3_path}/meta5"

    schema = pa.table({"col": range(5)}).schema

    # plain local path
    pq.write_metadata(schema, meta1, [])

    # Used the localfilesystem to resolve opening an output stream
    pq.write_metadata(schema, meta2, [], filesystem=LocalFileSystem())

    # Can resolve local file URI
    pq.write_metadata(schema, meta3.as_uri(), [])

    # Take a file-like obj all the way thru?
    with meta4.open('wb+') as meta4_stream:
        pq.write_metadata(schema, meta4_stream, [])

    # S3FileSystem
    pq.write_metadata(schema, meta5, [], filesystem=s3_fs)

    # Every destination must hold exactly the same payload.
    payloads = [local.read_bytes() for local in (meta1, meta2, meta3, meta4)]
    payloads.append(s3_fs.open(meta5).read())
    reference = payloads[0]
    assert all(payload == reference for payload in payloads[1:])
|
||||
|
||||
|
||||
def test_column_chunk_key_value_metadata(parquet_test_datadir):
    """Column chunk key/value metadata is a dict, or None when absent."""
    sample = parquet_test_datadir / 'column_chunk_key_value_metadata.parquet'
    row_group = pq.read_metadata(sample).row_group(0)

    with_metadata = row_group.column(0).metadata
    assert with_metadata == {b'foo': b'bar', b'thisiskeywithoutvalue': b''}

    without_metadata = row_group.column(1).metadata
    assert without_metadata is None
|
||||
|
||||
|
||||
def test_internal_class_instantiation():
    """Internal metadata classes raise TypeError when constructed directly."""
    def msg(c):
        return f"Do not call {c}'s constructor directly"

    internal_class_names = (
        "Statistics",
        "ParquetLogicalType",
        "ColumnChunkMetaData",
        "RowGroupMetaData",
        "FileMetaData",
    )
    for class_name in internal_class_names:
        with pytest.raises(TypeError, match=msg(class_name)):
            getattr(pq, class_name)()
|
||||
@@ -0,0 +1,655 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import io
|
||||
import json
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
except ImportError:
|
||||
np = None
|
||||
import pytest
|
||||
|
||||
import pyarrow as pa
|
||||
from pyarrow.fs import LocalFileSystem, SubTreeFileSystem
|
||||
from pyarrow.util import guid
|
||||
from pyarrow.vendored.version import Version
|
||||
|
||||
try:
|
||||
import pyarrow.parquet as pq
|
||||
from pyarrow.tests.parquet.common import (_read_table, _test_dataframe,
|
||||
_write_table)
|
||||
except ImportError:
|
||||
pq = None
|
||||
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
import pandas.testing as tm
|
||||
|
||||
from pyarrow.tests.parquet.common import (_roundtrip_pandas_dataframe,
|
||||
alltypes_sample)
|
||||
except ImportError:
|
||||
pd = tm = None
|
||||
|
||||
|
||||
# Marks all of the tests in this module
|
||||
# Ignore these with pytest ... -m 'not parquet'
|
||||
pytestmark = pytest.mark.parquet
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_pandas_parquet_custom_metadata(tempdir):
    """The b'pandas' metadata key is written to the file and describes the index."""
    frame = alltypes_sample(size=10000)
    table = pa.Table.from_pandas(frame)
    assert b'pandas' in table.schema.metadata

    target = tempdir / 'pandas_roundtrip.parquet'
    _write_table(table, target)

    file_key_values = pq.read_metadata(target).metadata
    assert b'pandas' in file_key_values

    # The default index is recorded as a range descriptor.
    pandas_info = json.loads(file_key_values[b'pandas'].decode('utf8'))
    expected_index = {
        'kind': 'range',
        'name': None,
        'start': 0, 'stop': 10000,
        'step': 1,
    }
    assert pandas_info['index_columns'] == [expected_index]
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_merging_parquet_tables_with_different_pandas_metadata(tempdir):
    """Write two tables differing only in pandas metadata through one writer.

    ARROW-3728: Merging Parquet Files - Pandas Meta in Schema Mismatch
    """
    schema = pa.schema([
        pa.field('int', pa.int16()),
        pa.field('float', pa.float32()),
        pa.field('string', pa.string())
    ])
    df1 = pd.DataFrame({
        'int': np.arange(3, dtype=np.uint8),
        'float': np.arange(3, dtype=np.float32),
        'string': ['ABBA', 'EDDA', 'ACDC']
    })
    df2 = pd.DataFrame({
        'int': [4, 5],
        'float': [1.1, None],
        'string': [None, None]
    })
    table1 = pa.Table.from_pandas(df1, schema=schema, preserve_index=False)
    table2 = pa.Table.from_pandas(df2, schema=schema, preserve_index=False)

    # The schemas agree on fields and differ only once metadata is compared.
    assert not table1.schema.equals(table2.schema, check_metadata=True)
    assert table1.schema.equals(table2.schema)

    # Use the writer as a context manager so it is closed (and the file
    # finalized) even if one of the writes fails.
    with pq.ParquetWriter(tempdir / 'merged.parquet', schema=schema) as writer:
        writer.write_table(table1)
        writer.write_table(table2)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_pandas_parquet_column_multiindex(tempdir):
    """A DataFrame with two-level MultiIndex columns survives a round trip."""
    frame = alltypes_sample(size=10)
    # Pair each column name with the name at the mirrored position.
    level_pairs = list(zip(frame.columns, frame.columns[::-1]))
    frame.columns = pd.MultiIndex.from_tuples(
        level_pairs, names=['level_1', 'level_2'])

    table = pa.Table.from_pandas(frame)
    assert table.schema.pandas_metadata is not None

    target = tempdir / 'pandas_roundtrip.parquet'
    _write_table(table, target)

    restored = pq.read_pandas(target).to_pandas()
    tm.assert_frame_equal(frame, restored)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_pandas_parquet_2_roundtrip_read_pandas_no_index_written(tempdir):
    """Round trip through read_pandas when the index was not preserved."""
    frame = alltypes_sample(size=10000)
    table = pa.Table.from_pandas(frame, preserve_index=False)

    written_info = table.schema.pandas_metadata
    assert not written_info['index_columns']
    # ARROW-2170
    # While index_columns should be empty, columns needs to be filled still.
    assert written_info['columns']

    target = tempdir / 'pandas_roundtrip.parquet'
    _write_table(table, target)
    table_read = pq.read_pandas(target)

    read_info = table_read.schema.pandas_metadata
    assert not read_info['index_columns']

    # The raw schema metadata is unchanged by the round trip.
    assert table.schema.metadata == table_read.schema.metadata

    tm.assert_frame_equal(frame, table_read.to_pandas())
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_pandas_parquet_native_file_roundtrip():
    """Round-trip a DataFrame through in-memory Arrow output/input streams."""
    frame = _test_dataframe(10000)

    sink = pa.BufferOutputStream()
    _write_table(pa.Table.from_pandas(frame), sink, version='2.6')

    source = pa.BufferReader(sink.getvalue())
    restored = _read_table(source).to_pandas()
    tm.assert_frame_equal(frame, restored)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_read_pandas_column_subset():
    """read_pandas with `columns` returns just the requested columns."""
    frame = _test_dataframe(10000)

    sink = pa.BufferOutputStream()
    _write_table(pa.Table.from_pandas(frame), sink, version='2.6')
    source = pa.BufferReader(sink.getvalue())

    subset = pq.read_pandas(source, columns=['strings', 'uint8']).to_pandas()
    tm.assert_frame_equal(frame[['strings', 'uint8']], subset)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_pandas_parquet_empty_roundtrip():
    """Round-trip the size-0 sample DataFrame through an in-memory buffer."""
    empty_frame = _test_dataframe(0)

    sink = pa.BufferOutputStream()
    _write_table(pa.Table.from_pandas(empty_frame), sink, version='2.6')

    source = pa.BufferReader(sink.getvalue())
    restored = _read_table(source).to_pandas()
    tm.assert_frame_equal(empty_frame, restored)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_pandas_can_write_nested_data():
    """Writing a column of dicts with differing keys must not raise."""
    nested_records = [
        {"page_type": 1},
        {"record_type": 1},
        {"non_consecutive_home": 0},
    ]
    frame = pd.DataFrame(data={"agg_col": nested_records, "uid_first": "1001"})
    table = pa.Table.from_pandas(frame)

    sink = pa.BufferOutputStream()
    # This succeeds under V2
    _write_table(table, sink)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_pandas_parquet_pyfile_roundtrip(tempdir):
    """Write through a Python file object and read back from a BytesIO."""
    target = tempdir / 'pandas_pyfile_roundtrip.parquet'
    num_rows = 5
    frame = pd.DataFrame({
        'int64': np.arange(num_rows, dtype=np.int64),
        'float32': np.arange(num_rows, dtype=np.float32),
        'float64': np.arange(num_rows, dtype=np.float64),
        'bool': np.random.randn(num_rows) > 0,
        'strings': ['foo', 'bar', None, 'baz', 'qux']
    })
    table = pa.Table.from_pandas(frame)

    with target.open('wb') as sink:
        _write_table(table, sink, version="2.6")

    # Read from an in-memory copy of the bytes written to disk.
    in_memory = io.BytesIO(target.read_bytes())
    restored = _read_table(in_memory).to_pandas()
    tm.assert_frame_equal(frame, restored)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_pandas_parquet_configuration_options(tempdir):
    """Round-trip a numeric DataFrame under various writer options.

    Exercises use_dictionary, write_statistics and every available
    compression codec; each write uses format version 2.6 and must read back
    a DataFrame equal to the input.
    """
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        # Was np.int16, which left the int8 type untested.
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0
    })
    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)

    def assert_roundtrip(**write_options):
        # Write with the given options, read back and compare to the input.
        _write_table(arrow_table, filename, version='2.6', **write_options)
        df_read = _read_table(filename).to_pandas()
        tm.assert_frame_equal(df, df_read)

    for use_dictionary in [True, False]:
        assert_roundtrip(use_dictionary=use_dictionary)

    for write_statistics in [True, False]:
        assert_roundtrip(write_statistics=write_statistics)

    for compression in ['NONE', 'SNAPPY', 'GZIP', 'LZ4', 'ZSTD']:
        # Skip codecs this build does not provide; 'NONE' is never checked
        # against the codec registry.
        if (compression != 'NONE' and
                not pa.lib.Codec.is_available(compression)):
            continue
        assert_roundtrip(compression=compression)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_spark_flavor_preserves_pandas_metadata():
    """A frame with a named step-10 index round-trips with ``flavor='spark'``."""
    frame = _test_dataframe(size=100)
    index_values = np.arange(0, 10 * len(frame), 10)
    frame.index = index_values
    frame.index.name = 'foo'

    write_options = {'flavor': 'spark'}
    roundtripped = _roundtrip_pandas_dataframe(frame, write_options)
    tm.assert_frame_equal(roundtripped, frame)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_index_column_name_duplicate(tempdir):
    """Round-trip a frame whose index is also kept as the 'time' column."""
    stamps = [
        pd.Timestamp('2017-06-30 01:31:00'),
        pd.Timestamp('2017-06-30 01:32:00'),
    ]
    data = {
        'close': {stamp: 154.99958999999998 for stamp in stamps},
        'time': {stamp: pd.Timestamp(str(stamp)) for stamp in stamps},
    }
    path = str(tempdir / 'data.parquet')

    # Pandas v2 defaults to [ns], but Arrow defaults to [us] time units
    # so we need to cast the pandas dtype. Pandas v1 will always silently
    # coerce to [ns] due to lack of non-[ns] support.
    frame = pd.DataFrame(data, dtype='datetime64[us]')
    dfx = frame.set_index('time', drop=False)

    _write_table(pa.Table.from_pandas(dfx), path)
    result_df = _read_table(path).to_pandas()
    tm.assert_frame_equal(result_df, dfx)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_multiindex_duplicate_values(tempdir):
    """A MultiIndex with a repeated first-level value survives a round trip."""
    numbers = list(range(3))
    levels = [['foo', 'foo', 'bar'], numbers]
    index = pd.MultiIndex.from_arrays(
        levels, names=['foobar', 'some_numbers']
    )
    df = pd.DataFrame({'numbers': numbers}, index=index)
    table = pa.Table.from_pandas(df)

    filename = tempdir / 'dup_multi_index_levels.parquet'
    _write_table(table, filename)

    # Check both the Arrow table and the pandas frame.
    result_table = _read_table(filename)
    assert table.equals(result_table)
    tm.assert_frame_equal(result_table.to_pandas(), df)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_backwards_compatible_index_naming(datadir):
    """Read ``v0.7.1.parquet`` and compare it with the table literal below."""
    # Columns must be separated by at least two spaces: the parser uses
    # ``sep=r'\s{2,}'`` so that single spaces (as in "Very Good") stay
    # inside a value.
    expected_string = b"""\
carat  cut  color  clarity  depth  table  price  x  y  z
0.23  Ideal  E  SI2  61.5  55.0  326  3.95  3.98  2.43
0.21  Premium  E  SI1  59.8  61.0  326  3.89  3.84  2.31
0.23  Good  E  VS1  56.9  65.0  327  4.05  4.07  2.31
0.29  Premium  I  VS2  62.4  58.0  334  4.20  4.23  2.63
0.31  Good  J  SI2  63.3  58.0  335  4.34  4.35  2.75
0.24  Very Good  J  VVS2  62.8  57.0  336  3.94  3.96  2.48
0.24  Very Good  I  VVS1  62.3  57.0  336  3.95  3.98  2.47
0.26  Very Good  H  SI1  61.9  55.0  337  4.07  4.11  2.53
0.22  Fair  E  VS2  65.1  61.0  337  3.87  3.78  2.49
0.23  Very Good  H  VS1  59.4  61.0  338  4.00  4.05  2.39"""
    expected = pd.read_csv(io.BytesIO(expected_string), sep=r'\s{2,}',
                           index_col=None, header=0, engine='python')
    table = _read_table(datadir / 'v0.7.1.parquet')
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_backwards_compatible_index_multi_level_named(datadir):
    """Read ``v0.7.1.all-named-index.parquet`` and compare with the literal.

    The expected frame is indexed by ('cut', 'color', 'clarity') and sorted.
    """
    # Columns must be separated by at least two spaces: the parser uses
    # ``sep=r'\s{2,}'`` so that single spaces (as in "Very Good") stay
    # inside a value.
    expected_string = b"""\
carat  cut  color  clarity  depth  table  price  x  y  z
0.23  Ideal  E  SI2  61.5  55.0  326  3.95  3.98  2.43
0.21  Premium  E  SI1  59.8  61.0  326  3.89  3.84  2.31
0.23  Good  E  VS1  56.9  65.0  327  4.05  4.07  2.31
0.29  Premium  I  VS2  62.4  58.0  334  4.20  4.23  2.63
0.31  Good  J  SI2  63.3  58.0  335  4.34  4.35  2.75
0.24  Very Good  J  VVS2  62.8  57.0  336  3.94  3.96  2.48
0.24  Very Good  I  VVS1  62.3  57.0  336  3.95  3.98  2.47
0.26  Very Good  H  SI1  61.9  55.0  337  4.07  4.11  2.53
0.22  Fair  E  VS2  65.1  61.0  337  3.87  3.78  2.49
0.23  Very Good  H  VS1  59.4  61.0  338  4.00  4.05  2.39"""
    expected = pd.read_csv(
        io.BytesIO(expected_string), sep=r'\s{2,}',
        index_col=['cut', 'color', 'clarity'],
        header=0, engine='python'
    ).sort_index()

    table = _read_table(datadir / 'v0.7.1.all-named-index.parquet')
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_backwards_compatible_index_multi_level_some_named(datadir):
    """Read ``v0.7.1.some-named-index.parquet`` and compare with the literal.

    The expected frame is indexed by three levels, the middle one unnamed.
    """
    # Columns must be separated by at least two spaces: the parser uses
    # ``sep=r'\s{2,}'`` so that single spaces (as in "Very Good") stay
    # inside a value.
    expected_string = b"""\
carat  cut  color  clarity  depth  table  price  x  y  z
0.23  Ideal  E  SI2  61.5  55.0  326  3.95  3.98  2.43
0.21  Premium  E  SI1  59.8  61.0  326  3.89  3.84  2.31
0.23  Good  E  VS1  56.9  65.0  327  4.05  4.07  2.31
0.29  Premium  I  VS2  62.4  58.0  334  4.20  4.23  2.63
0.31  Good  J  SI2  63.3  58.0  335  4.34  4.35  2.75
0.24  Very Good  J  VVS2  62.8  57.0  336  3.94  3.96  2.48
0.24  Very Good  I  VVS1  62.3  57.0  336  3.95  3.98  2.47
0.26  Very Good  H  SI1  61.9  55.0  337  4.07  4.11  2.53
0.22  Fair  E  VS2  65.1  61.0  337  3.87  3.78  2.49
0.23  Very Good  H  VS1  59.4  61.0  338  4.00  4.05  2.39"""
    expected = pd.read_csv(
        io.BytesIO(expected_string),
        sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'],
        header=0, engine='python'
    ).sort_index()
    # Drop the name of the middle index level.
    expected.index = expected.index.set_names(['cut', None, 'clarity'])

    table = _read_table(datadir / 'v0.7.1.some-named-index.parquet')
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_backwards_compatible_column_metadata_handling(datadir):
    """Read ``v0.7.1.column-metadata-handling.parquet`` fully and by column."""
    if Version(pd.__version__) >= Version("2.2.0"):
        # TODO: regression in pandas
        # https://github.com/pandas-dev/pandas/issues/56775
        pytest.skip("Regression in pandas 2.2.0")

    def brussels_days():
        # Three consecutive tz-aware days starting 2017-01-01.
        return pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')

    expected = pd.DataFrame(
        {'a': [1, 2, 3], 'b': [.1, .2, .3], 'c': brussels_days()})
    expected.index = pd.MultiIndex.from_arrays(
        [['a', 'b', 'c'], brussels_days()],
        names=['index', None])

    path = datadir / 'v0.7.1.column-metadata-handling.parquet'

    full_result = _read_table(path).to_pandas()
    tm.assert_frame_equal(full_result, expected)

    # Selecting only 'a' is expected to yield a frame with a reset index.
    subset_result = _read_table(path, columns=['a']).to_pandas()
    expected_subset = expected[['a']].reset_index(drop=True)
    tm.assert_frame_equal(subset_result, expected_subset)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_categorical_index_survives_roundtrip():
    """A categorical index is still a ``CategoricalIndex`` after read_pandas."""
    # ARROW-3652, addressed by ARROW-3246
    frame = pd.DataFrame([['a', 'b'], ['c', 'd']], columns=['c1', 'c2'])
    frame['c1'] = frame['c1'].astype('category')
    frame = frame.set_index(['c1'])

    sink = pa.BufferOutputStream()
    pq.write_table(pa.Table.from_pandas(frame), sink)

    roundtripped = pq.read_pandas(sink.getvalue()).to_pandas()
    assert isinstance(roundtripped.index, pd.CategoricalIndex)
    assert roundtripped.index.equals(frame.index)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_categorical_order_survives_roundtrip():
    """An ordered categorical column round-trips to an equal frame."""
    # ARROW-6302
    ordered_values = pd.Categorical(
        ["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=True)
    frame = pd.DataFrame({"a": ordered_values})

    sink = pa.BufferOutputStream()
    pq.write_table(pa.Table.from_pandas(frame), sink)

    roundtripped = pq.read_pandas(sink.getvalue()).to_pandas()
    tm.assert_frame_equal(roundtripped, frame)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_pandas_categorical_na_type_row_groups():
    """Write categorical columns in chunks of 10; read back plain columns."""
    # ARROW-5085
    plain = pd.DataFrame({"col": [None] * 100, "int": [1.0] * 100})
    categorical = plain.astype({"col": "category", "int": "category"})
    plain_table = pa.Table.from_pandas(plain)
    categorical_table = pa.Table.from_pandas(categorical)
    sink = pa.BufferOutputStream()

    # it works
    pq.write_table(categorical_table, sink, version='2.6', chunk_size=10)
    result = pq.read_table(sink.getvalue())

    # Result is non-categorical
    for position in (0, 1):
        assert result[position].equals(plain_table[position])
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_pandas_categorical_roundtrip():
    """A categorical column keeps its dtype and categories on round trip."""
    # ARROW-5480, this was enabled by ARROW-3246

    # Have one of the categories unobserved and include a null (-1)
    categories = ['foo', 'bar', 'baz']
    codes = np.array([2, 0, 0, 2, 0, -1, 2], dtype='int32')
    column = pd.Categorical.from_codes(codes, categories=categories)
    df = pd.DataFrame({'x': column})

    sink = pa.BufferOutputStream()
    pq.write_table(pa.table(df), sink)
    result = pq.read_table(sink.getvalue()).to_pandas()

    assert result.x.dtype == 'category'
    assert (result.x.cat.categories == categories).all()
    tm.assert_frame_equal(result, df)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_categories_with_string_pyarrow_dtype(tempdir):
    """Categoricals built on ``string[pyarrow]`` can be written to parquet."""
    # gh-33727: writing to parquet should not fail
    if Version(pd.__version__) < Version("1.3.0"):
        pytest.skip("PyArrow backed string data type introduced in pandas 1.3.0")

    values = {"x": ["foo", "bar", "foo"]}
    df1 = pd.DataFrame(values, dtype="string[pyarrow]").astype("category")
    df2 = pd.DataFrame(values).astype("category")

    # categories should be converted to pa.Array
    assert pa.array(df1["x"]).to_pylist() == pa.array(df2["x"]).to_pylist()
    arrow_backed_categories = pa.array(df1["x"].cat.categories.values)
    plain_categories = pa.array(df2["x"].cat.categories.values)
    assert arrow_backed_categories.to_pylist() == plain_categories.to_pylist()

    path = str(tempdir / 'cat.parquet')
    pq.write_table(pa.table(df1), path)
    result = pq.read_table(path).to_pandas()

    tm.assert_frame_equal(result, df2)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_write_to_dataset_pandas_preserve_extensiondtypes(tempdir):
    """The ``Int64`` column comes back equal from all three write paths."""
    df = pd.DataFrame({'part': 'a', "col": [1, 2, 3]})
    df['col'] = df['col'].astype("Int64")
    table = pa.table(df)

    def assert_col_roundtrips(location):
        # Only the 'col' column is compared.
        result = pq.read_table(location).to_pandas()
        tm.assert_frame_equal(result[["col"]], df[["col"]])

    # Partitioned dataset.
    partitioned = str(tempdir / "case1")
    pq.write_to_dataset(table, partitioned, partition_cols=['part'])
    assert_col_roundtrips(partitioned)

    # Unpartitioned dataset.
    unpartitioned = str(tempdir / "case2")
    pq.write_to_dataset(table, unpartitioned)
    assert_col_roundtrips(unpartitioned)

    # Single file.
    single_file = str(tempdir / "data.parquet")
    pq.write_table(table, single_file)
    assert_col_roundtrips(single_file)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_write_to_dataset_pandas_preserve_index(tempdir):
    """The named pandas index comes back from all three write paths."""
    # ARROW-8251 - preserve pandas index in roundtrip

    df = pd.DataFrame({'part': ['a', 'a', 'b'], "col": [1, 2, 3]})
    df.index = pd.Index(['a', 'b', 'c'], name="idx")
    table = pa.table(df)

    # Expected frame for the partitioned case: 'part' moved last and
    # converted to a categorical column.
    df_cat = df[["col", "part"]].copy()
    df_cat["part"] = df_cat["part"].astype("category")

    def read_back(location):
        return pq.read_table(location).to_pandas()

    partitioned = str(tempdir / "case1")
    pq.write_to_dataset(table, partitioned, partition_cols=['part'])
    tm.assert_frame_equal(read_back(partitioned), df_cat)

    unpartitioned = str(tempdir / "case2")
    pq.write_to_dataset(table, unpartitioned)
    tm.assert_frame_equal(read_back(unpartitioned), df)

    single_file = str(tempdir / "data.parquet")
    pq.write_table(table, single_file)
    tm.assert_frame_equal(read_back(single_file), df)
|
||||
|
||||
|
||||
@pytest.mark.pandas
@pytest.mark.parametrize('preserve_index', [True, False, None])
@pytest.mark.parametrize('metadata_fname', ["_metadata", "_common_metadata"])
def test_dataset_read_pandas_common_metadata(
    tempdir, preserve_index, metadata_fname
):
    """Read a dataset whose pandas metadata lives only in a sidecar file.

    Five files are written with their schema metadata removed; the schema
    (with metadata) is then written to ``metadata_fname`` in the same
    directory and the dataset is read back through ``read_pandas``.
    """
    # ARROW-1103
    nfiles = 5
    size = 5

    dirpath = tempdir / guid()
    dirpath.mkdir()

    frames = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)
        df.index = pd.Index(
            np.arange(i * size, (i + 1) * size, dtype="int64"), name='index'
        )

        table = pa.Table.from_pandas(df, preserve_index=preserve_index)

        # Obliterate metadata
        table = table.replace_schema_metadata(None)
        assert table.schema.metadata is None

        _write_table(table, dirpath / f'{i}.parquet')
        frames.append(df)

    # Write _metadata common file (schema taken from the last frame written)
    table_for_metadata = pa.Table.from_pandas(
        df, preserve_index=preserve_index
    )
    pq.write_metadata(table_for_metadata.schema, dirpath / metadata_fname)

    dataset = pq.ParquetDataset(dirpath)
    columns = ['uint8', 'strings']
    result = dataset.read_pandas(columns=columns).to_pandas()
    expected = pd.concat([x[columns] for x in frames])
    # With preserve_index=False the index name is expected to be lost.
    expected.index.name = (
        df.index.name if preserve_index is not False else None)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_read_pandas_passthrough_keywords(tempdir):
    """``read_pandas`` resolves a relative path via the ``filesystem`` keyword."""
    # ARROW-11464 - previously not all keywords were passed through (such as
    # the filesystem keyword)
    df = pd.DataFrame({'a': [1, 2, 3]})
    _write_table(df, tempdir / 'data.parquet')

    # Filesystem rooted at tempdir, so the bare file name is enough.
    rooted_fs = SubTreeFileSystem(str(tempdir), LocalFileSystem())
    result = pq.read_pandas('data.parquet', filesystem=rooted_fs)
    assert result.equals(pa.table(df))
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_read_pandas_map_fields(tempdir):
    """A column written with a map<string, string> type round-trips."""
    # ARROW-10140 - table created from Pandas with mapping fields
    mapping_rows = [
        [('id', 'something'), ('value2', 'else')],
        [('id', 'something2'), ('value', 'else2')],
    ]
    df = pd.DataFrame({
        'col1': pd.Series(mapping_rows),
        'col2': pd.Series(['foo', 'bar'])
    })

    schema = pa.schema([
        pa.field('col1', pa.map_(pa.string(), pa.string())),
        pa.field('col2', pa.string()),
    ])
    arrow_table = pa.Table.from_pandas(df, schema)

    filename = tempdir / 'data.parquet'
    _write_table(arrow_table, filename)

    result = pq.read_pandas(filename).to_pandas()
    tm.assert_frame_equal(result, df)
|
||||
@@ -0,0 +1,443 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import io
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import types
|
||||
|
||||
import pytest
|
||||
from unittest import mock
|
||||
|
||||
import pyarrow as pa
|
||||
|
||||
try:
|
||||
import pyarrow.parquet as pq
|
||||
from pyarrow.tests.parquet.common import _write_table
|
||||
except ImportError:
|
||||
pq = None
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
import pandas.testing as tm
|
||||
|
||||
from pyarrow.tests.parquet.common import alltypes_sample
|
||||
except ImportError:
|
||||
pd = tm = None
|
||||
|
||||
|
||||
# Marks all of the tests in this module
|
||||
# Ignore these with pytest ... -m 'not parquet'
|
||||
pytestmark = pytest.mark.parquet
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_pass_separate_metadata():
    """``ParquetFile`` accepts metadata that was read in a separate step."""
    # ARROW-471
    df = alltypes_sample(size=10000)

    stream = io.BytesIO()
    _write_table(pa.Table.from_pandas(df), stream,
                 compression='snappy', version='2.6')

    stream.seek(0)
    metadata = pq.read_metadata(stream)

    # Rewind before handing the same stream to ParquetFile.
    stream.seek(0)
    parquet_file = pq.ParquetFile(stream, metadata=metadata)

    tm.assert_frame_equal(df, parquet_file.read().to_pandas())
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_read_single_row_group():
    """Row groups read one at a time concatenate back to the full frame."""
    # ARROW-471
    N, K = 10000, 4
    df = alltypes_sample(size=N)

    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    # Floor division keeps row_group_size an integer row count.
    _write_table(a_table, buf, row_group_size=N // K,
                 compression='snappy', version='2.6')

    buf.seek(0)

    pf = pq.ParquetFile(buf)

    assert pf.num_row_groups == K

    row_groups = [pf.read_row_group(i) for i in range(K)]
    result = pa.concat_tables(row_groups)
    tm.assert_frame_equal(df, result.to_pandas())
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_read_single_row_group_with_column_subset():
    """Per-row-group reads honour a column selection, even with duplicates."""
    N, K = 10000, 4
    df = alltypes_sample(size=N)
    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    # Floor division keeps row_group_size an integer row count.
    _write_table(a_table, buf, row_group_size=N // K,
                 compression='snappy', version='2.6')

    buf.seek(0)
    pf = pq.ParquetFile(buf)

    cols = list(df.columns[:2])
    row_groups = [pf.read_row_group(i, columns=cols) for i in range(K)]
    result = pa.concat_tables(row_groups)
    tm.assert_frame_equal(df[cols], result.to_pandas())

    # ARROW-4267: Selection of duplicate columns still leads to these columns
    # being read uniquely.
    row_groups = [pf.read_row_group(i, columns=cols + cols) for i in range(K)]
    result = pa.concat_tables(row_groups)
    tm.assert_frame_equal(df[cols], result.to_pandas())
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_read_multiple_row_groups():
    """``read_row_groups`` over every group returns the full frame."""
    N, K = 10000, 4
    df = alltypes_sample(size=N)

    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    # Floor division keeps row_group_size an integer row count.
    _write_table(a_table, buf, row_group_size=N // K,
                 compression='snappy', version='2.6')

    buf.seek(0)

    pf = pq.ParquetFile(buf)

    assert pf.num_row_groups == K

    result = pf.read_row_groups(range(K))
    tm.assert_frame_equal(df, result.to_pandas())
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_read_multiple_row_groups_with_column_subset():
    """``read_row_groups`` honours a column selection, even with duplicates."""
    N, K = 10000, 4
    df = alltypes_sample(size=N)
    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    # Floor division keeps row_group_size an integer row count.
    _write_table(a_table, buf, row_group_size=N // K,
                 compression='snappy', version='2.6')

    buf.seek(0)
    pf = pq.ParquetFile(buf)

    cols = list(df.columns[:2])
    result = pf.read_row_groups(range(K), columns=cols)
    tm.assert_frame_equal(df[cols], result.to_pandas())

    # ARROW-4267: Selection of duplicate columns still leads to these columns
    # being read uniquely.
    result = pf.read_row_groups(range(K), columns=cols + cols)
    tm.assert_frame_equal(df[cols], result.to_pandas())
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_scan_contents():
    """``scan_contents`` reports the row count, with or without a column list."""
    N, K = 10000, 4
    df = alltypes_sample(size=N)
    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    # Floor division keeps row_group_size an integer row count.
    _write_table(a_table, buf, row_group_size=N // K,
                 compression='snappy', version='2.6')

    buf.seek(0)
    pf = pq.ParquetFile(buf)

    # Compare against N rather than repeating the literal row count.
    assert pf.scan_contents() == N
    assert pf.scan_contents(df.columns[:4]) == N
|
||||
|
||||
|
||||
def test_parquet_file_pass_directory_instead_of_file(tempdir):
    """Opening a directory as a ``ParquetFile`` raises an ``IOError``.

    The error message is expected to name the directory, except on Windows
    where a ``PermissionError`` is accepted as-is.
    """
    # ARROW-7208
    path = tempdir / 'directory'
    os.mkdir(str(path))

    msg = f"Cannot open for reading: path '{str(path)}' is a directory"
    with pytest.raises(IOError) as exc:
        pq.ParquetFile(path)
    if exc.errisinstance(PermissionError) and sys.platform == 'win32':
        return  # Windows CI can get a PermissionError here.
    # ExceptionInfo.match() treats its argument as a regular expression, so
    # escape the message: the embedded path may hold regex metacharacters.
    exc.match(re.escape(msg))
|
||||
|
||||
|
||||
def test_read_column_invalid_index():
    """``reader.read_column`` works for 0 and 1 and rejects -1 and 2."""
    expected_columns = [[4, 5], ["foo", "bar"]]
    table = pa.table([pa.array(values) for values in expected_columns],
                     names=['ints', 'strs'])
    sink = pa.BufferOutputStream()
    pq.write_table(table, sink)
    parquet_file = pq.ParquetFile(sink.getvalue())

    for position, values in enumerate(expected_columns):
        assert parquet_file.reader.read_column(position).to_pylist() == values

    for bad_index in (-1, 2):
        with pytest.raises((ValueError, IndexError)):
            parquet_file.reader.read_column(bad_index)
|
||||
|
||||
|
||||
@pytest.mark.pandas
@pytest.mark.parametrize('batch_size', [300, 1000, 1300])
def test_iter_batches_columns_reader(tempdir, batch_size):
    """Each batch from ``iter_batches`` equals the matching slice of the frame."""
    total_size = 3000
    chunk_size = 1000
    # TODO: Add categorical support
    df = alltypes_sample(size=total_size)

    filename = tempdir / 'pandas_roundtrip.parquet'
    _write_table(pa.Table.from_pandas(df), filename, version='2.6',
                 chunk_size=chunk_size)

    file_ = pq.ParquetFile(filename)
    column_groups = [df.columns[:10], df.columns[10:]]
    for columns in column_groups:
        batches = file_.iter_batches(batch_size=batch_size, columns=columns)
        batch_starts = range(0, total_size + batch_size, batch_size)
        for batch, start in zip(batches, batch_starts):
            # The last batch may be shorter than batch_size.
            stop = min(total_size, start + batch_size)
            expected = (
                df.iloc[start:stop]
                .loc[:, columns]
                .reset_index(drop=True)
            )
            tm.assert_frame_equal(batch.to_pandas(), expected)
|
||||
|
||||
|
||||
@pytest.mark.pandas
@pytest.mark.parametrize('chunk_size', [1000])
def test_iter_batches_reader(tempdir, chunk_size):
    """Iterating one row group at a time in batches of 900 yields two batches.

    For every row group the first batch must equal its first 900 rows and
    the second batch the remainder.
    """
    df = alltypes_sample(size=10000, categorical=True)

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    assert arrow_table.schema.pandas_metadata is not None

    _write_table(arrow_table, filename, version='2.6',
                 chunk_size=chunk_size)

    file_ = pq.ParquetFile(filename)

    # Flatten the batches of every row group into one list, in order.
    batches = [
        batch
        for row_group in range(file_.num_row_groups)
        for batch in file_.iter_batches(batch_size=900,
                                        row_groups=[row_group])
    ]

    for i in range(file_.num_row_groups):
        head_batch = batches[2 * i]
        tail_batch = batches[2 * i + 1]

        tm.assert_frame_equal(
            head_batch.to_pandas(),
            file_.read_row_groups([i]).to_pandas().head(900)
        )

        expected_tail = file_.read_row_groups([i]).to_pandas().iloc[900:]
        tm.assert_frame_equal(
            tail_batch.to_pandas().reset_index(drop=True),
            expected_tail.reset_index(drop=True)
        )
|
||||
|
||||
|
||||
@pytest.mark.pandas
@pytest.mark.parametrize('pre_buffer', [False, True])
def test_pre_buffer(pre_buffer):
    """Reading returns all rows with ``pre_buffer`` either off or on."""
    N, K = 10000, 4
    df = alltypes_sample(size=N)
    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    # Floor division keeps row_group_size an integer row count.
    _write_table(a_table, buf, row_group_size=N // K,
                 compression='snappy', version='2.6')

    buf.seek(0)
    pf = pq.ParquetFile(buf, pre_buffer=pre_buffer)
    assert pf.read().num_rows == N
|
||||
|
||||
|
||||
def test_parquet_file_explicitly_closed(tempdir):
    """
    Unopened files should be closed explicitly after use,
    and previously opened files should be left open.
    Applies to read_table, ParquetDataset, and ParquetFile
    """
    # create test parquet file
    parquet_path = tempdir / 'file.parquet'
    pq.write_table(pa.table({'col1': [0, 1], 'col2': [0, 1]}), parquet_path)

    # ParquetFile with opened file (will leave open)
    with open(parquet_path, 'rb') as handle:
        with pq.ParquetFile(handle) as wrapped:
            wrapped.read()
            assert not handle.closed
            assert not wrapped.closed
        assert not handle.closed  # opened input file was not closed
        assert not wrapped.closed  # parquet file obj reports as not closed
    assert handle.closed
    assert wrapped.closed  # parquet file being closed reflects underlying file

    # ParquetFile with unopened file (will close)
    with pq.ParquetFile(parquet_path) as owned:
        owned.read()
        assert not owned.closed
    assert owned.closed  # parquet file obj reports as closed
|
||||
|
||||
|
||||
@pytest.mark.s3
@pytest.mark.parametrize("use_uri", (True, False))
def test_parquet_file_with_filesystem(s3_example_fs, use_uri):
    """``ParquetFile`` opens an S3 object by URI or by path + filesystem."""
    s3_fs, s3_uri, s3_path = s3_example_fs

    if use_uri:
        args, kwargs = (s3_uri,), {}
    else:
        args, kwargs = (s3_path,), dict(filesystem=s3_fs)

    table = pa.table({"a": range(10)})
    pq.write_table(table, s3_path, filesystem=s3_fs)

    # Explicit close().
    explicit = pq.ParquetFile(*args, **kwargs)
    assert explicit.read() == table
    assert not explicit.closed
    explicit.close()
    assert explicit.closed

    # Closing through the context manager.
    with pq.ParquetFile(*args, **kwargs) as managed:
        assert managed.read() == table
        assert not managed.closed
    assert managed.closed
|
||||
|
||||
|
||||
def test_read_statistics():
    """Check the statistics attached to the first chunk of a read column."""
    stream = io.BytesIO()
    _write_table(pa.table({"value": pa.array([-1, None, 3])}), stream)
    stream.seek(0)

    first_chunk = pq.ParquetFile(stream).read().columns[0].chunks[0]
    statistics = first_chunk.statistics

    assert statistics.null_count == 1
    assert statistics.distinct_count is None
    assert statistics.min == -1
    assert statistics.is_min_exact
    assert statistics.max == 3
    assert statistics.is_max_exact

    expected_repr = ("arrow.ArrayStatistics<"
                     "null_count=1, distinct_count=None, "
                     "min=-1, is_min_exact=True, "
                     "max=3, is_max_exact=True>")
    assert repr(statistics) == expected_repr
|
||||
|
||||
|
||||
def test_read_undefined_logical_type(parquet_test_datadir):
    """The unknown-logical-type column is read back as raw bytes values."""
    test_file = f"{parquet_test_datadir}/unknown-logical-type.parquet"
    table = pq.ParquetFile(test_file).read()

    expected_names = ["column with known type", "column with unknown type"]
    assert table.column_names == expected_names

    expected_values = [
        b"unknown string 1",
        b"unknown string 2",
        b"unknown string 3"
    ]
    assert table["column with unknown type"].to_pylist() == expected_values
|
||||
|
||||
|
||||
def test_parquet_file_fsspec_support():
    """``fsspec+memory://`` URIs work; an unknown scheme raises ArrowInvalid."""
    pytest.importorskip("fsspec")

    memory_uri = "fsspec+memory://example.parquet"
    written = pa.table({"a": range(10)})
    pq.write_table(written, memory_uri)
    assert written.equals(pq.read_table(memory_uri))

    with pytest.raises(pa.ArrowInvalid,
                       match="Unrecognized filesystem type in URI"):
        pq.read_table("non-existing://example.parquet")
|
||||
|
||||
|
||||
def test_parquet_file_fsspec_support_through_filesystem_argument():
    """An ``fsspec+memory://`` URI string is accepted as ``filesystem``."""
    try:
        from fsspec.implementations.memory import MemoryFileSystem
    except ImportError:
        pytest.skip("fsspec is not installed, skipping test")

    memory_fs = MemoryFileSystem()
    memory_fs.mkdir("/path/to/prefix", create_parents=True)
    assert memory_fs.exists("/path/to/prefix")

    written = pa.table({"b": range(10)})
    pq.write_table(written, "b.parquet",
                   filesystem="fsspec+memory://path/to/prefix")

    read_back = pq.read_table("fsspec+memory://path/to/prefix/b.parquet")
    assert written.equals(read_back)
|
||||
|
||||
|
||||
def test_parquet_file_hugginface_support():
    """``hf://`` URIs round-trip against a stand-in ``huggingface_hub`` module."""
    try:
        from fsspec.implementations.memory import MemoryFileSystem
    except ImportError:
        pytest.skip("fsspec is not installed, skipping Hugging Face test")

    # Stand-in module whose HfFileSystem is the fsspec in-memory filesystem.
    stub_module = types.ModuleType("huggingface_hub")
    stub_module.HfFileSystem = MemoryFileSystem
    patched_modules = {"huggingface_hub": stub_module}

    with mock.patch.dict("sys.modules", patched_modules):
        uri = "hf://datasets/apache/arrow/test.parquet"
        written = pa.table({"a": range(10)})
        pq.write_table(written, uri)
        assert written.equals(pq.read_table(uri))
|
||||
|
||||
|
||||
def test_fsspec_uri_raises_if_fsspec_is_not_available():
    """Without fsspec installed, an fsspec URI raises a descriptive ImportError."""
    # sadly cannot patch sys.modules because cython will still be able to import fsspec
    fsspec_installed = True
    try:
        import fsspec  # noqa: F401
    except ImportError:
        fsspec_installed = False
    if fsspec_installed:
        pytest.skip("fsspec is available, skipping test")

    expected_message = re.escape(
        "`fsspec` is required to handle `fsspec+<filesystem>://` and `hf://` URIs.")
    with pytest.raises(ImportError, match=expected_message):
        pq.read_table("fsspec+memory://example.parquet")
|
||||
|
||||
|
||||
def test_iter_batches_raises_batch_size_zero(tempdir):
    """``iter_batches(batch_size=0)`` raises ValueError on an empty file."""
    # See https://github.com/apache/arrow/issues/46811
    empty_table = pa.Table.from_batches([], schema=pa.schema([]))
    target = tempdir / "empty_file.parquet"
    pq.write_table(empty_table, target)

    reader = pq.ParquetFile(target)
    with pytest.raises(ValueError):
        reader.iter_batches(batch_size=0)
|
||||
@@ -0,0 +1,450 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import pytest
|
||||
|
||||
import pyarrow as pa
|
||||
from pyarrow import fs
|
||||
|
||||
try:
|
||||
import pyarrow.parquet as pq
|
||||
from pyarrow.tests.parquet.common import (_read_table, _test_dataframe,
|
||||
_range_integers)
|
||||
except ImportError:
|
||||
pq = None
|
||||
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
import pandas.testing as tm
|
||||
|
||||
except ImportError:
|
||||
pd = tm = None
|
||||
|
||||
|
||||
# Marks all of the tests in this module
|
||||
# Ignore these with pytest ... -m 'not parquet'
|
||||
pytestmark = pytest.mark.parquet
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_parquet_incremental_file_build(tempdir):
    """Ten tables written through one ParquetWriter read back concatenated."""
    df = _test_dataframe(100)
    df['unique_id'] = 0

    schema = pa.Table.from_pandas(df, preserve_index=False).schema
    out = pa.BufferOutputStream()
    writer = pq.ParquetWriter(out, schema, version='2.6')

    frames = []
    for i in range(10):
        # Same frame each time, tagged with the iteration number.
        df['unique_id'] = i
        writer.write_table(pa.Table.from_pandas(df, preserve_index=False))
        frames.append(df.copy())

    writer.close()

    result = _read_table(pa.BufferReader(out.getvalue()))
    expected = pd.concat(frames, ignore_index=True)
    tm.assert_frame_equal(result.to_pandas(), expected)
|
||||
|
||||
|
||||
def test_validate_schema_write_table(tempdir):
    """Writing a table whose schema differs from the writer's must fail."""
    # ARROW-2926
    declared_schema = pa.schema([
        pa.field('POS', pa.uint32()),
        pa.field('desc', pa.string())
    ])

    # mismatched_table's schema does not match declared_schema
    mismatched_table = pa.Table.from_arrays(
        [pa.array([1]), pa.array(['bla'])], ['POS', 'desc'])

    target = tempdir / 'simple_validate_schema.parquet'

    writer_options = dict(version='2.6', compression='snappy', flavor='spark')
    with pq.ParquetWriter(target, declared_schema, **writer_options) as w:
        with pytest.raises(ValueError):
            w.write_table(mismatched_table)
|
||||
|
||||
|
||||
def test_parquet_invalid_writer(tempdir):
    """ParquetWriter must reject a missing sink or a missing schema.

    Both invalid constructions are expected to raise ``TypeError``.
    """
    # avoid segfaults with invalid construction

    # Built outside pytest.raises so that a TypeError from schema
    # construction cannot satisfy the expectation meant for the writer.
    some_schema = pa.schema([pa.field("x", pa.int32())])
    with pytest.raises(TypeError):
        pq.ParquetWriter(None, some_schema)

    with pytest.raises(TypeError):
        pq.ParquetWriter(tempdir / "some_path", None)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_parquet_writer_context_obj(tempdir):
    """Append ten tables through a ParquetWriter used as a context manager.

    No explicit ``close()`` is issued; the buffer is read back after the
    ``with`` block and compared against the concatenated source frames.
    """
    frame = _test_dataframe(100)
    frame['unique_id'] = 0

    schema = pa.Table.from_pandas(frame, preserve_index=False).schema
    sink = pa.BufferOutputStream()

    written = []
    with pq.ParquetWriter(sink, schema, version='2.6') as writer:
        for batch_id in range(10):
            frame['unique_id'] = batch_id
            writer.write_table(
                pa.Table.from_pandas(frame, preserve_index=False))

            # Copy, because the same frame object is mutated on the next pass.
            written.append(frame.copy())

    roundtripped = _read_table(pa.BufferReader(sink.getvalue()))

    expected = pd.concat(written, ignore_index=True)
    tm.assert_frame_equal(roundtripped.to_pandas(), expected)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_parquet_writer_context_obj_with_exception(tempdir):
    """An exception raised inside the writer context must still propagate.

    A ``ValueError`` is raised after the sixth table has been written. The
    test requires that the error reaches the caller unchanged and that the
    tables written before it can be read back from the buffer.
    """
    df = _test_dataframe(100)
    df['unique_id'] = 0

    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    out = pa.BufferOutputStream()
    error_text = 'Artificial Error'

    frames = []
    # pytest.raises fails the test when no exception escapes the writer
    # context; a bare try/except would silently pass in that case.
    with pytest.raises(ValueError) as exc_info:
        with pq.ParquetWriter(out,
                              arrow_table.schema,
                              version='2.6') as writer:
            for i in range(10):
                df['unique_id'] = i
                arrow_table = pa.Table.from_pandas(df, preserve_index=False)
                writer.write_table(arrow_table)
                frames.append(df.copy())
                if i == 5:
                    raise ValueError(error_text)
    assert str(exc_info.value) == error_text

    buf = out.getvalue()
    result = _read_table(pa.BufferReader(buf))

    expected = pd.concat(frames, ignore_index=True)
    tm.assert_frame_equal(result.to_pandas(), expected)
|
||||
|
||||
|
||||
@pytest.mark.pandas
@pytest.mark.parametrize("filesystem", [
    None,
    fs.LocalFileSystem(),
])
def test_parquet_writer_write_wrappers(tempdir, filesystem):
    """Round-trip data through write_table, write_batch and generic write.

    Every writer entry point is exercised with a fresh ParquetWriter and the
    resulting file is read back and compared with the source DataFrame.
    """
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)
    batch = pa.RecordBatch.from_pandas(df, preserve_index=False)
    path_table = str(tempdir / 'data_table.parquet')
    path_batch = str(tempdir / 'data_batch.parquet')

    # (destination, writer method, payload), run in this order; the last two
    # scenarios overwrite the files produced by the first two.
    scenarios = [
        (path_table, 'write_table', table),
        (path_batch, 'write_batch', batch),
        (path_table, 'write', table),
        (path_batch, 'write', batch),
    ]

    for destination, method_name, payload in scenarios:
        with pq.ParquetWriter(
            destination, table.schema, filesystem=filesystem, version='2.6'
        ) as writer:
            getattr(writer, method_name)(payload)

        roundtripped = _read_table(destination).to_pandas()
        tm.assert_frame_equal(roundtripped, df)
|
||||
|
||||
|
||||
@pytest.mark.large_memory
@pytest.mark.pandas
def test_parquet_writer_chunk_size(tempdir):
    """Check how write_table splits rows into row groups.

    Covers explicit ``row_group_size`` values, the default chunk size used
    when none is passed, and the absolute maximum chunk size that caps
    larger requests.
    """
    default_chunk_size = 1024 * 1024
    abs_max_chunk_size = 64 * 1024 * 1024

    def check_chunk_size(data_size, chunk_size, expect_num_chunks):
        # Write ``data_size`` rows with the requested chunk size (None means
        # "do not pass row_group_size") and verify the row-group layout.
        table = pa.Table.from_arrays([
            _range_integers(data_size, 'b')
        ], names=['x'])
        if chunk_size is None:
            pq.write_table(table, tempdir / 'test.parquet')
        else:
            pq.write_table(table, tempdir / 'test.parquet',
                           row_group_size=chunk_size)
        metadata = pq.read_metadata(tempdir / 'test.parquet')
        expected_chunk_size = (
            default_chunk_size if chunk_size is None else chunk_size)
        assert metadata.num_row_groups == expect_num_chunks
        latched_chunk_size = min(expected_chunk_size, abs_max_chunk_size)
        # First chunks should be full size
        for chunk_idx in range(expect_num_chunks - 1):
            assert metadata.row_group(chunk_idx).num_rows == latched_chunk_size
        # The last chunk holds whatever the full-size chunks left over.
        # The capped size must be used here: the leading chunks contain
        # latched_chunk_size rows each, not the requested size.
        remainder = data_size - latched_chunk_size * (expect_num_chunks - 1)
        assert metadata.row_group(expect_num_chunks - 1).num_rows == remainder

    check_chunk_size(default_chunk_size * 2, default_chunk_size - 100, 3)
    check_chunk_size(default_chunk_size * 2, default_chunk_size, 2)
    check_chunk_size(default_chunk_size * 2, default_chunk_size + 100, 2)
    check_chunk_size(default_chunk_size + 100, default_chunk_size + 100, 1)
    # Even though the chunk size requested is large enough it will be capped
    # by the absolute max chunk size
    check_chunk_size(abs_max_chunk_size * 2, abs_max_chunk_size * 2, 2)

    # These tests don't pass a chunk_size to write_table and so the chunk size
    # should be default_chunk_size
    check_chunk_size(default_chunk_size, None, 1)
    check_chunk_size(default_chunk_size + 1, None, 2)
|
||||
|
||||
|
||||
@pytest.mark.pandas
@pytest.mark.parametrize("filesystem", [
    None,
    fs.LocalFileSystem(),
])
def test_parquet_writer_filesystem_local(tempdir, filesystem):
    """Write to a local path with and without an explicit LocalFileSystem."""
    source_df = _test_dataframe(100)
    source_table = pa.Table.from_pandas(source_df, preserve_index=False)
    destination = str(tempdir / 'data.parquet')

    writer = pq.ParquetWriter(
        destination, source_table.schema, filesystem=filesystem, version='2.6'
    )
    with writer:
        writer.write_table(source_table)

    roundtripped = _read_table(destination).to_pandas()
    tm.assert_frame_equal(roundtripped, source_df)
|
||||
|
||||
|
||||
@pytest.mark.pandas
@pytest.mark.s3
def test_parquet_writer_filesystem_s3(s3_example_fs):
    """Write through an explicit filesystem object, read back via the URI."""
    source_df = _test_dataframe(100)
    source_table = pa.Table.from_pandas(source_df, preserve_index=False)

    # Named s3_fs so the module-level ``fs`` import is not shadowed.
    s3_fs, uri, path = s3_example_fs

    writer = pq.ParquetWriter(
        path, source_table.schema, filesystem=s3_fs, version='2.6'
    )
    with writer:
        writer.write_table(source_table)

    roundtripped = _read_table(uri).to_pandas()
    tm.assert_frame_equal(roundtripped, source_df)
|
||||
|
||||
|
||||
@pytest.mark.pandas
@pytest.mark.s3
def test_parquet_writer_filesystem_s3_uri(s3_example_fs):
    """Write via the URI alone, read back through the filesystem object."""
    source_df = _test_dataframe(100)
    source_table = pa.Table.from_pandas(source_df, preserve_index=False)

    # Named s3_fs so the module-level ``fs`` import is not shadowed.
    s3_fs, uri, path = s3_example_fs

    writer = pq.ParquetWriter(uri, source_table.schema, version='2.6')
    with writer:
        writer.write_table(source_table)

    roundtripped = _read_table(path, filesystem=s3_fs).to_pandas()
    tm.assert_frame_equal(roundtripped, source_df)
|
||||
|
||||
|
||||
@pytest.mark.pandas
@pytest.mark.s3
def test_parquet_writer_filesystem_s3fs(s3_example_s3fs):
    """Write and read back using the filesystem from ``s3_example_s3fs``."""
    source_df = _test_dataframe(100)
    source_table = pa.Table.from_pandas(source_df, preserve_index=False)

    # Named s3_fs so the module-level ``fs`` import is not shadowed.
    s3_fs, directory = s3_example_s3fs
    destination = directory + "/test.parquet"

    writer = pq.ParquetWriter(
        destination, source_table.schema, filesystem=s3_fs, version='2.6'
    )
    with writer:
        writer.write_table(source_table)

    roundtripped = _read_table(destination, filesystem=s3_fs).to_pandas()
    tm.assert_frame_equal(roundtripped, source_df)
|
||||
|
||||
|
||||
@pytest.mark.pandas
def test_parquet_writer_filesystem_buffer_raises():
    """A filesystem combined with a file-like sink is rejected."""
    source_table = pa.Table.from_pandas(
        _test_dataframe(100), preserve_index=False)
    local_fs = fs.LocalFileSystem()

    # Should raise ValueError when filesystem is passed with file-like object
    expected_error = pytest.raises(
        ValueError, match="specified path is file-like")
    with expected_error:
        pq.ParquetWriter(
            pa.BufferOutputStream(), source_table.schema, filesystem=local_fs
        )
|
||||
|
||||
|
||||
def test_parquet_writer_store_schema(tempdir):
    """``store_schema`` controls whether ARROW:schema metadata is written."""
    table = pa.table({'a': [1, 2, 3]})

    def write_and_read_metadata(filename, **writer_kwargs):
        # Write ``table`` under tempdir and return the file-level metadata.
        target = tempdir / filename
        with pq.ParquetWriter(target, table.schema, **writer_kwargs) as writer:
            writer.write_table(table)
        return pq.read_metadata(target)

    # default -> write schema information
    with_schema = write_and_read_metadata('test_with_schema.parquet')
    assert b'ARROW:schema' in with_schema.metadata
    assert with_schema.metadata[b'ARROW:schema']

    # disable adding schema information
    without_schema = write_and_read_metadata(
        'test_without_schema.parquet', store_schema=False)
    assert without_schema.metadata is None
|
||||
|
||||
|
||||
def test_parquet_writer_append_key_value_metadata(tempdir):
    """Key/value metadata added through the writer ends up in the file.

    ``key2`` is added twice with different values; the test expects the
    value from the later call to be the one read back.
    """
    table = pa.Table.from_arrays([pa.array([], type='int32')], ['f0'])
    path = tempdir / 'metadata.parquet'

    with pq.ParquetWriter(path, table.schema) as writer:
        writer.write_table(table)
        writer.add_key_value_metadata({'key1': '1', 'key2': 'x'})
        writer.add_key_value_metadata({'key2': '2', 'key3': '3'})

    # Use the reader as a context manager so the file handle is released
    # deterministically instead of lingering until garbage collection.
    with pq.ParquetFile(path) as reader:
        metadata = reader.metadata.metadata

    assert metadata[b'key1'] == b'1'
    assert metadata[b'key2'] == b'2'
    assert metadata[b'key3'] == b'3'
|
||||
|
||||
|
||||
def test_parquet_content_defined_chunking(tempdir):
    """Content-defined chunking keeps data and row groups, changes byte size.

    The same table is written without chunking, with default chunking and
    with custom chunk-size bounds; the three files are then compared.
    """
    table = pa.table({'a': range(100_000)})

    plain_path = tempdir / 'unchunked.parquet'
    default_path = tempdir / 'chunked-default.parquet'
    custom_path = tempdir / 'chunked-custom.parquet'

    # use PLAIN encoding because we compare the overall size of the row groups
    # which would vary depending on the encoding making the assertions wrong
    plain_encoding = dict(use_dictionary=False, column_encoding="PLAIN")
    pq.write_table(table, plain_path, **plain_encoding)
    pq.write_table(table, default_path,
                   use_content_defined_chunking=True,
                   **plain_encoding)
    pq.write_table(table, custom_path,
                   use_content_defined_chunking={"min_chunk_size": 32_768,
                                                 "max_chunk_size": 65_536},
                   **plain_encoding)

    # the data must be the same
    plain_data = pq.read_table(plain_path)
    default_data = pq.read_table(default_path)
    custom_data = pq.read_table(custom_path)
    assert plain_data.equals(default_data)
    assert plain_data.equals(custom_data)

    # number of row groups and their row counts are not affected by content
    # defined chunking
    plain_meta = pq.read_metadata(plain_path)
    default_meta = pq.read_metadata(default_path)
    custom_meta = pq.read_metadata(custom_path)

    assert plain_meta.num_row_groups == default_meta.num_row_groups
    assert plain_meta.num_row_groups == custom_meta.num_row_groups

    for group_idx in range(plain_meta.num_row_groups):
        plain_rg = plain_meta.row_group(group_idx)
        default_rg = default_meta.row_group(group_idx)
        custom_rg = custom_meta.row_group(group_idx)

        assert plain_rg.num_rows == default_rg.num_rows
        assert plain_rg.num_rows == custom_rg.num_rows

        # since PageReader is not exposed we cannot inspect the page sizes
        # so just check that the total byte size is different
        assert plain_rg.total_byte_size < default_rg.total_byte_size
        assert plain_rg.total_byte_size < custom_rg.total_byte_size
        assert default_rg.total_byte_size < custom_rg.total_byte_size
|
||||
|
||||
|
||||
def test_parquet_content_defined_chunking_parameters(tempdir):
    """Validate the ``use_content_defined_chunking`` option handling.

    Invalid option dictionaries must be rejected with the expected message,
    while ``True`` and well-formed dictionaries (with or without
    ``norm_level``) must be accepted.
    """
    import re

    table = pa.table({'a': range(100)})
    path = tempdir / 'chunked-invalid.parquet'

    # it raises OSError, not ideal but this is how parquet exceptions are
    # handled currently
    msg = "max_chunk_size must be greater than min_chunk_size"
    cdc_options = {"min_chunk_size": 65_536, "max_chunk_size": 32_768}
    with pytest.raises(Exception, match=re.escape(msg)):
        pq.write_table(table, path, use_content_defined_chunking=cdc_options)

    cases = [
        (
            {"min_chunk_size": 64 * 1024, "unknown_option": True},
            "Unknown options in 'use_content_defined_chunking': {'unknown_option'}"
        ),
        (
            {"min_chunk_size": 64 * 1024},
            "Missing options in 'use_content_defined_chunking': {'max_chunk_size'}"
        ),
        (
            {"max_chunk_size": 64 * 1024},
            "Missing options in 'use_content_defined_chunking': {'min_chunk_size'}"
        )
    ]
    for cdc_options, msg in cases:
        # ``match`` is a regular expression and the messages contain braces,
        # so escape them to compare against the literal text.
        with pytest.raises(ValueError, match=re.escape(msg)):
            pq.write_table(table, path, use_content_defined_chunking=cdc_options)

    # using the default parametrization
    pq.write_table(table, path, use_content_defined_chunking=True)

    # using min_chunk_size and max_chunk_size
    cdc_options = {"min_chunk_size": 32_768, "max_chunk_size": 65_536}
    pq.write_table(table, path, use_content_defined_chunking=cdc_options)

    # using min_chunk_size, max_chunk_size and norm_level
    cdc_options = {"min_chunk_size": 32_768, "max_chunk_size": 65_536,
                   "norm_level": 1}
    pq.write_table(table, path, use_content_defined_chunking=cdc_options)
|
||||
Reference in New Issue
Block a user