pfeed supports partition pruning for parquet files.
import pfeed as pe

# Resolve the root directory where pfeed stores its downloaded data;
# every example below reads parquet files from under this path.
data_path = pe.get_config().data_path
Using PyArrow
import pyarrow.dataset as ds

# Discover parquet files under data_path; partitioning="hive" maps
# key=value directory names (e.g. product_type=PERP) onto columns.
dataset = ds.dataset(data_path, format="parquet", partitioning="hive")
# Filtering on a partition column prunes whole directories before any
# file contents are read. Bind the result to `df` so it is reusable and
# displayed, consistent with the Polars and DuckDB examples below.
df = dataset.filter(ds.field("product_type") == "PERP").to_table().to_pandas()
df
Using Polars
import polars as pl

# Lazily scan every parquet file under data_path; hive_partitioning=True
# exposes the directory keys (e.g. product_type) as queryable columns.
# Partition columns come back as categorical, so cast to String before
# comparing against a plain string literal, then collect the result.
df = (
    pl.scan_parquet(f'{data_path}/**/*.parquet', hive_partitioning=True)
    .filter(pl.col("product_type").cast(pl.String) == "PERP")
    .collect()
)
df
Using DuckDB
import duckdb

# DuckDB queries the parquet glob directly; the WHERE predicate is
# pushed down so only matching row groups are read from disk.
df = duckdb.sql(f"""
SELECT * FROM '{data_path}/**/*.parquet'
WHERE volume > 100
""").df()
df