
Partition Pruning

pfeed stores its data as hive-partitioned Parquet files, so query engines can apply partition pruning: a filter on a partition column such as product_type skips whole directories instead of scanning every file. The examples below read directly from the configured data path.

import pfeed as pe

# get the configured data path
config = pe.get_config()
data_path = config.data_path
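
To see why pruning works, it helps to peek at the on-disk layout. This is only a sketch that assumes the path above already contains downloaded data; the actual partition keys and values depend on what you have stored:

from pathlib import Path

# list a few parquet files to show the hive-style key=value directory layout
for path in sorted(Path(data_path).rglob("*.parquet"))[:5]:
    print(path.relative_to(data_path))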

Using PyArrow

import pyarrow.dataset as ds

dataset = ds.dataset(data_path, format="parquet", partitioning="hive")
# filter on the product_type partition column
df = dataset.filter(
    ds.field("product_type") == "PERP"
).to_table().to_pandas()
df
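
The same filter can also be passed straight to to_table(), optionally together with a column projection so only the needed columns are read. The column names in the projection are for illustration only; adjust them to your schema:

# equivalent pushdown: non-matching partitions are never opened,
# and only the projected columns are read from the matching files
table = dataset.to_table(
    columns=["product_type", "volume"],
    filter=ds.field("product_type") == "PERP",
)
table.to_pandas()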

Using Polars

import polars as pl

lf = pl.scan_parquet(f'{data_path}/**/*.parquet', hive_partitioning=True)
# filter by product type; cast the partition column to string before comparing
df = lf.filter(pl.col("product_type").cast(pl.String) == "PERP").collect()
df
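
As a quick sanity check (a sketch, not part of pfeed's API), LazyFrame.explain() prints Polars' optimized query plan; the product_type predicate should appear inside the Parquet scan, confirming it is pushed down to the partition level:

# print the optimized plan; the predicate should be pushed into the scan
print(lf.filter(pl.col("product_type").cast(pl.String) == "PERP").explain())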

Using DuckDB

import duckdb

query = f"""
SELECT * FROM '{data_path}/**/*.parquet'
WHERE volume > 100
"""
df = duckdb.sql(query).df()
df
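
The query above filters on a regular column (volume), so it relies on Parquet statistics rather than partition pruning. To prune on the partition column itself, read_parquet() can expose the hive key=value directories as columns; a minimal sketch reusing product_type from the earlier examples:

# expose the hive key=value directories as columns and filter on one of them,
# so non-matching partition directories are skipped entirely
query = f"""
SELECT * FROM read_parquet('{data_path}/**/*.parquet', hive_partitioning=true)
WHERE product_type = 'PERP'
"""
df = duckdb.sql(query).df()
df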