Mise à jour de Monitor.py et autres scripts

This commit is contained in:
Debian
2025-07-23 10:46:27 +02:00
parent 7081418ce0
commit 7de3e0fb50
8604 changed files with 2789953 additions and 295 deletions

View File

@@ -0,0 +1,505 @@
from __future__ import annotations
from functools import reduce
from operator import and_
from typing import TYPE_CHECKING, Any
import duckdb
from duckdb import StarExpression
from narwhals._duckdb.utils import (
DeferredTimeZone,
F,
catch_duckdb_exception,
col,
evaluate_exprs,
lit,
native_to_narwhals_dtype,
window_expression,
)
from narwhals._utils import (
Implementation,
ValidateBackendVersion,
Version,
generate_temporary_column_name,
not_implemented,
parse_columns_to_drop,
requires,
)
from narwhals.dependencies import get_duckdb
from narwhals.exceptions import InvalidOperationError
from narwhals.typing import CompliantLazyFrame
if TYPE_CHECKING:
from collections.abc import Iterator, Mapping, Sequence
from types import ModuleType
import pandas as pd
import pyarrow as pa
from duckdb import Expression
from duckdb.typing import DuckDBPyType
from typing_extensions import Self, TypeIs
from narwhals._compliant.typing import CompliantDataFrameAny
from narwhals._duckdb.expr import DuckDBExpr
from narwhals._duckdb.group_by import DuckDBGroupBy
from narwhals._duckdb.namespace import DuckDBNamespace
from narwhals._duckdb.series import DuckDBInterchangeSeries
from narwhals._utils import _LimitedContext
from narwhals.dataframe import LazyFrame
from narwhals.dtypes import DType
from narwhals.stable.v1 import DataFrame as DataFrameV1
from narwhals.typing import AsofJoinStrategy, JoinStrategy, LazyUniqueKeepStrategy
class DuckDBLazyFrame(
CompliantLazyFrame[
"DuckDBExpr",
"duckdb.DuckDBPyRelation",
"LazyFrame[duckdb.DuckDBPyRelation] | DataFrameV1[duckdb.DuckDBPyRelation]",
],
ValidateBackendVersion,
):
_implementation = Implementation.DUCKDB
def __init__(
self,
df: duckdb.DuckDBPyRelation,
*,
version: Version,
validate_backend_version: bool = False,
) -> None:
self._native_frame: duckdb.DuckDBPyRelation = df
self._version = version
self._cached_native_schema: dict[str, DuckDBPyType] | None = None
self._cached_columns: list[str] | None = None
if validate_backend_version:
self._validate_backend_version()
@property
def _backend_version(self) -> tuple[int, ...]:
return self._implementation._backend_version()
@staticmethod
def _is_native(obj: duckdb.DuckDBPyRelation | Any) -> TypeIs[duckdb.DuckDBPyRelation]:
return isinstance(obj, duckdb.DuckDBPyRelation)
@classmethod
def from_native(
cls, data: duckdb.DuckDBPyRelation, /, *, context: _LimitedContext
) -> Self:
return cls(data, version=context._version)
def to_narwhals(
self, *args: Any, **kwds: Any
) -> LazyFrame[duckdb.DuckDBPyRelation] | DataFrameV1[duckdb.DuckDBPyRelation]:
if self._version is Version.V1:
from narwhals.stable.v1 import DataFrame as DataFrameV1
return DataFrameV1(self, level="interchange") # type: ignore[no-any-return]
return self._version.lazyframe(self, level="lazy")
def __narwhals_dataframe__(self) -> Self: # pragma: no cover
# Keep around for backcompat.
if self._version is not Version.V1:
msg = "__narwhals_dataframe__ is not implemented for DuckDBLazyFrame"
raise AttributeError(msg)
return self
def __narwhals_lazyframe__(self) -> Self:
return self
def __native_namespace__(self) -> ModuleType:
return get_duckdb() # type: ignore[no-any-return]
def __narwhals_namespace__(self) -> DuckDBNamespace:
from narwhals._duckdb.namespace import DuckDBNamespace
return DuckDBNamespace(version=self._version)
def get_column(self, name: str) -> DuckDBInterchangeSeries:
from narwhals._duckdb.series import DuckDBInterchangeSeries
return DuckDBInterchangeSeries(self.native.select(name), version=self._version)
def _iter_columns(self) -> Iterator[Expression]:
for name in self.columns:
yield col(name)
def collect(
self, backend: ModuleType | Implementation | str | None, **kwargs: Any
) -> CompliantDataFrameAny:
if backend is None or backend is Implementation.PYARROW:
from narwhals._arrow.dataframe import ArrowDataFrame
return ArrowDataFrame(
self.native.arrow(),
validate_backend_version=True,
version=self._version,
validate_column_names=True,
)
if backend is Implementation.PANDAS:
from narwhals._pandas_like.dataframe import PandasLikeDataFrame
return PandasLikeDataFrame(
self.native.df(),
implementation=Implementation.PANDAS,
validate_backend_version=True,
version=self._version,
validate_column_names=True,
)
if backend is Implementation.POLARS:
from narwhals._polars.dataframe import PolarsDataFrame
return PolarsDataFrame(
self.native.pl(), validate_backend_version=True, version=self._version
)
msg = f"Unsupported `backend` value: {backend}" # pragma: no cover
raise ValueError(msg) # pragma: no cover
def head(self, n: int) -> Self:
return self._with_native(self.native.limit(n))
def simple_select(self, *column_names: str) -> Self:
return self._with_native(self.native.select(*column_names))
def aggregate(self, *exprs: DuckDBExpr) -> Self:
selection = [val.alias(name) for name, val in evaluate_exprs(self, *exprs)]
try:
return self._with_native(self.native.aggregate(selection)) # type: ignore[arg-type]
except Exception as e: # noqa: BLE001
raise catch_duckdb_exception(e, self) from None
def select(self, *exprs: DuckDBExpr) -> Self:
selection = (val.alias(name) for name, val in evaluate_exprs(self, *exprs))
try:
return self._with_native(self.native.select(*selection))
except Exception as e: # noqa: BLE001
raise catch_duckdb_exception(e, self) from None
def drop(self, columns: Sequence[str], *, strict: bool) -> Self:
columns_to_drop = parse_columns_to_drop(self, columns, strict=strict)
selection = (name for name in self.columns if name not in columns_to_drop)
return self._with_native(self.native.select(*selection))
def lazy(self, *, backend: Implementation | None = None) -> Self:
# The `backend`` argument has no effect but we keep it here for
# backwards compatibility because in `narwhals.stable.v1`
# function `.from_native()` will return a DataFrame for DuckDB.
if backend is not None: # pragma: no cover
msg = "`backend` argument is not supported for DuckDB"
raise ValueError(msg)
return self
def with_columns(self, *exprs: DuckDBExpr) -> Self:
new_columns_map = dict(evaluate_exprs(self, *exprs))
result = [
new_columns_map.pop(name).alias(name)
if name in new_columns_map
else col(name)
for name in self.columns
]
result.extend(value.alias(name) for name, value in new_columns_map.items())
try:
return self._with_native(self.native.select(*result))
except Exception as e: # noqa: BLE001
raise catch_duckdb_exception(e, self) from None
def filter(self, predicate: DuckDBExpr) -> Self:
# `[0]` is safe as the predicate's expression only returns a single column
mask = predicate(self)[0]
try:
return self._with_native(self.native.filter(mask))
except Exception as e: # noqa: BLE001
raise catch_duckdb_exception(e, self) from None
@property
def schema(self) -> dict[str, DType]:
if self._cached_native_schema is None:
# Note: prefer `self._cached_native_schema` over `functools.cached_property`
# due to Python3.13 failures.
self._cached_native_schema = dict(zip(self.columns, self.native.types))
deferred_time_zone = DeferredTimeZone(self.native)
return {
column_name: native_to_narwhals_dtype(
duckdb_dtype, self._version, deferred_time_zone
)
for column_name, duckdb_dtype in zip(self.native.columns, self.native.types)
}
@property
def columns(self) -> list[str]:
if self._cached_columns is None:
self._cached_columns = (
list(self.schema)
if self._cached_native_schema is not None
else self.native.columns
)
return self._cached_columns
def to_pandas(self) -> pd.DataFrame:
# only if version is v1, keep around for backcompat
return self.native.df()
def to_arrow(self) -> pa.Table:
# only if version is v1, keep around for backcompat
return self.native.arrow()
def _with_version(self, version: Version) -> Self:
return self.__class__(self.native, version=version)
def _with_native(self, df: duckdb.DuckDBPyRelation) -> Self:
return self.__class__(df, version=self._version)
def group_by(
self, keys: Sequence[str] | Sequence[DuckDBExpr], *, drop_null_keys: bool
) -> DuckDBGroupBy:
from narwhals._duckdb.group_by import DuckDBGroupBy
return DuckDBGroupBy(self, keys, drop_null_keys=drop_null_keys)
def rename(self, mapping: Mapping[str, str]) -> Self:
df = self.native
selection = (
col(name).alias(mapping[name]) if name in mapping else col(name)
for name in df.columns
)
return self._with_native(self.native.select(*selection))
def join(
self,
other: Self,
*,
how: JoinStrategy,
left_on: Sequence[str] | None,
right_on: Sequence[str] | None,
suffix: str,
) -> Self:
native_how = "outer" if how == "full" else how
if native_how == "cross":
if self._backend_version < (1, 1, 4):
msg = f"'duckdb>=1.1.4' is required for cross-join, found version: {self._backend_version}"
raise NotImplementedError(msg)
rel = self.native.set_alias("lhs").cross(other.native.set_alias("rhs"))
else:
# help mypy
assert left_on is not None # noqa: S101
assert right_on is not None # noqa: S101
it = (
col(f'lhs."{left}"') == col(f'rhs."{right}"')
for left, right in zip(left_on, right_on)
)
condition: Expression = reduce(and_, it)
rel = self.native.set_alias("lhs").join(
other.native.set_alias("rhs"),
# NOTE: Fixed in `--pre` https://github.com/duckdb/duckdb/pull/16933
condition=condition, # type: ignore[arg-type, unused-ignore]
how=native_how,
)
if native_how in {"inner", "left", "cross", "outer"}:
select = [col(f'lhs."{x}"') for x in self.columns]
for name in other.columns:
col_in_lhs: bool = name in self.columns
if native_how == "outer" and not col_in_lhs:
select.append(col(f'rhs."{name}"'))
elif (native_how == "outer") or (
col_in_lhs and (right_on is None or name not in right_on)
):
select.append(col(f'rhs."{name}"').alias(f"{name}{suffix}"))
elif right_on is None or name not in right_on:
select.append(col(name))
res = rel.select(*select).set_alias(self.native.alias)
else: # semi, anti
res = rel.select("lhs.*").set_alias(self.native.alias)
return self._with_native(res)
def join_asof(
self,
other: Self,
*,
left_on: str,
right_on: str,
by_left: Sequence[str] | None,
by_right: Sequence[str] | None,
strategy: AsofJoinStrategy,
suffix: str,
) -> Self:
lhs = self.native
rhs = other.native
conditions: list[Expression] = []
if by_left is not None and by_right is not None:
conditions.extend(
col(f'lhs."{left}"') == col(f'rhs."{right}"')
for left, right in zip(by_left, by_right)
)
else:
by_left = by_right = []
if strategy == "backward":
conditions.append(col(f'lhs."{left_on}"') >= col(f'rhs."{right_on}"'))
elif strategy == "forward":
conditions.append(col(f'lhs."{left_on}"') <= col(f'rhs."{right_on}"'))
else:
msg = "Only 'backward' and 'forward' strategies are currently supported for DuckDB"
raise NotImplementedError(msg)
condition: Expression = reduce(and_, conditions)
select = ["lhs.*"]
for name in rhs.columns:
if name in lhs.columns and (
right_on is None or name not in {right_on, *by_right}
):
select.append(f'rhs."{name}" as "{name}{suffix}"')
elif right_on is None or name not in {right_on, *by_right}:
select.append(str(col(name)))
# Replace with Python API call once
# https://github.com/duckdb/duckdb/discussions/16947 is addressed.
query = f"""
SELECT {",".join(select)}
FROM lhs
ASOF LEFT JOIN rhs
ON {condition}
""" # noqa: S608
return self._with_native(duckdb.sql(query))
def collect_schema(self) -> dict[str, DType]:
return self.schema
def unique(
self, subset: Sequence[str] | None, *, keep: LazyUniqueKeepStrategy
) -> Self:
if subset_ := subset if keep == "any" else (subset or self.columns):
# Sanitise input
if error := self._check_columns_exist(subset_):
raise error
idx_name = generate_temporary_column_name(8, self.columns)
count_name = generate_temporary_column_name(8, [*self.columns, idx_name])
name = count_name if keep == "none" else idx_name
idx_expr = window_expression(F("row_number"), subset_).alias(idx_name)
count_expr = window_expression(
F("count", StarExpression()), subset_, ()
).alias(count_name)
return self._with_native(
self.native.select(StarExpression(), idx_expr, count_expr)
.filter(col(name) == lit(1))
.select(StarExpression(exclude=[count_name, idx_name]))
)
return self._with_native(self.native.unique(", ".join(self.columns)))
def sort(self, *by: str, descending: bool | Sequence[bool], nulls_last: bool) -> Self:
if isinstance(descending, bool):
descending = [descending] * len(by)
if nulls_last:
it = (
col(name).nulls_last() if not desc else col(name).desc().nulls_last()
for name, desc in zip(by, descending)
)
else:
it = (
col(name).nulls_first() if not desc else col(name).desc().nulls_first()
for name, desc in zip(by, descending)
)
return self._with_native(self.native.sort(*it))
def drop_nulls(self, subset: Sequence[str] | None) -> Self:
subset_ = subset if subset is not None else self.columns
keep_condition = reduce(and_, (col(name).isnotnull() for name in subset_))
return self._with_native(self.native.filter(keep_condition))
def explode(self, columns: Sequence[str]) -> Self:
dtypes = self._version.dtypes
schema = self.collect_schema()
for name in columns:
dtype = schema[name]
if dtype != dtypes.List:
msg = (
f"`explode` operation not supported for dtype `{dtype}`, "
"expected List type"
)
raise InvalidOperationError(msg)
if len(columns) != 1:
msg = (
"Exploding on multiple columns is not supported with DuckDB backend since "
"we cannot guarantee that the exploded columns have matching element counts."
)
raise NotImplementedError(msg)
col_to_explode = col(columns[0])
rel = self.native
original_columns = self.columns
not_null_condition = col_to_explode.isnotnull() & F("len", col_to_explode) > lit(
0
)
non_null_rel = rel.filter(not_null_condition).select(
*(
F("unnest", col_to_explode).alias(name) if name in columns else name
for name in original_columns
)
)
null_rel = rel.filter(~not_null_condition).select(
*(
lit(None).alias(name) if name in columns else name
for name in original_columns
)
)
return self._with_native(non_null_rel.union(null_rel))
def unpivot(
self,
on: Sequence[str] | None,
index: Sequence[str] | None,
variable_name: str,
value_name: str,
) -> Self:
index_ = [] if index is None else index
on_ = [c for c in self.columns if c not in index_] if on is None else on
if variable_name == "":
msg = "`variable_name` cannot be empty string for duckdb backend."
raise NotImplementedError(msg)
if value_name == "":
msg = "`value_name` cannot be empty string for duckdb backend."
raise NotImplementedError(msg)
unpivot_on = ", ".join(str(col(name)) for name in on_)
rel = self.native # noqa: F841
# Replace with Python API once
# https://github.com/duckdb/duckdb/discussions/16980 is addressed.
query = f"""
unpivot rel
on {unpivot_on}
into
name "{variable_name}"
value "{value_name}"
"""
return self._with_native(
duckdb.sql(query).select(*[*index_, variable_name, value_name])
)
@requires.backend_version((1, 3))
def with_row_index(self, name: str, order_by: Sequence[str]) -> Self:
if order_by is None:
msg = "Cannot pass `order_by` to `with_row_index` for DuckDB"
raise TypeError(msg)
expr = (window_expression(F("row_number"), order_by=order_by) - lit(1)).alias(
name
)
return self._with_native(self.native.select(expr, StarExpression()))
gather_every = not_implemented.deprecated(
"`LazyFrame.gather_every` is deprecated and will be removed in a future version."
)
tail = not_implemented.deprecated(
"`LazyFrame.tail` is deprecated and will be removed in a future version."
)

View File

@@ -0,0 +1,827 @@
from __future__ import annotations
import operator
from typing import TYPE_CHECKING, Any, Callable, Literal, cast
from duckdb import CoalesceOperator, StarExpression
from duckdb.typing import DuckDBPyType
from narwhals._compliant import LazyExpr, WindowInputs
from narwhals._duckdb.expr_dt import DuckDBExprDateTimeNamespace
from narwhals._duckdb.expr_list import DuckDBExprListNamespace
from narwhals._duckdb.expr_str import DuckDBExprStringNamespace
from narwhals._duckdb.expr_struct import DuckDBExprStructNamespace
from narwhals._duckdb.utils import (
DeferredTimeZone,
F,
col,
lit,
narwhals_to_native_dtype,
when,
window_expression,
)
from narwhals._expression_parsing import (
ExprKind,
combine_alias_output_names,
combine_evaluate_output_names,
)
from narwhals._utils import Implementation, not_implemented, requires
if TYPE_CHECKING:
from collections.abc import Iterable, Sequence
from duckdb import Expression
from typing_extensions import Self
from narwhals._compliant.typing import (
AliasNames,
EvalNames,
EvalSeries,
WindowFunction,
)
from narwhals._duckdb.dataframe import DuckDBLazyFrame
from narwhals._duckdb.namespace import DuckDBNamespace
from narwhals._duckdb.typing import WindowExpressionKwargs
from narwhals._expression_parsing import ExprMetadata
from narwhals._utils import Version, _LimitedContext
from narwhals.typing import (
FillNullStrategy,
IntoDType,
NonNestedLiteral,
NumericLiteral,
RankMethod,
RollingInterpolationMethod,
TemporalLiteral,
)
DuckDBWindowFunction = WindowFunction[DuckDBLazyFrame, Expression]
DuckDBWindowInputs = WindowInputs[Expression]
class DuckDBExpr(LazyExpr["DuckDBLazyFrame", "Expression"]):
_implementation = Implementation.DUCKDB
def __init__(
self,
call: EvalSeries[DuckDBLazyFrame, Expression],
window_function: DuckDBWindowFunction | None = None,
*,
evaluate_output_names: EvalNames[DuckDBLazyFrame],
alias_output_names: AliasNames | None,
version: Version,
) -> None:
self._call = call
self._evaluate_output_names = evaluate_output_names
self._alias_output_names = alias_output_names
self._version = version
self._metadata: ExprMetadata | None = None
self._window_function: DuckDBWindowFunction | None = window_function
@property
def window_function(self) -> DuckDBWindowFunction:
def default_window_func(
df: DuckDBLazyFrame, inputs: DuckDBWindowInputs
) -> list[Expression]:
assert not inputs.order_by # noqa: S101
return [
window_expression(expr, inputs.partition_by, inputs.order_by)
for expr in self(df)
]
return self._window_function or default_window_func
def __call__(self, df: DuckDBLazyFrame) -> Sequence[Expression]:
return self._call(df)
def __narwhals_expr__(self) -> None: ...
def __narwhals_namespace__(self) -> DuckDBNamespace: # pragma: no cover
from narwhals._duckdb.namespace import DuckDBNamespace
return DuckDBNamespace(version=self._version)
def _cum_window_func(
self,
func_name: Literal["sum", "max", "min", "count", "product"],
*,
reverse: bool,
) -> DuckDBWindowFunction:
def func(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> list[Expression]:
return [
window_expression(
F(func_name, expr),
inputs.partition_by,
inputs.order_by,
descending=[reverse] * len(inputs.order_by),
nulls_last=[reverse] * len(inputs.order_by),
rows_start="unbounded preceding",
rows_end="current row",
)
for expr in self(df)
]
return func
def _rolling_window_func(
self,
func_name: Literal["sum", "mean", "std", "var"],
window_size: int,
min_samples: int,
ddof: int | None = None,
*,
center: bool,
) -> DuckDBWindowFunction:
supported_funcs = ["sum", "mean", "std", "var"]
if center:
half = (window_size - 1) // 2
remainder = (window_size - 1) % 2
start = f"{half + remainder} preceding"
end = f"{half} following"
else:
start = f"{window_size - 1} preceding"
end = "current row"
def func(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> list[Expression]:
if func_name in {"sum", "mean"}:
func_: str = func_name
elif func_name == "var" and ddof == 0:
func_ = "var_pop"
elif func_name in "var" and ddof == 1:
func_ = "var_samp"
elif func_name == "std" and ddof == 0:
func_ = "stddev_pop"
elif func_name == "std" and ddof == 1:
func_ = "stddev_samp"
elif func_name in {"var", "std"}: # pragma: no cover
msg = f"Only ddof=0 and ddof=1 are currently supported for rolling_{func_name}."
raise ValueError(msg)
else: # pragma: no cover
msg = f"Only the following functions are supported: {supported_funcs}.\nGot: {func_name}."
raise ValueError(msg)
window_kwargs: WindowExpressionKwargs = {
"partition_by": inputs.partition_by,
"order_by": inputs.order_by,
"rows_start": start,
"rows_end": end,
}
return [
when(
window_expression(F("count", expr), **window_kwargs)
>= lit(min_samples),
window_expression(F(func_, expr), **window_kwargs),
)
for expr in self(df)
]
return func
def broadcast(self, kind: Literal[ExprKind.AGGREGATION, ExprKind.LITERAL]) -> Self:
if kind is ExprKind.LITERAL:
return self
if self._backend_version < (1, 3):
msg = "At least version 1.3 of DuckDB is required for binary operations between aggregates and columns."
raise NotImplementedError(msg)
return self.over([lit(1)], [])
@classmethod
def from_column_names(
cls,
evaluate_column_names: EvalNames[DuckDBLazyFrame],
/,
*,
context: _LimitedContext,
) -> Self:
def func(df: DuckDBLazyFrame) -> list[Expression]:
return [col(name) for name in evaluate_column_names(df)]
return cls(
func,
evaluate_output_names=evaluate_column_names,
alias_output_names=None,
version=context._version,
)
@classmethod
def from_column_indices(cls, *column_indices: int, context: _LimitedContext) -> Self:
def func(df: DuckDBLazyFrame) -> list[Expression]:
columns = df.columns
return [col(columns[i]) for i in column_indices]
return cls(
func,
evaluate_output_names=cls._eval_names_indices(column_indices),
alias_output_names=None,
version=context._version,
)
@classmethod
def _from_elementwise_horizontal_op(
cls, func: Callable[[Iterable[Expression]], Expression], *exprs: Self
) -> Self:
def call(df: DuckDBLazyFrame) -> list[Expression]:
cols = (col for _expr in exprs for col in _expr(df))
return [func(cols)]
def window_function(
df: DuckDBLazyFrame, window_inputs: DuckDBWindowInputs
) -> list[Expression]:
cols = (
col for _expr in exprs for col in _expr.window_function(df, window_inputs)
)
return [func(cols)]
context = exprs[0]
return cls(
call=call,
window_function=window_function,
evaluate_output_names=combine_evaluate_output_names(*exprs),
alias_output_names=combine_alias_output_names(*exprs),
version=context._version,
)
def _callable_to_eval_series(
self, call: Callable[..., Expression], /, **expressifiable_args: Self | Any
) -> EvalSeries[DuckDBLazyFrame, Expression]:
def func(df: DuckDBLazyFrame) -> list[Expression]:
native_series_list = self(df)
other_native_series = {
key: df._evaluate_expr(value) if self._is_expr(value) else lit(value)
for key, value in expressifiable_args.items()
}
return [
call(native_series, **other_native_series)
for native_series in native_series_list
]
return func
def _push_down_window_function(
self, call: Callable[..., Expression], /, **expressifiable_args: Self | Any
) -> DuckDBWindowFunction:
def window_f(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> list[Expression]:
# If a function `f` is elementwise, and `g` is another function, then
# - `f(g) over (window)`
# - `f(g over (window))
# are equivalent.
# Make sure to only use with if `call` is elementwise!
native_series_list = self.window_function(df, inputs)
other_native_series = {
key: df._evaluate_window_expr(value, inputs)
if self._is_expr(value)
else lit(value)
for key, value in expressifiable_args.items()
}
return [
call(native_series, **other_native_series)
for native_series in native_series_list
]
return window_f
def _with_callable(
self, call: Callable[..., Expression], /, **expressifiable_args: Self | Any
) -> Self:
"""Create expression from callable.
Arguments:
call: Callable from compliant DataFrame to native Expression
expr_name: Expression name
expressifiable_args: arguments pass to expression which should be parsed
as expressions (e.g. in `nw.col('a').is_between('b', 'c')`)
"""
return self.__class__(
self._callable_to_eval_series(call, **expressifiable_args),
evaluate_output_names=self._evaluate_output_names,
alias_output_names=self._alias_output_names,
version=self._version,
)
def _with_elementwise(
self, call: Callable[..., Expression], /, **expressifiable_args: Self | Any
) -> Self:
return self.__class__(
self._callable_to_eval_series(call, **expressifiable_args),
self._push_down_window_function(call, **expressifiable_args),
evaluate_output_names=self._evaluate_output_names,
alias_output_names=self._alias_output_names,
version=self._version,
)
def _with_binary(self, op: Callable[..., Expression], other: Self | Any) -> Self:
return self.__class__(
self._callable_to_eval_series(op, other=other),
self._push_down_window_function(op, other=other),
evaluate_output_names=self._evaluate_output_names,
alias_output_names=self._alias_output_names,
version=self._version,
)
def _with_alias_output_names(self, func: AliasNames | None, /) -> Self:
return type(self)(
self._call,
self._window_function,
evaluate_output_names=self._evaluate_output_names,
alias_output_names=func,
version=self._version,
)
def _with_window_function(self, window_function: DuckDBWindowFunction) -> Self:
return self.__class__(
self._call,
window_function,
evaluate_output_names=self._evaluate_output_names,
alias_output_names=self._alias_output_names,
version=self._version,
)
@classmethod
def _alias_native(cls, expr: Expression, name: str) -> Expression:
return expr.alias(name)
def __invert__(self) -> Self:
invert = cast("Callable[..., Expression]", operator.invert)
return self._with_elementwise(invert)
def abs(self) -> Self:
return self._with_elementwise(lambda expr: F("abs", expr))
def mean(self) -> Self:
return self._with_callable(lambda expr: F("mean", expr))
def skew(self) -> Self:
def func(expr: Expression) -> Expression:
count = F("count", expr)
# Adjust population skewness by correction factor to get sample skewness
sample_skewness = (
F("skewness", expr)
* (count - lit(2))
/ F("sqrt", count * (count - lit(1)))
)
return when(count == lit(0), lit(None)).otherwise(
when(count == lit(1), lit(float("nan"))).otherwise(
when(count == lit(2), lit(0.0)).otherwise(sample_skewness)
)
)
return self._with_callable(func)
def kurtosis(self) -> Self:
return self._with_callable(lambda expr: F("kurtosis_pop", expr))
def median(self) -> Self:
return self._with_callable(lambda expr: F("median", expr))
def all(self) -> Self:
def f(expr: Expression) -> Expression:
return CoalesceOperator(F("bool_and", expr), lit(True)) # noqa: FBT003
def window_f(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> list[Expression]:
return [
CoalesceOperator(
window_expression(F("bool_and", expr), inputs.partition_by),
lit(True), # noqa: FBT003
)
for expr in self(df)
]
return self._with_callable(f)._with_window_function(window_f)
def any(self) -> Self:
def f(expr: Expression) -> Expression:
return CoalesceOperator(F("bool_or", expr), lit(False)) # noqa: FBT003
def window_f(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> list[Expression]:
return [
CoalesceOperator(
window_expression(F("bool_or", expr), inputs.partition_by),
lit(False), # noqa: FBT003
)
for expr in self(df)
]
return self._with_callable(f)._with_window_function(window_f)
def quantile(
self, quantile: float, interpolation: RollingInterpolationMethod
) -> Self:
def func(expr: Expression) -> Expression:
if interpolation == "linear":
return F("quantile_cont", expr, lit(quantile))
msg = "Only linear interpolation methods are supported for DuckDB quantile."
raise NotImplementedError(msg)
return self._with_callable(func)
def clip(
self,
lower_bound: Self | NumericLiteral | TemporalLiteral | None,
upper_bound: Self | NumericLiteral | TemporalLiteral | None,
) -> Self:
def _clip_lower(expr: Expression, lower_bound: Any) -> Expression:
return F("greatest", expr, lower_bound)
def _clip_upper(expr: Expression, upper_bound: Any) -> Expression:
return F("least", expr, upper_bound)
def _clip_both(
expr: Expression, lower_bound: Any, upper_bound: Any
) -> Expression:
return F("greatest", F("least", expr, upper_bound), lower_bound)
if lower_bound is None:
return self._with_elementwise(_clip_upper, upper_bound=upper_bound)
if upper_bound is None:
return self._with_elementwise(_clip_lower, lower_bound=lower_bound)
return self._with_elementwise(
_clip_both, lower_bound=lower_bound, upper_bound=upper_bound
)
def sum(self) -> Self:
def f(expr: Expression) -> Expression:
return CoalesceOperator(F("sum", expr), lit(0))
def window_f(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> list[Expression]:
return [
CoalesceOperator(
window_expression(F("sum", expr), inputs.partition_by), lit(0)
)
for expr in self(df)
]
return self._with_callable(f)._with_window_function(window_f)
def n_unique(self) -> Self:
def func(expr: Expression) -> Expression:
# https://stackoverflow.com/a/79338887/4451315
return F("array_unique", F("array_agg", expr)) + F(
"max", when(expr.isnotnull(), lit(0)).otherwise(lit(1))
)
return self._with_callable(func)
def count(self) -> Self:
return self._with_callable(lambda expr: F("count", expr))
def len(self) -> Self:
return self._with_callable(lambda _expr: F("count"))
def std(self, ddof: int) -> Self:
if ddof == 0:
return self._with_callable(lambda expr: F("stddev_pop", expr))
if ddof == 1:
return self._with_callable(lambda expr: F("stddev_samp", expr))
def _std(expr: Expression) -> Expression:
n_samples = F("count", expr)
return (
F("stddev_pop", expr)
* F("sqrt", n_samples)
/ (F("sqrt", (n_samples - lit(ddof))))
)
return self._with_callable(_std)
def var(self, ddof: int) -> Self:
if ddof == 0:
return self._with_callable(lambda expr: F("var_pop", expr))
if ddof == 1:
return self._with_callable(lambda expr: F("var_samp", expr))
def _var(expr: Expression) -> Expression:
n_samples = F("count", expr)
return F("var_pop", expr) * n_samples / (n_samples - lit(ddof))
return self._with_callable(_var)
def max(self) -> Self:
return self._with_callable(lambda expr: F("max", expr))
def min(self) -> Self:
return self._with_callable(lambda expr: F("min", expr))
def null_count(self) -> Self:
return self._with_callable(lambda expr: F("sum", expr.isnull().cast("int")))
@requires.backend_version((1, 3))
def over(
self, partition_by: Sequence[str | Expression], order_by: Sequence[str]
) -> Self:
def func(df: DuckDBLazyFrame) -> Sequence[Expression]:
return self.window_function(df, WindowInputs(partition_by, order_by))
return self.__class__(
func,
evaluate_output_names=self._evaluate_output_names,
alias_output_names=self._alias_output_names,
version=self._version,
)
def is_null(self) -> Self:
return self._with_elementwise(lambda expr: expr.isnull())
def is_nan(self) -> Self:
return self._with_elementwise(lambda expr: F("isnan", expr))
def is_finite(self) -> Self:
return self._with_elementwise(lambda expr: F("isfinite", expr))
def is_in(self, other: Sequence[Any]) -> Self:
return self._with_elementwise(lambda expr: F("contains", lit(other), expr))
def round(self, decimals: int) -> Self:
return self._with_elementwise(lambda expr: F("round", expr, lit(decimals)))
@requires.backend_version((1, 3))
def shift(self, n: int) -> Self:
def func(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> Sequence[Expression]:
return [
window_expression(
F("lag", expr, lit(n)), inputs.partition_by, inputs.order_by
)
for expr in self(df)
]
return self._with_window_function(func)
@requires.backend_version((1, 3))
def is_first_distinct(self) -> Self:
def func(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> Sequence[Expression]:
return [
window_expression(
F("row_number"), (*inputs.partition_by, expr), inputs.order_by
)
== lit(1)
for expr in self(df)
]
return self._with_window_function(func)
@requires.backend_version((1, 3))
def is_last_distinct(self) -> Self:
def func(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> Sequence[Expression]:
return [
window_expression(
F("row_number"),
(*inputs.partition_by, expr),
inputs.order_by,
descending=[True] * len(inputs.order_by),
nulls_last=[True] * len(inputs.order_by),
)
== lit(1)
for expr in self(df)
]
return self._with_window_function(func)
@requires.backend_version((1, 3))
def diff(self) -> Self:
def func(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> list[Expression]:
return [
expr
- window_expression(F("lag", expr), inputs.partition_by, inputs.order_by)
for expr in self(df)
]
return self._with_window_function(func)
@requires.backend_version((1, 3))
def cum_sum(self, *, reverse: bool) -> Self:
return self._with_window_function(self._cum_window_func("sum", reverse=reverse))
@requires.backend_version((1, 3))
def cum_max(self, *, reverse: bool) -> Self:
return self._with_window_function(self._cum_window_func("max", reverse=reverse))
@requires.backend_version((1, 3))
def cum_min(self, *, reverse: bool) -> Self:
return self._with_window_function(self._cum_window_func("min", reverse=reverse))
@requires.backend_version((1, 3))
def cum_count(self, *, reverse: bool) -> Self:
return self._with_window_function(self._cum_window_func("count", reverse=reverse))
@requires.backend_version((1, 3))
def cum_prod(self, *, reverse: bool) -> Self:
return self._with_window_function(
self._cum_window_func("product", reverse=reverse)
)
@requires.backend_version((1, 3))
def rolling_sum(self, window_size: int, *, min_samples: int, center: bool) -> Self:
return self._with_window_function(
self._rolling_window_func("sum", window_size, min_samples, center=center)
)
@requires.backend_version((1, 3))
def rolling_mean(self, window_size: int, *, min_samples: int, center: bool) -> Self:
return self._with_window_function(
self._rolling_window_func("mean", window_size, min_samples, center=center)
)
@requires.backend_version((1, 3))
def rolling_var(
self, window_size: int, *, min_samples: int, center: bool, ddof: int
) -> Self:
return self._with_window_function(
self._rolling_window_func(
"var", window_size, min_samples, ddof=ddof, center=center
)
)
@requires.backend_version((1, 3))
def rolling_std(
self, window_size: int, *, min_samples: int, center: bool, ddof: int
) -> Self:
return self._with_window_function(
self._rolling_window_func(
"std", window_size, min_samples, ddof=ddof, center=center
)
)
def fill_null(
self,
value: Self | NonNestedLiteral,
strategy: FillNullStrategy | None,
limit: int | None,
) -> Self:
if strategy is not None:
if self._backend_version < (1, 3): # pragma: no cover
msg = f"`fill_null` with `strategy={strategy}` is only available in 'duckdb>=1.3.0'."
raise NotImplementedError(msg)
def _fill_with_strategy(
df: DuckDBLazyFrame, inputs: DuckDBWindowInputs
) -> Sequence[Expression]:
fill_func = "last_value" if strategy == "forward" else "first_value"
_limit = "unbounded" if limit is None else limit
rows_start, rows_end = (
(f"{_limit} preceding", "current row")
if strategy == "forward"
else ("current row", f"{_limit} following")
)
return [
window_expression(
F(fill_func, expr),
inputs.partition_by,
inputs.order_by,
rows_start=rows_start,
rows_end=rows_end,
ignore_nulls=True,
)
for expr in self(df)
]
return self._with_window_function(_fill_with_strategy)
def _fill_constant(expr: Expression, value: Any) -> Expression:
return CoalesceOperator(expr, value)
return self._with_elementwise(_fill_constant, value=value)
def cast(self, dtype: IntoDType) -> Self:
def func(df: DuckDBLazyFrame) -> list[Expression]:
tz = DeferredTimeZone(df.native)
native_dtype = narwhals_to_native_dtype(dtype, self._version, tz)
return [expr.cast(DuckDBPyType(native_dtype)) for expr in self(df)]
def window_f(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> list[Expression]:
tz = DeferredTimeZone(df.native)
native_dtype = narwhals_to_native_dtype(dtype, self._version, tz)
return [
expr.cast(DuckDBPyType(native_dtype))
for expr in self.window_function(df, inputs)
]
return self.__class__(
func,
window_f,
evaluate_output_names=self._evaluate_output_names,
alias_output_names=self._alias_output_names,
version=self._version,
)
@requires.backend_version((1, 3))
def is_unique(self) -> Self:
def _is_unique(expr: Expression, *partition_by: str | Expression) -> Expression:
return window_expression(
F("count", StarExpression()), (expr, *partition_by)
) == lit(1)
def _unpartitioned_is_unique(expr: Expression) -> Expression:
return _is_unique(expr)
def _partitioned_is_unique(
df: DuckDBLazyFrame, inputs: DuckDBWindowInputs
) -> Sequence[Expression]:
assert not inputs.order_by # noqa: S101
return [_is_unique(expr, *inputs.partition_by) for expr in self(df)]
return self._with_callable(_unpartitioned_is_unique)._with_window_function(
_partitioned_is_unique
)
@requires.backend_version((1, 3))
def rank(self, method: RankMethod, *, descending: bool) -> Self:
if method in {"min", "max", "average"}:
func = F("rank")
elif method == "dense":
func = F("dense_rank")
else: # method == "ordinal"
func = F("row_number")
def _rank(
expr: Expression,
partition_by: Sequence[str | Expression] = (),
order_by: Sequence[str | Expression] = (),
*,
descending: Sequence[bool],
nulls_last: Sequence[bool],
) -> Expression:
count_expr = F("count", StarExpression())
window_kwargs: WindowExpressionKwargs = {
"partition_by": partition_by,
"order_by": (expr, *order_by),
"descending": descending,
"nulls_last": nulls_last,
}
count_window_kwargs: WindowExpressionKwargs = {
"partition_by": (*partition_by, expr)
}
if method == "max":
rank_expr = (
window_expression(func, **window_kwargs)
+ window_expression(count_expr, **count_window_kwargs)
- lit(1)
)
elif method == "average":
rank_expr = window_expression(func, **window_kwargs) + (
window_expression(count_expr, **count_window_kwargs) - lit(1)
) / lit(2.0)
else:
rank_expr = window_expression(func, **window_kwargs)
return when(expr.isnotnull(), rank_expr)
def _unpartitioned_rank(expr: Expression) -> Expression:
return _rank(expr, descending=[descending], nulls_last=[True])
def _partitioned_rank(
df: DuckDBLazyFrame, inputs: DuckDBWindowInputs
) -> Sequence[Expression]:
# node: when `descending` / `nulls_last` are supported in `.over`, they should be respected here
# https://github.com/narwhals-dev/narwhals/issues/2790
return [
_rank(
expr,
inputs.partition_by,
inputs.order_by,
descending=[descending] + [False] * len(inputs.order_by),
nulls_last=[True] + [False] * len(inputs.order_by),
)
for expr in self(df)
]
return self._with_callable(_unpartitioned_rank)._with_window_function(
_partitioned_rank
)
def log(self, base: float) -> Self:
def _log(expr: Expression) -> Expression:
log = F("log", expr)
return (
when(expr < lit(0), lit(float("nan")))
.when(expr == lit(0), lit(float("-inf")))
.otherwise(log / F("log", lit(base)))
)
return self._with_elementwise(_log)
def exp(self) -> Self:
def _exp(expr: Expression) -> Expression:
return F("exp", expr)
return self._with_elementwise(_exp)
def sqrt(self) -> Self:
def _sqrt(expr: Expression) -> Expression:
return when(expr < lit(0), lit(float("nan"))).otherwise(F("sqrt", expr))
return self._with_elementwise(_sqrt)
@property
def str(self) -> DuckDBExprStringNamespace:
return DuckDBExprStringNamespace(self)
@property
def dt(self) -> DuckDBExprDateTimeNamespace:
return DuckDBExprDateTimeNamespace(self)
@property
def list(self) -> DuckDBExprListNamespace:
return DuckDBExprListNamespace(self)
@property
def struct(self) -> DuckDBExprStructNamespace:
return DuckDBExprStructNamespace(self)
drop_nulls = not_implemented()
unique = not_implemented()

View File

@@ -0,0 +1,157 @@
from __future__ import annotations
from typing import TYPE_CHECKING
from narwhals._compliant import LazyExprNamespace
from narwhals._compliant.any_namespace import DateTimeNamespace
from narwhals._constants import (
MS_PER_MINUTE,
MS_PER_SECOND,
NS_PER_SECOND,
SECONDS_PER_MINUTE,
US_PER_MINUTE,
US_PER_SECOND,
)
from narwhals._duckdb.utils import UNITS_DICT, F, fetch_rel_time_zone, lit
from narwhals._duration import Interval
from narwhals._utils import not_implemented
if TYPE_CHECKING:
from collections.abc import Sequence
from duckdb import Expression
from narwhals._duckdb.dataframe import DuckDBLazyFrame
from narwhals._duckdb.expr import DuckDBExpr
class DuckDBExprDateTimeNamespace(
LazyExprNamespace["DuckDBExpr"], DateTimeNamespace["DuckDBExpr"]
):
def year(self) -> DuckDBExpr:
return self.compliant._with_elementwise(lambda expr: F("year", expr))
def month(self) -> DuckDBExpr:
return self.compliant._with_elementwise(lambda expr: F("month", expr))
def day(self) -> DuckDBExpr:
return self.compliant._with_elementwise(lambda expr: F("day", expr))
def hour(self) -> DuckDBExpr:
return self.compliant._with_elementwise(lambda expr: F("hour", expr))
def minute(self) -> DuckDBExpr:
return self.compliant._with_elementwise(lambda expr: F("minute", expr))
def second(self) -> DuckDBExpr:
return self.compliant._with_elementwise(lambda expr: F("second", expr))
def millisecond(self) -> DuckDBExpr:
return self.compliant._with_elementwise(
lambda expr: F("millisecond", expr) - F("second", expr) * lit(MS_PER_SECOND)
)
def microsecond(self) -> DuckDBExpr:
return self.compliant._with_elementwise(
lambda expr: F("microsecond", expr) - F("second", expr) * lit(US_PER_SECOND)
)
def nanosecond(self) -> DuckDBExpr:
return self.compliant._with_elementwise(
lambda expr: F("nanosecond", expr) - F("second", expr) * lit(NS_PER_SECOND)
)
def to_string(self, format: str) -> DuckDBExpr:
return self.compliant._with_elementwise(
lambda expr: F("strftime", expr, lit(format))
)
def weekday(self) -> DuckDBExpr:
return self.compliant._with_elementwise(lambda expr: F("isodow", expr))
def ordinal_day(self) -> DuckDBExpr:
return self.compliant._with_elementwise(lambda expr: F("dayofyear", expr))
def date(self) -> DuckDBExpr:
return self.compliant._with_elementwise(lambda expr: expr.cast("date"))
def total_minutes(self) -> DuckDBExpr:
return self.compliant._with_elementwise(
lambda expr: F("datepart", lit("minute"), expr)
)
def total_seconds(self) -> DuckDBExpr:
return self.compliant._with_elementwise(
lambda expr: lit(SECONDS_PER_MINUTE) * F("datepart", lit("minute"), expr)
+ F("datepart", lit("second"), expr)
)
def total_milliseconds(self) -> DuckDBExpr:
return self.compliant._with_elementwise(
lambda expr: lit(MS_PER_MINUTE) * F("datepart", lit("minute"), expr)
+ F("datepart", lit("millisecond"), expr)
)
def total_microseconds(self) -> DuckDBExpr:
return self.compliant._with_elementwise(
lambda expr: lit(US_PER_MINUTE) * F("datepart", lit("minute"), expr)
+ F("datepart", lit("microsecond"), expr)
)
def truncate(self, every: str) -> DuckDBExpr:
interval = Interval.parse(every)
multiple, unit = interval.multiple, interval.unit
if multiple != 1:
# https://github.com/duckdb/duckdb/issues/17554
msg = f"Only multiple 1 is currently supported for DuckDB.\nGot {multiple!s}."
raise ValueError(msg)
if unit == "ns":
msg = "Truncating to nanoseconds is not yet supported for DuckDB."
raise NotImplementedError(msg)
format = lit(UNITS_DICT[unit])
def _truncate(expr: Expression) -> Expression:
return F("date_trunc", format, expr)
return self.compliant._with_elementwise(_truncate)
def offset_by(self, by: str) -> DuckDBExpr:
interval = Interval.parse_no_constraints(by)
format = lit(f"{interval.multiple!s} {UNITS_DICT[interval.unit]}")
def _offset_by(expr: Expression) -> Expression:
return F("date_add", format, expr)
return self.compliant._with_callable(_offset_by)
def _no_op_time_zone(self, time_zone: str) -> DuckDBExpr:
def func(df: DuckDBLazyFrame) -> Sequence[Expression]:
native_series_list = self.compliant(df)
conn_time_zone = fetch_rel_time_zone(df.native)
if conn_time_zone != time_zone:
msg = (
"DuckDB stores the time zone in the connection, rather than in the "
f"data type, so changing the timezone to anything other than {conn_time_zone} "
" (the current connection time zone) is not supported."
)
raise NotImplementedError(msg)
return native_series_list
return self.compliant.__class__(
func,
evaluate_output_names=self.compliant._evaluate_output_names,
alias_output_names=self.compliant._alias_output_names,
version=self.compliant._version,
)
def convert_time_zone(self, time_zone: str) -> DuckDBExpr:
return self._no_op_time_zone(time_zone)
def replace_time_zone(self, time_zone: str | None) -> DuckDBExpr:
if time_zone is None:
return self.compliant._with_elementwise(lambda expr: expr.cast("timestamp"))
else:
return self._no_op_time_zone(time_zone)
total_nanoseconds = not_implemented()
timestamp = not_implemented()

View File

@@ -0,0 +1,17 @@
from __future__ import annotations
from typing import TYPE_CHECKING
from narwhals._compliant import LazyExprNamespace
from narwhals._compliant.any_namespace import ListNamespace
from narwhals._duckdb.utils import F
if TYPE_CHECKING:
from narwhals._duckdb.expr import DuckDBExpr
class DuckDBExprListNamespace(
LazyExprNamespace["DuckDBExpr"], ListNamespace["DuckDBExpr"]
):
def len(self) -> DuckDBExpr:
return self.compliant._with_elementwise(lambda expr: F("len", expr))

View File

@@ -0,0 +1,128 @@
from __future__ import annotations
from typing import TYPE_CHECKING
from narwhals._compliant import LazyExprNamespace
from narwhals._compliant.any_namespace import StringNamespace
from narwhals._duckdb.utils import F, lit, when
from narwhals._utils import not_implemented
if TYPE_CHECKING:
from duckdb import Expression
from narwhals._duckdb.expr import DuckDBExpr
class DuckDBExprStringNamespace(
LazyExprNamespace["DuckDBExpr"], StringNamespace["DuckDBExpr"]
):
def starts_with(self, prefix: str) -> DuckDBExpr:
return self.compliant._with_elementwise(
lambda expr: F("starts_with", expr, lit(prefix))
)
def ends_with(self, suffix: str) -> DuckDBExpr:
return self.compliant._with_elementwise(
lambda expr: F("ends_with", expr, lit(suffix))
)
def contains(self, pattern: str, *, literal: bool) -> DuckDBExpr:
def func(expr: Expression) -> Expression:
if literal:
return F("contains", expr, lit(pattern))
return F("regexp_matches", expr, lit(pattern))
return self.compliant._with_elementwise(func)
def slice(self, offset: int, length: int | None) -> DuckDBExpr:
def func(expr: Expression) -> Expression:
offset_lit = lit(offset)
return F(
"array_slice",
expr,
lit(offset + 1)
if offset >= 0
else F("length", expr) + offset_lit + lit(1),
F("length", expr) if length is None else lit(length) + offset_lit,
)
return self.compliant._with_elementwise(func)
def split(self, by: str) -> DuckDBExpr:
return self.compliant._with_elementwise(
lambda expr: F("str_split", expr, lit(by))
)
def len_chars(self) -> DuckDBExpr:
return self.compliant._with_elementwise(lambda expr: F("length", expr))
def to_lowercase(self) -> DuckDBExpr:
return self.compliant._with_elementwise(lambda expr: F("lower", expr))
def to_uppercase(self) -> DuckDBExpr:
return self.compliant._with_elementwise(lambda expr: F("upper", expr))
def strip_chars(self, characters: str | None) -> DuckDBExpr:
import string
return self.compliant._with_elementwise(
lambda expr: F(
"trim", expr, lit(string.whitespace if characters is None else characters)
)
)
def replace_all(self, pattern: str, value: str, *, literal: bool) -> DuckDBExpr:
if not literal:
return self.compliant._with_elementwise(
lambda expr: F("regexp_replace", expr, lit(pattern), lit(value), lit("g"))
)
return self.compliant._with_elementwise(
lambda expr: F("replace", expr, lit(pattern), lit(value))
)
def to_datetime(self, format: str | None) -> DuckDBExpr:
if format is None:
msg = "Cannot infer format with DuckDB backend, please specify `format` explicitly."
raise NotImplementedError(msg)
return self.compliant._with_elementwise(
lambda expr: F("strptime", expr, lit(format))
)
def to_date(self, format: str | None) -> DuckDBExpr:
if format is not None:
return self.to_datetime(format=format).dt.date()
compliant_expr = self.compliant
return compliant_expr.cast(compliant_expr._version.dtypes.Date())
def zfill(self, width: int) -> DuckDBExpr:
# DuckDB does not have a built-in zfill function, so we need to implement it manually
# using string manipulation functions.
def func(expr: Expression) -> Expression:
less_than_width = F("length", expr) < lit(width)
zero, hyphen, plus = lit("0"), lit("-"), lit("+")
starts_with_minus = F("starts_with", expr, hyphen)
starts_with_plus = F("starts_with", expr, plus)
substring = F("substr", expr, lit(2))
padded_substring = F("lpad", substring, lit(width - 1), zero)
return (
when(
starts_with_minus & less_than_width,
F("concat", hyphen, padded_substring),
)
.when(
starts_with_plus & less_than_width,
F("concat", plus, padded_substring),
)
.when(less_than_width, F("lpad", expr, lit(width), zero))
.otherwise(expr)
)
# can't use `_with_elementwise` due to `when` operator.
# TODO(unassigned): implement `window_func` like we do in `Expr.cast`
return self.compliant._with_callable(func)
replace = not_implemented()

View File

@@ -0,0 +1,19 @@
from __future__ import annotations
from typing import TYPE_CHECKING
from narwhals._compliant import LazyExprNamespace
from narwhals._compliant.any_namespace import StructNamespace
from narwhals._duckdb.utils import F, lit
if TYPE_CHECKING:
from narwhals._duckdb.expr import DuckDBExpr
class DuckDBExprStructNamespace(
LazyExprNamespace["DuckDBExpr"], StructNamespace["DuckDBExpr"]
):
def field(self, name: str) -> DuckDBExpr:
return self.compliant._with_elementwise(
lambda expr: F("struct_extract", expr, lit(name))
).alias(name)

View File

@@ -0,0 +1,33 @@
from __future__ import annotations
from itertools import chain
from typing import TYPE_CHECKING
from narwhals._compliant import LazyGroupBy
if TYPE_CHECKING:
from collections.abc import Sequence
from duckdb import Expression # noqa: F401
from narwhals._duckdb.dataframe import DuckDBLazyFrame
from narwhals._duckdb.expr import DuckDBExpr
class DuckDBGroupBy(LazyGroupBy["DuckDBLazyFrame", "DuckDBExpr", "Expression"]):
def __init__(
self,
df: DuckDBLazyFrame,
keys: Sequence[DuckDBExpr] | Sequence[str],
/,
*,
drop_null_keys: bool,
) -> None:
frame, self._keys, self._output_key_names = self._parse_keys(df, keys=keys)
self._compliant_frame = frame.drop_nulls(self._keys) if drop_null_keys else frame
def agg(self, *exprs: DuckDBExpr) -> DuckDBLazyFrame:
agg_columns = list(chain(self._keys, self._evaluate_exprs(exprs)))
return self.compliant._with_native(
self.compliant.native.aggregate(agg_columns) # type: ignore[arg-type]
).rename(dict(zip(self._keys, self._output_key_names)))

View File

@@ -0,0 +1,202 @@
from __future__ import annotations
import operator
from functools import reduce
from itertools import chain
from typing import TYPE_CHECKING
import duckdb
from duckdb import CoalesceOperator, Expression
from duckdb.typing import BIGINT, VARCHAR
from narwhals._compliant import LazyNamespace, LazyThen, LazyWhen
from narwhals._duckdb.dataframe import DuckDBLazyFrame
from narwhals._duckdb.expr import DuckDBExpr
from narwhals._duckdb.selectors import DuckDBSelectorNamespace
from narwhals._duckdb.utils import (
DeferredTimeZone,
F,
concat_str,
lit,
narwhals_to_native_dtype,
when,
)
from narwhals._expression_parsing import (
combine_alias_output_names,
combine_evaluate_output_names,
)
from narwhals._utils import Implementation
if TYPE_CHECKING:
from collections.abc import Iterable, Sequence
from narwhals._duckdb.expr import DuckDBWindowInputs
from narwhals._utils import Version
from narwhals.typing import ConcatMethod, IntoDType, NonNestedLiteral
class DuckDBNamespace(
LazyNamespace[DuckDBLazyFrame, DuckDBExpr, duckdb.DuckDBPyRelation]
):
_implementation: Implementation = Implementation.DUCKDB
def __init__(self, *, version: Version) -> None:
self._version = version
@property
def selectors(self) -> DuckDBSelectorNamespace:
return DuckDBSelectorNamespace.from_namespace(self)
@property
def _expr(self) -> type[DuckDBExpr]:
return DuckDBExpr
@property
def _lazyframe(self) -> type[DuckDBLazyFrame]:
return DuckDBLazyFrame
def concat(
self, items: Iterable[DuckDBLazyFrame], *, how: ConcatMethod
) -> DuckDBLazyFrame:
native_items = [item._native_frame for item in items]
items = list(items)
first = items[0]
schema = first.schema
if how == "vertical" and not all(x.schema == schema for x in items[1:]):
msg = "inputs should all have the same schema"
raise TypeError(msg)
res = reduce(lambda x, y: x.union(y), native_items)
return first._with_native(res)
def concat_str(
self, *exprs: DuckDBExpr, separator: str, ignore_nulls: bool
) -> DuckDBExpr:
def func(df: DuckDBLazyFrame) -> list[Expression]:
cols = list(chain.from_iterable(expr(df) for expr in exprs))
if not ignore_nulls:
null_mask_result = reduce(operator.or_, (s.isnull() for s in cols))
cols_separated = [
y
for x in [
(col.cast(VARCHAR),)
if i == len(cols) - 1
else (col.cast(VARCHAR), lit(separator))
for i, col in enumerate(cols)
]
for y in x
]
return [when(~null_mask_result, concat_str(*cols_separated))]
else:
return [concat_str(*cols, separator=separator)]
return self._expr(
call=func,
evaluate_output_names=combine_evaluate_output_names(*exprs),
alias_output_names=combine_alias_output_names(*exprs),
version=self._version,
)
def all_horizontal(self, *exprs: DuckDBExpr, ignore_nulls: bool) -> DuckDBExpr:
def func(cols: Iterable[Expression]) -> Expression:
it = (
(CoalesceOperator(expr, lit(True)) for expr in cols) # noqa: FBT003
if ignore_nulls
else cols
)
return reduce(operator.and_, it)
return self._expr._from_elementwise_horizontal_op(func, *exprs)
def any_horizontal(self, *exprs: DuckDBExpr, ignore_nulls: bool) -> DuckDBExpr:
def func(cols: Iterable[Expression]) -> Expression:
it = (
(CoalesceOperator(expr, lit(False)) for expr in cols) # noqa: FBT003
if ignore_nulls
else cols
)
return reduce(operator.or_, it)
return self._expr._from_elementwise_horizontal_op(func, *exprs)
def max_horizontal(self, *exprs: DuckDBExpr) -> DuckDBExpr:
def func(cols: Iterable[Expression]) -> Expression:
return F("greatest", *cols)
return self._expr._from_elementwise_horizontal_op(func, *exprs)
def min_horizontal(self, *exprs: DuckDBExpr) -> DuckDBExpr:
def func(cols: Iterable[Expression]) -> Expression:
return F("least", *cols)
return self._expr._from_elementwise_horizontal_op(func, *exprs)
def sum_horizontal(self, *exprs: DuckDBExpr) -> DuckDBExpr:
def func(cols: Iterable[Expression]) -> Expression:
return reduce(operator.add, (CoalesceOperator(col, lit(0)) for col in cols))
return self._expr._from_elementwise_horizontal_op(func, *exprs)
def mean_horizontal(self, *exprs: DuckDBExpr) -> DuckDBExpr:
def func(cols: Iterable[Expression]) -> Expression:
cols = list(cols)
return reduce(
operator.add, (CoalesceOperator(col, lit(0)) for col in cols)
) / reduce(operator.add, (col.isnotnull().cast(BIGINT) for col in cols))
return self._expr._from_elementwise_horizontal_op(func, *exprs)
def when(self, predicate: DuckDBExpr) -> DuckDBWhen:
return DuckDBWhen.from_expr(predicate, context=self)
def lit(self, value: NonNestedLiteral, dtype: IntoDType | None) -> DuckDBExpr:
def func(df: DuckDBLazyFrame) -> list[Expression]:
tz = DeferredTimeZone(df.native)
if dtype is not None:
target = narwhals_to_native_dtype(dtype, self._version, tz)
return [lit(value).cast(target)] # type: ignore[arg-type]
return [lit(value)]
return self._expr(
func,
evaluate_output_names=lambda _df: ["literal"],
alias_output_names=None,
version=self._version,
)
def len(self) -> DuckDBExpr:
def func(_df: DuckDBLazyFrame) -> list[Expression]:
return [F("count")]
return self._expr(
call=func,
evaluate_output_names=lambda _df: ["len"],
alias_output_names=None,
version=self._version,
)
def coalesce(self, *exprs: DuckDBExpr) -> DuckDBExpr:
def func(cols: Iterable[Expression]) -> Expression:
return CoalesceOperator(*cols)
return self._expr._from_elementwise_horizontal_op(func, *exprs)
class DuckDBWhen(LazyWhen["DuckDBLazyFrame", Expression, DuckDBExpr]):
@property
def _then(self) -> type[DuckDBThen]:
return DuckDBThen
def __call__(self, df: DuckDBLazyFrame) -> Sequence[Expression]:
self.when = when
self.lit = lit
return super().__call__(df)
def _window_function(
self, df: DuckDBLazyFrame, window_inputs: DuckDBWindowInputs
) -> Sequence[Expression]:
self.when = when
self.lit = lit
return super()._window_function(df, window_inputs)
class DuckDBThen(LazyThen["DuckDBLazyFrame", Expression, DuckDBExpr], DuckDBExpr): ...

View File

@@ -0,0 +1,30 @@
from __future__ import annotations
from typing import TYPE_CHECKING
from narwhals._compliant import CompliantSelector, LazySelectorNamespace
from narwhals._duckdb.expr import DuckDBExpr
if TYPE_CHECKING:
from duckdb import Expression # noqa: F401
from narwhals._duckdb.dataframe import DuckDBLazyFrame # noqa: F401
class DuckDBSelectorNamespace(LazySelectorNamespace["DuckDBLazyFrame", "Expression"]):
@property
def _selector(self) -> type[DuckDBSelector]:
return DuckDBSelector
class DuckDBSelector( # type: ignore[misc]
CompliantSelector["DuckDBLazyFrame", "Expression"], DuckDBExpr
):
def _to_expr(self) -> DuckDBExpr:
return DuckDBExpr(
self._call,
self._window_function,
evaluate_output_names=self._evaluate_output_names,
alias_output_names=self._alias_output_names,
version=self._version,
)

View File

@@ -0,0 +1,44 @@
from __future__ import annotations
from typing import TYPE_CHECKING
from narwhals._duckdb.utils import DeferredTimeZone, native_to_narwhals_dtype
from narwhals.dependencies import get_duckdb
if TYPE_CHECKING:
from types import ModuleType
import duckdb
from typing_extensions import Never, Self
from narwhals._utils import Version
from narwhals.dtypes import DType
class DuckDBInterchangeSeries:
def __init__(self, df: duckdb.DuckDBPyRelation, version: Version) -> None:
self._native_series = df
self._version = version
def __narwhals_series__(self) -> Self:
return self
def __native_namespace__(self) -> ModuleType:
return get_duckdb() # type: ignore[no-any-return]
@property
def dtype(self) -> DType:
return native_to_narwhals_dtype(
self._native_series.types[0],
self._version,
DeferredTimeZone(self._native_series),
)
def __getattr__(self, attr: str) -> Never:
msg = ( # pragma: no cover
f"Attribute {attr} is not supported for interchange-level dataframes.\n\n"
"If you would like to see this kind of object better supported in "
"Narwhals, please open a feature request "
"at https://github.com/narwhals-dev/narwhals/issues."
)
raise NotImplementedError(msg) # pragma: no cover

View File

@@ -0,0 +1,18 @@
from __future__ import annotations
from typing import TYPE_CHECKING, TypedDict
if TYPE_CHECKING:
from collections.abc import Sequence
from duckdb import Expression
class WindowExpressionKwargs(TypedDict, total=False):
partition_by: Sequence[str | Expression]
order_by: Sequence[str | Expression]
rows_start: str
rows_end: str
descending: Sequence[bool]
nulls_last: Sequence[bool]
ignore_nulls: bool

View File

@@ -0,0 +1,371 @@
from __future__ import annotations
from functools import lru_cache
from typing import TYPE_CHECKING, Any
import duckdb
from narwhals._utils import Version, isinstance_or_issubclass
from narwhals.exceptions import ColumnNotFoundError
if TYPE_CHECKING:
from collections.abc import Sequence
from duckdb import DuckDBPyRelation, Expression
from duckdb.typing import DuckDBPyType
from narwhals._compliant.typing import CompliantLazyFrameAny
from narwhals._duckdb.dataframe import DuckDBLazyFrame
from narwhals._duckdb.expr import DuckDBExpr
from narwhals.dtypes import DType
from narwhals.typing import IntoDType
UNITS_DICT = {
"y": "year",
"q": "quarter",
"mo": "month",
"d": "day",
"h": "hour",
"m": "minute",
"s": "second",
"ms": "millisecond",
"us": "microsecond",
"ns": "nanosecond",
}
UNIT_TO_TIMESTAMPS = {
"s": "TIMESTAMP_S",
"ms": "TIMESTAMP_MS",
"us": "TIMESTAMP",
"ns": "TIMESTAMP_NS",
}
DESCENDING_TO_ORDER = {True: "desc", False: "asc"}
NULLS_LAST_TO_NULLS_POS = {True: "nulls last", False: "nulls first"}
col = duckdb.ColumnExpression
"""Alias for `duckdb.ColumnExpression`."""
lit = duckdb.ConstantExpression
"""Alias for `duckdb.ConstantExpression`."""
when = duckdb.CaseExpression
"""Alias for `duckdb.CaseExpression`."""
F = duckdb.FunctionExpression
"""Alias for `duckdb.FunctionExpression`."""
def concat_str(*exprs: Expression, separator: str = "") -> Expression:
"""Concatenate many strings, NULL inputs are skipped.
Wraps [concat] and [concat_ws] `FunctionExpression`(s).
Arguments:
exprs: Native columns.
separator: String that will be used to separate the values of each column.
Returns:
A new native expression.
[concat]: https://duckdb.org/docs/stable/sql/functions/char.html#concatstring-
[concat_ws]: https://duckdb.org/docs/stable/sql/functions/char.html#concat_wsseparator-string-
"""
return F("concat_ws", lit(separator), *exprs) if separator else F("concat", *exprs)
def evaluate_exprs(
df: DuckDBLazyFrame, /, *exprs: DuckDBExpr
) -> list[tuple[str, Expression]]:
native_results: list[tuple[str, Expression]] = []
for expr in exprs:
native_series_list = expr._call(df)
output_names = expr._evaluate_output_names(df)
if expr._alias_output_names is not None:
output_names = expr._alias_output_names(output_names)
if len(output_names) != len(native_series_list): # pragma: no cover
msg = f"Internal error: got output names {output_names}, but only got {len(native_series_list)} results"
raise AssertionError(msg)
native_results.extend(zip(output_names, native_series_list))
return native_results
class DeferredTimeZone:
"""Object which gets passed between `native_to_narwhals_dtype` calls.
DuckDB stores the time zone in the connection, rather than in the dtypes, so
this ensures that when calculating the schema of a dataframe with multiple
timezone-aware columns, that the connection's time zone is only fetched once.
Note: we cannot make the time zone a cached `DuckDBLazyFrame` property because
the time zone can be modified after `DuckDBLazyFrame` creation:
```python
df = nw.from_native(rel)
print(df.collect_schema())
rel.query("set timezone = 'Asia/Kolkata'")
print(df.collect_schema()) # should change to reflect new time zone
```
"""
_cached_time_zone: str | None = None
def __init__(self, rel: DuckDBPyRelation) -> None:
self._rel = rel
@property
def time_zone(self) -> str:
"""Fetch relation time zone (if it wasn't calculated already)."""
if self._cached_time_zone is None:
self._cached_time_zone = fetch_rel_time_zone(self._rel)
return self._cached_time_zone
def native_to_narwhals_dtype(
duckdb_dtype: DuckDBPyType, version: Version, deferred_time_zone: DeferredTimeZone
) -> DType:
duckdb_dtype_id = duckdb_dtype.id
dtypes = version.dtypes
# Handle nested data types first
if duckdb_dtype_id == "list":
return dtypes.List(
native_to_narwhals_dtype(duckdb_dtype.child, version, deferred_time_zone)
)
if duckdb_dtype_id == "struct":
children = duckdb_dtype.children
return dtypes.Struct(
[
dtypes.Field(
name=child[0],
dtype=native_to_narwhals_dtype(child[1], version, deferred_time_zone),
)
for child in children
]
)
if duckdb_dtype_id == "array":
child, size = duckdb_dtype.children
shape: list[int] = [size[1]]
while child[1].id == "array":
child, size = child[1].children
shape.insert(0, size[1])
inner = native_to_narwhals_dtype(child[1], version, deferred_time_zone)
return dtypes.Array(inner=inner, shape=tuple(shape))
if duckdb_dtype_id == "enum":
if version is Version.V1:
return dtypes.Enum() # type: ignore[call-arg]
categories = duckdb_dtype.children[0][1]
return dtypes.Enum(categories=categories)
if duckdb_dtype_id == "timestamp with time zone":
return dtypes.Datetime(time_zone=deferred_time_zone.time_zone)
return _non_nested_native_to_narwhals_dtype(duckdb_dtype_id, version)
def fetch_rel_time_zone(rel: duckdb.DuckDBPyRelation) -> str:
result = rel.query(
"duckdb_settings()", "select value from duckdb_settings() where name = 'TimeZone'"
).fetchone()
assert result is not None # noqa: S101
return result[0] # type: ignore[no-any-return]
@lru_cache(maxsize=16)
def _non_nested_native_to_narwhals_dtype(duckdb_dtype_id: str, version: Version) -> DType:
dtypes = version.dtypes
return {
"hugeint": dtypes.Int128(),
"bigint": dtypes.Int64(),
"integer": dtypes.Int32(),
"smallint": dtypes.Int16(),
"tinyint": dtypes.Int8(),
"uhugeint": dtypes.UInt128(),
"ubigint": dtypes.UInt64(),
"uinteger": dtypes.UInt32(),
"usmallint": dtypes.UInt16(),
"utinyint": dtypes.UInt8(),
"double": dtypes.Float64(),
"float": dtypes.Float32(),
"varchar": dtypes.String(),
"date": dtypes.Date(),
"timestamp_s": dtypes.Datetime("s"),
"timestamp_ms": dtypes.Datetime("ms"),
"timestamp": dtypes.Datetime(),
"timestamp_ns": dtypes.Datetime("ns"),
"boolean": dtypes.Boolean(),
"interval": dtypes.Duration(),
"decimal": dtypes.Decimal(),
"time": dtypes.Time(),
"blob": dtypes.Binary(),
}.get(duckdb_dtype_id, dtypes.Unknown())
def narwhals_to_native_dtype( # noqa: PLR0912,PLR0915,C901
dtype: IntoDType, version: Version, deferred_time_zone: DeferredTimeZone
) -> str:
dtypes = version.dtypes
if isinstance_or_issubclass(dtype, dtypes.Decimal):
msg = "Casting to Decimal is not supported yet."
raise NotImplementedError(msg)
if isinstance_or_issubclass(dtype, dtypes.Float64):
return "DOUBLE"
if isinstance_or_issubclass(dtype, dtypes.Float32):
return "FLOAT"
if isinstance_or_issubclass(dtype, dtypes.Int128):
return "INT128"
if isinstance_or_issubclass(dtype, dtypes.Int64):
return "BIGINT"
if isinstance_or_issubclass(dtype, dtypes.Int32):
return "INTEGER"
if isinstance_or_issubclass(dtype, dtypes.Int16):
return "SMALLINT"
if isinstance_or_issubclass(dtype, dtypes.Int8):
return "TINYINT"
if isinstance_or_issubclass(dtype, dtypes.UInt128):
return "UINT128"
if isinstance_or_issubclass(dtype, dtypes.UInt64):
return "UBIGINT"
if isinstance_or_issubclass(dtype, dtypes.UInt32):
return "UINTEGER"
if isinstance_or_issubclass(dtype, dtypes.UInt16): # pragma: no cover
return "USMALLINT"
if isinstance_or_issubclass(dtype, dtypes.UInt8): # pragma: no cover
return "UTINYINT"
if isinstance_or_issubclass(dtype, dtypes.String):
return "VARCHAR"
if isinstance_or_issubclass(dtype, dtypes.Boolean): # pragma: no cover
return "BOOLEAN"
if isinstance_or_issubclass(dtype, dtypes.Time):
return "TIME"
if isinstance_or_issubclass(dtype, dtypes.Binary):
return "BLOB"
if isinstance_or_issubclass(dtype, dtypes.Categorical):
msg = "Categorical not supported by DuckDB"
raise NotImplementedError(msg)
if isinstance_or_issubclass(dtype, dtypes.Enum):
if version is Version.V1:
msg = "Converting to Enum is not supported in narwhals.stable.v1"
raise NotImplementedError(msg)
if isinstance(dtype, dtypes.Enum):
categories = "'" + "', '".join(dtype.categories) + "'"
return f"ENUM ({categories})"
msg = "Can not cast / initialize Enum without categories present"
raise ValueError(msg)
if isinstance_or_issubclass(dtype, dtypes.Datetime):
tu = dtype.time_unit
tz = dtype.time_zone
if not tz:
return UNIT_TO_TIMESTAMPS[tu]
if tu != "us":
msg = f"Only microsecond precision is supported for timezone-aware `Datetime` in DuckDB, got {tu} precision"
raise ValueError(msg)
if tz != (rel_tz := deferred_time_zone.time_zone): # pragma: no cover
msg = f"Only the connection time zone {rel_tz} is supported, got: {tz}."
raise ValueError(msg)
# TODO(unassigned): cover once https://github.com/narwhals-dev/narwhals/issues/2742 addressed
return "TIMESTAMPTZ" # pragma: no cover
if isinstance_or_issubclass(dtype, dtypes.Duration):
if (tu := dtype.time_unit) != "us": # pragma: no cover
msg = f"Only microsecond-precision Duration is supported, got {tu} precision"
return "INTERVAL"
if isinstance_or_issubclass(dtype, dtypes.Date):
return "DATE"
if isinstance_or_issubclass(dtype, dtypes.List):
inner = narwhals_to_native_dtype(dtype.inner, version, deferred_time_zone)
return f"{inner}[]"
if isinstance_or_issubclass(dtype, dtypes.Struct):
inner = ", ".join(
f'"{field.name}" {narwhals_to_native_dtype(field.dtype, version, deferred_time_zone)}'
for field in dtype.fields
)
return f"STRUCT({inner})"
if isinstance_or_issubclass(dtype, dtypes.Array):
shape = dtype.shape
duckdb_shape_fmt = "".join(f"[{item}]" for item in shape)
inner_dtype: Any = dtype
for _ in shape:
inner_dtype = inner_dtype.inner
duckdb_inner = narwhals_to_native_dtype(inner_dtype, version, deferred_time_zone)
return f"{duckdb_inner}{duckdb_shape_fmt}"
msg = f"Unknown dtype: {dtype}" # pragma: no cover
raise AssertionError(msg)
def parse_into_expression(into_expression: str | Expression) -> Expression:
return col(into_expression) if isinstance(into_expression, str) else into_expression
def generate_partition_by_sql(*partition_by: str | Expression) -> str:
if not partition_by:
return ""
by_sql = ", ".join([f"{parse_into_expression(x)}" for x in partition_by])
return f"partition by {by_sql}"
def generate_order_by_sql(
*order_by: str | Expression, descending: Sequence[bool], nulls_last: Sequence[bool]
) -> str:
if not order_by:
return ""
by_sql = ",".join(
f"{parse_into_expression(x)} {DESCENDING_TO_ORDER[_descending]} {NULLS_LAST_TO_NULLS_POS[_nulls_last]}"
for x, _descending, _nulls_last in zip(order_by, descending, nulls_last)
)
return f"order by {by_sql}"
def window_expression(
expr: Expression,
partition_by: Sequence[str | Expression] = (),
order_by: Sequence[str | Expression] = (),
rows_start: str = "",
rows_end: str = "",
*,
descending: Sequence[bool] | None = None,
nulls_last: Sequence[bool] | None = None,
ignore_nulls: bool = False,
) -> Expression:
# TODO(unassigned): Replace with `duckdb.WindowExpression` when they release it.
# https://github.com/duckdb/duckdb/discussions/14725#discussioncomment-11200348
try:
from duckdb import SQLExpression
except ModuleNotFoundError as exc: # pragma: no cover
msg = f"DuckDB>=1.3.0 is required for this operation. Found: DuckDB {duckdb.__version__}"
raise NotImplementedError(msg) from exc
pb = generate_partition_by_sql(*partition_by)
descending = descending or [False] * len(order_by)
nulls_last = nulls_last or [False] * len(order_by)
ob = generate_order_by_sql(*order_by, descending=descending, nulls_last=nulls_last)
if rows_start and rows_end:
rows = f"rows between {rows_start} and {rows_end}"
elif rows_start or rows_end: # pragma: no cover
msg = "Either both `rows_start` and `rows_end` must be specified, or neither."
else:
rows = ""
func = f"{str(expr).removesuffix(')')} ignore nulls)" if ignore_nulls else str(expr)
return SQLExpression(f"{func} over ({pb} {ob} {rows})")
def catch_duckdb_exception(
exception: Exception, frame: CompliantLazyFrameAny, /
) -> ColumnNotFoundError | Exception:
if isinstance(exception, duckdb.BinderException) and any(
msg in str(exception)
for msg in (
"not found in FROM clause",
"this column cannot be referenced before it is defined",
)
):
return ColumnNotFoundError.from_available_column_names(
available_columns=frame.columns
)
# Just return exception as-is.
return exception