Mise à jour de Monitor.py et autres scripts
This commit is contained in:
505
myenv/lib/python3.11/site-packages/narwhals/_duckdb/dataframe.py
Normal file
505
myenv/lib/python3.11/site-packages/narwhals/_duckdb/dataframe.py
Normal file
@@ -0,0 +1,505 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from functools import reduce
|
||||
from operator import and_
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import duckdb
|
||||
from duckdb import StarExpression
|
||||
|
||||
from narwhals._duckdb.utils import (
|
||||
DeferredTimeZone,
|
||||
F,
|
||||
catch_duckdb_exception,
|
||||
col,
|
||||
evaluate_exprs,
|
||||
lit,
|
||||
native_to_narwhals_dtype,
|
||||
window_expression,
|
||||
)
|
||||
from narwhals._utils import (
|
||||
Implementation,
|
||||
ValidateBackendVersion,
|
||||
Version,
|
||||
generate_temporary_column_name,
|
||||
not_implemented,
|
||||
parse_columns_to_drop,
|
||||
requires,
|
||||
)
|
||||
from narwhals.dependencies import get_duckdb
|
||||
from narwhals.exceptions import InvalidOperationError
|
||||
from narwhals.typing import CompliantLazyFrame
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterator, Mapping, Sequence
|
||||
from types import ModuleType
|
||||
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
from duckdb import Expression
|
||||
from duckdb.typing import DuckDBPyType
|
||||
from typing_extensions import Self, TypeIs
|
||||
|
||||
from narwhals._compliant.typing import CompliantDataFrameAny
|
||||
from narwhals._duckdb.expr import DuckDBExpr
|
||||
from narwhals._duckdb.group_by import DuckDBGroupBy
|
||||
from narwhals._duckdb.namespace import DuckDBNamespace
|
||||
from narwhals._duckdb.series import DuckDBInterchangeSeries
|
||||
from narwhals._utils import _LimitedContext
|
||||
from narwhals.dataframe import LazyFrame
|
||||
from narwhals.dtypes import DType
|
||||
from narwhals.stable.v1 import DataFrame as DataFrameV1
|
||||
from narwhals.typing import AsofJoinStrategy, JoinStrategy, LazyUniqueKeepStrategy
|
||||
|
||||
|
||||
class DuckDBLazyFrame(
|
||||
CompliantLazyFrame[
|
||||
"DuckDBExpr",
|
||||
"duckdb.DuckDBPyRelation",
|
||||
"LazyFrame[duckdb.DuckDBPyRelation] | DataFrameV1[duckdb.DuckDBPyRelation]",
|
||||
],
|
||||
ValidateBackendVersion,
|
||||
):
|
||||
_implementation = Implementation.DUCKDB
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
df: duckdb.DuckDBPyRelation,
|
||||
*,
|
||||
version: Version,
|
||||
validate_backend_version: bool = False,
|
||||
) -> None:
|
||||
self._native_frame: duckdb.DuckDBPyRelation = df
|
||||
self._version = version
|
||||
self._cached_native_schema: dict[str, DuckDBPyType] | None = None
|
||||
self._cached_columns: list[str] | None = None
|
||||
if validate_backend_version:
|
||||
self._validate_backend_version()
|
||||
|
||||
@property
|
||||
def _backend_version(self) -> tuple[int, ...]:
|
||||
return self._implementation._backend_version()
|
||||
|
||||
@staticmethod
|
||||
def _is_native(obj: duckdb.DuckDBPyRelation | Any) -> TypeIs[duckdb.DuckDBPyRelation]:
|
||||
return isinstance(obj, duckdb.DuckDBPyRelation)
|
||||
|
||||
@classmethod
|
||||
def from_native(
|
||||
cls, data: duckdb.DuckDBPyRelation, /, *, context: _LimitedContext
|
||||
) -> Self:
|
||||
return cls(data, version=context._version)
|
||||
|
||||
def to_narwhals(
|
||||
self, *args: Any, **kwds: Any
|
||||
) -> LazyFrame[duckdb.DuckDBPyRelation] | DataFrameV1[duckdb.DuckDBPyRelation]:
|
||||
if self._version is Version.V1:
|
||||
from narwhals.stable.v1 import DataFrame as DataFrameV1
|
||||
|
||||
return DataFrameV1(self, level="interchange") # type: ignore[no-any-return]
|
||||
return self._version.lazyframe(self, level="lazy")
|
||||
|
||||
def __narwhals_dataframe__(self) -> Self: # pragma: no cover
|
||||
# Keep around for backcompat.
|
||||
if self._version is not Version.V1:
|
||||
msg = "__narwhals_dataframe__ is not implemented for DuckDBLazyFrame"
|
||||
raise AttributeError(msg)
|
||||
return self
|
||||
|
||||
def __narwhals_lazyframe__(self) -> Self:
|
||||
return self
|
||||
|
||||
def __native_namespace__(self) -> ModuleType:
|
||||
return get_duckdb() # type: ignore[no-any-return]
|
||||
|
||||
def __narwhals_namespace__(self) -> DuckDBNamespace:
|
||||
from narwhals._duckdb.namespace import DuckDBNamespace
|
||||
|
||||
return DuckDBNamespace(version=self._version)
|
||||
|
||||
def get_column(self, name: str) -> DuckDBInterchangeSeries:
|
||||
from narwhals._duckdb.series import DuckDBInterchangeSeries
|
||||
|
||||
return DuckDBInterchangeSeries(self.native.select(name), version=self._version)
|
||||
|
||||
def _iter_columns(self) -> Iterator[Expression]:
|
||||
for name in self.columns:
|
||||
yield col(name)
|
||||
|
||||
def collect(
|
||||
self, backend: ModuleType | Implementation | str | None, **kwargs: Any
|
||||
) -> CompliantDataFrameAny:
|
||||
if backend is None or backend is Implementation.PYARROW:
|
||||
from narwhals._arrow.dataframe import ArrowDataFrame
|
||||
|
||||
return ArrowDataFrame(
|
||||
self.native.arrow(),
|
||||
validate_backend_version=True,
|
||||
version=self._version,
|
||||
validate_column_names=True,
|
||||
)
|
||||
|
||||
if backend is Implementation.PANDAS:
|
||||
from narwhals._pandas_like.dataframe import PandasLikeDataFrame
|
||||
|
||||
return PandasLikeDataFrame(
|
||||
self.native.df(),
|
||||
implementation=Implementation.PANDAS,
|
||||
validate_backend_version=True,
|
||||
version=self._version,
|
||||
validate_column_names=True,
|
||||
)
|
||||
|
||||
if backend is Implementation.POLARS:
|
||||
from narwhals._polars.dataframe import PolarsDataFrame
|
||||
|
||||
return PolarsDataFrame(
|
||||
self.native.pl(), validate_backend_version=True, version=self._version
|
||||
)
|
||||
|
||||
msg = f"Unsupported `backend` value: {backend}" # pragma: no cover
|
||||
raise ValueError(msg) # pragma: no cover
|
||||
|
||||
def head(self, n: int) -> Self:
|
||||
return self._with_native(self.native.limit(n))
|
||||
|
||||
def simple_select(self, *column_names: str) -> Self:
|
||||
return self._with_native(self.native.select(*column_names))
|
||||
|
||||
def aggregate(self, *exprs: DuckDBExpr) -> Self:
|
||||
selection = [val.alias(name) for name, val in evaluate_exprs(self, *exprs)]
|
||||
try:
|
||||
return self._with_native(self.native.aggregate(selection)) # type: ignore[arg-type]
|
||||
except Exception as e: # noqa: BLE001
|
||||
raise catch_duckdb_exception(e, self) from None
|
||||
|
||||
def select(self, *exprs: DuckDBExpr) -> Self:
|
||||
selection = (val.alias(name) for name, val in evaluate_exprs(self, *exprs))
|
||||
try:
|
||||
return self._with_native(self.native.select(*selection))
|
||||
except Exception as e: # noqa: BLE001
|
||||
raise catch_duckdb_exception(e, self) from None
|
||||
|
||||
def drop(self, columns: Sequence[str], *, strict: bool) -> Self:
|
||||
columns_to_drop = parse_columns_to_drop(self, columns, strict=strict)
|
||||
selection = (name for name in self.columns if name not in columns_to_drop)
|
||||
return self._with_native(self.native.select(*selection))
|
||||
|
||||
def lazy(self, *, backend: Implementation | None = None) -> Self:
|
||||
# The `backend`` argument has no effect but we keep it here for
|
||||
# backwards compatibility because in `narwhals.stable.v1`
|
||||
# function `.from_native()` will return a DataFrame for DuckDB.
|
||||
|
||||
if backend is not None: # pragma: no cover
|
||||
msg = "`backend` argument is not supported for DuckDB"
|
||||
raise ValueError(msg)
|
||||
return self
|
||||
|
||||
def with_columns(self, *exprs: DuckDBExpr) -> Self:
|
||||
new_columns_map = dict(evaluate_exprs(self, *exprs))
|
||||
result = [
|
||||
new_columns_map.pop(name).alias(name)
|
||||
if name in new_columns_map
|
||||
else col(name)
|
||||
for name in self.columns
|
||||
]
|
||||
result.extend(value.alias(name) for name, value in new_columns_map.items())
|
||||
try:
|
||||
return self._with_native(self.native.select(*result))
|
||||
except Exception as e: # noqa: BLE001
|
||||
raise catch_duckdb_exception(e, self) from None
|
||||
|
||||
def filter(self, predicate: DuckDBExpr) -> Self:
|
||||
# `[0]` is safe as the predicate's expression only returns a single column
|
||||
mask = predicate(self)[0]
|
||||
try:
|
||||
return self._with_native(self.native.filter(mask))
|
||||
except Exception as e: # noqa: BLE001
|
||||
raise catch_duckdb_exception(e, self) from None
|
||||
|
||||
@property
|
||||
def schema(self) -> dict[str, DType]:
|
||||
if self._cached_native_schema is None:
|
||||
# Note: prefer `self._cached_native_schema` over `functools.cached_property`
|
||||
# due to Python3.13 failures.
|
||||
self._cached_native_schema = dict(zip(self.columns, self.native.types))
|
||||
|
||||
deferred_time_zone = DeferredTimeZone(self.native)
|
||||
return {
|
||||
column_name: native_to_narwhals_dtype(
|
||||
duckdb_dtype, self._version, deferred_time_zone
|
||||
)
|
||||
for column_name, duckdb_dtype in zip(self.native.columns, self.native.types)
|
||||
}
|
||||
|
||||
@property
|
||||
def columns(self) -> list[str]:
|
||||
if self._cached_columns is None:
|
||||
self._cached_columns = (
|
||||
list(self.schema)
|
||||
if self._cached_native_schema is not None
|
||||
else self.native.columns
|
||||
)
|
||||
return self._cached_columns
|
||||
|
||||
def to_pandas(self) -> pd.DataFrame:
|
||||
# only if version is v1, keep around for backcompat
|
||||
return self.native.df()
|
||||
|
||||
def to_arrow(self) -> pa.Table:
|
||||
# only if version is v1, keep around for backcompat
|
||||
return self.native.arrow()
|
||||
|
||||
def _with_version(self, version: Version) -> Self:
|
||||
return self.__class__(self.native, version=version)
|
||||
|
||||
def _with_native(self, df: duckdb.DuckDBPyRelation) -> Self:
|
||||
return self.__class__(df, version=self._version)
|
||||
|
||||
def group_by(
|
||||
self, keys: Sequence[str] | Sequence[DuckDBExpr], *, drop_null_keys: bool
|
||||
) -> DuckDBGroupBy:
|
||||
from narwhals._duckdb.group_by import DuckDBGroupBy
|
||||
|
||||
return DuckDBGroupBy(self, keys, drop_null_keys=drop_null_keys)
|
||||
|
||||
def rename(self, mapping: Mapping[str, str]) -> Self:
|
||||
df = self.native
|
||||
selection = (
|
||||
col(name).alias(mapping[name]) if name in mapping else col(name)
|
||||
for name in df.columns
|
||||
)
|
||||
return self._with_native(self.native.select(*selection))
|
||||
|
||||
def join(
|
||||
self,
|
||||
other: Self,
|
||||
*,
|
||||
how: JoinStrategy,
|
||||
left_on: Sequence[str] | None,
|
||||
right_on: Sequence[str] | None,
|
||||
suffix: str,
|
||||
) -> Self:
|
||||
native_how = "outer" if how == "full" else how
|
||||
|
||||
if native_how == "cross":
|
||||
if self._backend_version < (1, 1, 4):
|
||||
msg = f"'duckdb>=1.1.4' is required for cross-join, found version: {self._backend_version}"
|
||||
raise NotImplementedError(msg)
|
||||
rel = self.native.set_alias("lhs").cross(other.native.set_alias("rhs"))
|
||||
else:
|
||||
# help mypy
|
||||
assert left_on is not None # noqa: S101
|
||||
assert right_on is not None # noqa: S101
|
||||
it = (
|
||||
col(f'lhs."{left}"') == col(f'rhs."{right}"')
|
||||
for left, right in zip(left_on, right_on)
|
||||
)
|
||||
condition: Expression = reduce(and_, it)
|
||||
rel = self.native.set_alias("lhs").join(
|
||||
other.native.set_alias("rhs"),
|
||||
# NOTE: Fixed in `--pre` https://github.com/duckdb/duckdb/pull/16933
|
||||
condition=condition, # type: ignore[arg-type, unused-ignore]
|
||||
how=native_how,
|
||||
)
|
||||
|
||||
if native_how in {"inner", "left", "cross", "outer"}:
|
||||
select = [col(f'lhs."{x}"') for x in self.columns]
|
||||
for name in other.columns:
|
||||
col_in_lhs: bool = name in self.columns
|
||||
if native_how == "outer" and not col_in_lhs:
|
||||
select.append(col(f'rhs."{name}"'))
|
||||
elif (native_how == "outer") or (
|
||||
col_in_lhs and (right_on is None or name not in right_on)
|
||||
):
|
||||
select.append(col(f'rhs."{name}"').alias(f"{name}{suffix}"))
|
||||
elif right_on is None or name not in right_on:
|
||||
select.append(col(name))
|
||||
res = rel.select(*select).set_alias(self.native.alias)
|
||||
else: # semi, anti
|
||||
res = rel.select("lhs.*").set_alias(self.native.alias)
|
||||
|
||||
return self._with_native(res)
|
||||
|
||||
def join_asof(
|
||||
self,
|
||||
other: Self,
|
||||
*,
|
||||
left_on: str,
|
||||
right_on: str,
|
||||
by_left: Sequence[str] | None,
|
||||
by_right: Sequence[str] | None,
|
||||
strategy: AsofJoinStrategy,
|
||||
suffix: str,
|
||||
) -> Self:
|
||||
lhs = self.native
|
||||
rhs = other.native
|
||||
conditions: list[Expression] = []
|
||||
if by_left is not None and by_right is not None:
|
||||
conditions.extend(
|
||||
col(f'lhs."{left}"') == col(f'rhs."{right}"')
|
||||
for left, right in zip(by_left, by_right)
|
||||
)
|
||||
else:
|
||||
by_left = by_right = []
|
||||
if strategy == "backward":
|
||||
conditions.append(col(f'lhs."{left_on}"') >= col(f'rhs."{right_on}"'))
|
||||
elif strategy == "forward":
|
||||
conditions.append(col(f'lhs."{left_on}"') <= col(f'rhs."{right_on}"'))
|
||||
else:
|
||||
msg = "Only 'backward' and 'forward' strategies are currently supported for DuckDB"
|
||||
raise NotImplementedError(msg)
|
||||
condition: Expression = reduce(and_, conditions)
|
||||
select = ["lhs.*"]
|
||||
for name in rhs.columns:
|
||||
if name in lhs.columns and (
|
||||
right_on is None or name not in {right_on, *by_right}
|
||||
):
|
||||
select.append(f'rhs."{name}" as "{name}{suffix}"')
|
||||
elif right_on is None or name not in {right_on, *by_right}:
|
||||
select.append(str(col(name)))
|
||||
# Replace with Python API call once
|
||||
# https://github.com/duckdb/duckdb/discussions/16947 is addressed.
|
||||
query = f"""
|
||||
SELECT {",".join(select)}
|
||||
FROM lhs
|
||||
ASOF LEFT JOIN rhs
|
||||
ON {condition}
|
||||
""" # noqa: S608
|
||||
return self._with_native(duckdb.sql(query))
|
||||
|
||||
def collect_schema(self) -> dict[str, DType]:
|
||||
return self.schema
|
||||
|
||||
def unique(
|
||||
self, subset: Sequence[str] | None, *, keep: LazyUniqueKeepStrategy
|
||||
) -> Self:
|
||||
if subset_ := subset if keep == "any" else (subset or self.columns):
|
||||
# Sanitise input
|
||||
if error := self._check_columns_exist(subset_):
|
||||
raise error
|
||||
idx_name = generate_temporary_column_name(8, self.columns)
|
||||
count_name = generate_temporary_column_name(8, [*self.columns, idx_name])
|
||||
name = count_name if keep == "none" else idx_name
|
||||
idx_expr = window_expression(F("row_number"), subset_).alias(idx_name)
|
||||
count_expr = window_expression(
|
||||
F("count", StarExpression()), subset_, ()
|
||||
).alias(count_name)
|
||||
return self._with_native(
|
||||
self.native.select(StarExpression(), idx_expr, count_expr)
|
||||
.filter(col(name) == lit(1))
|
||||
.select(StarExpression(exclude=[count_name, idx_name]))
|
||||
)
|
||||
return self._with_native(self.native.unique(", ".join(self.columns)))
|
||||
|
||||
def sort(self, *by: str, descending: bool | Sequence[bool], nulls_last: bool) -> Self:
|
||||
if isinstance(descending, bool):
|
||||
descending = [descending] * len(by)
|
||||
if nulls_last:
|
||||
it = (
|
||||
col(name).nulls_last() if not desc else col(name).desc().nulls_last()
|
||||
for name, desc in zip(by, descending)
|
||||
)
|
||||
else:
|
||||
it = (
|
||||
col(name).nulls_first() if not desc else col(name).desc().nulls_first()
|
||||
for name, desc in zip(by, descending)
|
||||
)
|
||||
return self._with_native(self.native.sort(*it))
|
||||
|
||||
def drop_nulls(self, subset: Sequence[str] | None) -> Self:
|
||||
subset_ = subset if subset is not None else self.columns
|
||||
keep_condition = reduce(and_, (col(name).isnotnull() for name in subset_))
|
||||
return self._with_native(self.native.filter(keep_condition))
|
||||
|
||||
def explode(self, columns: Sequence[str]) -> Self:
|
||||
dtypes = self._version.dtypes
|
||||
schema = self.collect_schema()
|
||||
for name in columns:
|
||||
dtype = schema[name]
|
||||
if dtype != dtypes.List:
|
||||
msg = (
|
||||
f"`explode` operation not supported for dtype `{dtype}`, "
|
||||
"expected List type"
|
||||
)
|
||||
raise InvalidOperationError(msg)
|
||||
|
||||
if len(columns) != 1:
|
||||
msg = (
|
||||
"Exploding on multiple columns is not supported with DuckDB backend since "
|
||||
"we cannot guarantee that the exploded columns have matching element counts."
|
||||
)
|
||||
raise NotImplementedError(msg)
|
||||
|
||||
col_to_explode = col(columns[0])
|
||||
rel = self.native
|
||||
original_columns = self.columns
|
||||
|
||||
not_null_condition = col_to_explode.isnotnull() & F("len", col_to_explode) > lit(
|
||||
0
|
||||
)
|
||||
non_null_rel = rel.filter(not_null_condition).select(
|
||||
*(
|
||||
F("unnest", col_to_explode).alias(name) if name in columns else name
|
||||
for name in original_columns
|
||||
)
|
||||
)
|
||||
|
||||
null_rel = rel.filter(~not_null_condition).select(
|
||||
*(
|
||||
lit(None).alias(name) if name in columns else name
|
||||
for name in original_columns
|
||||
)
|
||||
)
|
||||
|
||||
return self._with_native(non_null_rel.union(null_rel))
|
||||
|
||||
def unpivot(
|
||||
self,
|
||||
on: Sequence[str] | None,
|
||||
index: Sequence[str] | None,
|
||||
variable_name: str,
|
||||
value_name: str,
|
||||
) -> Self:
|
||||
index_ = [] if index is None else index
|
||||
on_ = [c for c in self.columns if c not in index_] if on is None else on
|
||||
|
||||
if variable_name == "":
|
||||
msg = "`variable_name` cannot be empty string for duckdb backend."
|
||||
raise NotImplementedError(msg)
|
||||
|
||||
if value_name == "":
|
||||
msg = "`value_name` cannot be empty string for duckdb backend."
|
||||
raise NotImplementedError(msg)
|
||||
|
||||
unpivot_on = ", ".join(str(col(name)) for name in on_)
|
||||
rel = self.native # noqa: F841
|
||||
# Replace with Python API once
|
||||
# https://github.com/duckdb/duckdb/discussions/16980 is addressed.
|
||||
query = f"""
|
||||
unpivot rel
|
||||
on {unpivot_on}
|
||||
into
|
||||
name "{variable_name}"
|
||||
value "{value_name}"
|
||||
"""
|
||||
return self._with_native(
|
||||
duckdb.sql(query).select(*[*index_, variable_name, value_name])
|
||||
)
|
||||
|
||||
@requires.backend_version((1, 3))
|
||||
def with_row_index(self, name: str, order_by: Sequence[str]) -> Self:
|
||||
if order_by is None:
|
||||
msg = "Cannot pass `order_by` to `with_row_index` for DuckDB"
|
||||
raise TypeError(msg)
|
||||
expr = (window_expression(F("row_number"), order_by=order_by) - lit(1)).alias(
|
||||
name
|
||||
)
|
||||
return self._with_native(self.native.select(expr, StarExpression()))
|
||||
|
||||
gather_every = not_implemented.deprecated(
|
||||
"`LazyFrame.gather_every` is deprecated and will be removed in a future version."
|
||||
)
|
||||
tail = not_implemented.deprecated(
|
||||
"`LazyFrame.tail` is deprecated and will be removed in a future version."
|
||||
)
|
||||
827
myenv/lib/python3.11/site-packages/narwhals/_duckdb/expr.py
Normal file
827
myenv/lib/python3.11/site-packages/narwhals/_duckdb/expr.py
Normal file
@@ -0,0 +1,827 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import operator
|
||||
from typing import TYPE_CHECKING, Any, Callable, Literal, cast
|
||||
|
||||
from duckdb import CoalesceOperator, StarExpression
|
||||
from duckdb.typing import DuckDBPyType
|
||||
|
||||
from narwhals._compliant import LazyExpr, WindowInputs
|
||||
from narwhals._duckdb.expr_dt import DuckDBExprDateTimeNamespace
|
||||
from narwhals._duckdb.expr_list import DuckDBExprListNamespace
|
||||
from narwhals._duckdb.expr_str import DuckDBExprStringNamespace
|
||||
from narwhals._duckdb.expr_struct import DuckDBExprStructNamespace
|
||||
from narwhals._duckdb.utils import (
|
||||
DeferredTimeZone,
|
||||
F,
|
||||
col,
|
||||
lit,
|
||||
narwhals_to_native_dtype,
|
||||
when,
|
||||
window_expression,
|
||||
)
|
||||
from narwhals._expression_parsing import (
|
||||
ExprKind,
|
||||
combine_alias_output_names,
|
||||
combine_evaluate_output_names,
|
||||
)
|
||||
from narwhals._utils import Implementation, not_implemented, requires
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterable, Sequence
|
||||
|
||||
from duckdb import Expression
|
||||
from typing_extensions import Self
|
||||
|
||||
from narwhals._compliant.typing import (
|
||||
AliasNames,
|
||||
EvalNames,
|
||||
EvalSeries,
|
||||
WindowFunction,
|
||||
)
|
||||
from narwhals._duckdb.dataframe import DuckDBLazyFrame
|
||||
from narwhals._duckdb.namespace import DuckDBNamespace
|
||||
from narwhals._duckdb.typing import WindowExpressionKwargs
|
||||
from narwhals._expression_parsing import ExprMetadata
|
||||
from narwhals._utils import Version, _LimitedContext
|
||||
from narwhals.typing import (
|
||||
FillNullStrategy,
|
||||
IntoDType,
|
||||
NonNestedLiteral,
|
||||
NumericLiteral,
|
||||
RankMethod,
|
||||
RollingInterpolationMethod,
|
||||
TemporalLiteral,
|
||||
)
|
||||
|
||||
DuckDBWindowFunction = WindowFunction[DuckDBLazyFrame, Expression]
|
||||
DuckDBWindowInputs = WindowInputs[Expression]
|
||||
|
||||
|
||||
class DuckDBExpr(LazyExpr["DuckDBLazyFrame", "Expression"]):
|
||||
_implementation = Implementation.DUCKDB
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
call: EvalSeries[DuckDBLazyFrame, Expression],
|
||||
window_function: DuckDBWindowFunction | None = None,
|
||||
*,
|
||||
evaluate_output_names: EvalNames[DuckDBLazyFrame],
|
||||
alias_output_names: AliasNames | None,
|
||||
version: Version,
|
||||
) -> None:
|
||||
self._call = call
|
||||
self._evaluate_output_names = evaluate_output_names
|
||||
self._alias_output_names = alias_output_names
|
||||
self._version = version
|
||||
self._metadata: ExprMetadata | None = None
|
||||
self._window_function: DuckDBWindowFunction | None = window_function
|
||||
|
||||
@property
|
||||
def window_function(self) -> DuckDBWindowFunction:
|
||||
def default_window_func(
|
||||
df: DuckDBLazyFrame, inputs: DuckDBWindowInputs
|
||||
) -> list[Expression]:
|
||||
assert not inputs.order_by # noqa: S101
|
||||
return [
|
||||
window_expression(expr, inputs.partition_by, inputs.order_by)
|
||||
for expr in self(df)
|
||||
]
|
||||
|
||||
return self._window_function or default_window_func
|
||||
|
||||
def __call__(self, df: DuckDBLazyFrame) -> Sequence[Expression]:
|
||||
return self._call(df)
|
||||
|
||||
def __narwhals_expr__(self) -> None: ...
|
||||
|
||||
def __narwhals_namespace__(self) -> DuckDBNamespace: # pragma: no cover
|
||||
from narwhals._duckdb.namespace import DuckDBNamespace
|
||||
|
||||
return DuckDBNamespace(version=self._version)
|
||||
|
||||
def _cum_window_func(
|
||||
self,
|
||||
func_name: Literal["sum", "max", "min", "count", "product"],
|
||||
*,
|
||||
reverse: bool,
|
||||
) -> DuckDBWindowFunction:
|
||||
def func(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> list[Expression]:
|
||||
return [
|
||||
window_expression(
|
||||
F(func_name, expr),
|
||||
inputs.partition_by,
|
||||
inputs.order_by,
|
||||
descending=[reverse] * len(inputs.order_by),
|
||||
nulls_last=[reverse] * len(inputs.order_by),
|
||||
rows_start="unbounded preceding",
|
||||
rows_end="current row",
|
||||
)
|
||||
for expr in self(df)
|
||||
]
|
||||
|
||||
return func
|
||||
|
||||
def _rolling_window_func(
|
||||
self,
|
||||
func_name: Literal["sum", "mean", "std", "var"],
|
||||
window_size: int,
|
||||
min_samples: int,
|
||||
ddof: int | None = None,
|
||||
*,
|
||||
center: bool,
|
||||
) -> DuckDBWindowFunction:
|
||||
supported_funcs = ["sum", "mean", "std", "var"]
|
||||
if center:
|
||||
half = (window_size - 1) // 2
|
||||
remainder = (window_size - 1) % 2
|
||||
start = f"{half + remainder} preceding"
|
||||
end = f"{half} following"
|
||||
else:
|
||||
start = f"{window_size - 1} preceding"
|
||||
end = "current row"
|
||||
|
||||
def func(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> list[Expression]:
|
||||
if func_name in {"sum", "mean"}:
|
||||
func_: str = func_name
|
||||
elif func_name == "var" and ddof == 0:
|
||||
func_ = "var_pop"
|
||||
elif func_name in "var" and ddof == 1:
|
||||
func_ = "var_samp"
|
||||
elif func_name == "std" and ddof == 0:
|
||||
func_ = "stddev_pop"
|
||||
elif func_name == "std" and ddof == 1:
|
||||
func_ = "stddev_samp"
|
||||
elif func_name in {"var", "std"}: # pragma: no cover
|
||||
msg = f"Only ddof=0 and ddof=1 are currently supported for rolling_{func_name}."
|
||||
raise ValueError(msg)
|
||||
else: # pragma: no cover
|
||||
msg = f"Only the following functions are supported: {supported_funcs}.\nGot: {func_name}."
|
||||
raise ValueError(msg)
|
||||
window_kwargs: WindowExpressionKwargs = {
|
||||
"partition_by": inputs.partition_by,
|
||||
"order_by": inputs.order_by,
|
||||
"rows_start": start,
|
||||
"rows_end": end,
|
||||
}
|
||||
return [
|
||||
when(
|
||||
window_expression(F("count", expr), **window_kwargs)
|
||||
>= lit(min_samples),
|
||||
window_expression(F(func_, expr), **window_kwargs),
|
||||
)
|
||||
for expr in self(df)
|
||||
]
|
||||
|
||||
return func
|
||||
|
||||
def broadcast(self, kind: Literal[ExprKind.AGGREGATION, ExprKind.LITERAL]) -> Self:
|
||||
if kind is ExprKind.LITERAL:
|
||||
return self
|
||||
if self._backend_version < (1, 3):
|
||||
msg = "At least version 1.3 of DuckDB is required for binary operations between aggregates and columns."
|
||||
raise NotImplementedError(msg)
|
||||
return self.over([lit(1)], [])
|
||||
|
||||
@classmethod
|
||||
def from_column_names(
|
||||
cls,
|
||||
evaluate_column_names: EvalNames[DuckDBLazyFrame],
|
||||
/,
|
||||
*,
|
||||
context: _LimitedContext,
|
||||
) -> Self:
|
||||
def func(df: DuckDBLazyFrame) -> list[Expression]:
|
||||
return [col(name) for name in evaluate_column_names(df)]
|
||||
|
||||
return cls(
|
||||
func,
|
||||
evaluate_output_names=evaluate_column_names,
|
||||
alias_output_names=None,
|
||||
version=context._version,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_column_indices(cls, *column_indices: int, context: _LimitedContext) -> Self:
|
||||
def func(df: DuckDBLazyFrame) -> list[Expression]:
|
||||
columns = df.columns
|
||||
return [col(columns[i]) for i in column_indices]
|
||||
|
||||
return cls(
|
||||
func,
|
||||
evaluate_output_names=cls._eval_names_indices(column_indices),
|
||||
alias_output_names=None,
|
||||
version=context._version,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _from_elementwise_horizontal_op(
|
||||
cls, func: Callable[[Iterable[Expression]], Expression], *exprs: Self
|
||||
) -> Self:
|
||||
def call(df: DuckDBLazyFrame) -> list[Expression]:
|
||||
cols = (col for _expr in exprs for col in _expr(df))
|
||||
return [func(cols)]
|
||||
|
||||
def window_function(
|
||||
df: DuckDBLazyFrame, window_inputs: DuckDBWindowInputs
|
||||
) -> list[Expression]:
|
||||
cols = (
|
||||
col for _expr in exprs for col in _expr.window_function(df, window_inputs)
|
||||
)
|
||||
return [func(cols)]
|
||||
|
||||
context = exprs[0]
|
||||
return cls(
|
||||
call=call,
|
||||
window_function=window_function,
|
||||
evaluate_output_names=combine_evaluate_output_names(*exprs),
|
||||
alias_output_names=combine_alias_output_names(*exprs),
|
||||
version=context._version,
|
||||
)
|
||||
|
||||
def _callable_to_eval_series(
|
||||
self, call: Callable[..., Expression], /, **expressifiable_args: Self | Any
|
||||
) -> EvalSeries[DuckDBLazyFrame, Expression]:
|
||||
def func(df: DuckDBLazyFrame) -> list[Expression]:
|
||||
native_series_list = self(df)
|
||||
other_native_series = {
|
||||
key: df._evaluate_expr(value) if self._is_expr(value) else lit(value)
|
||||
for key, value in expressifiable_args.items()
|
||||
}
|
||||
return [
|
||||
call(native_series, **other_native_series)
|
||||
for native_series in native_series_list
|
||||
]
|
||||
|
||||
return func
|
||||
|
||||
def _push_down_window_function(
|
||||
self, call: Callable[..., Expression], /, **expressifiable_args: Self | Any
|
||||
) -> DuckDBWindowFunction:
|
||||
def window_f(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> list[Expression]:
|
||||
# If a function `f` is elementwise, and `g` is another function, then
|
||||
# - `f(g) over (window)`
|
||||
# - `f(g over (window))
|
||||
# are equivalent.
|
||||
# Make sure to only use with if `call` is elementwise!
|
||||
native_series_list = self.window_function(df, inputs)
|
||||
other_native_series = {
|
||||
key: df._evaluate_window_expr(value, inputs)
|
||||
if self._is_expr(value)
|
||||
else lit(value)
|
||||
for key, value in expressifiable_args.items()
|
||||
}
|
||||
return [
|
||||
call(native_series, **other_native_series)
|
||||
for native_series in native_series_list
|
||||
]
|
||||
|
||||
return window_f
|
||||
|
||||
def _with_callable(
|
||||
self, call: Callable[..., Expression], /, **expressifiable_args: Self | Any
|
||||
) -> Self:
|
||||
"""Create expression from callable.
|
||||
|
||||
Arguments:
|
||||
call: Callable from compliant DataFrame to native Expression
|
||||
expr_name: Expression name
|
||||
expressifiable_args: arguments pass to expression which should be parsed
|
||||
as expressions (e.g. in `nw.col('a').is_between('b', 'c')`)
|
||||
"""
|
||||
return self.__class__(
|
||||
self._callable_to_eval_series(call, **expressifiable_args),
|
||||
evaluate_output_names=self._evaluate_output_names,
|
||||
alias_output_names=self._alias_output_names,
|
||||
version=self._version,
|
||||
)
|
||||
|
||||
def _with_elementwise(
|
||||
self, call: Callable[..., Expression], /, **expressifiable_args: Self | Any
|
||||
) -> Self:
|
||||
return self.__class__(
|
||||
self._callable_to_eval_series(call, **expressifiable_args),
|
||||
self._push_down_window_function(call, **expressifiable_args),
|
||||
evaluate_output_names=self._evaluate_output_names,
|
||||
alias_output_names=self._alias_output_names,
|
||||
version=self._version,
|
||||
)
|
||||
|
||||
def _with_binary(self, op: Callable[..., Expression], other: Self | Any) -> Self:
|
||||
return self.__class__(
|
||||
self._callable_to_eval_series(op, other=other),
|
||||
self._push_down_window_function(op, other=other),
|
||||
evaluate_output_names=self._evaluate_output_names,
|
||||
alias_output_names=self._alias_output_names,
|
||||
version=self._version,
|
||||
)
|
||||
|
||||
def _with_alias_output_names(self, func: AliasNames | None, /) -> Self:
|
||||
return type(self)(
|
||||
self._call,
|
||||
self._window_function,
|
||||
evaluate_output_names=self._evaluate_output_names,
|
||||
alias_output_names=func,
|
||||
version=self._version,
|
||||
)
|
||||
|
||||
def _with_window_function(self, window_function: DuckDBWindowFunction) -> Self:
|
||||
return self.__class__(
|
||||
self._call,
|
||||
window_function,
|
||||
evaluate_output_names=self._evaluate_output_names,
|
||||
alias_output_names=self._alias_output_names,
|
||||
version=self._version,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _alias_native(cls, expr: Expression, name: str) -> Expression:
|
||||
return expr.alias(name)
|
||||
|
||||
def __invert__(self) -> Self:
|
||||
invert = cast("Callable[..., Expression]", operator.invert)
|
||||
return self._with_elementwise(invert)
|
||||
|
||||
def abs(self) -> Self:
|
||||
return self._with_elementwise(lambda expr: F("abs", expr))
|
||||
|
||||
def mean(self) -> Self:
|
||||
return self._with_callable(lambda expr: F("mean", expr))
|
||||
|
||||
def skew(self) -> Self:
|
||||
def func(expr: Expression) -> Expression:
|
||||
count = F("count", expr)
|
||||
# Adjust population skewness by correction factor to get sample skewness
|
||||
sample_skewness = (
|
||||
F("skewness", expr)
|
||||
* (count - lit(2))
|
||||
/ F("sqrt", count * (count - lit(1)))
|
||||
)
|
||||
return when(count == lit(0), lit(None)).otherwise(
|
||||
when(count == lit(1), lit(float("nan"))).otherwise(
|
||||
when(count == lit(2), lit(0.0)).otherwise(sample_skewness)
|
||||
)
|
||||
)
|
||||
|
||||
return self._with_callable(func)
|
||||
|
||||
def kurtosis(self) -> Self:
|
||||
return self._with_callable(lambda expr: F("kurtosis_pop", expr))
|
||||
|
||||
def median(self) -> Self:
|
||||
return self._with_callable(lambda expr: F("median", expr))
|
||||
|
||||
def all(self) -> Self:
|
||||
def f(expr: Expression) -> Expression:
|
||||
return CoalesceOperator(F("bool_and", expr), lit(True)) # noqa: FBT003
|
||||
|
||||
def window_f(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> list[Expression]:
|
||||
return [
|
||||
CoalesceOperator(
|
||||
window_expression(F("bool_and", expr), inputs.partition_by),
|
||||
lit(True), # noqa: FBT003
|
||||
)
|
||||
for expr in self(df)
|
||||
]
|
||||
|
||||
return self._with_callable(f)._with_window_function(window_f)
|
||||
|
||||
def any(self) -> Self:
|
||||
def f(expr: Expression) -> Expression:
|
||||
return CoalesceOperator(F("bool_or", expr), lit(False)) # noqa: FBT003
|
||||
|
||||
def window_f(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> list[Expression]:
|
||||
return [
|
||||
CoalesceOperator(
|
||||
window_expression(F("bool_or", expr), inputs.partition_by),
|
||||
lit(False), # noqa: FBT003
|
||||
)
|
||||
for expr in self(df)
|
||||
]
|
||||
|
||||
return self._with_callable(f)._with_window_function(window_f)
|
||||
|
||||
def quantile(
|
||||
self, quantile: float, interpolation: RollingInterpolationMethod
|
||||
) -> Self:
|
||||
def func(expr: Expression) -> Expression:
|
||||
if interpolation == "linear":
|
||||
return F("quantile_cont", expr, lit(quantile))
|
||||
msg = "Only linear interpolation methods are supported for DuckDB quantile."
|
||||
raise NotImplementedError(msg)
|
||||
|
||||
return self._with_callable(func)
|
||||
|
||||
def clip(
|
||||
self,
|
||||
lower_bound: Self | NumericLiteral | TemporalLiteral | None,
|
||||
upper_bound: Self | NumericLiteral | TemporalLiteral | None,
|
||||
) -> Self:
|
||||
def _clip_lower(expr: Expression, lower_bound: Any) -> Expression:
|
||||
return F("greatest", expr, lower_bound)
|
||||
|
||||
def _clip_upper(expr: Expression, upper_bound: Any) -> Expression:
|
||||
return F("least", expr, upper_bound)
|
||||
|
||||
def _clip_both(
|
||||
expr: Expression, lower_bound: Any, upper_bound: Any
|
||||
) -> Expression:
|
||||
return F("greatest", F("least", expr, upper_bound), lower_bound)
|
||||
|
||||
if lower_bound is None:
|
||||
return self._with_elementwise(_clip_upper, upper_bound=upper_bound)
|
||||
if upper_bound is None:
|
||||
return self._with_elementwise(_clip_lower, lower_bound=lower_bound)
|
||||
return self._with_elementwise(
|
||||
_clip_both, lower_bound=lower_bound, upper_bound=upper_bound
|
||||
)
|
||||
|
||||
def sum(self) -> Self:
|
||||
def f(expr: Expression) -> Expression:
|
||||
return CoalesceOperator(F("sum", expr), lit(0))
|
||||
|
||||
def window_f(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> list[Expression]:
|
||||
return [
|
||||
CoalesceOperator(
|
||||
window_expression(F("sum", expr), inputs.partition_by), lit(0)
|
||||
)
|
||||
for expr in self(df)
|
||||
]
|
||||
|
||||
return self._with_callable(f)._with_window_function(window_f)
|
||||
|
||||
def n_unique(self) -> Self:
|
||||
def func(expr: Expression) -> Expression:
|
||||
# https://stackoverflow.com/a/79338887/4451315
|
||||
return F("array_unique", F("array_agg", expr)) + F(
|
||||
"max", when(expr.isnotnull(), lit(0)).otherwise(lit(1))
|
||||
)
|
||||
|
||||
return self._with_callable(func)
|
||||
|
||||
def count(self) -> Self:
|
||||
return self._with_callable(lambda expr: F("count", expr))
|
||||
|
||||
def len(self) -> Self:
|
||||
return self._with_callable(lambda _expr: F("count"))
|
||||
|
||||
def std(self, ddof: int) -> Self:
|
||||
if ddof == 0:
|
||||
return self._with_callable(lambda expr: F("stddev_pop", expr))
|
||||
if ddof == 1:
|
||||
return self._with_callable(lambda expr: F("stddev_samp", expr))
|
||||
|
||||
def _std(expr: Expression) -> Expression:
|
||||
n_samples = F("count", expr)
|
||||
return (
|
||||
F("stddev_pop", expr)
|
||||
* F("sqrt", n_samples)
|
||||
/ (F("sqrt", (n_samples - lit(ddof))))
|
||||
)
|
||||
|
||||
return self._with_callable(_std)
|
||||
|
||||
def var(self, ddof: int) -> Self:
|
||||
if ddof == 0:
|
||||
return self._with_callable(lambda expr: F("var_pop", expr))
|
||||
if ddof == 1:
|
||||
return self._with_callable(lambda expr: F("var_samp", expr))
|
||||
|
||||
def _var(expr: Expression) -> Expression:
|
||||
n_samples = F("count", expr)
|
||||
return F("var_pop", expr) * n_samples / (n_samples - lit(ddof))
|
||||
|
||||
return self._with_callable(_var)
|
||||
|
||||
def max(self) -> Self:
|
||||
return self._with_callable(lambda expr: F("max", expr))
|
||||
|
||||
def min(self) -> Self:
|
||||
return self._with_callable(lambda expr: F("min", expr))
|
||||
|
||||
def null_count(self) -> Self:
|
||||
return self._with_callable(lambda expr: F("sum", expr.isnull().cast("int")))
|
||||
|
||||
@requires.backend_version((1, 3))
|
||||
def over(
|
||||
self, partition_by: Sequence[str | Expression], order_by: Sequence[str]
|
||||
) -> Self:
|
||||
def func(df: DuckDBLazyFrame) -> Sequence[Expression]:
|
||||
return self.window_function(df, WindowInputs(partition_by, order_by))
|
||||
|
||||
return self.__class__(
|
||||
func,
|
||||
evaluate_output_names=self._evaluate_output_names,
|
||||
alias_output_names=self._alias_output_names,
|
||||
version=self._version,
|
||||
)
|
||||
|
||||
def is_null(self) -> Self:
|
||||
return self._with_elementwise(lambda expr: expr.isnull())
|
||||
|
||||
def is_nan(self) -> Self:
|
||||
return self._with_elementwise(lambda expr: F("isnan", expr))
|
||||
|
||||
def is_finite(self) -> Self:
|
||||
return self._with_elementwise(lambda expr: F("isfinite", expr))
|
||||
|
||||
def is_in(self, other: Sequence[Any]) -> Self:
|
||||
return self._with_elementwise(lambda expr: F("contains", lit(other), expr))
|
||||
|
||||
def round(self, decimals: int) -> Self:
|
||||
return self._with_elementwise(lambda expr: F("round", expr, lit(decimals)))
|
||||
|
||||
@requires.backend_version((1, 3))
|
||||
def shift(self, n: int) -> Self:
|
||||
def func(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> Sequence[Expression]:
|
||||
return [
|
||||
window_expression(
|
||||
F("lag", expr, lit(n)), inputs.partition_by, inputs.order_by
|
||||
)
|
||||
for expr in self(df)
|
||||
]
|
||||
|
||||
return self._with_window_function(func)
|
||||
|
||||
@requires.backend_version((1, 3))
|
||||
def is_first_distinct(self) -> Self:
|
||||
def func(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> Sequence[Expression]:
|
||||
return [
|
||||
window_expression(
|
||||
F("row_number"), (*inputs.partition_by, expr), inputs.order_by
|
||||
)
|
||||
== lit(1)
|
||||
for expr in self(df)
|
||||
]
|
||||
|
||||
return self._with_window_function(func)
|
||||
|
||||
@requires.backend_version((1, 3))
|
||||
def is_last_distinct(self) -> Self:
|
||||
def func(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> Sequence[Expression]:
|
||||
return [
|
||||
window_expression(
|
||||
F("row_number"),
|
||||
(*inputs.partition_by, expr),
|
||||
inputs.order_by,
|
||||
descending=[True] * len(inputs.order_by),
|
||||
nulls_last=[True] * len(inputs.order_by),
|
||||
)
|
||||
== lit(1)
|
||||
for expr in self(df)
|
||||
]
|
||||
|
||||
return self._with_window_function(func)
|
||||
|
||||
@requires.backend_version((1, 3))
|
||||
def diff(self) -> Self:
|
||||
def func(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> list[Expression]:
|
||||
return [
|
||||
expr
|
||||
- window_expression(F("lag", expr), inputs.partition_by, inputs.order_by)
|
||||
for expr in self(df)
|
||||
]
|
||||
|
||||
return self._with_window_function(func)
|
||||
|
||||
@requires.backend_version((1, 3))
|
||||
def cum_sum(self, *, reverse: bool) -> Self:
|
||||
return self._with_window_function(self._cum_window_func("sum", reverse=reverse))
|
||||
|
||||
@requires.backend_version((1, 3))
|
||||
def cum_max(self, *, reverse: bool) -> Self:
|
||||
return self._with_window_function(self._cum_window_func("max", reverse=reverse))
|
||||
|
||||
@requires.backend_version((1, 3))
|
||||
def cum_min(self, *, reverse: bool) -> Self:
|
||||
return self._with_window_function(self._cum_window_func("min", reverse=reverse))
|
||||
|
||||
@requires.backend_version((1, 3))
|
||||
def cum_count(self, *, reverse: bool) -> Self:
|
||||
return self._with_window_function(self._cum_window_func("count", reverse=reverse))
|
||||
|
||||
@requires.backend_version((1, 3))
|
||||
def cum_prod(self, *, reverse: bool) -> Self:
|
||||
return self._with_window_function(
|
||||
self._cum_window_func("product", reverse=reverse)
|
||||
)
|
||||
|
||||
@requires.backend_version((1, 3))
|
||||
def rolling_sum(self, window_size: int, *, min_samples: int, center: bool) -> Self:
|
||||
return self._with_window_function(
|
||||
self._rolling_window_func("sum", window_size, min_samples, center=center)
|
||||
)
|
||||
|
||||
@requires.backend_version((1, 3))
|
||||
def rolling_mean(self, window_size: int, *, min_samples: int, center: bool) -> Self:
|
||||
return self._with_window_function(
|
||||
self._rolling_window_func("mean", window_size, min_samples, center=center)
|
||||
)
|
||||
|
||||
@requires.backend_version((1, 3))
|
||||
def rolling_var(
|
||||
self, window_size: int, *, min_samples: int, center: bool, ddof: int
|
||||
) -> Self:
|
||||
return self._with_window_function(
|
||||
self._rolling_window_func(
|
||||
"var", window_size, min_samples, ddof=ddof, center=center
|
||||
)
|
||||
)
|
||||
|
||||
@requires.backend_version((1, 3))
|
||||
def rolling_std(
|
||||
self, window_size: int, *, min_samples: int, center: bool, ddof: int
|
||||
) -> Self:
|
||||
return self._with_window_function(
|
||||
self._rolling_window_func(
|
||||
"std", window_size, min_samples, ddof=ddof, center=center
|
||||
)
|
||||
)
|
||||
|
||||
def fill_null(
|
||||
self,
|
||||
value: Self | NonNestedLiteral,
|
||||
strategy: FillNullStrategy | None,
|
||||
limit: int | None,
|
||||
) -> Self:
|
||||
if strategy is not None:
|
||||
if self._backend_version < (1, 3): # pragma: no cover
|
||||
msg = f"`fill_null` with `strategy={strategy}` is only available in 'duckdb>=1.3.0'."
|
||||
raise NotImplementedError(msg)
|
||||
|
||||
def _fill_with_strategy(
|
||||
df: DuckDBLazyFrame, inputs: DuckDBWindowInputs
|
||||
) -> Sequence[Expression]:
|
||||
fill_func = "last_value" if strategy == "forward" else "first_value"
|
||||
_limit = "unbounded" if limit is None else limit
|
||||
rows_start, rows_end = (
|
||||
(f"{_limit} preceding", "current row")
|
||||
if strategy == "forward"
|
||||
else ("current row", f"{_limit} following")
|
||||
)
|
||||
return [
|
||||
window_expression(
|
||||
F(fill_func, expr),
|
||||
inputs.partition_by,
|
||||
inputs.order_by,
|
||||
rows_start=rows_start,
|
||||
rows_end=rows_end,
|
||||
ignore_nulls=True,
|
||||
)
|
||||
for expr in self(df)
|
||||
]
|
||||
|
||||
return self._with_window_function(_fill_with_strategy)
|
||||
|
||||
def _fill_constant(expr: Expression, value: Any) -> Expression:
|
||||
return CoalesceOperator(expr, value)
|
||||
|
||||
return self._with_elementwise(_fill_constant, value=value)
|
||||
|
||||
def cast(self, dtype: IntoDType) -> Self:
|
||||
def func(df: DuckDBLazyFrame) -> list[Expression]:
|
||||
tz = DeferredTimeZone(df.native)
|
||||
native_dtype = narwhals_to_native_dtype(dtype, self._version, tz)
|
||||
return [expr.cast(DuckDBPyType(native_dtype)) for expr in self(df)]
|
||||
|
||||
def window_f(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> list[Expression]:
|
||||
tz = DeferredTimeZone(df.native)
|
||||
native_dtype = narwhals_to_native_dtype(dtype, self._version, tz)
|
||||
return [
|
||||
expr.cast(DuckDBPyType(native_dtype))
|
||||
for expr in self.window_function(df, inputs)
|
||||
]
|
||||
|
||||
return self.__class__(
|
||||
func,
|
||||
window_f,
|
||||
evaluate_output_names=self._evaluate_output_names,
|
||||
alias_output_names=self._alias_output_names,
|
||||
version=self._version,
|
||||
)
|
||||
|
||||
@requires.backend_version((1, 3))
|
||||
def is_unique(self) -> Self:
|
||||
def _is_unique(expr: Expression, *partition_by: str | Expression) -> Expression:
|
||||
return window_expression(
|
||||
F("count", StarExpression()), (expr, *partition_by)
|
||||
) == lit(1)
|
||||
|
||||
def _unpartitioned_is_unique(expr: Expression) -> Expression:
|
||||
return _is_unique(expr)
|
||||
|
||||
def _partitioned_is_unique(
|
||||
df: DuckDBLazyFrame, inputs: DuckDBWindowInputs
|
||||
) -> Sequence[Expression]:
|
||||
assert not inputs.order_by # noqa: S101
|
||||
return [_is_unique(expr, *inputs.partition_by) for expr in self(df)]
|
||||
|
||||
return self._with_callable(_unpartitioned_is_unique)._with_window_function(
|
||||
_partitioned_is_unique
|
||||
)
|
||||
|
||||
@requires.backend_version((1, 3))
|
||||
def rank(self, method: RankMethod, *, descending: bool) -> Self:
|
||||
if method in {"min", "max", "average"}:
|
||||
func = F("rank")
|
||||
elif method == "dense":
|
||||
func = F("dense_rank")
|
||||
else: # method == "ordinal"
|
||||
func = F("row_number")
|
||||
|
||||
def _rank(
|
||||
expr: Expression,
|
||||
partition_by: Sequence[str | Expression] = (),
|
||||
order_by: Sequence[str | Expression] = (),
|
||||
*,
|
||||
descending: Sequence[bool],
|
||||
nulls_last: Sequence[bool],
|
||||
) -> Expression:
|
||||
count_expr = F("count", StarExpression())
|
||||
window_kwargs: WindowExpressionKwargs = {
|
||||
"partition_by": partition_by,
|
||||
"order_by": (expr, *order_by),
|
||||
"descending": descending,
|
||||
"nulls_last": nulls_last,
|
||||
}
|
||||
count_window_kwargs: WindowExpressionKwargs = {
|
||||
"partition_by": (*partition_by, expr)
|
||||
}
|
||||
if method == "max":
|
||||
rank_expr = (
|
||||
window_expression(func, **window_kwargs)
|
||||
+ window_expression(count_expr, **count_window_kwargs)
|
||||
- lit(1)
|
||||
)
|
||||
elif method == "average":
|
||||
rank_expr = window_expression(func, **window_kwargs) + (
|
||||
window_expression(count_expr, **count_window_kwargs) - lit(1)
|
||||
) / lit(2.0)
|
||||
else:
|
||||
rank_expr = window_expression(func, **window_kwargs)
|
||||
return when(expr.isnotnull(), rank_expr)
|
||||
|
||||
def _unpartitioned_rank(expr: Expression) -> Expression:
|
||||
return _rank(expr, descending=[descending], nulls_last=[True])
|
||||
|
||||
def _partitioned_rank(
|
||||
df: DuckDBLazyFrame, inputs: DuckDBWindowInputs
|
||||
) -> Sequence[Expression]:
|
||||
# node: when `descending` / `nulls_last` are supported in `.over`, they should be respected here
|
||||
# https://github.com/narwhals-dev/narwhals/issues/2790
|
||||
return [
|
||||
_rank(
|
||||
expr,
|
||||
inputs.partition_by,
|
||||
inputs.order_by,
|
||||
descending=[descending] + [False] * len(inputs.order_by),
|
||||
nulls_last=[True] + [False] * len(inputs.order_by),
|
||||
)
|
||||
for expr in self(df)
|
||||
]
|
||||
|
||||
return self._with_callable(_unpartitioned_rank)._with_window_function(
|
||||
_partitioned_rank
|
||||
)
|
||||
|
||||
def log(self, base: float) -> Self:
|
||||
def _log(expr: Expression) -> Expression:
|
||||
log = F("log", expr)
|
||||
return (
|
||||
when(expr < lit(0), lit(float("nan")))
|
||||
.when(expr == lit(0), lit(float("-inf")))
|
||||
.otherwise(log / F("log", lit(base)))
|
||||
)
|
||||
|
||||
return self._with_elementwise(_log)
|
||||
|
||||
def exp(self) -> Self:
|
||||
def _exp(expr: Expression) -> Expression:
|
||||
return F("exp", expr)
|
||||
|
||||
return self._with_elementwise(_exp)
|
||||
|
||||
def sqrt(self) -> Self:
|
||||
def _sqrt(expr: Expression) -> Expression:
|
||||
return when(expr < lit(0), lit(float("nan"))).otherwise(F("sqrt", expr))
|
||||
|
||||
return self._with_elementwise(_sqrt)
|
||||
|
||||
@property
|
||||
def str(self) -> DuckDBExprStringNamespace:
|
||||
return DuckDBExprStringNamespace(self)
|
||||
|
||||
@property
|
||||
def dt(self) -> DuckDBExprDateTimeNamespace:
|
||||
return DuckDBExprDateTimeNamespace(self)
|
||||
|
||||
@property
|
||||
def list(self) -> DuckDBExprListNamespace:
|
||||
return DuckDBExprListNamespace(self)
|
||||
|
||||
@property
|
||||
def struct(self) -> DuckDBExprStructNamespace:
|
||||
return DuckDBExprStructNamespace(self)
|
||||
|
||||
drop_nulls = not_implemented()
|
||||
unique = not_implemented()
|
||||
157
myenv/lib/python3.11/site-packages/narwhals/_duckdb/expr_dt.py
Normal file
157
myenv/lib/python3.11/site-packages/narwhals/_duckdb/expr_dt.py
Normal file
@@ -0,0 +1,157 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from narwhals._compliant import LazyExprNamespace
|
||||
from narwhals._compliant.any_namespace import DateTimeNamespace
|
||||
from narwhals._constants import (
|
||||
MS_PER_MINUTE,
|
||||
MS_PER_SECOND,
|
||||
NS_PER_SECOND,
|
||||
SECONDS_PER_MINUTE,
|
||||
US_PER_MINUTE,
|
||||
US_PER_SECOND,
|
||||
)
|
||||
from narwhals._duckdb.utils import UNITS_DICT, F, fetch_rel_time_zone, lit
|
||||
from narwhals._duration import Interval
|
||||
from narwhals._utils import not_implemented
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Sequence
|
||||
|
||||
from duckdb import Expression
|
||||
|
||||
from narwhals._duckdb.dataframe import DuckDBLazyFrame
|
||||
from narwhals._duckdb.expr import DuckDBExpr
|
||||
|
||||
|
||||
class DuckDBExprDateTimeNamespace(
|
||||
LazyExprNamespace["DuckDBExpr"], DateTimeNamespace["DuckDBExpr"]
|
||||
):
|
||||
def year(self) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(lambda expr: F("year", expr))
|
||||
|
||||
def month(self) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(lambda expr: F("month", expr))
|
||||
|
||||
def day(self) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(lambda expr: F("day", expr))
|
||||
|
||||
def hour(self) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(lambda expr: F("hour", expr))
|
||||
|
||||
def minute(self) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(lambda expr: F("minute", expr))
|
||||
|
||||
def second(self) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(lambda expr: F("second", expr))
|
||||
|
||||
def millisecond(self) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(
|
||||
lambda expr: F("millisecond", expr) - F("second", expr) * lit(MS_PER_SECOND)
|
||||
)
|
||||
|
||||
def microsecond(self) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(
|
||||
lambda expr: F("microsecond", expr) - F("second", expr) * lit(US_PER_SECOND)
|
||||
)
|
||||
|
||||
def nanosecond(self) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(
|
||||
lambda expr: F("nanosecond", expr) - F("second", expr) * lit(NS_PER_SECOND)
|
||||
)
|
||||
|
||||
def to_string(self, format: str) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(
|
||||
lambda expr: F("strftime", expr, lit(format))
|
||||
)
|
||||
|
||||
def weekday(self) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(lambda expr: F("isodow", expr))
|
||||
|
||||
def ordinal_day(self) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(lambda expr: F("dayofyear", expr))
|
||||
|
||||
def date(self) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(lambda expr: expr.cast("date"))
|
||||
|
||||
def total_minutes(self) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(
|
||||
lambda expr: F("datepart", lit("minute"), expr)
|
||||
)
|
||||
|
||||
def total_seconds(self) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(
|
||||
lambda expr: lit(SECONDS_PER_MINUTE) * F("datepart", lit("minute"), expr)
|
||||
+ F("datepart", lit("second"), expr)
|
||||
)
|
||||
|
||||
def total_milliseconds(self) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(
|
||||
lambda expr: lit(MS_PER_MINUTE) * F("datepart", lit("minute"), expr)
|
||||
+ F("datepart", lit("millisecond"), expr)
|
||||
)
|
||||
|
||||
def total_microseconds(self) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(
|
||||
lambda expr: lit(US_PER_MINUTE) * F("datepart", lit("minute"), expr)
|
||||
+ F("datepart", lit("microsecond"), expr)
|
||||
)
|
||||
|
||||
def truncate(self, every: str) -> DuckDBExpr:
|
||||
interval = Interval.parse(every)
|
||||
multiple, unit = interval.multiple, interval.unit
|
||||
if multiple != 1:
|
||||
# https://github.com/duckdb/duckdb/issues/17554
|
||||
msg = f"Only multiple 1 is currently supported for DuckDB.\nGot {multiple!s}."
|
||||
raise ValueError(msg)
|
||||
if unit == "ns":
|
||||
msg = "Truncating to nanoseconds is not yet supported for DuckDB."
|
||||
raise NotImplementedError(msg)
|
||||
format = lit(UNITS_DICT[unit])
|
||||
|
||||
def _truncate(expr: Expression) -> Expression:
|
||||
return F("date_trunc", format, expr)
|
||||
|
||||
return self.compliant._with_elementwise(_truncate)
|
||||
|
||||
def offset_by(self, by: str) -> DuckDBExpr:
|
||||
interval = Interval.parse_no_constraints(by)
|
||||
format = lit(f"{interval.multiple!s} {UNITS_DICT[interval.unit]}")
|
||||
|
||||
def _offset_by(expr: Expression) -> Expression:
|
||||
return F("date_add", format, expr)
|
||||
|
||||
return self.compliant._with_callable(_offset_by)
|
||||
|
||||
def _no_op_time_zone(self, time_zone: str) -> DuckDBExpr:
|
||||
def func(df: DuckDBLazyFrame) -> Sequence[Expression]:
|
||||
native_series_list = self.compliant(df)
|
||||
conn_time_zone = fetch_rel_time_zone(df.native)
|
||||
if conn_time_zone != time_zone:
|
||||
msg = (
|
||||
"DuckDB stores the time zone in the connection, rather than in the "
|
||||
f"data type, so changing the timezone to anything other than {conn_time_zone} "
|
||||
" (the current connection time zone) is not supported."
|
||||
)
|
||||
raise NotImplementedError(msg)
|
||||
return native_series_list
|
||||
|
||||
return self.compliant.__class__(
|
||||
func,
|
||||
evaluate_output_names=self.compliant._evaluate_output_names,
|
||||
alias_output_names=self.compliant._alias_output_names,
|
||||
version=self.compliant._version,
|
||||
)
|
||||
|
||||
def convert_time_zone(self, time_zone: str) -> DuckDBExpr:
|
||||
return self._no_op_time_zone(time_zone)
|
||||
|
||||
def replace_time_zone(self, time_zone: str | None) -> DuckDBExpr:
|
||||
if time_zone is None:
|
||||
return self.compliant._with_elementwise(lambda expr: expr.cast("timestamp"))
|
||||
else:
|
||||
return self._no_op_time_zone(time_zone)
|
||||
|
||||
total_nanoseconds = not_implemented()
|
||||
timestamp = not_implemented()
|
||||
@@ -0,0 +1,17 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from narwhals._compliant import LazyExprNamespace
|
||||
from narwhals._compliant.any_namespace import ListNamespace
|
||||
from narwhals._duckdb.utils import F
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from narwhals._duckdb.expr import DuckDBExpr
|
||||
|
||||
|
||||
class DuckDBExprListNamespace(
|
||||
LazyExprNamespace["DuckDBExpr"], ListNamespace["DuckDBExpr"]
|
||||
):
|
||||
def len(self) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(lambda expr: F("len", expr))
|
||||
128
myenv/lib/python3.11/site-packages/narwhals/_duckdb/expr_str.py
Normal file
128
myenv/lib/python3.11/site-packages/narwhals/_duckdb/expr_str.py
Normal file
@@ -0,0 +1,128 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from narwhals._compliant import LazyExprNamespace
|
||||
from narwhals._compliant.any_namespace import StringNamespace
|
||||
from narwhals._duckdb.utils import F, lit, when
|
||||
from narwhals._utils import not_implemented
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from duckdb import Expression
|
||||
|
||||
from narwhals._duckdb.expr import DuckDBExpr
|
||||
|
||||
|
||||
class DuckDBExprStringNamespace(
|
||||
LazyExprNamespace["DuckDBExpr"], StringNamespace["DuckDBExpr"]
|
||||
):
|
||||
def starts_with(self, prefix: str) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(
|
||||
lambda expr: F("starts_with", expr, lit(prefix))
|
||||
)
|
||||
|
||||
def ends_with(self, suffix: str) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(
|
||||
lambda expr: F("ends_with", expr, lit(suffix))
|
||||
)
|
||||
|
||||
def contains(self, pattern: str, *, literal: bool) -> DuckDBExpr:
|
||||
def func(expr: Expression) -> Expression:
|
||||
if literal:
|
||||
return F("contains", expr, lit(pattern))
|
||||
return F("regexp_matches", expr, lit(pattern))
|
||||
|
||||
return self.compliant._with_elementwise(func)
|
||||
|
||||
def slice(self, offset: int, length: int | None) -> DuckDBExpr:
|
||||
def func(expr: Expression) -> Expression:
|
||||
offset_lit = lit(offset)
|
||||
return F(
|
||||
"array_slice",
|
||||
expr,
|
||||
lit(offset + 1)
|
||||
if offset >= 0
|
||||
else F("length", expr) + offset_lit + lit(1),
|
||||
F("length", expr) if length is None else lit(length) + offset_lit,
|
||||
)
|
||||
|
||||
return self.compliant._with_elementwise(func)
|
||||
|
||||
def split(self, by: str) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(
|
||||
lambda expr: F("str_split", expr, lit(by))
|
||||
)
|
||||
|
||||
def len_chars(self) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(lambda expr: F("length", expr))
|
||||
|
||||
def to_lowercase(self) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(lambda expr: F("lower", expr))
|
||||
|
||||
def to_uppercase(self) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(lambda expr: F("upper", expr))
|
||||
|
||||
def strip_chars(self, characters: str | None) -> DuckDBExpr:
|
||||
import string
|
||||
|
||||
return self.compliant._with_elementwise(
|
||||
lambda expr: F(
|
||||
"trim", expr, lit(string.whitespace if characters is None else characters)
|
||||
)
|
||||
)
|
||||
|
||||
def replace_all(self, pattern: str, value: str, *, literal: bool) -> DuckDBExpr:
|
||||
if not literal:
|
||||
return self.compliant._with_elementwise(
|
||||
lambda expr: F("regexp_replace", expr, lit(pattern), lit(value), lit("g"))
|
||||
)
|
||||
return self.compliant._with_elementwise(
|
||||
lambda expr: F("replace", expr, lit(pattern), lit(value))
|
||||
)
|
||||
|
||||
def to_datetime(self, format: str | None) -> DuckDBExpr:
|
||||
if format is None:
|
||||
msg = "Cannot infer format with DuckDB backend, please specify `format` explicitly."
|
||||
raise NotImplementedError(msg)
|
||||
|
||||
return self.compliant._with_elementwise(
|
||||
lambda expr: F("strptime", expr, lit(format))
|
||||
)
|
||||
|
||||
def to_date(self, format: str | None) -> DuckDBExpr:
|
||||
if format is not None:
|
||||
return self.to_datetime(format=format).dt.date()
|
||||
|
||||
compliant_expr = self.compliant
|
||||
return compliant_expr.cast(compliant_expr._version.dtypes.Date())
|
||||
|
||||
def zfill(self, width: int) -> DuckDBExpr:
|
||||
# DuckDB does not have a built-in zfill function, so we need to implement it manually
|
||||
# using string manipulation functions.
|
||||
|
||||
def func(expr: Expression) -> Expression:
|
||||
less_than_width = F("length", expr) < lit(width)
|
||||
zero, hyphen, plus = lit("0"), lit("-"), lit("+")
|
||||
|
||||
starts_with_minus = F("starts_with", expr, hyphen)
|
||||
starts_with_plus = F("starts_with", expr, plus)
|
||||
substring = F("substr", expr, lit(2))
|
||||
padded_substring = F("lpad", substring, lit(width - 1), zero)
|
||||
return (
|
||||
when(
|
||||
starts_with_minus & less_than_width,
|
||||
F("concat", hyphen, padded_substring),
|
||||
)
|
||||
.when(
|
||||
starts_with_plus & less_than_width,
|
||||
F("concat", plus, padded_substring),
|
||||
)
|
||||
.when(less_than_width, F("lpad", expr, lit(width), zero))
|
||||
.otherwise(expr)
|
||||
)
|
||||
|
||||
# can't use `_with_elementwise` due to `when` operator.
|
||||
# TODO(unassigned): implement `window_func` like we do in `Expr.cast`
|
||||
return self.compliant._with_callable(func)
|
||||
|
||||
replace = not_implemented()
|
||||
@@ -0,0 +1,19 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from narwhals._compliant import LazyExprNamespace
|
||||
from narwhals._compliant.any_namespace import StructNamespace
|
||||
from narwhals._duckdb.utils import F, lit
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from narwhals._duckdb.expr import DuckDBExpr
|
||||
|
||||
|
||||
class DuckDBExprStructNamespace(
|
||||
LazyExprNamespace["DuckDBExpr"], StructNamespace["DuckDBExpr"]
|
||||
):
|
||||
def field(self, name: str) -> DuckDBExpr:
|
||||
return self.compliant._with_elementwise(
|
||||
lambda expr: F("struct_extract", expr, lit(name))
|
||||
).alias(name)
|
||||
@@ -0,0 +1,33 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from itertools import chain
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from narwhals._compliant import LazyGroupBy
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Sequence
|
||||
|
||||
from duckdb import Expression # noqa: F401
|
||||
|
||||
from narwhals._duckdb.dataframe import DuckDBLazyFrame
|
||||
from narwhals._duckdb.expr import DuckDBExpr
|
||||
|
||||
|
||||
class DuckDBGroupBy(LazyGroupBy["DuckDBLazyFrame", "DuckDBExpr", "Expression"]):
|
||||
def __init__(
|
||||
self,
|
||||
df: DuckDBLazyFrame,
|
||||
keys: Sequence[DuckDBExpr] | Sequence[str],
|
||||
/,
|
||||
*,
|
||||
drop_null_keys: bool,
|
||||
) -> None:
|
||||
frame, self._keys, self._output_key_names = self._parse_keys(df, keys=keys)
|
||||
self._compliant_frame = frame.drop_nulls(self._keys) if drop_null_keys else frame
|
||||
|
||||
def agg(self, *exprs: DuckDBExpr) -> DuckDBLazyFrame:
|
||||
agg_columns = list(chain(self._keys, self._evaluate_exprs(exprs)))
|
||||
return self.compliant._with_native(
|
||||
self.compliant.native.aggregate(agg_columns) # type: ignore[arg-type]
|
||||
).rename(dict(zip(self._keys, self._output_key_names)))
|
||||
202
myenv/lib/python3.11/site-packages/narwhals/_duckdb/namespace.py
Normal file
202
myenv/lib/python3.11/site-packages/narwhals/_duckdb/namespace.py
Normal file
@@ -0,0 +1,202 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import operator
|
||||
from functools import reduce
|
||||
from itertools import chain
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import duckdb
|
||||
from duckdb import CoalesceOperator, Expression
|
||||
from duckdb.typing import BIGINT, VARCHAR
|
||||
|
||||
from narwhals._compliant import LazyNamespace, LazyThen, LazyWhen
|
||||
from narwhals._duckdb.dataframe import DuckDBLazyFrame
|
||||
from narwhals._duckdb.expr import DuckDBExpr
|
||||
from narwhals._duckdb.selectors import DuckDBSelectorNamespace
|
||||
from narwhals._duckdb.utils import (
|
||||
DeferredTimeZone,
|
||||
F,
|
||||
concat_str,
|
||||
lit,
|
||||
narwhals_to_native_dtype,
|
||||
when,
|
||||
)
|
||||
from narwhals._expression_parsing import (
|
||||
combine_alias_output_names,
|
||||
combine_evaluate_output_names,
|
||||
)
|
||||
from narwhals._utils import Implementation
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterable, Sequence
|
||||
|
||||
from narwhals._duckdb.expr import DuckDBWindowInputs
|
||||
from narwhals._utils import Version
|
||||
from narwhals.typing import ConcatMethod, IntoDType, NonNestedLiteral
|
||||
|
||||
|
||||
class DuckDBNamespace(
|
||||
LazyNamespace[DuckDBLazyFrame, DuckDBExpr, duckdb.DuckDBPyRelation]
|
||||
):
|
||||
_implementation: Implementation = Implementation.DUCKDB
|
||||
|
||||
def __init__(self, *, version: Version) -> None:
|
||||
self._version = version
|
||||
|
||||
@property
|
||||
def selectors(self) -> DuckDBSelectorNamespace:
|
||||
return DuckDBSelectorNamespace.from_namespace(self)
|
||||
|
||||
@property
|
||||
def _expr(self) -> type[DuckDBExpr]:
|
||||
return DuckDBExpr
|
||||
|
||||
@property
|
||||
def _lazyframe(self) -> type[DuckDBLazyFrame]:
|
||||
return DuckDBLazyFrame
|
||||
|
||||
def concat(
|
||||
self, items: Iterable[DuckDBLazyFrame], *, how: ConcatMethod
|
||||
) -> DuckDBLazyFrame:
|
||||
native_items = [item._native_frame for item in items]
|
||||
items = list(items)
|
||||
first = items[0]
|
||||
schema = first.schema
|
||||
if how == "vertical" and not all(x.schema == schema for x in items[1:]):
|
||||
msg = "inputs should all have the same schema"
|
||||
raise TypeError(msg)
|
||||
res = reduce(lambda x, y: x.union(y), native_items)
|
||||
return first._with_native(res)
|
||||
|
||||
def concat_str(
|
||||
self, *exprs: DuckDBExpr, separator: str, ignore_nulls: bool
|
||||
) -> DuckDBExpr:
|
||||
def func(df: DuckDBLazyFrame) -> list[Expression]:
|
||||
cols = list(chain.from_iterable(expr(df) for expr in exprs))
|
||||
if not ignore_nulls:
|
||||
null_mask_result = reduce(operator.or_, (s.isnull() for s in cols))
|
||||
cols_separated = [
|
||||
y
|
||||
for x in [
|
||||
(col.cast(VARCHAR),)
|
||||
if i == len(cols) - 1
|
||||
else (col.cast(VARCHAR), lit(separator))
|
||||
for i, col in enumerate(cols)
|
||||
]
|
||||
for y in x
|
||||
]
|
||||
return [when(~null_mask_result, concat_str(*cols_separated))]
|
||||
else:
|
||||
return [concat_str(*cols, separator=separator)]
|
||||
|
||||
return self._expr(
|
||||
call=func,
|
||||
evaluate_output_names=combine_evaluate_output_names(*exprs),
|
||||
alias_output_names=combine_alias_output_names(*exprs),
|
||||
version=self._version,
|
||||
)
|
||||
|
||||
def all_horizontal(self, *exprs: DuckDBExpr, ignore_nulls: bool) -> DuckDBExpr:
|
||||
def func(cols: Iterable[Expression]) -> Expression:
|
||||
it = (
|
||||
(CoalesceOperator(expr, lit(True)) for expr in cols) # noqa: FBT003
|
||||
if ignore_nulls
|
||||
else cols
|
||||
)
|
||||
return reduce(operator.and_, it)
|
||||
|
||||
return self._expr._from_elementwise_horizontal_op(func, *exprs)
|
||||
|
||||
def any_horizontal(self, *exprs: DuckDBExpr, ignore_nulls: bool) -> DuckDBExpr:
|
||||
def func(cols: Iterable[Expression]) -> Expression:
|
||||
it = (
|
||||
(CoalesceOperator(expr, lit(False)) for expr in cols) # noqa: FBT003
|
||||
if ignore_nulls
|
||||
else cols
|
||||
)
|
||||
return reduce(operator.or_, it)
|
||||
|
||||
return self._expr._from_elementwise_horizontal_op(func, *exprs)
|
||||
|
||||
def max_horizontal(self, *exprs: DuckDBExpr) -> DuckDBExpr:
|
||||
def func(cols: Iterable[Expression]) -> Expression:
|
||||
return F("greatest", *cols)
|
||||
|
||||
return self._expr._from_elementwise_horizontal_op(func, *exprs)
|
||||
|
||||
def min_horizontal(self, *exprs: DuckDBExpr) -> DuckDBExpr:
|
||||
def func(cols: Iterable[Expression]) -> Expression:
|
||||
return F("least", *cols)
|
||||
|
||||
return self._expr._from_elementwise_horizontal_op(func, *exprs)
|
||||
|
||||
def sum_horizontal(self, *exprs: DuckDBExpr) -> DuckDBExpr:
|
||||
def func(cols: Iterable[Expression]) -> Expression:
|
||||
return reduce(operator.add, (CoalesceOperator(col, lit(0)) for col in cols))
|
||||
|
||||
return self._expr._from_elementwise_horizontal_op(func, *exprs)
|
||||
|
||||
def mean_horizontal(self, *exprs: DuckDBExpr) -> DuckDBExpr:
|
||||
def func(cols: Iterable[Expression]) -> Expression:
|
||||
cols = list(cols)
|
||||
return reduce(
|
||||
operator.add, (CoalesceOperator(col, lit(0)) for col in cols)
|
||||
) / reduce(operator.add, (col.isnotnull().cast(BIGINT) for col in cols))
|
||||
|
||||
return self._expr._from_elementwise_horizontal_op(func, *exprs)
|
||||
|
||||
def when(self, predicate: DuckDBExpr) -> DuckDBWhen:
|
||||
return DuckDBWhen.from_expr(predicate, context=self)
|
||||
|
||||
def lit(self, value: NonNestedLiteral, dtype: IntoDType | None) -> DuckDBExpr:
|
||||
def func(df: DuckDBLazyFrame) -> list[Expression]:
|
||||
tz = DeferredTimeZone(df.native)
|
||||
if dtype is not None:
|
||||
target = narwhals_to_native_dtype(dtype, self._version, tz)
|
||||
return [lit(value).cast(target)] # type: ignore[arg-type]
|
||||
return [lit(value)]
|
||||
|
||||
return self._expr(
|
||||
func,
|
||||
evaluate_output_names=lambda _df: ["literal"],
|
||||
alias_output_names=None,
|
||||
version=self._version,
|
||||
)
|
||||
|
||||
def len(self) -> DuckDBExpr:
|
||||
def func(_df: DuckDBLazyFrame) -> list[Expression]:
|
||||
return [F("count")]
|
||||
|
||||
return self._expr(
|
||||
call=func,
|
||||
evaluate_output_names=lambda _df: ["len"],
|
||||
alias_output_names=None,
|
||||
version=self._version,
|
||||
)
|
||||
|
||||
def coalesce(self, *exprs: DuckDBExpr) -> DuckDBExpr:
|
||||
def func(cols: Iterable[Expression]) -> Expression:
|
||||
return CoalesceOperator(*cols)
|
||||
|
||||
return self._expr._from_elementwise_horizontal_op(func, *exprs)
|
||||
|
||||
|
||||
class DuckDBWhen(LazyWhen["DuckDBLazyFrame", Expression, DuckDBExpr]):
|
||||
@property
|
||||
def _then(self) -> type[DuckDBThen]:
|
||||
return DuckDBThen
|
||||
|
||||
def __call__(self, df: DuckDBLazyFrame) -> Sequence[Expression]:
|
||||
self.when = when
|
||||
self.lit = lit
|
||||
return super().__call__(df)
|
||||
|
||||
def _window_function(
|
||||
self, df: DuckDBLazyFrame, window_inputs: DuckDBWindowInputs
|
||||
) -> Sequence[Expression]:
|
||||
self.when = when
|
||||
self.lit = lit
|
||||
return super()._window_function(df, window_inputs)
|
||||
|
||||
|
||||
class DuckDBThen(LazyThen["DuckDBLazyFrame", Expression, DuckDBExpr], DuckDBExpr): ...
|
||||
@@ -0,0 +1,30 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from narwhals._compliant import CompliantSelector, LazySelectorNamespace
|
||||
from narwhals._duckdb.expr import DuckDBExpr
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from duckdb import Expression # noqa: F401
|
||||
|
||||
from narwhals._duckdb.dataframe import DuckDBLazyFrame # noqa: F401
|
||||
|
||||
|
||||
class DuckDBSelectorNamespace(LazySelectorNamespace["DuckDBLazyFrame", "Expression"]):
|
||||
@property
|
||||
def _selector(self) -> type[DuckDBSelector]:
|
||||
return DuckDBSelector
|
||||
|
||||
|
||||
class DuckDBSelector( # type: ignore[misc]
|
||||
CompliantSelector["DuckDBLazyFrame", "Expression"], DuckDBExpr
|
||||
):
|
||||
def _to_expr(self) -> DuckDBExpr:
|
||||
return DuckDBExpr(
|
||||
self._call,
|
||||
self._window_function,
|
||||
evaluate_output_names=self._evaluate_output_names,
|
||||
alias_output_names=self._alias_output_names,
|
||||
version=self._version,
|
||||
)
|
||||
@@ -0,0 +1,44 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from narwhals._duckdb.utils import DeferredTimeZone, native_to_narwhals_dtype
|
||||
from narwhals.dependencies import get_duckdb
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from types import ModuleType
|
||||
|
||||
import duckdb
|
||||
from typing_extensions import Never, Self
|
||||
|
||||
from narwhals._utils import Version
|
||||
from narwhals.dtypes import DType
|
||||
|
||||
|
||||
class DuckDBInterchangeSeries:
|
||||
def __init__(self, df: duckdb.DuckDBPyRelation, version: Version) -> None:
|
||||
self._native_series = df
|
||||
self._version = version
|
||||
|
||||
def __narwhals_series__(self) -> Self:
|
||||
return self
|
||||
|
||||
def __native_namespace__(self) -> ModuleType:
|
||||
return get_duckdb() # type: ignore[no-any-return]
|
||||
|
||||
@property
|
||||
def dtype(self) -> DType:
|
||||
return native_to_narwhals_dtype(
|
||||
self._native_series.types[0],
|
||||
self._version,
|
||||
DeferredTimeZone(self._native_series),
|
||||
)
|
||||
|
||||
def __getattr__(self, attr: str) -> Never:
|
||||
msg = ( # pragma: no cover
|
||||
f"Attribute {attr} is not supported for interchange-level dataframes.\n\n"
|
||||
"If you would like to see this kind of object better supported in "
|
||||
"Narwhals, please open a feature request "
|
||||
"at https://github.com/narwhals-dev/narwhals/issues."
|
||||
)
|
||||
raise NotImplementedError(msg) # pragma: no cover
|
||||
@@ -0,0 +1,18 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, TypedDict
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Sequence
|
||||
|
||||
from duckdb import Expression
|
||||
|
||||
|
||||
class WindowExpressionKwargs(TypedDict, total=False):
|
||||
partition_by: Sequence[str | Expression]
|
||||
order_by: Sequence[str | Expression]
|
||||
rows_start: str
|
||||
rows_end: str
|
||||
descending: Sequence[bool]
|
||||
nulls_last: Sequence[bool]
|
||||
ignore_nulls: bool
|
||||
371
myenv/lib/python3.11/site-packages/narwhals/_duckdb/utils.py
Normal file
371
myenv/lib/python3.11/site-packages/narwhals/_duckdb/utils.py
Normal file
@@ -0,0 +1,371 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from functools import lru_cache
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import duckdb
|
||||
|
||||
from narwhals._utils import Version, isinstance_or_issubclass
|
||||
from narwhals.exceptions import ColumnNotFoundError
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Sequence
|
||||
|
||||
from duckdb import DuckDBPyRelation, Expression
|
||||
from duckdb.typing import DuckDBPyType
|
||||
|
||||
from narwhals._compliant.typing import CompliantLazyFrameAny
|
||||
from narwhals._duckdb.dataframe import DuckDBLazyFrame
|
||||
from narwhals._duckdb.expr import DuckDBExpr
|
||||
from narwhals.dtypes import DType
|
||||
from narwhals.typing import IntoDType
|
||||
|
||||
|
||||
UNITS_DICT = {
|
||||
"y": "year",
|
||||
"q": "quarter",
|
||||
"mo": "month",
|
||||
"d": "day",
|
||||
"h": "hour",
|
||||
"m": "minute",
|
||||
"s": "second",
|
||||
"ms": "millisecond",
|
||||
"us": "microsecond",
|
||||
"ns": "nanosecond",
|
||||
}
|
||||
UNIT_TO_TIMESTAMPS = {
|
||||
"s": "TIMESTAMP_S",
|
||||
"ms": "TIMESTAMP_MS",
|
||||
"us": "TIMESTAMP",
|
||||
"ns": "TIMESTAMP_NS",
|
||||
}
|
||||
DESCENDING_TO_ORDER = {True: "desc", False: "asc"}
|
||||
NULLS_LAST_TO_NULLS_POS = {True: "nulls last", False: "nulls first"}
|
||||
|
||||
col = duckdb.ColumnExpression
|
||||
"""Alias for `duckdb.ColumnExpression`."""
|
||||
|
||||
lit = duckdb.ConstantExpression
|
||||
"""Alias for `duckdb.ConstantExpression`."""
|
||||
|
||||
when = duckdb.CaseExpression
|
||||
"""Alias for `duckdb.CaseExpression`."""
|
||||
|
||||
F = duckdb.FunctionExpression
|
||||
"""Alias for `duckdb.FunctionExpression`."""
|
||||
|
||||
|
||||
def concat_str(*exprs: Expression, separator: str = "") -> Expression:
|
||||
"""Concatenate many strings, NULL inputs are skipped.
|
||||
|
||||
Wraps [concat] and [concat_ws] `FunctionExpression`(s).
|
||||
|
||||
Arguments:
|
||||
exprs: Native columns.
|
||||
separator: String that will be used to separate the values of each column.
|
||||
|
||||
Returns:
|
||||
A new native expression.
|
||||
|
||||
[concat]: https://duckdb.org/docs/stable/sql/functions/char.html#concatstring-
|
||||
[concat_ws]: https://duckdb.org/docs/stable/sql/functions/char.html#concat_wsseparator-string-
|
||||
"""
|
||||
return F("concat_ws", lit(separator), *exprs) if separator else F("concat", *exprs)
|
||||
|
||||
|
||||
def evaluate_exprs(
|
||||
df: DuckDBLazyFrame, /, *exprs: DuckDBExpr
|
||||
) -> list[tuple[str, Expression]]:
|
||||
native_results: list[tuple[str, Expression]] = []
|
||||
for expr in exprs:
|
||||
native_series_list = expr._call(df)
|
||||
output_names = expr._evaluate_output_names(df)
|
||||
if expr._alias_output_names is not None:
|
||||
output_names = expr._alias_output_names(output_names)
|
||||
if len(output_names) != len(native_series_list): # pragma: no cover
|
||||
msg = f"Internal error: got output names {output_names}, but only got {len(native_series_list)} results"
|
||||
raise AssertionError(msg)
|
||||
native_results.extend(zip(output_names, native_series_list))
|
||||
return native_results
|
||||
|
||||
|
||||
class DeferredTimeZone:
|
||||
"""Object which gets passed between `native_to_narwhals_dtype` calls.
|
||||
|
||||
DuckDB stores the time zone in the connection, rather than in the dtypes, so
|
||||
this ensures that when calculating the schema of a dataframe with multiple
|
||||
timezone-aware columns, that the connection's time zone is only fetched once.
|
||||
|
||||
Note: we cannot make the time zone a cached `DuckDBLazyFrame` property because
|
||||
the time zone can be modified after `DuckDBLazyFrame` creation:
|
||||
|
||||
```python
|
||||
df = nw.from_native(rel)
|
||||
print(df.collect_schema())
|
||||
rel.query("set timezone = 'Asia/Kolkata'")
|
||||
print(df.collect_schema()) # should change to reflect new time zone
|
||||
```
|
||||
"""
|
||||
|
||||
_cached_time_zone: str | None = None
|
||||
|
||||
def __init__(self, rel: DuckDBPyRelation) -> None:
|
||||
self._rel = rel
|
||||
|
||||
@property
|
||||
def time_zone(self) -> str:
|
||||
"""Fetch relation time zone (if it wasn't calculated already)."""
|
||||
if self._cached_time_zone is None:
|
||||
self._cached_time_zone = fetch_rel_time_zone(self._rel)
|
||||
return self._cached_time_zone
|
||||
|
||||
|
||||
def native_to_narwhals_dtype(
|
||||
duckdb_dtype: DuckDBPyType, version: Version, deferred_time_zone: DeferredTimeZone
|
||||
) -> DType:
|
||||
duckdb_dtype_id = duckdb_dtype.id
|
||||
dtypes = version.dtypes
|
||||
|
||||
# Handle nested data types first
|
||||
if duckdb_dtype_id == "list":
|
||||
return dtypes.List(
|
||||
native_to_narwhals_dtype(duckdb_dtype.child, version, deferred_time_zone)
|
||||
)
|
||||
|
||||
if duckdb_dtype_id == "struct":
|
||||
children = duckdb_dtype.children
|
||||
return dtypes.Struct(
|
||||
[
|
||||
dtypes.Field(
|
||||
name=child[0],
|
||||
dtype=native_to_narwhals_dtype(child[1], version, deferred_time_zone),
|
||||
)
|
||||
for child in children
|
||||
]
|
||||
)
|
||||
|
||||
if duckdb_dtype_id == "array":
|
||||
child, size = duckdb_dtype.children
|
||||
shape: list[int] = [size[1]]
|
||||
|
||||
while child[1].id == "array":
|
||||
child, size = child[1].children
|
||||
shape.insert(0, size[1])
|
||||
|
||||
inner = native_to_narwhals_dtype(child[1], version, deferred_time_zone)
|
||||
return dtypes.Array(inner=inner, shape=tuple(shape))
|
||||
|
||||
if duckdb_dtype_id == "enum":
|
||||
if version is Version.V1:
|
||||
return dtypes.Enum() # type: ignore[call-arg]
|
||||
categories = duckdb_dtype.children[0][1]
|
||||
return dtypes.Enum(categories=categories)
|
||||
|
||||
if duckdb_dtype_id == "timestamp with time zone":
|
||||
return dtypes.Datetime(time_zone=deferred_time_zone.time_zone)
|
||||
|
||||
return _non_nested_native_to_narwhals_dtype(duckdb_dtype_id, version)
|
||||
|
||||
|
||||
def fetch_rel_time_zone(rel: duckdb.DuckDBPyRelation) -> str:
|
||||
result = rel.query(
|
||||
"duckdb_settings()", "select value from duckdb_settings() where name = 'TimeZone'"
|
||||
).fetchone()
|
||||
assert result is not None # noqa: S101
|
||||
return result[0] # type: ignore[no-any-return]
|
||||
|
||||
|
||||
@lru_cache(maxsize=16)
|
||||
def _non_nested_native_to_narwhals_dtype(duckdb_dtype_id: str, version: Version) -> DType:
|
||||
dtypes = version.dtypes
|
||||
return {
|
||||
"hugeint": dtypes.Int128(),
|
||||
"bigint": dtypes.Int64(),
|
||||
"integer": dtypes.Int32(),
|
||||
"smallint": dtypes.Int16(),
|
||||
"tinyint": dtypes.Int8(),
|
||||
"uhugeint": dtypes.UInt128(),
|
||||
"ubigint": dtypes.UInt64(),
|
||||
"uinteger": dtypes.UInt32(),
|
||||
"usmallint": dtypes.UInt16(),
|
||||
"utinyint": dtypes.UInt8(),
|
||||
"double": dtypes.Float64(),
|
||||
"float": dtypes.Float32(),
|
||||
"varchar": dtypes.String(),
|
||||
"date": dtypes.Date(),
|
||||
"timestamp_s": dtypes.Datetime("s"),
|
||||
"timestamp_ms": dtypes.Datetime("ms"),
|
||||
"timestamp": dtypes.Datetime(),
|
||||
"timestamp_ns": dtypes.Datetime("ns"),
|
||||
"boolean": dtypes.Boolean(),
|
||||
"interval": dtypes.Duration(),
|
||||
"decimal": dtypes.Decimal(),
|
||||
"time": dtypes.Time(),
|
||||
"blob": dtypes.Binary(),
|
||||
}.get(duckdb_dtype_id, dtypes.Unknown())
|
||||
|
||||
|
||||
def narwhals_to_native_dtype( # noqa: PLR0912,PLR0915,C901
|
||||
dtype: IntoDType, version: Version, deferred_time_zone: DeferredTimeZone
|
||||
) -> str:
|
||||
dtypes = version.dtypes
|
||||
if isinstance_or_issubclass(dtype, dtypes.Decimal):
|
||||
msg = "Casting to Decimal is not supported yet."
|
||||
raise NotImplementedError(msg)
|
||||
if isinstance_or_issubclass(dtype, dtypes.Float64):
|
||||
return "DOUBLE"
|
||||
if isinstance_or_issubclass(dtype, dtypes.Float32):
|
||||
return "FLOAT"
|
||||
if isinstance_or_issubclass(dtype, dtypes.Int128):
|
||||
return "INT128"
|
||||
if isinstance_or_issubclass(dtype, dtypes.Int64):
|
||||
return "BIGINT"
|
||||
if isinstance_or_issubclass(dtype, dtypes.Int32):
|
||||
return "INTEGER"
|
||||
if isinstance_or_issubclass(dtype, dtypes.Int16):
|
||||
return "SMALLINT"
|
||||
if isinstance_or_issubclass(dtype, dtypes.Int8):
|
||||
return "TINYINT"
|
||||
if isinstance_or_issubclass(dtype, dtypes.UInt128):
|
||||
return "UINT128"
|
||||
if isinstance_or_issubclass(dtype, dtypes.UInt64):
|
||||
return "UBIGINT"
|
||||
if isinstance_or_issubclass(dtype, dtypes.UInt32):
|
||||
return "UINTEGER"
|
||||
if isinstance_or_issubclass(dtype, dtypes.UInt16): # pragma: no cover
|
||||
return "USMALLINT"
|
||||
if isinstance_or_issubclass(dtype, dtypes.UInt8): # pragma: no cover
|
||||
return "UTINYINT"
|
||||
if isinstance_or_issubclass(dtype, dtypes.String):
|
||||
return "VARCHAR"
|
||||
if isinstance_or_issubclass(dtype, dtypes.Boolean): # pragma: no cover
|
||||
return "BOOLEAN"
|
||||
if isinstance_or_issubclass(dtype, dtypes.Time):
|
||||
return "TIME"
|
||||
if isinstance_or_issubclass(dtype, dtypes.Binary):
|
||||
return "BLOB"
|
||||
if isinstance_or_issubclass(dtype, dtypes.Categorical):
|
||||
msg = "Categorical not supported by DuckDB"
|
||||
raise NotImplementedError(msg)
|
||||
if isinstance_or_issubclass(dtype, dtypes.Enum):
|
||||
if version is Version.V1:
|
||||
msg = "Converting to Enum is not supported in narwhals.stable.v1"
|
||||
raise NotImplementedError(msg)
|
||||
if isinstance(dtype, dtypes.Enum):
|
||||
categories = "'" + "', '".join(dtype.categories) + "'"
|
||||
return f"ENUM ({categories})"
|
||||
msg = "Can not cast / initialize Enum without categories present"
|
||||
raise ValueError(msg)
|
||||
|
||||
if isinstance_or_issubclass(dtype, dtypes.Datetime):
|
||||
tu = dtype.time_unit
|
||||
tz = dtype.time_zone
|
||||
if not tz:
|
||||
return UNIT_TO_TIMESTAMPS[tu]
|
||||
if tu != "us":
|
||||
msg = f"Only microsecond precision is supported for timezone-aware `Datetime` in DuckDB, got {tu} precision"
|
||||
raise ValueError(msg)
|
||||
if tz != (rel_tz := deferred_time_zone.time_zone): # pragma: no cover
|
||||
msg = f"Only the connection time zone {rel_tz} is supported, got: {tz}."
|
||||
raise ValueError(msg)
|
||||
# TODO(unassigned): cover once https://github.com/narwhals-dev/narwhals/issues/2742 addressed
|
||||
return "TIMESTAMPTZ" # pragma: no cover
|
||||
if isinstance_or_issubclass(dtype, dtypes.Duration):
|
||||
if (tu := dtype.time_unit) != "us": # pragma: no cover
|
||||
msg = f"Only microsecond-precision Duration is supported, got {tu} precision"
|
||||
return "INTERVAL"
|
||||
if isinstance_or_issubclass(dtype, dtypes.Date):
|
||||
return "DATE"
|
||||
if isinstance_or_issubclass(dtype, dtypes.List):
|
||||
inner = narwhals_to_native_dtype(dtype.inner, version, deferred_time_zone)
|
||||
return f"{inner}[]"
|
||||
if isinstance_or_issubclass(dtype, dtypes.Struct):
|
||||
inner = ", ".join(
|
||||
f'"{field.name}" {narwhals_to_native_dtype(field.dtype, version, deferred_time_zone)}'
|
||||
for field in dtype.fields
|
||||
)
|
||||
return f"STRUCT({inner})"
|
||||
if isinstance_or_issubclass(dtype, dtypes.Array):
|
||||
shape = dtype.shape
|
||||
duckdb_shape_fmt = "".join(f"[{item}]" for item in shape)
|
||||
inner_dtype: Any = dtype
|
||||
for _ in shape:
|
||||
inner_dtype = inner_dtype.inner
|
||||
duckdb_inner = narwhals_to_native_dtype(inner_dtype, version, deferred_time_zone)
|
||||
return f"{duckdb_inner}{duckdb_shape_fmt}"
|
||||
msg = f"Unknown dtype: {dtype}" # pragma: no cover
|
||||
raise AssertionError(msg)
|
||||
|
||||
|
||||
def parse_into_expression(into_expression: str | Expression) -> Expression:
|
||||
return col(into_expression) if isinstance(into_expression, str) else into_expression
|
||||
|
||||
|
||||
def generate_partition_by_sql(*partition_by: str | Expression) -> str:
|
||||
if not partition_by:
|
||||
return ""
|
||||
by_sql = ", ".join([f"{parse_into_expression(x)}" for x in partition_by])
|
||||
return f"partition by {by_sql}"
|
||||
|
||||
|
||||
def generate_order_by_sql(
|
||||
*order_by: str | Expression, descending: Sequence[bool], nulls_last: Sequence[bool]
|
||||
) -> str:
|
||||
if not order_by:
|
||||
return ""
|
||||
by_sql = ",".join(
|
||||
f"{parse_into_expression(x)} {DESCENDING_TO_ORDER[_descending]} {NULLS_LAST_TO_NULLS_POS[_nulls_last]}"
|
||||
for x, _descending, _nulls_last in zip(order_by, descending, nulls_last)
|
||||
)
|
||||
return f"order by {by_sql}"
|
||||
|
||||
|
||||
def window_expression(
|
||||
expr: Expression,
|
||||
partition_by: Sequence[str | Expression] = (),
|
||||
order_by: Sequence[str | Expression] = (),
|
||||
rows_start: str = "",
|
||||
rows_end: str = "",
|
||||
*,
|
||||
descending: Sequence[bool] | None = None,
|
||||
nulls_last: Sequence[bool] | None = None,
|
||||
ignore_nulls: bool = False,
|
||||
) -> Expression:
|
||||
# TODO(unassigned): Replace with `duckdb.WindowExpression` when they release it.
|
||||
# https://github.com/duckdb/duckdb/discussions/14725#discussioncomment-11200348
|
||||
try:
|
||||
from duckdb import SQLExpression
|
||||
except ModuleNotFoundError as exc: # pragma: no cover
|
||||
msg = f"DuckDB>=1.3.0 is required for this operation. Found: DuckDB {duckdb.__version__}"
|
||||
raise NotImplementedError(msg) from exc
|
||||
pb = generate_partition_by_sql(*partition_by)
|
||||
descending = descending or [False] * len(order_by)
|
||||
nulls_last = nulls_last or [False] * len(order_by)
|
||||
ob = generate_order_by_sql(*order_by, descending=descending, nulls_last=nulls_last)
|
||||
|
||||
if rows_start and rows_end:
|
||||
rows = f"rows between {rows_start} and {rows_end}"
|
||||
elif rows_start or rows_end: # pragma: no cover
|
||||
msg = "Either both `rows_start` and `rows_end` must be specified, or neither."
|
||||
else:
|
||||
rows = ""
|
||||
|
||||
func = f"{str(expr).removesuffix(')')} ignore nulls)" if ignore_nulls else str(expr)
|
||||
return SQLExpression(f"{func} over ({pb} {ob} {rows})")
|
||||
|
||||
|
||||
def catch_duckdb_exception(
|
||||
exception: Exception, frame: CompliantLazyFrameAny, /
|
||||
) -> ColumnNotFoundError | Exception:
|
||||
if isinstance(exception, duckdb.BinderException) and any(
|
||||
msg in str(exception)
|
||||
for msg in (
|
||||
"not found in FROM clause",
|
||||
"this column cannot be referenced before it is defined",
|
||||
)
|
||||
):
|
||||
return ColumnNotFoundError.from_available_column_names(
|
||||
available_columns=frame.columns
|
||||
)
|
||||
# Just return exception as-is.
|
||||
return exception
|
||||
Reference in New Issue
Block a user