narwhals-dev · MarcoGorelli · Apr 27, 2025 · Mar 31, 2025 · Mar 31, 2025 · Mar 31, 2025
diff --git a/.github/workflows/downstream_tests.yml b/.github/workflows/downstream_tests.yml
@@ -489,7 +489,7 @@ jobs:
         run: |
           cd validoopsie
           # empty pytest.ini to avoid pytest using narwhals configs
-          touch pytest.ini       
+          touch pytest.ini
           touch tests/__init__.py
           touch tests/utils/__init__.py
           uv run pytest tests

diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py
@@ -381,7 +381,9 @@ def with_columns(self: ArrowDataFrame, *exprs: ArrowExpr) -> ArrowDataFrame:
 
         return self._with_native(native_frame, validate_column_names=False)
 
-    def group_by(self, *keys: str, drop_null_keys: bool) -> ArrowGroupBy:
+    def group_by(
+        self, keys: Sequence[str] | Sequence[ArrowExpr], *, drop_null_keys: bool
+    ) -> ArrowGroupBy:
         from narwhals._arrow.group_by import ArrowGroupBy
 
         return ArrowGroupBy(self, keys, drop_null_keys=drop_null_keys)

diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py
@@ -176,7 +176,7 @@ def func(df: ArrowDataFrame) -> Sequence[ArrowSeries]:
                     )
                     raise NotImplementedError(msg)
 
-                tmp = df.group_by(*partition_by, drop_null_keys=False).agg(self)
+                tmp = df.group_by(partition_by, drop_null_keys=False).agg(self)
                 tmp = df.simple_select(*partition_by).join(
                     tmp,
                     how="left",

diff --git a/narwhals/_arrow/group_by.py b/narwhals/_arrow/group_by.py
@@ -40,28 +40,28 @@ class ArrowGroupBy(EagerGroupBy["ArrowDataFrame", "ArrowExpr"]):
 
     def __init__(
         self,
-        compliant_frame: ArrowDataFrame,
-        keys: Sequence[str],
+        df: ArrowDataFrame,
+        keys: Sequence[ArrowExpr] | Sequence[str],
         /,
         *,
         drop_null_keys: bool,
     ) -> None:
-        if drop_null_keys:
-            self._compliant_frame = compliant_frame.drop_nulls(keys)
-        else:
-            self._compliant_frame = compliant_frame
-        self._keys: list[str] = list(keys)
+        self._df = df  # input frame; `__iter__` selects its columns for each group
+        frame, self._keys, self._output_key_names = self._parse_keys(df, keys=keys)
+        self._compliant_frame = frame.drop_nulls(self._keys) if drop_null_keys else frame
         self._grouped = pa.TableGroupBy(self.compliant.native, self._keys)
+        self._drop_null_keys = drop_null_keys
 
     def agg(self, *exprs: ArrowExpr) -> ArrowDataFrame:
         self._ensure_all_simple(exprs)
         aggs: list[tuple[str, str, Any]] = []
         expected_pyarrow_column_names: list[str] = self._keys.copy()
         new_column_names: list[str] = self._keys.copy()
+        exclude = (*self._keys, *self._output_key_names)
 
         for expr in exprs:
             output_names, aliases = evaluate_output_names_and_aliases(
-                expr, self.compliant, self._keys
+                expr, self.compliant, exclude
             )
 
             if expr._depth == 0:
@@ -120,7 +120,10 @@ def agg(self, *exprs: ArrowExpr) -> ArrowDataFrame:
             result_simple = result_simple.select(
                 [*self._keys, *[col for col in columns if col not in self._keys]]
             )
-        return self.compliant._with_native(result_simple)
+
+        return self.compliant._with_native(result_simple).rename(
+            dict(zip(self._keys, self._output_key_names))
+        )
 
     def __iter__(self) -> Iterator[tuple[Any, ArrowDataFrame]]:
         col_token = generate_temporary_column_name(
@@ -142,9 +145,13 @@ def __iter__(self) -> Iterator[tuple[Any, ArrowDataFrame]]:
             null_replacement=null_token,
         )
         table = table.add_column(i=0, field_=col_token, column=key_values)
+
         for v in pc.unique(key_values):
             t = self.compliant._with_native(
                 table.filter(pc.equal(table[col_token], v)).drop([col_token])
             )
             row = t.simple_select(*self._keys).row(0)
-            yield tuple(extract_py_scalar(el) for el in row), t
+            yield (
+                tuple(extract_py_scalar(el) for el in row),
+                t.simple_select(*self._df.columns),
+            )
diff --git a/narwhals/_compliant/dataframe.py b/narwhals/_compliant/dataframe.py
@@ -18,7 +18,6 @@
 from narwhals._compliant.typing import EagerSeriesT
 from narwhals._compliant.typing import NativeFrameT
 from narwhals._compliant.typing import NativeSeriesT
-from narwhals._expression_parsing import evaluate_output_names_and_aliases
 from narwhals._translate import ArrowConvertible
 from narwhals._translate import DictConvertible
 from narwhals._translate import FromNative
@@ -159,7 +158,10 @@ def filter(self, predicate: CompliantExprT_contra | Incomplete) -> Self: ...
     def gather_every(self, n: int, offset: int) -> Self: ...
     def get_column(self, name: str) -> CompliantSeriesT: ...
     def group_by(
-        self, *keys: str, drop_null_keys: bool
+        self,
+        keys: Sequence[str] | Sequence[CompliantExprT_contra],
+        *,
+        drop_null_keys: bool,
     ) -> DataFrameGroupBy[Self, Any]: ...
     def head(self, n: int) -> Self: ...
     def item(self, row: int | None, column: int | str | None) -> Any: ...
@@ -250,6 +252,10 @@ def write_csv(self, file: str | Path | BytesIO) -> None: ...
     def write_csv(self, file: str | Path | BytesIO | None) -> str | None: ...
     def write_parquet(self, file: str | Path | BytesIO) -> None: ...
 
+    def _evaluate_aliases(self, *exprs: CompliantExprT_contra) -> list[str]:
+        """Return the aliases of every expression in `exprs`, flattened in order."""
+        return [name for expr in exprs for name in expr._evaluate_aliases(self)]
+
 
 class CompliantLazyFrame(
     _StoresNative[NativeFrameT],
@@ -302,8 +308,11 @@ def filter(self, predicate: CompliantExprT_contra | Incomplete) -> Self: ...
     )
     def gather_every(self, n: int, offset: int) -> Self: ...
     def group_by(
-        self, *keys: str, drop_null_keys: bool
-    ) -> CompliantGroupBy[Self, Any]: ...
+        self,
+        keys: Sequence[str] | Sequence[CompliantExprT_contra],
+        *,
+        drop_null_keys: bool,
+    ) -> CompliantGroupBy[Self, CompliantExprT_contra]: ...
     def head(self, n: int) -> Self: ...
     def join(
         self,
@@ -349,6 +358,10 @@ def _evaluate_expr(self, expr: CompliantExprT_contra, /) -> Any:
         assert len(result) == 1  # debug assertion  # noqa: S101
         return result[0]
 
+    def _evaluate_aliases(self, *exprs: CompliantExprT_contra) -> list[str]:
+        """Return the aliases of every expression in `exprs`, flattened in order."""
+        return [name for expr in exprs for name in expr._evaluate_aliases(self)]
+
 
 class EagerDataFrame(
     CompliantDataFrame[EagerSeriesT, EagerExprT, NativeFrameT],
@@ -379,7 +392,7 @@ def _evaluate_into_expr(self, expr: EagerExprT, /) -> Sequence[EagerSeriesT]:
 
         Note that for PySpark / DuckDB, we are less free to liberally set aliases whenever we want.
         """
-        _, aliases = evaluate_output_names_and_aliases(expr, self, [])
+        aliases = expr._evaluate_aliases(self)
         result = expr(self)
         if list(aliases) != (
             result_aliases := [s.name for s in result]

diff --git a/narwhals/_compliant/expr.py b/narwhals/_compliant/expr.py
@@ -30,7 +30,6 @@
 from narwhals._compliant.typing import EagerSeriesT
 from narwhals._compliant.typing import LazyExprT
 from narwhals._compliant.typing import NativeExprT
-from narwhals._expression_parsing import evaluate_output_names_and_aliases
 from narwhals.dependencies import get_numpy
 from narwhals.dependencies import is_numpy_array
 from narwhals.dtypes import DType
@@ -195,19 +194,6 @@ def clip(
         upper_bound: Self | NumericLiteral | TemporalLiteral | None,
     ) -> Self: ...
 
-    @property
-    def str(self) -> Any: ...
-    @property
-    def name(self) -> Any: ...
-    @property
-    def dt(self) -> Any: ...
-    @property
-    def cat(self) -> Any: ...
-    @property
-    def list(self) -> Any: ...
-    @property
-    def struct(self) -> Any: ...
-
     def ewm_mean(
         self,
         *,
@@ -287,6 +273,32 @@ def _is_multi_output_unnamed(self) -> bool:
         assert self._metadata is not None  # noqa: S101
         return self._metadata.expansion_kind.is_multi_unnamed()
 
+    def _evaluate_aliases(
+        self: CompliantExpr[CompliantFrameT, Any], frame: CompliantFrameT, /
+    ) -> Sequence[str]:
+        """Return the names this expression's outputs will carry for `frame`.
+
+        These are the evaluated output names, passed through the alias function
+        when one is set.
+        """
+        output_names = self._evaluate_output_names(frame)
+        if rename := self._alias_output_names:
+            return rename(output_names)
+        return output_names
+
+    @property
+    def str(self) -> Any: ...
+    @property
+    def name(self) -> Any: ...
+    @property
+    def dt(self) -> Any: ...
+    @property
+    def cat(self) -> Any: ...
+    @property
+    def list(self) -> Any: ...
+    @property
+    def struct(self) -> Any: ...
+
 
 class DepthTrackingExpr(
     CompliantExpr[CompliantFrameT, CompliantSeriesOrNativeExprT_co],
@@ -467,7 +472,7 @@ def _reuse_series_inner(
             series._from_scalar(method(series)) if returns_scalar else method(series)
             for series in self(df)
         ]
-        _, aliases = evaluate_output_names_and_aliases(self, df, [])
+        aliases = self._evaluate_aliases(df)
         if [s.name for s in out] != list(aliases):  # pragma: no cover
             msg = (
                 f"Safety assertion failed, please report a bug to https://github.com/narwhals-dev/narwhals/issues\n"

diff --git a/narwhals/_compliant/group_by.py b/narwhals/_compliant/group_by.py
@@ -13,19 +13,27 @@
 from typing import Sequence
 from typing import TypeVar
 
+from narwhals._compliant.typing import CompliantDataFrameAny
+from narwhals._compliant.typing import CompliantDataFrameT
 from narwhals._compliant.typing import CompliantDataFrameT_co
 from narwhals._compliant.typing import CompliantExprT_contra
+from narwhals._compliant.typing import CompliantFrameT
 from narwhals._compliant.typing import CompliantFrameT_co
-from narwhals._compliant.typing import CompliantLazyFrameT_co
+from narwhals._compliant.typing import CompliantLazyFrameAny
+from narwhals._compliant.typing import CompliantLazyFrameT
 from narwhals._compliant.typing import DepthTrackingExprAny
 from narwhals._compliant.typing import DepthTrackingExprT_contra
 from narwhals._compliant.typing import EagerExprT_contra
 from narwhals._compliant.typing import LazyExprT_contra
 from narwhals._compliant.typing import NativeExprT_co
+from narwhals._expression_parsing import is_multi_output
+from narwhals.utils import is_sequence_of
 
 if TYPE_CHECKING:
     from typing_extensions import TypeAlias
 
+    _SameFrameT = TypeVar("_SameFrameT", CompliantDataFrameAny, CompliantLazyFrameAny)
+
 
 if not TYPE_CHECKING:  # pragma: no cover
     if sys.version_info >= (3, 9):
@@ -58,7 +66,6 @@
 
 class CompliantGroupBy(Protocol38[CompliantFrameT_co, CompliantExprT_contra]):
     _compliant_frame: Any
-    _keys: Sequence[str]
 
     @property
     def compliant(self) -> CompliantFrameT_co:
@@ -67,7 +74,7 @@ def compliant(self) -> CompliantFrameT_co:
     def __init__(
         self,
         compliant_frame: CompliantFrameT_co,
-        keys: Sequence[str],
+        keys: Sequence[CompliantExprT_contra] | Sequence[str],
         /,
         *,
         drop_null_keys: bool,
@@ -83,9 +90,77 @@ class DataFrameGroupBy(
     def __iter__(self) -> Iterator[tuple[Any, CompliantDataFrameT_co]]: ...
 
 
+class ParseKeysGroupBy(
+    CompliantGroupBy[CompliantFrameT, CompliantExprT_contra],
+    Protocol38[CompliantFrameT, CompliantExprT_contra],
+):
+    """`CompliantGroupBy` variant whose keys may be column names or expressions."""
+
+    def _parse_keys(
+        self,
+        compliant_frame: CompliantFrameT,
+        keys: Sequence[CompliantExprT_contra] | Sequence[str],
+    ) -> tuple[CompliantFrameT, list[str], list[str]]:
+        """Split group-by keys into a frame, grouping names and output names.
+
+        Returns:
+            The frame to group on, the column names to group by in that frame,
+            and the names the keys should carry in the result. String keys leave
+            the frame untouched and yield two equal (but distinct) lists.
+        """
+        if is_sequence_of(keys, str):
+            keys_str = list(keys)
+            return compliant_frame, keys_str, keys_str.copy()
+        else:
+            return self._parse_expr_keys(compliant_frame, keys=keys)
+
+    @staticmethod
+    def _parse_expr_keys(
+        compliant_frame: _SameFrameT, keys: Sequence[CompliantExprT_contra]
+    ) -> tuple[_SameFrameT, list[str], list[str]]:
+        """Parses key expressions to set up `.agg` operation with correct information.
+
+        Since keys are expressions, it's possible to alias any such key to match
+        other dataframe column names.
+
+        In order to match polars behavior and not overwrite columns when evaluating keys:
+
+        - We evaluate what the output key names should be, in order to remap temporary column
+            names to the expected ones, and to exclude those from unnamed expressions in
+            `.agg(...)` context (see https://github.com/narwhals-dev/narwhals/pull/2325#issuecomment-2800004520)
+        - Create temporary names for evaluated key expressions that are guaranteed to have
+            no overlap with any existing column name.
+        - Add these temporary columns to the compliant dataframe.
+
+        Returns:
+            The frame with the temporary key columns added, the temporary column
+            names to group by, and the output names for the keys.
+        """
+        # One character longer than every existing column name, so no suffixed name collides.
+        suffix_token = "_" * (max(len(str(c)) for c in compliant_frame.columns) + 1)
+        # Evaluate aliases per key rather than on the flattened list: a multi-output
+        # key contributes several names, so zipping `keys` against the flat list
+        # would pair every later key with the wrong name.
+        keys_aliases = [key._evaluate_aliases(compliant_frame) for key in keys]
+
+        safe_keys = [
+            # multi-output expression cannot have duplicate names, hence it's safe to suffix
+            key.name.suffix(suffix_token)
+            if key._metadata is not None and is_multi_output(key._metadata.expansion_kind)
+            # otherwise it's single named and we can use Expr.alias
+            else key.alias(f"{new_names[0]}{suffix_token}")
+            for key, new_names in zip(keys, keys_aliases)
+        ]
+        return (
+            compliant_frame.with_columns(*safe_keys),
+            compliant_frame._evaluate_aliases(*safe_keys),
+            [name for new_names in keys_aliases for name in new_names],
+        )
+
+
 class DepthTrackingGroupBy(
-    CompliantGroupBy[CompliantFrameT_co, DepthTrackingExprT_contra],
-    Protocol38[CompliantFrameT_co, DepthTrackingExprT_contra, NativeAggregationT_co],
+    ParseKeysGroupBy[CompliantFrameT, DepthTrackingExprT_contra],
+    Protocol38[CompliantFrameT, DepthTrackingExprT_contra, NativeAggregationT_co],
 ):
     """`CompliantGroupBy` variant, deals with `Eager` and other backends that utilize `CompliantExpr._depth`."""
 
@@ -138,16 +196,20 @@ def _leaf_name(cls, expr: DepthTrackingExprAny, /) -> NarwhalsAggregation | Any:
 
 
 class EagerGroupBy(
-    DepthTrackingGroupBy[CompliantDataFrameT_co, EagerExprT_contra, str],
-    DataFrameGroupBy[CompliantDataFrameT_co, EagerExprT_contra],
-    Protocol38[CompliantDataFrameT_co, EagerExprT_contra],
+    DepthTrackingGroupBy[CompliantDataFrameT, EagerExprT_contra, str],
+    DataFrameGroupBy[CompliantDataFrameT, EagerExprT_contra],
+    Protocol38[CompliantDataFrameT, EagerExprT_contra],
 ): ...
 
 
 class LazyGroupBy(
-    CompliantGroupBy[CompliantLazyFrameT_co, LazyExprT_contra],
-    Protocol38[CompliantLazyFrameT_co, LazyExprT_contra, NativeExprT_co],
+    ParseKeysGroupBy[CompliantLazyFrameT, LazyExprT_contra],
+    CompliantGroupBy[CompliantLazyFrameT, LazyExprT_contra],
+    Protocol38[CompliantLazyFrameT, LazyExprT_contra, NativeExprT_co],
 ):
+    _keys: list[str]
+    _output_key_names: list[str]
+
     def _evaluate_expr(self, expr: LazyExprT_contra, /) -> Iterator[NativeExprT_co]:
         output_names = expr._evaluate_output_names(self.compliant)
         aliases = (
@@ -157,8 +219,9 @@ def _evaluate_expr(self, expr: LazyExprT_contra, /) -> Iterator[NativeExprT_co]:
         )
         native_exprs = expr(self.compliant)
         if expr._is_multi_output_unnamed():
+            exclude = {*self._keys, *self._output_key_names}
             for native_expr, name, alias in zip(native_exprs, output_names, aliases):
-                if name not in self._keys:
+                if name not in exclude:
                     yield native_expr.alias(alias)
         else:
             for native_expr, alias in zip(native_exprs, aliases):

diff --git a/narwhals/_compliant/series.py b/narwhals/_compliant/series.py
@@ -307,7 +307,7 @@ def _with_native(
         """Return a new `CompliantSeries`, wrapping the native `series`.
 
         In cases when operations are known to not affect whether a result should
-        be broadcast, we can pass `preverse_broadcast=True`.
+        be broadcast, we can pass `preserve_broadcast=True`.
         Set this with care - it should only be set for unary expressions which don't
         change length or order, such as `.alias` or `.fill_null`. If in doubt, don't
         set it, you probably don't need it.

diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py
@@ -406,10 +406,12 @@ def join_asof(
             ),
         )
 
-    def group_by(self, *by: str, drop_null_keys: bool) -> DaskLazyGroupBy:
+    def group_by(
+        self, keys: Sequence[str] | Sequence[DaskExpr], *, drop_null_keys: bool
+    ) -> DaskLazyGroupBy:
         from narwhals._dask.group_by import DaskLazyGroupBy
 
-        return DaskLazyGroupBy(self, by, drop_null_keys=drop_null_keys)
+        return DaskLazyGroupBy(self, keys, drop_null_keys=drop_null_keys)
 
     def tail(self, n: int) -> Self:  # pragma: no cover
         native_frame = self.native