Merge branch 'main' into fix-merge-attrs

timhoffm · Jan 22, 2025 · dfbde9b · dfbde9b
2 parents a9de220 + 1bb264c
commit dfbde9b
Show file tree

Hide file tree

Showing 135 changed files with 2,458 additions and 604 deletions.
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
@@ -22,10 +22,11 @@ defaults:
 
 jobs:
   ubuntu:
-    runs-on: ubuntu-22.04
+    runs-on: ${{ matrix.platform }}
     timeout-minutes: 90
     strategy:
       matrix:
+        platform: [ubuntu-22.04, ubuntu-24.04-arm]
         env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml]
         # Prevent the include jobs from overriding other jobs
         pattern: [""]
@@ -35,9 +36,11 @@ jobs:
             env_file: actions-311-downstream_compat.yaml
             pattern: "not slow and not network and not single_cpu"
             pytest_target: "pandas/tests/test_downstream.py"
+            platform: ubuntu-22.04
           - name: "Minimum Versions"
             env_file: actions-310-minimum_versions.yaml
             pattern: "not slow and not network and not single_cpu"
+            platform: ubuntu-22.04
           - name: "Locale: it_IT"
             env_file: actions-311.yaml
             pattern: "not slow and not network and not single_cpu"
@@ -48,6 +51,7 @@ jobs:
             # Also install it_IT (its encoding is ISO8859-1) but do not activate it.
             # It will be temporarily activated during tests with locale.setlocale
             extra_loc: "it_IT"
+            platform: ubuntu-22.04
           - name: "Locale: zh_CN"
             env_file: actions-311.yaml
             pattern: "not slow and not network and not single_cpu"
@@ -58,25 +62,32 @@ jobs:
             # Also install zh_CN (its encoding is gb2312) but do not activate it.
             # It will be temporarily activated during tests with locale.setlocale
             extra_loc: "zh_CN"
+            platform: ubuntu-22.04
           - name: "Future infer strings"
             env_file: actions-312.yaml
             pandas_future_infer_string: "1"
+            platform: ubuntu-22.04
           - name: "Future infer strings (without pyarrow)"
             env_file: actions-311.yaml
             pandas_future_infer_string: "1"
+            platform: ubuntu-22.04
           - name: "Pypy"
             env_file: actions-pypy-39.yaml
             pattern: "not slow and not network and not single_cpu"
             test_args: "--max-worker-restart 0"
+            platform: ubuntu-22.04
           - name: "Numpy Dev"
             env_file: actions-311-numpydev.yaml
             pattern: "not slow and not network and not single_cpu"
             test_args: "-W error::DeprecationWarning -W error::FutureWarning"
+            platform: ubuntu-22.04
           - name: "Pyarrow Nightly"
             env_file: actions-311-pyarrownightly.yaml
             pattern: "not slow and not network and not single_cpu"
+            pandas_future_infer_string: "1"
+            platform: ubuntu-22.04
       fail-fast: false
-    name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }}
+    name: ${{ matrix.name || format('{0} {1}', matrix.platform, matrix.env_file) }}
     env:
       PATTERN: ${{ matrix.pattern }}
       LANG: ${{ matrix.lang || 'C.UTF-8' }}
@@ -91,7 +102,7 @@ jobs:
       REMOVE_PYARROW: ${{ matrix.name == 'Future infer strings (without pyarrow)' && '1' || '0' }}
     concurrency:
       # https://github.community/t/concurrecy-not-work-for-push/183068/7
-      group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_future_infer_string }}
+      group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_future_infer_string }}-${{ matrix.platform }}
       cancel-in-progress: true
 
     services:

diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
@@ -94,7 +94,8 @@ jobs:
         buildplat:
         - [ubuntu-22.04, manylinux_x86_64]
         - [ubuntu-22.04, musllinux_x86_64]
-        - [macos-12, macosx_x86_64]
+        - [ubuntu-24.04-arm, manylinux_aarch64]
+        - [macos-13, macosx_x86_64]
         # Note: M1 images on Github Actions start from macOS 14
         - [macos-14, macosx_arm64]
         - [windows-2022, win_amd64]

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -19,7 +19,7 @@ ci:
     skip: [pyright, mypy]
 repos:
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.8.1
+    rev: v0.8.6
     hooks:
     -   id: ruff
         args: [--exit-non-zero-on-fix]
@@ -34,7 +34,7 @@ repos:
     -   id: ruff-format
         exclude: ^scripts|^pandas/tests/frame/test_query_eval.py
 -   repo: https://github.com/jendrikseipp/vulture
-    rev: 'v2.13'
+    rev: 'v2.14'
     hooks:
       - id: vulture
         entry: python scripts/run_vulture.py
@@ -74,7 +74,7 @@ repos:
     hooks:
     -   id: isort
 -   repo: https://github.com/asottile/pyupgrade
-    rev: v3.19.0
+    rev: v3.19.1
     hooks:
     -   id: pyupgrade
         args: [--py310-plus]
@@ -95,12 +95,17 @@ repos:
     - id: sphinx-lint
       args: ["--enable", "all", "--disable", "line-too-long"]
 -   repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v19.1.4
+    rev: v19.1.6
     hooks:
     - id: clang-format
       files: ^pandas/_libs/src|^pandas/_libs/include
       args: [-i]
       types_or: [c, c++]
+-   repo: https://github.com/trim21/pre-commit-mirror-meson
+    rev: v1.6.1
+    hooks:
+    - id: meson-fmt
+      args: ['--inplace']
 -   repo: local
     hooks:
     -   id: pyright

diff --git a/LICENSE b/LICENSE
@@ -3,7 +3,7 @@ BSD 3-Clause License
 Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
 All rights reserved.
 
-Copyright (c) 2011-2024, Open source contributors.
+Copyright (c) 2011-2025, Open source contributors.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:

diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
@@ -594,7 +594,7 @@ def setup(self):
         self.StringIO_input = StringIO(data)
 
     def time_read_csv_index_col(self):
-        read_csv(self.StringIO_input, index_col="a")
+        read_csv(self.data(self.StringIO_input), index_col="a")
 
 
 class ReadCSVDatePyarrowEngine(StringIORewind):
@@ -605,7 +605,7 @@ def setup(self):
 
     def time_read_csv_index_col(self):
         read_csv(
-            self.StringIO_input,
+            self.data(self.StringIO_input),
             parse_dates=["a"],
             engine="pyarrow",
             dtype_backend="pyarrow",

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -72,28 +72,18 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
         -i "pandas.Period.freq GL08" \
         -i "pandas.Period.ordinal GL08" \
-        -i "pandas.RangeIndex.from_range PR01,SA01" \
         -i "pandas.Timedelta.max PR02" \
         -i "pandas.Timedelta.min PR02" \
         -i "pandas.Timedelta.resolution PR02" \
         -i "pandas.Timestamp.max PR02" \
         -i "pandas.Timestamp.min PR02" \
         -i "pandas.Timestamp.resolution PR02" \
         -i "pandas.Timestamp.tzinfo GL08" \
-        -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \
-        -i "pandas.arrays.NumpyExtensionArray SA01" \
         -i "pandas.arrays.TimedeltaArray PR07,SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
         -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
-        -i "pandas.core.resample.Resampler.max PR01,RT03,SA01" \
-        -i "pandas.core.resample.Resampler.mean SA01" \
-        -i "pandas.core.resample.Resampler.min PR01,RT03,SA01" \
-        -i "pandas.core.resample.Resampler.prod SA01" \
         -i "pandas.core.resample.Resampler.quantile PR01,PR07" \
-        -i "pandas.core.resample.Resampler.std SA01" \
         -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \
-        -i "pandas.core.resample.Resampler.var SA01" \
-        -i "pandas.errors.ValueLabelTypeMismatch SA01" \
         -i "pandas.plotting.andrews_curves RT03,SA01" \
         -i "pandas.tseries.offsets.BDay PR02,SA01" \
         -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \

diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml
@@ -23,7 +23,7 @@ dependencies:
 
   - pip:
     - "tzdata>=2022.7"
-    - "--extra-index-url https://pypi.fury.io/arrow-nightlies/"
+    - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple"
     - "--prefer-binary"
     - "--pre"
     - "pyarrow"
diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pdf b/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pdf
diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pptx b/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pptx
diff --git a/doc/cheatsheet/README.md b/doc/cheatsheet/README.md
@@ -6,10 +6,12 @@ and pick "PDF" as the format.
 
 This cheat sheet, originally written by Irv Lustig, [Princeton Consultants](https://www.princetonoptimization.com/), was inspired by the [RStudio Data Wrangling Cheatsheet](https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf).
 
-| Topic                  | PDF                                                                                                                                                                                                                                     | PPT                                                                                                                                                                                                                                               |
-|------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| Pandas_Cheat_Sheet     | <a href="https://github.com/pandas-dev/pandas/blob/main/doc/cheatsheet/Pandas_Cheat_Sheet.pdf" target="_parent"><img src="https://img.shields.io/badge/Open in PDF-%23FF0000.svg?style=flat-square&logo=adobe&logoColor=white"/></a>    | <a href="https://github.com/pandas-dev/pandas/blob/main/doc/cheatsheet/Pandas_Cheat_Sheet.pptx" target="_parent"><img src="https://img.shields.io/badge/Open in PPT-B7472A?style=flat-square&logo=microsoft-powerpoint&logoColor=white"/></a>     |
-| Pandas_Cheat_Sheet_JA  | <a href="https://github.com/pandas-dev/pandas/blob/main/doc/cheatsheet/Pandas_Cheat_Sheet_JA.pdf" target="_parent"><img src="https://img.shields.io/badge/Open in PDF-%23FF0000.svg?style=flat-square&logo=adobe&logoColor=white"/></a> | <a href="https://github.com/pandas-dev/pandas/blob/main/doc/cheatsheet/Pandas_Cheat_Sheet_JA.pptx" target="_parent"><img  src="https://img.shields.io/badge/Open in PPT-B7472A?style=flat-square&logo=microsoft-powerpoint&logoColor=white"/></a> |
+| Topic                  | Language    | PDF                                                                                                                                                                                                                                     | PPT                                                                                                                                                                                                                                               |
+|------------------------|-------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Pandas_Cheat_Sheet     | English     | <a href="https://github.com/pandas-dev/pandas/blob/main/doc/cheatsheet/Pandas_Cheat_Sheet.pdf" target="_parent"><img src="https://img.shields.io/badge/Open in PDF-%23FF0000.svg?style=flat-square&logo=adobe&logoColor=white"/></a>    | <a href="https://github.com/pandas-dev/pandas/blob/main/doc/cheatsheet/Pandas_Cheat_Sheet.pptx" target="_parent"><img src="https://img.shields.io/badge/Open in PPT-B7472A?style=flat-square&logo=microsoft-powerpoint&logoColor=white"/></a>     |
+| Pandas_Cheat_Sheet_JA  | Japanese    | <a href="https://github.com/pandas-dev/pandas/blob/main/doc/cheatsheet/Pandas_Cheat_Sheet_JA.pdf" target="_parent"><img src="https://img.shields.io/badge/Open in PDF-%23FF0000.svg?style=flat-square&logo=adobe&logoColor=white"/></a> | <a href="https://github.com/pandas-dev/pandas/blob/main/doc/cheatsheet/Pandas_Cheat_Sheet_JA.pptx" target="_parent"><img  src="https://img.shields.io/badge/Open in PPT-B7472A?style=flat-square&logo=microsoft-powerpoint&logoColor=white"/></a> |
+| Pandas_Cheat_Sheet_FA  | Persian     | <a href="https://github.com/pandas-dev/pandas/blob/main/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pdf" target="_parent"><img src="https://img.shields.io/badge/Open in PDF-%23FF0000.svg?style=flat-square&logo=adobe&logoColor=white"/></a> | <a href="https://github.com/pandas-dev/pandas/blob/main/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pptx" target="_parent"><img  src="https://img.shields.io/badge/Open in PPT-B7472A?style=flat-square&logo=microsoft-powerpoint&logoColor=white"/></a> |
+
 
 
 **Alternative**

diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst
@@ -104,6 +104,7 @@ Function application
    DataFrameGroupBy.shift
    DataFrameGroupBy.size
    DataFrameGroupBy.skew
+   DataFrameGroupBy.kurt
    DataFrameGroupBy.std
    DataFrameGroupBy.sum
    DataFrameGroupBy.var
@@ -159,6 +160,7 @@ Function application
    SeriesGroupBy.shift
    SeriesGroupBy.size
    SeriesGroupBy.skew
+   SeriesGroupBy.kurt
    SeriesGroupBy.std
    SeriesGroupBy.sum
    SeriesGroupBy.var

diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst
@@ -30,11 +30,14 @@ Rolling window functions
    Rolling.std
    Rolling.min
    Rolling.max
+   Rolling.first
+   Rolling.last
    Rolling.corr
    Rolling.cov
    Rolling.skew
    Rolling.kurt
    Rolling.apply
+   Rolling.pipe
    Rolling.aggregate
    Rolling.quantile
    Rolling.sem
@@ -71,11 +74,14 @@ Expanding window functions
    Expanding.std
    Expanding.min
    Expanding.max
+   Expanding.first
+   Expanding.last
    Expanding.corr
    Expanding.cov
    Expanding.skew
    Expanding.kurt
    Expanding.apply
+   Expanding.pipe
    Expanding.aggregate
    Expanding.quantile
    Expanding.sem

diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst
@@ -858,9 +858,10 @@ and :ref:`Advanced Indexing <advanced>` you may select along more than one axis
 
 .. warning::
 
-   ``iloc`` supports two kinds of boolean indexing. If the indexer is a boolean ``Series``,
-   an error will be raised. For instance, in the following example, ``df.iloc[s.values, 1]`` is ok.
-   The boolean indexer is an array. But ``df.iloc[s, 1]`` would raise ``ValueError``.
+   While ``loc`` supports two kinds of boolean indexing, ``iloc`` only supports indexing with a
+   boolean array. If the indexer is a boolean ``Series``, an error will be raised. For instance,
+   in the following example, ``df.iloc[s.values, 1]`` is ok because the boolean indexer is an
+   array, but ``df.iloc[s, 1]`` would raise ``ValueError``.
 
    .. ipython:: python
 

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -2340,6 +2340,7 @@ Read a URL with no options:
 .. code-block:: ipython
 
    In [320]: url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list"
+
    In [321]: pd.read_html(url)
    Out[321]:
    [                         Bank NameBank           CityCity StateSt  ...              Acquiring InstitutionAI Closing DateClosing FundFund
@@ -2366,6 +2367,7 @@ Read a URL while passing headers alongside the HTTP request:
 .. code-block:: ipython
 
    In [322]: url = 'https://www.sump.org/notes/request/' # HTTP request reflector
+
    In [323]: pd.read_html(url)
    Out[323]:
    [                   0                    1
@@ -2378,14 +2380,16 @@ Read a URL while passing headers alongside the HTTP request:
     1              Host:         www.sump.org
     2        User-Agent:    Python-urllib/3.8
     3        Connection:                close]
+
    In [324]: headers = {
-   In [325]:    'User-Agent':'Mozilla Firefox v14.0',
-   In [326]:    'Accept':'application/json',
-   In [327]:    'Connection':'keep-alive',
-   In [328]:    'Auth':'Bearer 2*/f3+fe68df*4'
-   In [329]: }
-   In [340]: pd.read_html(url, storage_options=headers)
-   Out[340]:
+      .....:    'User-Agent':'Mozilla Firefox v14.0',
+      .....:    'Accept':'application/json',
+      .....:    'Connection':'keep-alive',
+      .....:    'Auth':'Bearer 2*/f3+fe68df*4'
+      .....: }
+
+   In [325]: pd.read_html(url, storage_options=headers)
+   Out[325]:
    [                   0                    1
     0     Remote Socket:  51.15.105.256:51760
     1  Protocol Version:             HTTP/1.1

diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst
@@ -1210,11 +1210,6 @@ You may set the ``xlabel`` and ``ylabel`` arguments to give the plot custom labe
 for x and y axis. By default, pandas will pick up index name as xlabel, while leaving
 it empty for ylabel.
 
-.. ipython:: python
-   :suppress:
-
-   plt.figure();
-
 .. ipython:: python
 
    df.plot();

diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
@@ -35,8 +35,8 @@ Other enhancements
 - The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called
   when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been
   updated to work correctly with NumPy >= 2 (:issue:`57739`)
+- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` cumulative operations are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`)
 - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
--
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_230.notable_bug_fixes:
@@ -105,6 +105,7 @@ Conversion
 
 Strings
 ^^^^^^^
+- Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` where they did not raise for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`)
 - Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
 - Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`)
 - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
@@ -175,7 +176,6 @@ Other
 ^^^^^
 - Fixed usage of ``inspect`` when the optional dependencies ``pyarrow`` or ``jinja2``
   are not installed (:issue:`60196`)
--
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_230.contributors: