DEPR/BUG: Do not ignore sort in concat for DatetimeIndex (#62843)

rhshadrach · web-flow · commit 50d8d1eb4440 · 2025-11-05T10:31:51.000-08:00
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -374,6 +374,71 @@ In cases with mixed-resolution inputs, the highest resolution is used:
     In [2]: pd.to_datetime([pd.Timestamp("2024-03-22 11:43:01"), "2024-03-22 11:43:01.002"]).dtype
     Out[2]: dtype('<M8[ns]')
 
+.. _whatsnew_300.api_breaking.concat_datetime_sorting:
+
+:func:`concat` no longer ignores ``sort`` when all objects have a :class:`DatetimeIndex`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When all objects passed to :func:`concat` have a :class:`DatetimeIndex`,
+passing ``sort=False`` will now result in the non-concatenation axis not
+being sorted. Previously, the result was always sorted along
+the non-concatenation axis, even when ``sort=False`` was passed. :issue:`57335`
+
+If you do not specify the ``sort`` argument, pandas will continue to return a
+sorted result but this behavior is deprecated and you will receive a warning.
+In order to make this less noisy for users, pandas checks if not sorting would
+impact the result and only warns when it would. This check can be expensive,
+and users can skip the check by explicitly specifying ``sort=True`` or
+``sort=False``.
+
+This deprecation can also impact pandas' internal usage of :func:`concat`.
+Cases where :func:`concat` was sorting a :class:`DatetimeIndex` but not
+other indexes are considered bugs and have been fixed as noted below. However,
+it is possible some have been missed. To be cautious, pandas has *not*
+added ``sort=False`` to any internal calls where we believe behavior should not change.
+If we have missed something, users will not experience a behavior change but they
+will receive a warning about :func:`concat` even though they are not directly
+calling this function. If this does occur, we ask users to open an issue so that
+we may address any potential behavior changes.
+
+.. ipython:: python
+
+    idx1 = pd.date_range("2025-01-02", periods=3, freq="h")
+    df1 = pd.DataFrame({"a": [1, 2, 3]}, index=idx1)
+    df1
+
+    idx2 = pd.date_range("2025-01-01", periods=3, freq="h")
+    df2 = pd.DataFrame({"b": [1, 2, 3]}, index=idx2)
+    df2
+
+*Old behavior*
+
+.. code-block:: ipython
+
+    In [3]: pd.concat([df1, df2], axis=1, sort=False)
+    Out[3]:
+                           a    b
+    2025-01-01 00:00:00  NaN  1.0
+    2025-01-01 01:00:00  NaN  2.0
+    2025-01-01 02:00:00  NaN  3.0
+    2025-01-02 00:00:00  1.0  NaN
+    2025-01-02 01:00:00  2.0  NaN
+    2025-01-02 02:00:00  3.0  NaN
+
+*New behavior*
+
+.. ipython:: python
+
+    pd.concat([df1, df2], axis=1, sort=False)
+
+The following cases, where pandas' internal usage of :func:`concat` resulted in
+inconsistent sorting, are now fixed in this release.
+
+- :meth:`Series.apply` and :meth:`DataFrame.apply` with a list-like or dict-like ``func`` argument.
+- :meth:`Series.shift`, :meth:`DataFrame.shift`, :meth:`.SeriesGroupBy.shift`, :meth:`.DataFrameGroupBy.shift` with the ``periods`` argument a list of length greater than 1.
+- :meth:`DataFrame.join` with ``other`` a list of one or more Series or DataFrames and ``how="inner"``, ``how="left"``, or ``how="right"``.
+- :meth:`Series.str.cat` with ``others`` a Series or DataFrame.
+
 .. _whatsnew_300.api_breaking.value_counts_sorting:
 
 Changed behavior in :meth:`DataFrame.value_counts` and :meth:`DataFrameGroupBy.value_counts` when ``sort=False``
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
@@ -639,6 +639,7 @@ def wrap_results_dict_like(
                 results,
                 axis=axis,
                 keys=keys_to_use,
+                sort=False,
             )
         elif any(is_ndframe):
             # There is a mix of NDFrames and scalars
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -6137,7 +6137,7 @@ def shift(
                     .shift(periods=period, freq=freq, axis=axis, fill_value=fill_value)
                     .add_suffix(f"{suffix}_{period}" if suffix else f"_{period}")
                 )
-            return concat(shifted_dataframes, axis=1)
+            return concat(shifted_dataframes, axis=1, sort=False)
         elif suffix:
             raise ValueError("Cannot specify `suffix` if `periods` is an int.")
         periods = cast(int, periods)
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -5231,7 +5231,7 @@ def shift(
         return (
             shifted_dataframes[0]
             if len(shifted_dataframes) == 1
-            else concat(shifted_dataframes, axis=1)
+            else concat(shifted_dataframes, axis=1, sort=False)
         )
 
     @final
diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
@@ -64,7 +64,7 @@ def get_objs_combined_axis(
     objs,
     intersect: bool = False,
     axis: Axis = 0,
-    sort: bool = True,
+    sort: bool | lib.NoDefault = True,
 ) -> Index:
     """
     Extract combined index: return intersection or union (depending on the
@@ -81,7 +81,8 @@ def get_objs_combined_axis(
     axis : {0 or 'index', 1 or 'outer'}, default 0
         The axis to extract indexes from.
     sort : bool, default True
-        Whether the result index should come out sorted or not.
+        Whether the result index should come out sorted or not. NoDefault
+        used for deprecation in GH#57335.
 
     Returns
     -------
@@ -108,7 +109,7 @@ def _get_distinct_objs(objs: list[Index]) -> list[Index]:
 def _get_combined_index(
     indexes: list[Index],
     intersect: bool = False,
-    sort: bool = False,
+    sort: bool | lib.NoDefault = False,
 ) -> Index:
     """
     Return the union or intersection of indexes.
@@ -121,7 +122,8 @@ def _get_combined_index(
         If True, calculate the intersection between indexes. Otherwise,
         calculate the union.
     sort : bool, default False
-        Whether the result index should come out sorted or not.
+        Whether the result index should come out sorted or not. NoDefault
+        used for deprecation of GH#57335.
 
     Returns
     -------
@@ -138,10 +140,10 @@ def _get_combined_index(
         for other in indexes[1:]:
             index = index.intersection(other)
     else:
-        index = union_indexes(indexes, sort=False)
+        index = union_indexes(indexes, sort=sort if sort is lib.no_default else False)
         index = ensure_index(index)
 
-    if sort:
+    if sort and sort is not lib.no_default:
         index = safe_sort_index(index)
     return index
 
@@ -180,7 +182,7 @@ def safe_sort_index(index: Index) -> Index:
     return index
 
 
-def union_indexes(indexes, sort: bool | None = True) -> Index:
+def union_indexes(indexes, sort: bool | lib.NoDefault = True) -> Index:
     """
     Return the union of indexes.
 
@@ -190,7 +192,8 @@ def union_indexes(indexes, sort: bool | None = True) -> Index:
     ----------
     indexes : list of Index or list objects
     sort : bool, default True
-        Whether the result index should come out sorted or not.
+        Whether the result index should come out sorted or not. NoDefault
+        used for deprecation of GH#57335.
 
     Returns
     -------
@@ -201,7 +204,7 @@ def union_indexes(indexes, sort: bool | None = True) -> Index:
     if len(indexes) == 1:
         result = indexes[0]
         if isinstance(result, list):
-            if not sort:
+            if not sort or sort is lib.no_default:
                 result = Index(result)
             else:
                 result = Index(sorted(result))
@@ -227,7 +230,8 @@ def union_indexes(indexes, sort: bool | None = True) -> Index:
             raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex")
 
         if num_dtis == len(indexes):
-            sort = True
+            if sort is lib.no_default:
+                sort = True
             result = indexes[0]
 
         elif num_dtis > 1:
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
@@ -45,7 +45,9 @@
     ensure_index,
     get_objs_combined_axis,
     get_unanimous_names,
+    union_indexes,
 )
+from pandas.core.indexes.datetimes import DatetimeIndex
 from pandas.core.internals import concatenate_managers
 
 if TYPE_CHECKING:
@@ -162,7 +164,7 @@ def concat(
     levels=None,
     names: list[HashableT] | None = None,
     verify_integrity: bool = False,
-    sort: bool = False,
+    sort: bool | lib.NoDefault = lib.no_default,
     copy: bool | lib.NoDefault = lib.no_default,
 ) -> DataFrame | Series:
     """
@@ -405,13 +407,40 @@ def concat(
             "Only can inner (intersect) or outer (union) join the other axis"
         )
 
-    if not is_bool(sort):
+    objs, keys, ndims = _clean_keys_and_objs(objs, keys)
+
+    if sort is lib.no_default:
+        if axis == 0:
+            non_concat_axis = [
+                obj.columns if isinstance(obj, ABCDataFrame) else Index([obj.name])
+                for obj in objs
+            ]
+        else:
+            non_concat_axis = [obj.index for obj in objs]
+
+        if (
+            intersect
+            or any(not isinstance(index, DatetimeIndex) for index in non_concat_axis)
+            or all(
+                prev is curr for prev, curr in zip(non_concat_axis, non_concat_axis[1:])
+            )
+            or (
+                all(
+                    prev[-1] <= curr[0] and prev.is_monotonic_increasing
+                    for prev, curr in zip(non_concat_axis, non_concat_axis[1:])
+                    if not prev.empty and not curr.empty
+                )
+                and non_concat_axis[-1].is_monotonic_increasing
+            )
+        ):
+            # Sorting or not will not impact the result.
+            sort = False
+    elif not is_bool(sort):
         raise ValueError(
             f"The 'sort' keyword only accepts boolean values; {sort} was passed."
         )
-    sort = bool(sort)
-
-    objs, keys, ndims = _clean_keys_and_objs(objs, keys)
+    else:
+        sort = bool(sort)
 
     # select an object to be our result reference
     sample, objs = _get_sample_object(objs, ndims, keys, names, levels, intersect)
@@ -436,9 +465,10 @@ def concat(
     if len(ndims) > 1:
         objs = _sanitize_mixed_ndim(objs, sample, ignore_index, bm_axis)
 
+    orig_axis = axis
     axis = 1 - bm_axis if is_frame else 0
     names = names or getattr(keys, "names", None)
-    return _get_result(
+    result = _get_result(
         objs,
         is_series,
         bm_axis,
@@ -452,6 +482,28 @@ def concat(
         axis,
     )
 
+    if sort is lib.no_default:
+        if orig_axis == 0:
+            non_concat_axis = [
+                obj.columns if isinstance(obj, ABCDataFrame) else Index([obj.name])
+                for obj in objs
+            ]
+        else:
+            non_concat_axis = [obj.index for obj in objs]
+        no_sort_result_index = union_indexes(non_concat_axis, sort=False)
+        orig = result.index if orig_axis == 1 else result.columns
+        if not no_sort_result_index.equals(orig):
+            msg = (
+                "Sorting by default when concatenating all DatetimeIndex is "
+                "deprecated.  In the future, pandas will respect the default "
+                "of `sort=False`. Specify `sort=True` or `sort=False` to "
+                "silence this message. If you see this warnings when not "
+                "directly calling concat, report a bug to pandas."
+            )
+            warnings.warn(msg, Pandas4Warning, stacklevel=find_stack_level())
+
+    return result
+
 
 def _sanitize_mixed_ndim(
     objs: list[Series | DataFrame],
@@ -510,7 +562,7 @@ def _get_result(
     bm_axis: AxisInt,
     ignore_index: bool,
     intersect: bool,
-    sort: bool,
+    sort: bool | lib.NoDefault,
     keys: Iterable[Hashable] | None,
     levels,
     verify_integrity: bool,
@@ -620,7 +672,7 @@ def new_axes(
     objs: list[Series | DataFrame],
     bm_axis: AxisInt,
     intersect: bool,
-    sort: bool,
+    sort: bool | lib.NoDefault,
     keys: Iterable[Hashable] | None,
     names: list[HashableT] | None,
     axis: AxisInt,
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
@@ -915,6 +915,36 @@ def test_listlike_lambda(ops, by_row, expected):
     tm.assert_equal(result, expected)
 
 
+def test_listlike_datetime_index_unsorted():
+    # https://github.com/pandas-dev/pandas/pull/62843
+    values = [datetime(2024, 1, 1), datetime(2024, 1, 2), datetime(2024, 1, 3)]
+    df = DataFrame({"a": [1, 2]}, index=[values[1], values[0]])
+    result = df.apply([lambda x: x, lambda x: x.shift(freq="D")], by_row=False)
+    expected = DataFrame(
+        [[1.0, 2.0], [2.0, np.nan], [np.nan, 1.0]],
+        index=[values[1], values[0], values[2]],
+        columns=MultiIndex([["a"], ["<lambda>"]], codes=[[0, 0], [0, 0]]),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_dictlike_datetime_index_unsorted():
+    # https://github.com/pandas-dev/pandas/pull/62843
+    values = [datetime(2024, 1, 1), datetime(2024, 1, 2), datetime(2024, 1, 3)]
+    df = DataFrame({"a": [1, 2], "b": [3, 4]}, index=[values[1], values[0]])
+    result = df.apply(
+        {"a": lambda x: x, "b": lambda x: x.shift(freq="D")}, by_row=False
+    )
+    expected = DataFrame(
+        {
+            "a": [1.0, 2.0, np.nan],
+            "b": [4.0, np.nan, 3.0],
+        },
+        index=[values[1], values[0], values[2]],
+    )
+    tm.assert_frame_equal(result, expected)
+
+
 @pytest.mark.parametrize(
     "ops",
     [
diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py
@@ -794,3 +794,17 @@ def test_shift_invalid_fill_value_deprecation(self):
             df["a"].shift(1, fill_value=NaT)
         with tm.assert_produces_warning(Pandas4Warning, match=msg):
             df["b"].shift(1, fill_value=NaT)
+
+    def test_shift_dt_index_multiple_periods_unsorted(self):
+        # https://github.com/pandas-dev/pandas/pull/62843
+        values = date_range("1/1/2000", periods=4, freq="D")
+        df = DataFrame({"a": [1, 2]}, index=[values[1], values[0]])
+        result = df.shift(periods=[1, 2], freq="D")
+        expected = DataFrame(
+            {
+                "a_1": [1.0, 2.0, np.nan],
+                "a_2": [2.0, np.nan, 1.0],
+            },
+            index=[values[2], values[1], values[3]],
+        )
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/methods/test_groupby_shift_diff.py b/pandas/tests/groupby/methods/test_groupby_shift_diff.py
@@ -248,3 +248,21 @@ def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated():
     msg = "Passing a 'freq' together with a 'fill_value'"
     with pytest.raises(ValueError, match=msg):
         df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="h")
+
+
+def test_groupby_shift_multiple_periods_unsorted_index():
+    # https://github.com/pandas-dev/pandas/pull/62843
+    idx = date_range("1/1/2000", periods=4, freq="h")
+    df = DataFrame(
+        {"a": [1, 2, 3], "b": [True, True, False]},
+        index=[idx[2], idx[0], idx[1]],
+    )
+    result = df.groupby("b")[["a"]].shift([0, 1], freq="h")
+    expected = DataFrame(
+        {
+            "a_0": [1.0, 2.0, 3.0, np.nan],
+            "a_1": [3.0, np.nan, 2.0, 1.0],
+        },
+        index=[idx[2], idx[0], idx[1], idx[3]],
+    )
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py
diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py
diff --git a/pandas/tests/strings/test_cat.py b/pandas/tests/strings/test_cat.py

Original file line number	Diff line number	Diff line change
`@@ -639,6 +639,7 @@ def wrap_results_dict_like(`
`639`	`639`	`results,`
`640`	`640`	`axis=axis,`
`641`	`641`	`keys=keys_to_use,`
	`642`	`+ sort=False,`
`642`	`643`	`)`
`643`	`644`	`elif any(is_ndframe):`
`644`	`645`	`# There is a mix of NDFrames and scalars`
Original file line number	Diff line number	Diff line change
`@@ -6137,7 +6137,7 @@ def shift(`
`6137`	`6137`	`.shift(periods=period, freq=freq, axis=axis, fill_value=fill_value)`
`6138`	`6138`	`.add_suffix(f"{suffix}_{period}" if suffix else f"_{period}")`
`6139`	`6139`	`)`
`6140`		`- return concat(shifted_dataframes, axis=1)`
	`6140`	`+ return concat(shifted_dataframes, axis=1, sort=False)`
`6141`	`6141`	`elif suffix:`
`6142`	`6142`	raise ValueError("Cannot specify `suffix` if `periods` is an int.")
`6143`	`6143`	`periods = cast(int, periods)`
Original file line number	Diff line number	Diff line change
`@@ -5231,7 +5231,7 @@ def shift(`
`5231`	`5231`	`return (`
`5232`	`5232`	`shifted_dataframes[0]`
`5233`	`5233`	`if len(shifted_dataframes) == 1`
`5234`		`- else concat(shifted_dataframes, axis=1)`
	`5234`	`+ else concat(shifted_dataframes, axis=1, sort=False)`
`5235`	`5235`	`)`
`5236`	`5236`
`5237`	`5237`	`@final`