Skip to content

Commit 50d8d1e

Browse files
authored
DEPR/BUG: Do not ignore sort in concat for DatetimeIndex (#62843)
1 parent 03a5289 commit 50d8d1e

File tree

13 files changed

+245
-23
lines changed

13 files changed

+245
-23
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,71 @@ In cases with mixed-resolution inputs, the highest resolution is used:
374374
In [2]: pd.to_datetime([pd.Timestamp("2024-03-22 11:43:01"), "2024-03-22 11:43:01.002"]).dtype
375375
Out[2]: dtype('<M8[ns]')
376376
377+
.. _whatsnew_300.api_breaking.concat_datetime_sorting:
378+
379+
:func:`concat` no longer ignores ``sort`` when all objects have a :class:`DatetimeIndex`
380+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
381+
382+
When all objects passed to :func:`concat` have a :class:`DatetimeIndex`,
383+
passing ``sort=False`` will now result in the non-concatenation axis not
384+
being sorted. Previously, the result would always be sorted along
385+
the non-concatenation axis even when ``sort=False`` was passed (:issue:`57335`).
386+
387+
If you do not specify the ``sort`` argument, pandas will continue to return a
388+
sorted result but this behavior is deprecated and you will receive a warning.
389+
In order to make this less noisy for users, pandas checks if not sorting would
390+
impact the result and only warns when it would. This check can be expensive,
391+
and users can skip the check by explicitly specifying ``sort=True`` or
392+
``sort=False``.
393+
394+
This deprecation can also impact pandas' internal usage of :func:`concat`.
395+
Cases where :func:`concat` was sorting a :class:`DatetimeIndex` but not
396+
other indexes are considered bugs and have been fixed as noted below. However
397+
it is possible some have been missed. In order to be cautious here, pandas has *not*
398+
added ``sort=False`` to any internal calls where we believe behavior should not change.
399+
If we have missed something, users will not experience a behavior change but they
400+
will receive a warning about :func:`concat` even though they are not directly
401+
calling this function. If this does occur, we ask users to open an issue so that
402+
we may address any potential behavior changes.
403+
404+
.. ipython:: python
405+
406+
idx1 = pd.date_range("2025-01-02", periods=3, freq="h")
407+
df1 = pd.DataFrame({"a": [1, 2, 3]}, index=idx1)
408+
df1
409+
410+
idx2 = pd.date_range("2025-01-01", periods=3, freq="h")
411+
df2 = pd.DataFrame({"b": [1, 2, 3]}, index=idx2)
412+
df2
413+
414+
*Old behavior*
415+
416+
.. code-block:: ipython
417+
418+
In [3]: pd.concat([df1, df2], axis=1, sort=False)
419+
Out[3]:
420+
a b
421+
2025-01-01 00:00:00 NaN 1.0
422+
2025-01-01 01:00:00 NaN 2.0
423+
2025-01-01 02:00:00 NaN 3.0
424+
2025-01-02 00:00:00 1.0 NaN
425+
2025-01-02 01:00:00 2.0 NaN
426+
2025-01-02 02:00:00 3.0 NaN
427+
428+
*New behavior*
429+
430+
.. ipython:: python
431+
432+
pd.concat([df1, df2], axis=1, sort=False)
433+
434+
Cases where pandas' internal usage of :func:`concat` resulted in inconsistent sorting
435+
that are now fixed in this release are as follows.
436+
437+
- :meth:`Series.apply` and :meth:`DataFrame.apply` with a list-like or dict-like ``func`` argument.
438+
- :meth:`Series.shift`, :meth:`DataFrame.shift`, :meth:`.SeriesGroupBy.shift`, :meth:`.DataFrameGroupBy.shift` when the ``periods`` argument is a list of length greater than 1.
439+
- :meth:`DataFrame.join` with ``other`` a list of one or more Series or DataFrames and ``how="inner"``, ``how="left"``, or ``how="right"``.
440+
- :meth:`Series.str.cat` with ``others`` a Series or DataFrame.
441+
377442
.. _whatsnew_300.api_breaking.value_counts_sorting:
378443

379444
Changed behavior in :meth:`DataFrame.value_counts` and :meth:`DataFrameGroupBy.value_counts` when ``sort=False``

pandas/core/apply.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -639,6 +639,7 @@ def wrap_results_dict_like(
639639
results,
640640
axis=axis,
641641
keys=keys_to_use,
642+
sort=False,
642643
)
643644
elif any(is_ndframe):
644645
# There is a mix of NDFrames and scalars

pandas/core/frame.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6137,7 +6137,7 @@ def shift(
61376137
.shift(periods=period, freq=freq, axis=axis, fill_value=fill_value)
61386138
.add_suffix(f"{suffix}_{period}" if suffix else f"_{period}")
61396139
)
6140-
return concat(shifted_dataframes, axis=1)
6140+
return concat(shifted_dataframes, axis=1, sort=False)
61416141
elif suffix:
61426142
raise ValueError("Cannot specify `suffix` if `periods` is an int.")
61436143
periods = cast(int, periods)

pandas/core/groupby/groupby.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5231,7 +5231,7 @@ def shift(
52315231
return (
52325232
shifted_dataframes[0]
52335233
if len(shifted_dataframes) == 1
5234-
else concat(shifted_dataframes, axis=1)
5234+
else concat(shifted_dataframes, axis=1, sort=False)
52355235
)
52365236

52375237
@final

pandas/core/indexes/api.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def get_objs_combined_axis(
6464
objs,
6565
intersect: bool = False,
6666
axis: Axis = 0,
67-
sort: bool = True,
67+
sort: bool | lib.NoDefault = True,
6868
) -> Index:
6969
"""
7070
Extract combined index: return intersection or union (depending on the
@@ -81,7 +81,8 @@ def get_objs_combined_axis(
8181
axis : {0 or 'index', 1 or 'outer'}, default 0
8282
The axis to extract indexes from.
8383
sort : bool, default True
84-
Whether the result index should come out sorted or not.
84+
Whether the result index should come out sorted or not. NoDefault
85+
used for deprecation in GH#57335.
8586
8687
Returns
8788
-------
@@ -108,7 +109,7 @@ def _get_distinct_objs(objs: list[Index]) -> list[Index]:
108109
def _get_combined_index(
109110
indexes: list[Index],
110111
intersect: bool = False,
111-
sort: bool = False,
112+
sort: bool | lib.NoDefault = False,
112113
) -> Index:
113114
"""
114115
Return the union or intersection of indexes.
@@ -121,7 +122,8 @@ def _get_combined_index(
121122
If True, calculate the intersection between indexes. Otherwise,
122123
calculate the union.
123124
sort : bool, default False
124-
Whether the result index should come out sorted or not.
125+
Whether the result index should come out sorted or not. NoDefault
126+
used for deprecation of GH#57335.
125127
126128
Returns
127129
-------
@@ -138,10 +140,10 @@ def _get_combined_index(
138140
for other in indexes[1:]:
139141
index = index.intersection(other)
140142
else:
141-
index = union_indexes(indexes, sort=False)
143+
index = union_indexes(indexes, sort=sort if sort is lib.no_default else False)
142144
index = ensure_index(index)
143145

144-
if sort:
146+
if sort and sort is not lib.no_default:
145147
index = safe_sort_index(index)
146148
return index
147149

@@ -180,7 +182,7 @@ def safe_sort_index(index: Index) -> Index:
180182
return index
181183

182184

183-
def union_indexes(indexes, sort: bool | None = True) -> Index:
185+
def union_indexes(indexes, sort: bool | lib.NoDefault = True) -> Index:
184186
"""
185187
Return the union of indexes.
186188
@@ -190,7 +192,8 @@ def union_indexes(indexes, sort: bool | None = True) -> Index:
190192
----------
191193
indexes : list of Index or list objects
192194
sort : bool, default True
193-
Whether the result index should come out sorted or not.
195+
Whether the result index should come out sorted or not. NoDefault
196+
used for deprecation of GH#57335.
194197
195198
Returns
196199
-------
@@ -201,7 +204,7 @@ def union_indexes(indexes, sort: bool | None = True) -> Index:
201204
if len(indexes) == 1:
202205
result = indexes[0]
203206
if isinstance(result, list):
204-
if not sort:
207+
if not sort or sort is lib.no_default:
205208
result = Index(result)
206209
else:
207210
result = Index(sorted(result))
@@ -227,7 +230,8 @@ def union_indexes(indexes, sort: bool | None = True) -> Index:
227230
raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex")
228231

229232
if num_dtis == len(indexes):
230-
sort = True
233+
if sort is lib.no_default:
234+
sort = True
231235
result = indexes[0]
232236

233237
elif num_dtis > 1:

pandas/core/reshape/concat.py

Lines changed: 60 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,9 @@
4545
ensure_index,
4646
get_objs_combined_axis,
4747
get_unanimous_names,
48+
union_indexes,
4849
)
50+
from pandas.core.indexes.datetimes import DatetimeIndex
4951
from pandas.core.internals import concatenate_managers
5052

5153
if TYPE_CHECKING:
@@ -162,7 +164,7 @@ def concat(
162164
levels=None,
163165
names: list[HashableT] | None = None,
164166
verify_integrity: bool = False,
165-
sort: bool = False,
167+
sort: bool | lib.NoDefault = lib.no_default,
166168
copy: bool | lib.NoDefault = lib.no_default,
167169
) -> DataFrame | Series:
168170
"""
@@ -405,13 +407,40 @@ def concat(
405407
"Only can inner (intersect) or outer (union) join the other axis"
406408
)
407409

408-
if not is_bool(sort):
410+
objs, keys, ndims = _clean_keys_and_objs(objs, keys)
411+
412+
if sort is lib.no_default:
413+
if axis == 0:
414+
non_concat_axis = [
415+
obj.columns if isinstance(obj, ABCDataFrame) else Index([obj.name])
416+
for obj in objs
417+
]
418+
else:
419+
non_concat_axis = [obj.index for obj in objs]
420+
421+
if (
422+
intersect
423+
or any(not isinstance(index, DatetimeIndex) for index in non_concat_axis)
424+
or all(
425+
prev is curr for prev, curr in zip(non_concat_axis, non_concat_axis[1:])
426+
)
427+
or (
428+
all(
429+
prev[-1] <= curr[0] and prev.is_monotonic_increasing
430+
for prev, curr in zip(non_concat_axis, non_concat_axis[1:])
431+
if not prev.empty and not curr.empty
432+
)
433+
and non_concat_axis[-1].is_monotonic_increasing
434+
)
435+
):
436+
# Sorting or not will not impact the result.
437+
sort = False
438+
elif not is_bool(sort):
409439
raise ValueError(
410440
f"The 'sort' keyword only accepts boolean values; {sort} was passed."
411441
)
412-
sort = bool(sort)
413-
414-
objs, keys, ndims = _clean_keys_and_objs(objs, keys)
442+
else:
443+
sort = bool(sort)
415444

416445
# select an object to be our result reference
417446
sample, objs = _get_sample_object(objs, ndims, keys, names, levels, intersect)
@@ -436,9 +465,10 @@ def concat(
436465
if len(ndims) > 1:
437466
objs = _sanitize_mixed_ndim(objs, sample, ignore_index, bm_axis)
438467

468+
orig_axis = axis
439469
axis = 1 - bm_axis if is_frame else 0
440470
names = names or getattr(keys, "names", None)
441-
return _get_result(
471+
result = _get_result(
442472
objs,
443473
is_series,
444474
bm_axis,
@@ -452,6 +482,28 @@ def concat(
452482
axis,
453483
)
454484

485+
if sort is lib.no_default:
486+
if orig_axis == 0:
487+
non_concat_axis = [
488+
obj.columns if isinstance(obj, ABCDataFrame) else Index([obj.name])
489+
for obj in objs
490+
]
491+
else:
492+
non_concat_axis = [obj.index for obj in objs]
493+
no_sort_result_index = union_indexes(non_concat_axis, sort=False)
494+
orig = result.index if orig_axis == 1 else result.columns
495+
if not no_sort_result_index.equals(orig):
496+
msg = (
497+
"Sorting by default when concatenating all DatetimeIndex is "
498+
"deprecated. In the future, pandas will respect the default "
499+
"of `sort=False`. Specify `sort=True` or `sort=False` to "
500+
"silence this message. If you see this warnings when not "
501+
"directly calling concat, report a bug to pandas."
502+
)
503+
warnings.warn(msg, Pandas4Warning, stacklevel=find_stack_level())
504+
505+
return result
506+
455507

456508
def _sanitize_mixed_ndim(
457509
objs: list[Series | DataFrame],
@@ -510,7 +562,7 @@ def _get_result(
510562
bm_axis: AxisInt,
511563
ignore_index: bool,
512564
intersect: bool,
513-
sort: bool,
565+
sort: bool | lib.NoDefault,
514566
keys: Iterable[Hashable] | None,
515567
levels,
516568
verify_integrity: bool,
@@ -620,7 +672,7 @@ def new_axes(
620672
objs: list[Series | DataFrame],
621673
bm_axis: AxisInt,
622674
intersect: bool,
623-
sort: bool,
675+
sort: bool | lib.NoDefault,
624676
keys: Iterable[Hashable] | None,
625677
names: list[HashableT] | None,
626678
axis: AxisInt,

pandas/tests/apply/test_frame_apply.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -915,6 +915,36 @@ def test_listlike_lambda(ops, by_row, expected):
915915
tm.assert_equal(result, expected)
916916

917917

918+
def test_listlike_datetime_index_unsorted():
919+
# https://github.com/pandas-dev/pandas/pull/62843
920+
values = [datetime(2024, 1, 1), datetime(2024, 1, 2), datetime(2024, 1, 3)]
921+
df = DataFrame({"a": [1, 2]}, index=[values[1], values[0]])
922+
result = df.apply([lambda x: x, lambda x: x.shift(freq="D")], by_row=False)
923+
expected = DataFrame(
924+
[[1.0, 2.0], [2.0, np.nan], [np.nan, 1.0]],
925+
index=[values[1], values[0], values[2]],
926+
columns=MultiIndex([["a"], ["<lambda>"]], codes=[[0, 0], [0, 0]]),
927+
)
928+
tm.assert_frame_equal(result, expected)
929+
930+
931+
def test_dictlike_datetime_index_unsorted():
932+
# https://github.com/pandas-dev/pandas/pull/62843
933+
values = [datetime(2024, 1, 1), datetime(2024, 1, 2), datetime(2024, 1, 3)]
934+
df = DataFrame({"a": [1, 2], "b": [3, 4]}, index=[values[1], values[0]])
935+
result = df.apply(
936+
{"a": lambda x: x, "b": lambda x: x.shift(freq="D")}, by_row=False
937+
)
938+
expected = DataFrame(
939+
{
940+
"a": [1.0, 2.0, np.nan],
941+
"b": [4.0, np.nan, 3.0],
942+
},
943+
index=[values[1], values[0], values[2]],
944+
)
945+
tm.assert_frame_equal(result, expected)
946+
947+
918948
@pytest.mark.parametrize(
919949
"ops",
920950
[

pandas/tests/frame/methods/test_shift.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -794,3 +794,17 @@ def test_shift_invalid_fill_value_deprecation(self):
794794
df["a"].shift(1, fill_value=NaT)
795795
with tm.assert_produces_warning(Pandas4Warning, match=msg):
796796
df["b"].shift(1, fill_value=NaT)
797+
798+
def test_shift_dt_index_multiple_periods_unsorted(self):
799+
# https://github.com/pandas-dev/pandas/pull/62843
800+
values = date_range("1/1/2000", periods=4, freq="D")
801+
df = DataFrame({"a": [1, 2]}, index=[values[1], values[0]])
802+
result = df.shift(periods=[1, 2], freq="D")
803+
expected = DataFrame(
804+
{
805+
"a_1": [1.0, 2.0, np.nan],
806+
"a_2": [2.0, np.nan, 1.0],
807+
},
808+
index=[values[2], values[1], values[3]],
809+
)
810+
tm.assert_frame_equal(result, expected)

pandas/tests/groupby/methods/test_groupby_shift_diff.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,3 +248,21 @@ def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated():
248248
msg = "Passing a 'freq' together with a 'fill_value'"
249249
with pytest.raises(ValueError, match=msg):
250250
df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="h")
251+
252+
253+
def test_groupby_shift_multiple_periods_unsorted_index():
254+
# https://github.com/pandas-dev/pandas/pull/62843
255+
idx = date_range("1/1/2000", periods=4, freq="h")
256+
df = DataFrame(
257+
{"a": [1, 2, 3], "b": [True, True, False]},
258+
index=[idx[2], idx[0], idx[1]],
259+
)
260+
result = df.groupby("b")[["a"]].shift([0, 1], freq="h")
261+
expected = DataFrame(
262+
{
263+
"a_0": [1.0, 2.0, 3.0, np.nan],
264+
"a_1": [3.0, np.nan, 2.0, 1.0],
265+
},
266+
index=[idx[2], idx[0], idx[1], idx[3]],
267+
)
268+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)