From 5b024e8266fe62d969836c86e2be6ca0fb7f2043 Mon Sep 17 00:00:00 2001 From: Ben Cutler Date: Mon, 3 Nov 2025 09:34:37 -0500 Subject: [PATCH 1/6] [-] Try to make the datetime guesser better --- pandas/_libs/tslibs/parsing.pyx | 24 ----------------- pandas/core/tools/datetimes.py | 38 +++++++++++++------------- pandas/tests/tslibs/test_parsing.py | 41 +++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 42 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 308183402198d..61771321b309d 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -1049,7 +1049,6 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None: # rebuild string, capturing any inferred padding dt_str = "".join(tokens) if parsed_datetime.strftime(guessed_format) == dt_str: - _maybe_warn_about_dayfirst(guessed_format, dayfirst) return guessed_format else: return None @@ -1072,29 +1071,6 @@ cdef str _fill_token(token: str, padding: int): return token_filled -cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst) noexcept: - """Warn if guessed datetime format doesn't respect dayfirst argument.""" - cdef: - int day_index = format.find("%d") - int month_index = format.find("%m") - - if (day_index != -1) and (month_index != -1): - if (day_index > month_index) and dayfirst: - warnings.warn( - f"Parsing dates in {format} format when dayfirst=True was specified. " - "Pass `dayfirst=False` or specify a format to silence this warning.", - UserWarning, - stacklevel=find_stack_level(), - ) - if (day_index < month_index) and not dayfirst: - warnings.warn( - f"Parsing dates in {format} format when dayfirst=False (the default) " - "was specified. 
" - "Pass `dayfirst=True` or specify a format to silence this warning.", - UserWarning, - stacklevel=find_stack_level(), - ) - cpdef str get_rule_month(str source): """ diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index ce311d0c89b55..6242f3dcb704a 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1,7 +1,7 @@ from __future__ import annotations from collections import abc -from datetime import date +from datetime import date, datetime from functools import partial from itertools import islice from typing import ( @@ -131,24 +131,26 @@ class FulldatetimeDict(YearMonthDayDict, total=False): def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None: # Try to guess the format based on the first non-NaN element, return None if can't - if (first_non_null := tslib.first_non_null(arr)) != -1: - if type(first_non_nan_element := arr[first_non_null]) is str: - # GH#32264 np.str_ object - guessed_format = guess_datetime_format( - first_non_nan_element, dayfirst=dayfirst - ) + search_start = 0 + allowed_formats = set() + while not search_start >= len(arr): + non_null_offset = tslib.first_non_null(arr[search_start:]) + if non_null_offset == -1: + break + idx = search_start + non_null_offset + element = arr[idx] + if isinstance(element, str): + guessed_format = guess_datetime_format(str(element), dayfirst=dayfirst) if guessed_format is not None: - return guessed_format - # If there are multiple non-null elements, warn about - # how parsing might not be consistent - if tslib.first_non_null(arr[first_non_null + 1 :]) != -1: - warnings.warn( - "Could not infer format, so each element will be parsed " - "individually, falling back to `dateutil`. 
To ensure parsing is " - "consistent and as-expected, please specify a format.", - UserWarning, - stacklevel=find_stack_level(), - ) + allowed_formats.add(guessed_format) + search_start = idx + 1 + # Look through the formats and see if one satisfies each item in the array + for fmt in list(allowed_formats): + try: + [datetime.strptime(date_string, fmt) for date_string in arr if date_string] + return fmt + except ValueError: + pass return None diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 5d4e2e8ddb234..642a41feea284 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -424,3 +424,44 @@ def test_parse_datetime_string_with_reso_yearfirst(yearfirst, input): ) assert except_out_dateutil == except_in_dateutil assert result[0] == expected + + +from pandas.core.tools.datetimes import _guess_datetime_format_for_array + + +@pytest.mark.parametrize( + "expected_format, array", + [ + ("%d/%m/%Y", np.array(["01/02/2025", "30/07/2025"])), + ("%Y-%m-%d", np.array(["2025-08-09", "2025-08-13", None])), + ("%m/%d/%Y", np.array(["02/01/2025", "12/31/2025"])), + ("%d-%m-%Y", np.array(["01-02-2025", "30-07-2025"])), + ("%d.%m.%Y", np.array(["01.02.2025", "30.07.2025"])), + ("%Y/%m/%d", np.array(["2025/08/09", "2025/12/01"])), + ("%b %d, %Y", np.array(["Feb 01, 2025", "Jul 30, 2025"])), + ("%B %d, %Y", np.array(["February 01, 2025", "July 30, 2025"])), + ("%d %b %Y", np.array(["01 Feb 2025", "30 Jul 2025"])), + ("%d-%b-%Y", np.array(["01-Feb-2025", "30-Jul-2025"])), + ("%Y%m%d", np.array(["20250201", "20250730"])), + (None, np.array(["02/01/25", "12/31/25"])), + ("%Y-%m-%d %H:%M:%S", np.array(["2025-08-09 14:30:00", "2025-12-01 00:00:00"])), + ("%Y-%m-%dT%H:%M:%S", np.array(["2025-08-09T14:30:00", "2025-12-01T00:00:00"])), + ( + "%Y-%m-%dT%H:%M:%S.%f", + np.array(["2025-08-09T14:30:00.123456", "2025-12-01T00:00:00.5"]), + ), + ( + "%Y-%m-%d %H:%M:%S%z", + np.array(["2025-08-09 14:30:00+0000", 
"2025-12-01 09:15:00-0500"]), + ), + ("%Y-%m-%d", np.array(["2025-08-09", None, "2025-12-01"])), + (None, np.array(["2025/13/01", "not-a-date", ""])), + ( + None, + np.array(["01/02/2025", "2025-02-01"]), + ), + ], +) +def test_guess_datetime_format_for_array(expected_format: str, array: np.array) -> None: + fmt = _guess_datetime_format_for_array(array, dayfirst=False) + assert fmt == expected_format, f"{fmt} does not match {expected_format}" From ac7d24079ad4bb143aa2ca0ade0ef63e54c26b5f Mon Sep 17 00:00:00 2001 From: Ben Cutler Date: Mon, 3 Nov 2025 09:46:21 -0500 Subject: [PATCH 2/6] Add my note + add type hint --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/tools/datetimes.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index b1dc78bbf8020..a4438b8bdbbe4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1274,6 +1274,7 @@ Other - Fixed bug in the :meth:`Series.rank` with object dtype and extremely small float values (:issue:`62036`) - Fixed bug where the :class:`DataFrame` constructor misclassified array-like objects with a ``.name`` attribute as :class:`Series` or :class:`Index` (:issue:`61443`) - Fixed regression in :meth:`DataFrame.from_records` not initializing subclasses properly (:issue:`57008`) +- Fixed bug in :func:`_guess_datetime_format_for_array` where the datetime format was inferred from only the first non-null element instead of being validated against every string element in the array .. 
***DO NOT USE THIS SECTION*** diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 6242f3dcb704a..0102d1953e962 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -129,7 +129,9 @@ class FulldatetimeDict(YearMonthDayDict, total=False): # --------------------------------------------------------------------- -def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None: +def _guess_datetime_format_for_array( + arr: np.ndarray, dayfirst: bool | None = False +) -> str | None: # Try to guess the format based on the first non-NaN element, return None if can't search_start = 0 allowed_formats = set() From 195e40590d9db73db08834e97282fd22a7985708 Mon Sep 17 00:00:00 2001 From: Ben Cutler Date: Mon, 3 Nov 2025 09:50:26 -0500 Subject: [PATCH 3/6] Move Import --- pandas/tests/tslibs/test_parsing.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 642a41feea284..7e4f5ffe5479e 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -19,6 +19,7 @@ WASM, is_platform_windows, ) +from pandas.core.tools.datetimes import _guess_datetime_format_for_array import pandas.util._test_decorators as td # Usually we wouldn't want this import in this test file (which is targeted at @@ -426,9 +427,6 @@ def test_parse_datetime_string_with_reso_yearfirst(yearfirst, input): assert result[0] == expected -from pandas.core.tools.datetimes import _guess_datetime_format_for_array - - @pytest.mark.parametrize( "expected_format, array", [ From 9ffc3d37bd4be53d1fd4e9cf0503d39ad33ea7ad Mon Sep 17 00:00:00 2001 From: Ben Cutler Date: Mon, 3 Nov 2025 10:02:12 -0500 Subject: [PATCH 4/6] Remove imports + add if statement for NaTType --- pandas/core/tools/datetimes.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/tools/datetimes.py 
b/pandas/core/tools/datetimes.py index 0102d1953e962..fac9faa8ffd1e 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -12,7 +12,6 @@ cast, overload, ) -import warnings import numpy as np @@ -30,6 +29,7 @@ timezones as libtimezones, ) from pandas._libs.tslibs.conversion import cast_from_unit_vectorized +from pandas._libs.missing import NAType from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas._libs.tslibs.parsing import ( DateParseError, @@ -42,7 +42,6 @@ DateTimeErrorChoices, ) from pandas.util._decorators import set_module -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( ensure_object, @@ -149,7 +148,11 @@ def _guess_datetime_format_for_array( # Look through the formats and see if one satisfies each item in the array for fmt in list(allowed_formats): try: - [datetime.strptime(date_string, fmt) for date_string in arr if date_string] + [ + datetime.strptime(date_string, fmt) + for date_string in arr + if date_string and not isinstance(date_string, NAType) + ] return fmt except ValueError: pass From dbcb4280bdc40a100075dc964ac21cf99b00b0f3 Mon Sep 17 00:00:00 2001 From: Ben Cutler Date: Mon, 3 Nov 2025 10:18:41 -0500 Subject: [PATCH 5/6] Fix typing and add test + check for NaT --- pandas/_libs/tslibs/parsing.pyx | 1 - pandas/core/tools/datetimes.py | 10 ++++++---- pandas/tests/tslibs/test_parsing.py | 7 ++++--- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 61771321b309d..0a8f948be1a0b 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -1071,7 +1071,6 @@ cdef str _fill_token(token: str, padding: int): return token_filled - cpdef str get_rule_month(str source): """ Return starting month of given freq, default is December. 
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index fac9faa8ffd1e..034faad878e5b 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1,7 +1,10 @@ from __future__ import annotations from collections import abc -from datetime import date, datetime +from datetime import ( + date, + datetime, +) from functools import partial from itertools import islice from typing import ( @@ -29,8 +32,8 @@ timezones as libtimezones, ) from pandas._libs.tslibs.conversion import cast_from_unit_vectorized -from pandas._libs.missing import NAType from pandas._libs.tslibs.dtypes import NpyDatetimeUnit +from pandas._libs.tslibs.nattype import NaTType from pandas._libs.tslibs.parsing import ( DateParseError, guess_datetime_format, @@ -84,7 +87,6 @@ Hashable, ) - from pandas._libs.tslibs.nattype import NaTType from pandas._libs.tslibs.timedeltas import UnitChoices from pandas import ( @@ -151,7 +153,7 @@ def _guess_datetime_format_for_array( [ datetime.strptime(date_string, fmt) for date_string in arr - if date_string and not isinstance(date_string, NAType) + if date_string and not isinstance(date_string, NaTType) ] return fmt except ValueError: diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 7e4f5ffe5479e..272d0e9121810 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -10,6 +10,7 @@ import pytest from pandas._libs.tslibs import ( + NaT, parsing, strptime, ) @@ -19,7 +20,6 @@ WASM, is_platform_windows, ) -from pandas.core.tools.datetimes import _guess_datetime_format_for_array import pandas.util._test_decorators as td # Usually we wouldn't want this import in this test file (which is targeted at @@ -30,6 +30,7 @@ option_context, ) import pandas._testing as tm +from pandas.core.tools.datetimes import _guess_datetime_format_for_array @pytest.mark.skipif(WASM, reason="tzset is not available on WASM") @@ -453,10 +454,10 @@ def 
test_parse_datetime_string_with_reso_yearfirst(yearfirst, input): np.array(["2025-08-09 14:30:00+0000", "2025-12-01 09:15:00-0500"]), ), ("%Y-%m-%d", np.array(["2025-08-09", None, "2025-12-01"])), - (None, np.array(["2025/13/01", "not-a-date", ""])), + (None, np.array(["2025/13/01", "not-a-date", "", NaT])), ( None, - np.array(["01/02/2025", "2025-02-01"]), + np.array(["01/02/2025", "2025-02-01", np.nan]), ), ], ) From 2e31faa8dc10492844dc997d2e6e8c8b60600fce Mon Sep 17 00:00:00 2001 From: Ben Cutler Date: Mon, 3 Nov 2025 11:47:30 -0500 Subject: [PATCH 6/6] Fix a few tests, want to make sure it'll be approved if the tests are good before fixing everything --- pandas/core/tools/datetimes.py | 5 ++-- pandas/tests/tools/test_to_datetime.py | 16 ++++------ pandas/tests/tslibs/test_parsing.py | 41 ++++++++++---------------- 3 files changed, 23 insertions(+), 39 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 034faad878e5b..38b267d136ef7 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -33,7 +33,6 @@ ) from pandas._libs.tslibs.conversion import cast_from_unit_vectorized from pandas._libs.tslibs.dtypes import NpyDatetimeUnit -from pandas._libs.tslibs.nattype import NaTType from pandas._libs.tslibs.parsing import ( DateParseError, guess_datetime_format, @@ -86,7 +85,7 @@ Callable, Hashable, ) - + from pandas._libs.tslibs.nattype import NaTType from pandas._libs.tslibs.timedeltas import UnitChoices from pandas import ( @@ -153,7 +152,7 @@ def _guess_datetime_format_for_array( [ datetime.strptime(date_string, fmt) for date_string in arr - if date_string and not isinstance(date_string, NaTType) + if date_string and isinstance(date_string, str) ] return fmt except ValueError: diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index c9cda0c4153cb..bc9dcfd018268 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ 
b/pandas/tests/tools/test_to_datetime.py @@ -1429,10 +1429,7 @@ def test_datetime_invalid_index(self, values, format): else: warn = None - with tm.assert_produces_warning( - warn, match="Could not infer format", raise_on_extra_warnings=False - ): - res = to_datetime(values, errors="coerce", format=format) + res = to_datetime(values, errors="coerce", format=format) tm.assert_index_equal(res, DatetimeIndex([NaT] * len(values))) msg = "|".join( @@ -1628,13 +1625,10 @@ def test_to_datetime_malformed_raise(self): ValueError, match=msg, ): - with tm.assert_produces_warning( - UserWarning, match="Could not infer format" - ): - to_datetime( - ts_strings, - errors="raise", - ) + to_datetime( + ts_strings, + errors="raise", + ) def test_iso_8601_strings_with_same_offset(self): # GH 17697, 11736 diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 272d0e9121810..43ec044d5af1f 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -232,10 +232,7 @@ def test_parsers_month_freq(date_str, expected): ], ) def test_guess_datetime_format_with_parseable_formats(string, fmt): - with tm.maybe_produces_warning( - UserWarning, fmt is not None and re.search(r"%d.*%m", fmt) - ): - result = parsing.guess_datetime_format(string) + result = parsing.guess_datetime_format(string) assert result == fmt @@ -292,31 +289,25 @@ def test_guess_datetime_format_wrong_type_inputs(invalid_type_dt): @pytest.mark.parametrize( - "string,fmt,dayfirst,warning", + "string,fmt,dayfirst", [ - ("2011-1-1", "%Y-%m-%d", False, None), - ("2011-1-1", "%Y-%d-%m", True, None), - ("1/1/2011", "%m/%d/%Y", False, None), - ("1/1/2011", "%d/%m/%Y", True, None), - ("30-1-2011", "%d-%m-%Y", False, UserWarning), - ("30-1-2011", "%d-%m-%Y", True, None), - ("2011-1-1 0:0:0", "%Y-%m-%d %H:%M:%S", False, None), - ("2011-1-1 0:0:0", "%Y-%d-%m %H:%M:%S", True, None), - ("2011-1-3T00:00:0", "%Y-%m-%dT%H:%M:%S", False, None), - ("2011-1-3T00:00:0", 
"%Y-%d-%mT%H:%M:%S", True, None), - ("2011-1-1 00:00:00", "%Y-%m-%d %H:%M:%S", False, None), - ("2011-1-1 00:00:00", "%Y-%d-%m %H:%M:%S", True, None), + ("2011-1-1", "%Y-%m-%d", False), + ("2011-1-1", "%Y-%d-%m", True), + ("1/1/2011", "%m/%d/%Y", False), + ("1/1/2011", "%d/%m/%Y", True), + ("30-1-2011", "%d-%m-%Y", False), + ("30-1-2011", "%d-%m-%Y", True), + ("2011-1-1 0:0:0", "%Y-%m-%d %H:%M:%S", False), + ("2011-1-1 0:0:0", "%Y-%d-%m %H:%M:%S", True), + ("2011-1-3T00:00:0", "%Y-%m-%dT%H:%M:%S", False), + ("2011-1-3T00:00:0", "%Y-%d-%mT%H:%M:%S", True), + ("2011-1-1 00:00:00", "%Y-%m-%d %H:%M:%S", False), + ("2011-1-1 00:00:00", "%Y-%d-%m %H:%M:%S", True), ], ) -def test_guess_datetime_format_no_padding(string, fmt, dayfirst, warning): +def test_guess_datetime_format_no_padding(string, fmt, dayfirst): # see gh-11142 - msg = ( - rf"Parsing dates in {fmt} format when dayfirst=False \(the default\) " - "was specified. " - "Pass `dayfirst=True` or specify a format to silence this warning." - ) - with tm.assert_produces_warning(warning, match=msg): - result = parsing.guess_datetime_format(string, dayfirst=dayfirst) + result = parsing.guess_datetime_format(string, dayfirst=dayfirst) assert result == fmt