diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index b1dc78bbf8020..a4438b8bdbbe4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1274,6 +1274,7 @@ Other - Fixed bug in the :meth:`Series.rank` with object dtype and extremely small float values (:issue:`62036`) - Fixed bug where the :class:`DataFrame` constructor misclassified array-like objects with a ``.name`` attribute as :class:`Series` or :class:`Index` (:issue:`61443`) - Fixed regression in :meth:`DataFrame.from_records` not initializing subclasses properly (:issue:`57008`) +- Fixed bug in :func:`_guess_datetime_format_for_array` where the guessed datetime format was based only on the first non-null element instead of being validated against every string in the array .. ***DO NOT USE THIS SECTION*** diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 308183402198d..0a8f948be1a0b 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -1049,7 +1049,6 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None: # rebuild string, capturing any inferred padding dt_str = "".join(tokens) if parsed_datetime.strftime(guessed_format) == dt_str: - _maybe_warn_about_dayfirst(guessed_format, dayfirst) return guessed_format else: return None @@ -1072,30 +1071,6 @@ cdef str _fill_token(token: str, padding: int): return token_filled -cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst) noexcept: - """Warn if guessed datetime format doesn't respect dayfirst argument.""" - cdef: - int day_index = format.find("%d") - int month_index = format.find("%m") - - if (day_index != -1) and (month_index != -1): - if (day_index > month_index) and dayfirst: - warnings.warn( - f"Parsing dates in {format} format when dayfirst=True was specified. 
" - "Pass `dayfirst=False` or specify a format to silence this warning.", - UserWarning, - stacklevel=find_stack_level(), - ) - if (day_index < month_index) and not dayfirst: - warnings.warn( - f"Parsing dates in {format} format when dayfirst=False (the default) " - "was specified. " - "Pass `dayfirst=True` or specify a format to silence this warning.", - UserWarning, - stacklevel=find_stack_level(), - ) - - cpdef str get_rule_month(str source): """ Return starting month of given freq, default is December. diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index ce311d0c89b55..38b267d136ef7 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1,7 +1,10 @@ from __future__ import annotations from collections import abc -from datetime import date +from datetime import ( + date, + datetime, +) from functools import partial from itertools import islice from typing import ( @@ -12,7 +15,6 @@ cast, overload, ) -import warnings import numpy as np @@ -42,7 +44,6 @@ DateTimeErrorChoices, ) from pandas.util._decorators import set_module -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( ensure_object, @@ -84,7 +85,6 @@ Callable, Hashable, ) - from pandas._libs.tslibs.nattype import NaTType from pandas._libs.tslibs.timedeltas import UnitChoices @@ -129,26 +129,34 @@ class FulldatetimeDict(YearMonthDayDict, total=False): # --------------------------------------------------------------------- -def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None: +def _guess_datetime_format_for_array( + arr: np.ndarray, dayfirst: bool | None = False +) -> str | None: # Try to guess the format based on the first non-NaN element, return None if can't - if (first_non_null := tslib.first_non_null(arr)) != -1: - if type(first_non_nan_element := arr[first_non_null]) is str: - # GH#32264 np.str_ object - guessed_format = guess_datetime_format( - first_non_nan_element, 
dayfirst=dayfirst - ) + search_start = 0 + allowed_formats = set() + while not search_start >= len(arr): + non_null_offset = tslib.first_non_null(arr[search_start:]) + if non_null_offset == -1: + break + idx = search_start + non_null_offset + element = arr[idx] + if isinstance(element, str): + guessed_format = guess_datetime_format(str(element), dayfirst=dayfirst) if guessed_format is not None: - return guessed_format - # If there are multiple non-null elements, warn about - # how parsing might not be consistent - if tslib.first_non_null(arr[first_non_null + 1 :]) != -1: - warnings.warn( - "Could not infer format, so each element will be parsed " - "individually, falling back to `dateutil`. To ensure parsing is " - "consistent and as-expected, please specify a format.", - UserWarning, - stacklevel=find_stack_level(), - ) + allowed_formats.add(guessed_format) + search_start = idx + 1 + # Look through the formats and see if one satisfies each item in the array + for fmt in list(allowed_formats): + try: + [ + datetime.strptime(date_string, fmt) + for date_string in arr + if date_string and isinstance(date_string, str) + ] + return fmt + except ValueError: + pass return None diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index c9cda0c4153cb..bc9dcfd018268 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1429,10 +1429,7 @@ def test_datetime_invalid_index(self, values, format): else: warn = None - with tm.assert_produces_warning( - warn, match="Could not infer format", raise_on_extra_warnings=False - ): - res = to_datetime(values, errors="coerce", format=format) + res = to_datetime(values, errors="coerce", format=format) tm.assert_index_equal(res, DatetimeIndex([NaT] * len(values))) msg = "|".join( @@ -1628,13 +1625,10 @@ def test_to_datetime_malformed_raise(self): ValueError, match=msg, ): - with tm.assert_produces_warning( - UserWarning, match="Could not infer format" - ): 
- to_datetime( - ts_strings, - errors="raise", - ) + to_datetime( + ts_strings, + errors="raise", + ) def test_iso_8601_strings_with_same_offset(self): # GH 17697, 11736 diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 5d4e2e8ddb234..43ec044d5af1f 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -10,6 +10,7 @@ import pytest from pandas._libs.tslibs import ( + NaT, parsing, strptime, ) @@ -29,6 +30,7 @@ option_context, ) import pandas._testing as tm +from pandas.core.tools.datetimes import _guess_datetime_format_for_array @pytest.mark.skipif(WASM, reason="tzset is not available on WASM") @@ -230,10 +232,7 @@ def test_parsers_month_freq(date_str, expected): ], ) def test_guess_datetime_format_with_parseable_formats(string, fmt): - with tm.maybe_produces_warning( - UserWarning, fmt is not None and re.search(r"%d.*%m", fmt) - ): - result = parsing.guess_datetime_format(string) + result = parsing.guess_datetime_format(string) assert result == fmt @@ -290,31 +289,25 @@ def test_guess_datetime_format_wrong_type_inputs(invalid_type_dt): @pytest.mark.parametrize( - "string,fmt,dayfirst,warning", + "string,fmt,dayfirst", [ - ("2011-1-1", "%Y-%m-%d", False, None), - ("2011-1-1", "%Y-%d-%m", True, None), - ("1/1/2011", "%m/%d/%Y", False, None), - ("1/1/2011", "%d/%m/%Y", True, None), - ("30-1-2011", "%d-%m-%Y", False, UserWarning), - ("30-1-2011", "%d-%m-%Y", True, None), - ("2011-1-1 0:0:0", "%Y-%m-%d %H:%M:%S", False, None), - ("2011-1-1 0:0:0", "%Y-%d-%m %H:%M:%S", True, None), - ("2011-1-3T00:00:0", "%Y-%m-%dT%H:%M:%S", False, None), - ("2011-1-3T00:00:0", "%Y-%d-%mT%H:%M:%S", True, None), - ("2011-1-1 00:00:00", "%Y-%m-%d %H:%M:%S", False, None), - ("2011-1-1 00:00:00", "%Y-%d-%m %H:%M:%S", True, None), + ("2011-1-1", "%Y-%m-%d", False), + ("2011-1-1", "%Y-%d-%m", True), + ("1/1/2011", "%m/%d/%Y", False), + ("1/1/2011", "%d/%m/%Y", True), + ("30-1-2011", "%d-%m-%Y", False), + 
("30-1-2011", "%d-%m-%Y", True), + ("2011-1-1 0:0:0", "%Y-%m-%d %H:%M:%S", False), + ("2011-1-1 0:0:0", "%Y-%d-%m %H:%M:%S", True), + ("2011-1-3T00:00:0", "%Y-%m-%dT%H:%M:%S", False), + ("2011-1-3T00:00:0", "%Y-%d-%mT%H:%M:%S", True), + ("2011-1-1 00:00:00", "%Y-%m-%d %H:%M:%S", False), + ("2011-1-1 00:00:00", "%Y-%d-%m %H:%M:%S", True), ], ) -def test_guess_datetime_format_no_padding(string, fmt, dayfirst, warning): +def test_guess_datetime_format_no_padding(string, fmt, dayfirst): # see gh-11142 - msg = ( - rf"Parsing dates in {fmt} format when dayfirst=False \(the default\) " - "was specified. " - "Pass `dayfirst=True` or specify a format to silence this warning." - ) - with tm.assert_produces_warning(warning, match=msg): - result = parsing.guess_datetime_format(string, dayfirst=dayfirst) + result = parsing.guess_datetime_format(string, dayfirst=dayfirst) assert result == fmt @@ -424,3 +417,41 @@ def test_parse_datetime_string_with_reso_yearfirst(yearfirst, input): ) assert except_out_dateutil == except_in_dateutil assert result[0] == expected + + +@pytest.mark.parametrize( + "expected_format, array", + [ + ("%d/%m/%Y", np.array(["01/02/2025", "30/07/2025"])), + ("%Y-%m-%d", np.array(["2025-08-09", "2025-08-13", None])), + ("%m/%d/%Y", np.array(["02/01/2025", "12/31/2025"])), + ("%d-%m-%Y", np.array(["01-02-2025", "30-07-2025"])), + ("%d.%m.%Y", np.array(["01.02.2025", "30.07.2025"])), + ("%Y/%m/%d", np.array(["2025/08/09", "2025/12/01"])), + ("%b %d, %Y", np.array(["Feb 01, 2025", "Jul 30, 2025"])), + ("%B %d, %Y", np.array(["February 01, 2025", "July 30, 2025"])), + ("%d %b %Y", np.array(["01 Feb 2025", "30 Jul 2025"])), + ("%d-%b-%Y", np.array(["01-Feb-2025", "30-Jul-2025"])), + ("%Y%m%d", np.array(["20250201", "20250730"])), + (None, np.array(["02/01/25", "12/31/25"])), + ("%Y-%m-%d %H:%M:%S", np.array(["2025-08-09 14:30:00", "2025-12-01 00:00:00"])), + ("%Y-%m-%dT%H:%M:%S", np.array(["2025-08-09T14:30:00", "2025-12-01T00:00:00"])), + ( + 
"%Y-%m-%dT%H:%M:%S.%f", + np.array(["2025-08-09T14:30:00.123456", "2025-12-01T00:00:00.5"]), + ), + ( + "%Y-%m-%d %H:%M:%S%z", + np.array(["2025-08-09 14:30:00+0000", "2025-12-01 09:15:00-0500"]), + ), + ("%Y-%m-%d", np.array(["2025-08-09", None, "2025-12-01"])), + (None, np.array(["2025/13/01", "not-a-date", "", NaT])), + ( + None, + np.array(["01/02/2025", "2025-02-01", np.nan]), + ), + ], +) +def test_guess_datetime_format_for_array(expected_format: str, array: np.array) -> None: + fmt = _guess_datetime_format_for_array(array, dayfirst=False) + assert fmt == expected_format, f"{fmt} does not match {expected_format}"