From d7cf8b956045558d1d83f29f5e67e8e46dc73a5a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sat, 1 Nov 2025 10:24:53 -0700 Subject: [PATCH 01/13] Clean test_bad_date_parse_with_warning --- pandas/tests/io/parser/test_parse_dates.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index beb5e8d9d996c..034e5d3f811a2 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -265,12 +265,11 @@ def test_bad_date_parse(all_parsers, cache, value): ) -@pytest.mark.parametrize("value", ["0"]) -def test_bad_date_parse_with_warning(all_parsers, cache, value): +def test_bad_date_parse_with_warning(all_parsers, cache): # if we have an invalid date make sure that we handle this with # and w/o the cache properly. parser = all_parsers - s = StringIO((f"{value},\n") * 50000) + s = StringIO(("0,\n") * (start_caching_at + 1)) if parser.engine == "pyarrow": # pyarrow reads "0" as 0 (of type int64), and so From 0a23c73c77297952b05fd4dd661a6e4f27b5f679 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sat, 1 Nov 2025 10:43:42 -0700 Subject: [PATCH 02/13] Reduce some parameterization --- pandas/tests/libs/test_hashtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 6a95cfc7355d8..1f24f87348595 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -247,7 +247,7 @@ def test_get_state(self, table_type, dtype): assert "n_buckets" in state assert "upper_bound" in state - @pytest.mark.parametrize("N", range(1, 110)) + @pytest.mark.parametrize("N", range(1, 110, 4)) def test_no_reallocation(self, table_type, dtype, N): keys = np.arange(N).astype(dtype) preallocated_table = table_type(N) @@ 
-517,7 +517,7 @@ def test_tracemalloc_for_empty_StringHashTable(): assert get_allocated_khash_memory() == 0 -@pytest.mark.parametrize("N", range(1, 110)) +@pytest.mark.parametrize("N", range(1, 110, 4)) def test_no_reallocation_StringHashTable(N): keys = np.arange(N).astype(np.str_).astype(np.object_) preallocated_table = ht.StringHashTable(N) From 231e00cc7165c3a95863bf9b8b3ed6786941082d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sun, 2 Nov 2025 07:56:19 -0800 Subject: [PATCH 03/13] Patch DEFAULT_BUFFER_HEURISTIC in test_warn_if_chunks_have_mismatched_type --- .../tests/io/parser/common/test_chunksize.py | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 3c9e7c80f9db0..3a50544641bfb 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -229,17 +229,20 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): assert result.a.dtype == float -def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string): +def test_warn_if_chunks_have_mismatched_type( + all_parsers, using_infer_string, monkeypatch +): warning_type = None parser = all_parsers - size = 10000 + heuristic = 2**3 + size = 10 # see gh-3866: if chunks are different types and can't # be coerced using numerical types, then issue warning. if parser.engine == "c" and parser.low_memory: warning_type = DtypeWarning # Use larger size to hit warning path - size = 499999 + size = heuristic - 1 integers = [str(i) for i in range(size)] data = "a\n" + "\n".join(integers + ["a", "b"] + integers) @@ -251,12 +254,14 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string): buf, ) else: - df = parser.read_csv_check_warnings( - warning_type, - r"Columns \(0: a\) have mixed types. 
" - "Specify dtype option on import or set low_memory=False.", - buf, - ) + with monkeypatch.context() as m: + m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic) + df = parser.read_csv_check_warnings( + warning_type, + r"Columns \(0: a\) have mixed types. " + "Specify dtype option on import or set low_memory=False.", + buf, + ) if parser.engine == "c" and parser.low_memory: assert df.a.dtype == object elif using_infer_string: From cb29a48b8bbdfa77a8f532197e7057ab1ecf3fc1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sun, 2 Nov 2025 08:00:26 -0800 Subject: [PATCH 04/13] Remove test_read_csv_memory_growth_chunksize as there's an asv --- .../tests/io/parser/common/test_chunksize.py | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 3a50544641bfb..0e8303e66f4ac 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -300,30 +300,6 @@ def test_empty_with_nrows_chunksize(all_parsers, iterator): tm.assert_frame_equal(result, expected) -def test_read_csv_memory_growth_chunksize(temp_file, all_parsers): - # see gh-24805 - # - # Let's just make sure that we don't crash - # as we iteratively process all chunks. 
- parser = all_parsers - - with open(temp_file, "w", encoding="utf-8") as f: - for i in range(1000): - f.write(str(i) + "\n") - - if parser.engine == "pyarrow": - msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" - with pytest.raises(ValueError, match=msg): - with parser.read_csv(temp_file, chunksize=20) as result: - for _ in result: - pass - return - - with parser.read_csv(temp_file, chunksize=20) as result: - for _ in result: - pass - - def test_chunksize_with_usecols_second_block_shorter(all_parsers): # GH#21211 parser = all_parsers From 38458009d2e8fd6c312d4f27acde1efb8e1cece3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sun, 2 Nov 2025 08:11:24 -0800 Subject: [PATCH 05/13] Reduce data in test_weekly_upsample --- pandas/tests/resample/test_period_index.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index fa8eef8835285..13e9cbf4d4940 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -396,8 +396,7 @@ def test_fill_method_and_how_upsample(self): @pytest.mark.parametrize("convention", ["start", "end"]) def test_weekly_upsample(self, day, target, convention, simple_period_range_series): freq = f"W-{day}" - ts = simple_period_range_series("1/1/1990", "12/31/1995", freq=freq) - + ts = simple_period_range_series("1/1/1990", "07/31/1990", freq=freq) warn = None if target == "D" else FutureWarning msg = r"PeriodDtype\[B\] is deprecated" with tm.assert_produces_warning(warn, match=msg): From b9da54f8c6b6fec45944208e788ec6e6e0273179 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sun, 2 Nov 2025 08:35:18 -0800 Subject: [PATCH 06/13] Reduce data sizes in test_period_index --- pandas/tests/resample/test_period_index.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 13e9cbf4d4940..5cf0446911806 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -130,7 +130,7 @@ def test_selection(self, freq, kwargs): def test_annual_upsample_cases( self, offset, period, conv, meth, month, simple_period_range_series ): - ts = simple_period_range_series("1/1/1990", "12/31/1991", freq=f"Y-{month}") + ts = simple_period_range_series("1/1/1990", "12/31/1990", freq=f"Y-{month}") warn = FutureWarning if period == "B" else None msg = r"PeriodDtype\[B\] is deprecated" with tm.assert_produces_warning(warn, match=msg): @@ -214,7 +214,7 @@ def test_quarterly_upsample( self, month, offset, period, convention, simple_period_range_series ): freq = f"Q-{month}" - ts = simple_period_range_series("1/1/1990", "12/31/1995", freq=freq) + ts = simple_period_range_series("1/1/1990", "12/31/1991", freq=freq) warn = FutureWarning if period == "B" else None msg = r"PeriodDtype\[B\] is deprecated" with tm.assert_produces_warning(warn, match=msg): From 046145262c8cf552c95a7fa6e38e8d5d17a50e49 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sun, 2 Nov 2025 08:50:30 -0800 Subject: [PATCH 07/13] Reduce data in test_resample_ohlc --- pandas/tests/resample/test_datetime_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index ab88d221864c0..dd199b8c55476 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -526,7 +526,7 @@ def test_nearest_upsample_with_limit(tz_aware_fixture, freq, rule, unit): def test_resample_ohlc(unit): - index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="Min") + index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 2), freq="Min") s = 
Series(range(len(index)), index=index) s.index.name = "index" s.index = s.index.as_unit(unit) From 3a331ebeb83a2e30c2a2f7bda20ae406da7eacb9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sun, 2 Nov 2025 09:13:51 -0800 Subject: [PATCH 08/13] generate less data in lhs fixture --- pandas/tests/computation/test_eval.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 030cab58df67a..43a130c52a1c6 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -104,17 +104,21 @@ def _eval_single_bin(lhs, cmp1, rhs, engine): ids=["DataFrame", "Series", "SeriesNaN", "DataFrameNaN", "float"], ) def lhs(request): - nan_df1 = DataFrame(np.random.default_rng(2).standard_normal((10, 5))) - nan_df1[nan_df1 > 0.5] = np.nan - - opts = ( - DataFrame(np.random.default_rng(2).standard_normal((10, 5))), - Series(np.random.default_rng(2).standard_normal(5)), - Series([1, 2, np.nan, np.nan, 5]), - nan_df1, - np.random.default_rng(2).standard_normal(), - ) - return opts[request.param] + rng = np.random.default_rng(2) + if request.param == 0: + return DataFrame(rng.standard_normal((10, 5))) + elif request.param == 1: + return Series(rng.standard_normal(5)) + elif request.param == 2: + return Series([1, 2, np.nan, np.nan, 5]) + elif request.param == 3: + nan_df1 = DataFrame(rng.standard_normal((10, 5))) + nan_df1[nan_df1 > 0.5] = np.nan + return nan_df1 + elif request.param == 4: + return rng.standard_normal() + else: + raise ValueError(f"{request.param}") rhs = lhs From 35152b11c6506c5f6aa45a80090afcdef6834e33 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sun, 2 Nov 2025 09:56:13 -0800 Subject: [PATCH 09/13] Avoid gc in test_memory_leak --- pandas/tests/plotting/frame/test_frame.py | 19 ++++++++----------- 1 file changed, 
8 insertions(+), 11 deletions(-) diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index c4ab708f33978..728244e4439ac 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -4,11 +4,9 @@ date, datetime, ) -import gc import itertools import re import string -import weakref import numpy as np import pytest @@ -2164,15 +2162,14 @@ def test_memory_leak(self, kind): index=date_range("2000-01-01", periods=10, freq="B"), ) - # Use a weakref so we can see if the object gets collected without - # also preventing it from being collected - ref = weakref.ref(df.plot(kind=kind, **args)) - - # have matplotlib delete all the figures - plt.close("all") - # force a garbage collection - gc.collect() - assert ref() is None + ax = df.plot(kind=kind, **args) + if kind in ["line", "area"]: + for i, (cached_data, _, _) in enumerate(ax._plot_data): + ser = df.iloc[:, i] + assert not tm.shares_memory(ser, cached_data) + tm.assert_numpy_array_equal(ser._values, cached_data._values) + else: + assert not hasattr(ax, "_plot_data") def test_df_gridspec_patterns_vert_horiz(self): # GH 10819 From fcb2c7097260e1cbe93ae83195822a8cb7a0b5e0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sun, 2 Nov 2025 10:06:38 -0800 Subject: [PATCH 10/13] Use less data in test_resample_equivalent_offsets --- pandas/tests/resample/test_datetime_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index dd199b8c55476..272aaf23d84c1 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1842,7 +1842,7 @@ def test_resample_equivalent_offsets(n1, freq1, n2, freq2, k, unit): # GH 24127 n1_ = n1 * k n2_ = n2 * k - dti = date_range("1991-09-05", "1991-09-12", freq=freq1).as_unit(unit) + dti = 
date_range("1991-09-05", "1991-09-06", freq=freq1).as_unit(unit) ser = Series(range(len(dti)), index=dti) result1 = ser.resample(str(n1_) + freq1).mean() From 2f9a4b84703672fabfe6bfaf2e68453e4ddbb321 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sun, 2 Nov 2025 10:10:53 -0800 Subject: [PATCH 11/13] Mark test_indexing_slow as slow --- pandas/tests/indexing/multiindex/test_indexing_slow.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/indexing/multiindex/test_indexing_slow.py b/pandas/tests/indexing/multiindex/test_indexing_slow.py index c6fc1659500e6..8c7f7f3bb05ac 100644 --- a/pandas/tests/indexing/multiindex/test_indexing_slow.py +++ b/pandas/tests/indexing/multiindex/test_indexing_slow.py @@ -71,6 +71,7 @@ def b(df, cols): return df.drop_duplicates(subset=cols[:-1]) +@pytest.mark.slow @pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning") @pytest.mark.parametrize("lexsort_depth", list(range(5))) @pytest.mark.parametrize("frame_fixture", ["a", "b"]) From be90fce9650cfd24059f588564910b965b898da0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Nov 2025 16:24:16 -0800 Subject: [PATCH 12/13] Update comment about larger size --- pandas/tests/io/parser/common/test_chunksize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 0e8303e66f4ac..6ba9e0b44df86 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -241,7 +241,8 @@ def test_warn_if_chunks_have_mismatched_type( # be coerced using numerical types, then issue warning. 
 if parser.engine == "c" and parser.low_memory: warning_type = DtypeWarning - # Use larger size to hit warning path + # Use a size to hit warning path dictated by DEFAULT_BUFFER_HEURISTIC + # monkeypatched below size = heuristic - 1 integers = [str(i) for i in range(size)] From cdd56e314c9b008ab71e7b56e74ba354157c7655 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 5 Nov 2025 10:58:47 -0800 Subject: [PATCH 13/13] Add context about what's being tested --- pandas/tests/plotting/frame/test_frame.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index 50cb5ad3563c6..412909b8fadf5 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -2172,6 +2172,7 @@ def test_memory_leak(self, kind): ) ax = df.plot(kind=kind, **args) + # https://github.com/pandas-dev/pandas/issues/9003#issuecomment-70544889 if kind in ["line", "area"]: for i, (cached_data, _, _) in enumerate(ax._plot_data): ser = df.iloc[:, i]