From 1a0058660d229f9c493146f2cffeb6c9e0a3cd14 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 16 Jun 2023 16:12:38 +0200 Subject: [PATCH 1/2] API / COW: ensure every new Series/DataFrame also has new (shallow copy) index --- pandas/_libs/internals.pyi | 4 +++- pandas/_libs/internals.pyx | 15 ++++++++---- pandas/core/generic.py | 4 +++- pandas/core/internals/array_manager.py | 4 +++- pandas/core/internals/managers.py | 22 +++++++++++++++++- pandas/core/internals/ops.py | 1 + pandas/tests/copy_view/test_indexing.py | 31 +++++++++++++++++++++++++ pandas/tests/copy_view/test_methods.py | 10 ++++++++ 8 files changed, 83 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi index ce112413f8a64..d3f82a457187c 100644 --- a/pandas/_libs/internals.pyi +++ b/pandas/_libs/internals.pyi @@ -95,7 +95,9 @@ class BlockManager: def __init__( self, blocks: tuple[B, ...], axes: list[Index], verify_integrity=... ) -> None: ... - def get_slice(self, slobj: slice, axis: int = ...) -> Self: ... + def get_slice( + self, slobj: slice, axis: int = ..., using_cow: bool = False + ) -> Self: ... def _rebuild_blknos_and_blklocs(self) -> None: ... class BlockValuesRefs: diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index adf4e8c926fa3..2adab79cd88fd 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -899,7 +899,7 @@ cdef class BlockManager: # ------------------------------------------------------------------- # Indexing - cdef BlockManager _slice_mgr_rows(self, slice slobj): + cdef BlockManager _slice_mgr_rows(self, slice slobj, bint using_cow): cdef: SharedBlock blk, nb BlockManager mgr @@ -910,7 +910,10 @@ cdef class BlockManager: nb = blk.slice_block_rows(slobj) nbs.append(nb) - new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)] + if using_cow: + new_axes = [self.axes[0]._view(), self.axes[1]._getitem_slice(slobj)] + else: + new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)] mgr = type(self)(tuple(nbs), new_axes, verify_integrity=False) # We can avoid having to rebuild blklocs/blknos @@ -921,17 +924,21 @@ cdef class BlockManager: mgr._blklocs = blklocs.copy() return mgr - def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: + def get_slice( + self, slobj: slice, axis: int = 0, using_cow: bool = False + ) -> BlockManager: if axis == 0: new_blocks = self._slice_take_blocks_ax0(slobj) elif axis == 1: - return self._slice_mgr_rows(slobj) + return self._slice_mgr_rows(slobj, using_cow) else: raise IndexError("Requested axis not found in manager") new_axes = list(self.axes) new_axes[axis] = new_axes[axis]._getitem_slice(slobj) + if using_cow: + new_axes[1 - axis] = self.axes[1 - axis]._view() return type(self)(tuple(new_blocks), new_axes, verify_integrity=False) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d112f5aa7d671..2644a16972f9a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4218,7 +4218,9 @@ def _slice(self, slobj: slice, axis: AxisInt = 0) -> Self: """ assert isinstance(slobj, slice), type(slobj) axis = self._get_block_manager_axis(axis) - result = self._constructor(self._mgr.get_slice(slobj, axis=axis)) + result = self._constructor( + self._mgr.get_slice(slobj, axis=axis, using_cow=using_copy_on_write()) + ) result = result.__finalize__(self) # this could be a view diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 098a78fc54b71..7a617ad3b603d 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -760,7 +760,9 @@ def fast_xs(self, loc: int) -> SingleArrayManager: result = np.array(values, dtype=dtype) return SingleArrayManager([result], [self._axes[1]]) - def get_slice(self, slobj: slice, axis: AxisInt = 0) -> ArrayManager: + def get_slice( + self, slobj: slice, axis: AxisInt = 0, using_cow: bool = False + ) -> ArrayManager: axis = self._normalize_axis(axis) if axis == 0: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 2a7c0536c66a4..4393499d9a233 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -197,6 +197,7 @@ def blklocs(self) -> npt.NDArray[np.intp]: def make_empty(self, axes=None) -> Self: """return an empty BlockManager with the items axis of len 0""" if axes is None: + # TODO shallow copy remaining axis? axes = [Index([])] + self.axes[1:] # preserve dtype if possible @@ -355,6 +356,7 @@ def apply( applied = getattr(b, f)(**kwargs) result_blocks = extend_blocks(applied, result_blocks) + # TODO shallow copy axes (in from_blocks or here?) out = type(self).from_blocks(result_blocks, self.axes) return out @@ -575,6 +577,7 @@ def get_numeric_data(self, copy: bool = False) -> Self: # Avoid somewhat expensive _combine if copy: return self.copy(deep=True) + # TODO(CoW) need to return a shallow copy here? return self return self._combine(numeric_blocks, copy) @@ -606,6 +609,7 @@ def _combine( new_blocks.append(nb) axes = list(self.axes) + # TODO shallow copy of axes? if index is not None: axes[-1] = index axes[0] = self.items.take(indexer) @@ -647,7 +651,10 @@ def copy_func(ax): new_axes = [copy_func(ax) for ax in self.axes] else: - new_axes = list(self.axes) + if using_copy_on_write(): + new_axes = [ax.view() for ax in self.axes] + else: + new_axes = list(self.axes) res = self.apply("copy", deep=deep) res.axes = new_axes @@ -674,6 +681,7 @@ def consolidate(self) -> Self: if self.is_consolidated(): return self + # TODO shallow copy is not needed here? bm = type(self)(self.blocks, self.axes, verify_integrity=False) bm._is_consolidated = False bm._consolidate_inplace() @@ -718,6 +726,7 @@ def reindex_indexer( if indexer is None: if new_axis is self.axes[axis] and not copy: + # TODO(CoW) need to handle CoW? return self result = self.copy(deep=copy) @@ -756,6 +765,8 @@ def reindex_indexer( new_axes = list(self.axes) new_axes[axis] = new_axis + if self.ndim == 2 and using_copy_on_write(): + new_axes[1 - axis] = self.axes[1 - axis]._view() new_mgr = type(self).from_blocks(new_blocks, new_axes) if axis == 1: @@ -1034,6 +1045,7 @@ def fast_xs(self, loc: int) -> SingleBlockManager: ndim=1, refs=self.blocks[0].refs, ) + # TODO shallow copy columns return SingleBlockManager(block, self.axes[0]) dtype = interleaved_dtype([blk.dtype for blk in self.blocks]) @@ -1067,6 +1079,7 @@ def fast_xs(self, loc: int) -> SingleBlockManager: bp = BlockPlacement(slice(0, len(result))) block = new_block(result, placement=bp, ndim=1) + # TODO shallow copy columns return SingleBlockManager(block, self.axes[0]) def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager: @@ -1081,6 +1094,7 @@ def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager: nb = type(block)( values, placement=bp, ndim=1, refs=block.refs if track_ref else None ) + # TODO shallow copy index? (might already be done where this gets called) return SingleBlockManager(nb, self.axes[1]) def iget_values(self, i: int) -> ArrayLike: @@ -1479,6 +1493,7 @@ def idelete(self, indexer) -> BlockManager: nbs = self._slice_take_blocks_ax0(taker, only_slice=True) new_columns = self.items[~is_deleted] + # TODO shallow copy index? axes = [new_columns, self.axes[1]] return type(self)(tuple(nbs), axes, verify_integrity=False) @@ -1516,6 +1531,7 @@ def grouped_reduce(self, func: Callable) -> Self: nrows = result_blocks[0].values.shape[-1] index = Index(range(nrows)) + # TODO shallow copy columns? return type(self).from_blocks(result_blocks, [self.axes[0], index]) def reduce(self, func: Callable) -> Self: @@ -1539,6 +1555,7 @@ def reduce(self, func: Callable) -> Self: res_blocks.extend(nbs) index = Index([None]) # placeholder + # TODO shallow copy self.items new_mgr = type(self).from_blocks(res_blocks, [self.items, index]) return new_mgr @@ -1585,6 +1602,7 @@ def quantile( assert is_list_like(qs) # caller is responsible for this assert axis == 1 # only ever called this way + # TODO shallow copy axes new_axes = list(self.axes) new_axes[1] = Index(qs, dtype=np.float64) @@ -1873,6 +1891,7 @@ def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self: offset += len(mgr.items) + # TODO relevant axis already shallow-copied at caller? new_mgr = cls(tuple(blocks), axes) return new_mgr @@ -1942,6 +1961,7 @@ def to_2d_mgr(self, columns: Index) -> BlockManager: arr = ensure_block_shape(blk.values, ndim=2) bp = BlockPlacement(0) new_blk = type(blk)(arr, placement=bp, ndim=2, refs=blk.refs) + # TODO shallow copy index axes = [columns, self.axes[0]] return BlockManager([new_blk], axes=axes, verify_integrity=False) diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index 8434ed05571b7..129fab1776ffa 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -88,6 +88,7 @@ def operate_blockwise( # assert len(slocs) == nlocs, (len(slocs), nlocs) # assert slocs == set(range(nlocs)), slocs + # TODO shallow copy axes? new_mgr = type(right)(tuple(res_blks), axes=right.axes, verify_integrity=False) return new_mgr diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index f9b4ba295f3a0..6e7662a9db0a7 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -59,6 +59,9 @@ def test_subset_column_selection(backend, using_copy_on_write): subset = df[["a", "c"]] + if using_copy_on_write: + assert subset.index is not df.index + if using_copy_on_write: # the subset shares memory ... assert np.shares_memory(get_array(subset, "a"), get_array(df, "a")) @@ -111,6 +114,9 @@ def test_subset_row_slice(backend, using_copy_on_write): subset = df[1:3] subset._mgr._verify_integrity() + if using_copy_on_write: + assert subset.columns is not df.columns + assert np.shares_memory(get_array(subset, "a"), get_array(df, "a")) if using_copy_on_write: @@ -154,6 +160,9 @@ def test_subset_column_slice(backend, using_copy_on_write, using_array_manager, subset = df.iloc[:, 1:] subset._mgr._verify_integrity() + if using_copy_on_write: + assert subset.index is not df.index + if using_copy_on_write: assert np.shares_memory(get_array(subset, "b"), get_array(df, "b")) @@ -213,6 +222,10 @@ def test_subset_loc_rows_columns( subset = df.loc[row_indexer, column_indexer] + if using_copy_on_write: + assert subset.index is not df.index + assert subset.columns is not df.columns + # modifying the subset never modifies the parent subset.iloc[0, 0] = 0 @@ -273,6 +286,10 @@ def test_subset_iloc_rows_columns( subset = df.iloc[row_indexer, column_indexer] + if using_copy_on_write: + assert subset.index is not df.index + assert subset.columns is not df.columns + # modifying the subset never modifies the parent subset.iloc[0, 0] = 0 @@ -718,6 +735,10 @@ def test_null_slice(backend, method, using_copy_on_write): df2 = method(df) + if using_copy_on_write: + assert df2.index is not df.index + assert df2.columns is not df.columns + # we always return new objects (shallow copy), regardless of CoW or not assert df2 is not df @@ -745,6 +766,9 @@ def test_null_slice_series(backend, method, using_copy_on_write): s2 = method(s) + if using_copy_on_write: + assert s2.index is not s.index + # we always return new objects, regardless of CoW or not assert s2 is not s @@ -886,6 +910,9 @@ def test_column_as_series(backend, using_copy_on_write, using_array_manager): s = df["a"] + if using_copy_on_write: + assert s.index is not df.index + assert np.shares_memory(get_array(s, "a"), get_array(df, "a")) if using_copy_on_write or using_array_manager: @@ -963,6 +990,10 @@ def test_column_as_series_no_item_cache( s1 = method(df) s2 = method(df) + if using_copy_on_write: + assert s1.index is not df.index + assert s1.index is not s2.index + is_iloc = "iloc" in request.node.name if using_copy_on_write or is_iloc: assert s1 is not s2 diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 67fc91e0567ef..8f0601aff8d6d 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -28,6 +28,9 @@ def test_copy(using_copy_on_write): assert not df_copy._mgr.blocks[0].refs.has_reference() assert not df_copy._mgr.blocks[1].refs.has_reference() + assert df_copy.index is not df.index + assert df_copy.columns is not df.columns + # mutating copy doesn't mutate original df_copy.iloc[0, 0] = 0 assert df.iloc[0, 0] == 1 @@ -37,6 +40,13 @@ def test_copy_shallow(using_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_copy = df.copy(deep=False) + if using_copy_on_write: + assert df_copy.index is not df.index + assert df_copy.columns is not df.columns + else: + assert df_copy.index is df.index + assert df_copy.columns is df.columns + # the shallow copy still shares memory assert np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) if using_copy_on_write: From dff37fe7193c53b5ac677e187bfa47aa31a388fa Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 19 Jun 2023 09:11:48 +0200 Subject: [PATCH 2/2] fix existing tests checking for index identity --- pandas/tests/frame/methods/test_reindex.py | 7 +++++-- pandas/tests/frame/test_npfuncs.py | 4 ++-- pandas/tests/reshape/concat/test_index.py | 12 ++++++------ pandas/tests/series/methods/test_align.py | 12 ++++++++---- pandas/tests/series/test_constructors.py | 7 +++++-- pandas/tests/test_multilevel.py | 12 +++++++++--- 6 files changed, 35 insertions(+), 19 deletions(-) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 63e2eb790a4ea..eb3e3e08a67d5 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -599,7 +599,7 @@ def test_reindex_sparse(self): ) tm.assert_frame_equal(result, expected) - def test_reindex(self, float_frame): + def test_reindex(self, float_frame, using_copy_on_write): datetime_series = tm.makeTimeSeries(nper=30) newFrame = float_frame.reindex(datetime_series.index) @@ -639,7 +639,10 @@ def test_reindex(self, float_frame): # Same index, copies values but not index if copy=False newFrame = float_frame.reindex(float_frame.index, copy=False) - assert newFrame.index is float_frame.index + if using_copy_on_write: + assert newFrame.index.is_(float_frame.index) + else: + assert newFrame.index is float_frame.index # length zero newFrame = float_frame.reindex([]) diff --git a/pandas/tests/frame/test_npfuncs.py b/pandas/tests/frame/test_npfuncs.py index b734dafb6c31b..1159623bdd281 100644 --- a/pandas/tests/frame/test_npfuncs.py +++ b/pandas/tests/frame/test_npfuncs.py @@ -22,8 +22,8 @@ def test_np_sqrt(self, float_frame): with np.errstate(all="ignore"): result = np.sqrt(float_frame) assert isinstance(result, type(float_frame)) - assert result.index is float_frame.index - assert result.columns is float_frame.columns + assert result.index.is_(float_frame.index) + assert result.columns.is_(float_frame.columns) tm.assert_frame_equal(result, float_frame.apply(np.sqrt)) diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index ce06e74de91b9..b4f2a5e13aac4 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -114,14 +114,14 @@ def test_concat_copy_index_frame(self, axis, using_copy_on_write): df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) comb = concat([df, df], axis=axis, copy=True) if not using_copy_on_write: - assert comb.index is not df.index - assert comb.columns is not df.columns + assert not comb.index.is_(df.index) + assert not comb.columns.is_(df.columns) elif axis in [0, "index"]: - assert comb.index is not df.index - assert comb.columns is df.columns + assert not comb.index.is_(df.index) + assert comb.columns.is_(df.columns) elif axis in [1, "columns"]: - assert comb.index is df.index - assert comb.columns is not df.columns + assert comb.index.is_(df.index) + assert not comb.columns.is_(df.columns) def test_default_index(self): # is_series and ignore_index diff --git a/pandas/tests/series/methods/test_align.py b/pandas/tests/series/methods/test_align.py index 3edbe1b2f61f3..274312ee5e1fe 100644 --- a/pandas/tests/series/methods/test_align.py +++ b/pandas/tests/series/methods/test_align.py @@ -127,16 +127,20 @@ def test_align_nocopy(datetime_series, using_copy_on_write): def test_align_same_index(datetime_series, using_copy_on_write): a, b = datetime_series.align(datetime_series, copy=False) - assert a.index is datetime_series.index - assert b.index is datetime_series.index + if not using_copy_on_write: + assert a.index is datetime_series.index + assert b.index is datetime_series.index + else: + assert a.index.is_(datetime_series.index) + assert b.index.is_(datetime_series.index) a, b = datetime_series.align(datetime_series, copy=True) if not using_copy_on_write: assert a.index is not datetime_series.index assert b.index is not datetime_series.index else: - assert a.index is datetime_series.index - assert b.index is datetime_series.index + assert a.index.is_(datetime_series.index) + assert b.index.is_(datetime_series.index) def test_align_multiindex(): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 4bf16b6d20d1f..99f772ead4262 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -628,12 +628,15 @@ def test_constructor_maskedarray_hardened(self): expected = Series([np.nan, np.nan, np.nan]) tm.assert_series_equal(result, expected) - def test_series_ctor_plus_datetimeindex(self): + def test_series_ctor_plus_datetimeindex(self, using_copy_on_write): rng = date_range("20090415", "20090519", freq="B") data = {k: 1 for k in rng} result = Series(data, index=rng) - assert result.index is rng + if using_copy_on_write: + assert result.index.is_(rng) + else: + assert result.index is rng def test_constructor_default_index(self): s = Series([0, 1, 2]) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 8c5f9a894f2f7..b97fa2f9bc16f 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -46,20 +46,26 @@ def test_reindex(self, multiindex_dataframe_random_data): tm.assert_frame_equal(reindexed, expected) def test_reindex_preserve_levels( - self, multiindex_year_month_day_dataframe_random_data + self, multiindex_year_month_day_dataframe_random_data, using_copy_on_write ): ymd = multiindex_year_month_day_dataframe_random_data new_index = ymd.index[::10] chunk = ymd.reindex(new_index) - assert chunk.index is new_index + if using_copy_on_write: + assert chunk.index.is_(new_index) + else: + assert chunk.index is new_index chunk = ymd.loc[new_index] assert chunk.index.equals(new_index) ymdT = ymd.T chunk = ymdT.reindex(columns=new_index) - assert chunk.columns is new_index + if using_copy_on_write: + assert chunk.columns.is_(new_index) + else: + assert chunk.columns is new_index chunk = ymdT.loc[:, new_index] assert chunk.columns.equals(new_index)