From 1a0058660d229f9c493146f2cffeb6c9e0a3cd14 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 16 Jun 2023 16:12:38 +0200
Subject: [PATCH 1/2] API / COW: ensure every new Series/DataFrame also has new
 (shallow copy) index

---
 pandas/_libs/internals.pyi              |  4 +++-
 pandas/_libs/internals.pyx              | 15 ++++++++----
 pandas/core/generic.py                  |  4 +++-
 pandas/core/internals/array_manager.py  |  4 +++-
 pandas/core/internals/managers.py       | 22 +++++++++++++++++-
 pandas/core/internals/ops.py            |  1 +
 pandas/tests/copy_view/test_indexing.py | 31 +++++++++++++++++++++++++
 pandas/tests/copy_view/test_methods.py  | 10 ++++++++
 8 files changed, 83 insertions(+), 8 deletions(-)

diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi
index ce112413f8a64..d3f82a457187c 100644
--- a/pandas/_libs/internals.pyi
+++ b/pandas/_libs/internals.pyi
@@ -95,7 +95,9 @@ class BlockManager:
     def __init__(
         self, blocks: tuple[B, ...], axes: list[Index], verify_integrity=...
     ) -> None: ...
-    def get_slice(self, slobj: slice, axis: int = ...) -> Self: ...
+    def get_slice(
+        self, slobj: slice, axis: int = ..., using_cow: bool = False
+    ) -> Self: ...
     def _rebuild_blknos_and_blklocs(self) -> None: ...
 
 class BlockValuesRefs:
diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx
index adf4e8c926fa3..2adab79cd88fd 100644
--- a/pandas/_libs/internals.pyx
+++ b/pandas/_libs/internals.pyx
@@ -899,7 +899,7 @@ cdef class BlockManager:
     # -------------------------------------------------------------------
     # Indexing
 
-    cdef BlockManager _slice_mgr_rows(self, slice slobj):
+    cdef BlockManager _slice_mgr_rows(self, slice slobj, bint using_cow):
         cdef:
             SharedBlock blk, nb
             BlockManager mgr
@@ -910,7 +910,10 @@ cdef class BlockManager:
             nb = blk.slice_block_rows(slobj)
             nbs.append(nb)
 
-        new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)]
+        if using_cow:
+            new_axes = [self.axes[0]._view(), self.axes[1]._getitem_slice(slobj)]
+        else:
+            new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)]
         mgr = type(self)(tuple(nbs), new_axes, verify_integrity=False)
 
         # We can avoid having to rebuild blklocs/blknos
@@ -921,17 +924,21 @@ cdef class BlockManager:
             mgr._blklocs = blklocs.copy()
         return mgr
 
-    def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager:
+    def get_slice(
+        self, slobj: slice, axis: int = 0, using_cow: bool = False
+    ) -> BlockManager:
 
         if axis == 0:
             new_blocks = self._slice_take_blocks_ax0(slobj)
         elif axis == 1:
-            return self._slice_mgr_rows(slobj)
+            return self._slice_mgr_rows(slobj, using_cow)
         else:
             raise IndexError("Requested axis not found in manager")
 
         new_axes = list(self.axes)
         new_axes[axis] = new_axes[axis]._getitem_slice(slobj)
+        if using_cow:
+            new_axes[1 - axis] = self.axes[1 - axis]._view()
 
         return type(self)(tuple(new_blocks), new_axes, verify_integrity=False)
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index d112f5aa7d671..2644a16972f9a 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -4218,7 +4218,9 @@ def _slice(self, slobj: slice, axis: AxisInt = 0) -> Self:
         """
         assert isinstance(slobj, slice), type(slobj)
         axis = self._get_block_manager_axis(axis)
-        result = self._constructor(self._mgr.get_slice(slobj, axis=axis))
+        result = self._constructor(
+            self._mgr.get_slice(slobj, axis=axis, using_cow=using_copy_on_write())
+        )
         result = result.__finalize__(self)
 
         # this could be a view
diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py
index 098a78fc54b71..7a617ad3b603d 100644
--- a/pandas/core/internals/array_manager.py
+++ b/pandas/core/internals/array_manager.py
@@ -760,7 +760,9 @@ def fast_xs(self, loc: int) -> SingleArrayManager:
             result = np.array(values, dtype=dtype)
         return SingleArrayManager([result], [self._axes[1]])
 
-    def get_slice(self, slobj: slice, axis: AxisInt = 0) -> ArrayManager:
+    def get_slice(
+        self, slobj: slice, axis: AxisInt = 0, using_cow: bool = False
+    ) -> ArrayManager:
         axis = self._normalize_axis(axis)
 
         if axis == 0:
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 2a7c0536c66a4..4393499d9a233 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -197,6 +197,7 @@ def blklocs(self) -> npt.NDArray[np.intp]:
     def make_empty(self, axes=None) -> Self:
         """return an empty BlockManager with the items axis of len 0"""
         if axes is None:
+            # TODO shallow copy remaining axis?
             axes = [Index([])] + self.axes[1:]
 
         # preserve dtype if possible
@@ -355,6 +356,7 @@ def apply(
                 applied = getattr(b, f)(**kwargs)
             result_blocks = extend_blocks(applied, result_blocks)
 
+        # TODO shallow copy axes (in from_blocks or here?)
         out = type(self).from_blocks(result_blocks, self.axes)
         return out
 
@@ -575,6 +577,7 @@ def get_numeric_data(self, copy: bool = False) -> Self:
             # Avoid somewhat expensive _combine
             if copy:
                 return self.copy(deep=True)
+            # TODO(CoW) need to return a shallow copy here?
             return self
         return self._combine(numeric_blocks, copy)
 
@@ -606,6 +609,7 @@ def _combine(
             new_blocks.append(nb)
 
         axes = list(self.axes)
+        # TODO shallow copy of axes?
         if index is not None:
             axes[-1] = index
         axes[0] = self.items.take(indexer)
@@ -647,7 +651,10 @@ def copy_func(ax):
 
             new_axes = [copy_func(ax) for ax in self.axes]
         else:
-            new_axes = list(self.axes)
+            if using_copy_on_write():
+                new_axes = [ax.view() for ax in self.axes]
+            else:
+                new_axes = list(self.axes)
 
         res = self.apply("copy", deep=deep)
         res.axes = new_axes
@@ -674,6 +681,7 @@ def consolidate(self) -> Self:
         if self.is_consolidated():
             return self
 
+        # TODO shallow copy is not needed here?
         bm = type(self)(self.blocks, self.axes, verify_integrity=False)
         bm._is_consolidated = False
         bm._consolidate_inplace()
@@ -718,6 +726,7 @@ def reindex_indexer(
 
         if indexer is None:
             if new_axis is self.axes[axis] and not copy:
+                # TODO(CoW) need to handle CoW?
                 return self
 
             result = self.copy(deep=copy)
@@ -756,6 +765,8 @@ def reindex_indexer(
 
         new_axes = list(self.axes)
         new_axes[axis] = new_axis
+        if self.ndim == 2 and using_copy_on_write():
+            new_axes[1 - axis] = self.axes[1 - axis]._view()
 
         new_mgr = type(self).from_blocks(new_blocks, new_axes)
         if axis == 1:
@@ -1034,6 +1045,7 @@ def fast_xs(self, loc: int) -> SingleBlockManager:
                 ndim=1,
                 refs=self.blocks[0].refs,
             )
+            # TODO shallow copy columns
             return SingleBlockManager(block, self.axes[0])
 
         dtype = interleaved_dtype([blk.dtype for blk in self.blocks])
@@ -1067,6 +1079,7 @@ def fast_xs(self, loc: int) -> SingleBlockManager:
 
         bp = BlockPlacement(slice(0, len(result)))
         block = new_block(result, placement=bp, ndim=1)
+        # TODO shallow copy columns
         return SingleBlockManager(block, self.axes[0])
 
     def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager:
@@ -1081,6 +1094,7 @@ def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager:
         nb = type(block)(
             values, placement=bp, ndim=1, refs=block.refs if track_ref else None
         )
+        # TODO shallow copy index? (might already be done where this gets called)
         return SingleBlockManager(nb, self.axes[1])
 
     def iget_values(self, i: int) -> ArrayLike:
@@ -1479,6 +1493,7 @@ def idelete(self, indexer) -> BlockManager:
 
         nbs = self._slice_take_blocks_ax0(taker, only_slice=True)
         new_columns = self.items[~is_deleted]
+        # TODO shallow copy index?
         axes = [new_columns, self.axes[1]]
         return type(self)(tuple(nbs), axes, verify_integrity=False)
 
@@ -1516,6 +1531,7 @@ def grouped_reduce(self, func: Callable) -> Self:
             nrows = result_blocks[0].values.shape[-1]
         index = Index(range(nrows))
 
+        # TODO shallow copy columns?
         return type(self).from_blocks(result_blocks, [self.axes[0], index])
 
     def reduce(self, func: Callable) -> Self:
@@ -1539,6 +1555,7 @@ def reduce(self, func: Callable) -> Self:
             res_blocks.extend(nbs)
 
         index = Index([None])  # placeholder
+        # TODO shallow copy self.items
         new_mgr = type(self).from_blocks(res_blocks, [self.items, index])
         return new_mgr
 
@@ -1585,6 +1602,7 @@ def quantile(
         assert is_list_like(qs)  # caller is responsible for this
         assert axis == 1  # only ever called this way
 
+        # TODO shallow copy axes
         new_axes = list(self.axes)
         new_axes[1] = Index(qs, dtype=np.float64)
 
@@ -1873,6 +1891,7 @@ def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self:
 
             offset += len(mgr.items)
 
+        # TODO relevant axis already shallow-copied at caller?
         new_mgr = cls(tuple(blocks), axes)
         return new_mgr
 
@@ -1942,6 +1961,7 @@ def to_2d_mgr(self, columns: Index) -> BlockManager:
         arr = ensure_block_shape(blk.values, ndim=2)
         bp = BlockPlacement(0)
         new_blk = type(blk)(arr, placement=bp, ndim=2, refs=blk.refs)
+        # TODO shallow copy index
         axes = [columns, self.axes[0]]
         return BlockManager([new_blk], axes=axes, verify_integrity=False)
 
diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py
index 8434ed05571b7..129fab1776ffa 100644
--- a/pandas/core/internals/ops.py
+++ b/pandas/core/internals/ops.py
@@ -88,6 +88,7 @@ def operate_blockwise(
     #  assert len(slocs) == nlocs, (len(slocs), nlocs)
     #  assert slocs == set(range(nlocs)), slocs
 
+    # TODO shallow copy axes?
     new_mgr = type(right)(tuple(res_blks), axes=right.axes, verify_integrity=False)
     return new_mgr
 
diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py
index f9b4ba295f3a0..6e7662a9db0a7 100644
--- a/pandas/tests/copy_view/test_indexing.py
+++ b/pandas/tests/copy_view/test_indexing.py
@@ -59,6 +59,9 @@ def test_subset_column_selection(backend, using_copy_on_write):
 
     subset = df[["a", "c"]]
 
+    if using_copy_on_write:
+        assert subset.index is not df.index
+
     if using_copy_on_write:
         # the subset shares memory ...
         assert np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
@@ -111,6 +114,9 @@ def test_subset_row_slice(backend, using_copy_on_write):
     subset = df[1:3]
     subset._mgr._verify_integrity()
 
+    if using_copy_on_write:
+        assert subset.columns is not df.columns
+
     assert np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
 
     if using_copy_on_write:
@@ -154,6 +160,9 @@ def test_subset_column_slice(backend, using_copy_on_write, using_array_manager,
     subset = df.iloc[:, 1:]
     subset._mgr._verify_integrity()
 
+    if using_copy_on_write:
+        assert subset.index is not df.index
+
     if using_copy_on_write:
         assert np.shares_memory(get_array(subset, "b"), get_array(df, "b"))
 
@@ -213,6 +222,10 @@ def test_subset_loc_rows_columns(
 
     subset = df.loc[row_indexer, column_indexer]
 
+    if using_copy_on_write:
+        assert subset.index is not df.index
+        assert subset.columns is not df.columns
+
     # modifying the subset never modifies the parent
     subset.iloc[0, 0] = 0
 
@@ -273,6 +286,10 @@ def test_subset_iloc_rows_columns(
 
     subset = df.iloc[row_indexer, column_indexer]
 
+    if using_copy_on_write:
+        assert subset.index is not df.index
+        assert subset.columns is not df.columns
+
     # modifying the subset never modifies the parent
     subset.iloc[0, 0] = 0
 
@@ -718,6 +735,10 @@ def test_null_slice(backend, method, using_copy_on_write):
 
     df2 = method(df)
 
+    if using_copy_on_write:
+        assert df2.index is not df.index
+        assert df2.columns is not df.columns
+
     # we always return new objects (shallow copy), regardless of CoW or not
     assert df2 is not df
 
@@ -745,6 +766,9 @@ def test_null_slice_series(backend, method, using_copy_on_write):
 
     s2 = method(s)
 
+    if using_copy_on_write:
+        assert s2.index is not s.index
+
     # we always return new objects, regardless of CoW or not
     assert s2 is not s
 
@@ -886,6 +910,9 @@ def test_column_as_series(backend, using_copy_on_write, using_array_manager):
 
     s = df["a"]
 
+    if using_copy_on_write:
+        assert s.index is not df.index
+
     assert np.shares_memory(get_array(s, "a"), get_array(df, "a"))
 
     if using_copy_on_write or using_array_manager:
@@ -963,6 +990,10 @@ def test_column_as_series_no_item_cache(
     s1 = method(df)
     s2 = method(df)
 
+    if using_copy_on_write:
+        assert s1.index is not df.index
+        assert s1.index is not s2.index
+
     is_iloc = "iloc" in request.node.name
     if using_copy_on_write or is_iloc:
         assert s1 is not s2
diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py
index 67fc91e0567ef..8f0601aff8d6d 100644
--- a/pandas/tests/copy_view/test_methods.py
+++ b/pandas/tests/copy_view/test_methods.py
@@ -28,6 +28,9 @@ def test_copy(using_copy_on_write):
         assert not df_copy._mgr.blocks[0].refs.has_reference()
         assert not df_copy._mgr.blocks[1].refs.has_reference()
 
+    assert df_copy.index is not df.index
+    assert df_copy.columns is not df.columns
+
     # mutating copy doesn't mutate original
     df_copy.iloc[0, 0] = 0
     assert df.iloc[0, 0] == 1
@@ -37,6 +40,13 @@ def test_copy_shallow(using_copy_on_write):
     df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
     df_copy = df.copy(deep=False)
 
+    if using_copy_on_write:
+        assert df_copy.index is not df.index
+        assert df_copy.columns is not df.columns
+    else:
+        assert df_copy.index is df.index
+        assert df_copy.columns is df.columns
+
     # the shallow copy still shares memory
     assert np.shares_memory(get_array(df_copy, "a"), get_array(df, "a"))
     if using_copy_on_write:

From dff37fe7193c53b5ac677e187bfa47aa31a388fa Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 19 Jun 2023 09:11:48 +0200
Subject: [PATCH 2/2] fix existing tests checking for index identity

---
 pandas/tests/frame/methods/test_reindex.py |  7 +++++--
 pandas/tests/frame/test_npfuncs.py         |  4 ++--
 pandas/tests/reshape/concat/test_index.py  | 12 ++++++------
 pandas/tests/series/methods/test_align.py  | 12 ++++++++----
 pandas/tests/series/test_constructors.py   |  7 +++++--
 pandas/tests/test_multilevel.py            | 12 +++++++++---
 6 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py
index 63e2eb790a4ea..eb3e3e08a67d5 100644
--- a/pandas/tests/frame/methods/test_reindex.py
+++ b/pandas/tests/frame/methods/test_reindex.py
@@ -599,7 +599,7 @@ def test_reindex_sparse(self):
         )
         tm.assert_frame_equal(result, expected)
 
-    def test_reindex(self, float_frame):
+    def test_reindex(self, float_frame, using_copy_on_write):
         datetime_series = tm.makeTimeSeries(nper=30)
 
         newFrame = float_frame.reindex(datetime_series.index)
@@ -639,7 +639,10 @@ def test_reindex(self, float_frame):
 
         # Same index, copies values but not index if copy=False
         newFrame = float_frame.reindex(float_frame.index, copy=False)
-        assert newFrame.index is float_frame.index
+        if using_copy_on_write:
+            assert newFrame.index.is_(float_frame.index)
+        else:
+            assert newFrame.index is float_frame.index
 
         # length zero
         newFrame = float_frame.reindex([])
diff --git a/pandas/tests/frame/test_npfuncs.py b/pandas/tests/frame/test_npfuncs.py
index b734dafb6c31b..1159623bdd281 100644
--- a/pandas/tests/frame/test_npfuncs.py
+++ b/pandas/tests/frame/test_npfuncs.py
@@ -22,8 +22,8 @@ def test_np_sqrt(self, float_frame):
         with np.errstate(all="ignore"):
             result = np.sqrt(float_frame)
         assert isinstance(result, type(float_frame))
-        assert result.index is float_frame.index
-        assert result.columns is float_frame.columns
+        assert result.index.is_(float_frame.index)
+        assert result.columns.is_(float_frame.columns)
 
         tm.assert_frame_equal(result, float_frame.apply(np.sqrt))
 
diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py
index ce06e74de91b9..b4f2a5e13aac4 100644
--- a/pandas/tests/reshape/concat/test_index.py
+++ b/pandas/tests/reshape/concat/test_index.py
@@ -114,14 +114,14 @@ def test_concat_copy_index_frame(self, axis, using_copy_on_write):
         df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
         comb = concat([df, df], axis=axis, copy=True)
         if not using_copy_on_write:
-            assert comb.index is not df.index
-            assert comb.columns is not df.columns
+            assert not comb.index.is_(df.index)
+            assert not comb.columns.is_(df.columns)
         elif axis in [0, "index"]:
-            assert comb.index is not df.index
-            assert comb.columns is df.columns
+            assert not comb.index.is_(df.index)
+            assert comb.columns.is_(df.columns)
         elif axis in [1, "columns"]:
-            assert comb.index is df.index
-            assert comb.columns is not df.columns
+            assert comb.index.is_(df.index)
+            assert not comb.columns.is_(df.columns)
 
     def test_default_index(self):
         # is_series and ignore_index
diff --git a/pandas/tests/series/methods/test_align.py b/pandas/tests/series/methods/test_align.py
index 3edbe1b2f61f3..274312ee5e1fe 100644
--- a/pandas/tests/series/methods/test_align.py
+++ b/pandas/tests/series/methods/test_align.py
@@ -127,16 +127,20 @@ def test_align_nocopy(datetime_series, using_copy_on_write):
 
 def test_align_same_index(datetime_series, using_copy_on_write):
     a, b = datetime_series.align(datetime_series, copy=False)
-    assert a.index is datetime_series.index
-    assert b.index is datetime_series.index
+    if not using_copy_on_write:
+        assert a.index is datetime_series.index
+        assert b.index is datetime_series.index
+    else:
+        assert a.index.is_(datetime_series.index)
+        assert b.index.is_(datetime_series.index)
 
     a, b = datetime_series.align(datetime_series, copy=True)
     if not using_copy_on_write:
         assert a.index is not datetime_series.index
         assert b.index is not datetime_series.index
     else:
-        assert a.index is datetime_series.index
-        assert b.index is datetime_series.index
+        assert a.index.is_(datetime_series.index)
+        assert b.index.is_(datetime_series.index)
 
 
 def test_align_multiindex():
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 4bf16b6d20d1f..99f772ead4262 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -628,12 +628,15 @@ def test_constructor_maskedarray_hardened(self):
         expected = Series([np.nan, np.nan, np.nan])
         tm.assert_series_equal(result, expected)
 
-    def test_series_ctor_plus_datetimeindex(self):
+    def test_series_ctor_plus_datetimeindex(self, using_copy_on_write):
         rng = date_range("20090415", "20090519", freq="B")
         data = {k: 1 for k in rng}
 
         result = Series(data, index=rng)
-        assert result.index is rng
+        if using_copy_on_write:
+            assert result.index.is_(rng)
+        else:
+            assert result.index is rng
 
     def test_constructor_default_index(self):
         s = Series([0, 1, 2])
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
index 8c5f9a894f2f7..b97fa2f9bc16f 100644
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -46,20 +46,26 @@ def test_reindex(self, multiindex_dataframe_random_data):
         tm.assert_frame_equal(reindexed, expected)
 
     def test_reindex_preserve_levels(
-        self, multiindex_year_month_day_dataframe_random_data
+        self, multiindex_year_month_day_dataframe_random_data, using_copy_on_write
     ):
         ymd = multiindex_year_month_day_dataframe_random_data
 
         new_index = ymd.index[::10]
         chunk = ymd.reindex(new_index)
-        assert chunk.index is new_index
+        if using_copy_on_write:
+            assert chunk.index.is_(new_index)
+        else:
+            assert chunk.index is new_index
 
         chunk = ymd.loc[new_index]
         assert chunk.index.equals(new_index)
 
         ymdT = ymd.T
         chunk = ymdT.reindex(columns=new_index)
-        assert chunk.columns is new_index
+        if using_copy_on_write:
+            assert chunk.columns.is_(new_index)
+        else:
+            assert chunk.columns is new_index
 
         chunk = ymdT.loc[:, new_index]
         assert chunk.columns.equals(new_index)