From a23a2e0a337b0e5f680e6ec53f010cb03b1522db Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 23 Mar 2024 21:12:26 -0500 Subject: [PATCH 01/11] ENH: Add leftsemi merge --- asv_bench/benchmarks/join_merge.py | 6 +++ doc/source/user_guide/merging.rst | 1 + doc/source/whatsnew/v3.0.0.rst | 16 ++++-- pandas/_libs/hashtable.pyx | 3 ++ pandas/_libs/hashtable_class_helper.pxi.in | 27 ++++++++++ pandas/_typing.py | 2 +- pandas/core/frame.py | 7 ++- pandas/core/reshape/merge.py | 59 +++++++++++++++++++--- pandas/tests/reshape/merge/test_semi.py | 57 +++++++++++++++++++++ 9 files changed, 166 insertions(+), 12 deletions(-) create mode 100644 pandas/tests/reshape/merge/test_semi.py diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index a6c6990892d38..d0ac3ff1bbe45 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -272,6 +272,9 @@ def time_merge_dataframe_empty_left(self, sort): def time_merge_dataframes_cross(self, sort): merge(self.left.loc[:2000], self.right.loc[:2000], how="cross", sort=sort) + def time_merge_semi(self, sort): + merge(self.df, self.df2, on="key1", how="leftsemi") + class MergeEA: params = [ @@ -380,6 +383,9 @@ def setup(self, units, tz, monotonic): def time_merge(self, units, tz, monotonic): merge(self.left, self.right) + def time_merge_semi(self, units, tz, monotonic): + merge(self.left, self.right, how="leftsemi") + class MergeCategoricals: def setup(self): diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index 1edf3908936db..f7e7d1dd24317 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -407,6 +407,7 @@ either the left or right tables, the values in the joined table will be ``right``, ``RIGHT OUTER JOIN``, Use keys from right frame only ``outer``, ``FULL OUTER JOIN``, Use union of keys from both frames ``inner``, ``INNER JOIN``, Use intersection of 
keys from both frames + ``leftsemi``, ``SEMIJOIN``, Filter rows on left based on occurrences in right. ``cross``, ``CROSS JOIN``, Create the cartesian product of rows of both frames .. ipython:: python diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index f748f6e23e003..6e7cd0ac500f8 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -14,10 +14,20 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ -.. _whatsnew_300.enhancements.enhancement1: +.. _whatsnew_300.enhancements.semi_merge: -enhancement1 -^^^^^^^^^^^^ +New merge method ``leftsemi`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A new merge method ``leftsemi`` has been added to :func:`merge` and +:meth:`DataFrame.merge` that returns only the rows from the left DataFrame that have +a match in the right DataFrame. This is equivalent to a SQL ``LEFT SEMI JOIN``. (:issue:`42784`) + +.. ipython:: python + + df1 = pd.DataFrame({"key": ["A", "B", "C"], "value": [1, 2, 3]}) + df2 = pd.DataFrame({"key": ["A", "B"], "value": [1, 2]}) + df1.merge(df2, how="leftsemi") .. 
_whatsnew_300.enhancements.enhancement2: diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 97fae1d6480ce..1ac01e2ce7ae4 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -123,3 +123,6 @@ cdef class ObjectFactorizer(Factorizer): self.count, na_sentinel, na_value) self.count = len(self.uniques) return labels + + def hash_inner_join(self, values, mask=None): + return self.table.hash_inner_join(values, mask) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index e3a9102fec395..205c58c940b27 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -1385,6 +1385,33 @@ cdef class PyObjectHashTable(HashTable): k = kh_put_pymap(self.table, val, &ret) self.table.vals[k] = i + @cython.wraparound(False) + @cython.boundscheck(False) + def hash_inner_join(self, ndarray[object] values, object mask = None) -> tuple[ndarray, ndarray]: + cdef: + Py_ssize_t i, n = len(values) + object val + khiter_t k + Int64Vector locs = Int64Vector() + Int64Vector self_locs = Int64Vector() + Int64VectorData *l + Int64VectorData *sl + # mask not implemented + + l = &locs.data + sl = &self_locs.data + + for i in range(n): + val = values[i] + hash(val) + + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + append_data_int64(l, i) + append_data_int64(sl, self.table.vals[k]) + + return self_locs.to_array(), locs.to_array() + def lookup(self, ndarray[object] values, object mask = None) -> ndarray: # -> np.ndarray[np.intp] # mask not yet implemented diff --git a/pandas/_typing.py b/pandas/_typing.py index f868a92554b39..781240e62a552 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -447,7 +447,7 @@ def closed(self) -> bool: AnyAll = Literal["any", "all"] # merge -MergeHow = Literal["left", "right", "inner", "outer", "cross"] +MergeHow = Literal["left", "right", "inner", "outer", "cross", "leftsemi"] MergeValidate = Literal[ 
"one_to_one", "1:1", diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5d10a5541f556..bcf1e3f58e720 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -315,7 +315,7 @@ ----------%s right : DataFrame or named Series Object to merge with. -how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' +how : {'left', 'right', 'outer', 'inner', 'leftsemi', 'cross'}, default 'inner' Type of merge to be performed. * left: use only keys from left frame, similar to a SQL left outer join; @@ -326,6 +326,11 @@ join; sort keys lexicographically. * inner: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys. + * leftsemi: Filter for rows in the left that have a match on the right; + preserve the order of the left keys. Doesn't support `left_index`, `right_index`, + `indicator` or `validate`. + + .. versionadded:: 3.0 * cross: creates the cartesian product from both frames, preserves the order of the left keys. on : label or list diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 2cd065d03ff53..54f5b6d90c398 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -166,7 +166,8 @@ def merge( validate=validate, ) else: - op = _MergeOperation( + klass = _MergeOperation if how != "leftsemi" else _SemiMergeOperation + op = klass( left_df, right_df, how=how, @@ -817,7 +818,6 @@ def _validate_tolerance(self, left_join_keys: list[ArrayLike]) -> None: # Overridden by AsOfMerge pass - @final def _reindex_and_concat( self, join_index: Index, @@ -945,7 +945,6 @@ def _indicator_post_merge(self, result: DataFrame) -> DataFrame: result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1) return result - @final def _maybe_restore_index_levels(self, result: DataFrame) -> None: """ Restore index levels specified as `on` parameters @@ -989,7 +988,6 @@ def _maybe_restore_index_levels(self, result: DataFrame) -> None: if names_to_restore: 
result.set_index(names_to_restore, inplace=True) - @final def _maybe_add_join_keys( self, result: DataFrame, @@ -1740,7 +1738,8 @@ def get_join_indexers( right = Index(rkey) if ( - left.is_monotonic_increasing + how != "leftsemi" + and left.is_monotonic_increasing and right.is_monotonic_increasing and (left.is_unique or right.is_unique) ): @@ -1883,6 +1882,48 @@ def _convert_to_multiindex(index: Index) -> MultiIndex: return tuple(join_levels), tuple(join_codes), tuple(join_names) +class _SemiMergeOperation(_MergeOperation): + def __init__(self, *args, **kwargs): + if kwargs.get("validate", None): + raise NotImplementedError("validate is not supported for semi-join.") + + super().__init__(*args, **kwargs) + if self.left_index or self.right_index: + raise NotImplementedError( + "left_index or right_index are not supported for semi-join." + ) + elif self.indicator: + raise NotImplementedError("indicator is not supported for semi-join.") + elif self.sort: + raise NotImplementedError( + "sort is not supported for semi-join. Sort your DataFrame afterwards." 
+ ) + + def _maybe_add_join_keys( + self, + result: DataFrame, + left_indexer: npt.NDArray[np.intp] | None, + right_indexer: npt.NDArray[np.intp] | None, + ) -> None: + return + + def _maybe_restore_index_levels(self, result: DataFrame) -> None: + return + + def _reindex_and_concat( + self, + join_index: Index, + left_indexer: npt.NDArray[np.intp] | None, + right_indexer: npt.NDArray[np.intp] | None, + ) -> DataFrame: + left = self.left[:] + + if left_indexer is not None and not is_range_indexer(left_indexer, len(left)): + lmgr = left._mgr.take(left_indexer, axis=1, verify=False) + left = left._constructor_from_mgr(lmgr, axes=lmgr.axes) + return left + + class _OrderedMerge(_MergeOperation): _merge_type = "ordered_merge" @@ -2470,7 +2511,7 @@ def _factorize_keys( lk = ensure_int64(lk.codes) rk = ensure_int64(rk.codes) - elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: + elif how != "leftsemi" and isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( isinstance(lk.dtype, StringDtype) and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"] @@ -2560,7 +2601,7 @@ def _factorize_keys( lk_data, rk_data = lk, rk # type: ignore[assignment] lk_mask, rk_mask = None, None - hash_join_available = how == "inner" and not sort and lk.dtype.kind in "iufb" + hash_join_available = how == "inner" and not sort if hash_join_available: rlab = rizer.factorize(rk_data, mask=rk_mask) if rizer.get_count() == len(rlab): @@ -2568,6 +2609,10 @@ def _factorize_keys( return lidx, ridx, -1 else: llab = rizer.factorize(lk_data, mask=lk_mask) + elif how == "leftsemi": + # populate hashtable for right and then do a hash join + rizer.factorize(rk_data, mask=rk_mask) + return rizer.hash_inner_join(lk_data, lk_mask)[1], None, -1 else: llab = rizer.factorize(lk_data, mask=lk_mask) rlab = rizer.factorize(rk_data, mask=rk_mask) diff --git a/pandas/tests/reshape/merge/test_semi.py b/pandas/tests/reshape/merge/test_semi.py 
new file mode 100644 index 0000000000000..92b265fd75d13 --- /dev/null +++ b/pandas/tests/reshape/merge/test_semi.py @@ -0,0 +1,57 @@ +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize( + "vals_left, vals_right", + [ + ([1, 2, 3], [1, 2]), + (["a", "b", "c"], ["a", "b"]), + pytest.param( + pd.Series(["a", "b", "c"], dtype="string[pyarrow]"), + pd.Series(["a", "b"], dtype="string[pyarrow]"), + marks=td.skip_if_no("pyarrow"), + ), + ], +) +def test_leftsemi(vals_left, vals_right): + left = pd.DataFrame({"a": vals_left, "b": [1, 2, 3]}) + right = pd.DataFrame({"a": vals_right, "c": 1}) + expected = pd.DataFrame({"a": vals_right, "b": [1, 2]}) + result = left.merge(right, how="leftsemi") + tm.assert_frame_equal(result, expected) + + right = pd.DataFrame({"d": vals_right, "c": 1}) + result = left.merge(right, how="leftsemi", left_on="a", right_on="d") + tm.assert_frame_equal(result, expected) + + right = pd.DataFrame({"d": vals_right, "c": 1}) + result = left.merge(right, how="leftsemi", left_on=["a", "b"], right_on=["d", "c"]) + tm.assert_frame_equal(result, expected.head(1)) + + +def test_leftsemi_invalid(): + left = pd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + right = pd.DataFrame({"a": [1, 2], "c": 1}) + + msg = "left_index or right_index are not supported for semi-join." + with pytest.raises(NotImplementedError, match=msg): + left.merge(right, how="leftsemi", left_index=True, right_on="a") + with pytest.raises(NotImplementedError, match=msg): + left.merge(right, how="leftsemi", right_index=True, left_on="a") + + msg = "validate is not supported for semi-join." + with pytest.raises(NotImplementedError, match=msg): + left.merge(right, how="leftsemi", validate="one_to_one") + + msg = "indicator is not supported for semi-join." 
+ with pytest.raises(NotImplementedError, match=msg): + left.merge(right, how="leftsemi", indicator=True) + + msg = "sort is not supported for semi-join. Sort your DataFrame afterwards." + with pytest.raises(NotImplementedError, match=msg): + left.merge(right, how="leftsemi", sort=True) From 267e29debe84505d588402f6f480672b7629dd4a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 23 Mar 2024 21:13:23 -0500 Subject: [PATCH 02/11] Docs --- doc/source/user_guide/merging.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index f7e7d1dd24317..3578dba6fa591 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -407,7 +407,7 @@ either the left or right tables, the values in the joined table will be ``right``, ``RIGHT OUTER JOIN``, Use keys from right frame only ``outer``, ``FULL OUTER JOIN``, Use union of keys from both frames ``inner``, ``INNER JOIN``, Use intersection of keys from both frames - ``leftsemi``, ``SEMIJOIN``, Filter rows on left based on occurrences in right. + ``leftsemi``, ``LEFT SEMI JOIN``, Filter rows on left based on occurrences in right. ``cross``, ``CROSS JOIN``, Create the cartesian product of rows of both frames .. 
ipython:: python From 58ea8457f872080a5f5963d0fa50c66b2defe345 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 23 Mar 2024 21:40:35 -0500 Subject: [PATCH 03/11] Fixup --- pandas/tests/reshape/merge/test_semi.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/tests/reshape/merge/test_semi.py b/pandas/tests/reshape/merge/test_semi.py index 92b265fd75d13..6f54f32d62121 100644 --- a/pandas/tests/reshape/merge/test_semi.py +++ b/pandas/tests/reshape/merge/test_semi.py @@ -7,18 +7,21 @@ @pytest.mark.parametrize( - "vals_left, vals_right", + "vals_left, vals_right, dtype", [ - ([1, 2, 3], [1, 2]), - (["a", "b", "c"], ["a", "b"]), + ([1, 2, 3], [1, 2], "int64"), + (["a", "b", "c"], ["a", "b"], "object"), pytest.param( - pd.Series(["a", "b", "c"], dtype="string[pyarrow]"), - pd.Series(["a", "b"], dtype="string[pyarrow]"), + ["a", "b", "c"], + ["a", "b"], + "string[pyarrow]", marks=td.skip_if_no("pyarrow"), ), ], ) -def test_leftsemi(vals_left, vals_right): +def test_leftsemi(vals_left, vals_right, dtype): + vals_left = pd.Series(vals_left, dtype=dtype) + vals_right = pd.Series(vals_right, dtype=dtype) left = pd.DataFrame({"a": vals_left, "b": [1, 2, 3]}) right = pd.DataFrame({"a": vals_right, "c": 1}) expected = pd.DataFrame({"a": vals_right, "b": [1, 2]}) From e39bc4c6404d0b01d0f8fc377103a0defb4f9ecf Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 24 Mar 2024 12:05:23 -0500 Subject: [PATCH 04/11] Fixup --- pandas/core/reshape/merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 54f5b6d90c398..f7c515c135863 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1681,7 +1681,7 @@ def get_join_indexers( left_keys: list[ArrayLike], right_keys: list[ArrayLike], sort: bool = False, - how: JoinHow = "inner", + how: 
JoinHow + Literal["leftsemi"] = "inner", ) -> tuple[npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: """ @@ -2612,7 +2612,7 @@ def _factorize_keys( elif how == "leftsemi": # populate hashtable for right and then do a hash join rizer.factorize(rk_data, mask=rk_mask) - return rizer.hash_inner_join(lk_data, lk_mask)[1], None, -1 + return rizer.hash_inner_join(lk_data, lk_mask)[1], None, -1 # type: ignore[return-value] else: llab = rizer.factorize(lk_data, mask=lk_mask) rlab = rizer.factorize(rk_data, mask=rk_mask) From cf7a325984973d96557d3b6622869c3aa06f476b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 2 Nov 2025 11:27:59 +0000 Subject: [PATCH 05/11] Replace --- asv_bench/benchmarks/join_merge.py | 4 ++-- doc/source/user_guide/merging.rst | 2 +- pandas/core/frame.py | 2 +- pandas/core/reshape/merge.py | 10 +++++----- pandas/tests/reshape/merge/test_semi.py | 20 ++++++++++---------- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index d0ac3ff1bbe45..bd122471898cd 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -273,7 +273,7 @@ def time_merge_dataframes_cross(self, sort): merge(self.left.loc[:2000], self.right.loc[:2000], how="cross", sort=sort) def time_merge_semi(self, sort): - merge(self.df, self.df2, on="key1", how="leftsemi") + merge(self.df, self.df2, on="key1", how="left_semi") class MergeEA: @@ -384,7 +384,7 @@ def time_merge(self, units, tz, monotonic): merge(self.left, self.right) def time_merge_semi(self, units, tz, monotonic): - merge(self.left, self.right, how="leftsemi") + merge(self.left, self.right, how="left_semi") class MergeCategoricals: diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index ff461548f017e..dd71cc3d95bfb 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -407,7 +407,7 @@ either the left or right tables, the 
values in the joined table will be ``right``, ``RIGHT OUTER JOIN``, Use keys from right frame only ``outer``, ``FULL OUTER JOIN``, Use union of keys from both frames ``inner``, ``INNER JOIN``, Use intersection of keys from both frames - ``leftsemi``, ``LEFT SEMI JOIN``, Filter rows on left based on occurrences in right. + ``left_semi``, ``LEFT SEMI JOIN``, Filter rows on left based on occurrences in right. ``cross``, ``CROSS JOIN``, Create the cartesian product of rows of both frames .. ipython:: python diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0d8d748377f97..fdba5174cf660 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -338,7 +338,7 @@ join; sort keys lexicographically. * inner: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys. - * leftsemi: Filter for rows in the left that have a match on the right; + * left_semi: Filter for rows in the left that have a match on the right; preserve the order of the left keys. Doesn't support `left_index`, `right_index`, `indicator` or `validate`. 
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 8761f61811787..c15d144533e55 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -385,7 +385,7 @@ def merge( validate=validate, ) else: - klass = _MergeOperation if how != "leftsemi" else _SemiMergeOperation + klass = _MergeOperation if how != "left_semi" else _SemiMergeOperation op = klass( left_df, right_df, @@ -2038,7 +2038,7 @@ def get_join_indexers( left_keys: list[ArrayLike], right_keys: list[ArrayLike], sort: bool = False, - how: JoinHow + Literal["leftsemi"] = "inner", + how: JoinHow | Literal["left_semi"] = "inner", ) -> tuple[npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: """ @@ -2095,7 +2095,7 @@ def get_join_indexers( right = Index(rkey) if ( - how != "leftsemi" + how != "left_semi" and left.is_monotonic_increasing and right.is_monotonic_increasing and (left.is_unique or right.is_unique) @@ -2868,7 +2868,7 @@ def _factorize_keys( lk = ensure_int64(lk.codes) rk = ensure_int64(rk.codes) - elif how != "leftsemi" and isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: + elif how != "left_semi" and isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow" ): @@ -2964,7 +2964,7 @@ def _factorize_keys( return lidx, ridx, -1 else: llab = rizer.factorize(lk_data, mask=lk_mask) - elif how == "leftsemi": + elif how == "left_semi": # populate hashtable for right and then do a hash join rizer.factorize(rk_data, mask=rk_mask) return rizer.hash_inner_join(lk_data, lk_mask)[1], None, -1 # type: ignore[return-value] diff --git a/pandas/tests/reshape/merge/test_semi.py b/pandas/tests/reshape/merge/test_semi.py index 6f54f32d62121..5b3edf96b2d18 100644 --- a/pandas/tests/reshape/merge/test_semi.py +++ b/pandas/tests/reshape/merge/test_semi.py @@ -19,42 +19,42 @@ ), ], ) -def test_leftsemi(vals_left, vals_right, 
dtype): +def test_left_semi(vals_left, vals_right, dtype): vals_left = pd.Series(vals_left, dtype=dtype) vals_right = pd.Series(vals_right, dtype=dtype) left = pd.DataFrame({"a": vals_left, "b": [1, 2, 3]}) right = pd.DataFrame({"a": vals_right, "c": 1}) expected = pd.DataFrame({"a": vals_right, "b": [1, 2]}) - result = left.merge(right, how="leftsemi") + result = left.merge(right, how="left_semi") tm.assert_frame_equal(result, expected) right = pd.DataFrame({"d": vals_right, "c": 1}) - result = left.merge(right, how="leftsemi", left_on="a", right_on="d") + result = left.merge(right, how="left_semi", left_on="a", right_on="d") tm.assert_frame_equal(result, expected) right = pd.DataFrame({"d": vals_right, "c": 1}) - result = left.merge(right, how="leftsemi", left_on=["a", "b"], right_on=["d", "c"]) + result = left.merge(right, how="left_semi", left_on=["a", "b"], right_on=["d", "c"]) tm.assert_frame_equal(result, expected.head(1)) -def test_leftsemi_invalid(): +def test_left_semi_invalid(): left = pd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) right = pd.DataFrame({"a": [1, 2], "c": 1}) msg = "left_index or right_index are not supported for semi-join." with pytest.raises(NotImplementedError, match=msg): - left.merge(right, how="leftsemi", left_index=True, right_on="a") + left.merge(right, how="left_semi", left_index=True, right_on="a") with pytest.raises(NotImplementedError, match=msg): - left.merge(right, how="leftsemi", right_index=True, left_on="a") + left.merge(right, how="left_semi", right_index=True, left_on="a") msg = "validate is not supported for semi-join." with pytest.raises(NotImplementedError, match=msg): - left.merge(right, how="leftsemi", validate="one_to_one") + left.merge(right, how="left_semi", validate="one_to_one") msg = "indicator is not supported for semi-join." 
with pytest.raises(NotImplementedError, match=msg): - left.merge(right, how="leftsemi", indicator=True) + left.merge(right, how="left_semi", indicator=True) msg = "sort is not supported for semi-join. Sort your DataFrame afterwards." with pytest.raises(NotImplementedError, match=msg): - left.merge(right, how="leftsemi", sort=True) + left.merge(right, how="left_semi", sort=True) From 0285efba7fa32d923736d934d5cf7ab57289ea95 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 2 Nov 2025 18:01:06 +0000 Subject: [PATCH 06/11] Finish up --- pandas/core/frame.py | 3 +- pandas/core/reshape/merge.py | 53 +++++++++++-------------- pandas/tests/reshape/merge/test_semi.py | 34 +++++++++++----- 3 files changed, 48 insertions(+), 42 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fdba5174cf660..1cf708809b591 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -339,8 +339,7 @@ * inner: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys. * left_semi: Filter for rows in the left that have a match on the right; - preserve the order of the left keys. Doesn't support `left_index`, `right_index`, - `indicator` or `validate`. + preserve the order of the left keys. .. 
versionadded:: 3.0 * cross: creates the cartesian product from both frames, preserves the order diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index c15d144533e55..73fa8386e1366 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1054,6 +1054,7 @@ def _validate_how( "right", "inner", "outer", + "left_semi", "left_anti", "right_anti", "cross", @@ -1403,7 +1404,11 @@ def _get_join_info( left_ax = self.left.index right_ax = self.right.index - if self.left_index and self.right_index and self.how != "asof": + if ( + self.left_index + and self.right_index + and self.how not in ("asof", "left_semi") + ): join_index, left_indexer, right_indexer = left_ax.join( right_ax, how=self.how, return_indexers=True, sort=self.sort ) @@ -1647,15 +1652,7 @@ def _get_merge_keys( k = cast(Hashable, k) left_keys.append(left._get_label_or_level_values(k)) join_names.append(k) - if isinstance(self.right.index, MultiIndex): - right_keys = [ - lev._values.take(lev_codes) - for lev, lev_codes in zip( - self.right.index.levels, self.right.index.codes - ) - ] - else: - right_keys = [self.right.index._values] + right_keys = self._unpack_index_as_join_key(self.right.index) elif _any(self.right_on): for k in self.right_on: k = extract_array(k, extract_numpy=True) @@ -1669,18 +1666,23 @@ def _get_merge_keys( k = cast(Hashable, k) right_keys.append(right._get_label_or_level_values(k)) join_names.append(k) - if isinstance(self.left.index, MultiIndex): - left_keys = [ - lev._values.take(lev_codes) - for lev, lev_codes in zip( - self.left.index.levels, self.left.index.codes - ) - ] - else: - left_keys = [self.left.index._values] + left_keys = self._unpack_index_as_join_key(self.left.index) + elif self.how == "left_semi": + left_keys = self._unpack_index_as_join_key(self.left.index) + right_keys = self._unpack_index_as_join_key(self.right.index) return left_keys, right_keys, join_names, left_drop, right_drop + def _unpack_index_as_join_key(self, 
index: Index) -> list[ArrayLike]: + if isinstance(index, MultiIndex): + keys = [ + lev._values.take(lev_codes) + for lev, lev_codes in zip(index.levels, index.codes) + ] + else: + keys = [index._values] + return keys + @final def _maybe_coerce_merge_keys(self) -> None: # we have valid merges but we may have to further @@ -2241,15 +2243,8 @@ def _convert_to_multiindex(index: Index) -> MultiIndex: class _SemiMergeOperation(_MergeOperation): def __init__(self, *args, **kwargs): - if kwargs.get("validate", None): - raise NotImplementedError("validate is not supported for semi-join.") - super().__init__(*args, **kwargs) - if self.left_index or self.right_index: - raise NotImplementedError( - "left_index or right_index are not supported for semi-join." - ) - elif self.indicator: + if self.indicator: raise NotImplementedError("indicator is not supported for semi-join.") elif self.sort: raise NotImplementedError( @@ -2273,7 +2268,7 @@ def _reindex_and_concat( left_indexer: npt.NDArray[np.intp] | None, right_indexer: npt.NDArray[np.intp] | None, ) -> DataFrame: - left = self.left[:] + left = self.left if left_indexer is not None and not is_range_indexer(left_indexer, len(left)): lmgr = left._mgr.take(left_indexer, axis=1, verify=False) @@ -2956,7 +2951,7 @@ def _factorize_keys( lk_data, rk_data = lk, rk # type: ignore[assignment] lk_mask, rk_mask = None, None - hash_join_available = how == "inner" and not sort + hash_join_available = how == "inner" and not sort and lk.dtype.kind in "iufbO" if hash_join_available: rlab = rizer.factorize(rk_data, mask=rk_mask) if rizer.get_count() == len(rlab): diff --git a/pandas/tests/reshape/merge/test_semi.py b/pandas/tests/reshape/merge/test_semi.py index 5b3edf96b2d18..b3be4ddd5bb52 100644 --- a/pandas/tests/reshape/merge/test_semi.py +++ b/pandas/tests/reshape/merge/test_semi.py @@ -10,6 +10,8 @@ "vals_left, vals_right, dtype", [ ([1, 2, 3], [1, 2], "int64"), + ([1.5, 2.5, 3.5], [1.5, 2.5], "float64"), + ([True, True, False], [True, 
True], "bool"), (["a", "b", "c"], ["a", "b"], "object"), pytest.param( ["a", "b", "c"], @@ -17,6 +19,12 @@ "string[pyarrow]", marks=td.skip_if_no("pyarrow"), ), + pytest.param( + ["a", "b", "c"], + ["a", "b"], + "str", + marks=td.skip_if_no("pyarrow"), + ), ], ) def test_left_semi(vals_left, vals_right, dtype): @@ -28,6 +36,21 @@ def test_left_semi(vals_left, vals_right, dtype): result = left.merge(right, how="left_semi") tm.assert_frame_equal(result, expected) + result = left.set_index("a").merge( + right.set_index("a"), how="left_semi", left_index=True, right_index=True + ) + tm.assert_frame_equal(result, expected.set_index("a")) + + result = left.set_index("a").merge( + right, how="left_semi", left_index=True, right_on="a" + ) + tm.assert_frame_equal(result, expected.set_index("a")) + + result = left.merge( + right.set_index("a"), how="left_semi", right_index=True, left_on="a" + ) + tm.assert_frame_equal(result, expected) + right = pd.DataFrame({"d": vals_right, "c": 1}) result = left.merge(right, how="left_semi", left_on="a", right_on="d") tm.assert_frame_equal(result, expected) @@ -40,17 +63,6 @@ def test_left_semi(vals_left, vals_right, dtype): def test_left_semi_invalid(): left = pd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) right = pd.DataFrame({"a": [1, 2], "c": 1}) - - msg = "left_index or right_index are not supported for semi-join." - with pytest.raises(NotImplementedError, match=msg): - left.merge(right, how="left_semi", left_index=True, right_on="a") - with pytest.raises(NotImplementedError, match=msg): - left.merge(right, how="left_semi", right_index=True, left_on="a") - - msg = "validate is not supported for semi-join." - with pytest.raises(NotImplementedError, match=msg): - left.merge(right, how="left_semi", validate="one_to_one") - msg = "indicator is not supported for semi-join." 
with pytest.raises(NotImplementedError, match=msg): left.merge(right, how="left_semi", indicator=True) From 3060169ac4facee4457cbf95517a355553fb5aac Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 2 Nov 2025 18:08:56 +0000 Subject: [PATCH 07/11] Finish up --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/tests/reshape/merge/test_semi.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 12f522301e121..9fc8a3ed2ccea 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -167,6 +167,7 @@ Other enhancements - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`) - :func:`pandas.merge`, :meth:`DataFrame.merge` and :meth:`DataFrame.join` now support anti joins (``left_anti`` and ``right_anti``) in the ``how`` parameter (:issue:`42916`) +- :func:`pandas.merge`, :meth:`DataFrame.merge` and :meth:`DataFrame.join` now support left semi joins (``left_semi``) in the ``how`` parameter (:issue:`62961`) - :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`) - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) - :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`). 
diff --git a/pandas/tests/reshape/merge/test_semi.py b/pandas/tests/reshape/merge/test_semi.py index b3be4ddd5bb52..39bca11b5174f 100644 --- a/pandas/tests/reshape/merge/test_semi.py +++ b/pandas/tests/reshape/merge/test_semi.py @@ -36,6 +36,12 @@ def test_left_semi(vals_left, vals_right, dtype): result = left.merge(right, how="left_semi") tm.assert_frame_equal(result, expected) + result = left.join(right.set_index("a"), how="left_semi", on="a") + tm.assert_frame_equal(result, expected) + + result = left.set_index("a").join(right.set_index("a"), how="left_semi") + tm.assert_frame_equal(result, expected.set_index("a")) + result = left.set_index("a").merge( right.set_index("a"), how="left_semi", left_index=True, right_index=True ) From 3929b17eb14e536b782dee7a060c326b18ff7a3c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 2 Nov 2025 18:28:07 +0000 Subject: [PATCH 08/11] Fixup --- pandas/core/frame.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1cf708809b591..5d898a971c6ba 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -304,7 +304,17 @@ or number (0, 1).""", } -_merge_doc = """ +_how = { + "left", + "right", + "outer", + "inner", + "left_semi", + "cross", + "left_anti", + "right_anti", +} +_merge_doc = f""" Merge DataFrame or named Series objects with a database-style join. A named Series object is treated as a DataFrame with a single named column. @@ -325,8 +335,7 @@ ----------%s right : DataFrame or named Series Object to merge with. -how : {'left', 'right', 'outer', 'inner', 'left_semi', - 'cross', 'left_anti', 'right_anti'}, +how : {_how}, default 'inner' Type of merge to be performed. @@ -426,9 +435,11 @@ Examples -------- ->>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], +>>> df1 = pd.DataFrame({ + "lkey": ['foo', 'bar', 'baz', 'foo'], ... 
'value': [1, 2, 3, 5]}) ->>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'], +>>> df2 = pd.DataFrame({ + "rkey": ['foo', 'bar', 'baz', 'foo'], ... 'value': [5, 6, 7, 8]}) >>> df1 lkey value @@ -477,8 +488,8 @@ ValueError: columns overlap but no suffix specified: Index(['value'], dtype='object') ->>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) ->>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) +>>> df1 = pd.DataFrame({"a": ['foo', 'bar'], 'b': [1, 2]}) +>>> df2 = pd.DataFrame({"a": ['foo', 'baz'], 'c': [3, 4]}) >>> df1 a b 0 foo 1 @@ -497,8 +508,8 @@ 0 foo 1 3.0 1 bar 2 NaN ->>> df1 = pd.DataFrame({'left': ['foo', 'bar']}) ->>> df2 = pd.DataFrame({'right': [7, 8]}) +>>> df1 = pd.DataFrame({"left": ['foo', 'bar']}) +>>> df2 = pd.DataFrame({"right": [7, 8]}) >>> df1 left 0 foo From 94593d4e1eb0f1a3061e13861db07b973ca64063 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 2 Nov 2025 18:45:02 +0000 Subject: [PATCH 09/11] Revert "Fixup" This reverts commit 3929b17eb14e536b782dee7a060c326b18ff7a3c. --- pandas/core/frame.py | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5d898a971c6ba..1cf708809b591 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -304,17 +304,7 @@ or number (0, 1).""", } -_how = { - "left", - "right", - "outer", - "inner", - "left_semi", - "cross", - "left_anti", - "right_anti", -} -_merge_doc = f""" +_merge_doc = """ Merge DataFrame or named Series objects with a database-style join. A named Series object is treated as a DataFrame with a single named column. @@ -335,7 +325,8 @@ ----------%s right : DataFrame or named Series Object to merge with. -how : {_how}, +how : {'left', 'right', 'outer', 'inner', 'left_semi', + 'cross', 'left_anti', 'right_anti'}, default 'inner' Type of merge to be performed. 
@@ -435,11 +426,9 @@ Examples -------- ->>> df1 = pd.DataFrame({ - "lkey": ['foo', 'bar', 'baz', 'foo'], +>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], ... 'value': [1, 2, 3, 5]}) ->>> df2 = pd.DataFrame({ - "rkey": ['foo', 'bar', 'baz', 'foo'], +>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'], ... 'value': [5, 6, 7, 8]}) >>> df1 lkey value @@ -488,8 +477,8 @@ ValueError: columns overlap but no suffix specified: Index(['value'], dtype='object') ->>> df1 = pd.DataFrame({"a": ['foo', 'bar'], 'b': [1, 2]}) ->>> df2 = pd.DataFrame({"a": ['foo', 'baz'], 'c': [3, 4]}) +>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) +>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) >>> df1 a b 0 foo 1 @@ -508,8 +497,8 @@ 0 foo 1 3.0 1 bar 2 NaN ->>> df1 = pd.DataFrame({"left": ['foo', 'bar']}) ->>> df2 = pd.DataFrame({"right": [7, 8]}) +>>> df1 = pd.DataFrame({'left': ['foo', 'bar']}) +>>> df2 = pd.DataFrame({'right': [7, 8]}) >>> df1 left 0 foo From f2613c3030284cad797e83aa526b497f799146d5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 2 Nov 2025 18:54:33 +0000 Subject: [PATCH 10/11] Fixup --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1cf708809b591..19531b2550987 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -325,8 +325,8 @@ ----------%s right : DataFrame or named Series Object to merge with. -how : {'left', 'right', 'outer', 'inner', 'left_semi', - 'cross', 'left_anti', 'right_anti'}, +how : {'left', 'right', 'outer', 'inner', 'left_semi', 'cross', 'left_anti', +'right_anti'}, default 'inner' Type of merge to be performed. 
From 0bed787a245f3bd4a3f51fc86373f4effe091e8f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 2 Nov 2025 20:31:55 +0000 Subject: [PATCH 11/11] Add docs --- doc/source/user_guide/merging.rst | 12 ++++++++++++ pandas/core/frame.py | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index dd71cc3d95bfb..220ae61b782b7 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -473,6 +473,18 @@ either the left or right tables, the values in the joined table will be p.plot([left, right], result, labels=["left", "right"], vertical=False); plt.close("all"); +.. ipython:: python + + result = pd.merge(left, right, how="left_semi", on=["key1", "key2"]) + result + +.. ipython:: python + :suppress: + + @savefig merging_merge_on_key_left_semi.png + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); + .. ipython:: python result = pd.merge(left, right, how="cross") diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 19531b2550987..8ae2aeef6ffca 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -339,7 +339,7 @@ * inner: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys. * left_semi: Filter for rows in the left that have a match on the right; - preserve the order of the left keys. + preserve the order of the left keys, similar to a SQL left semi join. .. versionadded:: 3.0 * cross: creates the cartesian product from both frames, preserves the order