Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions asv_bench/benchmarks/join_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,9 @@ def time_merge_dataframe_empty_left(self, sort):
def time_merge_dataframes_cross(self, sort):
    """Time a cross join of the ``.loc[:2000]`` slices of ``left`` and ``right``."""
    # Both operands are label-sliced with ``.loc[:2000]`` before the join.
    left_slice = self.left.loc[:2000]
    right_slice = self.right.loc[:2000]
    merge(left_slice, right_slice, sort=sort, how="cross")

def time_merge_semi(self, sort):
    """Time a left semi join of ``self.df`` with ``self.df2`` on ``key1``."""
    # ``sort`` is part of the benchmark signature but is not forwarded to
    # ``merge`` in this benchmark.
    merge(self.df, self.df2, how="left_semi", on="key1")


class MergeEA:
params = [
Expand Down Expand Up @@ -380,6 +383,9 @@ def setup(self, units, tz, monotonic):
def time_merge(self, units, tz, monotonic):
    """Time ``merge`` of ``self.left`` and ``self.right`` with default options."""
    # ``units``, ``tz`` and ``monotonic`` are not read in this body.
    merge(self.left, self.right)

def time_merge_semi(self, units, tz, monotonic):
    """Time a ``how="left_semi"`` merge of ``self.left`` and ``self.right``."""
    # ``units``, ``tz`` and ``monotonic`` are not read in this body.
    merge(self.left, self.right, how="left_semi")


class MergeCategoricals:
def setup(self):
Expand Down
13 changes: 13 additions & 0 deletions doc/source/user_guide/merging.rst
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,7 @@ either the left or right tables, the values in the joined table will be
``right``, ``RIGHT OUTER JOIN``, Use keys from right frame only
``outer``, ``FULL OUTER JOIN``, Use union of keys from both frames
``inner``, ``INNER JOIN``, Use intersection of keys from both frames
``left_semi``, ``LEFT SEMI JOIN``, Keep only rows of left frame whose keys appear in right frame
``cross``, ``CROSS JOIN``, Create the cartesian product of rows of both frames

.. ipython:: python
Expand Down Expand Up @@ -472,6 +473,18 @@ either the left or right tables, the values in the joined table will be
p.plot([left, right], result, labels=["left", "right"], vertical=False);
plt.close("all");

.. ipython:: python

result = pd.merge(left, right, how="left_semi", on=["key1", "key2"])
result

.. ipython:: python
:suppress:

@savefig merging_merge_on_key_left_semi.png
p.plot([left, right], result, labels=["left", "right"], vertical=False);
plt.close("all");

.. ipython:: python

result = pd.merge(left, right, how="cross")
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ Other enhancements
- :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
- :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`)
- :func:`pandas.merge`, :meth:`DataFrame.merge` and :meth:`DataFrame.join` now support anti joins (``left_anti`` and ``right_anti``) in the ``how`` parameter (:issue:`42916`)
- :func:`pandas.merge`, :meth:`DataFrame.merge` and :meth:`DataFrame.join` now support left-semi joins in the ``how`` parameter (:issue:`62961`)
- :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`)
- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`)
- :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`).
Expand Down
3 changes: 3 additions & 0 deletions pandas/_libs/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,6 @@ cdef class ObjectFactorizer(Factorizer):
self.count, na_sentinel, na_value)
self.count = len(self.uniques)
return labels

def hash_inner_join(self, values, mask=None):
    # Thin wrapper: forwards ``values`` and ``mask`` unchanged to the
    # underlying table's ``hash_inner_join`` and returns its result.
    return self.table.hash_inner_join(values, mask)
27 changes: 27 additions & 0 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -1391,6 +1391,33 @@ cdef class PyObjectHashTable(HashTable):
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
self.table.vals[k] = i

@cython.wraparound(False)
@cython.boundscheck(False)
def hash_inner_join(self, ndarray[object] values, object mask = None) -> tuple[ndarray, ndarray]:
    """
    Probe the table with every element of ``values`` and collect the hits.

    Parameters
    ----------
    values : ndarray[object]
        Keys to look up. ``hash`` is called on each key before the lookup,
        so an unhashable key raises.
    mask : object, default None
        Not implemented; the argument is ignored.

    Returns
    -------
    tuple[ndarray, ndarray]
        ``(self_locs, locs)``: for each key that was found, the value
        stored for it in this table and its position in ``values``.
    """
    cdef:
        Py_ssize_t pos, size = len(values)
        object key
        khiter_t bucket
        Int64Vector found_positions = Int64Vector()
        Int64Vector stored_values = Int64Vector()
        Int64VectorData *found_data
        Int64VectorData *stored_data

    found_data = &found_positions.data
    stored_data = &stored_values.data

    for pos in range(size):
        key = values[pos]
        # Fail on unhashable keys before probing the table.
        hash(key)

        bucket = kh_get_pymap(self.table, <PyObject*>key)
        if bucket == self.table.n_buckets:
            # Key is not present in the table; nothing to record.
            continue
        append_data_int64(found_data, pos)
        append_data_int64(stored_data, self.table.vals[bucket])

    return stored_values.to_array(), found_positions.to_array()

def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
# -> np.ndarray[np.intp]
# mask not yet implemented
Expand Down
2 changes: 1 addition & 1 deletion pandas/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,7 +446,7 @@ def closed(self) -> bool:

# merge
MergeHow: TypeAlias = Literal[
"left", "right", "inner", "outer", "cross", "left_anti", "right_anti"
"left", "right", "inner", "outer", "cross", "left_semi", "left_anti", "right_anti"
]
MergeValidate: TypeAlias = Literal[
"one_to_one",
Expand Down
7 changes: 6 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,8 @@
----------%s
right : DataFrame or named Series
Object to merge with.
how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'},
how : {'left', 'right', 'outer', 'inner', 'left_semi', 'cross', 'left_anti',
'right_anti'},
default 'inner'
Type of merge to be performed.

Expand All @@ -337,6 +338,10 @@
join; sort keys lexicographically.
* inner: use intersection of keys from both frames, similar to a SQL inner
join; preserve the order of the left keys.
* left_semi: Filter for rows in the left that have a match on the right;
preserve the order of the left keys, similar to SQL left semi join.

.. versionadded:: 3.0
* cross: creates the cartesian product from both frames, preserves the order
of the left keys.
* left_anti: use only keys from left frame that are not in right frame, similar
Expand Down
94 changes: 67 additions & 27 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,8 @@ def merge(
validate=validate,
)
else:
op = _MergeOperation(
klass = _MergeOperation if how != "left_semi" else _SemiMergeOperation
op = klass(
left_df,
right_df,
how=how,
Expand Down Expand Up @@ -1053,6 +1054,7 @@ def _validate_how(
"right",
"inner",
"outer",
"left_semi",
"left_anti",
"right_anti",
"cross",
Expand Down Expand Up @@ -1080,7 +1082,6 @@ def _validate_tolerance(self, left_join_keys: list[ArrayLike]) -> None:
# Overridden by AsOfMerge
pass

@final
def _reindex_and_concat(
self,
join_index: Index,
Expand Down Expand Up @@ -1225,7 +1226,6 @@ def _indicator_post_merge(self, result: DataFrame) -> DataFrame:
result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1)
return result

@final
def _maybe_restore_index_levels(self, result: DataFrame) -> None:
"""
Restore index levels specified as `on` parameters
Expand Down Expand Up @@ -1269,7 +1269,6 @@ def _maybe_restore_index_levels(self, result: DataFrame) -> None:
if names_to_restore:
result.set_index(names_to_restore, inplace=True)

@final
def _maybe_add_join_keys(
self,
result: DataFrame,
Expand Down Expand Up @@ -1405,7 +1404,11 @@ def _get_join_info(
left_ax = self.left.index
right_ax = self.right.index

if self.left_index and self.right_index and self.how != "asof":
if (
self.left_index
and self.right_index
and self.how not in ("asof", "left_semi")
):
join_index, left_indexer, right_indexer = left_ax.join(
right_ax, how=self.how, return_indexers=True, sort=self.sort
)
Expand Down Expand Up @@ -1649,15 +1652,7 @@ def _get_merge_keys(
k = cast(Hashable, k)
left_keys.append(left._get_label_or_level_values(k))
join_names.append(k)
if isinstance(self.right.index, MultiIndex):
right_keys = [
lev._values.take(lev_codes)
for lev, lev_codes in zip(
self.right.index.levels, self.right.index.codes
)
]
else:
right_keys = [self.right.index._values]
right_keys = self._unpack_index_as_join_key(self.right.index)
elif _any(self.right_on):
for k in self.right_on:
k = extract_array(k, extract_numpy=True)
Expand All @@ -1671,18 +1666,23 @@ def _get_merge_keys(
k = cast(Hashable, k)
right_keys.append(right._get_label_or_level_values(k))
join_names.append(k)
if isinstance(self.left.index, MultiIndex):
left_keys = [
lev._values.take(lev_codes)
for lev, lev_codes in zip(
self.left.index.levels, self.left.index.codes
)
]
else:
left_keys = [self.left.index._values]
left_keys = self._unpack_index_as_join_key(self.left.index)
elif self.how == "left_semi":
left_keys = self._unpack_index_as_join_key(self.left.index)
right_keys = self._unpack_index_as_join_key(self.right.index)

return left_keys, right_keys, join_names, left_drop, right_drop

def _unpack_index_as_join_key(self, index: Index) -> list[ArrayLike]:
if isinstance(index, MultiIndex):
keys = [
lev._values.take(lev_codes)
for lev, lev_codes in zip(index.levels, index.codes)
]
else:
keys = [index._values]
return keys

@final
def _maybe_coerce_merge_keys(self) -> None:
# we have valid merges but we may have to further
Expand Down Expand Up @@ -2040,7 +2040,7 @@ def get_join_indexers(
left_keys: list[ArrayLike],
right_keys: list[ArrayLike],
sort: bool = False,
how: JoinHow = "inner",
how: JoinHow + Literal["left_semi"] = "inner",
) -> tuple[npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
"""

Expand Down Expand Up @@ -2097,7 +2097,8 @@ def get_join_indexers(
right = Index(rkey)

if (
left.is_monotonic_increasing
how != "left_semi"
and left.is_monotonic_increasing
and right.is_monotonic_increasing
and (left.is_unique or right.is_unique)
):
Expand Down Expand Up @@ -2240,6 +2241,41 @@ def _convert_to_multiindex(index: Index) -> MultiIndex:
return join_levels, join_codes, join_names


class _SemiMergeOperation(_MergeOperation):
    """
    Merge operation used for ``how="left_semi"``.

    The result is built from the left frame only: rows are selected with the
    left indexer and no columns from the right frame are appended. The hooks
    that add join keys or restore index levels are therefore no-ops.

    Raises
    ------
    NotImplementedError
        If ``indicator`` or ``sort`` is requested.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if self.indicator:
            raise NotImplementedError("indicator is not supported for semi-join.")
        if self.sort:
            raise NotImplementedError(
                "sort is not supported for semi-join. Sort your DataFrame afterwards."
            )

    def _maybe_add_join_keys(
        self,
        result: DataFrame,
        left_indexer: npt.NDArray[np.intp] | None,
        right_indexer: npt.NDArray[np.intp] | None,
    ) -> None:
        # The result only holds left rows, so there are no keys to add.
        return

    def _maybe_restore_index_levels(self, result: DataFrame) -> None:
        # Nothing to restore: the result is a row selection of the left frame.
        return

    def _reindex_and_concat(
        self,
        join_index: Index,
        left_indexer: npt.NDArray[np.intp] | None,
        right_indexer: npt.NDArray[np.intp] | None,
    ) -> DataFrame:
        """
        Return the rows of the left frame selected by ``left_indexer``.

        ``join_index`` and ``right_indexer`` are accepted for signature
        compatibility with the parent class and are not used.
        """
        # Take a shallow copy so that, when every left row is kept (indexer
        # is None or a full range), the caller still receives a new object
        # instead of its own input frame.
        left = self.left[:]

        if left_indexer is not None and not is_range_indexer(left_indexer, len(left)):
            lmgr = left._mgr.take(left_indexer, axis=1, verify=False)
            left = left._constructor_from_mgr(lmgr, axes=lmgr.axes)
        return left


class _OrderedMerge(_MergeOperation):
_merge_type = "ordered_merge"

Expand Down Expand Up @@ -2827,7 +2863,7 @@ def _factorize_keys(
lk = ensure_int64(lk.codes)
rk = ensure_int64(rk.codes)

elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype:
elif how != "left_semi" and isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype:
if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or (
isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow"
):
Expand Down Expand Up @@ -2915,14 +2951,18 @@ def _factorize_keys(
lk_data, rk_data = lk, rk # type: ignore[assignment]
lk_mask, rk_mask = None, None

hash_join_available = how == "inner" and not sort and lk.dtype.kind in "iufb"
hash_join_available = how == "inner" and not sort and lk.dtype.kind in "iufbO"
if hash_join_available:
rlab = rizer.factorize(rk_data, mask=rk_mask)
if rizer.get_count() == len(rlab):
ridx, lidx = rizer.hash_inner_join(lk_data, lk_mask)
return lidx, ridx, -1
else:
llab = rizer.factorize(lk_data, mask=lk_mask)
elif how == "left_semi":
# populate hashtable for right and then do a hash join
rizer.factorize(rk_data, mask=rk_mask)
return rizer.hash_inner_join(lk_data, lk_mask)[1], None, -1 # type: ignore[return-value]
else:
llab = rizer.factorize(lk_data, mask=lk_mask)
rlab = rizer.factorize(rk_data, mask=rk_mask)
Expand Down
78 changes: 78 additions & 0 deletions pandas/tests/reshape/merge/test_semi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import pytest

import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm


@pytest.mark.parametrize(
    "vals_left, vals_right, dtype",
    [
        ([1, 2, 3], [1, 2], "int64"),
        ([1.5, 2.5, 3.5], [1.5, 2.5], "float64"),
        ([True, True, False], [True, True], "bool"),
        (["a", "b", "c"], ["a", "b"], "object"),
        pytest.param(
            ["a", "b", "c"],
            ["a", "b"],
            "string[pyarrow]",
            marks=td.skip_if_no("pyarrow"),
        ),
        pytest.param(
            ["a", "b", "c"],
            ["a", "b"],
            "str",
            marks=td.skip_if_no("pyarrow"),
        ),
    ],
)
def test_left_semi(vals_left, vals_right, dtype):
    # A left semi join keeps the left rows whose key occurs in right and
    # returns left columns only; check every way of specifying the keys.
    left_keys = pd.Series(vals_left, dtype=dtype)
    right_keys = pd.Series(vals_right, dtype=dtype)
    left = pd.DataFrame({"a": left_keys, "b": [1, 2, 3]})
    right = pd.DataFrame({"a": right_keys, "c": 1})
    expected = pd.DataFrame({"a": right_keys, "b": [1, 2]})

    left_indexed = left.set_index("a")
    right_indexed = right.set_index("a")
    expected_indexed = expected.set_index("a")

    # shared column name, keys inferred
    tm.assert_frame_equal(left.merge(right, how="left_semi"), expected)

    # DataFrame.join: column-on-index, then index-on-index
    tm.assert_frame_equal(
        left.join(right_indexed, how="left_semi", on="a"), expected
    )
    tm.assert_frame_equal(
        left_indexed.join(right_indexed, how="left_semi"), expected_indexed
    )

    # merge on both indexes
    both_index = left_indexed.merge(
        right_indexed, how="left_semi", left_index=True, right_index=True
    )
    tm.assert_frame_equal(both_index, expected_indexed)

    # left index against right column
    index_to_column = left_indexed.merge(
        right, how="left_semi", left_index=True, right_on="a"
    )
    tm.assert_frame_equal(index_to_column, expected_indexed)

    # left column against right index
    column_to_index = left.merge(
        right_indexed, how="left_semi", right_index=True, left_on="a"
    )
    tm.assert_frame_equal(column_to_index, expected)

    # differently named key columns, single and multiple keys
    right_renamed = pd.DataFrame({"d": right_keys, "c": 1})
    single_key = left.merge(
        right_renamed, how="left_semi", left_on="a", right_on="d"
    )
    tm.assert_frame_equal(single_key, expected)

    multi_key = left.merge(
        right_renamed, how="left_semi", left_on=["a", "b"], right_on=["d", "c"]
    )
    tm.assert_frame_equal(multi_key, expected.head(1))


def test_left_semi_invalid():
    # ``indicator=True`` and ``sort=True`` must each raise NotImplementedError
    # when combined with ``how="left_semi"``.
    left = pd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
    right = pd.DataFrame({"a": [1, 2], "c": 1})

    with pytest.raises(
        NotImplementedError, match="indicator is not supported for semi-join."
    ):
        left.merge(right, how="left_semi", indicator=True)

    with pytest.raises(
        NotImplementedError,
        match="sort is not supported for semi-join. Sort your DataFrame afterwards.",
    ):
        left.merge(right, how="left_semi", sort=True)
Loading