Skip to content

BUG: pd.merge fails with numpy.intc on Windows #52451

Closed
@hoxbro

Description

@hoxbro

Pandas version checks

  • I have checked that this issue has not already been reported.

  • I have confirmed this bug exists on the latest version of pandas.

  • I have confirmed this bug exists on the main branch of pandas.

Reproducible Example

import pandas as pd
import numpy as np
from scipy.spatial import Delaunay

n_verts = 10
pts = np.random.randint(1, n_verts, (n_verts, 2))
tris = Delaunay(pts)

A = pd.DataFrame(tris.simplices)
B = pd.DataFrame(pts)
pd.merge(A, B, left_on=[0], right_on=[0])

Issue Description

The example raises a KeyError on Windows with pandas 2.0.

The example works if I convert the simplices with astype(int): A = pd.DataFrame(tris.simplices.astype(int))

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[26], line 3
      1 A = pd.DataFrame(tris.simplices)
      2 B = pd.DataFrame(pts)
----> 3 pd.merge(A, B, left_on=[0], right_on=[0])

File ~\AppData\Local\mambaforge\envs\tmp\Lib\site-packages\pandas\core\reshape\merge.py:156, in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
    125 @Substitution("\nleft : DataFrame or named Series")
    126 @Appender(_merge_doc, indents=0)
    127 def merge(
   (...)
    140     validate: str | None = None,
    141 ) -> DataFrame:
    142     op = _MergeOperation(
    143         left,
    144         right,
   (...)
    154         validate=validate,
    155     )
--> 156     return op.get_result(copy=copy)

File ~\AppData\Local\mambaforge\envs\tmp\Lib\site-packages\pandas\core\reshape\merge.py:803, in _MergeOperation.get_result(self, copy)
    800 if self.indicator:
    801     self.left, self.right = self._indicator_pre_merge(self.left, self.right)
--> 803 join_index, left_indexer, right_indexer = self._get_join_info()
    805 result = self._reindex_and_concat(
    806     join_index, left_indexer, right_indexer, copy=copy
    807 )
    808 result = result.__finalize__(self, method=self._merge_type)

File ~\AppData\Local\mambaforge\envs\tmp\Lib\site-packages\pandas\core\reshape\merge.py:1051, in _MergeOperation._get_join_info(self)
   1047     join_index, right_indexer, left_indexer = _left_join_on_index(
   1048         right_ax, left_ax, self.right_join_keys, sort=self.sort
   1049     )
   1050 else:
-> 1051     (left_indexer, right_indexer) = self._get_join_indexers()
   1053     if self.right_index:
   1054         if len(self.left) > 0:

File ~\AppData\Local\mambaforge\envs\tmp\Lib\site-packages\pandas\core\reshape\merge.py:1024, in _MergeOperation._get_join_indexers(self)
   1022 def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
   1023     """return the join indexers"""
-> 1024     return get_join_indexers(
   1025         self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
   1026     )

File ~\AppData\Local\mambaforge\envs\tmp\Lib\site-packages\pandas\core\reshape\merge.py:1645, in get_join_indexers(left_keys, right_keys, sort, how, **kwargs)
   1640 # get left & right join labels and num. of levels at each location
   1641 mapped = (
   1642     _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
   1643     for n in range(len(left_keys))
   1644 )
-> 1645 zipped = zip(*mapped)
   1646 llab, rlab, shape = (list(x) for x in zipped)
   1648 # get flat i8 keys from label lists

File ~\AppData\Local\mambaforge\envs\tmp\Lib\site-packages\pandas\core\reshape\merge.py:1642, in <genexpr>(.0)
   1638         return _get_no_sort_one_missing_indexer(left_n, False)
   1640 # get left & right join labels and num. of levels at each location
   1641 mapped = (
-> 1642     _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
   1643     for n in range(len(left_keys))
   1644 )
   1645 zipped = zip(*mapped)
   1646 llab, rlab, shape = (list(x) for x in zipped)

File ~\AppData\Local\mambaforge\envs\tmp\Lib\site-packages\pandas\core\reshape\merge.py:2382, in _factorize_keys(lk, rk, sort, how)
   2378         # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute
   2379         # "_values_for_factorize"
   2380         rk, _ = rk._values_for_factorize()  # type: ignore[union-attr]
-> 2382 klass, lk, rk = _convert_arrays_and_get_rizer_klass(lk, rk)
   2384 rizer = klass(max(len(lk), len(rk)))
   2386 if isinstance(lk, BaseMaskedArray):

File ~\AppData\Local\mambaforge\envs\tmp\Lib\site-packages\pandas\core\reshape\merge.py:2449, in _convert_arrays_and_get_rizer_klass(lk, rk)
   2447         klass = _factorizers[lk.dtype.type]  # type: ignore[index]
   2448     else:
-> 2449         klass = _factorizers[lk.dtype.type]
   2451 else:
   2452     klass = libhashtable.ObjectFactorizer

KeyError: <class 'numpy.intc'>

Expected Behavior

No KeyError is raised and the merge succeeds.

Installed Versions

INSTALLED VERSIONS
------------------
commit           : 478d340667831908b5b4bf09a2787a11a14560c9
python           : 3.11.2.final.0
python-bits      : 64
OS               : Windows
OS-release       : 10
Version          : 10.0.19045
machine          : AMD64
processor        : Intel64 Family 6 Model 140 Stepping 1, GenuineIntel
byteorder        : little
LC_ALL           : None
LANG             : None
LOCALE           : English_United Kingdom.1252

pandas           : 2.0.0
numpy            : 1.24.2
pytz             : 2023.3
dateutil         : 2.8.2
setuptools       : 67.6.1
pip              : 23.0.1
Cython           : None
pytest           : None
hypothesis       : None
sphinx           : None
blosc            : None
feather          : None
xlsxwriter       : None
lxml.etree       : None
html5lib         : None
pymysql          : None
psycopg2         : None
jinja2           : 3.1.2
IPython          : 8.12.0
pandas_datareader: None
bs4              : 4.12.0
bottleneck       : None
brotli           : 
fastparquet      : None
fsspec           : None
gcsfs            : None
matplotlib       : 3.7.1
numba            : None
numexpr          : None
odfpy            : None
openpyxl         : None
pandas_gbq       : None
pyarrow          : None
pyreadstat       : None
pyxlsb           : None
s3fs             : None
scipy            : 1.10.1
snappy           : None
sqlalchemy       : None
tables           : None
tabulate         : None
xarray           : None
xlrd             : None
zstandard        : None
tzdata           : 2023.3
qtpy             : None
pyqt5            : None

Metadata

Metadata

Assignees

Labels

Bug · Regression (Functionality that used to work in a prior pandas version) · Reshaping (Concat, Merge/Join, Stack/Unstack, Explode)

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions