I am trying to index a DataFrame that has a MultiIndex in which one of the levels is an IntervalIndex.
Example
ii = pd.IntervalIndex.from_arrays([1, 5, 2, 6, 3, 7], [4, 8, 5, 9, 6, 10])
mi = pd.MultiIndex.from_arrays([["aa", "aa", "bb", "bb", "cc", "cc"], ii])
data = [(1, 2), (3, 4), (5, 6), (7, 8), (9, 10), (11, 12)]
df = pd.DataFrame(data, columns=['k', 'l'], index=mi)
df.loc[("bb", 3)]
The above code throws a TypeError, as follows:
TypeError: cannot do label indexing on <class 'pandas.core.indexes.base.Index'> with these indexers [3] of <class 'int'>
I was expecting the above to produce the same result as:
df.loc[("bb")].loc[3]
# k 5
# l 6
# Name: (2, 5], dtype: int64
Is this a bug, or am I not using it properly? (newbie here)
Version Details:
python: Python 3.6.7 | packaged by conda-forge | (default, Nov 21 2018, 03:09:43)
pandas: '0.23.4'
Comment From: jschendel
This is a bug. To my knowledge there hasn't been any work done to ensure that IntervalIndex works within a MultiIndex, nor do I see any tests for such functionality.
It does appear to be working in certain cases though, so there's at least some IntervalIndex-compatible code in place already:
In [2]: mi = pd.MultiIndex.from_product([list('ab'), pd.interval_range(0, 2)])
In [3]: df = pd.DataFrame({'A': list('wxyz'), 'B': range(4)}, index=mi)
In [4]: df
Out[4]:
A B
a (0, 1] w 0
(1, 2] x 1
b (0, 1] y 2
(1, 2] z 3
In [5]: df.loc[('a', 1.1)]
Out[5]:
A x
B 1
Name: (a, (1, 2]), dtype: object
In [6]: df.loc[('a', 2)]
Out[6]:
A x
B 1
Name: (a, (1, 2]), dtype: object
Investigation into the issue and PRs would certainly be welcome!
Comment From: jamesholcombe
Hi all, was this issue ever resolved? I have run into the same problem, but a different error is thrown. If this has not been resolved, I will raise it as a new issue with some example code.
Comment From: timlod
There are still issues with this (pandas 1.4.4). The code snippet from above runs fine, but I found several other issues along the way (and it's really hard to come up with a contrived example that mirrors my real data).
Here I have made 2 similar DataFrames, but they behave differently:
ii = pd.IntervalIndex.from_arrays(
[141104, 151056, 154080, 0],
[146512, 154832, 160000, 160000],
)
mi = pd.MultiIndex.from_arrays([["a", "a", "a", "b"], ii])
data = [[1], [2], [3], [4]]
df = pd.DataFrame(data, columns=["col"], index=mi)
print(df)
df.loc[("a", 152000)]
col
a (141104, 146512] 1
(151056, 154832] 2
(154080, 160000] 3
b (0, 160000] 4
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [4], in <cell line: 9>()
7 df = pd.DataFrame(data, columns=["col"], index=mi)
8 print(df)
----> 9 df.loc[("a", 152000)]
File ~/miniconda3/envs/bm/lib/python3.9/site-packages/pandas/core/indexing.py:961, in _LocationIndexer.__getitem__(self, key)
959 if self._is_scalar_access(key):
960 return self.obj._get_value(*key, takeable=self._takeable)
--> 961 return self._getitem_tuple(key)
962 else:
963 # we by definition only have the 0th axis
964 axis = self.axis or 0
File ~/miniconda3/envs/bm/lib/python3.9/site-packages/pandas/core/indexing.py:1140, in _LocIndexer._getitem_tuple(self, tup)
1138 with suppress(IndexingError):
1139 tup = self._expand_ellipsis(tup)
-> 1140 return self._getitem_lowerdim(tup)
1142 # no multi-index, so validate all of the indexers
1143 tup = self._validate_tuple_indexer(tup)
File ~/miniconda3/envs/bm/lib/python3.9/site-packages/pandas/core/indexing.py:859, in _LocationIndexer._getitem_lowerdim(self, tup)
849 if (
850 isinstance(ax0, MultiIndex)
851 and self.name != "iloc"
(...)
856 # is equivalent.
857 # (see the other place where we call _handle_lowerdim_multi_index_axis0)
858 with suppress(IndexingError):
--> 859 return self._handle_lowerdim_multi_index_axis0(tup)
861 tup = self._validate_key_length(tup)
863 for i, key in enumerate(tup):
File ~/miniconda3/envs/bm/lib/python3.9/site-packages/pandas/core/indexing.py:1160, in _LocIndexer._handle_lowerdim_multi_index_axis0(self, tup)
1157 axis = self.axis or 0
1158 try:
1159 # fast path for series or for tup devoid of slices
-> 1160 return self._get_label(tup, axis=axis)
1162 except KeyError as ek:
1163 # raise KeyError if number of indexers match
1164 # else IndexingError will be raised
1165 if self.ndim < len(tup) <= self.obj.index.nlevels:
File ~/miniconda3/envs/bm/lib/python3.9/site-packages/pandas/core/indexing.py:1153, in _LocIndexer._get_label(self, label, axis)
1151 def _get_label(self, label, axis: int):
1152 # GH#5667 this will fail if the label is not present in the axis.
-> 1153 return self.obj.xs(label, axis=axis)
File ~/miniconda3/envs/bm/lib/python3.9/site-packages/pandas/core/generic.py:3857, in NDFrame.xs(self, key, axis, level, drop_level)
3854 self._consolidate_inplace()
3856 if isinstance(index, MultiIndex):
-> 3857 loc, new_index = index._get_loc_level(key, level=0)
3858 if not drop_level:
3859 if lib.is_integer(loc):
File ~/miniconda3/envs/bm/lib/python3.9/site-packages/pandas/core/indexes/multi.py:3043, in MultiIndex._get_loc_level(self, key, level)
3040 if len(key) == self.nlevels and self.is_unique:
3041 # Complete key in unique index -> standard get_loc
3042 try:
-> 3043 return (self._engine.get_loc(key), None)
3044 except KeyError as err:
3045 raise KeyError(key) from err
File ~/miniconda3/envs/bm/lib/python3.9/site-packages/pandas/_libs/index.pyx:777, in pandas._libs.index.BaseMultiIndexCodesEngine.get_loc()
ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.
Changing the first interval to (0, 160000):
ii2 = pd.IntervalIndex.from_arrays(
[0, 151056, 154080, 0],
[160000, 154832, 160000, 160000],
)
mi2 = pd.MultiIndex.from_arrays([["a", "a", "a", "b"], ii2])
df2 = pd.DataFrame(data, columns=["col"], index=mi2)
df2.loc[("a", 152000)]
col
a (0, 160000] 1
(151056, 154832] 2
Somehow this query now works...
On my real data, I have a query like the latter one (but only supposed to return one value) which gives a KeyError. Then, if I change the interval location, I get this error (I'm not posting the data because it is big, and making it smaller somehow makes the error disappear in a way I can't figure out):
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Input In [22], in <cell line: 1>()
----> 1 l2.loc[("zzya4dDVRLk.wav", 25000), :]
File ~/miniconda3/envs/bm/lib/python3.9/site-packages/pandas/core/indexing.py:961, in _LocationIndexer.__getitem__(self, key)
959 if self._is_scalar_access(key):
960 return self.obj._get_value(*key, takeable=self._takeable)
--> 961 return self._getitem_tuple(key)
962 else:
963 # we by definition only have the 0th axis
964 axis = self.axis or 0
File ~/miniconda3/envs/bm/lib/python3.9/site-packages/pandas/core/indexing.py:1140, in _LocIndexer._getitem_tuple(self, tup)
1138 with suppress(IndexingError):
1139 tup = self._expand_ellipsis(tup)
-> 1140 return self._getitem_lowerdim(tup)
1142 # no multi-index, so validate all of the indexers
1143 tup = self._validate_tuple_indexer(tup)
File ~/miniconda3/envs/bm/lib/python3.9/site-packages/pandas/core/indexing.py:843, in _LocationIndexer._getitem_lowerdim(self, tup)
841 # we may have a nested tuples indexer here
842 if self._is_nested_tuple_indexer(tup):
--> 843 return self._getitem_nested_tuple(tup)
845 # we maybe be using a tuple to represent multiple dimensions here
846 ax0 = self.obj._get_axis(0)
File ~/miniconda3/envs/bm/lib/python3.9/site-packages/pandas/core/indexing.py:942, in _LocationIndexer._getitem_nested_tuple(self, tup)
939 axis -= 1
940 continue
--> 942 obj = getattr(obj, self.name)._getitem_axis(key, axis=axis)
943 axis -= 1
945 # if we have a scalar, we are done
File ~/miniconda3/envs/bm/lib/python3.9/site-packages/pandas/core/indexing.py:1205, in _LocIndexer._getitem_axis(self, key, axis)
1203 # fall thru to straight lookup
1204 self._validate_key(key, axis)
-> 1205 return self._get_label(key, axis=axis)
File ~/miniconda3/envs/bm/lib/python3.9/site-packages/pandas/core/indexing.py:1153, in _LocIndexer._get_label(self, label, axis)
1151 def _get_label(self, label, axis: int):
1152 # GH#5667 this will fail if the label is not present in the axis.
-> 1153 return self.obj.xs(label, axis=axis)
File ~/miniconda3/envs/bm/lib/python3.9/site-packages/pandas/core/generic.py:3857, in NDFrame.xs(self, key, axis, level, drop_level)
3854 self._consolidate_inplace()
3856 if isinstance(index, MultiIndex):
-> 3857 loc, new_index = index._get_loc_level(key, level=0)
3858 if not drop_level:
3859 if lib.is_integer(loc):
File ~/miniconda3/envs/bm/lib/python3.9/site-packages/pandas/core/indexes/multi.py:3052, in MultiIndex._get_loc_level(self, key, level)
3049 pass
3051 # partial selection
-> 3052 indexer = self.get_loc(key)
3053 ilevels = [i for i in range(len(key)) if key[i] != slice(None, None)]
3054 if len(ilevels) == self.nlevels:
File ~/miniconda3/envs/bm/lib/python3.9/site-packages/pandas/core/indexes/multi.py:2922, in MultiIndex.get_loc(self, key, method)
2918 for i, k in enumerate(follow_key, len(lead_key)):
2919 mask = self.codes[i][loc] == self._get_loc_single_level_index(
2920 self.levels[i], k
2921 )
-> 2922 if not mask.all():
2923 loc = loc[mask]
2924 if not len(loc):
AttributeError: 'bool' object has no attribute 'all'
In short, having a `MultiIndex` with `IntervalIndex` appears to currently fail in many different ways.
**Comment From: timlod**
Potentially related: #27456, #46699
**Comment From: phofl**
This works as expected now:
k l
bb (2, 5] 5 6
Comment From: asdinara
take