Pandas version checks
- [X] I have checked that this issue has not already been reported.
- [X] I have confirmed this bug exists on the latest version of pandas.
- [X] I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
import pandas as pd
index = pd.DatetimeIndex(['2022-01-01 00:00:00', '2022-01-01 00:00:01',
'2022-01-01 00:00:02', '2022-01-01 00:00:03',
'2022-01-01 01:00:01', '2022-01-01 02:00:02',
'2022-01-01 03:00:03', '2032-01-01 01:00:01',
'2042-01-01 01:00:01', '2032-01-01 01:00:01'],
dtype='datetime64[ns]', name='TIME', freq=None)
df = pd.DataFrame(index=index, data=list(range(10)))
print(df.first('1H'))
Issue Description
Calling `df.first('1H')` on my df gives me `KeyError: Timestamp('2022-01-01 01:00:00')`.
stack trace
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexes/base.py:3448, in Index.get_loc(self, key)
3447 try:
-> 3448 return self._engine.get_loc(casted_key)
3449 except KeyError as err:
File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/_libs/index.pyx:533, in pandas._libs.index.DatetimeEngine.get_loc()
File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/_libs/index.pyx:562, in pandas._libs.index.DatetimeEngine.get_loc()
File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/_libs/index.pyx:202, in pandas._libs.index.IndexEngine._get_loc_duplicates()
File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/_libs/index.pyx:210, in pandas._libs.index.IndexEngine._maybe_get_bool_indexer()
File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/_libs/index.pyx:106, in pandas._libs.index._unpack_bool_indexer()
KeyError: 1640998800000000000
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexes/datetimes.py:608, in DatetimeIndex.get_loc(self, key)
607 try:
--> 608 return Index.get_loc(self, key)
609 except KeyError as err:
File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexes/base.py:3450, in Index.get_loc(self, key)
3449 except KeyError as err:
-> 3450 raise KeyError(key) from err
3451 except TypeError:
3452 # If we have a listlike key, _check_indexing_error will raise
3453 # InvalidIndexError. Otherwise we fall through and re-raise
3454 # the TypeError.
KeyError: Timestamp('2022-01-01 01:00:00')
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
Cell In [13], line 11
3 index = pd.DatetimeIndex(['2022-01-01 00:00:00', '2022-01-01 00:00:01',
4 '2022-01-01 00:00:02', '2022-01-01 00:00:03',
5 '2022-01-01 01:00:01', '2022-01-01 02:00:02',
6 '2022-01-01 03:00:03', '2032-01-01 01:00:01',
7 '2042-01-01 01:00:01', '2032-01-01 01:00:01'],
8 dtype='datetime64[ns]', name='TIME', freq=None)
9 df = pd.DataFrame(index=index, data=list(range(10)))
---> 11 print(df.first('1H'))
File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/generic.py:8735, in NDFrame.first(self, offset)
8732 end = self.index.searchsorted(end_date, side="left")
8733 return self.iloc[:end]
-> 8735 return self.loc[:end]
File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexing.py:1099, in _LocationIndexer.__getitem__(self, key)
1096 axis = self.axis or 0
1098 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1099 return self._getitem_axis(maybe_callable, axis=axis)
File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexing.py:1319, in _LocIndexer._getitem_axis(self, key, axis)
1317 if isinstance(key, slice):
1318 self._validate_key(key, axis)
-> 1319 return self._get_slice_axis(key, axis=axis)
1320 elif com.is_bool_indexer(key):
1321 return self._getbool_axis(key, axis=axis)
File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexing.py:1353, in _LocIndexer._get_slice_axis(self, slice_obj, axis)
1350 return obj.copy(deep=False)
1352 labels = obj._get_axis(axis)
-> 1353 indexer = labels.slice_indexer(slice_obj.start, slice_obj.stop, slice_obj.step)
1355 if isinstance(indexer, slice):
1356 return self.obj._slice(indexer, axis=axis)
File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexes/datetimes.py:661, in DatetimeIndex.slice_indexer(self, start, end, step)
653 # GH#33146 if start and end are combinations of str and None and Index is not
654 # monotonic, we can not use Index.slice_indexer because it does not honor the
655 # actual elements, is only searching for start and end
656 if (
657 check_str_or_none(start)
658 or check_str_or_none(end)
659 or self.is_monotonic_increasing
660 ):
--> 661 return Index.slice_indexer(self, start, end, step)
663 mask = np.array(True)
664 raise_mask = np.array(True)
File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexes/base.py:6109, in Index.slice_indexer(self, start, end, step)
6065 def slice_indexer(
6066 self,
6067 start: Hashable | None = None,
6068 end: Hashable | None = None,
6069 step: int | None = None,
6070 ) -> slice:
6071 """
6072 Compute the slice indexer for input labels and step.
6073
(...)
6107 slice(1, 3, None)
6108 """
-> 6109 start_slice, end_slice = self.slice_locs(start, end, step=step)
6111 # return a slice
6112 if not is_scalar(start_slice):
File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexes/base.py:6310, in Index.slice_locs(self, start, end, step)
6308 end_slice = None
6309 if end is not None:
-> 6310 end_slice = self.get_slice_bound(end, "right")
6311 if end_slice is None:
6312 end_slice = len(self)
File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexes/base.py:6229, in Index.get_slice_bound(self, label, side)
6226 return self._searchsorted_monotonic(label, side)
6227 except ValueError:
6228 # raise the original KeyError
-> 6229 raise err
6231 if isinstance(slc, np.ndarray):
6232 # get_loc may return a boolean array, which
6233 # is OK as long as they are representable by a slice.
6234 assert is_bool_dtype(slc.dtype)
File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexes/base.py:6223, in Index.get_slice_bound(self, label, side)
6221 # we need to look up the label
6222 try:
-> 6223 slc = self.get_loc(label)
6224 except KeyError as err:
6225 try:
File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexes/datetimes.py:610, in DatetimeIndex.get_loc(self, key)
608 return Index.get_loc(self, key)
609 except KeyError as err:
--> 610 raise KeyError(orig_key) from err
KeyError: Timestamp('2022-01-01 01:00:00')
Expected Behavior
`df.first('1H')` should give me the first 4 rows, all of which fall within the smallest hour, "2022-01-01 00".
Installed Versions
INSTALLED VERSIONS
------------------
commit : 7cb7592523380133f552e258f272a5694e37957a
python : 3.10.4.final.0
python-bits : 64
OS : Darwin
OS-release : 21.5.0
Version : Darwin Kernel Version 21.5.0: Tue Apr 26 21:08:22 PDT 2022; root:xnu-8020.121.3~4/RELEASE_X86_64
machine : x86_64
processor : i386
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8
pandas : 2.0.0.dev0+1147.g7cb7592523
numpy : 1.23.3
pytz : 2022.2.1
dateutil : 2.8.2
setuptools : 63.4.1
pip : 22.1.2
Cython : 0.29.32
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : None
IPython : 8.5.0
pandas_datareader: None
bs4 : None
bottleneck : None
brotli : None
fastparquet : None
fsspec : None
gcsfs : None
matplotlib : None
numba : None
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : None
pyreadstat : None
pyxlsb : None
s3fs : None
scipy : None
snappy : None
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
zstandard : None
tzdata : None
qtpy : None
pyqt5 : None
Comment From: saucoide
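Looks like it happens because the index is not sorted: the last entry ('2032-01-01 01:00:01') comes after '2042-01-01 01:00:01' and duplicates an earlier value: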
import pandas as pd
index = pd.DatetimeIndex(['2022-01-01 00:00:00', '2022-01-01 00:00:01',
'2022-01-01 00:00:02', '2022-01-01 00:00:03',
'2022-01-01 01:00:01', '2022-01-01 02:00:02',
'2022-01-01 03:00:03', '2032-01-01 01:00:01', # <--
'2042-01-01 01:00:01', '2032-01-01 01:00:01'], # <-- same value
dtype='datetime64[ns]', name='TIME', freq=None)
df = pd.DataFrame(index=index, data=list(range(10)))
print(df.first('1H'))
Comment From: MarcoGorelli
Agreed - the docs now specify that this method is "For a DataFrame with a sorted DatetimeIndex".
Closing then, but thanks for the report.
Comment From: MarcoGorelli
(and thanks @saucoide for triaging!)