Pandas version checks

  • [X] I have checked that this issue has not already been reported.

  • [X] I have confirmed this bug exists on the latest version of pandas.

  • [X] I have confirmed this bug exists on the main branch of pandas.

Reproducible Example

import pandas as pd

index = pd.DatetimeIndex(['2022-01-01 00:00:00', '2022-01-01 00:00:01',
               '2022-01-01 00:00:02', '2022-01-01 00:00:03',
               '2022-01-01 01:00:01', '2022-01-01 02:00:02',
               '2022-01-01 03:00:03', '2032-01-01 01:00:01',
               '2042-01-01 01:00:01', '2032-01-01 01:00:01'],
              dtype='datetime64[ns]', name='TIME', freq=None)
df = pd.DataFrame(index=index, data=list(range(10)))

print(df.first('1H'))

Issue Description

Calling df.first('1H') on the DataFrame above raises KeyError: Timestamp('2022-01-01 01:00:00').

stack trace
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexes/base.py:3448, in Index.get_loc(self, key)
   3447 try:
-> 3448     return self._engine.get_loc(casted_key)
   3449 except KeyError as err:

File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/_libs/index.pyx:533, in pandas._libs.index.DatetimeEngine.get_loc()

File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/_libs/index.pyx:562, in pandas._libs.index.DatetimeEngine.get_loc()

File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/_libs/index.pyx:202, in pandas._libs.index.IndexEngine._get_loc_duplicates()

File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/_libs/index.pyx:210, in pandas._libs.index.IndexEngine._maybe_get_bool_indexer()

File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/_libs/index.pyx:106, in pandas._libs.index._unpack_bool_indexer()

KeyError: 1640998800000000000

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexes/datetimes.py:608, in DatetimeIndex.get_loc(self, key)
    607 try:
--> 608     return Index.get_loc(self, key)
    609 except KeyError as err:

File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexes/base.py:3450, in Index.get_loc(self, key)
   3449 except KeyError as err:
-> 3450     raise KeyError(key) from err
   3451 except TypeError:
   3452     # If we have a listlike key, _check_indexing_error will raise
   3453     #  InvalidIndexError. Otherwise we fall through and re-raise
   3454     #  the TypeError.

KeyError: Timestamp('2022-01-01 01:00:00')

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In [13], line 11
      3 index = pd.DatetimeIndex(['2022-01-01 00:00:00', '2022-01-01 00:00:01',
      4                '2022-01-01 00:00:02', '2022-01-01 00:00:03',
      5                '2022-01-01 01:00:01', '2022-01-01 02:00:02',
      6                '2022-01-01 03:00:03', '2032-01-01 01:00:01',
      7                '2042-01-01 01:00:01', '2032-01-01 01:00:01'],
      8               dtype='datetime64[ns]', name='TIME', freq=None)
      9 df = pd.DataFrame(index=index, data=list(range(10)))
---> 11 print(df.first('1H'))

File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/generic.py:8735, in NDFrame.first(self, offset)
   8732     end = self.index.searchsorted(end_date, side="left")
   8733     return self.iloc[:end]
-> 8735 return self.loc[:end]

File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexing.py:1099, in _LocationIndexer.__getitem__(self, key)
   1096 axis = self.axis or 0
   1098 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1099 return self._getitem_axis(maybe_callable, axis=axis)

File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexing.py:1319, in _LocIndexer._getitem_axis(self, key, axis)
   1317 if isinstance(key, slice):
   1318     self._validate_key(key, axis)
-> 1319     return self._get_slice_axis(key, axis=axis)
   1320 elif com.is_bool_indexer(key):
   1321     return self._getbool_axis(key, axis=axis)

File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexing.py:1353, in _LocIndexer._get_slice_axis(self, slice_obj, axis)
   1350     return obj.copy(deep=False)
   1352 labels = obj._get_axis(axis)
-> 1353 indexer = labels.slice_indexer(slice_obj.start, slice_obj.stop, slice_obj.step)
   1355 if isinstance(indexer, slice):
   1356     return self.obj._slice(indexer, axis=axis)

File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexes/datetimes.py:661, in DatetimeIndex.slice_indexer(self, start, end, step)
    653 # GH#33146 if start and end are combinations of str and None and Index is not
    654 # monotonic, we can not use Index.slice_indexer because it does not honor the
    655 # actual elements, is only searching for start and end
    656 if (
    657     check_str_or_none(start)
    658     or check_str_or_none(end)
    659     or self.is_monotonic_increasing
    660 ):
--> 661     return Index.slice_indexer(self, start, end, step)
    663 mask = np.array(True)
    664 raise_mask = np.array(True)

File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexes/base.py:6109, in Index.slice_indexer(self, start, end, step)
   6065 def slice_indexer(
   6066     self,
   6067     start: Hashable | None = None,
   6068     end: Hashable | None = None,
   6069     step: int | None = None,
   6070 ) -> slice:
   6071     """
   6072     Compute the slice indexer for input labels and step.
   6073
   (...)
   6107     slice(1, 3, None)
   6108     """
-> 6109     start_slice, end_slice = self.slice_locs(start, end, step=step)
   6111     # return a slice
   6112     if not is_scalar(start_slice):

File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexes/base.py:6310, in Index.slice_locs(self, start, end, step)
   6308 end_slice = None
   6309 if end is not None:
-> 6310     end_slice = self.get_slice_bound(end, "right")
   6311 if end_slice is None:
   6312     end_slice = len(self)

File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexes/base.py:6229, in Index.get_slice_bound(self, label, side)
   6226         return self._searchsorted_monotonic(label, side)
   6227     except ValueError:
   6228         # raise the original KeyError
-> 6229         raise err
   6231 if isinstance(slc, np.ndarray):
   6232     # get_loc may return a boolean array, which
   6233     # is OK as long as they are representable by a slice.
   6234     assert is_bool_dtype(slc.dtype)

File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexes/base.py:6223, in Index.get_slice_bound(self, label, side)
   6221 # we need to look up the label
   6222 try:
-> 6223     slc = self.get_loc(label)
   6224 except KeyError as err:
   6225     try:

File ~/opt/anaconda3/envs/pandas-dev/lib/python3.10/site-packages/pandas/core/indexes/datetimes.py:610, in DatetimeIndex.get_loc(self, key)
    608     return Index.get_loc(self, key)
    609 except KeyError as err:
--> 610     raise KeyError(orig_key) from err

KeyError: Timestamp('2022-01-01 01:00:00')

Expected Behavior

df.first('1H') should return the first 4 rows, all of which fall within the earliest hour, "2022-01-01 00".

Installed Versions

INSTALLED VERSIONS
------------------
commit : 7cb7592523380133f552e258f272a5694e37957a
python : 3.10.4.final.0
python-bits : 64
OS : Darwin
OS-release : 21.5.0
Version : Darwin Kernel Version 21.5.0: Tue Apr 26 21:08:22 PDT 2022; root:xnu-8020.121.3~4/RELEASE_X86_64
machine : x86_64
processor : i386
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8
pandas : 2.0.0.dev0+1147.g7cb7592523
numpy : 1.23.3
pytz : 2022.2.1
dateutil : 2.8.2
setuptools : 63.4.1
pip : 22.1.2
Cython : 0.29.32
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : None
IPython : 8.5.0
pandas_datareader: None
bs4 : None
bottleneck : None
brotli : None
fastparquet : None
fsspec : None
gcsfs : None
matplotlib : None
numba : None
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : None
pyreadstat : None
pyxlsb : None
s3fs : None
scipy : None
snappy : None
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
zstandard : None
tzdata : None
qtpy : None
pyqt5 : None

Comment From: saucoide

Looks like it happens because the index is not sorted: the last entry is earlier than the one before it (and repeats an earlier value), as marked below:

import pandas as pd

index = pd.DatetimeIndex(['2022-01-01 00:00:00', '2022-01-01 00:00:01',
               '2022-01-01 00:00:02', '2022-01-01 00:00:03',
               '2022-01-01 01:00:01', '2022-01-01 02:00:02',
               '2022-01-01 03:00:03', '2032-01-01 01:00:01',    # <--
               '2042-01-01 01:00:01', '2032-01-01 01:00:01'],  # <-- same value
              dtype='datetime64[ns]', name='TIME', freq=None)
df = pd.DataFrame(index=index, data=list(range(10)))

print(df.first('1H'))

Comment From: MarcoGorelli

Agreed - the docs now specify:

> For a DataFrame with a sorted DatetimeIndex

Closing then, but thanks for the report.

Comment From: MarcoGorelli

(and thanks @saucoide for triaging!)