After discussion on the mailing list, it was decided that the function below should work in pandas.
It currently fails on master (0.11.0.dev-308beb1) with 32-bit Python 2.7 on Windows:
def test_resample_nonnumeric():
import numpy as np
import pandas as pd
dates = pd.date_range('01-Jan-2014','05-Jan-2014', freq='D')
series = pd.TimeSeries(['a','b','c','d','e'], index=dates)
resampled_series = series[[0,1,3,4]].resample('D', fill_method='ffill')
assert (resampled_series.index == dates).all()
assert (resampled_series.values == np.asarray(['a','b','b','d','e'], dtype=object)).all()
https://groups.google.com/forum/?fromgroups=#!topic/pydata/NFA10wTVNu8
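Additional (related) example, where resampling with `how=sum` does weird things (note the garbage value in the last bin, where `NaN` would be expected):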
In [30]: df = pd.DataFrame({
'Buyer': 'Carl Mark Carl Joe Joe Carl'.split(),
'Quantity': [1,3,5,8,9,3],
'Date' : [
DT.datetime(2013,9,1,13,0),
DT.datetime(2013,9,1,13,5),
DT.datetime(2013,10,1,20,0),
DT.datetime(2013,10,3,10,0),
DT.datetime(2013,12,2,12,0),
DT.datetime(2013,12,2,14,0),
]}).set_index(['Date'])
In [31]: df
Out[31]:
Buyer Quantity
Date
2013-09-01 13:00:00 Carl 1
2013-09-01 13:05:00 Mark 3
2013-10-01 20:00:00 Carl 5
2013-10-03 10:00:00 Joe 8
2013-12-02 12:00:00 Joe 9
2013-12-02 14:00:00 Carl 3
In [33]: df['Quantity'].resample('10D',how=sum)
Out[33]:
Date
2013-09-01 13:00:00 4.000000e+00
2013-09-11 13:00:00 NaN
2013-09-21 13:00:00 NaN
2013-10-01 13:00:00 1.300000e+01
2013-10-11 13:00:00 NaN
2013-10-21 13:00:00 NaN
2013-10-31 13:00:00 NaN
2013-11-10 13:00:00 NaN
2013-11-20 13:00:00 NaN
2013-11-30 13:00:00 1.200000e+01
2013-12-10 13:00:00 7.637868e-317
Freq: 10D, dtype: float64
Comment From: dhirschfeld
Since the actual use case which exposed the issue was a TimeSeries of dates, the test below also covers that functionality, in case it takes a different code path.
def test_resample_nonnumeric():
import numpy as np
import pandas as pd
dates = pd.date_range('01-Jan-2014','05-Jan-2014', freq='D')
series = pd.TimeSeries(['a','b','c','d','e'], index=dates)
resampled_series = series[[0,1,3,4]].resample('D', fill_method='ffill')
assert (resampled_series.index == dates).all()
assert (resampled_series.values == np.asarray(['a','b','c','d','e'], dtype=object)).all()
series = pd.TimeSeries(dates, index=dates)
resampled_series = series[[0,1,3,4]].resample('D', fill_method='ffill')
assert (resampled_series.index == dates).all()
assert (resampled_series.values == series[[0,1,1,3,4]]).all()
Comment From: cpcloud
@dhirschfeld What would the `how` argument do in this case? Be ignored?
Comment From: dhirschfeld
The docstring says `how` is for downsampling, whilst `fill_method` is for upsampling.
OT: This has always bothered me - can't one argument suffice? I'm constantly having to check the docstring to find out which argument I'm supposed to be using, depending on whether I happen to be upsampling or downsampling. Also, `reindex` calls the same argument `method`, further adding to the confusion.
In either case, I'd expect that if you do something that's not well defined for the type of your object, it's reasonable to let whatever exception is raised filter through; i.e. if you passed a (theoretical) `linear` argument to `fill_method`, you'd expect a TypeError because you can't divide a string by a numeric value:
In [28]: ('a'+'b')/2
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-28-5523b85b2ca5> in <module>()
----> 1 ('a'+'b')/2
TypeError: unsupported operand type(s) for /: 'str' and 'int'
If it's possible for it to work, it should. Because summing strings is possible, you'd expect `sum` to work, but not `mean`. This in fact seems to be the case:
In [39]: dates = pd.date_range('01-Jan-2014', periods=26, freq='D')
...: series = pd.TimeSeries(map(chr, range(97, 123)), index=dates)
...:
In [40]: series
Out[40]:
2014-01-01 a
2014-01-02 b
2014-01-03 c
2014-01-04 d
2014-01-05 e
2014-01-06 f
2014-01-07 g
2014-01-08 h
2014-01-09 i
2014-01-10 j
2014-01-11 k
2014-01-12 l
2014-01-13 m
2014-01-14 n
2014-01-15 o
2014-01-16 p
2014-01-17 q
2014-01-18 r
2014-01-19 s
2014-01-20 t
2014-01-21 u
2014-01-22 v
2014-01-23 w
2014-01-24 x
2014-01-25 y
2014-01-26 z
Freq: D, dtype: object
In [41]: series.resample('W', how='sum')
Out[41]:
2014-01-05 abcde
2014-01-12 fghijkl
2014-01-19 mnopqrs
2014-01-26 tuvwxyz
Freq: W-SUN, dtype: object
series.resample('W', how='mean')
---------------------------------------------------------------------------
DataError Traceback (most recent call last)
<ipython-input-42-e4ce7d0b1edd> in <module>()
----> 1 series.resample('W', how='mean')
C:\dev\bin\Python27\lib\site-packages\pandas\core\generic.pyc in resample(self, rule, how, axis, fill_method, closed, label, convention, kind, loffset, limit, base)
255 fill_method=fill_method, convention=convention,
256 limit=limit, base=base)
--> 257 return sampler.resample(self)
258
259 def first(self, offset):
C:\dev\bin\Python27\lib\site-packages\pandas\tseries\resample.pyc in resample(self, obj)
81
82 if isinstance(axis, DatetimeIndex):
---> 83 rs = self._resample_timestamps(obj)
84 elif isinstance(axis, PeriodIndex):
85 offset = to_offset(self.freq)
C:\dev\bin\Python27\lib\site-packages\pandas\tseries\resample.pyc in _resample_timestamps(self, obj)
206 if len(grouper.binlabels) < len(axlabels) or self.how is not None:
207 grouped = obj.groupby(grouper, axis=self.axis)
--> 208 result = grouped.aggregate(self._agg_method)
209 else:
210 # upsampling shortcut
C:\dev\bin\Python27\lib\site-packages\pandas\core\groupby.pyc in aggregate(self, func_or_funcs, *args, **kwargs)
1409 """
1410 if isinstance(func_or_funcs, basestring):
-> 1411 return getattr(self, func_or_funcs)(*args, **kwargs)
1412
1413 if hasattr(func_or_funcs, '__iter__'):
C:\dev\bin\Python27\lib\site-packages\pandas\core\groupby.pyc in mean(self)
351 """
352 try:
--> 353 return self._cython_agg_general('mean')
354 except GroupByError:
355 raise
C:\dev\bin\Python27\lib\site-packages\pandas\core\groupby.pyc in _cython_agg_general(self, how, numeric_only)
466
467 if len(output) == 0:
--> 468 raise DataError('No numeric types to aggregate')
469
470 return self._wrap_aggregated_output(output, names)
DataError: No numeric types to aggregate
Stranger still, it seems that `ffill` does work, but only if you don't index the series first:
In [43]: dates = pd.date_range('01-Jan-2014','05-Jan-2014', freq='D')
...: series = pd.TimeSeries(['a','b','c','d','e'], index=dates)
...: series.resample('H', fill_method='ffill')
...:
Out[43]:
2014-01-01 00:00:00 a
2014-01-01 01:00:00 a
2014-01-01 02:00:00 a
2014-01-01 03:00:00 a
2014-01-01 04:00:00 a
2014-01-01 05:00:00 a
2014-01-01 06:00:00 a
2014-01-01 07:00:00 a
2014-01-01 08:00:00 a
2014-01-01 09:00:00 a
2014-01-01 10:00:00 a
2014-01-01 11:00:00 a
2014-01-01 12:00:00 a
2014-01-01 13:00:00 a
2014-01-01 14:00:00 a
...
2014-01-04 10:00:00 d
2014-01-04 11:00:00 d
2014-01-04 12:00:00 d
2014-01-04 13:00:00 d
2014-01-04 14:00:00 d
2014-01-04 15:00:00 d
2014-01-04 16:00:00 d
2014-01-04 17:00:00 d
2014-01-04 18:00:00 d
2014-01-04 19:00:00 d
2014-01-04 20:00:00 d
2014-01-04 21:00:00 d
2014-01-04 22:00:00 d
2014-01-04 23:00:00 d
2014-01-05 00:00:00 e
Freq: H, Length: 97, dtype: object
In [44]: series[[0,1,3,4]].resample('H', fill_method='ffill')
---------------------------------------------------------------------------
DataError Traceback (most recent call last)
<ipython-input-44-6621beea2243> in <module>()
----> 1 series[[0,1,3,4]].resample('H', fill_method='ffill')
C:\dev\bin\Python27\lib\site-packages\pandas\core\generic.pyc in resample(self, rule, how, axis, fill_method, closed, label, convention, kind, loffset, limit, base)
255 fill_method=fill_method, convention=convention,
256 limit=limit, base=base)
--> 257 return sampler.resample(self)
258
259 def first(self, offset):
C:\dev\bin\Python27\lib\site-packages\pandas\tseries\resample.pyc in resample(self, obj)
81
82 if isinstance(axis, DatetimeIndex):
---> 83 rs = self._resample_timestamps(obj)
84 elif isinstance(axis, PeriodIndex):
85 offset = to_offset(self.freq)
C:\dev\bin\Python27\lib\site-packages\pandas\tseries\resample.pyc in _resample_timestamps(self, obj)
221 # Irregular data, have to use groupby
222 grouped = obj.groupby(grouper, axis=self.axis)
--> 223 result = grouped.aggregate(self._agg_method)
224
225 if self.fill_method is not None:
C:\dev\bin\Python27\lib\site-packages\pandas\core\groupby.pyc in aggregate(self, func_or_funcs, *args, **kwargs)
1409 """
1410 if isinstance(func_or_funcs, basestring):
-> 1411 return getattr(self, func_or_funcs)(*args, **kwargs)
1412
1413 if hasattr(func_or_funcs, '__iter__'):
C:\dev\bin\Python27\lib\site-packages\pandas\core\groupby.pyc in mean(self)
351 """
352 try:
--> 353 return self._cython_agg_general('mean')
354 except GroupByError:
355 raise
C:\dev\bin\Python27\lib\site-packages\pandas\core\groupby.pyc in _cython_agg_general(self, how, numeric_only)
466
467 if len(output) == 0:
--> 468 raise DataError('No numeric types to aggregate')
469
470 return self._wrap_aggregated_output(output, names)
DataError: No numeric types to aggregate
In [45]: pd.__version__
Out[45]: '0.11.0'
Comment From: WillAyd
Original example is no longer reproducible - if you have an updated example feel free to reopen!