BRIEF EXPLANATION
Copying a column using df.apply() will fail if that column contains lists and the first item in that column is a list of length exactly two.
CODE SAMPLE
import pandas as pd
def list_to_list(df):
return df['list']
check = pd.DataFrame({'id': [0,1], 'list':[['1','2'], ['1', '2']]})
check['new_list'] = check.apply(list_to_list, axis = 1)
Running the code above results in the following error message:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-48-02b4161cb08e> in <module>()
1 check = pd.DataFrame({'id': [0,1], 'list':[['1','2'], ['1', '2']]})
----> 2 check['new_list'] = check.apply(list_to_list, axis = 1)
/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in __setitem__(self, key, value)
2297 else:
2298 # set column
-> 2299 self._set_item(key, value)
2300
2301 def _setitem_slice(self, key, value):
/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in _set_item(self, key, value)
2365 self._ensure_valid_index(value)
2366 value = self._sanitize_column(key, value)
-> 2367 NDFrame._set_item(self, key, value)
2368
2369 # check if we are modifying a copy
/anaconda/lib/python2.7/site-packages/pandas/core/generic.pyc in _set_item(self, key, value)
1206
1207 def _set_item(self, key, value):
-> 1208 self._data.set(key, value)
1209 self._clear_item_cache()
1210
/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in set(self, item, value, check)
3332 except KeyError:
3333 # This item wasn't present, just insert at end
-> 3334 self.insert(len(self.items), item, value)
3335 return
3336
/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in insert(self, loc, item, value, allow_duplicates)
3434 block = make_block(values=value,
3435 ndim=self.ndim,
-> 3436 placement=slice(loc, loc+1))
3437
3438 for blkno, count in _fast_count_smallints(self._blknos[loc:]):
/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in make_block(values, placement, klass, ndim, dtype, fastpath)
2452
2453 return klass(values, ndim=ndim, fastpath=fastpath,
-> 2454 placement=placement)
2455
2456
/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in __init__(self, values, ndim, fastpath, placement, **kwargs)
1626 super(ObjectBlock, self).__init__(values, ndim=ndim,
1627 fastpath=fastpath,
-> 1628 placement=placement, **kwargs)
1629
1630 @property
/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in __init__(self, values, placement, ndim, fastpath)
85 raise ValueError('Wrong number of items passed %d,'
86 ' placement implies %d' % (
---> 87 len(self.values), len(self.mgr_locs)))
88
89 @property
ValueError: Wrong number of items passed 2, placement implies 1
check1 = pd.DataFrame({'id': [0,1], 'list':[['1','2'], ['1', '2', '3']]})
check1['new_list'] = check1.apply(list_to_list, axis = 1)
Running the code above results in the following error message:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-49-571835e7c0cb> in <module>()
1 check1 = pd.DataFrame({'id': [0,1], 'list':[['1','2'], ['1', '2', '3']]})
----> 2 check1['new_list'] = check1.apply(list_to_list, axis = 1)
/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
3970 if reduce is None:
3971 reduce = True
-> 3972 return self._apply_standard(f, axis, reduce=reduce)
3973 else:
3974 return self._apply_broadcast(f, axis)
/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in _apply_standard(self, func, axis, ignore_failures, reduce)
4079 index = None
4080
-> 4081 result = self._constructor(data=results, index=index)
4082 result.columns = res_index
4083
/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in __init__(self, data, index, columns, dtype, copy)
224 dtype=dtype, copy=copy)
225 elif isinstance(data, dict):
--> 226 mgr = self._init_dict(data, index, columns, dtype=dtype)
227 elif isinstance(data, ma.MaskedArray):
228 import numpy.ma.mrecords as mrecords
/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in _init_dict(self, data, index, columns, dtype)
361
362 return _arrays_to_mgr(arrays, data_names, index, columns,
--> 363 dtype=dtype)
364
365 def _init_ndarray(self, values, index, columns, dtype=None,
/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
5166 axes = [_ensure_index(columns), _ensure_index(index)]
5167
-> 5168 return create_block_manager_from_arrays(arrays, arr_names, axes)
5169
5170
/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in create_block_manager_from_arrays(arrays, names, axes)
3914 return mgr
3915 except (ValueError) as e:
-> 3916 construction_error(len(arrays), arrays[0].shape, axes, e)
3917
3918
/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in construction_error(tot_items, block_shape, axes, e)
3878 implied = tuple(map(int, [len(ax) for ax in axes]))
3879 if passed == implied and e is not None:
-> 3880 raise e
3881 raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
3882 passed,implied))
ValueError: could not broadcast input array from shape (3) into shape (2)
Increasing the number of items in the second list increases the first "shape" value reported in the error message.
check2 = pd.DataFrame({'id': [0,1], 'list':[['1','2'], []]})
check2['new_list'] = check2.apply(list_to_list, axis = 1)
Running the code above results in the following error message:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-50-a4f4e0227671> in <module>()
1 check2 = pd.DataFrame({'id': [0,1], 'list':[['1','2'], []]})
----> 2 check2['new_list'] = check2.apply(list_to_list, axis = 1)
/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
3970 if reduce is None:
3971 reduce = True
-> 3972 return self._apply_standard(f, axis, reduce=reduce)
3973 else:
3974 return self._apply_broadcast(f, axis)
/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in _apply_standard(self, func, axis, ignore_failures, reduce)
4079 index = None
4080
-> 4081 result = self._constructor(data=results, index=index)
4082 result.columns = res_index
4083
/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in __init__(self, data, index, columns, dtype, copy)
224 dtype=dtype, copy=copy)
225 elif isinstance(data, dict):
--> 226 mgr = self._init_dict(data, index, columns, dtype=dtype)
227 elif isinstance(data, ma.MaskedArray):
228 import numpy.ma.mrecords as mrecords
/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in _init_dict(self, data, index, columns, dtype)
361
362 return _arrays_to_mgr(arrays, data_names, index, columns,
--> 363 dtype=dtype)
364
365 def _init_ndarray(self, values, index, columns, dtype=None,
/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
5166 axes = [_ensure_index(columns), _ensure_index(index)]
5167
-> 5168 return create_block_manager_from_arrays(arrays, arr_names, axes)
5169
5170
/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in create_block_manager_from_arrays(arrays, names, axes)
3914 return mgr
3915 except (ValueError) as e:
-> 3916 construction_error(len(arrays), arrays[0].shape, axes, e)
3917
3918
/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in construction_error(tot_items, block_shape, axes, e)
3878 implied = tuple(map(int, [len(ax) for ax in axes]))
3879 if passed == implied and e is not None:
-> 3880 raise e
3881 raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
3882 passed,implied))
ValueError: Shape of passed values is (2, 0), indices imply (2, 2)
EXPECTED OUTPUT
In each case, I would expect the 'new_list' column to be created without errors, and with lists of the specified length.
INSTALLED VERSIONS
commit: None python: 2.7.11.final.0 python-bits: 64 OS: Linux OS-release: 2.6.32-573.22.1.el6.x86_64 machine: x86_64 processor: x86_64 byteorder: little LC_ALL: None LANG: en_US.UTF-8
pandas: 0.17.1 nose: 1.3.7 pip: 8.0.3 setuptools: 20.1.1 Cython: 0.23.4 numpy: 1.10.4 scipy: 0.17.0 statsmodels: 0.6.1 IPython: 4.0.3 sphinx: 1.3.5 patsy: 0.4.0 dateutil: 2.4.2 pytz: 2015.7 blosc: None bottleneck: 1.0.0 tables: 3.2.2 numexpr: 2.4.6 matplotlib: 1.5.1 openpyxl: 2.3.2 xlrd: 0.9.4 xlwt: 1.0.0 xlsxwriter: 0.8.4 lxml: 3.5.0 bs4: 4.4.1 html5lib: None httplib2: None apiclient: None sqlalchemy: 1.0.11 pymysql: 0.6.7.None psycopg2: None Jinja2: None
Comment From: TomAugspurger
There was a similar issue about this somewhat recently, can't find it ATM though.
Basically .apply tries to be smart about the return type, but in this case it's wrong. It's the fact that len(check.loc[0, 'list']) == check.shape[1] (the first returned list is exactly as long as the frame has columns) that's throwing things off, so you get
In [35]: check.apply(list_to_list, axis = 1)
Out[35]:
id list
0 1 2
1 1 2
which is a DataFrame rather than a Series, so it can't be assigned to a single column.
You're probably better off writing a vectorized function and calling it on check['list'] directly if possible, or doing the iteration yourself with df.itertuples, something like:
In [49]: def list_to_list(row):
...: return row.list
...:
...:
In [50]: pd.Series(list_to_list(v) for v in check.itertuples())
Out[50]:
0 [1, 2]
1 [1, 2]
dtype: object
Let me know if you have any specific usage questions on your actual example.