Code Sample, a copy-pastable example if possible
df.to_hdf('../data/base.h5', 'all')
/work/.pylib/3/pandas/core/generic.py in to_hdf(self, path_or_buf, key, **kwargs)
2529 from pandas.io import pytables
2530
-> 2531 pytables.to_hdf(path_or_buf, key, self, **kwargs)
2532
2533 def to_msgpack(self, path_or_buf=None, encoding="utf-8", **kwargs):
/work/.pylib/3/pandas/io/pytables.py in to_hdf(path_or_buf, key, value, mode, complevel, complib, append, **kwargs)
276 path_or_buf, mode=mode, complevel=complevel, complib=complib
277 ) as store:
--> 278 f(store)
279 else:
280 f(path_or_buf)
/work/.pylib/3/pandas/io/pytables.py in <lambda>(store)
269 f = lambda store: store.append(key, value, **kwargs)
270 else:
--> 271 f = lambda store: store.put(key, value, **kwargs)
272
273 path_or_buf = _stringify_path(path_or_buf)
/work/.pylib/3/pandas/io/pytables.py in put(self, key, value, format, append, **kwargs)
957 format = get_option("io.hdf.default_format") or "fixed"
958 kwargs = self._validate_format(format, kwargs)
--> 959 self._write_to_group(key, value, append=append, **kwargs)
960
961 def remove(self, key, where=None, start=None, stop=None):
/work/.pylib/3/pandas/io/pytables.py in _write_to_group(self, key, value, format, index, append, complib, encoding, **kwargs)
1523
1524 # write the object
-> 1525 s.write(obj=value, append=append, complib=complib, **kwargs)
1526
1527 if s.is_table and index:
/work/.pylib/3/pandas/io/pytables.py in write(self, obj, **kwargs)
3230 blk_items = data.items.take(blk.mgr_locs)
3231 self.write_array(
-> 3232 "block{idx}_values".format(idx=i), blk.values, items=blk_items
3233 )
3234 self.write_index("block{idx}_items".format(idx=i), blk_items)
/work/.pylib/3/pandas/io/pytables.py in write_array(self, key, value, items)
2997
2998 vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
-> 2999 vlarr.append(value)
3000 else:
3001 if empty_array:
/usr/local/anaconda3/lib/python3.6/site-packages/tables/vlarray.py in append(self, sequence)
545 nparr = None
546
--> 547 self._append(nparr, nobjects)
548 self.nrows += 1
549
tables/hdf5extension.pyx in tables.hdf5extension.VLArray._append()
OverflowError: value too large to convert to int
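Since df itself is not shown above, here is a hedged, copy-pastable stand-in (column names and dtypes are a subset of the df.info() output posted below; the random values are assumptions, and building this needs several GB of RAM):

import numpy as np
import pandas as pd

n = 52_724_767  # row count from df.info()
df = pd.DataFrame({
    'country_id': np.random.choice(['aa', 'bb', 'cc'], n).astype(object),
    'item_id': np.random.randint(0, 10**6, size=n, dtype='uint32'),
    'item_price': np.random.rand(n),
})
df.to_hdf('../data/base.h5', 'all')  # raises OverflowError as above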
Output of pd.show_versions()
INSTALLED VERSIONS
commit : None
python : 3.6.7.final.0
python-bits : 64
OS : Linux
OS-release : 3.10.0-693.5.2.el7.x86_64
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : zh_CN.utf8
LANG : None
LOCALE : zh_CN.UTF-8
pandas : 0.25.0
numpy : 1.17.0
pytz : 2019.2
dateutil : 2.8.0
pip : 18.1
setuptools : 41.0.1
Cython : 0.29.10
pytest : 5.0.1
hypothesis : None
sphinx : 2.1.2
blosc : None
feather : 0.4.0
xlsxwriter : 1.1.8
lxml.etree : 4.2.5
html5lib : 1.0.1
pymysql : None
psycopg2 : None
jinja2 : 2.10
IPython : 7.2.0
pandas_datareader: None
bs4 : 4.7.1
bottleneck : 1.2.1
fastparquet : None
gcsfs : None
lxml.etree : 4.2.5
matplotlib : 3.1.1
numexpr : 2.6.9
odfpy : None
openpyxl : 2.6.1
pandas_gbq : None
pyarrow : 0.14.1
pytables : None
s3fs : None
scipy : 1.1.0
sqlalchemy : 1.3.5
tables : 3.5.1
xarray : None
xlrd : 1.2.0
xlwt : 1.3.0
xlsxwriter : 1.1.8
Comment From: RainFung
This is the output of my df.info():
<class 'pandas.core.frame.DataFrame'>
Int64Index: 52724767 entries, 0 to 52724766
Data columns (total 16 columns):
country_id        object
buyer_admin_id    uint32
item_id           uint32
log_time          object
irank             uint32
buy_flag          uint8
is_train          uint8
time_rank         uint16
time_flag_rank    uint16
item_flag_rank    uint32
item_all_rank     uint32
date              object
day               object
cate_id           float64
store_id          float64
item_price        float64
dtypes: float64(3), object(4), uint16(2), uint32(5), uint8(2)
memory usage: 4.4+ GB
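If the object columns are what trips this, one hedged workaround sketch (untested at this scale) is to cast them to category before writing, so PyTables does not have to store one huge pickled object array. The fixed format cannot store categoricals, hence format='table':

# the four object columns from the df.info() above
for col in ['country_id', 'log_time', 'date', 'day']:
    df[col] = df[col].astype('category')
df.to_hdf('../data/base.h5', 'all', format='table')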
Comment From: TomAugspurger
That exception seems to come from pytables. Is it a bug / limitation there? Or are we using pytables incorrectly?
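A hedged way to test that directly, bypassing pandas and mirroring the two calls visible in the traceback (write_array creates a VLArray with an ObjectAtom and appends the whole object array as one row; the file and node names below are made up):

import numpy as np
import tables

with tables.open_file('vlarray_test.h5', mode='w') as h5:
    vlarr = h5.create_vlarray(h5.root, 'values', tables.ObjectAtom())
    # one very large object sequence, pickled into a single VLArray row,
    # mirrors what pandas does for an object-dtype block; longer strings
    # may be needed to actually cross the limit
    vlarr.append(np.array(['x'] * 52_724_767, dtype=object))

If this raises the same OverflowError, the limit is on the PyTables side.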
Comment From: RainFung
It passes when I reduce the amount of data:
df[:10000000].to_hdf('../data/base.h5', 'all')
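A hedged follow-up sketch along the same lines: write the full frame in slices through the appendable table format, so no single PyTables call serializes all ~52M object values at once (the chunk size just reuses the slice that worked above):

chunksize = 10_000_000
with pd.HDFStore('../data/base.h5', mode='w') as store:
    for start in range(0, len(df), chunksize):
        # min_itemsize may be needed if later chunks hold longer strings
        store.append('all', df.iloc[start:start + chunksize])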
Comment From: TomAugspurger
That makes sense given the error.
Comment From: gokceneraslan
The same happens with the feather and parquet formats too, right?
Comment From: courtarro
I can confirm that the same issue happens when attempting to run to_parquet using fastparquet. For large datasets, the write fails with the same error.
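For completeness, the two parquet engine spellings in question; whether pyarrow handles this size any better is an assumption, not something verified in this thread:

df.to_parquet('../data/base.parquet', engine='fastparquet')  # fails as reported
df.to_parquet('../data/base.parquet', engine='pyarrow')      # untested alternative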