Code Sample
I am trying to create a DataFrame from a dict whose values are arrays of different lengths. However, I get a ValueError.
In [1]: import numpy as np
In [2]: import pandas as pd
In [3]: x_dict = {"A{}".format(i): np.random.randint(1, 25, data) for i, data in enumerate([5, 10, 15, 20])}
In [4]: x_dict
Out[4]:
{'A0': array([ 5, 7, 17, 10, 24]),
'A1': array([ 4, 8, 11, 7, 9, 1, 23, 22, 20, 15]),
'A2': array([ 9, 5, 20, 2, 21, 12, 21, 8, 1, 10, 5, 21, 7, 9, 5]),
'A3': array([ 2, 1, 24, 21, 24, 16, 6, 9, 20, 7, 24, 11, 23, 15, 3, 22, 5,
21, 1, 20])}
In [5]: x_df = pd.DataFrame.from_dict(x_dict)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-5-e23195a7d765> in <module>()
----> 1 x_df = pd.DataFrame.from_dict(x_dict)
/python2.7/site-packages/pandas/core/frame.pyc in from_dict(cls, data, orient, dtype)
807 raise ValueError('only recognize index or columns for orient')
808
--> 809 return cls(data, index=index, columns=columns, dtype=dtype)
810
811 @deprecate_kwarg(old_arg_name='outtype', new_arg_name='orient')
/python2.7/site-packages/pandas/core/frame.pyc in __init__(self, data, index, columns, dtype, copy)
222 dtype=dtype, copy=copy)
223 elif isinstance(data, dict):
--> 224 mgr = self._init_dict(data, index, columns, dtype=dtype)
225 elif isinstance(data, ma.MaskedArray):
226 import numpy.ma.mrecords as mrecords
/python2.7/site-packages/pandas/core/frame.pyc in _init_dict(self, data, index, columns, dtype)
358 arrays = [data[k] for k in keys]
359
--> 360 return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
361
362 def _init_ndarray(self, values, index, columns, dtype=None, copy=False):
/python2.7/site-packages/pandas/core/frame.pyc in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
5229 # figure out the index, if necessary
5230 if index is None:
-> 5231 index = extract_index(arrays)
5232 else:
5233 index = _ensure_index(index)
/python2.7/site-packages/pandas/core/frame.pyc in extract_index(data)
5277 lengths = list(set(raw_lengths))
5278 if len(lengths) > 1:
-> 5279 raise ValueError('arrays must all be same length')
5280
5281 if have_dicts:
ValueError: arrays must all be same length
Expected output
A DataFrame with the dict keys as columns and each array's values as rows, with the shorter columns padded with NaN.
In [11]: x_df
Out[11]:
A1 A0 A3 A2
0 4.0 5.0 2.0 9.0
1 8.0 7.0 1.0 5.0
2 11.0 17.0 24.0 20.0
3 7.0 10.0 21.0 2.0
4 9.0 24.0 24.0 21.0
5 1.0 NaN 16.0 12.0
6 23.0 NaN 6.0 21.0
7 22.0 NaN 9.0 8.0
8 20.0 NaN 20.0 1.0
9 15.0 NaN 7.0 10.0
10 NaN NaN 24.0 5.0
11 NaN NaN 11.0 21.0
12 NaN NaN 23.0 7.0
13 NaN NaN 15.0 9.0
14 NaN NaN 3.0 5.0
15 NaN NaN 22.0 NaN
16 NaN NaN 5.0 NaN
17 NaN NaN 21.0 NaN
18 NaN NaN 1.0 NaN
19 NaN NaN 20.0 NaN
output of pd.show_versions()
INSTALLED VERSIONS - commit: None - python: 2.7.11.final.0 - python-bits: 64 - OS: Darwin - OS-release: 15.5.0 - machine: x86_64 - processor: i386 - byteorder: little - LC_ALL: None - LANG: en_US.UTF - pandas: 0.18.1 - nose: 1.3.7 - pip: 8.1.2 - setuptools: 23.1.0 - Cython: 0.24 - numpy: 1.11.1 - scipy: 0.17.1 - statsmodels: 0.6.1 - xarray: None - IPython: 4.2.0 - sphinx: 1.4.4 - patsy: None - dateutil: 2.5.3 - pytz: 2016.4 - blosc: None - bottleneck: None - tables: None - numexpr: None - matplotlib: 1.5.1 - openpyxl: None - xlrd: 1.0.0 - xlwt: None - xlsxwriter: None - lxml: None - bs4: 4.4.1 - html5lib: None - httplib2: None - apiclient: None - sqlalchemy: None - pymysql: None - psycopg2: None - jinja2: 2.8 - boto: None - pandas_datareader: None
Comment From: jreback
If you wrap each of these in a Series, it would work, e.g. `pd.DataFrame({k: pd.Series(v) for k, v in x_dict.items()})`. As written it doesn't, because it's impossible to align a non-indexed ragged array, as the error message indicates.
Comment From: akshayparopkari
Thank you @jreback! 👍