Pandas version checks
- [X] I have checked that this issue has not already been reported.
- [X] I have confirmed this bug exists on the latest version of pandas.
- [ ] I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
import polars as pl
from datetime import datetime
dataframe = pl.DataFrame(
    {
        "integer": [1, 2, 3],
        "date": [
            datetime(2022, 1, 1),
            datetime(2022, 1, 2),
            datetime(2022, 1, 3),
        ],
        "float": [4.0, 5.0, 6.0],
        "strings": ["first", "second", "third"],
    }
)
df_pd_pyarrow = dataframe.to_pandas(use_pyarrow_extension_array=True)
# ERROR *** NotImplementedError: large_string ***
Issue Description
Hi guys! Polars is another tool for working with dataframes.
When converting a PyArrow-backed dataframe (generated by Polars) that has a column with a string datatype to Pandas, I got this error:
NotImplementedError: large_string
This only occurs on Pandas 2.0.0rc0; with Pandas 1.5.3 it works without error.
This does not appear to be a Polars bug, per what was tested here.
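For reference, the error can presumably be reproduced without Polars at all, since Polars just calls pyarrow.Table.to_pandas with a types_mapper (visible in the traceback below). A minimal sketch, assuming the failure is specific to the large_string Arrow type:

import pyarrow as pa
import pandas as pd

# Build an Arrow table with an explicit large_string column.
table = pa.table(
    {"strings": pa.array(["first", "second", "third"], type=pa.large_string())}
)

# Same conversion path Polars takes under the hood; on 2.0.0rc0 this
# presumably raises NotImplementedError: large_string.
df = table.to_pandas(types_mapper=pd.ArrowDtype)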
Expected Behavior
It's supposed to work, as it does with v1.5.3.
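As a possible workaround until this is fixed (a hypothetical sketch, untested on 2.0.0rc0, and assuming only the large_string dtype is affected), the table from the sketch above could be cast to plain string before conversion:

import pyarrow as pa
import pandas as pd

# Hypothetical workaround: downcast any large_string column to string
# before handing the table to pandas.
schema = pa.schema(
    [
        pa.field(f.name, pa.string()) if f.type == pa.large_string() else f
        for f in table.schema
    ]
)
df = table.cast(schema).to_pandas(types_mapper=pd.ArrowDtype)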
Installed Versions
pandas.show_versions()
INSTALLED VERSIONS
------------------
commit : 1a2e300170efc08cb509a0b4ff6248f8d55ae777
python : 3.10.6.final.0
python-bits : 64
OS : Linux
OS-release : 5.10.16.3-microsoft-standard-WSL2
Version : #1 SMP Fri Apr 2 22:23:49 UTC 2021
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : C.UTF-8
LOCALE : en_US.UTF-8
pandas : 2.0.0rc0
numpy : 1.24.2
pytz : 2022.7.1
dateutil : 2.8.2
setuptools : 59.6.0
pip : 22.0.2
Cython : None
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 3.1.2
IPython : 8.11.0
pandas_datareader: None
bs4 : 4.11.2
bottleneck : None
brotli : None
fastparquet : None
fsspec : None
gcsfs : None
matplotlib : None
numba : None
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : 11.0.0
pyreadstat : None
pyxlsb : None
s3fs : None
scipy : None
snappy : None
sqlalchemy : 2.0.5.post1
tables : None
tabulate : None
xarray : None
xlrd : None
zstandard : None
tzdata : None
qtpy : None
pyqt5 : None
Comment From: mroeschke
Thanks for the report. Could you post the full traceback?
Comment From: romiof
> Thanks for the report. Could you post the full traceback?
Sure...
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
Cell In[4], line 1
----> 1 df_pd_pyarrow = dataframe.to_pandas(use_pyarrow_extension_array=True)
File ~/.local/lib/python3.10/site-packages/polars/internals/dataframe/frame.py:2061, in DataFrame.to_pandas(self, use_pyarrow_extension_array, *args, **kwargs)
2059 tbl = pa.Table.from_batches(record_batches)
2060 if use_pyarrow_extension_array:
-> 2061 return tbl.to_pandas(
2062 self_destruct=True,
2063 split_blocks=True,
2064 types_mapper=lambda pa_dtype: pd.ArrowDtype(pa_dtype),
2065 **kwargs,
2066 )
2068 date_as_object = kwargs.pop("date_as_object", False)
2069 return tbl.to_pandas(date_as_object=date_as_object, **kwargs)
File ~/.local/lib/python3.10/site-packages/pyarrow/array.pxi:830, in pyarrow.lib._PandasConvertible.to_pandas()
File ~/.local/lib/python3.10/site-packages/pyarrow/table.pxi:3990, in pyarrow.lib.Table._to_pandas()
File ~/.local/lib/python3.10/site-packages/pyarrow/pandas_compat.py:820, in table_to_blockmanager(options, table, categories, ignore_metadata, types_mapper)
818 _check_data_column_metadata_consistency(all_columns)
819 columns = _deserialize_column_index(table, all_columns, column_indexes)
--> 820 blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes)
822 axes = [columns, index]
823 return BlockManager(blocks, axes)
File ~/.local/lib/python3.10/site-packages/pyarrow/pandas_compat.py:1171, in _table_to_blocks(options, block_table, categories, extension_columns)
1168 columns = block_table.column_names
1169 result = pa.lib.table_to_blocks(options, block_table, categories,
1170 list(extension_columns.keys()))
-> 1171 return [_reconstruct_block(item, columns, extension_columns)
1172 for item in result]
File ~/.local/lib/python3.10/site-packages/pyarrow/pandas_compat.py:1171, in <listcomp>(.0)
1168 columns = block_table.column_names
1169 result = pa.lib.table_to_blocks(options, block_table, categories,
1170 list(extension_columns.keys()))
-> 1171 return [_reconstruct_block(item, columns, extension_columns)
1172 for item in result]
File ~/.local/lib/python3.10/site-packages/pyarrow/pandas_compat.py:781, in _reconstruct_block(item, columns, extension_columns)
778 raise ValueError("This column does not support to be converted "
779 "to a pandas ExtensionArray")
780 pd_ext_arr = pandas_dtype.__from_arrow__(arr)
--> 781 block = _int.make_block(pd_ext_arr, placement=placement)
782 else:
783 block = _int.make_block(block_arr, placement=placement)
File ~/.local/lib/python3.10/site-packages/pandas/core/internals/api.py:73, in make_block(values, placement, klass, ndim, dtype)
70 placement = BlockPlacement(placement)
72 ndim = maybe_infer_ndim(values, placement, ndim)
---> 73 if is_datetime64tz_dtype(values.dtype) or is_period_dtype(values.dtype):
74 # GH#41168 ensure we can pass 1D dt64tz values
75 # More generally, any EA dtype that isn't is_1d_only_ea_dtype
76 values = extract_array(values, extract_numpy=True)
77 values = ensure_block_shape(values, ndim)
File ~/.local/lib/python3.10/site-packages/pandas/core/dtypes/common.py:414, in is_period_dtype(arr_or_dtype)
386 """
387 Check whether an array-like or dtype is of the Period dtype.
388
(...)
410 True
411 """
412 if isinstance(arr_or_dtype, ExtensionDtype):
413 # GH#33400 fastpath for dtype object
--> 414 return arr_or_dtype.type is Period
416 if arr_or_dtype is None:
417 return False
File ~/.local/lib/python3.10/site-packages/pandas/core/arrays/arrow/dtype.py:135, in ArrowDtype.type(self)
133 return type(pa_type)
134 else:
--> 135 raise NotImplementedError(pa_type)
NotImplementedError: large_string
Comment From: ghuls
PyArrow categoricals are also not supported in 2.0rc0, while they worked in 1.5.3 (although conversion was quite slow); a minimal reproducer without the CSV file is sketched at the end of this comment:
cat categorical.tsv
chr1 3003641 3003701 TGTATCGCAAGCTTAT-1 3
chr1 3003658 3003977 CTAGTAATCCAAGTTA-1 2
chr1 3003665 3003699 TGTTGGCCATAGCGAG-1 1
chr1 3003699 3003743 AGAGATTAGTAGCTTA-1 5
chr1 3003719 3003767 CGGCTAATCAAGTGAG-1 2
chr1 3003733 3003863 CGCAATTAGTAGCCAT-1 3
chr1 3003741 3003930 TCCTTAGTCTAAGGTC-1 1
chr1 3003775 3003853 CACATACAGCGAGTAA-1 3
chr1 3003786 3003840 TCCTTTACAGCCTAAC-1 1
chr1 3003800 3003843 GACTCACCAACCCTAA-1 1
In [1]: import pyarrow as pa
In [2]: import pyarrow.csv
In [3]: import pandas as pd
In [4]: def read_fragments(fragments_bed_filename):
...: df = pa.csv.read_csv(
...: fragments_bed_filename,
...: read_options=pa.csv.ReadOptions(
...: use_threads=True,
...: skip_rows=0,
...: column_names=["Chromosome", "Start", "End", "Name", "Score"],
...: ),
...: parse_options=pa.csv.ParseOptions(
...: delimiter="\t",
...: quote_char=False,
...: escape_char=False,
...: newlines_in_values=False,
...: ),
...: convert_options=pa.csv.ConvertOptions(
...: column_types={
...: "Chromosome": pa.dictionary(pa.int32(), pa.string()),
...: "Start": pa.int32(),
...: "End": pa.int32(),
...: "Name": pa.dictionary(pa.int32(), pa.string()),
...: },
...: ),
...: ) #.to_pandas()
...: return df
...:
In [6]: fragments_bed_filename = "categorical.tsv"
In [7]: %time pa_df = read_fragments(fragments_bed_filename)
CPU times: user 11.2 s, sys: 1.42 s, total: 12.6 s
Wall time: 1.15 s
In [8]: pa_df
Out[8]:
pyarrow.Table
Chromosome: dictionary<values=string, indices=int32, ordered=0>
Start: int32
End: int32
Name: dictionary<values=string, indices=int32, ordered=0>
Score: int64
----
Chromosome: [ -- dictionary:
["chr1"] -- indices:
[0,0,0,0,0,0,0,0,0,0]]
Start: [[3003641,3003658,3003665,3003699,3003719,3003733,3003741,3003775,3003786,3003800]]
End: [[3003701,3003977,3003699,3003743,3003767,3003863,3003930,3003853,3003840,3003843]]
Name: [ -- dictionary:
["TGTATCGCAAGCTTAT-1","CTAGTAATCCAAGTTA-1","TGTTGGCCATAGCGAG-1","AGAGATTAGTAGCTTA-1","CGGCTAATCAAGTGAG-1","CGCAATTAGTAGCCAT-1","TCCTTAGTCTAAGGTC-1","CACATACAGCGAGTAA-1","TCCTTTACAGCCTAAC-1","GACTCACCAACCCTAA-1"] -- indices:
[0,1,2,3,4,5,6,7,8,9]]
Score: [[3,2,1,5,2,3,1,3,1,1]]
In [9]: pa_df.to_pandas(types_mapper=lambda pa_dtype: pd.ArrowDtype(pa_dtype),)
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
<ipython-input-9-234d02574769> in <module>
----> 1 pa_df.to_pandas(types_mapper=lambda pa_dtype: pd.ArrowDtype(pa_dtype),)
~/software/anaconda3/envs/polars/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib._PandasConvertible.to_pandas()
~/software/anaconda3/envs/polars/lib/python3.8/site-packages/pyarrow/table.pxi in pyarrow.lib.Table._to_pandas()
~/software/anaconda3/envs/polars/lib/python3.8/site-packages/pyarrow/pandas_compat.py in table_to_blockmanager(options, table, categories, ignore_metadata, types_mapper)
818 _check_data_column_metadata_consistency(all_columns)
819 columns = _deserialize_column_index(table, all_columns, column_indexes)
--> 820 blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes)
821
822 axes = [columns, index]
~/software/anaconda3/envs/polars/lib/python3.8/site-packages/pyarrow/pandas_compat.py in _table_to_blocks(options, block_table, categories, extension_columns)
1169 result = pa.lib.table_to_blocks(options, block_table, categories,
1170 list(extension_columns.keys()))
-> 1171 return [_reconstruct_block(item, columns, extension_columns)
1172 for item in result]
1173
~/software/anaconda3/envs/polars/lib/python3.8/site-packages/pyarrow/pandas_compat.py in <listcomp>(.0)
1169 result = pa.lib.table_to_blocks(options, block_table, categories,
1170 list(extension_columns.keys()))
-> 1171 return [_reconstruct_block(item, columns, extension_columns)
1172 for item in result]
1173
~/software/anaconda3/envs/polars/lib/python3.8/site-packages/pyarrow/pandas_compat.py in _reconstruct_block(item, columns, extension_columns)
779 "to a pandas ExtensionArray")
780 pd_ext_arr = pandas_dtype.__from_arrow__(arr)
--> 781 block = _int.make_block(pd_ext_arr, placement=placement)
782 else:
783 block = _int.make_block(block_arr, placement=placement)
~/software/anaconda3/envs/polars/lib/python3.8/site-packages/pandas/core/internals/api.py in make_block(values, placement, klass, ndim, dtype)
71
72 ndim = maybe_infer_ndim(values, placement, ndim)
---> 73 if is_datetime64tz_dtype(values.dtype) or is_period_dtype(values.dtype):
74 # GH#41168 ensure we can pass 1D dt64tz values
75 # More generally, any EA dtype that isn't is_1d_only_ea_dtype
~/software/anaconda3/envs/polars/lib/python3.8/site-packages/pandas/core/dtypes/common.py in is_period_dtype(arr_or_dtype)
412 if isinstance(arr_or_dtype, ExtensionDtype):
413 # GH#33400 fastpath for dtype object
--> 414 return arr_or_dtype.type is Period
415
416 if arr_or_dtype is None:
~/software/anaconda3/envs/polars/lib/python3.8/site-packages/pandas/core/arrays/arrow/dtype.py in type(self)
133 return type(pa_type)
134 else:
--> 135 raise NotImplementedError(pa_type)
136
137 @property
NotImplementedError: dictionary<values=string, indices=int32, ordered=0>
With Pandas 1.5.3
In [8]: df = pa_df.to_pandas(types_mapper=lambda pa_dtype: pd.ArrowDtype(pa_dtype),)
In [9]: df.dtypes
Out[9]:
Chromosome dictionary<values=string, indices=int32, order...
Start int32[pyarrow]
End int32[pyarrow]
Name dictionary<values=string, indices=int32, order...
Score int64[pyarrow]
dtype: object
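For reference, a minimal sketch that presumably hits the same code path without the CSV file (assumption: any dictionary<values=string, indices=int32> column triggers it):

import pyarrow as pa
import pandas as pd

# dictionary_encode() yields dictionary<values=string, indices=int32>,
# the same dtype as the Chromosome and Name columns above.
arr = pa.array(["chr1", "chr1", "chr2"]).dictionary_encode()
table = pa.table({"Chromosome": arr})

# Presumably raises NotImplementedError on 2.0.0rc0; works on 1.5.3.
table.to_pandas(types_mapper=pd.ArrowDtype)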