Pandas version checks
- [X] I have checked that this issue has not already been reported.
- [X] I have confirmed this bug exists on the latest version of pandas.
- [ ] I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
import polars as pl
from datetime import datetime
dataframe = pl.DataFrame(
    {
        "integer": [1, 2, 3],
        "date": [
            datetime(2022, 1, 1),
            datetime(2022, 1, 2),
            datetime(2022, 1, 3),
        ],
        "float": [4.0, 5.0, 6.0],
        "strings": ["first", "second", "third"],
    }
)
df_pd_pyarrow = dataframe.to_pandas(use_pyarrow_extension_array=True)
# ERROR *** NotImplementedError: large_string ***
Issue Description
Hi guys! Polars is another tool for working with dataframes.
When converting a PyArrow-backed dataframe (generated by Polars) that has a column with a string datatype to Pandas, I got this error:
NotImplementedError: large_string
This only occurs on Pandas 2.0.0rc0; with Pandas 1.5.3 it works without error.
This does not appear to be a Polars bug, per what was tested here.
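For reference, the error can presumably be reproduced without Polars at all, since Polars just calls pyarrow.Table.to_pandas with a types_mapper (visible in the traceback below). A minimal sketch, assuming the failure is specific to the large_string Arrow type:

import pyarrow as pa
import pandas as pd

# Build an Arrow table with an explicit large_string column.
table = pa.table(
    {"strings": pa.array(["first", "second", "third"], type=pa.large_string())}
)

# Same conversion path Polars takes under the hood; on 2.0.0rc0 this
# presumably raises NotImplementedError: large_string.
df = table.to_pandas(types_mapper=pd.ArrowDtype)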
Expected Behavior
It's supposed to work, as it does with v1.5.3.
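As a possible workaround until this is fixed (a hypothetical sketch, untested on 2.0.0rc0, and assuming only the large_string dtype is affected), the table from the sketch above could be cast to plain string before conversion:

import pyarrow as pa
import pandas as pd

# Hypothetical workaround: downcast any large_string column to string
# before handing the table to pandas.
schema = pa.schema(
    [
        pa.field(f.name, pa.string()) if f.type == pa.large_string() else f
        for f in table.schema
    ]
)
df = table.cast(schema).to_pandas(types_mapper=pd.ArrowDtype)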
Installed Versions
pandas.show_versions()
INSTALLED VERSIONS
------------------
commit : 1a2e300170efc08cb509a0b4ff6248f8d55ae777
python : 3.10.6.final.0
python-bits : 64
OS : Linux
OS-release : 5.10.16.3-microsoft-standard-WSL2
Version : #1 SMP Fri Apr 2 22:23:49 UTC 2021
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : C.UTF-8
LOCALE : en_US.UTF-8
pandas : 2.0.0rc0
numpy : 1.24.2
pytz : 2022.7.1
dateutil : 2.8.2
setuptools : 59.6.0
pip : 22.0.2
Cython : None
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 3.1.2
IPython : 8.11.0
pandas_datareader: None
bs4 : 4.11.2
bottleneck : None
brotli : None
fastparquet : None
fsspec : None
gcsfs : None
matplotlib : None
numba : None
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : 11.0.0
pyreadstat : None
pyxlsb : None
s3fs : None
scipy : None
snappy : None
sqlalchemy : 2.0.5.post1
tables : None
tabulate : None
xarray : None
xlrd : None
zstandard : None
tzdata : None
qtpy : None
pyqt5 : None
Comment From: mroeschke
Thanks for the report. Could you post the full traceback?
Comment From: romiof
> Thanks for the report. Could you post the full traceback?
Sure...
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
Cell In[4], line 1
----> 1 df_pd_pyarrow = dataframe.to_pandas(use_pyarrow_extension_array=True)
File ~/.local/lib/python3.10/site-packages/polars/internals/dataframe/frame.py:2061, in DataFrame.to_pandas(self, use_pyarrow_extension_array, *args, **kwargs)
2059 tbl = pa.Table.from_batches(record_batches)
2060 if use_pyarrow_extension_array:
-> 2061 return tbl.to_pandas(
2062 self_destruct=True,
2063 split_blocks=True,
2064 types_mapper=lambda pa_dtype: pd.ArrowDtype(pa_dtype),
2065 **kwargs,
2066 )
2068 date_as_object = kwargs.pop("date_as_object", False)
2069 return tbl.to_pandas(date_as_object=date_as_object, **kwargs)
File ~/.local/lib/python3.10/site-packages/pyarrow/array.pxi:830, in pyarrow.lib._PandasConvertible.to_pandas()
File ~/.local/lib/python3.10/site-packages/pyarrow/table.pxi:3990, in pyarrow.lib.Table._to_pandas()
File ~/.local/lib/python3.10/site-packages/pyarrow/pandas_compat.py:820, in table_to_blockmanager(options, table, categories, ignore_metadata, types_mapper)
818 _check_data_column_metadata_consistency(all_columns)
819 columns = _deserialize_column_index(table, all_columns, column_indexes)
--> 820 blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes)
822 axes = [columns, index]
823 return BlockManager(blocks, axes)
File ~/.local/lib/python3.10/site-packages/pyarrow/pandas_compat.py:1171, in _table_to_blocks(options, block_table, categories, extension_columns)
1168 columns = block_table.column_names
1169 result = pa.lib.table_to_blocks(options, block_table, categories,
1170 list(extension_columns.keys()))
-> 1171 return [_reconstruct_block(item, columns, extension_columns)
1172 for item in result]
File ~/.local/lib/python3.10/site-packages/pyarrow/pandas_compat.py:1171, in <listcomp>(.0)
1168 columns = block_table.column_names
1169 result = pa.lib.table_to_blocks(options, block_table, categories,
1170 list(extension_columns.keys()))
-> 1171 return [_reconstruct_block(item, columns, extension_columns)
1172 for item in result]
File ~/.local/lib/python3.10/site-packages/pyarrow/pandas_compat.py:781, in _reconstruct_block(item, columns, extension_columns)
778 raise ValueError("This column does not support to be converted "
779 "to a pandas ExtensionArray")
780 pd_ext_arr = pandas_dtype.__from_arrow__(arr)
--> 781 block = _int.make_block(pd_ext_arr, placement=placement)
782 else:
783 block = _int.make_block(block_arr, placement=placement)
File ~/.local/lib/python3.10/site-packages/pandas/core/internals/api.py:73, in make_block(values, placement, klass, ndim, dtype)
70 placement = BlockPlacement(placement)
72 ndim = maybe_infer_ndim(values, placement, ndim)
---> 73 if is_datetime64tz_dtype(values.dtype) or is_period_dtype(values.dtype):
74 # GH#41168 ensure we can pass 1D dt64tz values
75 # More generally, any EA dtype that isn't is_1d_only_ea_dtype
76 values = extract_array(values, extract_numpy=True)
77 values = ensure_block_shape(values, ndim)
File ~/.local/lib/python3.10/site-packages/pandas/core/dtypes/common.py:414, in is_period_dtype(arr_or_dtype)
386 """
387 Check whether an array-like or dtype is of the Period dtype.
388
(...)
410 True
411 """
412 if isinstance(arr_or_dtype, ExtensionDtype):
413 # GH#33400 fastpath for dtype object
--> 414 return arr_or_dtype.type is Period
416 if arr_or_dtype is None:
417 return False
File ~/.local/lib/python3.10/site-packages/pandas/core/arrays/arrow/dtype.py:135, in ArrowDtype.type(self)
133 return type(pa_type)
134 else:
--> 135 raise NotImplementedError(pa_type)
NotImplementedError: large_string
Comment From: ghuls
PyArrow categoricals are also not supported in 2.0rc0, while they worked in 1.5.3 (although conversion was quite slow); a minimal reproducer without the CSV file is sketched at the end of this comment:
cat categorical.tsv
chr1 3003641 3003701 TGTATCGCAAGCTTAT-1 3
chr1 3003658 3003977 CTAGTAATCCAAGTTA-1 2
chr1 3003665 3003699 TGTTGGCCATAGCGAG-1 1
chr1 3003699 3003743 AGAGATTAGTAGCTTA-1 5
chr1 3003719 3003767 CGGCTAATCAAGTGAG-1 2
chr1 3003733 3003863 CGCAATTAGTAGCCAT-1 3
chr1 3003741 3003930 TCCTTAGTCTAAGGTC-1 1
chr1 3003775 3003853 CACATACAGCGAGTAA-1 3
chr1 3003786 3003840 TCCTTTACAGCCTAAC-1 1
chr1 3003800 3003843 GACTCACCAACCCTAA-1 1
In [1]: import pyarrow as pa
In [2]: import pyarrow.csv
In [3]: import pandas as pd
In [4]: def read_fragments(fragments_bed_filename):
...: df = pa.csv.read_csv(
...: fragments_bed_filename,
...: read_options=pa.csv.ReadOptions(
...: use_threads=True,
...: skip_rows=0,
...: column_names=["Chromosome", "Start", "End", "Name", "Score"],
...: ),
...: parse_options=pa.csv.ParseOptions(
...: delimiter="\t",
...: quote_char=False,
...: escape_char=False,
...: newlines_in_values=False,
...: ),
...: convert_options=pa.csv.ConvertOptions(
...: column_types={
...: "Chromosome": pa.dictionary(pa.int32(), pa.string()),
...: "Start": pa.int32(),
...: "End": pa.int32(),
...: "Name": pa.dictionary(pa.int32(), pa.string()),
...: },
...: ),
...: ) #.to_pandas()
...: return df
...:
In [6]: fragments_bed_filename = "categorical.tsv"
In [7]: %time pa_df = read_fragments(fragments_bed_filename)
CPU times: user 11.2 s, sys: 1.42 s, total: 12.6 s
Wall time: 1.15 s
In [8]: pa_df
Out[8]:
pyarrow.Table
Chromosome: dictionary<values=string, indices=int32, ordered=0>
Start: int32
End: int32
Name: dictionary<values=string, indices=int32, ordered=0>
Score: int64
----
Chromosome: [ -- dictionary:
["chr1"] -- indices:
[0,0,0,0,0,0,0,0,0,0]]
Start: [[3003641,3003658,3003665,3003699,3003719,3003733,3003741,3003775,3003786,3003800]]
End: [[3003701,3003977,3003699,3003743,3003767,3003863,3003930,3003853,3003840,3003843]]
Name: [ -- dictionary:
["TGTATCGCAAGCTTAT-1","CTAGTAATCCAAGTTA-1","TGTTGGCCATAGCGAG-1","AGAGATTAGTAGCTTA-1","CGGCTAATCAAGTGAG-1","CGCAATTAGTAGCCAT-1","TCCTTAGTCTAAGGTC-1","CACATACAGCGAGTAA-1","TCCTTTACAGCCTAAC-1","GACTCACCAACCCTAA-1"] -- indices:
[0,1,2,3,4,5,6,7,8,9]]
Score: [[3,2,1,5,2,3,1,3,1,1]]
In [9]: pa_df.to_pandas(types_mapper=lambda pa_dtype: pd.ArrowDtype(pa_dtype),)
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
<ipython-input-9-234d02574769> in <module>
----> 1 pa_df.to_pandas(types_mapper=lambda pa_dtype: pd.ArrowDtype(pa_dtype),)
~/software/anaconda3/envs/polars/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib._PandasConvertible.to_pandas()
~/software/anaconda3/envs/polars/lib/python3.8/site-packages/pyarrow/table.pxi in pyarrow.lib.Table._to_pandas()
~/software/anaconda3/envs/polars/lib/python3.8/site-packages/pyarrow/pandas_compat.py in table_to_blockmanager(options, table, categories, ignore_metadata, types_mapper)
818 _check_data_column_metadata_consistency(all_columns)
819 columns = _deserialize_column_index(table, all_columns, column_indexes)
--> 820 blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes)
821
822 axes = [columns, index]
~/software/anaconda3/envs/polars/lib/python3.8/site-packages/pyarrow/pandas_compat.py in _table_to_blocks(options, block_table, categories, extension_columns)
1169 result = pa.lib.table_to_blocks(options, block_table, categories,
1170 list(extension_columns.keys()))
-> 1171 return [_reconstruct_block(item, columns, extension_columns)
1172 for item in result]
1173
~/software/anaconda3/envs/polars/lib/python3.8/site-packages/pyarrow/pandas_compat.py in <listcomp>(.0)
1169 result = pa.lib.table_to_blocks(options, block_table, categories,
1170 list(extension_columns.keys()))
-> 1171 return [_reconstruct_block(item, columns, extension_columns)
1172 for item in result]
1173
~/software/anaconda3/envs/polars/lib/python3.8/site-packages/pyarrow/pandas_compat.py in _reconstruct_block(item, columns, extension_columns)
779 "to a pandas ExtensionArray")
780 pd_ext_arr = pandas_dtype.__from_arrow__(arr)
--> 781 block = _int.make_block(pd_ext_arr, placement=placement)
782 else:
783 block = _int.make_block(block_arr, placement=placement)
~/software/anaconda3/envs/polars/lib/python3.8/site-packages/pandas/core/internals/api.py in make_block(values, placement, klass, ndim, dtype)
71
72 ndim = maybe_infer_ndim(values, placement, ndim)
---> 73 if is_datetime64tz_dtype(values.dtype) or is_period_dtype(values.dtype):
74 # GH#41168 ensure we can pass 1D dt64tz values
75 # More generally, any EA dtype that isn't is_1d_only_ea_dtype
~/software/anaconda3/envs/polars/lib/python3.8/site-packages/pandas/core/dtypes/common.py in is_period_dtype(arr_or_dtype)
412 if isinstance(arr_or_dtype, ExtensionDtype):
413 # GH#33400 fastpath for dtype object
--> 414 return arr_or_dtype.type is Period
415
416 if arr_or_dtype is None:
~/software/anaconda3/envs/polars/lib/python3.8/site-packages/pandas/core/arrays/arrow/dtype.py in type(self)
133 return type(pa_type)
134 else:
--> 135 raise NotImplementedError(pa_type)
136
137 @property
NotImplementedError: dictionary<values=string, indices=int32, ordered=0>
With Pandas 1.5.3
In [8]: df = pa_df.to_pandas(types_mapper=lambda pa_dtype: pd.ArrowDtype(pa_dtype),)
In [9]: df.dtypes
Out[9]:
Chromosome dictionary<values=string, indices=int32, order...
Start int32[pyarrow]
End int32[pyarrow]
Name dictionary<values=string, indices=int32, order...
Score int64[pyarrow]
dtype: object
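For reference, a minimal sketch that presumably hits the same code path without the CSV file (assumption: any dictionary<values=string, indices=int32> column triggers it):

import pyarrow as pa
import pandas as pd

# dictionary_encode() yields dictionary<values=string, indices=int32>,
# the same dtype as the Chromosome and Name columns above.
arr = pa.array(["chr1", "chr1", "chr2"]).dictionary_encode()
table = pa.table({"Chromosome": arr})

# Presumably raises NotImplementedError on 2.0.0rc0; works on 1.5.3.
table.to_pandas(types_mapper=pd.ArrowDtype)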