import pandas as pd
class MyDataFrame(pd.DataFrame):
    """DataFrame subclass that carries a mandatory ``new_property`` value.

    ``new_property`` is a required constructor argument, and ``_constructor``
    hands pandas this very class. As reported in this issue, pandas calls the
    constructor with the data only when it derives a new frame (for example
    when selecting columns), so such operations raise ``TypeError`` for the
    missing argument.
    """

    # Attribute names registered as subclass metadata for pandas.
    _metadata = ['new_property']

    def __init__(self, data, new_property, index=None, columns=None, dtype=None, copy=True):
        """Initialise the frame and store ``new_property`` on the instance.

        Args:
            data: Passed through to ``pd.DataFrame.__init__`` as ``data``.
            new_property: Required value stored as ``self.new_property``.
            index: Passed through to ``pd.DataFrame.__init__``.
            columns: Passed through to ``pd.DataFrame.__init__``.
            dtype: Passed through to ``pd.DataFrame.__init__``.
            copy: Passed through to ``pd.DataFrame.__init__``; defaults to
                ``True`` here.
        """
        frame_options = {
            'index': index,
            'columns': columns,
            'dtype': dtype,
            'copy': copy,
        }
        super().__init__(data=data, **frame_options)
        self.new_property = new_property

    @property
    def _constructor(self):
        """Return the callable pandas uses to build derived frames."""
        return MyDataFrame

# Minimal reproduction: build the subclass, supplying the required extra argument.
data1 = {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [15, 25, 30], 'd': [1, 1, 2]}
df1 = MyDataFrame(data1, new_property='value')
# Selecting columns is the statement that fails in this report: according to the
# traceback below, pandas calls the constructor without `new_property`.
df1[['a', 'b']]

Problem description

I'm working on a new data structure that subclasses the pandas DataFrame. I want to require my new data structure to have new_property, so that it can be processed safely later on. However, I'm running into an error when using it, because the constructor gets called by some internal pandas function without the required property. The code above produces the following error.

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input>", line 1, in <module>
    df1[['a', 'b']]
  File "C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py", line 2053, in __getitem__
    return self._getitem_array(key)
  File "C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py", line 2098, in _getitem_array
    return self.take(indexer, axis=1, convert=True)
  File "C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py", line 1670, in take
    result = self._constructor(new_data).__finalize__(self)
TypeError: __init__() missing 1 required positional argument: 'new_property'

Expected Output

I expect the slicing to happen without any error. Is there a fix for this, or an alternative design that requires my new data structure to have new_property?

Output of pd.show_versions()

INSTALLED VERSIONS
------------------
commit: None
python: 3.6.0.final.0
python-bits: 64
OS: Windows
OS-release: 10
machine: AMD64
processor: Intel64 Family 6 Model 61 Stepping 4, GenuineIntel
byteorder: little
LC_ALL: None
LANG: None
LOCALE: None.None
pandas: 0.19.2
nose: 1.3.7
pip: 9.0.1
setuptools: 27.2.0
Cython: 0.25.2
numpy: 1.11.3
scipy: 0.18.1
statsmodels: 0.6.1
xarray: None
IPython: 5.1.0
sphinx: 1.5.1
patsy: 0.4.1
dateutil: 2.6.0
pytz: 2016.10
blosc: None
bottleneck: 1.2.0
tables: 3.2.2
numexpr: 2.6.1
matplotlib: 2.0.2
openpyxl: 2.4.1
xlrd: 1.0.0
xlwt: 1.2.0
xlsxwriter: 0.9.6
lxml: 3.7.2
bs4: 4.5.3
html5lib: None
httplib2: None
apiclient: None
sqlalchemy: 1.1.5
pymysql: None
psycopg2: None
jinja2: 2.9.4
boto: 2.45.0
pandas_datareader: None

Comment From: chris-b1

Standard recommendation - avoid subclassing if you can! There will be some new accessor APIs in 0.23 that may make this easier in some cases (#18827).

That said, here you need to make new_property an optional arg - DataFrame.__finalize__ will pass it along to the sliced instance.

# omitted
  def __init__(self, data, new_property=None, index=None, columns=None, dtype=None, copy=True):
# omitted

df1 = MyDataFrame(data1, new_property='value')

In [93]: df1[['a', 'b']].new_property
Out[93]: 'value'

Comment From: hlums

@chris-b1 Thanks for the reply! Yes, making new_property optional will avoid the error, but it doesn't force MyDataFrame to have new_property. The user will be able to create a MyDataFrame with df1 = MyDataFrame(data1), without passing a value for new_property, which will cause problems in downstream processing.

Comment From: chris-b1

In that case you could define an alternative constructor that only pandas uses and enforce your api on __init__

class MyDataFrame(pd.DataFrame):
    """DataFrame subclass whose public constructor requires ``new_property``.

    User code must pass ``new_property`` to ``__init__``. pandas itself is
    pointed at ``_internal_ctor`` through ``_constructor``, which fills the
    argument with ``None`` so that frames derived internally can be built.
    """

    # Attribute names registered as subclass metadata for pandas.
    _metadata = ['new_property']

    def __init__(self, data, new_property, index=None, columns=None, dtype=None, copy=True):
        """Initialise the frame and store ``new_property`` on the instance.

        Args:
            data: Passed through to ``pd.DataFrame.__init__`` as ``data``.
            new_property: Required value stored as ``self.new_property``.
            index: Passed through to ``pd.DataFrame.__init__``.
            columns: Passed through to ``pd.DataFrame.__init__``.
            dtype: Passed through to ``pd.DataFrame.__init__``.
            copy: Passed through to ``pd.DataFrame.__init__``; defaults to
                ``True`` here.
        """
        frame_options = {
            'index': index,
            'columns': columns,
            'dtype': dtype,
            'copy': copy,
        }
        super().__init__(data=data, **frame_options)
        self.new_property = new_property

    @classmethod
    def _internal_ctor(cls, *args, **kwargs):
        """Build an instance with ``new_property`` forced to ``None``.

        Any ``new_property`` keyword supplied by the caller is overwritten.
        """
        kwargs.update(new_property=None)
        return cls(*args, **kwargs)

    @property
    def _constructor(self):
        """Return the pandas-facing constructor instead of the class itself."""
        return MyDataFrame._internal_ctor

# Build the subclass through its public constructor, which requires new_property.
data1 = {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [15, 25, 30], 'd': [1, 1, 2]}
df1 = MyDataFrame(data1, new_property='value')

# Column selection now succeeds and the sliced frame keeps the metadata value
# (see the output below).
df1[['a', 'b']].new_property
Out[121]: 'value'

MyDataFrame(data1)
TypeError: __init__() missing 1 required positional argument: 'new_property'

Comment From: hlums

@chris-b1 Thank you so much! This is exactly what I needed!

Comment From: hlums

@chris-b1 I'm getting the following error when using pd.concat on my subclass.

Traceback (most recent call last):
  File "C:\Anaconda\envs\myenv\lib\site-packages\IPython\core\interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input>", line 15, in <module>
    pd.concat([test, test])
  File "C:\Anaconda\envs\myenv\lib\site-packages\pandas\core\reshape\concat.py", line 207, in concat
    return op.get_result()
  File "C:\Anaconda\envs\myenv\lib\site-packages\pandas\core\reshape\concat.py", line 412, in get_result
    return (cons._from_axes(new_data, self.new_axes)
  File "C:\Anaconda\envs\myenv\lib\site-packages\pandas\core\generic.py", line 334, in _from_axes
    return cls(data, **kwargs)
TypeError: __init__() missing 1 required positional argument: 'new_property'

It seems like the _from_axes() class method in generic.py is calling the __init__() method of my class directly, without going through _constructor. It doesn't seem to be a solvable problem to me, but I still wanted to check with a pandas expert.

Thanks!

Comment From: chris-b1

Can you open a new issue with just a reproduction of that? I think it should be calling _constructor, but I'm not sure.

Here's a hacky workaround

    def __init__(self, data, new_property=None, index=None, columns=None, dtype=None, copy=True):
        """Build the frame, requiring ``new_property`` for user-supplied data.

        ``new_property`` is optional in the signature so that calls which pass
        only a ``BlockManager`` as ``data`` can still construct an instance.
        Any other call must provide it.

        Args:
            data: Passed through to ``pd.DataFrame.__init__`` as ``data``.
            new_property: Value stored as ``self.new_property``. May be left
                as ``None`` only when ``data`` is a ``BlockManager``.
            index: Passed through to ``pd.DataFrame.__init__``.
            columns: Passed through to ``pd.DataFrame.__init__``.
            dtype: Passed through to ``pd.DataFrame.__init__``.
            copy: Passed through to ``pd.DataFrame.__init__``; defaults to
                ``True`` here.

        Raises:
            ValueError: If ``new_property`` is ``None`` and ``data`` is not a
                ``BlockManager``.
        """
        # Compare against the None default rather than truthiness, so that
        # legitimate falsy values (0, '', False, ...) are not reported as missing.
        # NOTE(review): this relies on pandas-internal calls passing a
        # BlockManager as ``data``; confirm for the pandas version in use.
        if new_property is None and not isinstance(data, pd.core.internals.BlockManager):
            raise ValueError("arg new_property required")
        super(MyDataFrame, self).__init__(data=data,
                                          index=index,
                                          columns=columns,
                                          dtype=dtype,
                                          copy=copy)
        self.new_property = new_property

Comment From: hlums

@chris-b1 Thanks for the suggestion! I will open an issue about this. For now, I'm overriding _from_axes to make it call our internal constructor. It seems to be working fine.