Pandas Dataframe group apply can fail on trivial cases.

Code Sample, a copy-pastable example if possible

import pandas as pd
import numpy as np

def gen_one_df(start=0):
    """Build the seeded 5-row sample frame and return its rows from `start` onward."""
    np.random.seed(0)  # re-seed on every call so repeated calls yield identical data
    sample = {'key1': ['a', 'a', 'b', 'b', 'a'], 'key2': ['one', 'two', 'one', 'two', 'one'],
              'data1': np.random.randn(5), 'data2': np.random.randn(5)}
    return pd.DataFrame(sample)[start:]
df = gen_one_df()

print(df)

# Group the columns (axis=1) by dtype name so same-dtype columns share a group.
grouped = df.groupby(list(map(str, df.dtypes)), axis=1)
for dtype_name, sub_frame in grouped:
    print(dtype_name)
    print(sub_frame)


def return_new_data(x):
    """Ignore the group `x`; return a full sample frame indexed like `df` (the failing case)."""
    replacement = gen_one_df(0)
    return replacement
    # Working alternative (probably because its index then differs from the original's):
    # return gen_one_df(1)

print(grouped.apply(return_new_data))  # on pandas 0.22.0 raises "ValueError: cannot reindex from a duplicate axis"

Problem description

As a trivial test of group-apply on DataFrames, I first group the frame's columns by their dtypes. Then, when applying, I simply return two identical DataFrames (see `return_new_data`), which I expect to be concatenated along the columns (with the group names as the outer level of a column MultiIndex). However, this fails when the returned DataFrames have the same index as the frame on which the GroupBy object is based.

I think this issue is probably caused by the large amount of "well-educated guessing" in the code, and I think this should be added to #13056.

Expected Output


# original dataframe.

      data1     data2 key1 key2
0  1.764052 -0.977278    a  one
1  0.400157  0.950088    a  two
2  0.978738 -0.151357    b  one
3  2.240893 -0.103219    b  two
4  1.867558  0.410599    a  one

# each group.

float64
      data1     data2
0  1.764052 -0.977278
1  0.400157  0.950088
2  0.978738 -0.151357
3  2.240893 -0.103219
4  1.867558  0.410599
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one

# expected output.

    float64                        object
      data1     data2 key1 key2     data1     data2 key1 key2
0  1.764052 -0.977278    a  one  1.764052 -0.977278    a  one 
1  0.400157  0.950088    a  two  0.400157  0.950088    a  two
2  0.978738 -0.151357    b  one  0.978738 -0.151357    b  one
3  2.240893 -0.103219    b  two  2.240893 -0.103219    b  two
4  1.867558  0.410599    a  one  1.867558  0.410599    a  one

actual output:

ValueError                                Traceback (most recent call last)
<ipython-input-1-d64daf6b7b92> in <module>()
     26     # return gen_one_df(1)
     27
---> 28 print(grouped.apply(return_new_data))

~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/core/groupby.py in apply(self, func, *args, **kwargs)
    803         # ignore SettingWithCopy here in case the user mutates
    804         with option_context('mode.chained_assignment', None):
--> 805             return self._python_apply_general(f)
    806
    807     def _python_apply_general(self, f):

~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/core/groupby.py in _python_apply_general(self, f)
    812             keys,
    813             values,
--> 814             not_indexed_same=mutated or self.mutated)
    815
    816     def _iterate_slices(self):

~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/core/groupby.py in _wrap_applied_output(self, keys, values, not_indexed_same)
   3854         elif isinstance(v, DataFrame):
   3855             return self._concat_objects(keys, values,
-> 3856                                         not_indexed_same=not_indexed_same)
   3857         elif self.grouper.groupings is not None:
   3858             if len(self.grouper.groupings) > 1:

~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/core/groupby.py in _concat_objects(self, keys, values, not_indexed_same)
    991                     result = result.take(indexer, axis=self.axis)
    992                 else:
--> 993                     result = result.reindex(ax, axis=self.axis)
    994
    995         elif self.group_keys:

~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
    125         @wraps(func)
    126         def wrapper(*args, **kwargs):
--> 127             return func(*args, **kwargs)
    128
    129         if not PY2:

~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
   2933         kwargs.pop('axis', None)
   2934         kwargs.pop('labels', None)
-> 2935         return super(DataFrame, self).reindex(**kwargs)
   2936
   2937     @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)

~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
   3021         # perform the reindex on the axes
   3022         return self._reindex_axes(axes, level, limit, tolerance, method,
-> 3023                                   fill_value, copy).__finalize__(self)
   3024
   3025     def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,

~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
   2863         if columns is not None:
   2864             frame = frame._reindex_columns(columns, method, copy, level,
-> 2865                                            fill_value, limit, tolerance)
   2866
   2867         index = axes['index']

~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/core/frame.py in _reindex_columns(self, new_columns, method, copy, level, fill_value, limit, tolerance)
   2888         return self._reindex_with_indexers({1: [new_columns, indexer]},
   2889                                            copy=copy, fill_value=fill_value,
-> 2890                                            allow_dups=False)
   2891
   2892     def _reindex_multi(self, axes, copy, fill_value):

~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/core/generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
   3143                                                 fill_value=fill_value,
   3144                                                 allow_dups=allow_dups,
-> 3145                                                 copy=copy)
   3146
   3147         if copy and new_data is self._data:

~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/core/internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
   4137         # some axes don't allow reindexing with dups
   4138         if not allow_dups:
-> 4139             self.axes[axis]._can_reindex(indexer)
   4140
   4141         if axis >= self.ndim:

~/miniconda2/envs/tf15/lib/python3.6/site-packages/pandas/core/indexes/base.py in _can_reindex(self, indexer)
   2942         # trying to reindex on an axis with duplicates
   2943         if not self.is_unique and len(indexer):
-> 2944             raise ValueError("cannot reindex from a duplicate axis")
   2945
   2946     def reindex(self, target, method=None, level=None, limit=None,

ValueError: cannot reindex from a duplicate axis

Output of `pd.show_versions()`

INSTALLED VERSIONS
------------------
commit: None
python: 3.6.4.final.0
python-bits: 64
OS: Linux
OS-release: 2.6.32-573.3.1.el6.x86_64
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_US.iso885915
LOCALE: en_US.ISO8859-15

pandas: 0.22.0
pytest: 3.4.0
pip: 9.0.1
setuptools: 38.4.0
Cython: None
numpy: 1.13.3
scipy: 1.0.0
pyarrow: None
xarray: None
IPython: 6.2.1
sphinx: None
patsy: 0.5.0
dateutil: 2.6.1
pytz: 2017.3
blosc: None
bottleneck: None
tables: None
numexpr: None
feather: None
matplotlib: 2.1.2
openpyxl: None
xlrd: None
xlwt: None
xlsxwriter: None
lxml: None
bs4: None
html5lib: 0.9999999
sqlalchemy: None
pymysql: None
psycopg2: None
jinja2: 2.10
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: None

Comment From: jreback

I added this to the other issue. This would require an effort to write the appropriate test cases and then fix the failures systematically. If you want to submit xfailed tests, that would be helpful.

Comment From: zym1010

Thanks. I will add some xfail test cases.

Pandas Dataframe group apply can fail on trivial cases.

Code Sample, a copy-pastable example if possible

Problem description

Expected Output

Output of pd.show_versions()

Output of `pd.show_versions()`