import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
# Minimal reproduction: load the iris dataset via scikit-learn's load_iris().
iris = load_iris()
data = iris.data  # NOTE: not referenced again in this snippet
column_names = iris.feature_names
# Build a DataFrame from the iris data, labelled with the dataset's feature names.
df = pd.DataFrame(iris.data, columns = column_names)
print(df.head(n=5))
# One-hot encode the 'petal length (cm)' column, requesting sparse output.
df_dummy = pd.get_dummies(df, sparse = True, columns = ['petal length (cm)'] )
print(list(df), df_dummy.shape)
# This label-based drop is the failing step: per the traceback in this report,
# it raises TypeError on pandas 0.21.0 when sparse = True was used above.
X = df_dummy.drop(['sepal length (cm)'], axis = 1)
Problem description
When `get_dummies` is called with `sparse = True`, I can't perform any operation on the resulting DataFrame that selects columns by name. For example, dropping a column, as reproduced in the code above, raises a `TypeError`. The full error output is included under details.
Expected Output
The expected output can be seen by removing `sparse = True` from the line `df_dummy = pd.get_dummies(df, sparse = True, columns = ['petal length (cm)'] )` and then running the subsequent lines.
Output of pd.show_versions()
INSTALLED VERSIONS
------------------
commit: None
pandas: 0.21.0
pytest: 3.2.1
pip: 9.0.1
setuptools: 36.5.0.post20170921
Cython: 0.26.1
numpy: 1.12.1
scipy: 1.0.0
pyarrow: None
xarray: None
IPython: 6.1.0
sphinx: 1.6.3
patsy: 0.4.1
dateutil: 2.6.1
pytz: 2017.2
blosc: None
bottleneck: 1.2.1
tables: 3.4.2
numexpr: 2.6.2
feather: None
matplotlib: 2.1.0
openpyxl: 2.4.8
xlrd: 1.1.0
xlwt: 1.2.0
xlsxwriter: 1.0.2
lxml: 4.1.0
bs4: 4.6.0
html5lib: 0.999999999
sqlalchemy: 1.1.13
pymysql: None
psycopg2: None
jinja2: 2.9.6
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: None
In [ ]:
COMPLETE ERROR PRINT
------------------
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
in ()
15 print(list(df), df_dummy.shape)
16
---> 17 X = df_dummy.drop(['sepal length (cm)'], axis = 1)
/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in drop(self, labels, axis, index, columns, level, inplace, errors)
2528 for axis, labels in axes.items():
2529 if labels is not None:
-> 2530 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
2531
2532 if inplace:
/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in _drop_axis(self, labels, axis, level, errors)
2561 else:
2562 new_axis = axis.drop(labels, errors=errors)
-> 2563 dropped = self.reindex(**{axis_name: new_axis})
2564 try:
2565 dropped.axes[axis_].set_names(axis.names, inplace=True)
/anaconda3/lib/python3.6/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
125 @wraps(func)
126 def wrapper(*args, **kwargs):
--> 127 return func(*args, **kwargs)
128
129 if not PY2:
/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
2933 kwargs.pop('axis', None)
2934 kwargs.pop('labels', None)
-> 2935 return super(DataFrame, self).reindex(**kwargs)
2936
2937 @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)
/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
3021 # perform the reindex on the axes
3022 return self._reindex_axes(axes, level, limit, tolerance, method,
-> 3023 fill_value, copy).__finalize__(self)
3024
3025 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
2863 if columns is not None:
2864 frame = frame._reindex_columns(columns, method, copy, level,
-> 2865 fill_value, limit, tolerance)
2866
2867 index = axes['index']
/anaconda3/lib/python3.6/site-packages/pandas/core/sparse/frame.py in _reindex_columns(self, columns, method, copy, level, fill_value, limit, takeable)
701 return self._constructor(
702 sdict, index=self.index, columns=columns,
--> 703 default_fill_value=self._default_fill_value).__finalize__(self)
704
705 def _reindex_with_indexers(self, reindexers, method=None, fill_value=None,
/anaconda3/lib/python3.6/site-packages/pandas/core/sparse/frame.py in __init__(self, data, index, columns, default_kind, default_fill_value, dtype, copy)
89 fill_value=default_fill_value)
90 elif isinstance(data, dict):
---> 91 mgr = self._init_dict(data, index, columns, dtype=dtype)
92 elif isinstance(data, (np.ndarray, list)):
93 mgr = self._init_matrix(data, index, columns, dtype=dtype)
/anaconda3/lib/python3.6/site-packages/pandas/core/sparse/frame.py in _init_dict(self, data, index, columns, dtype)
168 sdict.update((c, nan_arr) for c in columns if c not in sdict)
169
--> 170 return to_manager(sdict, columns, index)
171
172 def _init_matrix(self, data, index, columns, dtype=None):
/anaconda3/lib/python3.6/site-packages/pandas/core/sparse/frame.py in to_manager(sdf, columns, index)
896
897 return create_block_manager_from_arrays(
--> 898 [sdf[c] for c in columns], columns, axes)
899
900
/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in create_block_manager_from_arrays(arrays, names, axes)
4630
4631 try:
-> 4632 blocks = form_blocks(arrays, names, axes)
4633 mgr = BlockManager(blocks, axes)
4634 mgr._consolidate_inplace()
/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in form_blocks(arrays, names, axes)
4726
4727 if len(sparse_items) > 0:
-> 4728 sparse_blocks = _sparse_blockify(sparse_items)
4729 blocks.extend(sparse_blocks)
4730
/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in _sparse_blockify(tuples, dtype)
4788 array = _maybe_to_sparse(array)
4789 block = make_block(array, klass=SparseBlock, fastpath=True,
-> 4790 placement=[i])
4791 new_blocks.append(block)
4792
/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in make_block(values, placement, klass, ndim, dtype, fastpath)
2950 placement=placement, dtype=dtype)
2951
-> 2952 return klass(values, ndim=ndim, fastpath=fastpath, placement=placement)
2953
2954 # TODO: flexible with index=None and/or items=None
/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in __init__(self, values, placement, ndim, fastpath, **kwargs)
1701
1702 if not isinstance(values, self._holder):
-> 1703 raise TypeError("values must be {0}".format(self._holder.__name__))
1704
1705 self.values = values
Comment From: TomAugspurger
I think this is a duplicate of https://github.com/pandas-dev/pandas/issues/18686
Basically, `get_dummies(..., sparse=True)` converts everything to sparse, not just the newly created columns.