from random import random, choice
import pandas as pd
pd.__version__
u'0.20.1'
states = ['Alabama',
'Alaska',
'Arizona',
'Arkansas',
'California',
'Colorado',
'Connecticut',
'Delaware',
'Florida',
'Georgia',
'Hawaii',
'Idaho',
'Illinois',
'Indiana',
'Iowa',
'Kansas',
'Kentucky',
'Louisiana',
'Maine',
'Maryland',
'Massachusetts',
'Michigan',
'Minnesota',
'Mississippi',
'Missouri',
'Montana',
'Nebraska',
'Nevada',
'New Hampshire',
'New Jersey',
'New Mexico',
'New York',
'North Carolina',
'North Dakota',
'Ohio',
'Oklahoma',
'Oregon',
'Pennsylvania',
'Rhode Island',
'South Carolina',
'South Dakota',
'Tennessee',
'Texas',
'Utah',
'Vermont',
'Virginia',
'Washington',
'West Virginia',
'Wisconsin',
'Wyoming',
'Puerto Rico']
data = [(random(), choice(states)) for _ in xrange(1000000)]
print data[:10]
[(0.24627963293887123, 'Connecticut'), (0.36761688447782837, 'Texas'), (0.3197398283371117, 'Colorado'), (0.40862900551970815, 'Minnesota'), (0.17710135182385411, 'North Dakota'), (0.32584204720734733, 'Wisconsin'), (0.18994014352660016, 'New Jersey'), (0.8459119228061891, 'Wisconsin'), (0.02739343466509858, 'Florida'), (0.42620942318964605, 'Kentucky')]
data_pd = pd.DataFrame.from_records(data, columns=["random_id", "state"])
data_pd.head()
   random_id         state
0   0.246280   Connecticut
1   0.367617         Texas
2   0.319740      Colorado
3   0.408629     Minnesota
4   0.177101  North Dakota
data_pd.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 2 columns):
random_id 1000000 non-null float64
state 1000000 non-null object
dtypes: float64(1), object(1)
memory usage: 15.3+ MB
data_pd.drop_duplicates().info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 999999
Data columns (total 2 columns):
random_id 1000000 non-null float64
state 1000000 non-null object
dtypes: float64(1), object(1)
memory usage: 22.9+ MB
As the second .info() call shows, reported memory usage goes up after .drop_duplicates(), from 15.3+ MB to 22.9+ MB, even though no rows were removed here. I've also tried examples that actually remove duplicates, and memory still goes up even when many rows are deleted; a sketch of such a case follows.
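A minimal sketch of one such case (the dup_pd and deduped names are just for illustration):

dup_pd = pd.concat([data_pd, data_pd])  # 2,000,000 rows, half of them exact duplicates
deduped = dup_pd.drop_duplicates()      # back down to 1,000,000 rows
print(deduped.memory_usage(deep=True))
# The Index entry now reports ~8 MB of int64 labels, versus the 80-byte
# RangeIndex on the original data_pd, even though 1,000,000 rows were dropped.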
Comment From: jreback
Your index was materialized from a RangeIndex to an Int64Index. .drop_duplicates() (and some other indexing operations) cannot preserve a RangeIndex. You are also showing a special case in which nothing is actually dropped.
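To make the distinction concrete, here is a minimal sketch comparing the two index types in isolation (my own illustration, written against the 0.20-era API; Int64Index was later removed in pandas 2.0):

import pandas as pd

n = 1000000
range_idx = pd.RangeIndex(n)       # stored as start/stop/step, constant size
int_idx = pd.Int64Index(range(n))  # one materialized int64 label per row

print(range_idx.memory_usage())    # ~80 bytes, independent of n
print(int_idx.memory_usage())      # 8000000 bytes: 8 bytes x 1,000,000 labels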
In [12]: data_pd.memory_usage(deep=True)
Out[12]:
Index 80
random_id 8000000
state 65492355
dtype: int64
In [13]: data_pd.drop_duplicates().memory_usage(deep=True)
Out[13]:
Index 8000000
random_id 8000000
state 65492355
dtype: int64
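If the original row labels don't matter, one workaround (a suggestion of mine, not from the thread) is to rebuild a fresh RangeIndex after deduplicating:

deduped = data_pd.drop_duplicates().reset_index(drop=True)
print(deduped.memory_usage(deep=True))
# Index drops back to ~80 bytes: reset_index(drop=True) discards the
# materialized Int64Index and installs a new RangeIndex.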