pd.__version__
u'0.20.1'
from random import random,choice
import pandas as pd

# The 50 U.S. states in alphabetical order, with 'Puerto Rico' appended
# last; used as the categorical pool for the sampled 'state' column.
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
    'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
    'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
    'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri',
    'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
    'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
    'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
    'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
    'Puerto Rico',
]

# Build 1,000,000 (random_float, state_name) records.
# `range` replaces the Python-2-only `xrange` so the snippet also runs on
# Python 3 (on Python 2 it materializes the list eagerly, which is
# negligible next to the million-tuple result); the loop variable is
# unused, hence `_`.
data = [(random(), choice(states)) for _ in range(1000000)]
# NOTE: output is nondeterministic — `random` is never seeded.
print(data[:10])
[(0.24627963293887123, 'Connecticut'), (0.36761688447782837, 'Texas'), (0.3197398283371117, 'Colorado'), (0.40862900551970815, 'Minnesota'), (0.17710135182385411, 'North Dakota'), (0.32584204720734733, 'Wisconsin'), (0.18994014352660016, 'New Jersey'), (0.8459119228061891, 'Wisconsin'), (0.02739343466509858, 'Florida'), (0.42620942318964605, 'Kentucky')]
data_pd = pd.DataFrame.from_records(data,columns=[["random_id","state"]])
data_pd.head()
random_id state
0 0.246280 Connecticut
1 0.367617 Texas
2 0.319740 Colorado
3 0.408629 Minnesota
4 0.177101 North Dakota
data_pd.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 2 columns):
random_id    1000000 non-null float64
state        1000000 non-null object
dtypes: float64(1), object(1)
memory usage: 15.3+ MB
data_pd.drop_duplicates().info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 999999
Data columns (total 2 columns):
random_id    1000000 non-null float64
state        1000000 non-null object
dtypes: float64(1), object(1)
memory usage: 22.9+ MB

As the second `.info()` call shows, reported memory usage goes up after `drop_duplicates()`. I have also tried examples in which duplicates really are removed, and memory usage still increases even when many rows are deleted.

Comment From: jreback

Your index was materialized from a `RangeIndex` to an `Int64Index`. A `RangeIndex` stores only start/stop/step (about 80 bytes regardless of length), while an `Int64Index` holds an actual 8-byte label per row — 8 MB here, as the `memory_usage(deep=True)` output below confirms. `.drop_duplicates()` (and some other indexing operations) cannot preserve a `RangeIndex`, so the index is expanded and the reported memory goes up even though no column data changed. You are showing a special case where nothing is actually dropped.

In [12]: data_pd.memory_usage(deep=True)
Out[12]: 
Index              80
random_id     8000000
state        65492355
dtype: int64

In [13]: data_pd.drop_duplicates().memory_usage(deep=True)
Out[13]: 
Index         8000000
random_id     8000000
state        65492355
dtype: int64