A small, complete example of the issue

```start_time = timeit.default_timer() np.set_printoptions(suppress=True)

df=pd.read_csv('/Users/sudarshan/Desktop/masterproject/SampleDatasets/dataset3.csv') print (df.isnull().any()) df.dropna(inplace=True) print (df.isnull().any())

df= df.fillna(lambda x: x.median())

just_dummies = df['Flags'].str.get_dummies(sep='|')

dfMod = pd.concat([df, just_dummies], axis=1) print dfMod.head() print(dfMod.columns.tolist()) dfMod['A'] = dfMod['A'].astype(float) dfMod['C'] = dfMod['C'].astype(float) dfMod['E'] = dfMod['E'].astype(float) dfMod['F'] = dfMod['F'].astype(float) dfMod['P'] = dfMod['P'].astype(float) dfMod['R'] = dfMod['R'].astype(float) dfMod['S'] = dfMod['S'].astype(float)

print(dfMod.columns.tolist())

print (df.head())

le = preprocessing.LabelEncoder()

le.fit(df['Flags'])

print le.classes_

df['Flags']=le.transform(df['Flags'])

print (df.head())

X = df.as_matrix(columns=['Packets','Bytes','Duration','A','C','E','F','P','R','S']) X[ ~np.isfinite(X) ] = 0 print "converted into matrix" print X[0:5,:] print (np.any(np.isnan(X)))

y = df['Type']

scaler = preprocessing.StandardScaler().fit(X) standardized_X= Imputer().fit_transform(X) print standardized_X

Traceback (most recent call last):
  File "randomforest5changingflags.py", line 36, in <module>
    dfMod['C'] = dfMod['C'].astype(float)
  File "/Users/sudarshan/anaconda3/lib/python2.7/site-packages/pandas/core/frame.py", line 2056, in __getitem__
    return self._getitem_column(key)
  File "/Users/sudarshan/anaconda3/lib/python2.7/site-packages/pandas/core/frame.py", line 2063, in _getitem_column
    return self._get_item_cache(key)
  File "/Users/sudarshan/anaconda3/lib/python2.7/site-packages/pandas/core/generic.py", line 1385, in _get_item_cache
    values = self._data.get(item)
  File "/Users/sudarshan/anaconda3/lib/python2.7/site-packages/pandas/core/internals.py", line 3515, in get
    loc = self.items.get_loc(item)
  File "/Users/sudarshan/anaconda3/lib/python2.7/site-packages/pandas/indexes/base.py", line 2091, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas/index.pyx", line 138, in pandas.index.IndexEngine.get_loc (pandas/index.c:4149)
  File "pandas/index.pyx", line 160, in pandas.index.IndexEngine.get_loc (pandas/index.c:4013)
  File "pandas/src/hashtable_class_helper.pxi", line 732, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13161)
  File "pandas/src/hashtable_class_helper.pxi", line 740, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13115)
KeyError: 'C'

#### Expected Output

#### Output of ``pd.show_versions()``

<details>
# Paste the output here pd.show_versions() here

</details>
Python 2.7 using pandas 0.19.1rc version

**Comment From: jorisvandenbossche**

Please provide a reproducible example with runnable code.

But the error message indicates that you don't have a column named 'C' in `dfMod`


**Comment From: sudarshan1989**

import csv
import numpy as np
import pandas as pd
import timeit
from   sklearn import preprocessing
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report,roc_auc_score
from sklearn.externals import joblib
from sklearn.metrics import (precision_score, recall_score,f1_score)
from sklearn.metrics import confusion_matrix
import os
import psutil
from sklearn.preprocessing import Imputer

start_time = timeit.default_timer()
np.set_printoptions(suppress=True)

df=pd.read_csv('/Users/sudarshan/Desktop/masterproject/SampleDatasets/dataset3.csv')
print (df.isnull().any())
df.dropna(inplace=True)
print (df.isnull().any())
#df= df.fillna(lambda x: x.median())
just_dummies = df['Flags'].str.get_dummies(sep='|')

dfMod = pd.concat([df, just_dummies], axis=1) 
print dfMod.head()
print(dfMod.columns.tolist())
dfMod['A'] = dfMod['A'].astype(float)
#dfMod['C'] = dfMod['C'].astype(float)
dfMod['E'] = dfMod['E'].astype(float)
dfMod['F'] = dfMod['F'].astype(float)
dfMod['P'] = dfMod['P'].astype(float)
dfMod['R'] = dfMod['R'].astype(float)
dfMod['S'] = dfMod['S'].astype(float)

print(dfMod.columns.tolist())
#print (df.head())
#le = preprocessing.LabelEncoder()
#le.fit(df['Flags'])
#print le.classes_
#df['Flags']=le.transform(df['Flags'])
#print (df.head())
X = df.as_matrix(columns=['Packets','Bytes','Duration','A','C','E','F','P','R','S'])
X[ ~np.isfinite(X) ] = 0
print "converted into matrix"
print X[0:5,:]
print (np.any(np.isnan(X)))

y = df['Type']

scaler = preprocessing.StandardScaler().fit(X)
standardized_X= Imputer().fit_transform(X)
print standardized_X

X_train, X_test, y_train, y_test = train_test_split(standardized_X, y, test_size=0.25,random_state=1)
clf = RandomForestClassifier(n_estimators=50, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features="auto", max_leaf_nodes=None, min_impurity_split=1e-07, bootstrap=True, oob_score=True, n_jobs=-1, random_state=None, verbose=0, warm_start=False, class_weight=None)
clf = clf.fit(X_train, y_train)
print(RandomForestClassifier.score(clf,X_train, y_train))
y_prediction = clf.predict(X_test)
print (accuracy_score(y_test, y_prediction))
print ("finished")
report = classification_report(y_test, y_prediction)
print (report)
#plt.plot(y_test, y_prediction)
#plt.xlabel('features')
#plt.ylabel('class_features')
#plt.title('Report')
#plt.show()
X_tt = np.array([5.0,520,520.000, 1.0, 1.0, 1.0,1.0,1.0,1.0,0.0])

#X_tt = np.array([3,724,0.574,32])
result = clf.predict(X_tt)
#print ("test result")
print(result)
#plt.plot(X_tt)
#plt.show()

print("\tPrecision: %1.3f" % precision_score(y_test,y_prediction))
print("\tRecall: %1.3f" % recall_score(y_test,y_prediction))
print("\tF1: %1.3f\n" % f1_score(y_test, y_prediction))
print(confusion_matrix(y_test, y_prediction))
print(roc_auc_score(y_test, y_prediction))

runningtime = timeit.default_timer() - start_time

print(runningtime)

process = psutil.Process(os.getpid())

print(process.memory_info().rss)

#joblib.dump(le, 'savedresult.pkl')

python randomforest5changingflags.py
/Users/sudarshan/anaconda3/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
Type           False
Source         False
Destination    False
Sport          False
Dport          False
Stime          False
Etime          False
Flags           True
Packets        False
Bytes          False
Duration       False
dtype: bool
Type           False
Source         False
Destination    False
Sport          False
Dport          False
Stime          False
Etime          False
Flags          False
Packets        False
Bytes          False
Duration       False
dtype: bool
   Type          Source    Destination  Sport  Dport                    Stime  \
0     1  215.94.251.130  71.126.222.64  54638  50330  2007-08-04 21:52:02.034  
1     1  199.219.164.26  71.126.222.64  51571  60552  2007-08-04 21:52:02.044  
2     1  192.153.200.55  71.126.222.64  55753  45498  2007-08-04 21:52:02.044  
3     1   229.51.190.15  71.126.222.64  28860  22196  2007-08-04 21:52:02.044  
4     1   38.174.166.55  71.126.222.64  58062   8970  2007-08-04 21:50:09.433

             Etime Flags  Packets  Bytes  ...   RP  RPA  S  SA  SEC  \


0  2007-08-04 21:52:02.044     S        2     96  ...    0    0  1   0    0  
1  2007-08-04 21:52:02.044     S        1     48  ...    0    0  1   0    0  
2  2007-08-04 21:52:02.044     S        1     48  ...    0    0  1   0    0  
3  2007-08-04 21:52:02.044     S        1     48  ...    0    0  1   0    0  
4  2007-08-04 21:52:02.044     S        2     96  ...    0    0  1   0    0   

   SPA  SPAC  SR  SRA  SRPA  
0    0     0   0    0     0  
1    0     0   0    0     0  
2    0     0   0    0     0  
3    0     0   0    0     0  
4    0     0   0    0     0  

[5 rows x 48 columns]
['Type', 'Source', 'Destination', 'Sport', 'Dport', 'Stime', 'Etime', 'Flags', 'Packets', 'Bytes', 'Duration', 'A', 'AC', 'F', 'FA', 'FPA', 'FPAC', 'FR', 'FRA', 'FRPA', 'FRPAC', 'FS', 'FSA', 'FSPA', 'FSPAC', 'FSPAE', 'FSPAEC', 'FSRA', 'FSRAEC', 'FSRPA', 'FSRPAC', 'PA', 'PAC', 'R', 'RA', 'RAC', 'RC', 'RE', 'RP', 'RPA', 'S', 'SA', 'SEC', 'SPA', 'SPAC', 'SR', 'SRA', 'SRPA']
Traceback (most recent call last):
  File "randomforest5changingflags.py", line 37, in <module>
    dfMod['E'] = dfMod['E'].astype(float)
  File "/Users/sudarshan/anaconda3/lib/python2.7/site-packages/pandas/core/frame.py", line 2056, in __getitem__
    return self._getitem_column(key)
  File "/Users/sudarshan/anaconda3/lib/python2.7/site-packages/pandas/core/frame.py", line 2063, in _getitem_column
    return self._get_item_cache(key)
  File "/Users/sudarshan/anaconda3/lib/python2.7/site-packages/pandas/core/generic.py", line 1385, in _get_item_cache
    values = self._data.get(item)
  File "/Users/sudarshan/anaconda3/lib/python2.7/site-packages/pandas/core/internals.py", line 3515, in get
    loc = self.items.get_loc(item)
  File "/Users/sudarshan/anaconda3/lib/python2.7/site-packages/pandas/indexes/base.py", line 2091, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas/index.pyx", line 138, in pandas.index.IndexEngine.get_loc (pandas/index.c:4149)
  File "pandas/index.pyx", line 160, in pandas.index.IndexEngine.get_loc (pandas/index.c:4013)
  File "pandas/src/hashtable_class_helper.pxi", line 732, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13161)
  File "pandas/src/hashtable_class_helper.pxi", line 740, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13115)
KeyError: 'E'

This is my algorithm code. If you look at the .csv from which I am extracting the values, it has a Flags column containing the values A, C, E, F, R, P, S... I am trying to split that column up into sub-columns so that it is an easy input for the machine learning algorithm.


**Comment From: jorisvandenbossche**

@sudarshan1989 You are printing the columns, which gives:

['Type', 'Source', 'Destination', 'Sport', 'Dport', 'Stime', 'Etime', 'Flags', 'Packets', 'Bytes', 'Duration', 'A', 'AC', 'F', 'FA', 'FPA', 'FPAC', 'FR', 'FRA', 'FRPA', 'FRPAC', 'FS', 'FSA', 'FSPA', 'FSPAC', 'FSPAE', 'FSPAEC', 'FSRA', 'FSRAEC', 'FSRPA', 'FSRPAC', 'PA', 'PAC', 'R', 'RA', 'RAC', 'RC', 'RE', 'RP', 'RPA', 'S', 'SA', 'SEC', 'SPA', 'SPAC', 'SR', 'SRA', 'SRPA'] ```

(copied from above)

I don't see a 'C' or 'E' column in this. So the KeyError is very logical I think.

**Comment From: sudarshan1989**

No. As you can see in my code, I am trying to split the values of the Flags column into unique columns in `dfMod`...

'A', 'AC', 'F', 'FA', 'FPA', 'FPAC', 'FR', 'FRA', 'FRPA', 'FRPAC', 'FS', 'FSA', 'FSPA', 'FSPAC', 'FSPAE', 'FSPAEC', 'FSRA', 'FSRAEC', 'FSRPA', 'FSRPAC', 'PA', 'PAC', 'R', 'RA', 'RAC', 'RC', 'RE', 'RP', 'RPA', 'S', 'SA', 'SEC', 'SPA', 'SPAC', 'SR', 'SRA', 'SRPA']

So I need to split them up into 'A', 'C', 'E', 'F', 'P', 'R', 'S' so that they can be converted to a matrix and fed into the machine learning algorithm.

**Comment From: sudarshan1989**

If it should throw an error, it must do so for A as well, right? It didn't. For me, C, E and P are the issue in `dfMod`.

**Comment From: jorisvandenbossche**

> If it should throw error,it must do it for A right.

No, because there is an 'A' column in the printed column names.

If you want help with your actual problem, I recommend asking a question on something like StackOverflow (but there, too, you will have to be much more explicit about what exactly the problem is, and try to trim down the example code).

Pandas Pandas Keyerror

A small, complete example of the issue

df= df.fillna(lambda x: x.median())

print (df.head())

le = preprocessing.LabelEncoder()

le.fit(df['Flags'])

print le.classes_

df['Flags']=le.transform(df['Flags'])

print (df.head())