#### A small, complete example of the issue
```start_time = timeit.default_timer() np.set_printoptions(suppress=True)
df=pd.read_csv('/Users/sudarshan/Desktop/masterproject/SampleDatasets/dataset3.csv') print (df.isnull().any()) df.dropna(inplace=True) print (df.isnull().any())
df= df.fillna(lambda x: x.median())
just_dummies = df['Flags'].str.get_dummies(sep='|')
dfMod = pd.concat([df, just_dummies], axis=1) print dfMod.head() print(dfMod.columns.tolist()) dfMod['A'] = dfMod['A'].astype(float) dfMod['C'] = dfMod['C'].astype(float) dfMod['E'] = dfMod['E'].astype(float) dfMod['F'] = dfMod['F'].astype(float) dfMod['P'] = dfMod['P'].astype(float) dfMod['R'] = dfMod['R'].astype(float) dfMod['S'] = dfMod['S'].astype(float)
print(dfMod.columns.tolist())
print (df.head())
le = preprocessing.LabelEncoder()
le.fit(df['Flags'])
print le.classes_
df['Flags']=le.transform(df['Flags'])
print (df.head())
X = df.as_matrix(columns=['Packets','Bytes','Duration','A','C','E','F','P','R','S']) X[ ~np.isfinite(X) ] = 0 print "converted into matrix" print X[0:5,:] print (np.any(np.isnan(X)))
y = df['Type']
scaler = preprocessing.StandardScaler().fit(X) standardized_X= Imputer().fit_transform(X) print standardized_X
Traceback (most recent call last):
File "randomforest5changingflags.py", line 36, in <module>
dfMod['C'] = dfMod['C'].astype(float)
File "/Users/sudarshan/anaconda3/lib/python2.7/site-packages/pandas/core/frame.py", line 2056, in __getitem__
return self._getitem_column(key)
File "/Users/sudarshan/anaconda3/lib/python2.7/site-packages/pandas/core/frame.py", line 2063, in _getitem_column
return self._get_item_cache(key)
File "/Users/sudarshan/anaconda3/lib/python2.7/site-packages/pandas/core/generic.py", line 1385, in _get_item_cache
values = self._data.get(item)
File "/Users/sudarshan/anaconda3/lib/python2.7/site-packages/pandas/core/internals.py", line 3515, in get
loc = self.items.get_loc(item)
File "/Users/sudarshan/anaconda3/lib/python2.7/site-packages/pandas/indexes/base.py", line 2091, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/index.pyx", line 138, in pandas.index.IndexEngine.get_loc (pandas/index.c:4149)
File "pandas/index.pyx", line 160, in pandas.index.IndexEngine.get_loc (pandas/index.c:4013)
File "pandas/src/hashtable_class_helper.pxi", line 732, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13161)
File "pandas/src/hashtable_class_helper.pxi", line 740, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13115)
KeyError: 'C'
#### Expected Output
#### Output of ``pd.show_versions()``
<details>
# Paste the output of pd.show_versions() here
</details>
Python 2.7 using pandas 0.19.1rc version
**Comment From: jorisvandenbossche**
Please provide a reproducible example with runnable code.
But the error message indicates that you don't have a column named 'C' in `dfMod`
**Comment From: sudarshan1989**
import csv
import numpy as np
import pandas as pd
import timeit
from sklearn import preprocessing
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report,roc_auc_score
from sklearn.externals import joblib
from sklearn.metrics import (precision_score, recall_score,f1_score)
from sklearn.metrics import confusion_matrix
import os
import psutil
from sklearn.preprocessing import Imputer
# Reproduction script for the reported KeyError: loads a CSV, one-hot encodes
# the Flags column, then trains and evaluates a random forest classifier.
# NOTE(review): according to the traceback posted after this script, execution
# stops at the dfMod['E'] cast below, so nothing after that statement ran.

# Wall-clock start; the elapsed time is printed near the end of the script.
start_time = timeit.default_timer()
# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
# Input path is hard-coded to the author's machine.
df=pd.read_csv('/Users/sudarshan/Desktop/masterproject/SampleDatasets/dataset3.csv')
# Report per column whether it contains nulls, drop every row that has one,
# then report again. In the posted output only Flags is True the first time
# and every column is False the second time.
print (df.isnull().any())
df.dropna(inplace=True)
print (df.isnull().any())
#df= df.fillna(lambda x: x.median())
# Build 0/1 indicator columns from Flags by splitting each value on '|'.
# NOTE(review): the column list printed by this run contains whole
# combinations such as 'FSPA' and 'SEC' but no 'C', 'E' or 'P', which suggests
# the Flags values contain no '|' separator, so each distinct string becomes
# its own column instead of one column per single flag.
just_dummies = df['Flags'].str.get_dummies(sep='|')
# Put the indicator columns next to the original ones in a new frame; df
# itself is not modified by this call.
dfMod = pd.concat([df, just_dummies], axis=1)
print dfMod.head()
print(dfMod.columns.tolist())
# Cast the single-flag indicator columns to float. The 'C' cast is commented
# out because it raised KeyError in the first run; the 'E' cast is the
# statement quoted in the traceback for this run (KeyError: 'E').
dfMod['A'] = dfMod['A'].astype(float)
#dfMod['C'] = dfMod['C'].astype(float)
dfMod['E'] = dfMod['E'].astype(float)
dfMod['F'] = dfMod['F'].astype(float)
dfMod['P'] = dfMod['P'].astype(float)
dfMod['R'] = dfMod['R'].astype(float)
dfMod['S'] = dfMod['S'].astype(float)
print(dfMod.columns.tolist())
#print (df.head())
#le = preprocessing.LabelEncoder()
#le.fit(df['Flags'])
#print le.classes_
#df['Flags']=le.transform(df['Flags'])
#print (df.head())
# Feature matrix: Packets, Bytes, Duration plus the seven per-flag columns.
# NOTE(review): the columns are selected from df, not dfMod, although the
# indicator columns were only added to dfMod -- confirm which frame is meant.
X = df.as_matrix(columns=['Packets','Bytes','Duration','A','C','E','F','P','R','S'])
# Zero out NaN and +/-inf entries in place.
X[ ~np.isfinite(X) ] = 0
print "converted into matrix"
print X[0:5,:]
# Prints whether any NaN is left in X after the replacement above.
print (np.any(np.isnan(X)))
# Target: the Type column.
y = df['Type']
# NOTE(review): the fitted scaler is never applied anywhere in this script;
# standardized_X is only the output of Imputer with default settings, so the
# data is not standardized despite the variable name.
scaler = preprocessing.StandardScaler().fit(X)
standardized_X= Imputer().fit_transform(X)
print standardized_X
# Hold out 25% of the rows for testing; random_state=1 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(standardized_X, y, test_size=0.25,random_state=1)
# Random forest with 50 trees; oob_score=True requests an out-of-bag score and
# n_jobs=-1 asks for all available cores.
clf = RandomForestClassifier(n_estimators=50, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features="auto", max_leaf_nodes=None, min_impurity_split=1e-07, bootstrap=True, oob_score=True, n_jobs=-1, random_state=None, verbose=0, warm_start=False, class_weight=None)
clf = clf.fit(X_train, y_train)
# Score on the training split; score is called through the class with clf
# passed explicitly, which is the same call as clf.score(X_train, y_train).
print(RandomForestClassifier.score(clf,X_train, y_train))
# Predict the held-out rows and print accuracy plus the per-class report.
y_prediction = clf.predict(X_test)
print (accuracy_score(y_test, y_prediction))
print ("finished")
report = classification_report(y_test, y_prediction)
print (report)
#plt.plot(y_test, y_prediction)
#plt.xlabel('features')
#plt.ylabel('class_features')
#plt.title('Report')
#plt.show()
# One hand-written sample with 10 values, matching the 10 feature columns
# requested for X above.
X_tt = np.array([5.0,520,520.000, 1.0, 1.0, 1.0,1.0,1.0,1.0,0.0])
#X_tt = np.array([3,724,0.574,32])
# NOTE(review): X_tt is 1-D; scikit-learn estimators expect a 2-D
# (n_samples, n_features) array, so this may need X_tt.reshape(1, -1) --
# confirm for the installed version.
result = clf.predict(X_tt)
#print ("test result")
print(result)
#plt.plot(X_tt)
#plt.show()
# Test-split metrics: precision, recall, F1, confusion matrix and ROC AUC.
print("\tPrecision: %1.3f" % precision_score(y_test,y_prediction))
print("\tRecall: %1.3f" % recall_score(y_test,y_prediction))
print("\tF1: %1.3f\n" % f1_score(y_test, y_prediction))
print(confusion_matrix(y_test, y_prediction))
print(roc_auc_score(y_test, y_prediction))
# Elapsed wall-clock time since start_time was taken.
runningtime = timeit.default_timer() - start_time
print(runningtime)
# Resident set size of the current process, as reported by psutil.
process = psutil.Process(os.getpid())
print(process.memory_info().rss)
#joblib.dump(le, 'savedresult.pkl')
python randomforest5changingflags.py
/Users/sudarshan/anaconda3/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
"This module will be removed in 0.20.", DeprecationWarning)
Type False
Source False
Destination False
Sport False
Dport False
Stime False
Etime False
Flags True
Packets False
Bytes False
Duration False
dtype: bool
Type False
Source False
Destination False
Sport False
Dport False
Stime False
Etime False
Flags False
Packets False
Bytes False
Duration False
dtype: bool
Type Source Destination Sport Dport Stime \
0 1 215.94.251.130 71.126.222.64 54638 50330 2007-08-04 21:52:02.034
1 1 199.219.164.26 71.126.222.64 51571 60552 2007-08-04 21:52:02.044
2 1 192.153.200.55 71.126.222.64 55753 45498 2007-08-04 21:52:02.044
3 1 229.51.190.15 71.126.222.64 28860 22196 2007-08-04 21:52:02.044
4 1 38.174.166.55 71.126.222.64 58062 8970 2007-08-04 21:50:09.433
Etime Flags Packets Bytes ... RP RPA S SA SEC \
0 2007-08-04 21:52:02.044 S 2 96 ... 0 0 1 0 0
1 2007-08-04 21:52:02.044 S 1 48 ... 0 0 1 0 0
2 2007-08-04 21:52:02.044 S 1 48 ... 0 0 1 0 0
3 2007-08-04 21:52:02.044 S 1 48 ... 0 0 1 0 0
4 2007-08-04 21:52:02.044 S 2 96 ... 0 0 1 0 0
SPA SPAC SR SRA SRPA
0 0 0 0 0 0
1 0 0 0 0 0
2 0 0 0 0 0
3 0 0 0 0 0
4 0 0 0 0 0
[5 rows x 48 columns]
['Type', 'Source', 'Destination', 'Sport', 'Dport', 'Stime', 'Etime', 'Flags', 'Packets', 'Bytes', 'Duration', 'A', 'AC', 'F', 'FA', 'FPA', 'FPAC', 'FR', 'FRA', 'FRPA', 'FRPAC', 'FS', 'FSA', 'FSPA', 'FSPAC', 'FSPAE', 'FSPAEC', 'FSRA', 'FSRAEC', 'FSRPA', 'FSRPAC', 'PA', 'PAC', 'R', 'RA', 'RAC', 'RC', 'RE', 'RP', 'RPA', 'S', 'SA', 'SEC', 'SPA', 'SPAC', 'SR', 'SRA', 'SRPA']
Traceback (most recent call last):
File "randomforest5changingflags.py", line 37, in <module>
dfMod['E'] = dfMod['E'].astype(float)
File "/Users/sudarshan/anaconda3/lib/python2.7/site-packages/pandas/core/frame.py", line 2056, in **getitem**
return self._getitem_column(key)
File "/Users/sudarshan/anaconda3/lib/python2.7/site-packages/pandas/core/frame.py", line 2063, in _getitem_column
return self._get_item_cache(key)
File "/Users/sudarshan/anaconda3/lib/python2.7/site-packages/pandas/core/generic.py", line 1385, in _get_item_cache
values = self._data.get(item)
File "/Users/sudarshan/anaconda3/lib/python2.7/site-packages/pandas/core/internals.py", line 3515, in get
loc = self.items.get_loc(item)
File "/Users/sudarshan/anaconda3/lib/python2.7/site-packages/pandas/indexes/base.py", line 2091, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/index.pyx", line 138, in pandas.index.IndexEngine.get_loc (pandas/index.c:4149)
File "pandas/index.pyx", line 160, in pandas.index.IndexEngine.get_loc (pandas/index.c:4013)
File "pandas/src/hashtable_class_helper.pxi", line 732, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13161)
File "pandas/src/hashtable_class_helper.pxi", line 740, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13115)
KeyError: 'E'
This is my algorithm code. If you look at the .csv file from which I am extracting the values, it has a Flags column with the values A, C, E, F, R, P, S. I am trying to split that column up into sub-columns so that it can easily be fed into the machine learning algorithm.
**Comment From: jorisvandenbossche**
@sudarshan1989 You are printing the columns, which gives:
['Type', 'Source', 'Destination', 'Sport', 'Dport', 'Stime', 'Etime', 'Flags', 'Packets', 'Bytes', 'Duration', 'A', 'AC', 'F', 'FA', 'FPA', 'FPAC', 'FR', 'FRA', 'FRPA', 'FRPAC', 'FS', 'FSA', 'FSPA', 'FSPAC', 'FSPAE', 'FSPAEC', 'FSRA', 'FSRAEC', 'FSRPA', 'FSRPAC', 'PA', 'PAC', 'R', 'RA', 'RAC', 'RC', 'RE', 'RP', 'RPA', 'S', 'SA', 'SEC', 'SPA', 'SPAC', 'SR', 'SRA', 'SRPA'] ```
(copied from above)
I don't see a 'C' or 'E' column in this. So the KeyError is very logical I think.
**Comment From: sudarshan1989**
No. As you can see in my code, I am trying to split the values in the Flags column into unique columns in the dfMod code:
'A', 'AC', 'F', 'FA', 'FPA', 'FPAC', 'FR', 'FRA', 'FRPA', 'FRPAC', 'FS', 'FSA', 'FSPA', 'FSPAC', 'FSPAE', 'FSPAEC', 'FSRA', 'FSRAEC', 'FSRPA', 'FSRPAC', 'PA', 'PAC', 'R', 'RA', 'RAC', 'RC', 'RE', 'RP', 'RPA', 'S', 'SA', 'SEC', 'SPA', 'SPAC', 'SR', 'SRA', 'SRPA']
So I need to split these up into 'A', 'C', 'E', 'F', 'P', 'R', 'S' columns, convert them to a matrix, and feed that into the machine learning algorithm.
**Comment From: sudarshan1989**
If it should throw an error, it should do so for A as well, right? It didn't. For me, C, E and P are the issue in dfMod.
**Comment From: jorisvandenbossche**
> If it should throw error,it must do it for A right.

No, because there is an 'A' column in the printed column names.
If you want help with your actual problem, I recommend asking a question on a site like StackOverflow (but there, too, you will have to be much more explicit about what exactly the problem is, and try to trim down the example code).