Pima Indians diabetes – Prediction & KNN visualization

towards-data-science

This post was originally published by Hardik Deshmukh at Towards Data Science

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Standardise every predictor column with StandardScaler; the 'Outcome'
# target column is excluded from scaling and re-attached afterwards.
features = df.drop('Outcome', axis=1)
scaler = StandardScaler()
scaler.fit(features)
scaler_features = scaler.transform(features)
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split below, so test-set statistics influence the scaling.
# Fit on X_train only if a leakage-free evaluation is required.

# Take the column names from the frame that was actually scaled (instead of
# assuming 'Outcome' is the last column) and keep the original index so the
# index-aligned 'Outcome' assignment below cannot introduce NaN values.
df_feat = pd.DataFrame(
    scaler_features, columns=features.columns, index=df.index)

# appending the outcome feature
df_feat['Outcome'] = df['Outcome'].astype(int)
df = df_feat.copy()
df.head()

# To reverse the scaler transformation (predictor columns only -- the scaler
# never saw 'Outcome', so it must be dropped before inverting):
# s = scaler.inverse_transform(df_feat.drop('Outcome', axis=1))
# df_feat = pd.DataFrame(s, columns=df.columns[:-1])

X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

Find the best value of K by computing the ROC AUC score (area under the Receiver Operating Characteristic curve) on the test set for each K from 1 to 100.

# Importing the submodule explicitly guarantees that `sklearn.metrics` is
# loaded and also binds the top-level name `sklearn` used further down.
import sklearn.metrics
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Sweep K and record the ROC AUC obtained on the held-out test set for each
# value. The score is computed from the hard class predictions returned by
# `knn.predict`, not from predicted probabilities.
il = []  # candidate K values, in the order they were evaluated
ac = []  # ROC AUC score for the K at the same position in `il`
for i in range(1, 101):  # K = 1 .. 100 inclusive
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    il.append(i)
    ac.append(sklearn.metrics.roc_auc_score(y_test, y_pred))

# Tabulate the results and rank them, best score first.
tt = {'K': il, 'ROC_ACC': ac}
vv = pd.DataFrame(tt)
vv.sort_values('ROC_ACC', ascending=False, inplace=True, ignore_index=True)
vv.head(10)

The ten K values with the highest ROC AUC scores

Selecting ‘k = 9’

# `sklearn.metrics` is imported before its first use below; importing the
# submodule also binds the top-level name `sklearn`.
import sklearn
import sklearn.metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.neighbors import KNeighborsClassifier

# Fit the final classifier with the selected K and predict the test set.
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print(classification_report(y_test, y_pred))

# Compute the confusion matrix once and reuse it for both outputs.
cm = confusion_matrix(y_test, y_pred)
print(cm)
# fmt='d' prints the cell counts as plain integers; the default format
# switches to scientific notation for counts of 100 or more.
sns.heatmap(cm, annot=True, fmt='d')

# NOTE(review): `y_pred` holds hard class labels, so the curve below has a
# single operating point between (0, 0) and (1, 1). Passing
# `knn.predict_proba(X_test)[:, 1]` would give a full ROC curve, but it
# would also change the reported AUC value.
auc_score = sklearn.metrics.roc_auc_score(y_test, y_pred)

plt.figure(dpi=100)
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, label="%.2f" % auc_score)  # legend entry shows the AUC
plt.legend(loc='lower right')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC curve for Diabetes classifier')
plt.xlabel('False positive rate (1-Specificity)')
plt.ylabel('True positive rate (Sensitivity)')
plt.grid(True)

# Bare expression: shows the AUC when run as the last line of a notebook cell.
auc_score

0.7399724565329662

# Put the true and predicted labels side by side, with a running sample
# number to use as the x-axis.
data = {
    'test': y_test.values.ravel(),
    'pred': y_pred.ravel(),
    'number': np.arange(0, len(y_test)),
}
pt = pd.DataFrame(data)

# NOTE(review): `iplot` is not a built-in pandas method; presumably it is
# provided by the cufflinks/plotly integration set up elsewhere -- confirm.
pt.iplot(
    kind='scatter',
    x='number',
    y=['test', 'pred'],
    color=['white', 'yellow'],
    theme='solar',
    mode='lines+markers'
)
Spread the word

This post was originally published by Hardik Deshmukh at Towards Data Science

Related posts