Standardize the scale of the data §

Why does scale matter? §

Because each variable has a different scale, a variable with a large scale would have a much greater effect on the distances between data points than a variable with a small scale.

E.g. varA ranges from 1-10, while varB ranges from 1000 - 5000.

from sklearn.preprocessing import StandardScaler

# The features are every column except the label column.
features = df.drop('TARGET CLASS', axis=1)

# Fit the scaler on the feature columns only, so the label is never scaled.
scaler = StandardScaler()
scaler.fit(features)

# Transform
scaled_feature = scaler.transform(features)

# Recreate a feature data frame. The column labels come from the feature
# frame itself, so they stay correct wherever 'TARGET CLASS' sits in df.
df_feat = pd.DataFrame(scaled_feature, columns=features.columns)

Train test split §

# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

X = df_feat
y = df['TARGET CLASS']

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

Predict §

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# Baseline model: classify each test row by its single nearest neighbor.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)

pred = knn.predict(X_test)

# Compare the predictions against the held-out labels.
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

Choose a better k value by using the elbow method §

# Candidate values of k to try; shared by the loop and the plot's x-axis so
# the two cannot drift apart.
k_values = range(1, 40)

error_rate = []

# Fit one model per k and record the fraction of test rows it misclassifies.
for i in k_values:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))

# Plot error rate against k to see which k gives the lowest error rate.
plt.figure(figsize=(10, 6))
plt.plot(
    k_values,
    error_rate,
    color='blue',
    linestyle='dashed',
    marker='o',
    markerfacecolor='red',
    markersize=10,
)
plt.title('Error Rate vs K value')
plt.xlabel('K')
plt.ylabel('Error Rate')