Link: K nearest neighbours (KNN) theory Python
Standardize the scale of the data
Why does scale matter?
Because each variable has a different scale, and a variable with a large scale will dominate the distance calculation compared to one with a small scale.
E.g. varA ranges from 1-10, while varB ranges from 1000-5000.
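A quick made-up sketch of the effect on the distance KNN uses (the values and variable names are purely illustrative): before scaling, the distance is driven almost entirely by varB; after standardizing, both variables contribute on the same scale.
import numpy as np
from sklearn.preprocessing import StandardScaler

# Two hypothetical points: varA on a 1-10 scale, varB on a 1000-5000 scale
points = np.array([[2.0, 1500.0],
                   [9.0, 1600.0]])

# Raw Euclidean distance: the varB difference (100) swamps the varA difference (7)
print(np.linalg.norm(points[0] - points[1]))   # ~100.2

# After standardizing each column, both variables contribute equally
scaled = StandardScaler().fit_transform(points)
print(np.linalg.norm(scaled[0] - scaled[1]))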
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the features only (drop the target column)
scaler = StandardScaler()
scaler.fit(df.drop('TARGET CLASS', axis=1))
# Transform
scaled_feature = scaler.transform(df.drop('TARGET CLASS', axis=1))
# Recreate a feature data frame
df_feat = pd.DataFrame(scaled_feature, columns=df.columns[:-1])
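As a quick optional sanity check, each standardized column should now have a mean near 0 and a standard deviation near 1:
# Means should be ~0 and standard deviations ~1 after scaling
print(df_feat.describe().loc[['mean', 'std']])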
Train test split
from sklearn.model_selection import train_test_split
X = df_feat
y = df['TARGET CLASS']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
Predict
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
Choose a better K value using the elbow method
import numpy as np

error_rate = []
# Fit a model for each K from 1 to 39 and record its error rate on the test set
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))
import matplotlib.pyplot as plt

plt.figure(figsize=(10,6))
plt.plot(range(1,40), error_rate,
         color='blue', linestyle='dashed', marker='o', markerfacecolor='red', markersize=10)
plt.title('Error Rate vs K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
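After reading the elbow plot, a natural follow-up (a sketch only; in practice pick the K where the curve flattens rather than blindly taking the minimum) is to refit KNN with the better K and re-check the metrics:
# Refit with the K that had the lowest error rate in the search above
best_k = int(np.argmin(error_rate)) + 1  # +1 because the K values start at 1
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))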