In [20]:

```
#Importing required packages
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import seaborn
from pprint import pprint
%matplotlib inline
```

In [21]:

```
#Let's begin by exploring one of scikit-learn's simplest sample datasets, the Iris dataset.
from sklearn.datasets import load_iris
iris = load_iris()
print iris.keys()
```

In [22]:

```
#The Iris dataset contains data about 3 types of Iris flowers, namely:
print iris.target_names
#Let's look at the shape of the Iris dataset
print iris.data.shape
print iris.target.shape
#So there is data for 150 Iris flowers and a target set with 0,1,2 depending on the type of Iris.
#Let's look at the features
print iris.feature_names
#Great, now the objective is to learn from this dataset so that, given a new Iris flower, we can best guess its type.
#Let's keep this simple to start with and train on the whole dataset.
```

In [4]:

```
#Fitting the Iris dataset using KNN
X,y = iris.data, iris.target
#Fitting KNN with 1 neighbor. This is generally a very bad idea since, on the training data, the closest neighbor
#to each point is the point itself, so we will definitely overfit. It's equivalent to hardcoding a label for each row in the dataset.
iris_knn = KNeighborsClassifier(n_neighbors = 1).fit(X,y)
#Now running predictions on the same dataset X. We should obviously get 100% accuracy
y_pred = iris_knn.predict(X)
#Confirming our assumption that if we run predictions on X, we will accurately predict all 150 labels
print np.all(y_pred==y)
iris_knn.score(X,y) #Score is 100%
```
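
As a quick sanity check on that perfect score (a minimal sketch, assuming 5-fold cross-validation as a reasonable default), `cross_val_score` evaluates k=1 on held-out folds instead of the training data:

```
#Score k=1 on held-out folds instead of the training data (5 folds is an arbitrary but common choice)
from sklearn.cross_validation import cross_val_score
cv_scores = cross_val_score(KNeighborsClassifier(n_neighbors=1), X, y, cv=5)
print cv_scores.mean() #Typically below the perfect training score above: that 100% was memorization
```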

In [23]:

```
#Let's run KNN with something more reasonable, neighbors = 5
iris_knn = KNeighborsClassifier(n_neighbors = 5).fit(X,y)
print iris_knn.score(X,y)
y_pred=iris_knn.predict(X)
```

In [27]:

```
#Let's run a quick PCA to reduce X down to 2D so we can visualize
from sklearn.decomposition import PCA
X_proj = PCA(2).fit_transform(X)
#Let's plot the actual labels and those predicted by KNN side-by-side for comparison
fig, ax = plt.subplots(1,2,figsize=(8,3))
ax[0].scatter(X_proj[:,0], X_proj[:,1], c=y, cmap=plt.cm.Paired)
ax[0].set_title('Actual labels')
ax[1].scatter(X_proj[:,0], X_proj[:,1], c=y_pred, cmap=plt.cm.Paired)
ax[1].set_title('Predicted labels')
plt.show()
```
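
As an aside (not in the original flow, but cheap to check), PCA's `explained_variance_ratio_` attribute tells us how much of the data's variance this 2D view actually preserves:

```
#How faithful is the 2D projection? Check the fraction of variance captured by the first two components.
pca = PCA(2).fit(X)
print pca.explained_variance_ratio_, pca.explained_variance_ratio_.sum()
```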

In [28]:

```
#OK let's now split the iris dataset into Training and Test sets so we can see how well
#the classifier generalizes
X_train, X_test, y_train, y_test = train_test_split(X,y)
print X_train.shape, X_test.shape
```

In [29]:

```
#Now let's train on X_train and predict on X_test
iris_knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = iris_knn.predict(X_test)
#Let's check the scores for X_train and X_test
print "n_neighbors=5, Training score", iris_knn.score(X_train, y_train)
print "n_neighbors=5, Test score", iris_knn.score(X_test, y_test)
#They're almost the same, so the model appears to generalize well.
#To confirm our understanding, let's rerun with neighbors=1. This should yield a perfect training score
#but a noticeably lower test score.
iris_knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
y_pred = iris_knn.predict(X_test)
#Let's check the scores for X_train and X_test
print "n_neighbors=1, Training score", iris_knn.score(X_train, y_train)
print "n_neighbors=1, Test score", iris_knn.score(X_test, y_test)
#As expected: a perfect training score but a lower test score. Reverting to 5 neighbors so we can plot a comparison figure
iris_knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = iris_knn.predict(X_test)
```

In [32]:

```
#Plotting test actuals and predicted for comparison
X_proj = PCA(2).fit_transform(X_test)
fig, ax = plt.subplots(1,2,figsize=(8,3))
ax[0].scatter(X_proj[:,0], X_proj[:,1], c=y_test, s=50, cmap=plt.cm.Paired)
ax[0].set_title('Test Actuals')
ax[1].scatter(X_proj[:,0], X_proj[:,1], c=y_pred, s=50, cmap=plt.cm.Paired)
ax[1].set_title('Test Predicted')
plt.show()
```

**PIMA Indians Diabetes Data (UCI ML Repo)**

Let's move on to bigger and better things. We'll analyze the Diabetes dataset available for download from the UCI Machine Learning Repository.

In [33]:

```
#Download the data from the UCI website using urllib
import urllib
url = ("https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data")
raw_data = urllib.urlopen(url)
#The file is a CSV, let's read it into a numpy array
#Note: not using Pandas to examine/clean the dataset at this point since this dataset is pretty well-cleansed.
diab = np.genfromtxt(raw_data, delimiter=",")
print diab.shape
#This dataset has 9 columns; the 9th appears to be the label: 1 for Diabetes, 0 for no Diabetes.
#Let's split into X,y
X,y = diab[:,:-1], diab[:,-1:].squeeze() #squeeze to flatten the labels into the vector y
print X.shape, y.shape
#Let's run the train/test split
X_train, X_test, y_train, y_test = train_test_split(X,y)
print X_train.shape, X_test.shape
```

In [34]:

```
#Great! now, let's run the kNN classifier with 3 neighbors and see how it does.
diab_knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_pred = diab_knn.predict(X_test)
y_train_pred = diab_knn.predict(X_train)
#Let's get the score summary
print "Results with 3 Neighbors"
print metrics.classification_report(y_test, y_pred, target_names=['No Diabetes', 'Diabetes'])
print metrics.classification_report(y_train, y_train_pred, target_names=['No Diabetes', 'Diabetes'])
#So we had a training accuracy of 84% but only a test accuracy of 72%
#The precision/recall metrics tell us, for each class (No Diabetes or Diabetes), what fraction of predictions were correct (see the confusion matrix below).
```
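
To make the precision/recall numbers concrete, a confusion matrix lays out the raw counts behind the report above (a small sketch using `metrics.confusion_matrix`):

```
#Rows = actual classes (No Diabetes, Diabetes), columns = predicted classes.
#Precision for a class is its diagonal entry over the column sum; recall is the diagonal over the row sum.
print metrics.confusion_matrix(y_test, y_pred)
```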

In [35]:

```
#OK, not so great so far. Let's see if we can improve the score by increasing neighbors.
diab_knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = diab_knn.predict(X_test)
y_train_pred = diab_knn.predict(X_train)
#Let's get the score summary
print "Results with 5 Neighbors"
print metrics.classification_report(y_test, y_pred, target_names=['No Diabetes', 'Diabetes'])
print metrics.classification_report(y_train, y_train_pred, target_names=['No Diabetes', 'Diabetes'])
#OK, we did ever so slightly better on the test set but noticeably worse on the Training set.
#If we continue to tune in this fashion, we will end up biasing our model towards the Test set.
#Trying even more values of neighbors (see the quick sweep after this cell), it turns out our first choice of neighbors=3 is actually the best.
#Is there any other setting we could play with?
```
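
Rather than re-running the cell by hand for each value, a quick sweep (a sketch; the candidate list is an arbitrary choice, and the numbers depend on the split) makes that comparison visible:

```
#Sweep a range of k values and print train/test accuracy for each
for k in [1, 3, 5, 7, 9, 15, 25]:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    print k, knn.score(X_train, y_train), knn.score(X_test, y_test)
```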

In [36]:

```
#Let's try the Ball Tree algorithm, just for kicks!
diab_knn = KNeighborsClassifier(n_neighbors=3, algorithm="ball_tree").fit(X_train, y_train)
y_pred = diab_knn.predict(X_test)
y_train_pred = diab_knn.predict(X_train)
#Let's get the score summary
print "Results with 3 Neighbors and the Ball Tree algo"
print metrics.classification_report(y_test, y_pred, target_names=['No Diabetes', 'Diabetes'])
print metrics.classification_report(y_train, y_train_pred, target_names=['No Diabetes', 'Diabetes'])
#No change. Looks like we've hit the ceiling with kNN; let's try other, more sophisticated supervised learning
#algorithms in a later exercise.
```
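
That said, one knob the cells above never turned is feature scaling. kNN's Euclidean distance is dominated by the features with the largest numeric ranges (e.g. glucose vs. pedigree here), so standardizing the columns can matter more than adding neighbors. A hedged sketch, assuming scikit-learn's `StandardScaler`; whether it actually helps on this split is left to the reader:

```
#Standardize features (zero mean, unit variance) before kNN, fitting the scaler on the
#training set only to avoid leaking test-set statistics
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
diab_knn_scaled = KNeighborsClassifier(n_neighbors=3).fit(scaler.transform(X_train), y_train)
print "Scaled-feature test score", diab_knn_scaled.score(scaler.transform(X_test), y_test)
```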

In [37]:

```
#Below is a plot_learning_curve function adapted from scikit-learn's examples. It lets us quickly visualize how
#well the model performs as a function of the number of samples we train on, which helps diagnose situations such as
#high variance or high bias.
#We'll call this function in the next segment.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    title : string
        Title for the chart.
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum y-values plotted.
    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds (defaults to 3).
        Specific cross-validation objects can be passed, see
        sklearn.cross_validation module for the list of possible objects.
    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    #Shade one standard deviation around the mean train/cv scores
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
```

In [38]:

```
#Let's try one last technique: creating a cross-validation set. As mentioned earlier, when you tune parameters
#based on Test results, you can end up biasing your model towards the Test set. Although this may not be an issue
#in many instances, you can create a cross-validation set to avoid it.
#Luckily scikit-learn has built-in packages that can help with this. We'll use a cross-validation generator that
#trains the model on splits picked from within the training set: on each iteration a different cross-validation (cv)
#subset is held out, and we control the number of iterations. Then a grid-search function evaluates the model on each
#of these cv/train splits and returns the parameter setting that gives the optimal result.
#OK, let's now follow these steps and see if it makes any difference to the result.
#80/20 test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
#Choose Estimator as KNN
estimator = KNeighborsClassifier(n_neighbors=3)
#Choose cross-validation generator - let's choose ShuffleSplit which randomly shuffles and selects Train and CV sets
#for each iteration. There are other methods like the KFold split.
from sklearn.cross_validation import ShuffleSplit
#cv = KFold(X_train.shape[0])
cv = ShuffleSplit(X_train.shape[0], n_iter=10, test_size=0.2, random_state=0)
#Apply the cross-validation iterator on the Training set using GridSearchCV. This will run the classifier on the
#different train/cv splits using parameters specified and return the model that has the best results
#Note that we are tuning based on the F1 score, 2PR/(P+R), where P is Precision and R is Recall. This may not always be
#the best score to tune our model on; I will explore this area further in a separate exercise. For now, we'll use F1.
from sklearn.grid_search import GridSearchCV
classifier = GridSearchCV(estimator=estimator, cv=cv, param_grid=dict(n_neighbors=[2,3,4,5]), scoring='f1')
#Also note that we're feeding multiple neighbors to the GridSearch to try out.
#We'll now fit the training dataset to this classifier
classifier.fit(X_train, y_train)
#Let's look at the best estimator that was found by GridSearchCV
print classifier.best_estimator_
#Let's look at the learning curve for our model as a function of the number of training examples. We'll call the
#plot_learning_curve function defined above, feeding it the estimator (the best estimator returned by GridSearchCV)
#and the train/cv sets.
#The function simply runs the estimator multiple times on subsets of the data provided and plots the train and cv scores.
from sklearn.learning_curve import learning_curve
title = 'Learning Curves (kNN, n_neighbors=%d)' % classifier.best_estimator_.n_neighbors
estimator = KNeighborsClassifier(n_neighbors=classifier.best_estimator_.n_neighbors)
plot_learning_curve(estimator, title, X_train, y_train, cv=cv)
plt.show()
#Great, we see that as the number of training samples increases, both the training and cv scores rise. This is
#a good sign. A high training score with a much lower cv score would indicate high variance (overfitting); low
#training and cv scores would indicate high bias (underfitting).
```

In [42]:

```
#Let's finally predict the Test labels and look at the final scores. Remember, we haven't touched our test set
#during tuning this time, so this gives a fair read on how well the model will generalize.
y_pred = classifier.predict(X_test)
print "Final Classification Report"
print metrics.classification_report(y_test, y_pred)
print "Generalization Accuracy: %.6f" %metrics.accuracy_score(y_test,y_pred)
```
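
For context on that number, a useful sanity check is the majority-class baseline: a classifier that always predicts the more common label gets the base rate for free (a quick sketch using the `y_test` split from earlier):

```
#Majority-class baseline: the accuracy of always predicting the more common label in y_test
baseline = max(y_test.mean(), 1 - y_test.mean())
print "Majority-class baseline accuracy: %.6f" % baseline
```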

So, given the features in the PIMA Indians Diabetes dataset from UCI, we can predict whether a person has diabetes with roughly **76%** accuracy.