You can download the data from here: https://www.kaggle.com/prathamtripathi/drug-classification
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
df <- read_csv("drug200.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## Age = col_double(),
## Sex = col_character(),
## BP = col_character(),
## Cholesterol = col_character(),
## Na_to_K = col_double(),
## Drug = col_character()
## )
# looking for NAs
apply(is.na(df), 2, sum)
## Age Sex BP Cholesterol Na_to_K Drug
## 0 0 0 0 0 0
Visualization of the target variable:
t <- table(df$Drug)
t <- as.data.frame(t)
colnames(t) <- c("Drug","count")
ggplot(t, aes(x=Drug, y=count, fill=Drug)) +
geom_bar(stat="identity", color="black") +
theme_minimal() +
geom_text(aes(label=count), vjust=-0.6, size=6) +
scale_fill_brewer(palette="Set1")
Remember, KNN classifies a point by its Euclidean distance to its neighbors, so the calculation requires numeric inputs:
df$Drug<-as.factor(df$Drug)
df$Sex<-as.factor(df$Sex)
df$Cholesterol<-as.numeric(as_factor(df$Cholesterol))
df$BP<-as.numeric(as_factor(df$BP))
df$Sex<-as.numeric(df$Sex)
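As a quick illustration (toy values, not rows from the dataset), the Euclidean distance between two encoded feature vectors is just the square root of the summed squared differences:

```r
# Euclidean distance between two encoded feature vectors (toy values)
a <- c(0.5, 1, 0.0)
b <- c(0.3, 0, 0.5)
d <- sqrt(sum((a - b)^2))
d  # same result as dist(rbind(a, b))
```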
Euclidean distance is sensitive to the scale of each variable, so we normalize the values of every feature to the range 0:1.
normalize <- function(x) {
return((x - min(x)) / (max(x) - min(x)))
}
df[,-c(6)]<-apply(df[,-c(6)],2,normalize)
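A quick check with made-up numbers confirms the behavior: the minimum maps to 0, the maximum to 1, and everything else lands in between.

```r
# min-max normalization maps the smallest value to 0 and the largest to 1
normalize <- function(x) (x - min(x)) / (max(x) - min(x))
normalize(c(15, 25, 35, 45))  # 0, 1/3, 2/3, 1
```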
# splitting with caTools
library(caTools)
set.seed(123)
df<-as.data.frame(df)
# note: sample.split() is normally given the outcome vector (e.g. df$Drug)
# so the split is stratified; passing the whole data frame returns a short
# logical vector that subset() recycles row-wise, so this split is a
# deterministic every-nth-row pattern
sample<-sample.split(df,SplitRatio = 0.85)
train<-subset(df,sample==T)
test<-subset(df,sample==F)
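For reference, the same 85/15 idea can be sketched in base R without caTools (the row counts here are illustrative and this is not the split used above, since it is not stratified by Drug):

```r
# plain random 85/15 split on 200 rows (illustrative; not stratified)
set.seed(123)
n <- 200
train_idx <- sample(n, size = round(0.85 * n))
test_idx  <- setdiff(seq_len(n), train_idx)
c(train = length(train_idx), test = length(test_idx))  # 170 and 30
```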
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
kable(test)
 | Age | Sex | BP | Cholesterol | Na_to_K | Drug |
---|---|---|---|---|---|---|
5 | 0.7796610 | 0 | 0.5 | 0 | 0.3681906 | DrugY |
11 | 0.5423729 | 0 | 0.5 | 0 | 0.1719307 | drugC |
17 | 0.9152542 | 1 | 0.5 | 1 | 0.1621740 | drugX |
23 | 0.5423729 | 1 | 0.5 | 1 | 0.7598662 | DrugY |
29 | 0.4067797 | 0 | 0.5 | 1 | 0.5137282 | DrugY |
35 | 0.6440678 | 1 | 1.0 | 0 | 0.2459191 | drugX |
41 | 0.9830508 | 0 | 1.0 | 0 | 0.4050285 | DrugY |
47 | 0.3728814 | 0 | 0.0 | 0 | 0.2133342 | drugA |
53 | 0.7966102 | 1 | 0.5 | 1 | 0.6540121 | DrugY |
59 | 0.7627119 | 1 | 1.0 | 1 | 0.1195197 | drugX |
65 | 0.7627119 | 0 | 0.0 | 0 | 0.2199637 | drugB |
71 | 0.9322034 | 1 | 0.0 | 0 | 0.2407280 | drugB |
77 | 0.3559322 | 0 | 0.0 | 0 | 0.1541372 | drugA |
83 | 0.2881356 | 0 | 0.5 | 0 | 0.1076678 | drugC |
89 | 0.3728814 | 0 | 0.0 | 1 | 0.5260492 | DrugY |
95 | 0.6949153 | 1 | 0.5 | 0 | 0.2735005 | DrugY |
101 | 0.2711864 | 1 | 0.0 | 1 | 0.1751829 | drugA |
107 | 0.1186441 | 1 | 1.0 | 0 | 0.1777472 | drugX |
113 | 0.3389831 | 1 | 0.5 | 1 | 0.0907186 | drugX |
119 | 0.2881356 | 0 | 0.0 | 1 | 0.1258052 | drugA |
125 | 0.6440678 | 0 | 0.0 | 1 | 0.1946964 | drugB |
131 | 0.9322034 | 0 | 1.0 | 0 | 0.4446807 | DrugY |
137 | 0.6779661 | 0 | 0.0 | 0 | 0.1472262 | drugB |
143 | 0.7627119 | 1 | 0.0 | 1 | 0.0735506 | drugB |
149 | 0.7796610 | 0 | 0.5 | 1 | 0.0334918 | drugX |
155 | 0.3728814 | 1 | 0.5 | 1 | 0.3269435 | DrugY |
161 | 0.2542373 | 0 | 1.0 | 0 | 0.1305272 | drugX |
167 | 0.7288136 | 0 | 0.5 | 0 | 0.6371881 | DrugY |
173 | 0.4067797 | 0 | 1.0 | 1 | 0.3426105 | DrugY |
179 | 0.4067797 | 1 | 1.0 | 0 | 0.3033335 | DrugY |
185 | 0.0508475 | 0 | 0.0 | 0 | 0.9668835 | DrugY |
191 | 0.7288136 | 1 | 0.0 | 0 | 0.3978360 | DrugY |
197 | 0.0169492 | 1 | 0.5 | 0 | 0.1794046 | drugC |
Sanity check: we’ve normalized our numeric features so that no single feature dominates the Euclidean distance, and we’ve coded our categorical features as numeric dummies so that they can be included in the distance calculations. Then we split our data for model evaluation.
In the next step we set up cross-validation. I chose 100 repeats because a recent article recommends this value (see Fränti and Sieranoja 2019). With number = 100 we also run 100-fold cross-validation. This process lets us determine the best value of k.
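Under the hood, each repeat of repeated k-fold CV simply partitions the training indices into k roughly equal groups; here is a minimal base-R sketch of how 100 folds over 167 training rows would be formed (caret does this internally, so this is illustration only):

```r
# partition 167 row indices into 100 roughly equal folds
set.seed(123)
n <- 167; k <- 100
folds <- split(sample(n), rep(seq_len(k), length.out = n))
length(folds)          # 100 folds
range(lengths(folds))  # each fold holds 1 or 2 observations
```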
library(rpart)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
fit_control<-trainControl(method = "repeatedcv",number = 100,
repeats=100)
set.seed(123)
model<-caret::train(Drug~.,data=train,method="knn",trControl=fit_control,tuneGrid=expand.grid(k=1:20))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
model
## k-Nearest Neighbors
##
## 167 samples
## 5 predictor
## 5 classes: 'drugA', 'drugB', 'drugC', 'drugX', 'DrugY'
##
## No pre-processing
## Resampling: Cross-Validated (100 fold, repeated 100 times)
## Summary of sample sizes: 163, 165, 165, 164, 165, 165, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.8572144 0.7198673
## 2 0.7845243 0.5955541
## 3 0.7666090 0.5689619
## 4 0.7592450 0.5556471
## 5 0.7661586 0.5714250
## 6 0.7465099 0.5362901
## 7 0.7408108 0.5213184
## 8 0.7126288 0.4815372
## 9 0.7193153 0.4871902
## 10 0.7150829 0.4793582
## 11 0.7089027 0.4689272
## 12 0.7171117 0.4723061
## 13 0.7191009 0.4712694
## 14 0.7174342 0.4607085
## 15 0.7163153 0.4557729
## 16 0.7135279 0.4411374
## 17 0.7012955 0.4139307
## 18 0.6988523 0.4008048
## 19 0.6724486 0.3543964
## 20 0.6580216 0.3254944
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 1.
plot(model)
library(class)
set.seed(4)
modelknn<-knn(train=train[,1:5],test = test[,1:5],cl=train$Drug,k=1)
# note: confusionMatrix(data, reference) expects predictions first; with the
# arguments in this order the printed "Prediction"/"Reference" axes are
# swapped, though the overall accuracy is unaffected
confusionMatrix(test$Drug,modelknn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction drugA drugB drugC drugX DrugY
## drugA 4 0 0 0 0
## drugB 1 3 0 0 1
## drugC 0 0 3 0 0
## drugX 0 0 0 6 1
## DrugY 0 0 2 1 11
##
## Overall Statistics
##
## Accuracy : 0.8182
## 95% CI : (0.6454, 0.9302)
## No Information Rate : 0.3939
## P-Value [Acc > NIR] : 7.571e-07
##
## Kappa : 0.755
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: drugA Class: drugB Class: drugC Class: drugX
## Sensitivity 0.8000 1.00000 0.60000 0.8571
## Specificity 1.0000 0.93333 1.00000 0.9615
## Pos Pred Value 1.0000 0.60000 1.00000 0.8571
## Neg Pred Value 0.9655 1.00000 0.93333 0.9615
## Prevalence 0.1515 0.09091 0.15152 0.2121
## Detection Rate 0.1212 0.09091 0.09091 0.1818
## Detection Prevalence 0.1212 0.15152 0.09091 0.2121
## Balanced Accuracy 0.9000 0.96667 0.80000 0.9093
## Class: DrugY
## Sensitivity 0.8462
## Specificity 0.8500
## Pos Pred Value 0.7857
## Neg Pred Value 0.8947
## Prevalence 0.3939
## Detection Rate 0.3333
## Detection Prevalence 0.4242
## Balanced Accuracy 0.8481
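The reported accuracy can be recomputed by hand from the confusion matrix above: the correct predictions sit on the diagonal, and accuracy is the diagonal sum over the total count.

```r
# rebuild the confusion matrix above and recompute overall accuracy
cm <- matrix(c(4, 0, 0, 0,  0,
               1, 3, 0, 0,  1,
               0, 0, 3, 0,  0,
               0, 0, 0, 6,  1,
               0, 0, 2, 1, 11),
             nrow = 5, byrow = TRUE)
sum(diag(cm)) / sum(cm)  # 27/33 = 0.8182, matching the Accuracy line
```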
Our predictive accuracy is 81.82 percent, as we see above. This is pretty good performance considering the simplicity of KNN.
In sum, the k-nearest neighbors classification approach is rather simple to understand and implement, yet it is very effective, as we can see. The training phase is very fast, and KNN makes no assumptions about the underlying data distribution, so it can be applied to a wide variety of problems. Of course, there are weaknesses as well. The first that comes to mind is that the choice of k is often arbitrary. Without scaling and encoding, KNN cannot handle nominal features and is sensitive to outliers. It also cannot work with missing data.
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
plot.df = data.frame(test[,1:5], predicted = modelknn)
plot.df1 = data.frame(x = plot.df$Na_to_K,
y = plot.df$Age,
predicted = plot.df$predicted)
find_hull = function(df) df[chull(df$x, df$y), ]
boundary = ddply(plot.df1, .variables = "predicted", .fun = find_hull)
ggplot(plot.df, aes(Na_to_K, Age, color = predicted, fill = predicted)) +
geom_point(size = 5) +
geom_polygon(data = boundary, aes(x,y), alpha = 0.5)