##################################################################
##
##     An Overview/Tutorial for Several Data Mining Methods in R
##
##     TOPICS:
##       
##       1) Creating test and training set
##       2) Chi^2 testing
##       3) kNN (k nearest neighbor)
##       4) Random Forest
##       5) Clustering: kmeans, EM,  hclust
##       6) Decision Trees 
##       7) Naive Bayes
##       8) SVM
##
###################################################################
## Dr. A. Gates
##
## NOTE: This particular tutorial is not focused on data cleaning
##      or data preprocessing/prep.
##      I have created several other tutorials that cover that area
##      with many examples.
##      All datasets have individual issues, concerns, and features,
##      and different methods require different data formats.
##      It is important to practice preparing dirty data as well. 
##
###################################################################
##
## The Data:
##          For this tutorial, I will use the simple R iris
##          dataset. This will allow us to focus on the methods
##          and not on data cleaning and prep. I will again note
##          that in the "real world" - if there is such a thing -
##          data cleaning, prep, and pre-processing can consume
##          more than 70% of coding and analysis time.
##
###################################################################

### The Libraries
library(datasets)
library(class) ## for knn
library(mlr) ## for vis
## Loading required package: ParamHelpers
library(ggplot2)
library(plyr) ## load this BEFORE dplyr
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lattice)
library(caret) 
## 
## Attaching package: 'caret'
## The following object is masked from 'package:mlr':
## 
##     train
library(e1071)
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:mlr':
## 
##     impute
library(ElemStatLearn)
## 
## Attaching package: 'ElemStatLearn'
## The following object is masked from 'package:plyr':
## 
##     ozone
library(gmodels)
library(GGally)
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(stringr)
#install.packages("e1071")
library(e1071)
#install.packages("naivebayes")
library(naivebayes)
library(mclust)
## Package 'mclust' version 5.4
## Type 'citation("mclust")' for citing this R package in publications.
library(cluster)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## install.packages("rpart")
## install.packages('rattle')
## install.packages('rpart.plot')
## install.packages('RColorBrewer')
## install.packages("Cairo")
library(rpart)
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
## 
## Attaching package: 'rattle'
## The following object is masked from 'package:randomForest':
## 
##     importance
library(rpart.plot)
library(RColorBrewer)
library(Cairo)
# install.packages("philentropy")
library(philentropy)
# install.packages("forcats")
library(forcats)
# install.packages("lsa")
library(lsa) #for cosine similarity
## Loading required package: SnowballC
# install.packages("igraph")
library(igraph)  #to create network of cos sim matrix
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following object is masked from 'package:class':
## 
##     knn
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
# install.packages("corrplot")
library(corrplot)
## corrplot 0.84 loaded
## install.packages("pastecs") ## for stats
library(pastecs)
## 
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
##install.packages("dplyr")
library(dplyr)
## install.packages("ggpubr")
library(ggpubr)
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:pastecs':
## 
##     extract
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:mclust':
## 
##     sim
## The following object is masked from 'package:randomForest':
## 
##     outlier
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
#############          The Data            ################################

data(iris)
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Species is the label
table(iris$Species)
## 
##     setosa versicolor  virginica 
##         50         50         50
(head(iris))
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
## Because we are doing kNN - we will shuffle (mix up) the 
## rows of the dataset. I will do this randomly. 

set.seed(9850) ## setting a seed will allow you to reproduce random results
## Create 150 random numbers between 0 and 1
(u_num <- runif(nrow(iris)))
##   [1] 0.749575882 0.997086017 0.652001954 0.432928278 0.332312413
##   [6] 0.865406471 0.179331242 0.478493654 0.295795314 0.664406579
##  [11] 0.711770326 0.780119344 0.135679218 0.081918849 0.631659831
##  [16] 0.529684246 0.017083371 0.947442812 0.288261390 0.010243757
##  [21] 0.889722764 0.099838558 0.976578544 0.626311010 0.818966399
##  [26] 0.814217131 0.228938267 0.393345113 0.963064569 0.158522949
##  [31] 0.365362736 0.815672850 0.160475621 0.325075457 0.956077240
##  [36] 0.221829941 0.240358861 0.654842327 0.833372210 0.223317408
##  [41] 0.567319461 0.645643225 0.046593236 0.166861771 0.095600741
##  [46] 0.280708688 0.274372719 0.306021348 0.466236177 0.714833845
##  [51] 0.819010729 0.413965707 0.033480894 0.163171474 0.614522173
##  [56] 0.625591099 0.987969234 0.590955717 0.291943232 0.848013638
##  [61] 0.239626787 0.227112662 0.014037226 0.235208923 0.348486998
##  [66] 0.752009868 0.397804687 0.173633337 0.115410871 0.096681800
##  [71] 0.772989324 0.579852495 0.592361025 0.317120232 0.265488403
##  [76] 0.736040238 0.725467216 0.866390575 0.774121353 0.041318237
##  [81] 0.717216028 0.618082392 0.025139597 0.958615328 0.237069942
##  [86] 0.157850945 0.919268952 0.373450233 0.612524643 0.580367921
##  [91] 0.040488273 0.606024550 0.253320484 0.617464615 0.670980009
##  [96] 0.498678416 0.539632167 0.620603637 0.743601094 0.707388356
## [101] 0.881044094 0.140365195 0.009580307 0.641032226 0.919105340
## [106] 0.968964117 0.157093387 0.494844032 0.453633379 0.211360556
## [111] 0.803511472 0.522870498 0.537666918 0.922744670 0.975340607
## [116] 0.223733670 0.371116180 0.035312952 0.322225733 0.157145070
## [121] 0.168501345 0.063806432 0.489302015 0.968255649 0.474587146
## [126] 0.269455400 0.451698734 0.220212331 0.986794680 0.775118793
## [131] 0.658783297 0.088049627 0.815227871 0.743987815 0.083619478
## [136] 0.465915223 0.759169244 0.886477632 0.603841368 0.155911440
## [141] 0.747532928 0.689924665 0.986163957 0.512607964 0.311356318
## [146] 0.644091529 0.804400820 0.133151028 0.757746826 0.420075185
## Use these random numbers (u_num) to shuffle the iris rows into 
## a new data frame. Ordering by u_num puts the rows in random order. 
NewIris <- iris[order(u_num),]
(head(NewIris, n=15))
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 103          7.1         3.0          5.9         2.1  virginica
## 20           5.1         3.8          1.5         0.3     setosa
## 63           6.0         2.2          4.0         1.0 versicolor
## 17           5.4         3.9          1.3         0.4     setosa
## 83           5.8         2.7          3.9         1.2 versicolor
## 53           6.9         3.1          4.9         1.5 versicolor
## 118          7.7         3.8          6.7         2.2  virginica
## 91           5.5         2.6          4.4         1.2 versicolor
## 80           5.7         2.6          3.5         1.0 versicolor
## 43           4.4         3.2          1.3         0.2     setosa
## 122          5.6         2.8          4.9         2.0  virginica
## 14           4.3         3.0          1.1         0.1     setosa
## 135          6.1         2.6          5.6         1.4  virginica
## 132          7.9         3.8          6.4         2.0  virginica
## 45           5.1         3.8          1.9         0.4     setosa
######### Next - normalize the data attributes ######
##
## Normalization is important so that no attributes overpower
## other attributes.
## Warning: do not normalize blindly. Think about what your goal is
## and what the nature of the data is. 

(summary(NewIris))
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
## Notice that Sepal.Length ranges from 4.3 to 7.9, while
## Petal.Width only ranges from .1 to 2.5. 
## Normalization will re-scale all attributes so that all values
## have the same max and min. There are many ways to scale - 
## such as min-max or z-scores (called standardizing); a z-score
## sketch appears just below. 
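
## As an aside, here is a z-score (standardizing) version of the same
## idea - just a sketch for contrast (base R's scale() does the same
## thing). We stick with min-max below.
Z_Score_function <- function(x){
  return( (x - mean(x)) / sd(x) )
}
## e.g., Z_Score_function(c(1,2,3)) gives -1, 0, 1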

## Create a function to use min-max to re-scale/normalize
## the numerical attributes

Min_Max_function <- function(x){
  return(  (x - min(x)) /(max(x) - min(x))   )
}

## Let's check this function first on a few numbers
## By hand:  (1 - 1)/ (3 - 1) = 0, (2-1)/(3-1) = 1/2,  (3-1)/(3-1)=1
(Min_Max_function(c(1,2,3)))
## [1] 0.0 0.5 1.0
## You can see that the output is correct and is 0, .5, and 1
## So, now we know that the Min_Max_function works.

## Next, apply the Min_Max to all the NewIris data.
Norm_Iris <- as.data.frame(lapply(NewIris[,c(1,2,3,4)], Min_Max_function))
Iris_Labels <- NewIris[,5]
(head(Norm_Iris))
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1    0.7777778  0.41666667   0.83050847  0.83333333
## 2    0.2222222  0.75000000   0.08474576  0.08333333
## 3    0.4722222  0.08333333   0.50847458  0.37500000
## 4    0.3055556  0.79166667   0.05084746  0.12500000
## 5    0.4166667  0.29166667   0.49152542  0.45833333
## 6    0.7222222  0.45833333   0.66101695  0.58333333
(head(Iris_Labels))
## [1] virginica  setosa     versicolor setosa     versicolor versicolor
## Levels: setosa versicolor virginica
## This looks good!
## Now, let's add back the labels
Spec <- NewIris$Species
(head(ReadyIrisDF <- data.frame(Norm_Iris, Species=Spec)))
##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1    0.7777778  0.41666667   0.83050847  0.83333333  virginica
## 2    0.2222222  0.75000000   0.08474576  0.08333333     setosa
## 3    0.4722222  0.08333333   0.50847458  0.37500000 versicolor
## 4    0.3055556  0.79166667   0.05084746  0.12500000     setosa
## 5    0.4166667  0.29166667   0.49152542  0.45833333 versicolor
## 6    0.7222222  0.45833333   0.66101695  0.58333333 versicolor
## Now we have a dataset we can work with. It is called ReadyIrisDF

## The next step is to create a Testset and a Trainset from this dataset.
## To create the Testset, randomly grab about 1/5 of the data.
## There are many ways to do this. I will use sample (a stratified
## alternative is sketched after the split below).
(n <- round(nrow(ReadyIrisDF)/5))
## [1] 30
(s <- sample(1:nrow(ReadyIrisDF), n))
##  [1] 149  89  20  17  67  71  69  99  18  25   8  91  84 145  35  11   2
## [18] 109  97 130 103   6  58 120  39  38  31 142 122   3
## The test set is the sample
IrisTestSet <- ReadyIrisDF[s,]
## The training set is everything else (the rows not sampled)
IrisTrainSet <- ReadyIrisDF[-s,]
## Have a look...
(head(IrisTestSet,n=5))
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 149    0.5555556   0.5416667   0.62711864   0.6250000 versicolor
## 89     0.6111111   0.3333333   0.61016949   0.5833333 versicolor
## 20     0.1388889   0.4166667   0.06779661   0.0000000     setosa
## 17     0.2222222   0.7083333   0.08474576   0.1250000     setosa
## 67     0.5277778   0.3333333   0.64406780   0.7083333  virginica
(head(IrisTrainSet,n=5))
##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1    0.7777778   0.4166667   0.83050847   0.8333333  virginica
## 4    0.3055556   0.7916667   0.05084746   0.1250000     setosa
## 5    0.4166667   0.2916667   0.49152542   0.4583333 versicolor
## 7    0.9444444   0.7500000   0.96610169   0.8750000  virginica
## 9    0.3888889   0.2500000   0.42372881   0.3750000 versicolor
### OK - now we have a Test and Train set:  IrisTestSet and IrisTrainSet
## The test and train sets need to be separated into just the
## attributes and just the labels. 

IrisTestSet_numonly <- IrisTestSet[,-5]
IrisTestSet_labels <- IrisTestSet[,5]
IrisTrainSet_numonly <- IrisTrainSet[,-5]
IrisTrainSet_labels <- IrisTrainSet[,5]
(head(IrisTestSet_numonly ))
##     Sepal.Length Sepal.Width Petal.Length Petal.Width
## 149    0.5555556   0.5416667   0.62711864   0.6250000
## 89     0.6111111   0.3333333   0.61016949   0.5833333
## 20     0.1388889   0.4166667   0.06779661   0.0000000
## 17     0.2222222   0.7083333   0.08474576   0.1250000
## 67     0.5277778   0.3333333   0.64406780   0.7083333
## 71     0.6666667   0.5416667   0.79661017   0.8333333
(head(IrisTestSet_labels))
## [1] versicolor versicolor setosa     setosa     virginica  virginica 
## Levels: setosa versicolor virginica
(head(IrisTrainSet_numonly ))
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1    0.77777778   0.4166667   0.83050847  0.83333333
## 4    0.30555556   0.7916667   0.05084746  0.12500000
## 5    0.41666667   0.2916667   0.49152542  0.45833333
## 7    0.94444444   0.7500000   0.96610169  0.87500000
## 9    0.38888889   0.2500000   0.42372881  0.37500000
## 10   0.02777778   0.5000000   0.05084746  0.04166667
(head(IrisTrainSet_labels))
## [1] virginica  setosa     versicolor virginica  versicolor setosa    
## Levels: setosa versicolor virginica
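
## Aside: sample() gives a simple random split. If you want the split to
## preserve the class proportions (a stratified split), caret - loaded
## above - provides createDataPartition. A sketch, not used below:
strat_idx <- caret::createDataPartition(ReadyIrisDF$Species, p = 0.8, list = FALSE)
StratTrain <- ReadyIrisDF[strat_idx, ]
StratTest  <- ReadyIrisDF[-strat_idx, ]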
########################################################################
############    SET UP THE kNN MODEL ###################################
########################################################################
## https://www.analyticsvidhya.com/blog/2015/08/learning-concept-knn-algorithms-programming/
## Choose k - the number of nearest neighbors you want to consider.
## sqrt(nrow) is a good starting point for k.
## However, testing k directly is best - see the sketch below.

## NOTICE: Here, I am calling knn from its package (class) explicitly.
## This matters because igraph also exports a knn (see the masking
## messages above), so class::knn guarantees we get the right function.
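
## One way to test k (a sketch, using the train/test split built above):
## try a range of k values and record the test accuracy of each.
k_values <- 1:20
knn_acc <- sapply(k_values, function(k_try){
  pred <- class::knn(train = IrisTrainSet_numonly,
                     test  = IrisTestSet_numonly,
                     cl    = IrisTrainSet_labels, k = k_try)
  mean(pred == IrisTestSet_labels)
})
## plot(k_values, knn_acc, type = "b")  ## accuracy vs. k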
k <- round(sqrt(nrow(iris)))
kNN_fit <- class::knn(train=IrisTrainSet_numonly, test=IrisTestSet_numonly, 
               cl=IrisTrainSet_labels,k = k, prob=TRUE)
print(kNN_fit)
##  [1] versicolor versicolor setosa     setosa     virginica  virginica 
##  [7] virginica  setosa     versicolor versicolor versicolor versicolor
## [13] versicolor virginica  setosa     virginica  setosa     versicolor
## [19] virginica  versicolor versicolor versicolor versicolor virginica 
## [25] setosa     versicolor versicolor setosa     virginica  versicolor
## attr(,"prob")
##  [1] 0.7500000 0.9166667 1.0000000 1.0000000 0.8333333 1.0000000 1.0000000
##  [8] 1.0000000 0.5833333 0.7500000 1.0000000 1.0000000 0.9166667 1.0000000
## [15] 1.0000000 0.7500000 1.0000000 0.9166667 1.0000000 0.9166667 1.0000000
## [22] 0.7500000 1.0000000 0.7500000 1.0000000 1.0000000 1.0000000 1.0000000
## [29] 0.9166667 0.9166667
## Levels: setosa versicolor virginica
## Check the classification accuracy
(table(kNN_fit, IrisTestSet_labels))
##             IrisTestSet_labels
## kNN_fit      setosa versicolor virginica
##   setosa          7          0         0
##   versicolor      0         15         0
##   virginica       0          0         8
## Very good prediction!
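## A single overall accuracy number from the same comparison:
(kNN_accuracy <- mean(kNN_fit == IrisTestSet_labels))
## Given the table above, this works out to 30/30 = 1 for this split.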
## prop.chisq: If TRUE, chi-square contribution of each cell will be included
## chisq: If TRUE, the results of a chi-square test will be included
CrossTable(x = IrisTestSet$Species, y = kNN_fit,prop.chisq=F) 
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  30 
## 
##  
##                     | kNN_fit 
## IrisTestSet$Species |     setosa | versicolor |  virginica |  Row Total | 
## --------------------|------------|------------|------------|------------|
##              setosa |          7 |          0 |          0 |          7 | 
##                     |      1.000 |      0.000 |      0.000 |      0.233 | 
##                     |      1.000 |      0.000 |      0.000 |            | 
##                     |      0.233 |      0.000 |      0.000 |            | 
## --------------------|------------|------------|------------|------------|
##          versicolor |          0 |         15 |          0 |         15 | 
##                     |      0.000 |      1.000 |      0.000 |      0.500 | 
##                     |      0.000 |      1.000 |      0.000 |            | 
##                     |      0.000 |      0.500 |      0.000 |            | 
## --------------------|------------|------------|------------|------------|
##           virginica |          0 |          0 |          8 |          8 | 
##                     |      0.000 |      0.000 |      1.000 |      0.267 | 
##                     |      0.000 |      0.000 |      1.000 |            | 
##                     |      0.000 |      0.000 |      0.267 |            | 
## --------------------|------------|------------|------------|------------|
##        Column Total |          7 |         15 |          8 |         30 | 
##                     |      0.233 |      0.500 |      0.267 |            | 
## --------------------|------------|------------|------------|------------|
## 
## 
## See below for notes about Chi^2. CrossTable comes from the gmodels
## package; see ?CrossTable for the full set of options.

###########################################################################
########  A note about Chi^2 ###########################################
###########################################################################
## Chi^2 is a test of independence. When variables are not independent
## then assuming independence or using all variables may not be ideal.
## Good Chi^2 YouTube Video
## https://www.youtube.com/watch?v=1RecjImtImY
## Chi^2 is for nominal or categorical variables.
## You can use Chi^2 to determine if two variables are independent
## In this case, Ho: variables are independent
## You can also use Chi^2 to test whether variables differ significantly
## from given or expected values or proportions. 
## In that case, Ho: no significant difference.
############################################################################
## Example: Using Chi^2 on iris data

#(IrisTable <- table(ReadyIrisDF$Sepal.Length, ReadyIrisDF$Petal.Length))

## Chi^2 is actually intended for use with nominal categorical data
## Therefore, we must first discretize...
(head(NewIrisDF <- ReadyIrisDF))
##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1    0.7777778  0.41666667   0.83050847  0.83333333  virginica
## 2    0.2222222  0.75000000   0.08474576  0.08333333     setosa
## 3    0.4722222  0.08333333   0.50847458  0.37500000 versicolor
## 4    0.3055556  0.79166667   0.05084746  0.12500000     setosa
## 5    0.4166667  0.29166667   0.49152542  0.45833333 versicolor
## 6    0.7222222  0.45833333   0.66101695  0.58333333 versicolor
## CatSepLen stands for categorical Sepal Length
## Make sure the new column does not already exist
## (assigning NULL removes it if it does)
NewIrisDF$CatSepLen <- NULL
## Cut the current numerical values into 3 equal-width bins. The use of 3
## is arbitrary; you can cut into as many bins as you wish.
NewIrisDF$CatSepLen=cut(NewIrisDF$Sepal.Length, breaks=3,labels=c("Small", "Medium", "Large"))
NewIrisDF$CatPetLen=cut(NewIrisDF$Petal.Length, breaks=3,labels=c("Small", "Medium", "Large"))
(head(NewIrisDF))
##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species CatSepLen
## 1    0.7777778  0.41666667   0.83050847  0.83333333  virginica     Large
## 2    0.2222222  0.75000000   0.08474576  0.08333333     setosa     Small
## 3    0.4722222  0.08333333   0.50847458  0.37500000 versicolor    Medium
## 4    0.3055556  0.79166667   0.05084746  0.12500000     setosa     Small
## 5    0.4166667  0.29166667   0.49152542  0.45833333 versicolor    Medium
## 6    0.7222222  0.45833333   0.66101695  0.58333333 versicolor     Large
##   CatPetLen
## 1     Large
## 2     Small
## 3    Medium
## 4     Small
## 5    Medium
## 6    Medium
## Look at Chi^2 for CatSepLen and CatPetLen
## Ho: Sepal length and petal length are independent
## Ha: Sepal length and petal length are NOT independent
(IrisTable2 <- table(NewIrisDF$CatSepLen, NewIrisDF$CatPetLen))
##         
##          Small Medium Large
##   Small     47     12     0
##   Medium     3     39    29
##   Large      0      3    17
(CHI <- chisq.test(IrisTable2))
## 
##  Pearson's Chi-squared test
## 
## data:  IrisTable2
## X-squared = 115.98, df = 4, p-value < 2.2e-16
## Given that the p-value is effectively 0,
## we reject Ho. 
## The result is significant, so these variables are *not* independent.
## In R, we can use attributes() to see the named components of an object
(attributes(CHI))
## $names
## [1] "statistic" "parameter" "p.value"   "method"    "data.name" "observed" 
## [7] "expected"  "residuals" "stdres"   
## 
## $class
## [1] "htest"
(CHI$p.value)
## [1] 3.847931e-24
(CHI$expected)
##         
##              Small Medium     Large
##   Small  19.666667  21.24 18.093333
##   Medium 23.666667  25.56 21.773333
##   Large   6.666667   7.20  6.133333
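## The expected counts follow the usual formula:
## expected = (row total * column total) / grand total.
## A quick by-hand check (this matches CHI$expected above):
rs <- rowSums(IrisTable2)
cs <- colSums(IrisTable2)
(outer(rs, cs) / sum(IrisTable2))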
## We can also look at CatSepLen and Species
(IrisTable3 <- table(NewIrisDF$CatSepLen, NewIrisDF$Species))
##         
##          setosa versicolor virginica
##   Small      47         11         1
##   Medium      3         36        32
##   Large       0          3        17
(chisq.test(IrisTable3))
## 
##  Pearson's Chi-squared test
## 
## data:  IrisTable3
## X-squared = 111.63, df = 4, p-value < 2.2e-16
## Also significant - Sepal Length is associated with (and therefore not
## independent of) Species.



#################  Visualize kNN #######################################
## First, visualize the data :
ggpairs(iris)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Our kNN model is called kNN_fit

(plotDF <- data.frame(IrisTestSet_numonly, predicted = kNN_fit))
##     Sepal.Length Sepal.Width Petal.Length Petal.Width  predicted
## 149    0.5555556  0.54166667   0.62711864  0.62500000 versicolor
## 89     0.6111111  0.33333333   0.61016949  0.58333333 versicolor
## 20     0.1388889  0.41666667   0.06779661  0.00000000     setosa
## 17     0.2222222  0.70833333   0.08474576  0.12500000     setosa
## 67     0.5277778  0.33333333   0.64406780  0.70833333  virginica
## 71     0.6666667  0.54166667   0.79661017  0.83333333  virginica
## 69     0.9444444  0.41666667   0.86440678  0.91666667  virginica
## 99     0.1111111  0.50000000   0.05084746  0.04166667     setosa
## 18     0.5277778  0.08333333   0.59322034  0.58333333 versicolor
## 25     0.4722222  0.58333333   0.59322034  0.62500000 versicolor
## 8      0.3333333  0.25000000   0.57627119  0.45833333 versicolor
## 91     0.3333333  0.16666667   0.45762712  0.37500000 versicolor
## 84     0.1666667  0.16666667   0.38983051  0.37500000 versicolor
## 145    0.4166667  0.33333333   0.69491525  0.95833333  virginica
## 35     0.1944444  0.50000000   0.03389831  0.04166667     setosa
## 11     0.3611111  0.33333333   0.66101695  0.79166667  virginica
## 2      0.2222222  0.75000000   0.08474576  0.08333333     setosa
## 109    0.6944444  0.33333333   0.64406780  0.54166667 versicolor
## 97     0.6666667  0.41666667   0.71186441  0.91666667  virginica
## 130    0.2500000  0.29166667   0.49152542  0.54166667 versicolor
## 103    0.3611111  0.29166667   0.54237288  0.50000000 versicolor
## 6      0.7222222  0.45833333   0.66101695  0.58333333 versicolor
## 58     0.3611111  0.37500000   0.44067797  0.50000000 versicolor
## 120    0.8055556  0.41666667   0.81355932  0.62500000  virginica
## 39     0.1944444  0.58333333   0.10169492  0.12500000     setosa
## 38     0.4444444  0.41666667   0.54237288  0.58333333 versicolor
## 31     0.4166667  0.29166667   0.52542373  0.37500000 versicolor
## 142    0.2500000  0.58333333   0.06779661  0.04166667     setosa
## 122    0.6111111  0.50000000   0.69491525  0.79166667  virginica
## 3      0.4722222  0.08333333   0.50847458  0.37500000 versicolor
# First, use the convex hull to find the boundary points of each predicted class
(plotDF2 <- data.frame(x = plotDF$Sepal.Length, 
                       y = plotDF$Sepal.Width, 
                       predicted = plotDF$predicted))
##            x          y  predicted
## 1  0.5555556 0.54166667 versicolor
## 2  0.6111111 0.33333333 versicolor
## 3  0.1388889 0.41666667     setosa
## 4  0.2222222 0.70833333     setosa
## 5  0.5277778 0.33333333  virginica
## 6  0.6666667 0.54166667  virginica
## 7  0.9444444 0.41666667  virginica
## 8  0.1111111 0.50000000     setosa
## 9  0.5277778 0.08333333 versicolor
## 10 0.4722222 0.58333333 versicolor
## 11 0.3333333 0.25000000 versicolor
## 12 0.3333333 0.16666667 versicolor
## 13 0.1666667 0.16666667 versicolor
## 14 0.4166667 0.33333333  virginica
## 15 0.1944444 0.50000000     setosa
## 16 0.3611111 0.33333333  virginica
## 17 0.2222222 0.75000000     setosa
## 18 0.6944444 0.33333333 versicolor
## 19 0.6666667 0.41666667  virginica
## 20 0.2500000 0.29166667 versicolor
## 21 0.3611111 0.29166667 versicolor
## 22 0.7222222 0.45833333 versicolor
## 23 0.3611111 0.37500000 versicolor
## 24 0.8055556 0.41666667  virginica
## 25 0.1944444 0.58333333     setosa
## 26 0.4444444 0.41666667 versicolor
## 27 0.4166667 0.29166667 versicolor
## 28 0.2500000 0.58333333     setosa
## 29 0.6111111 0.50000000  virginica
## 30 0.4722222 0.08333333 versicolor
## chull() returns the row indices of the points that lie on the convex hull
find_hull <- function(df) df[chull(df$x, df$y), ]

boundary <- ddply(plotDF2, .variables = "predicted", .fun = find_hull)

ggplot(plotDF, aes(Sepal.Length, Sepal.Width, color = predicted, fill = predicted)) + 
  geom_point(size = 5) + 
  geom_polygon(data = boundary, aes(x,y), alpha = 0.5)

######### Other online tutorials for kNN -------------
## https://kevinzakka.github.io/2016/07/13/k-nearest-neighbor/
## https://www.r-bloggers.com/k-nearest-neighbor-step-by-step-tutorial/
## http://michael.hahsler.net/SMU/EMIS7332/R/viz_classifier.html

#######################################################################
############               RANDOM FOREST                   ############
#######################################################################

## About RF
## Ensemble learning is a type of supervised learning technique:
## we generate multiple models on a training dataset and combine
## their outputs (majority vote for classification, averaging for
## regression) to build a stronger model.
## RF is an ensemble of Decision Trees (DT).
## Aggregating the DTs helps to reduce the variance and improve
## the performance of the overall model.
## This also helps to avoid overfitting.

## We will again use the training and testing sets from above.
## To save time when you plan to try out many ML methods
## (SVM, kNN, RF, DT, etc.), create a clean training and testing
## set once, and then reuse the same sets to experiment with all
## of the methods.

## We have the following clean and *normalized* data:
(head(IrisTestSet,n=15))
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 149    0.5555556  0.54166667   0.62711864  0.62500000 versicolor
## 89     0.6111111  0.33333333   0.61016949  0.58333333 versicolor
## 20     0.1388889  0.41666667   0.06779661  0.00000000     setosa
## 17     0.2222222  0.70833333   0.08474576  0.12500000     setosa
## 67     0.5277778  0.33333333   0.64406780  0.70833333  virginica
## 71     0.6666667  0.54166667   0.79661017  0.83333333  virginica
## 69     0.9444444  0.41666667   0.86440678  0.91666667  virginica
## 99     0.1111111  0.50000000   0.05084746  0.04166667     setosa
## 18     0.5277778  0.08333333   0.59322034  0.58333333 versicolor
## 25     0.4722222  0.58333333   0.59322034  0.62500000 versicolor
## 8      0.3333333  0.25000000   0.57627119  0.45833333 versicolor
## 91     0.3333333  0.16666667   0.45762712  0.37500000 versicolor
## 84     0.1666667  0.16666667   0.38983051  0.37500000 versicolor
## 145    0.4166667  0.33333333   0.69491525  0.95833333  virginica
## 35     0.1944444  0.50000000   0.03389831  0.04166667     setosa
(head(IrisTrainSet,n=15))
##    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1    0.77777778   0.4166667   0.83050847  0.83333333  virginica
## 4    0.30555556   0.7916667   0.05084746  0.12500000     setosa
## 5    0.41666667   0.2916667   0.49152542  0.45833333 versicolor
## 7    0.94444444   0.7500000   0.96610169  0.87500000  virginica
## 9    0.38888889   0.2500000   0.42372881  0.37500000 versicolor
## 10   0.02777778   0.5000000   0.05084746  0.04166667     setosa
## 12   0.00000000   0.4166667   0.01694915  0.00000000     setosa
## 13   0.50000000   0.2500000   0.77966102  0.54166667  virginica
## 14   1.00000000   0.7500000   0.91525424  0.79166667  virginica
## 15   0.22222222   0.7500000   0.15254237  0.12500000     setosa
## 16   0.36111111   0.2083333   0.49152542  0.41666667 versicolor
## 19   0.61111111   0.4166667   0.71186441  0.79166667  virginica
## 21   0.41666667   0.2916667   0.69491525  0.75000000  virginica
## 22   0.72222222   0.4583333   0.74576271  0.83333333  virginica
## 23   0.16666667   0.2083333   0.59322034  0.66666667  virginica
(head(IrisTestSet_numonly ))
##     Sepal.Length Sepal.Width Petal.Length Petal.Width
## 149    0.5555556   0.5416667   0.62711864   0.6250000
## 89     0.6111111   0.3333333   0.61016949   0.5833333
## 20     0.1388889   0.4166667   0.06779661   0.0000000
## 17     0.2222222   0.7083333   0.08474576   0.1250000
## 67     0.5277778   0.3333333   0.64406780   0.7083333
## 71     0.6666667   0.5416667   0.79661017   0.8333333
(head(IrisTestSet_labels))
## [1] versicolor versicolor setosa     setosa     virginica  virginica 
## Levels: setosa versicolor virginica
(head(IrisTrainSet_numonly ))
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1    0.77777778   0.4166667   0.83050847  0.83333333
## 4    0.30555556   0.7916667   0.05084746  0.12500000
## 5    0.41666667   0.2916667   0.49152542  0.45833333
## 7    0.94444444   0.7500000   0.96610169  0.87500000
## 9    0.38888889   0.2500000   0.42372881  0.37500000
## 10   0.02777778   0.5000000   0.05084746  0.04166667
(head(IrisTrainSet_labels))
## [1] virginica  setosa     versicolor virginica  versicolor setosa    
## Levels: setosa versicolor virginica
#######  Set up Random Forest -----------------

Iris_fit_RF <- randomForest(Species ~ . , data = IrisTrainSet)
print(Iris_fit_RF)
## 
## Call:
##  randomForest(formula = Species ~ ., data = IrisTrainSet) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 7.5%
## Confusion matrix:
##            setosa versicolor virginica class.error
## setosa         43          0         0   0.0000000
## versicolor      0         31         4   0.1142857
## virginica       0          5        37   0.1190476
pred_RF<-predict(Iris_fit_RF, IrisTestSet_numonly) 
(table(pred_RF, IrisTestSet_labels))
##             IrisTestSet_labels
## pred_RF      setosa versicolor virginica
##   setosa          7          0         0
##   versicolor      0         15         0
##   virginica       0          0         8
(attributes(Iris_fit_RF))
## $names
##  [1] "call"            "type"            "predicted"      
##  [4] "err.rate"        "confusion"       "votes"          
##  [7] "oob.times"       "classes"         "importance"     
## [10] "importanceSD"    "localImportance" "proximity"      
## [13] "ntree"           "mtry"            "forest"         
## [16] "y"               "test"            "inbag"          
## [19] "terms"          
## 
## $class
## [1] "randomForest.formula" "randomForest"
(Iris_fit_RF$confusion)
##            setosa versicolor virginica class.error
## setosa         43          0         0   0.0000000
## versicolor      0         31         4   0.1142857
## virginica       0          5        37   0.1190476
(Iris_fit_RF$classes)
## [1] "setosa"     "versicolor" "virginica"
#########   vis ---------------------------------
## Number of nodes in the trees in the RF. 
hist(treesize(Iris_fit_RF))

## Which variables were most important? ---------------
varImpPlot(Iris_fit_RF)
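
## The numeric scores behind the plot. Note that rattle masks importance
## from randomForest (see the load messages above), so call it with the
## namespace to be safe:
(randomForest::importance(Iris_fit_RF))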

## Here we see that the Petal attributes are more important than the
## Sepal attributes.
## We can remove the Sepal attributes from the RF to see whether that
## improves the prediction.
(head(IrisTrainSetReduced <- IrisTrainSet[,-c(1,2)]))
##    Petal.Length Petal.Width    Species
## 1    0.83050847  0.83333333  virginica
## 4    0.05084746  0.12500000     setosa
## 5    0.49152542  0.45833333 versicolor
## 7    0.96610169  0.87500000  virginica
## 9    0.42372881  0.37500000 versicolor
## 10   0.05084746  0.04166667     setosa
(head(IrisTestSet_numonly_Reduced <- IrisTestSet_numonly[,-c(1,2)]))
##     Petal.Length Petal.Width
## 149   0.62711864   0.6250000
## 89    0.61016949   0.5833333
## 20    0.06779661   0.0000000
## 17    0.08474576   0.1250000
## 67    0.64406780   0.7083333
## 71    0.79661017   0.8333333
Iris_fit_RF2 <- randomForest(Species ~ . , data = IrisTrainSetReduced)
print(Iris_fit_RF2)
## 
## Call:
##  randomForest(formula = Species ~ ., data = IrisTrainSetReduced) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 6.67%
## Confusion matrix:
##            setosa versicolor virginica class.error
## setosa         43          0         0   0.0000000
## versicolor      0         31         4   0.1142857
## virginica       0          4        38   0.0952381
pred_RF2<-predict(Iris_fit_RF2, IrisTestSet_numonly_Reduced) 
(table(pred_RF2, IrisTestSet_labels))
##             IrisTestSet_labels
## pred_RF2     setosa versicolor virginica
##   setosa          7          0         0
##   versicolor      0         15         0
##   virginica       0          0         8
(attributes(Iris_fit_RF2))
## $names
##  [1] "call"            "type"            "predicted"      
##  [4] "err.rate"        "confusion"       "votes"          
##  [7] "oob.times"       "classes"         "importance"     
## [10] "importanceSD"    "localImportance" "proximity"      
## [13] "ntree"           "mtry"            "forest"         
## [16] "y"               "test"            "inbag"          
## [19] "terms"          
## 
## $class
## [1] "randomForest.formula" "randomForest"
## Compare the two RF options....
(Iris_fit_RF2$confusion)
##            setosa versicolor virginica class.error
## setosa         43          0         0   0.0000000
## versicolor      0         31         4   0.1142857
## virginica       0          4        38   0.0952381
(Iris_fit_RF$confusion)
##            setosa versicolor virginica class.error
## setosa         43          0         0   0.0000000
## versicolor      0         31         4   0.1142857
## virginica       0          5        37   0.1190476
## The test-set predictions are identical, and the OOB confusion matrices
## are nearly the same - so the two Sepal variables are neither hurting
## nor meaningfully helping the prediction.
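
## If we wanted to push further, the mtry parameter (number of variables
## tried at each split) can be tuned. A sketch with randomForest::tuneRF;
## the stepFactor and improve values below are illustrative, not tuned:
tuned_mtry <- tuneRF(x = IrisTrainSet_numonly, y = IrisTrainSet_labels,
                     ntreeTry = 500, stepFactor = 1.5, improve = 0.01)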


## Fun resources for RF
## https://www.youtube.com/watch?v=dJclNIN-TPo&t=516s

################################################################################
#####           Clustering       ###############################################
################################################################################

################################################################
##   Mclust vs. k-means
##
## mclust performs normal-(Gaussian-)model-based EM clustering,
## whereas k-means is not model-based and does not assume any
## particular distribution. Mclust uses a soft assignment (each point
## gets a probability of belonging to each cluster), whereas k-means
## uses a hard assignment. mclust is a contributed R package for
## model-based clustering, classification, and density estimation
## based on finite normal mixture modelling. It provides functions
## for parameter estimation via the EM algorithm for normal mixture
## models with a variety of covariance structures.
## Details: https://cran.r-project.org/web/packages/mclust/vignettes/mclust.html
##
## Alternatively, k-means starts with the assumption that a given data
## point belongs to exactly one cluster (see the k-means sketch at the
## end of this section).
################################################################
## Our current data:
(head(NewIris))   ## Not normalized and with labels as Species
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 103          7.1         3.0          5.9         2.1  virginica
## 20           5.1         3.8          1.5         0.3     setosa
## 63           6.0         2.2          4.0         1.0 versicolor
## 17           5.4         3.9          1.3         0.4     setosa
## 83           5.8         2.7          3.9         1.2 versicolor
## 53           6.9         3.1          4.9         1.5 versicolor
(head(Norm_Iris))  ## Normalized w/Min-max and no labels
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1    0.7777778  0.41666667   0.83050847  0.83333333
## 2    0.2222222  0.75000000   0.08474576  0.08333333
## 3    0.4722222  0.08333333   0.50847458  0.37500000
## 4    0.3055556  0.79166667   0.05084746  0.12500000
## 5    0.4166667  0.29166667   0.49152542  0.45833333
## 6    0.7222222  0.45833333   0.66101695  0.58333333
## EM clustering --------------------------------------
## Option 1 - I will choose the number of clusters as G = 3
Clust_EM_Iris_norm <- Mclust(Norm_Iris,G=3)
(Clust_EM_Iris_norm)
## 'Mclust' model object:
##  best model: ellipsoidal, varying volume, shape, and orientation (VVV) with 3 components
summary(Clust_EM_Iris_norm)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm 
## ----------------------------------------------------
## 
## Mclust VVV (ellipsoidal, varying volume, shape, and orientation) model with 3 components:
## 
##  log.likelihood   n df      BIC      ICL
##        540.8369 150 44 861.2059 857.9858
## 
## Clustering table:
##  1  2  3 
## 55 50 45
plot(Clust_EM_Iris_norm, what = "classification")

#(attributes(Clust_EM_Iris_norm))
# The table of results below shows that the clustering is very good:
## setosa and virginica are recovered perfectly, and versicolor is
## incorrect by 5.
(table(Iris_Labels, Clust_EM_Iris_norm$classification))
##             
## Iris_Labels   1  2  3
##   setosa      0 50  0
##   versicolor  5  0 45
##   virginica  50  0  0
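## mclust also ships adjustedRandIndex(), a single agreement score
## between the clustering and the true labels (1 = perfect agreement):
(adjustedRandIndex(Iris_Labels, Clust_EM_Iris_norm$classification))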
## Option 2 - do not set G; Mclust then selects the number of
## components itself (via BIC).
Clust_EM_Iris_norm2 <- Mclust(Norm_Iris)
(Clust_EM_Iris_norm2)
## 'Mclust' model object:
##  best model: ellipsoidal, equal shape (VEV) with 2 components
summary(Clust_EM_Iris_norm2)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm 
## ----------------------------------------------------
## 
## Mclust VEV (ellipsoidal, equal shape) model with 2 components:
## 
##  log.likelihood   n df      BIC      ICL
##        502.8617 150 26 875.4468 875.4396
## 
## Clustering table:
##   1   2 
## 100  50
plot(Clust_EM_Iris_norm2, what = "classification")
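
## For contrast with the soft (probabilistic) assignments above, here is
## a minimal hard-assignment k-means sketch on the same normalized data.
## k = 3 matches the three species; nstart = 25 runs several random
## starts and keeps the best one.
Clust_KM_Iris_norm <- kmeans(Norm_Iris, centers = 3, nstart = 25)
(table(Iris_Labels, Clust_KM_Iris_norm$cluster))

## And a quick hierarchical option (hclust on Euclidean distances):
## hc <- hclust(dist(Norm_Iris, method = "euclidean"))
## plot(hc)  ## dendrogram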