###############################################
##  Tutorial: TOPICS
##    
##    SVM - support vector machines
##     - for iris
##     - for Kaggle Titanic data
##    
##  Dr. A, Gates, 2018
##  
###############################################
## Fun YouTube Resources:
## 
## https://www.youtube.com/watch?v=ueKqDlMxueE   ## SVM
## https://www.youtube.com/watch?v=pS5gXENd3a4   ## SVM
## 
## 
## I will start the examples using the iris dataset
## because it is easy to see and understand
## It also clusters well and has correct labels
## Remember that "real" data is not as easy to 
## work with because it may not cluster, it may have
## incorrect or odd labels or no labels, etc. 

## Next - below, I will use the Kaggle Titanic Datasets
## These will offer a more realiztic view of these
## methods and will also include the required cleaning
## and prep. 



########################
## libraries
## NOTE: Always install.packages("ThePackName") if needed
## for each library included.
#install.packages("e1071")
library(e1071)  # for machine learning methods
#install.packages("mlr")
library(mlr)
## Loading required package: ParamHelpers
## 
## Attaching package: 'mlr'
## The following object is masked from 'package:e1071':
## 
##     impute
# install.packages("caret")
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'caret'
## The following object is masked from 'package:mlr':
## 
##     train
#install.packages("naivebayes")
##library(naivebayes)
library(datasets)
library(ggplot2)
library(MASS)  


#######################################
##            IRIS                   ##
#######################################

#### Look at the iris data ####
## 
## Here, we do not need to clean or prep the
## data. However, when using real data, you 
## will spend 80% of your time prepping/cleaning

########### View the Data 
plot(iris)

(head(iris))
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
(str(iris))
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## NULL
(summary(iris))
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
(nrow(iris))
## [1] 150
## col is color...
plot(iris$Sepal.Length, iris$Petal.Width, col=iris$Species)

plot(iris$Petal.Length,iris$Petal.Width, col=iris$Species)

## using qplot
qplot(iris$Petal.Length, iris$Petal.Width, data=iris, color=iris$Species)

###### Create a Test and Train set ##############
## Random sample without replacement
## sample(x, size, replace = FALSE, prob = NULL)
## Create a random sample of 30 numbers from 1 - 150
samplerownums<- sample(150,40)
(iris_Testset <- iris[samplerownums,])
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 116          6.4         3.2          5.3         2.3  virginica
## 39           4.4         3.0          1.3         0.2     setosa
## 6            5.4         3.9          1.7         0.4     setosa
## 107          4.9         2.5          4.5         1.7  virginica
## 3            4.7         3.2          1.3         0.2     setosa
## 52           6.4         3.2          4.5         1.5 versicolor
## 137          6.3         3.4          5.6         2.4  virginica
## 124          6.3         2.7          4.9         1.8  virginica
## 101          6.3         3.3          6.0         2.5  virginica
## 108          7.3         2.9          6.3         1.8  virginica
## 133          6.4         2.8          5.6         2.2  virginica
## 22           5.1         3.7          1.5         0.4     setosa
## 123          7.7         2.8          6.7         2.0  virginica
## 117          6.5         3.0          5.5         1.8  virginica
## 53           6.9         3.1          4.9         1.5 versicolor
## 139          6.0         3.0          4.8         1.8  virginica
## 105          6.5         3.0          5.8         2.2  virginica
## 61           5.0         2.0          3.5         1.0 versicolor
## 120          6.0         2.2          5.0         1.5  virginica
## 25           4.8         3.4          1.9         0.2     setosa
## 41           5.0         3.5          1.3         0.3     setosa
## 9            4.4         2.9          1.4         0.2     setosa
## 84           6.0         2.7          5.1         1.6 versicolor
## 48           4.6         3.2          1.4         0.2     setosa
## 68           5.8         2.7          4.1         1.0 versicolor
## 58           4.9         2.4          3.3         1.0 versicolor
## 78           6.7         3.0          5.0         1.7 versicolor
## 67           5.6         3.0          4.5         1.5 versicolor
## 85           5.4         3.0          4.5         1.5 versicolor
## 56           5.7         2.8          4.5         1.3 versicolor
## 127          6.2         2.8          4.8         1.8  virginica
## 104          6.3         2.9          5.6         1.8  virginica
## 134          6.3         2.8          5.1         1.5  virginica
## 95           5.6         2.7          4.2         1.3 versicolor
## 71           5.9         3.2          4.8         1.8 versicolor
## 16           5.7         4.4          1.5         0.4     setosa
## 82           5.5         2.4          3.7         1.0 versicolor
## 10           4.9         3.1          1.5         0.1     setosa
## 59           6.6         2.9          4.6         1.3 versicolor
## 89           5.6         3.0          4.1         1.3 versicolor
## Remove and keep the labels
(irisTestLabels <- iris_Testset[,c(5)])
##  [1] virginica  setosa     setosa     virginica  setosa     versicolor
##  [7] virginica  virginica  virginica  virginica  virginica  setosa    
## [13] virginica  virginica  versicolor virginica  virginica  versicolor
## [19] virginica  setosa     setosa     setosa     versicolor setosa    
## [25] versicolor versicolor versicolor versicolor versicolor versicolor
## [31] virginica  virginica  virginica  versicolor versicolor setosa    
## [37] versicolor setosa     versicolor versicolor
## Levels: setosa versicolor virginica
iris_Testset<-iris_Testset[,-c(5)]
(head(iris_Testset))
##     Sepal.Length Sepal.Width Petal.Length Petal.Width
## 116          6.4         3.2          5.3         2.3
## 39           4.4         3.0          1.3         0.2
## 6            5.4         3.9          1.7         0.4
## 107          4.9         2.5          4.5         1.7
## 3            4.7         3.2          1.3         0.2
## 52           6.4         3.2          4.5         1.5
## For the training data, we want to have/keep the class label
iris_Trainset <- iris[-samplerownums,]
(head(iris_Trainset))
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 7          4.6         3.4          1.4         0.3  setosa
## 8          5.0         3.4          1.5         0.2  setosa
#################  Set up the SVM -----------------
## Soft svm with cost as the penalty for points
## being in the wrong location of the margin
## boundaries
## There are many kernel options...

###################################################
## Polynomial Kernel...
SVM_fit_P <- svm(Species~., data=iris_Trainset, 
               kernel="polynomial", cost=.1, 
               scale=FALSE)
print(SVM_fit_P)
## 
## Call:
## svm(formula = Species ~ ., data = iris_Trainset, kernel = "polynomial", 
##     cost = 0.1, scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  0.1 
##      degree:  3 
##       gamma:  0.25 
##      coef.0:  0 
## 
## Number of Support Vectors:  11
##Prediction --
(pred_P <- predict(SVM_fit_P, iris_Testset, type="class"))
##        116         39          6        107          3         52 
##  virginica     setosa     setosa  virginica     setosa versicolor 
##        137        124        101        108        133         22 
##  virginica  virginica  virginica  virginica  virginica     setosa 
##        123        117         53        139        105         61 
##  virginica  virginica versicolor  virginica  virginica versicolor 
##        120         25         41          9         84         48 
## versicolor     setosa     setosa     setosa  virginica     setosa 
##         68         58         78         67         85         56 
## versicolor versicolor versicolor versicolor versicolor versicolor 
##        127        104        134         95         71         16 
##  virginica  virginica versicolor versicolor  virginica     setosa 
##         82         10         59         89 
## versicolor     setosa versicolor versicolor 
## Levels: setosa versicolor virginica
## COnfusion Matrix
(Ptable <- table(pred_P, irisTestLabels))
##             irisTestLabels
## pred_P       setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         13         2
##   virginica       0          2        13
## We have 4 variables and so need our plot to be more precise
plot(SVM_fit_P, data=iris_Trainset, Petal.Width~Petal.Length, 
     slice=list(Sepal.Width=3, Sepal.Length=4))

## The above places Petal.Width on the x and Petal.Length
## on the y. It also holds Sepal.Width constant at 3 and
## Sepal.Length constant at 4.
## We need to do this because out plot is 2D and so can
## only show 2 dimensions/attributes as variables 

## ------ View/calucalte misclassification
## The Ptable above is the confusion matrix that shows
## what was classified correctly and what was not

## Misclassification Rate for Polynomial
(MR_P <- 1 - sum(diag(Ptable))/sum(Ptable))
## [1] 0.1
###############################
## Linear Kernel...
SVM_fit_L <- svm(Species~., data=iris_Trainset, 
                 kernel="linear", cost=.1, 
                 scale=FALSE)
print(SVM_fit_L)
## 
## Call:
## svm(formula = Species ~ ., data = iris_Trainset, kernel = "linear", 
##     cost = 0.1, scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.1 
##       gamma:  0.25 
## 
## Number of Support Vectors:  46
##Prediction --
(pred_L <- predict(SVM_fit_L, iris_Testset, type="class"))
##        116         39          6        107          3         52 
##  virginica     setosa     setosa versicolor     setosa versicolor 
##        137        124        101        108        133         22 
##  virginica  virginica  virginica  virginica  virginica     setosa 
##        123        117         53        139        105         61 
##  virginica  virginica versicolor versicolor  virginica versicolor 
##        120         25         41          9         84         48 
## versicolor     setosa     setosa     setosa  virginica     setosa 
##         68         58         78         67         85         56 
## versicolor versicolor  virginica versicolor versicolor versicolor 
##        127        104        134         95         71         16 
## versicolor  virginica  virginica versicolor  virginica     setosa 
##         82         10         59         89 
## versicolor     setosa versicolor versicolor 
## Levels: setosa versicolor virginica
(L_table<-table(pred_L, irisTestLabels))
##             irisTestLabels
## pred_L       setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         12         4
##   virginica       0          3        11
plot(SVM_fit_L, data=iris_Trainset, Petal.Width~Petal.Length, 
     slice=list(Sepal.Width=3, Sepal.Length=4))

## Misclassification Rate for Linear
(MR_L <- 1 - sum(diag(L_table))/sum(L_table))
## [1] 0.175
####################################
## Radial Kernel...
SVM_fit_R <- svm(Species~., data=iris_Trainset, 
                 kernel="radial", cost=.1, 
                 scale=FALSE)
print(SVM_fit_R)
## 
## Call:
## svm(formula = Species ~ ., data = iris_Trainset, kernel = "radial", 
##     cost = 0.1, scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  0.1 
##       gamma:  0.25 
## 
## Number of Support Vectors:  83
##Prediction --
(pred_R <- predict(SVM_fit_R, iris_Testset, type="class"))
##        116         39          6        107          3         52 
##  virginica     setosa     setosa versicolor     setosa versicolor 
##        137        124        101        108        133         22 
##  virginica  virginica  virginica  virginica  virginica     setosa 
##        123        117         53        139        105         61 
##  virginica  virginica  virginica versicolor  virginica versicolor 
##        120         25         41          9         84         48 
## versicolor     setosa     setosa     setosa  virginica     setosa 
##         68         58         78         67         85         56 
## versicolor versicolor  virginica versicolor versicolor versicolor 
##        127        104        134         95         71         16 
## versicolor  virginica  virginica versicolor versicolor     setosa 
##         82         10         59         89 
## versicolor     setosa versicolor versicolor 
## Levels: setosa versicolor virginica
(R_table<-table(pred_R, irisTestLabels))
##             irisTestLabels
## pred_R       setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         12         4
##   virginica       0          3        11
plot(SVM_fit_R, data=iris_Trainset, Petal.Width~Petal.Length, 
     slice=list(Sepal.Width=3, Sepal.Length=4))

## Misclassification Rate for Radial
(MR_R <- 1 - sum(diag(R_table))/sum(R_table))
## [1] 0.175
## So, the polynomial seems to do the best job most of the 
## time. Remember that the sample is random.
## We can also update the cost. See below for how to 
## tune the cost....


####################################################
## SVM EXAMPLE 2 ###################################

## We cannot plot the above because the number of 
## attributes and the label together exceed 3
## Let's re-run the SVM with fewer attributes

Columns <- c("Petal.Length", "Petal.Width", "Species")
samplerownums<- sample(150,40)
iris_Testset_petal <- iris[samplerownums,Columns]
## Remove and keep the labels
(irisTestLabels <- iris_Testset_petal[,c(3)])
##  [1] setosa     versicolor virginica  versicolor versicolor setosa    
##  [7] versicolor versicolor virginica  virginica  versicolor setosa    
## [13] virginica  virginica  virginica  virginica  versicolor versicolor
## [19] setosa     setosa     setosa     versicolor virginica  virginica 
## [25] setosa     setosa     virginica  versicolor versicolor virginica 
## [31] virginica  setosa     virginica  setosa     setosa     versicolor
## [37] versicolor versicolor virginica  virginica 
## Levels: setosa versicolor virginica
(iris_Testset_petal<-iris_Testset_petal[,-c(3)])
##     Petal.Length Petal.Width
## 26           1.6         0.2
## 64           4.7         1.4
## 102          5.1         1.9
## 72           4.0         1.3
## 71           4.8         1.8
## 41           1.3         0.3
## 95           4.2         1.3
## 82           3.7         1.0
## 139          4.8         1.8
## 108          6.3         1.8
## 90           4.0         1.3
## 11           1.5         0.2
## 128          4.9         1.8
## 115          5.1         2.4
## 133          5.6         2.2
## 138          5.5         1.8
## 52           4.5         1.5
## 78           5.0         1.7
## 42           1.3         0.3
## 10           1.5         0.1
## 8            1.5         0.2
## 85           4.5         1.5
## 150          5.1         1.8
## 114          5.0         2.0
## 25           1.9         0.2
## 43           1.3         0.2
## 136          6.1         2.3
## 97           4.2         1.3
## 75           4.3         1.3
## 149          5.4         2.3
## 140          5.4         2.1
## 50           1.4         0.2
## 118          6.7         2.2
## 6            1.7         0.4
## 40           1.5         0.2
## 62           4.2         1.5
## 70           3.9         1.1
## 65           3.6         1.3
## 104          5.6         1.8
## 123          6.7         2.0
(head(iris_Testset_petal))
##     Petal.Length Petal.Width
## 26           1.6         0.2
## 64           4.7         1.4
## 102          5.1         1.9
## 72           4.0         1.3
## 71           4.8         1.8
## 41           1.3         0.3
## For the training data, we want to have/keep the class label
iris_Trainset_petal <- iris[-samplerownums, Columns]
(head(iris_Trainset_petal))
##   Petal.Length Petal.Width Species
## 1          1.4         0.2  setosa
## 2          1.4         0.2  setosa
## 3          1.3         0.2  setosa
## 4          1.5         0.2  setosa
## 5          1.4         0.2  setosa
## 7          1.4         0.3  setosa
## Set up the SVM again
SVM_fit2 <- svm(Species~., data=iris_Trainset_petal, kernel="linear",
                cost=.1)
print(SVM_fit2)
## 
## Call:
## svm(formula = Species ~ ., data = iris_Trainset_petal, kernel = "linear", 
##     cost = 0.1)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.1 
##       gamma:  0.5 
## 
## Number of Support Vectors:  60
plot(SVM_fit2, iris_Trainset_petal)