 ## R Program on Analysis Using the Leaf Dataset


## Data Initialization

library(caret)
library(caTools)
library(xgboost)
library(MLmetrics)
library(h2o)
library(e1071)
library(ggplot2)
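The listing below uses train_raw and validate without ever loading them; a minimal sketch, assuming the Kaggle leaf-classification CSVs (the file names train.csv and test.csv are assumptions, adjust to your setup):

#Load the data (assumed file names)
train_raw <- read.csv("train.csv", stringsAsFactors = TRUE)   # id, species, then the feature columns
validate  <- read.csv("test.csv")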

#Let's count the number of NA values

sapply(train_raw, function(x) sum(is.na(x)))

## Principal component analysis

data <- train_raw[,-(1:2)]                 # Drop the id and species columns
pca <- prcomp(data)                        # PCA on the features (prcomp decomposes the covariance structure internally)

#Variance Explained
var_exp <- as.data.frame(pca$sdev^2/sum(pca$sdev^2))
var_exp <- cbind(1:ncol(data),var_exp,cumsum(var_exp[,1]))
colnames(var_exp) <- c("Principal_Components","Variance","Cumulative_Variance")
#Plotting the Variance Curves
#Individual Variance
plot(var_exp$Principal_Components,var_exp$Variance,type='b',xlim=c(0,50),pch=16,xlab = "Principal Components",ylab = "Variance",main = 'Principal Components vs Variance')
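The cut-off of 22 components used below is easier to justify from the cumulative curve; a minimal sketch using the Cumulative_Variance column computed above (the 0.95 reference line is an assumed threshold):

#Cumulative variance
plot(var_exp$Principal_Components,var_exp$Cumulative_Variance,type='b',xlim=c(0,50),pch=16,xlab = "Principal Components",ylab = "Cumulative Variance",main = 'Principal Components vs Cumulative Variance')
abline(h = 0.95, lty = 2)    # Assumed 95% variance threshold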

## XGBoost Classifier

pca_fin <- pca$rotation[,1:22]        # Rotation matrix (194x22)

PCA <- function(X) {                # Project N x 194 observations down to N x 22
  # Center with the training means stored by prcomp so projections match the fitted PCA
  scale(as.matrix(X), center = pca$center, scale = FALSE) %*% pca_fin
}
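train_X, train_Y and test_X are used below but never defined in the listing; a plausible reconstruction, assuming train_X holds all training features and test_X the validation features (column 1 of validate assumed to be the id):

#Assumed definitions of the matrices projected below
train_X <- data                      # All training feature columns
train_Y <- train_raw$species         # Class labels (factor)
test_X  <- validate[,-1]             # Validation features with the id column dropped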

train_pca_X <- PCA(train_X)
test_pca_X  <- PCA(test_X)

xgb.grid <- expand.grid(
  nrounds = 100,
  max_depth = c(5,10,15),
  eta = c(0.5,0.2,0.1),
  gamma = c(0,0.5),
  colsample_bytree = 0.75,
  min_child_weight = 5,
  subsample = 0.66
)

xgb.trcontrol <- trainControl(
  method = "cv",
  number = 3,
  verboseIter = TRUE,
  returnData = FALSE,
  returnResamp = "all",
  classProbs = TRUE,
  allowParallel = TRUE
)

system.time(xgb_m2 <- train(x = train_pca_X, y = train_Y,
  verbose = 1,
  trControl = xgb.trcontrol,
  tuneGrid = xgb.grid,
  method = "xgbTree"
))
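error_xgb is compared against the other models at the end but is never computed in the original listing; a sketch of the missing step, evaluated on the training set to mirror the Naive Bayes and deep-learning evaluations below:

#Training-set multiclass log loss for XGBoost
pred_xgb <- predict(xgb_m2, newdata = train_pca_X, type = "prob")
error_xgb <- MultiLogLoss(y_true = train_Y, y_pred = as.matrix(pred_xgb))
error_xgb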

## Naive Bayes
pca_data <- data.frame(species = train_raw$species, train_pca_X)

system.time(NB <- naiveBayes(species ~ ., data = pca_data))

pred <- predict(NB,newdata=pca_data[,-1],type='raw')

error_nb <- MultiLogLoss(y_true = pca_data[,1], y_pred = as.matrix(pred))
error_nb

## H2O

train.id <- train_raw$id
train_raw$id <- NULL
test.id <- validate$id
validate$id <- NULL
validate$species <- NA

#We start a local instance of the H2O platform so we can build the deep-learning network and make predictions.

localH2O <- h2o.init(max_mem_size = "12g")

h2o.train <- as.h2o(train_raw)

h2o.test <-  as.h2o(validate)

set.seed(13579)

## Deep learning with two hidden layers (1024, 512).
system.time(model <- h2o.deeplearning(x = 2:ncol(h2o.train),
  y = 1,
  training_frame = h2o.train,
  activation = "TanhWithDropout",
  input_dropout_ratio = 0,
  hidden_dropout_ratios = c(0.1,0.1),
  balance_classes = FALSE,
  hidden = c(1024,512),
  epochs = 250,
  loss = "CrossEntropy",
  categorical_encoding = "OneHotInternal",
  seed = 13579))   # h2o takes its own seed argument; set.seed() alone does not affect it, and full reproducibility also needs reproducible = TRUE

#Save the model and predict class probabilities on the training frame
save(model, file = "h2omodel.RData")
ytrain <- h2o.predict(model, h2o.train)   # Column 1 is the predicted class; the remaining columns are per-class probabilities

error_dl <- MultiLogLoss(y_true = train_raw[,1], y_pred = as.matrix(ytrain[,-1]))
error_dl

h2o.shutdown(prompt = FALSE)

## Model Comparison

error <- data.frame(Model = c("XGBoost","Naive Bayes","Deep Learning"), Error = c(error_xgb,error_nb,error_dl))

ggplot(error,aes(x=Model,y=Error))+geom_bar(stat='identity')+theme_bw()+
ggtitle('Comparison of Model Log Loss')

## Observations

# From the chart we can clearly see that deep learning gives the lowest error (multiclass log loss). Coupled with the absence of any data preparation and a relatively low training time of roughly 68 seconds, deep learning is the best option for this problem.
# It is also interesting that Naive Bayes, a basic classifier, achieves a lower error than the more sophisticated XGBoost algorithm.