## RStudio Solution for Data Engineering and Mining Task

# • 17th Nov, 2022
# • 17:26

#  DIGIT RECOGNIZER  #

# Session setup.
# The caller's workspace is deliberately left alone (no rm(list = ls())):
# wiping the global environment destroys unrelated objects and still does not
# detach packages or reset options, so it gives no real "clean start".

# Fix the RNG seed so the random train/test split and the cross-validation
# folds drawn further down are reproducible between runs.
set.seed(1001)

# NOTE(review): hard-coded, machine-specific working directory - presumably
# where the input files are expected. Prefer launching the script from the
# project directory over changing directories inside it.
setwd("~/Assignments/Job 1")
library(data.table)
library(FNN)
library(caret)

# Load the training table before it is sub-sampled; without this step `data`
# would still refer to the built-in utils::data function and nrow(data) fails.
# NOTE(review): the file name is an assumption (a CSV with a `label` column
# plus the pixel columns) - confirm it against the assignment's data set.
data_file <- "train.csv"
# data.table = FALSE returns a plain data.frame, which the base-style
# `data[, cols]` indexing used by the rest of the script relies on.
data <- fread(data_file, data.table = FALSE)

# Keep every 10th row (rows 10, 20, 30, ...), presumably to keep the
# cross-validated model fits fast. seq() needs at least 10 rows to start from.
stopifnot(nrow(data) >= 10)
data <- data[seq(10, nrow(data), by = 10), , drop = FALSE]

# Name of the response (digit class) column.
y_label <- "label"

## DATA PRE-PROCESSING:

# 01. Removing columns with zero-variance:
# Per-column standard deviation. vapply() walks the data.frame column by
# column instead of coercing the whole table to a matrix first.
col_sd <- vapply(data, sd, numeric(1))
# An undefined sd (NA) is treated as "not constant" so it cannot poison the
# column index; the response column is never dropped because later steps
# read it by name.
idx <- !is.na(col_sd) & col_sd == 0 # columns having zero-variance
idx[names(data) == y_label] <- FALSE
# drop = FALSE keeps a data.frame even when a single column survives.
data <- data[, !idx, drop = FALSE] # removing columns having zero-variance

# 02. Use PCA to reduce dimension:
# Predictors are every column except the response, selected by name so the
# result does not depend on the label being the first column.
x <- as.matrix(data[, names(data) != y_label, drop = FALSE])
y <- as.factor(data[, y_label])

# Centre and scale every predictor, then fit the PCA on the scaled matrix.
# NOTE(review): the PCA is fitted on all rows before the train/test split
# below, so the held-out rows influence the components - confirm this is
# acceptable for the reported test accuracy.
pca_data <- scale(x)
pca <- prcomp(pca_data)

# Cumulative share of the total variance explained by the leading components.
cumpro <- cumsum(pca$sdev^2 / sum(pca$sdev^2))
n_plot <- min(200, length(cumpro)) # never index past the available components
plot(cumpro[seq_len(n_plot)],
  xlab = "Principal components",
  ylab = "Amount of explained variance",
  main = "Cumulative variance plot"
)

# taking top 100 PCA axis (or fewer when the data has fewer components).
# pca$x holds the scaled data projected onto the rotation, i.e. scores in the
# same space the components were fitted in; multiplying the unscaled `x` by
# the rotation would mix two different coordinate systems.
n_comp <- min(100, ncol(pca$x))
pca_x <- pca$x[, seq_len(n_comp), drop = FALSE]

# 03. Random 70:30 partition of the PCA features and their labels.
# `prop` is the training share; `idx` holds the row numbers drawn (without
# replacement) for training, and every remaining row goes to the test set.
prop <- 0.7
idx <- sample.int(
  n = nrow(pca_x),
  size = ceiling(nrow(pca_x) * prop),
  replace = FALSE
)

# Training partition: the sampled rows.
train.x <- pca_x[idx, ]
train.y <- y[idx]

# Test partition: everything that was not sampled.
test.x <- pca_x[-idx, ]
test.y <- y[-idx]

###########################
## NAIVE BAYES ESTIMATOR ##
###########################
# Time the complete fit + predict cycle of the Naive Bayes model.
start <- proc.time()
# Fit caret's "nb" method, tuned with 10-fold cross-validation.
model_nb <- train(
  x = train.x, y = train.y,
  method = "nb",
  trControl = trainControl(method = "cv", number = 10)
)

pred_test <- predict(model_nb, test.x)
pred_train <- predict(model_nb, train.x)
end <- proc.time()
time_nb <- end - start

# confusionMatrix() expects the predictions as `data` and the true classes as
# `reference`. Accuracy is the same either way, but the table orientation and
# the per-class statistics are only right with this order.
train_acc_nb <- confusionMatrix(data = pred_train, reference = train.y)
test_acc_nb <- confusionMatrix(data = pred_test, reference = test.y)

###########################
## K-NEAREST NEIGHBOUR ####
###########################
# Time the complete fit + predict cycle of the k-nearest-neighbour model.
start <- proc.time()

# Fit caret's "knn" method with 10-fold cross-validation; tuneLength = 20 asks
# caret to evaluate 20 candidate values of the tuning parameter.
model_knn <- train(
  x = train.x, y = train.y,
  method = "knn",
  trControl = trainControl(method = "cv", number = 10),
  tuneLength = 20
)

pred_train <- predict(model_knn, train.x)
pred_test <- predict(model_knn, test.x)
end <- proc.time()
time_knn <- end - start

# confusionMatrix() expects the predictions as `data` and the true classes as
# `reference`. Accuracy is the same either way, but the table orientation and
# the per-class statistics are only right with this order.
train_acc_knn <- confusionMatrix(data = pred_train, reference = train.y)
test_acc_knn <- confusionMatrix(data = pred_test, reference = test.y)

###########################
## SUPPORT VECTOR MACHINE #
###########################
library(e1071)

# Time the complete fit + predict cycle of the linear support vector machine.
start <- proc.time()
# Fit caret's "svmLinear" method, evaluated with 10-fold cross-validation.
model_svm <- train(
  x = train.x, y = train.y,
  method = "svmLinear",
  trControl = trainControl(method = "cv", number = 10)
)

pred_test <- predict(model_svm, test.x)
pred_train <- predict(model_svm, train.x)

end <- proc.time()
time_svm <- end - start

# confusionMatrix() expects the predictions as `data` and the true classes as
# `reference`. Accuracy is the same either way, but the table orientation and
# the per-class statistics are only right with this order.
train_acc_svm <- confusionMatrix(data = pred_train, reference = train.y)
test_acc_svm <- confusionMatrix(data = pred_test, reference = test.y)

##########################
##      COMPARISON      ##
##########################
# Results are printed explicitly so they also appear when the script is run
# through source(), where top-level values are not auto-printed.

# Accuracy on the training partition (Naive Bayes, k-NN, linear SVM).
print(train_acc_nb$overall["Accuracy"])
print(train_acc_knn$overall["Accuracy"])
print(train_acc_svm$overall["Accuracy"])

# Accuracy on the held-out test partition, in the same model order.
print(test_acc_nb$overall["Accuracy"])
print(test_acc_knn$overall["Accuracy"])
print(test_acc_svm$overall["Accuracy"])

# Time spent fitting and predicting with each model (proc.time() differences).
print(time_nb)
print(time_knn)
print(time_svm)