
R Programming Homework Solution on Data Pre-Processing and Exploratory Data Analysis
- 15th Jul, 2022
- 16:12 PM
# Importing the packages required. library(readr) library(dplyr) library(ggplot2) library(gridExtra) library(lattice) library(DataExplorer) library(grDevices) library(factoextra) library(caret) library(rpart) library(rpart.plot) library(randomForest) library(ranger) library(Metrics) library(ROCit) library(kableExtra) library(GoodmanKruskal) # Reading data into the environment df <- read.csv("Coronaryheartriskstudy.csv") head(df) # Creating report to understand the data create_report(df) # descriptive statistics mystats=function(x){ if(class(x)=="numeric"){ Var_Type=class(x) n<-length(x) nmiss<-sum(is.na(x)) mean<-mean(x,na.rm=T) std<-sd(x,na.rm=T) var<-var(x,na.rm=T) min<-min(x,na.rm=T) p1<-quantile(x,0.01,na.rm=T) p5<-quantile(x,0.05,na.rm=T) p10<-quantile(x,0.1,na.rm=T) q1<-quantile(x,0.25,na.rm=T) q2<-quantile(x,0.5,na.rm=T) q3<-quantile(x,0.75,na.rm=T) p90<-quantile(x,0.9,na.rm=T) p95<-quantile(x,0.95,na.rm=T) p99<-quantile(x,0.99,na.rm=T) max<-max(x,na.rm=T) UC1=mean(x,na.rm=T)+3*sd(x,na.rm=T) LC1=mean(x,na.rm=T)-3*sd(x,na.rm=T) UC2=quantile(x,0.99,na.rm=T) LC2=quantile(x,0.01,na.rm=T) iqr=IQR(x,na.rm=T) UC3=q3+1.5*iqr LC3=q1-1.5*iqr ot1<-max>UC1 | min ot2<-max>UC2 | min ot3<-max>UC3 | min return(c(Var_Type=Var_Type, n=n,nmiss=nmiss,mean=mean,std=std,var=var,min=min,p1=p1,p5=p5,p10=p10,q1=q1,q2=q2,q3=q3,p90=p90,p95=p95,p99=p99,max=max,ot_m1=ot1,ot_m2=ot2,ot_m2=ot3)) } else{ Var_Type=class(x) n<-length(x) nmiss<-sum(is.na(x)) fre<-table(x) prop<-prop.table(table(x)) x[is.na(x)]<-x[which.max(prop.table(table(x)))] return(c(Var_Type=Var_Type, n=n,nmiss=nmiss,freq=fre,proportion=prop)) } } #Vector of numaerical variables num_var= sapply(df,is.numeric) Other_var= !sapply(df,is.numeric) #Applying above defined function on numerical variables my_num_data<-t(data.frame(apply(df[num_var], 2, mystats))) my_cat_data<-data.frame(t(apply(df[Other_var], 2, mystats))) View(my_num_data) View(my_cat_data) #Missing Value Treatment df[,num_var] <- apply(data.frame(df[,num_var]), 2, function(x){x <- replace(x, is.na(x), mean(x, na.rm=TRUE))}) df[,Other_var] <- apply(data.frame(df[,Other_var]), 2, function(x){x <- replace(x, is.na(x), which.max(prop.table(table(x))))}) # Outlier capping with p95 and p5. numeric_vars = names(li)[sapply(df, FUN=is.numeric)] outlier_treat <- function(x){ UC1 = quantile(x, p=0.95,na.rm=T) LC1 = quantile(x, p=0.05,na.rm=T) x[x>UC1]=UC1 x[x return(x) } mydata_num_vars = data.frame(apply(df[numeric_vars], 2, FUN=outlier_treat)) # Univariate plots of the dataset all_plots <- lapply(X = 1:ncol(df), function(x) ggplot()+ geom_bar(data = df, aes(x = df[,x]), fill = "skyblue" , alpha = 0.6)+ theme_linedraw()+ xlab(colnames(df)[x]) ) #colnames(dataset) marrangeGrob(grobs=all_plots, nrow=2, ncol=2) # Bivariate plots of the dataset ptlist_bi <- list() for (var in colnames(df) ){ if (class(df[,var]) %in% c("factor","logical") ) { ###print(class(dataset[,var])) ###print("first if") ptlist_bi[[var]] <- ggplot(data = df) + geom_bar( aes_string(x = var, fill ="TenYearCHD" ), position = "fill") + theme_linedraw() + xlab(var) } else if (class(df[,var]) %in% c("numeric","double","integer") ) { ###print(class(dataset[,var])) # #print("second if") ptlist_bi[[var]] <- ggplot(data = df) + geom_boxplot(aes_string(y = var, x ="TenYearCHD" )) + theme_linedraw() + xlab("TenYearCHD") + ylab(var) } } #names(ptlist_bi) marrangeGrob(grobs=ptlist_bi, nrow=2, ncol=2) # have here, being male is directly related to TenYearCHD, thus the variable male seems a relatively good predictor. # Similarly, Age seems a good predictor since the patients with TenYearCHD == TRUE, have higher median of age, with # almost a similar distribution. In contrast, there seems no relation between different categories of the education # and the response variable. The current Smoker variable shows a slight relation with the response variable, as the # current smokers have a slightly higher risk of TenYearCHD. With the same manner we can investigate remaining plots. # Association between qualitative variables class_list <- lapply(X = 1:ncol(df), function(x) class(df[,x])) names(class_list) <- colnames(df) dataset_cat_variables <- subset(x = df, select = names(which(class_list=="logical"| class_list=="factor"))) #colnames(dataset_cat_variables) dataset_cat_variables$education <- NULL GKmatrix_dataset <- GKtauDataframe(dataset_cat_variables) plot(GKmatrix_dataset) # Modelling # Load the library caTools library(caTools) # Randomly split the data into training and testing sets set.seed(1000) # One needs to put between 50% and 80% of data in the training set split = sample.split(df$TenYearCHD, SplitRatio = 0.65) # Split up the data using subset train = subset(df, split == TRUE) test = subset(df, split == FALSE) # Logistic Regression Model framinghamLog = glm(TenYearCHD ~ ., data = train, family = binomial) summary(framinghamLog) # Predictions on the test set predictTest = predict(framinghamLog, type = "response", newdata = test) # Confusion matrix with threshold of 0.5 ConfMat <- table(test$TenYearCHD, predictTest > 0.5) ConfMat # Accuracy (ConfMat[1, 1] + ConfMat[2, 2])/(ConfMat[1, 1] + ConfMat[1, 2] + ConfMat[2, 1] + ConfMat[2, 2]) # Baseline accuracy (1069 + 6)/(1069 + 6 + 187 + 11) library(ROCR) ROCRpred = prediction(predictTest, test$TenYearCHD) as.numeric(performance(ROCRpred, "auc")@y.values)