
R Programming Homework Solution on Methods of Data Analysis
- 8th Jul, 2022
- 15:58
df <- read.csv('FP_dataset.csv')
dim(df)   # 1508 rows, 31 columns
str(df)

## Checking for NAs
na_count <- sapply(df, function(y) sum(is.na(y)))
na_count <- data.frame(na_count)
# No NAs present in the dataset

###### EDA

# Target column analysis
library('ggplot2')
library('dplyr')
library('grid')
library('gridExtra')
library('RColorBrewer')
library('corrplot')

boxplot(df$ADM_RATE)
summary(df$ADM_RATE)
p1 <- df %>%
  ggplot(aes(ADM_RATE)) +
  geom_histogram(bins = 100, fill = "red") +
  labs(x = "ADM_RATE") +
  ggtitle("ADM_RATE feature distribution")
grid.arrange(p1, layout_matrix = rbind(c(1)))

### Predictor variables
# There are 31 columns, of which 1 is the response, leaving 30 predictor variables.
# Two of these are categorical (institution name INSTNM and state postal code STABBR);
# the remaining 28 are numerical. Of those, two are ID columns (X and UNITID) that we
# will not analyse, so we are left with 26 numerical variables.

# Categorical
# INSTNM
length(unique(df$INSTNM))   # count of unique institutions: 1497
df %>%
  mutate(tar = as.character(INSTNM)) %>%
  group_by(tar) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(10) %>%
  ggplot(aes(reorder(tar, n, FUN = min), n)) +
  geom_col(fill = "blue") +
  coord_flip() +
  labs(x = "INSTNM values", y = "frequency") +
  ggtitle("Top 10 most frequent INSTNM values")

# STABBR (state postal code)
length(unique(df$STABBR))   # count of unique state postal codes: 54
df %>%
  mutate(tar = as.character(STABBR)) %>%
  group_by(tar) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(10) %>%
  ggplot(aes(reorder(tar, n, FUN = min), n)) +
  geom_col(fill = "blue") +
  coord_flip() +
  labs(x = "STABBR values", y = "frequency") +
  ggtitle("Top 10 most frequent STABBR values")

# Numerical
library('data.table')
library('tibble')
library('tidyr')
library('stringr')
library('forcats')

# Per-feature summary statistics (funs() is deprecated, so plain function
# names / lambda notation are used instead)
var_means <- df %>%
  select(-X, -UNITID, -INSTNM, -STABBR, -ADM_RATE) %>%
  summarise_all(mean) %>%
  gather(everything(), key = "feature", value = "mean")
var_medians <- df %>%
  select(-X, -UNITID, -INSTNM, -STABBR, -ADM_RATE) %>%
  summarise_all(median) %>%
  gather(everything(), key = "feature", value = "median")
var_sd <- df %>%
  select(-X, -UNITID, -INSTNM, -STABBR, -ADM_RATE) %>%
  summarise_all(sd) %>%
  gather(everything(), key = "feature", value = "sd")
stat <- df %>%
  select(-X, -UNITID, -INSTNM, -STABBR, -ADM_RATE) %>%
  summarise_all(~ sum(. == 0)) %>%   # count of zeros per feature
  gather(everything(), key = "feature", value = "zeros") %>%
  left_join(var_means, by = "feature") %>%
  left_join(var_medians, by = "feature") %>%
  left_join(var_sd, by = "feature")
summary(stat)

# Visualising the above
p1 <- stat %>% ggplot(aes(mean))   + geom_histogram() + labs(x = "Feature means")   + ggtitle("Feature means")
p2 <- stat %>% ggplot(aes(median)) + geom_histogram() + labs(x = "Feature medians") + ggtitle("Feature medians")
p3 <- stat %>% ggplot(aes(sd))     + geom_histogram() + labs(x = "Feature std dev") + ggtitle("Feature std dev")
grid.arrange(p1, p2, p3, layout_matrix = rbind(c(1), c(2), c(3)))

# Correlation with the response variable
# Spearman
spearman_correlations <- df %>%
  select(-X, -UNITID, -INSTNM, -STABBR, -ADM_RATE) %>%
  cor(df$ADM_RATE, method = "spearman") %>%
  as.data.frame() %>%          # the single unnamed column becomes V1
  rename(cor = V1)
ggplot(spearman_correlations, aes(x = cor)) +
  geom_histogram() +
  labs(x = "Spearman correlation coefficient") +
  ggtitle("Spearman correlation of numeric predictors vs response variable")

# Pearson
Pearson_correlations <- df %>%
  select(-X, -UNITID, -INSTNM, -STABBR, -ADM_RATE) %>%
  cor(df$ADM_RATE, method = "pearson") %>%
  as.data.frame() %>%
  rename(cor_p = V1)
ggplot(Pearson_correlations, aes(x = cor_p)) +
  geom_histogram() +
  labs(x = "Pearson correlation coefficient") +
  ggtitle("Pearson correlation of numeric predictors vs response variable")
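The correlation histograms above only show the spread of the coefficients, not which predictors drive them. A minimal sketch to attach feature names to the Spearman values and rank them (uses only the objects built above; `num_features` and `spearman_named` are new helper names):

# A minimal sketch: attach feature names to the Spearman coefficients and
# list the predictors most strongly associated with ADM_RATE. cor() keeps
# the column order of the selected data, so setdiff() reproduces it.
num_features <- setdiff(names(df), c("X", "UNITID", "INSTNM", "STABBR", "ADM_RATE"))
spearman_named <- data.frame(feature = num_features,
                             cor = spearman_correlations$cor)
head(spearman_named[order(-abs(spearman_named$cor)), ], 10)   # top 10 by |cor|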
# Correlation plot (excluding the character / ID columns)
drop2 <- c('X', 'UNITID', 'INSTNM', 'STABBR')
df_ex_char <- df[, !(names(df) %in% drop2)]
# install.packages('corrplot')   # run once if not already installed
library(corrplot)
corrplot(cor(df_ex_char), order = "hclust")

# Important features using xgboost
# install.packages('xgboost')
library(xgboost)
# install.packages('Matrix')
library(Matrix)
sparse_matrix <- sparse.model.matrix(ADM_RATE ~ . - 1, data = df_ex_char)
# Note: "reg:linear" is deprecated in favour of "reg:squarederror", and
# "merror" is a classification metric; RMSE is the appropriate one here.
model <- xgboost(data = sparse_matrix, label = df_ex_char$ADM_RATE,
                 max.depth = 6, eta = 0.3, nthread = 4, nrounds = 16,
                 verbose = 2, eval_metric = "rmse",
                 objective = "reg:squarederror")
importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = model)
print(xgb.plot.importance(importance_matrix = importance, top_n = 10))

## Regression modelling
lmADM_RATE <- lm(ADM_RATE ~ ., data = df_ex_char)   # fit the linear regression
summary(lmADM_RATE)
plot(lmADM_RATE$residuals, pch = 16, col = "red")

## Influential points
plot(cooks.distance(lmADM_RATE), pch = 16, col = "blue")

# Deleting influential points and high-residual points, then re-modelling
HighLeverage <- cooks.distance(lmADM_RATE) > (2 / nrow(df_ex_char))
LargeResiduals <- rstudent(lmADM_RATE) > 0.01   # note: a very aggressive cutoff; |rstudent| > 2 is more conventional
df_ex_char_influ_removed <- df_ex_char[!HighLeverage & !LargeResiduals, ]
lmADM_RATE_influ_removed <- lm(ADM_RATE ~ ., data = df_ex_char_influ_removed)
summary(lmADM_RATE_influ_removed)
plot(lmADM_RATE_influ_removed$residuals, pch = 16, col = "red")
# After deleting the influential points and the points with high residual values,
# the R^2 value increases from 0.24 to 0.76.

## Performing xgboost on the data with influential and high-residual points removed
sparse_matrix <- sparse.model.matrix(ADM_RATE ~ . - 1, data = df_ex_char_influ_removed)
model <- xgboost(data = sparse_matrix, label = df_ex_char_influ_removed$ADM_RATE,
                 max.depth = 6, eta = 0.3, nthread = 4, nrounds = 16,
                 verbose = 2, objective = "reg:squarederror")
importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = model)
print(xgb.plot.importance(importance_matrix = importance, top_n = 10))

## Stepwise regression: variable selection using AIC
df <- read.csv('FP_dataset.csv')
df_ex_char <- df
df_ex_char$X <- NULL
df_ex_char$UNITID <- NULL
df_ex_char$INSTNM <- NULL
df_ex_char$STABBR <- NULL
lm1 <- lm(ADM_RATE ~ ., data = df_ex_char)
selectedMod1 <- step(lm1)   # step() iterates over candidate models and keeps the one with the lowest AIC
summary(selectedMod1)
all_vifs <- car::vif(selectedMod1)
print(all_vifs)
# There are variables with VIF above 4, indicating multicollinearity.
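For reference, a predictor's VIF is 1/(1 - R^2) from regressing that predictor on the other predictors, so VIF > 4 means the others explain more than 75% of its variance. A minimal sketch computing one VIF by hand; the column name SAT_AVG is hypothetical, so substitute any predictor that appears in all_vifs:

# Hand-computed VIF for one predictor, to show what car::vif() reports.
# 'SAT_AVG' is a hypothetical column name. Values match car::vif() exactly
# only when the same predictor set as the fitted model is used.
r2_j <- summary(lm(SAT_AVG ~ . - ADM_RATE, data = df_ex_char))$r.squared
1 / (1 - r2_j)   # VIF of SAT_AVG among the full predictor set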
## Recursively removing variables with VIF > 4
signif_all <- names(all_vifs)
# Remove the variable with the highest VIF and re-build the model until no VIF exceeds 4
while (any(all_vifs > 4)) {
  var_with_max_vif <- names(which(all_vifs == max(all_vifs)))     # variable with the max VIF
  signif_all <- signif_all[!(signif_all) %in% var_with_max_vif]   # remove it
  myForm <- as.formula(paste("ADM_RATE ~ ", paste(signif_all, collapse = " + "), sep = ""))
  selectedMod2 <- lm(myForm, data = df_ex_char)                   # re-build the model
  all_vifs <- car::vif(selectedMod2)
}
summary(selectedMod2)

# Recursively remove non-significant variables (those with p-value > 0.1)
all_vars <- names(selectedMod2[[1]])[-1]   # names of all X variables
summ <- summary(selectedMod2)              # model summary
pvals <- summ[[4]][, 4]                    # all p-values
not_significant <- names(which(pvals > 0.1))
not_significant <- not_significant[!not_significant %in% "(Intercept)"]
# While there are non-significant variables, drop one and re-fit
while (length(not_significant) > 0) {
  all_vars <- all_vars[!all_vars %in% not_significant[1]]
  myForm <- as.formula(paste("ADM_RATE ~ ", paste(all_vars, collapse = " + "), sep = ""))
  selectedMod3 <- lm(myForm, data = df_ex_char)   # re-build the model
  summ <- summary(selectedMod3)
  pvals <- summ[[4]][, 4]
  not_significant <- names(which(pvals > 0.1))
  not_significant <- not_significant[!not_significant %in% "(Intercept)"]
}
summary(selectedMod3)
car::vif(selectedMod3)

### Stepwise regression on the data with influential and high-residual points removed
lm1 <- lm(ADM_RATE ~ ., data = df_ex_char_influ_removed)
selectedMod1 <- step(lm1)
summary(selectedMod1)
all_vifs <- car::vif(selectedMod1)
print(all_vifs)
# There are variables with VIF above 4, indicating multicollinearity.

## Recursively removing variables with VIF > 4
signif_all <- names(all_vifs)
while (any(all_vifs > 4)) {
  var_with_max_vif <- names(which(all_vifs == max(all_vifs)))
  signif_all <- signif_all[!(signif_all) %in% var_with_max_vif]
  myForm <- as.formula(paste("ADM_RATE ~ ", paste(signif_all, collapse = " + "), sep = ""))
  selectedMod2 <- lm(myForm, data = df_ex_char_influ_removed)
  all_vifs <- car::vif(selectedMod2)
}
summary(selectedMod2)
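Before pruning further by p-value, it is worth checking what the VIF step cost in fit. A quick comparison of the AIC-selected and VIF-pruned models on the cleaned data, using only objects created above:

# Compare the AIC-selected model with the VIF-pruned one: AIC and adjusted R^2
AIC(selectedMod1, selectedMod2)
c(aic_selected = summary(selectedMod1)$adj.r.squared,
  vif_pruned   = summary(selectedMod2)$adj.r.squared)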
# Recursively remove non-significant variables (those with p-value > 0.1)
all_vars <- names(selectedMod2[[1]])[-1]
summ <- summary(selectedMod2)
pvals <- summ[[4]][, 4]
not_significant <- names(which(pvals > 0.1))
not_significant <- not_significant[!not_significant %in% "(Intercept)"]
while (length(not_significant) > 0) {
  all_vars <- all_vars[!all_vars %in% not_significant[1]]
  myForm <- as.formula(paste("ADM_RATE ~ ", paste(all_vars, collapse = " + "), sep = ""))
  selectedMod3 <- lm(myForm, data = df_ex_char_influ_removed)
  summ <- summary(selectedMod3)
  pvals <- summ[[4]][, 4]
  not_significant <- names(which(pvals > 0.1))
  not_significant <- not_significant[!not_significant %in% "(Intercept)"]
}
summary(selectedMod3)
car::vif(selectedMod3)

## Box-Cox transformation of the data
# (The schoolmath dependency is unnecessary: all(x > 0) checks positivity directly.
# Note that powerTransform() only estimates lambda; the transformation itself is
# (x^lambda - 1) / lambda, applied here via car::bcPower() rather than by
# multiplying the column by lambda.)
df_ex_char_copy <- data.frame(df_ex_char)
for (i in 1:ncol(df_ex_char_copy)) {
  if (all(df_ex_char_copy[, i] > 0)) {   # Box-Cox requires strictly positive values
    bc <- car::powerTransform(df_ex_char_copy[, i], family = "bcPower")
    lambda <- as.numeric(bc$lambda)
    df_ex_char_copy[, i] <- car::bcPower(df_ex_char_copy[, i], lambda)
  }
}

# Linear regression after the Box-Cox transformation of the data
lmADM_RATE <- lm(ADM_RATE ~ ., data = df_ex_char_copy)
summary(lmADM_RATE)
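Whether the transformation actually helped can be judged from the residuals. A short diagnostic sketch in base graphics, reusing the fit above:

# Residual diagnostics for the post-transformation fit: the studentised
# residuals should track the reference line if normality has improved.
qqnorm(rstudent(lmADM_RATE))
qqline(rstudent(lmADM_RATE), col = "red")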