
# Q2R Script Assignment Solution
# - 23rd Nov, 2021
# - 23:43
# Exploratory analysis of HealthExpend.csv followed by an XGBoost-based
# feature-importance ranking for the dependent variable EXPENDOP.

# Setup ----

# Install only the packages that are missing instead of re-installing every
# dependency on each run of the script.
required_packages <- c(
  "purrr", "dplyr", "corrplot", "ggplot2", "xgboost", "Matrix"
)
is_installed <- vapply(
  required_packages, requireNamespace, logical(1), quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}

library(purrr)
library(dplyr)
library(corrplot)
library(ggplot2)
library(xgboost)
library(Matrix)

# Load data ----

# "UTF-8-BOM" strips the byte-order mark at the start of the file; without it
# the first column name is read with garbage characters in front of "AGE".
df <- read.csv("HealthExpend.csv", fileEncoding = "UTF-8-BOM")
head(df)
summary(df)

# Dependent variables ----

## outliers in dependent variables
boxplot(df$EXPENDOP)
boxplot(df$EXPENDIP)

## line plot
plot(df$EXPENDOP, type = "o")

## dropping EXPENDIP
drop <- c("EXPENDIP")
df <- df[, !(names(df) %in% drop), drop = FALSE]
names(df)
dim(df)

# Column overview ----

## data types of the columns
map(df, class)

## number of distinct levels of each variable (character & integer)
drop1 <- c("EXPENDOP")
df_char_int <- df[, !(names(df) %in% drop1), drop = FALSE]
df_char_int %>%
  summarise(across(everything(), n_distinct))

## checking for NA: one count per column, stored with the column names as
## row names
na_count <- vapply(df, function(column) sum(is.na(column)), integer(1))
na_count <- data.frame(na_count)

## unique values of INDUSCLASS
unique(df$INDUSCLASS)
# 888 observations have a blank category (''), which is around 44.4 %
sum(df$INDUSCLASS == "", na.rm = TRUE)

## correlation plot (excluding character data type columns)
drop2 <- c(
  "INDUSCLASS", "PHSTAT", "INCOME", "MARISTAT", "EDUC", "REGION", "RACE"
)
df_ex_char <- df[, !(names(df) %in% drop2), drop = FALSE]
corrplot(cor(df_ex_char), order = "hclust")

# Analysis on independent variables ----

boxplot(df$AGE)
summary(df$AGE)

# count of each level of each column
# NOTE(review): EDUC is listed among the character columns in drop2, and a
# histogram needs a continuous variable -- confirm this is the intended column.
qplot(df$EDUC, geom = "histogram") # to plot for integer variables
barplot(prop.table(table(df$INDUSCLASS))) # to plot for categorical variables

# Feature importance ----

# Since the INDUSCLASS column has 44.4 % of its values blank, we drop it.
# All the categorical columns (RACE, INCOME, etc.) are dropped as well, since
# label-encoded versions of them (RACE1, INCOME1, etc.) are already present.
# drop2 already contains INDUSCLASS, so the union selects the same columns as
# drop2 alone; it only makes the intent explicit.
drop3 <- c("INDUSCLASS")
df_final <- df[, !(names(df) %in% union(drop2, drop3)), drop = FALSE]
names(df_final)
dim(df_final) # 20 columns and 2000 rows

# Fit the boosted regression model used for both importance rankings.
# `features` is the sparse design matrix, `target` the EXPENDOP values.
# "reg:squarederror" is the current name of the deprecated "reg:linear"
# objective.
fit_expendop_model <- function(features, target) {
  xgboost(
    data = features,
    label = target,
    max.depth = 6,
    eta = 0.3,
    nthread = 4,
    nrounds = 16,
    verbose = 2,
    objective = "reg:squarederror"
  )
}

# Feature importance using xgboost with outliers present in the dependent
# variable
sparse_matrix <- sparse.model.matrix(EXPENDOP ~ . - 1, data = df_final)
model <- fit_expendop_model(sparse_matrix, df_final$EXPENDOP)
importance <- xgb.importance(
  feature_names = colnames(sparse_matrix),
  model = model
)
print(xgb.plot.importance(importance_matrix = importance, top_n = 10))

# Feature importance using xgboost without outliers in the dependent variable
expendop_outlier_cutoff <- 1155
# removing outliers, 499 rows removed
df_finall <- df_final[df_final$EXPENDOP <= expendop_outlier_cutoff, ]
sparse_matrix <- sparse.model.matrix(EXPENDOP ~ . - 1, data = df_finall)
model <- fit_expendop_model(sparse_matrix, df_finall$EXPENDOP)
importance <- xgb.importance(
  feature_names = colnames(sparse_matrix),
  model = model
)
print(xgb.plot.importance(importance_matrix = importance))