Login
Order Now
Support
R Programming Assignment Solution on Recommender

R Programming Assignment Solution on Recommender

  • 18th Aug, 2022
  • 17:14 PM

---
title: "Recommender"
output: html_document
date: '2022-06-16'
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## R Markdown


```{r message=FALSE, warning=FALSE}
library(tidyverse)
library(caret)
library(stats)
library(mltools)
library(Metrics)
library(data.table)
library(tools)
library(stringi)
library(textclean)
library(fastDummies)
library(shiny)
library(cluster)
library(bslib)
library(knitr)
```


```{r  results='asis'}

# Load the Netflix catalogue and render its first rows (columns 1 to 6 only)
# as a table.
netflix <- read.csv("netflix_titles.csv")

knitr::kable(head(netflix[, 1:6]))

```
```{r}
# Histogram of release years; the y-axis is limited to the range 0-200.
release_year <- netflix$release_year

hist(
  release_year,
  col = "lightblue",
  ylim = c(0, 200)
)
```
```{r}
# Bar chart of the share (in per cent) of each `type` value within a random
# sample of 100 titles. No seed is set, so the bars differ between runs.
x <- factor(sample(netflix$type, 100))
t <- table(x)
barplot(t / sum(t) * 100, ylab = "Per cent")
```

```{r}
# Lead director: the first entry of the comma-separated `director` field.
# An empty field is labelled "No Director".
netflix_director <- data.frame(
  lead_director = str_split(netflix$director, ",", simplify = TRUE)[, 1]
) %>%
  mutate(
    lead_director = ifelse(lead_director == "", "No Director", lead_director)
  )


# Lead cast member: the first entry of the comma-separated `cast` field.
# An empty field is labelled "Unknown Cast".
netflix_lead <- data.frame(
  cast_lead = str_split(netflix$cast, ",", simplify = TRUE)[, 1]
) %>%
  mutate(
    cast_lead = ifelse(cast_lead == "", "Unknown Cast", cast_lead)
  )


# Inventory of the genre labels used in `listed_in` (exploratory: it shows
# which labels the genre flags further down have to cover).
#
# Every comma-separated entry is considered, not just the first three, and
# each label is trimmed before de-duplication so that "Dramas" and " Dramas"
# are reported once. Empty and missing entries are dropped.
unique_genres <- unique(trimws(unlist(str_split(netflix$listed_in, ","))))
unique_genres <- unique_genres[!is.na(unique_genres) & unique_genres != ""]
# Flag titles whose genre labels mark them as non-English content.
# A title is flagged when `listed_in` contains one of the international /
# anime / Korean / Spanish-language labels, unless it also contains
# "British TV Shows", which always counts as English content.
# The labels are matched as plain substrings of `listed_in`.
netflix_english_content <- netflix %>%
  select(title, listed_in) %>%
  mutate(
    non_english_content =
      !str_detect(listed_in, "British TV Shows") &
      str_detect(
        listed_in,
        paste(
          c(
            "International TV Shows",
            "International Movies",
            "Anime Series",
            "Anime Features",
            "Korean TV Shows",
            "Spanish-Language TV Shows"
          ),
          collapse = "|"
        )
      )
  )


# Flag titles whose `country` field mentions at least one of the listed
# English-speaking countries. Names are matched as plain substrings, so a
# co-production that includes any of them is flagged TRUE.
netflix_english_nation <- netflix %>%
  select(title, country) %>%
  mutate(
    in_english_nation = str_detect(
      country,
      paste(
        c(
          "United States",
          "United Kingdom",
          "Canada",
          "Australia",
          "New Zealand",
          "Ireland",
          "Jamaica",
          "Barbados"
        ),
        collapse = "|"
      )
    )
  )

# Figure out which content is English or Not
# Will be based on Title, Genre, Country 
# I need to make a function to be able to determine if a character is ASCII or not .... use for Title 

#' Guess from its characters whether a title is likely not English
#'
#' Splits `string` into characters and counts those that match none of the
#' allowed symbols: ASCII letters, digits, the space and the punctuation
#' listed in the pattern below. Two or more such characters mark the title as
#' "Likely Not English"; a single one is tolerated.
#'
#' @param string A character string. Only the first element is inspected.
#' @return "Likely Not English" or "Likely English", or `NA_character_` when
#'   `string` is empty or `NA` (the previous version raised an error on `NA`).
is_not_english <- function(string) {
  if (length(string) == 0L || is.na(string[[1]])) {
    return(NA_character_)
  }

  allowed <- "[A-Za-z0-9 \\*\\!\\(\\):,&\\@\\'\\%\\.\\?\\%\\-]"
  chars <- str_split(string, boundary("character"))[[1]]

  # Running count of characters outside the allowed set
  some_count <- sum(!str_detect(chars, allowed))

  if (some_count >= 2) "Likely Not English" else "Likely English"
}


# Apply is_not_english() to every title and put the verdict in front of the
# original columns as `is_english`.
# The column is named directly rather than by renaming the auto-generated
# name of an unnamed expression, which breaks as soon as that expression is
# edited.
netflix_cleaning <- cbind(
  data.frame(
    is_english = vapply(
      netflix$title, is_not_english, character(1), USE.NAMES = FALSE
    )
  ),
  netflix
)


# Decide whether each title is assumed to be English.
# A title is assumed English ("yes") only when all three signals agree:
#   * its genres do not mark it as non-English (non_english_content is FALSE),
#   * it was produced in an English-speaking country (in_english_nation is TRUE),
#   * its title looks English (is_english is "Likely English").
# Every other combination is "no".


english_check <- cbind(
  netflix_english_nation %>% select(in_english_nation),
  netflix_english_content %>% select(non_english_content),
  netflix_cleaning
)

final_english_check <- english_check %>%
  select(
    title, non_english_content, is_english, in_english_nation,
    cast, director, description
  ) %>%
  mutate(
    # Single condition replacing the former seven-level nested ifelse();
    # the result is the same whenever none of the three signals is missing.
    assume_english = ifelse(
      !non_english_content & in_english_nation & is_english == "Likely English",
      "yes",
      "no"
    )
  )


# Keep the verdict as a one-column data frame (unchanged from before).
english_check <- as.data.frame(final_english_check$assume_english)

# Append the verdict to the raw data as `is_english`. The column is named
# directly instead of renaming the auto-generated
# "final_english_check$assume_english" column name.
netflix_english <- netflix %>%
  mutate(is_english = final_english_check$assume_english)


# Flag modern titles: released in 2000 or later gives 1, anything older 0.

netflix_modern_english <- mutate(
  netflix_english,
  is_modern = as.numeric(release_year > 1999)
)


# Combine the lead director, the lead cast member and the English/modern
# flags into a single data frame, keeping only the columns used downstream.
# `type` becomes the 0/1 column `is_movie` (1 = "Movie").
#
# The earlier intermediate select() mixed negative and positive selections
# (`date_added` had no minus sign) and was superseded by the explicit column
# list anyway, so the columns are now selected once.

netflix_combine <- netflix_modern_english %>%
  mutate(
    lead_director = netflix_director$lead_director,
    cast_lead = netflix_lead$cast_lead
  ) %>%
  select(
    show_id, title, is_modern, type, lead_director, cast_lead, country,
    is_english, rating, listed_in, description
  ) %>%
  mutate(type = ifelse(type == "Movie", 1, 0)) %>%
  rename(is_movie = "type")

# Turn the genre labels in `listed_in` into one 0/1 column per genre group.
# NOTE: groups may overlap; e.g. `international` covers anything that is not
# American content, whatever its other genres.

# Returns 1 where `genres` contains at least one of the given label patterns
# (regular expressions matched anywhere in the string), 0 otherwise.
.genre_flag <- function(genres, ...) {
  as.numeric(str_detect(genres, paste(c(...), collapse = "|")))
}

# NOTE(review): three patterns below accept two spellings ("Sport(s) Movies",
# "Kid's / Kids' TV", "Stand-up / Stand-Up Comedy"). The public Netflix
# dataset appears to use the second spelling, which the original patterns
# never matched - confirm against unique(netflix$listed_in).
netflix_combine <- netflix_combine %>%
  mutate(
    international = .genre_flag(
      listed_in,
      "International TV Shows", "International Movies", "British TV Shows",
      "Spanish-Language TV Shows", "Korean TV Shows"
    ),
    drama = .genre_flag(listed_in, "Dramas", "TV Dramas"),
    horror = .genre_flag(listed_in, "Horror Movies", "TV Horror"),
    action_adventure = .genre_flag(
      listed_in, "Action & Adventure", "TV Action & Adventure"
    ),
    crime = .genre_flag(listed_in, "Crime TV Shows"),
    docu = .genre_flag(
      listed_in, "Documentaries", "Docuseries", "Science & Nature TV"
    ),
    comedy = .genre_flag(
      listed_in,
      "Comedies", "TV Comedies", "Stand-[Uu]p Comedy",
      "Stand-Up Comedy & Talk Shows"
    ),
    anime = .genre_flag(listed_in, "Anime Features", "Anime Series"),
    independent = .genre_flag(listed_in, "Independent Movies"),
    sports = .genre_flag(listed_in, "Sports? Movies"),
    reality = .genre_flag(listed_in, "Reality TV"),
    sci_fi = .genre_flag(listed_in, "TV Sci-Fi & Fantasy", "Sci-Fi & Fantasy"),
    family = .genre_flag(
      listed_in,
      "Kid'?s'? TV", "Children & Family Movies", "Teen TV Shows",
      "Faith & Spirituality"
    ),
    classic = .genre_flag(
      listed_in, "Classic Movies", "Cult Movies", "Classic & Cult TV"
    ),
    thriller_mystery = .genre_flag(
      listed_in, "Thrillers", "TV Thrillers", "TV Mysteries"
    ),
    musical = .genre_flag(listed_in, "Music & Musicals"),
    romantic = .genre_flag(
      listed_in, "Romantic TV Shows", "Romantic Movies", "LGBTQ Movies"
    )
  ) %>%
  select(
    -listed_in, -country
  )

# Same treatment for the content rating: one 0/1 column per rating, with
# NR/UR sharing `not_rated` and TV-Y7/TV-Y7-FV sharing `tv_y7`.

netflix_combine <- netflix_combine %>%
  mutate(
    tv_ma = as.numeric(rating == "TV-MA"),
    r_rated = as.numeric(rating == "R"),
    pg_13 = as.numeric(rating == "PG-13"),
    tv_14 = as.numeric(rating == "TV-14"),
    tv_pg = as.numeric(rating == "TV-PG"),
    not_rated = as.numeric(rating == "NR" | rating == "UR"),
    tv_g = as.numeric(rating == "TV-G"),
    tv_y = as.numeric(rating == "TV-Y"),
    tv_y7 = as.numeric(rating == "TV-Y7" | rating == "TV-Y7-FV"),
    pg = as.numeric(rating == "PG"),
    g_rated = as.numeric(rating == "G"),
    nc_17 = as.numeric(rating == "NC-17")
  ) %>%
  select(-rating)

# Transform the remaining values for the distance metric: `is_english`
# becomes 0 ("no") / 1 (anything else), and missing director / cast entries
# get an explicit placeholder label.

netflix_combine <- netflix_combine %>%
  mutate(
    is_english = as.numeric(is_english != "no"),
    lead_director = replace(
      lead_director, is.na(lead_director), "Unknown/No Director"
    ),
    cast_lead = replace(cast_lead, is.na(cast_lead), "Unknown/No Lead")
  )


# HOW THE RECOMMENDER ENDPOINT SHOULD WORK

# The goal is to take a show or movie title as input and return a number of similar movies or shows.
# The output should also be filterable, so that only the results that meet the user's needs are returned.
# The filters will include: genres (multi-select), movies vs. TV shows vs. either, and the number of recommendations.

# FIRST STEP: store the two name columns as factors

netflix_combine$cast_lead <- as.factor(netflix_combine$cast_lead)
netflix_combine$lead_director <- as.factor(netflix_combine$lead_director)

# SECOND STEP: replace `cast_lead` by its dummy (indicator) columns; the
# original column is removed from the result

netflix_combine_for_knn <- fastDummies::dummy_cols(
  netflix_combine,
  select_columns = "cast_lead",
  remove_selected_columns = TRUE
)

# THIRD STEP: build the feature frame and the pairwise distance matrix used
# by the KNN-style lookup


# Feature frame: the text columns, the director and `is_movie` are left out;
# `show_id` is moved from a column to the row names.
netflix_for_matrix <- netflix_combine_for_knn %>%
  select(-description, -title, -lead_director, -is_movie, -show_id)
rownames(netflix_for_matrix) <- netflix_combine_for_knn$show_id

# Square matrix of "binary" distances between every pair of rows; its row
# and column names are the show ids set above.
netflix_matrix <- as.matrix(dist(netflix_for_matrix, method = "binary"))

# FOURTH STEP: Create a function to be used to generate choices


#' Recommend the titles closest to a given title
#'
#' Looks up the show id of `title` in `reference_data`, reads that show's row
#' of the distance matrix and returns the `k` other titles with the smallest
#' distance. `data`, `matrix` and `reference_data` must describe the same
#' titles in the same row order.
#'
#' @param title Title to base the recommendations on (a single string).
#' @param data Feature data frame whose row names are show ids; used only to
#'   check that the inputs line up.
#' @param matrix Square distance matrix whose row and column names are show
#'   ids.
#' @param k Number of recommendations to return.
#' @param reference_data Data frame providing `show_id`, `title`,
#'   `description`, `country`, `listed_in` and `type` for every row.
#' @return A data frame with the columns `id`, `content_title`, `description`,
#'   `metric`, `type`, `country` and `genres`, ordered by increasing `metric`.
#'   It has at most `k` rows: exactly `k`, not `k + 1` as before, and it is
#'   no longer padded with `NA` rows when fewer titles are available.
new_recommendation <- function(title, data, matrix, k, reference_data) {
  stopifnot(
    length(title) == 1L, !is.na(title),
    is.numeric(k), length(k) == 1L, !is.na(k), k >= 0
  )
  if (nrow(reference_data) != nrow(data) || nrow(data) != ncol(matrix)) {
    stop(
      "`data`, `matrix` and `reference_data` must describe the same titles.",
      call. = FALSE
    )
  }

  # Translate the title to its show_id
  show_id <- as.character(
    reference_data$show_id[which(reference_data$title == title)]
  )
  if (length(show_id) == 0L) {
    stop("Title not found in `reference_data`: ", title, call. = FALSE)
  }
  if (length(show_id) > 1L) {
    warning(
      "Several entries are titled '", title, "'; using the first one.",
      call. = FALSE
    )
    show_id <- show_id[1L]
  }
  if (!show_id %in% rownames(matrix)) {
    stop("Show id '", show_id, "' is not a row of `matrix`.", call. = FALSE)
  }

  ids <- colnames(matrix)
  if (is.null(ids)) {
    ids <- rownames(matrix)
  }

  # One candidate per catalogue entry; unname() keeps the matrix's column
  # names from becoming the row names of the result.
  choices <- data.frame(
    id = ids,
    content_title = reference_data$title,
    description = reference_data$description,
    metric = unname(matrix[show_id, ]),
    type = reference_data$type,
    country = reference_data$country,
    genres = reference_data$listed_in,
    stringsAsFactors = FALSE
  )

  # Drop the requested title itself (and any entry sharing its name), then
  # put the closest candidates first; ties keep their catalogue order.
  is_other <- !is.na(choices$content_title) & choices$content_title != title
  choices <- choices[is_other, , drop = FALSE]
  choices <- choices[order(choices$metric), , drop = FALSE]
  rownames(choices) <- NULL

  head(choices, k)
}
```

```{r}
# Example: titles of the recommendations for "Lucifer" (k = 10).
new_recommendation(
  title = "Lucifer",
  data = netflix_for_matrix,
  matrix = netflix_matrix,
  k = 10,
  reference_data = netflix
)$content_title
```

Share this post

assignment help, assignment helper, assignment experts, assignment writing services