In this post, I use a #TidyTuesday dataset on Netflix TV shows to develop a penalized logistic regression model that predicts whether a show will have a single season or multiple seasons. The model leverages predictors such as the show's title, summary description, genre, and other relevant features. The primary aim of the project is to demonstrate the application of basic text analysis in predictive modeling.

Import the dataset

library(tidyverse)
library(tidytuesdayR)

tuesdata <- tidytuesdayR::tt_load('2021-04-20')
## 
##  Downloading file 1 of 1: `netflix_titles.csv`
Netflix_data <- tuesdata$netflix_titles
library(skimr)
library(lubridate)

## skim the data to get a rough understanding of the dataset

skim(Netflix_data)
Data summary
Name Netflix_data
Number of rows 7787
Number of columns 12
_______________________
Column type frequency:
character 11
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
show_id 0 1.00 2 5 0 7787 0
type 0 1.00 5 7 0 2 0
title 0 1.00 1 104 0 7787 0
director 2389 0.69 2 208 0 4049 0
cast 718 0.91 3 771 0 6831 0
country 507 0.93 4 123 0 681 0
date_added 10 1.00 11 19 0 1565 0
rating 7 1.00 1 8 0 14 0
duration 0 1.00 5 10 0 216 0
listed_in 0 1.00 6 79 0 492 0
description 0 1.00 61 248 0 7769 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
release_year 0 1 2013.93 8.76 1925 2013 2017 2018 2021 ▁▁▁▁▇
Netflix_data %>% filter(type == "TV Show") %>%
  select(description) %>% slice_sample(n = 10)
## # A tibble: 10 x 1
##    description                                                                  
##    <chr>                                                                        
##  1 Saved from a plane crash and given supernatural powers, teen Tarzan joins fo~
##  2 Bear buddies Bucky and Bjorn play games, go on far-out adventures and learn ~
##  3 Bea Smith is locked up while awaiting trial for the alleged attempted murder~
##  4 Famed horticulturist Alan Titchmarsh hosts this uplifting show in which he a~
##  5 Rebellious and broke, Sophia stumbles into creating an online business and l~
##  6 This docuseries details the suspicious death of Alberto Nisman, investigator~
##  7 This documentary series explores the stories behind history's most fascinati~
##  8 A mission gone awry transports Sonic the Hedgehog, his friends, Dr. Eggman a~
##  9 Follow renowned soccer club Juventus on and off the pitch as they attempt to~
## 10 A young journalist is forced into a life of crime to save his father and fam~
Netflix_pre <- Netflix_data %>% filter(type == "TV Show") %>%
  transmute(
    title,
    country = fct_lump_min(country, min = 100, other_level = "Country-other"),
    month = factor(months(mdy(date_added))),
    year_2021_release = max(release_year) - release_year,
    rating = fct_lump_min(rating, min = 100, other_level = "TV-other"),
    duration = relevel(factor(
      case_when(duration == "1 Season" ~ "One season",
                TRUE ~ "More than one season")
    ), ref = "One season"),
    listed_in,
    description
  ) %>%
  na.omit()

The graph below shows the bar plot of TV Rating by Duration. It is seen that TV-Y programs (aimed at a very young audience, including children from ages 2-6), and TV-Y7 programs (most appropriate for children age 7 and up) are more likely to have more than one season!

library(viridisLite)

Netflix_pre %>%
  count(rating, duration) %>% group_by(rating) %>%
  mutate(Proportion = n / sum(n)) %>%
  ggplot(aes(rating, Proportion, fill = duration)) +
  geom_col(position = "dodge") +
  scale_fill_viridis_d() +
  labs(x = "TV Rating", y = "Proportion") +
  guides(fill = guide_legend(title = "Duration"))

The following bar plot displays the share of countries in “One season” and “More than one season” TV shows.

Netflix_pre %>%
  count(country, duration) %>% group_by(duration) %>%
  mutate(Proportion = n / sum(n)) %>%
  ggplot(aes(country, Proportion, fill = country)) +
  geom_col(position = "dodge", show.legend = FALSE) +
  scale_fill_viridis_d() +
  labs(x = NULL, y = "Proportion") +
  facet_wrap(~ duration, scales = "free_x") +
  coord_flip()

Combine the variables “title”, “listed_in”, and “description” to have a single text variable. Changing the word “Sci-Fi” to “Sci_Fi” let us keep it when tokenizing the text variable. There might be more of words of this nature, but we are not going to do a deep search.

library(tidytext)
library(tidyr)
library(tm)
library(wordcloud)


Netflix_concat <- Netflix_pre %>%
  unite(
    "full_description",
    c(title, listed_in, description),
    remove = FALSE,
    sep = " "
  ) %>%
  mutate(full_description = str_replace_all(full_description,
                                            pattern = "Sci-Fi",
                                            replacement = "Sci_Fi"))


## lets create wordclouds for the both categories of the response variable.
## see the most frequent words in each category.


word_cloud_data <- Netflix_concat %>%
  unnest_tokens(word, full_description, token = "words") %>%
  anti_join(tibble(word = c(stopwords("english"), "shows", "tv"))) %>%
  count(duration, word, sort = TRUE) %>%
  ungroup()

layout.matrix <- matrix(c(1, 2), nrow = 1, ncol = 2)

layout(mat = layout.matrix,
       heights = c(1, 2), # Heights of the two rows
       widths = c(1.8, 2)) # Widths of the two columns

word_cloud_data %>% filter(duration == "One season") %>%
  with(
    wordcloud(
      word,
      n,
      scale = c(2, 0.5),
      min.freq = 20,
      max.words = 100,
      colors = viridis(10),
      rot.per = 0,
      random.order = FALSE
    )
  ) # Left panel

word_cloud_data %>% filter(duration == "More than one season") %>%
  with(
    wordcloud(
      word,
      n,
      scale = c(2, 0.4),
      min.freq = 20,
      max.words = 100,
      colors = viridis(10),
      rot.per = 0,
      random.order = FALSE
    )
  ) # Right panel

Build Model

Start with splitting our data into training and testing sets.

library(tidymodels)

set.seed(123)

split_Netflix_concat <-
  initial_split(Netflix_concat, strata = duration, prop = 0.7)

Netflix_train <- training(split_Netflix_concat)

Netflix_test <- testing(split_Netflix_concat)

Create a recipe to pre-process our data.

library(textrecipes)
library(themis)

Netflix_rec <- recipe(duration ~ full_description +
                        country +
                        month +
                        year_2021_release +
                        rating,
                      data = Netflix_train) %>%
  step_dummy(country, month, rating) %>%
  step_tokenize(full_description) %>%
  step_stopwords(full_description,
                 custom_stopword_source = c(stopwords("english"), "shows", "tv")) %>%
  step_stem(full_description) %>%
  step_tokenfilter(full_description, max_tokens = 800) %>%
  step_tf(full_description) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_smote(duration)

Netflix_rec
## Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          5
## 
## Operations:
## 
## Dummy variables from country, month, rating
## Tokenization for full_description
## Stop word removal for full_description
## Stemming for full_description
## Text filtering for full_description
## Term frequency with full_description
## Centering and scaling for all_numeric_predictors()
## SMOTE based on duration

We use a generalized linear model (Logistic LASSO regression), and tune its parameters usning a validation dataset.

library(glmnet)

svm_spec <- logistic_reg(penalty = tune(),
                         mixture = tune()) %>%
  set_mode("classification") %>%
  set_engine("glmnet")


Netflix_wf <- workflow() %>%
  add_recipe(Netflix_rec) %>%
  add_model(svm_spec)

Netflix_wf
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 8 Recipe Steps
## 
## * step_dummy()
## * step_tokenize()
## * step_stopwords()
## * step_stem()
## * step_tokenfilter()
## * step_tf()
## * step_normalize()
## * step_smote()
## 
## -- Model -----------------------------------------------------------------------
## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = tune()
##   mixture = tune()
## 
## Computational engine: glmnet
svm_grid <- grid_regular(penalty(),
                         mixture(),
                         levels = 40)

svm_grid
## # A tibble: 1,600 x 2
##     penalty mixture
##       <dbl>   <dbl>
##  1 1   e-10       0
##  2 1.80e-10       0
##  3 3.26e-10       0
##  4 5.88e-10       0
##  5 1.06e- 9       0
##  6 1.91e- 9       0
##  7 3.46e- 9       0
##  8 6.24e- 9       0
##  9 1.13e- 8       0
## 10 2.03e- 8       0
## # ... with 1,590 more rows
set.seed(234)

cv_fold <- vfold_cv(Netflix_train, strata = duration, v = 5)

cv_fold
## #  5-fold cross-validation using stratification 
## # A tibble: 5 x 2
##   splits             id   
##   <list>             <chr>
## 1 <split [1187/297]> Fold1
## 2 <split [1187/297]> Fold2
## 3 <split [1187/297]> Fold3
## 4 <split [1187/297]> Fold4
## 5 <split [1188/296]> Fold5
library(doParallel)

cl <- makeCluster(2)

registerDoParallel(cl)

set.seed(345)

svm_res <- tune_grid(
  Netflix_wf,
  resamples = cv_fold,
  grid = svm_grid,
  metrics = metric_set(roc_auc)
)

svm_res %>% collect_metrics()
## # A tibble: 1,600 x 8
##     penalty mixture .metric .estimator  mean     n std_err .config              
##       <dbl>   <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
##  1 1   e-10       0 roc_auc binary     0.626     5  0.0109 Preprocessor1_Model0~
##  2 1.80e-10       0 roc_auc binary     0.626     5  0.0109 Preprocessor1_Model0~
##  3 3.26e-10       0 roc_auc binary     0.626     5  0.0109 Preprocessor1_Model0~
##  4 5.88e-10       0 roc_auc binary     0.626     5  0.0109 Preprocessor1_Model0~
##  5 1.06e- 9       0 roc_auc binary     0.626     5  0.0109 Preprocessor1_Model0~
##  6 1.91e- 9       0 roc_auc binary     0.626     5  0.0109 Preprocessor1_Model0~
##  7 3.46e- 9       0 roc_auc binary     0.626     5  0.0109 Preprocessor1_Model0~
##  8 6.24e- 9       0 roc_auc binary     0.626     5  0.0109 Preprocessor1_Model0~
##  9 1.13e- 8       0 roc_auc binary     0.626     5  0.0109 Preprocessor1_Model0~
## 10 2.03e- 8       0 roc_auc binary     0.626     5  0.0109 Preprocessor1_Model0~
## # ... with 1,590 more rows
svm_res %>% show_best(metric = "roc_auc")
## # A tibble: 5 x 8
##   penalty mixture .metric .estimator  mean     n std_err .config                
##     <dbl>   <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                  
## 1  0.0289   0.872 roc_auc binary     0.704     5  0.0102 Preprocessor1_Model1394
## 2  0.0289   0.846 roc_auc binary     0.703     5  0.0102 Preprocessor1_Model1354
## 3  0.0289   0.897 roc_auc binary     0.703     5  0.0105 Preprocessor1_Model1434
## 4  0.0289   0.769 roc_auc binary     0.703     5  0.0108 Preprocessor1_Model1234
## 5  0.0289   0.821 roc_auc binary     0.703     5  0.0101 Preprocessor1_Model1314
auc_best <- svm_res %>% select_best(metric = "roc_auc")

final_wf <- finalize_workflow(Netflix_wf, auc_best)

final_wf
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 8 Recipe Steps
## 
## * step_dummy()
## * step_tokenize()
## * step_stopwords()
## * step_stem()
## * step_tokenfilter()
## * step_tf()
## * step_normalize()
## * step_smote()
## 
## -- Model -----------------------------------------------------------------------
## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = 0.0289426612471674
##   mixture = 0.871794871794872
## 
## Computational engine: glmnet

We finalized the workflow, and now we fit it on our training dataset to see the most important predictors.

library(vip)

wf_final_train <- final_wf %>%
  fit(Netflix_train) %>%
  extract_fit_parsnip()

wf_final_train %>%
  vi(Lambda = auc_best$penalty, alpha = auc_best$mixture) %>%
  group_by(Sign) %>%
  slice_max(abs(Importance), n = 30) %>%
  ungroup() %>%
  mutate(
    Variable = str_replace_all(Variable, "tf_full_description", "word"),
    Importance = abs(Importance),
    Variable = fct_reorder(Variable, Importance)
  ) %>%
  ggplot(aes(Variable, Importance, fill = Sign)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ Sign, scales = "free") +
  coord_flip()

In the following graph we can see what words help distinguish between “One season” shows and “More than one season” shows.

tidy(wf_final_train) %>%
  mutate(word_var = str_detect(term, "tf_full_description")) %>%
  filter(term != "(Intercept)", word_var == TRUE) %>%
  group_by(sign = relevel(factor(
    case_when(estimate > 0 ~ "More than one season",
              TRUE ~ "One season")
  ), ref = "One season")) %>%
  slice_max(abs(estimate), n = 20) %>%
  ungroup() %>%
  mutate(term = str_replace_all(term, "tf_full_description", "word")) %>%
  ggplot(aes(abs(estimate), fct_reorder(term, abs(estimate)), fill = sign)) +
  geom_col(show.legend = FALSE) +
  facet_wrap( ~ sign, scales = "free") +
  labs(x = "Coefficient from Logistic LASSO regression", y = NULL,
       title = "Words that are most predictive of number of seasons") +
  theme(plot.title = element_text(hjust = 0.5))

Evaluate the Model

Now, we can evaluate the performance of our model on the testing dataset. Based on the resulted “roc_auc”, we can happily say that we are not overfitting since the resulted “roc_auc” value here is close to the “roc_auc” value obtained using the validation dataset. Given the limited inputs, accuracy and roc_auc values for the testing dataset suggest that our model is acceptable:)

test_res <- final_wf %>% last_fit(split_Netflix_concat,
                                  metrics = metric_set(roc_auc, precision, accuracy))

test_res %>% collect_metrics()
## # A tibble: 3 x 4
##   .metric   .estimator .estimate .config             
##   <chr>     <chr>          <dbl> <chr>               
## 1 precision binary         0.773 Preprocessor1_Model1
## 2 accuracy  binary         0.697 Preprocessor1_Model1
## 3 roc_auc   binary         0.727 Preprocessor1_Model1
## confusion matrix

test_res %>%
  collect_predictions() %>%
  conf_mat(duration, .pred_class)
##                       Truth
## Prediction             One season More than one season
##   One season                  310                   91
##   More than one season        102                  135
## ROC curve

test_res %>%
  collect_predictions() %>%
  roc_curve(duration, `.pred_One season`) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_line(size = 1.5, color = "#440154FF") +
  geom_abline(
    alpha = 0.8,
    color = "#FDE725FF",
    size = 1.2
  )