In this post, I analyze a #TidyTuesday dataset on honey bee colonies to investigate the factors contributing to colony losses in the United States. The main goal is to tune the hyperparameters of an XGBoost model and then use the resulting model to predict colony loss rates.

Import the datasets

library(tidyverse)

colony <-
  readr::read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-11/colony.csv'
  )
stressor <-
  readr::read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-11/stressor.csv'
  )
stressor_wider <-
  pivot_wider(stressor, names_from = stressor, values_from = stress_pct)

colony_raw <-
  colony %>% left_join(stressor_wider, by = c("year", "months", "state"))
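
Before imputing anything, it helps to see how much is actually missing after the join. A minimal check (just a sketch using the joined data frame above):

# Count the missing values in each column of the joined data
colony_raw %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  glimpse()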

Use missForest to impute missing values

library(missForest)

colony_joined <- colony_raw %>% mutate_if(is.character, factor) %>%
  mutate(year = factor(year)) %>%
  rename(
    `Stressor: Varroa mites` = `Varroa mites`,
    `Stressor: Other pests/parasites` = `Other pests/parasites`,
    `Stressor: Pesticides` = Pesticides,
    `Stressor: Other` = Other,
    `Stressor: Unknown` = Unknown
  ) %>%
  select(-colony_n, -colony_reno, -colony_added) %>% as.data.frame()

colony_imputed <- missForest(colony_joined, ntree = 100)

colony_full <- colony_imputed$ximp

colony_full <-
  colony_full %>% mutate(colony_lost = round(colony_lost)) %>%
  filter(state != "United States") %>% rename_with( ~ str_to_title(.))
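
missForest also returns an out-of-bag estimate of the imputation error (NRMSE for the numeric variables, PFC for the factors), which is worth a quick look before trusting the imputed values:

# Out-of-bag imputation error: NRMSE (continuous) and PFC (categorical)
colony_imputed$OOBerror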

Let’s explore the data with some visualizations

library(scales)

colony_full %>% ggplot(aes(x = Months, y = Colony_lost_pct,
                           color = Months)) +
  geom_boxplot(outlier.alpha = 0.5, show.legend = FALSE) +
  labs(x = NULL, y = "Percent of colonies lost", title = "Percent of total colonies lost by month and year") +
  theme(
    axis.text.x = element_text(
      angle = 45,
      vjust = 1,
      hjust = 1
    ),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_y_continuous(labels = label_percent(scale = 1)) +
  facet_wrap( ~ Year, scales = "free_y")

colony_full %>% ggplot(aes(x = `Stressor: Varroa Mites`, y = Colony_lost_pct,
                           color = Months)) +
  geom_point() + scale_color_viridis_d() +
  labs(x = "Percent of colonies affected by Varroa mites",
       y = "Colonies lost", title = "Percent of total colonies lost due to Varroa mites") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_y_continuous(labels = label_percent(scale = 1)) +
  scale_x_continuous(labels = label_percent(scale = 1)) +
  facet_wrap( ~ Year, scales = "free")

colony_full %>% ggplot(aes(x = `Stressor: Pesticides`, y = Colony_lost_pct,
                           color = Months)) +
  geom_point() + scale_color_viridis_d() +
  labs(x = "Percent of colonies affected by pesticides",
       y = "Colonies lost", title = "Percent of total colonies lost due to pesticides") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_y_continuous(labels = label_percent(scale = 1)) +
  scale_x_continuous(labels = label_percent(scale = 1)) +
  facet_wrap( ~ Year, scales = "free")

Define the outcome of interest as the number of colony losses per hundred colonies

colony_full <-
  colony_full %>% mutate(Colony_lost_rate = Colony_lost * 100 / Colony_max) %>%
  select(-Year, -Colony_max, -Colony_lost, -Colony_lost_pct)

library(tidymodels)

set.seed(100)

colony_split <- colony_full %>% initial_split()


colony_test <- testing(colony_split)
colony_train <- training(colony_split)

Data preprocessing and feature engineering

boost_rec <- recipe(Colony_lost_rate ~ ., data = colony_train) %>%
  step_log(all_outcomes()) %>% step_dummy(all_nominal())

boost_rec
## Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          9
## 
## Operations:
## 
## Log transformation on all_outcomes()
## Dummy variables from all_nominal()
boost_prep <- prep(boost_rec)

train_juiced <- juice(boost_prep)
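
To confirm the recipe did what we expect (log-transformed outcome, dummy-coded month and state), a quick glimpse of the juiced training data helps:

# Outcome should be on the log scale; factor columns should now be 0/1 dummies
glimpse(train_juiced)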

Build an XGBoost model and tune its hyperparameters

boost_spec <- boost_tree(
  tree_depth = tune(),
  trees = 1000,
  learn_rate = tune(),
  mtry = tune(),
  min_n = tune(),
  loss_reduction = tune(),
  sample_size = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

boost_spec
## Boosted Tree Model Specification (regression)
## 
## Main Arguments:
##   mtry = tune()
##   trees = 1000
##   min_n = tune()
##   tree_depth = tune()
##   learn_rate = tune()
##   loss_reduction = tune()
##   sample_size = tune()
## 
## Computational engine: xgboost
boost_grid <- grid_latin_hypercube(
  tree_depth(),
  learn_rate(),
  finalize(mtry(), colony_train),
  min_n(),
  loss_reduction(),
  sample_size = sample_prop(),
  size = 20
)

boost_wf <-
  workflow() %>% add_recipe(boost_rec) %>% add_model(boost_spec)


library(doParallel)

set.seed(123)

boost_fold <- vfold_cv(colony_train)

cl <- makeCluster(2)

registerDoParallel(cl)

set.seed(234)

tune_res <- tune_grid(boost_wf, resamples = boost_fold,
                      grid = boost_grid, control = control_grid(save_pred = TRUE))
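
With the tuning done, the parallel workers can be shut down:

stopCluster(cl)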

tune_res %>% collect_metrics()
## # A tibble: 40 x 12
##     mtry min_n tree_depth learn_rate loss_reduction sample_size .metric
##    <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>  
##  1    10    37          1   5.08e- 6  0.00000000191       0.548 rmse   
##  2    10    37          1   5.08e- 6  0.00000000191       0.548 rsq    
##  3     9    11          5   2.84e- 9  0.00000160          0.159 rmse   
##  4     9    11          5   2.84e- 9  0.00000160          0.159 rsq    
##  5    10    36          3   7.85e- 5  0.0000000122        0.739 rmse   
##  6    10    36          3   7.85e- 5  0.0000000122        0.739 rsq    
##  7     6    31         15   1.14e-10  0.00000461          0.866 rmse   
##  8     6    31         15   1.14e-10  0.00000461          0.866 rsq    
##  9     4    20          6   1.53e- 8  0.368               0.796 rmse   
## 10     4    20          6   1.53e- 8  0.368               0.796 rsq    
## # ... with 30 more rows, and 5 more variables: .estimator <chr>, mean <dbl>,
## #   n <int>, std_err <dbl>, .config <chr>
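
Before picking a winner, it helps to see how RMSE varies across the sampled hyperparameter values. tune's autoplot() method gives a quick overview (a sketch; the panels shown depend on which parameters were tuned):

autoplot(tune_res, metric = "rmse")
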
library(kableExtra)

tune_res %>%
  show_best(metric = "rmse") %>% select(-.config) %>%
  mutate_if(is.numeric, round, digits = 4) %>%
  kbl() %>%
  kable_paper("hover", font_size = 13, full_width = F)
 mtry  min_n  tree_depth  learn_rate  loss_reduction  sample_size  .metric  .estimator    mean   n  std_err
    6     22          12      0.0212          0.0001       0.3479  rmse     standard    0.6333  10   0.0276
    8     26           3      0.0638          0.0000       0.3190  rmse     standard    0.6356  10   0.0262
    2     12           8      0.0060          8.3842       0.1162  rmse     standard    0.7382  10   0.0343
    7     16           9      0.0018          0.0000       0.4127  rmse     standard    0.7487  10   0.0242
    1     34          14      0.0008          0.0792       0.2520  rmse     standard    1.1155  10   0.0169
rmse_best <- tune_res %>% select_best(metric = "rmse")

final_model <- finalize_model(boost_spec,
                              rmse_best)
final_model
## Boosted Tree Model Specification (regression)
## 
## Main Arguments:
##   mtry = 6
##   trees = 1000
##   min_n = 22
##   tree_depth = 12
##   learn_rate = 0.0211556647988789
##   loss_reduction = 5.14459208654843e-05
##   sample_size = 0.347938195504248
## 
## Computational engine: xgboost

Most important predictors

library(vip)

final_model %>% set_engine("xgboost") %>%
  fit(Colony_lost_rate ~ ., data = train_juiced) %>%
  vip(geom = "point")

Evaluate the final XGBoost model

final_wf <- workflow() %>%
  add_recipe(boost_rec) %>%
  add_model(final_model)

xgb_fres <- final_wf %>%
  last_fit(colony_split)

xgb_fres %>%
  collect_metrics() %>% select(-.config) %>%
  kbl() %>%
  kable_paper("hover", full_width = F)
.metric  .estimator  .estimate
rmse     standard    0.5981180
rsq      standard    0.4490157
collect_predictions(xgb_fres)
## # A tibble: 299 x 5
##    id               .pred  .row Colony_lost_rate .config             
##    <chr>            <dbl> <int>            <dbl> <chr>               
##  1 train/test split  1.86     3             2.37 Preprocessor1_Model1
##  2 train/test split  2.71     5             2.48 Preprocessor1_Model1
##  3 train/test split  2.60     7             2.59 Preprocessor1_Model1
##  4 train/test split  2.18     8             2.63 Preprocessor1_Model1
##  5 train/test split  2.91    15             3.66 Preprocessor1_Model1
##  6 train/test split  1.79    25             2.13 Preprocessor1_Model1
##  7 train/test split  2.94    29             3.29 Preprocessor1_Model1
##  8 train/test split  2.00    33             2.01 Preprocessor1_Model1
##  9 train/test split  2.87    37             3.05 Preprocessor1_Model1
## 10 train/test split  2.35    40             2.54 Preprocessor1_Model1
## # ... with 289 more rows
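
Finally, plotting the test-set predictions against the observed values shows where the model over- or under-predicts. Both axes are on the log scale, since the recipe log-transforms the outcome (a sketch using the predictions collected above):

# Predicted vs. observed colony loss rate on the held-out test set
collect_predictions(xgb_fres) %>%
  ggplot(aes(Colony_lost_rate, .pred)) +
  geom_abline(lty = 2, color = "gray50") +
  geom_point(alpha = 0.5) +
  coord_obs_pred() +
  labs(x = "Observed log(colony loss rate)",
       y = "Predicted log(colony loss rate)",
       title = "Test set predictions vs. observed values") +
  theme(plot.title = element_text(hjust = 0.5))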