In this post, I analyze a #TidyTuesday dataset on honey bee colonies to investigate the factors contributing to colony losses in the United States. The main goal is to tune the hyperparameters of an XGBoost model and then use the resulting model to predict colony loss rates.

Import the datasets

library(tidyverse)

colony <-
  readr::read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-11/colony.csv'
  )
stressor <-
  readr::read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-11/stressor.csv'
  )
stressor_wider <-
  pivot_wider(stressor, names_from = stressor, values_from = stress_pct)

colony_raw <-
  colony %>% left_join(stressor_wider, by = c("year", "months", "state"))
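
Before imputing anything, it helps to see how much is actually missing after the join. A minimal check (just a sketch using the joined data frame above):

# Count the missing values in each column of the joined data
colony_raw %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  glimpse()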

Use missForest to impute missing values

library(missForest)

colony_joined <- colony_raw %>% mutate_if(is.character, factor) %>%
  mutate(year = factor(year)) %>%
  rename(
    `Stressor: Varroa mites` = `Varroa mites`,
    `Stressor: Other pests/parasites` = `Other pests/parasites`,
    `Stressor: Pesticides` = Pesticides,
    `Stressor: Other` = Other,
    `Stressor: Unknown` = Unknown
  ) %>%
  select(-colony_n, -colony_reno, -colony_added) %>% as.data.frame()

colony_imputed <- missForest(colony_joined, ntree = 100)

colony_full <- colony_imputed$ximp

colony_full <-
  colony_full %>% mutate(colony_lost = round(colony_lost)) %>%
  filter(state != "United States") %>% rename_with( ~ str_to_title(.))
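
missForest also returns an out-of-bag estimate of the imputation error (NRMSE for the numeric variables, PFC for the factors), which is worth a quick look before trusting the imputed values:

# Out-of-bag imputation error: NRMSE (continuous) and PFC (categorical)
colony_imputed$OOBerror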

Let’s explore the data with some visualizations

library(scales)

colony_full %>% ggplot(aes(x = Months, y = Colony_lost_pct,
                           color = Months)) +
  geom_boxplot(outlier.alpha = 0.5, show.legend = FALSE) +
  labs(x = NULL, y = "Percent of colonies lost", title = "Percent of total colonies lost by month and year") +
  theme(
    axis.text.x = element_text(
      angle = 45,
      vjust = 1,
      hjust = 1
    ),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_y_continuous(labels = label_percent(scale = 1)) +
  facet_wrap( ~ Year, scales = "free_y")

colony_full %>% ggplot(aes(x = `Stressor: Varroa Mites`, y = Colony_lost_pct,
                           color = Months)) +
  geom_point() + scale_color_viridis_d() +
  labs(x = "Percent of colonies affected by Varroa mites",
       y = "Colonies lost", title = "Percent of total colonies lost due to Varroa mites") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_y_continuous(labels = label_percent(scale = 1)) +
  scale_x_continuous(labels = label_percent(scale = 1)) +
  facet_wrap( ~ Year, scales = "free")

colony_full %>% ggplot(aes(x = `Stressor: Pesticides`, y = Colony_lost_pct,
                           color = Months)) +
  geom_point() + scale_color_viridis_d() +
  labs(x = "Percent of colonies affected by pesticides",
       y = "Colonies lost", title = "Percent of total colonies lost due to pesticides") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_y_continuous(labels = label_percent(scale = 1)) +
  scale_x_continuous(labels = label_percent(scale = 1)) +
  facet_wrap( ~ Year, scales = "free")

Define the outcome of interest as the number of colony losses per hundred colonies

colony_full <-
  colony_full %>% mutate(Colony_lost_rate = Colony_lost * 100 / Colony_max) %>%
  select(-Year, -Colony_max, -Colony_lost, -Colony_lost_pct)

library(tidymodels)

set.seed(100)

colony_split <- colony_full %>% initial_split()


colony_test <- testing(colony_split)
colony_train <- training(colony_split)

Data preprocessing and feature engineering

boost_rec <- recipe(Colony_lost_rate ~ ., data = colony_train) %>%
  step_log(all_outcomes()) %>% step_dummy(all_nominal())

boost_rec
## Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          9
## 
## Operations:
## 
## Log transformation on all_outcomes()
## Dummy variables from all_nominal()
boost_prep <- prep(boost_rec)

train_juiced <- juice(boost_prep)
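
To confirm the recipe did what we expect (log-transformed outcome, dummy-coded month and state), a quick glimpse of the juiced training data helps:

# Outcome should be on the log scale; factor columns should now be 0/1 dummies
glimpse(train_juiced)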

Build an XGBoost model and tune its hyperparameters

boost_spec <- boost_tree(
  tree_depth = tune(),
  trees = 1000,
  learn_rate = tune(),
  mtry = tune(),
  min_n = tune(),
  loss_reduction = tune(),
  sample_size = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

boost_spec
## Boosted Tree Model Specification (regression)
## 
## Main Arguments:
##   mtry = tune()
##   trees = 1000
##   min_n = tune()
##   tree_depth = tune()
##   learn_rate = tune()
##   loss_reduction = tune()
##   sample_size = tune()
## 
## Computational engine: xgboost
boost_grid <- grid_latin_hypercube(
  tree_depth(),
  learn_rate(),
  finalize(mtry(), colony_train),
  min_n(),
  loss_reduction(),
  sample_size = sample_prop(),
  size = 20
)

boost_wf <-
  workflow() %>% add_recipe(boost_rec) %>% add_model(boost_spec)


library(doParallel)

set.seed(123)

boost_fold <- vfold_cv(colony_train)

cl <- makeCluster(2)

registerDoParallel(cl)

set.seed(234)

tune_res <- tune_grid(boost_wf, resamples = boost_fold,
                      grid = boost_grid, control = control_grid(save_pred = TRUE))
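
With the tuning done, the parallel workers can be shut down:

stopCluster(cl)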

tune_res %>% collect_metrics()
## # A tibble: 40 x 12
##     mtry min_n tree_depth learn_rate loss_reduction sample_size .metric
##    <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>  
##  1    10    37          1   5.08e- 6  0.00000000191       0.548 rmse   
##  2    10    37          1   5.08e- 6  0.00000000191       0.548 rsq    
##  3     9    11          5   2.84e- 9  0.00000160          0.159 rmse   
##  4     9    11          5   2.84e- 9  0.00000160          0.159 rsq    
##  5    10    36          3   7.85e- 5  0.0000000122        0.739 rmse   
##  6    10    36          3   7.85e- 5  0.0000000122        0.739 rsq    
##  7     6    31         15   1.14e-10  0.00000461          0.866 rmse   
##  8     6    31         15   1.14e-10  0.00000461          0.866 rsq    
##  9     4    20          6   1.53e- 8  0.368               0.796 rmse   
## 10     4    20          6   1.53e- 8  0.368               0.796 rsq    
## # ... with 30 more rows, and 5 more variables: .estimator <chr>, mean <dbl>,
## #   n <int>, std_err <dbl>, .config <chr>
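
Before picking a winner, it helps to see how RMSE varies across the sampled hyperparameter values. tune's autoplot() method gives a quick overview (a sketch; the panels shown depend on which parameters were tuned):

autoplot(tune_res, metric = "rmse")
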
library(kableExtra)

tune_res %>%
  show_best(metric = "rmse") %>% select(-.config) %>%
  mutate_if(is.numeric, round, digits = 4) %>%
  kbl() %>%
  kable_paper("hover", font_size = 13, full_width = F)
 mtry  min_n  tree_depth  learn_rate  loss_reduction  sample_size  .metric  .estimator    mean   n  std_err
    6     22          12      0.0212          0.0001       0.3479  rmse     standard    0.6333  10   0.0276
    8     26           3      0.0638          0.0000       0.3190  rmse     standard    0.6356  10   0.0262
    2     12           8      0.0060          8.3842       0.1162  rmse     standard    0.7382  10   0.0343
    7     16           9      0.0018          0.0000       0.4127  rmse     standard    0.7487  10   0.0242
    1     34          14      0.0008          0.0792       0.2520  rmse     standard    1.1155  10   0.0169
rmse_best <- tune_res %>% select_best(metric = "rmse")

final_model <- finalize_model(boost_spec,
                              rmse_best)
final_model
## Boosted Tree Model Specification (regression)
## 
## Main Arguments:
##   mtry = 6
##   trees = 1000
##   min_n = 22
##   tree_depth = 12
##   learn_rate = 0.0211556647988789
##   loss_reduction = 5.14459208654843e-05
##   sample_size = 0.347938195504248
## 
## Computational engine: xgboost

Most important predictors

library(vip)

final_model %>% set_engine("xgboost") %>%
  fit(Colony_lost_rate ~ ., data = train_juiced) %>%
  vip(geom = "point")

Evaluate the final XGBoost model

final_wf <- workflow() %>%
  add_recipe(boost_rec) %>%
  add_model(final_model)

xgb_fres <- final_wf %>%
  last_fit(colony_split)

xgb_fres %>%
  collect_metrics() %>% select(-.config) %>%
  kbl() %>%
  kable_paper("hover", full_width = F)
.metric  .estimator  .estimate
rmse     standard    0.5981180
rsq      standard    0.4490157
collect_predictions(xgb_fres)
## # A tibble: 299 x 5
##    id               .pred  .row Colony_lost_rate .config             
##    <chr>            <dbl> <int>            <dbl> <chr>               
##  1 train/test split  1.86     3             2.37 Preprocessor1_Model1
##  2 train/test split  2.71     5             2.48 Preprocessor1_Model1
##  3 train/test split  2.60     7             2.59 Preprocessor1_Model1
##  4 train/test split  2.18     8             2.63 Preprocessor1_Model1
##  5 train/test split  2.91    15             3.66 Preprocessor1_Model1
##  6 train/test split  1.79    25             2.13 Preprocessor1_Model1
##  7 train/test split  2.94    29             3.29 Preprocessor1_Model1
##  8 train/test split  2.00    33             2.01 Preprocessor1_Model1
##  9 train/test split  2.87    37             3.05 Preprocessor1_Model1
## 10 train/test split  2.35    40             2.54 Preprocessor1_Model1
## # ... with 289 more rows
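
Finally, plotting the test-set predictions against the observed values shows where the model over- or under-predicts. Both axes are on the log scale, since the recipe log-transforms the outcome (a sketch using the predictions collected above):

# Predicted vs. observed colony loss rate on the held-out test set
collect_predictions(xgb_fres) %>%
  ggplot(aes(Colony_lost_rate, .pred)) +
  geom_abline(lty = 2, color = "gray50") +
  geom_point(alpha = 0.5) +
  coord_obs_pred() +
  labs(x = "Observed log(colony loss rate)",
       y = "Predicted log(colony loss rate)",
       title = "Test set predictions vs. observed values") +
  theme(plot.title = element_text(hjust = 0.5))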