In this post, I analyze a #TidyTuesday dataset on honey bee colonies to investigate the factors contributing to colony losses in the United States. The main goal is to tune the hyperparameters of an XGBoost model and then use the resulting model to predict colony loss rates.
Import the datasets
library(tidyverse)
colony <- readr::read_csv(
  'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-11/colony.csv'
)
stressor <- readr::read_csv(
  'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-11/stressor.csv'
)

stressor_wider <-
  pivot_wider(stressor, names_from = stressor, values_from = stress_pct)

colony_raw <- colony %>%
  left_join(stressor_wider, by = c("year", "months", "state"))
Use missForest to impute missing values
library(missForest)
colony_joined <- colony_raw %>%
  mutate_if(is.character, factor) %>%
  mutate(year = factor(year)) %>%
  rename(
    `Stressor: Varroa mites` = `Varroa mites`,
    `Stressor: Other pests/parasites` = `Other pests/parasites`,
    `Stressor: Pesticides` = Pesticides,
    `Stressor: Other` = Other,
    `Stressor: Unknown` = Unknown
  ) %>%
  select(-colony_n, -colony_reno, -colony_added) %>%
  as.data.frame()

colony_imputed <- missForest(colony_joined, ntree = 100)
colony_full <- colony_imputed$ximp

colony_full <- colony_full %>%
  mutate(colony_lost = round(colony_lost)) %>%
  filter(state != "United States") %>%
  rename_with(~ str_to_title(.))
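Before moving on, it is worth a quick sanity check on the imputation. Besides the completed data in `$ximp`, `missForest()` also returns an out-of-bag error estimate (NRMSE for the numeric columns and PFC for the factors), which we can inspect directly:

# Out-of-bag imputation error reported by missForest
colony_imputed$OOBerror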
Let’s do some data visualizations
library(scales)
colony_full %>%
  ggplot(aes(x = Months, y = Colony_lost_pct, color = Months)) +
  geom_boxplot(outlier.alpha = 0.5, show.legend = FALSE) +
  labs(x = NULL, y = "Colonies lost",
       title = "Percent of total colonies lost by month and year") +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_y_continuous(labels = label_percent(scale = 1)) +
  facet_wrap(~ Year, scales = "free_y")

- The chart above shows the distribution of colony losses by month and
year. Colony losses are clearly highest during January-March; a quick
numeric check of that pattern follows.
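Here is a small summary of the typical loss percentage per month, using the columns created above:

colony_full %>%
  group_by(Months) %>%
  summarise(median_lost_pct = median(Colony_lost_pct)) %>%
  arrange(desc(median_lost_pct))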
colony_full %>%
  ggplot(aes(x = `Stressor: Varroa Mites`, y = Colony_lost_pct, color = Months)) +
  geom_point() +
  scale_color_viridis_d() +
  labs(x = "Percent of colonies affected by Varroa mites",
       y = "Colonies lost",
       title = "Percent of total colonies lost due to Varroa mites") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_y_continuous(labels = label_percent(scale = 1)) +
  scale_x_continuous(labels = label_percent(scale = 1)) +
  facet_wrap(~ Year, scales = "free")

colony_full %>%
  ggplot(aes(x = `Stressor: Pesticides`, y = Colony_lost_pct, color = Months)) +
  geom_point() +
  scale_color_viridis_d() +
  labs(x = "Percent of colonies affected by pesticides",
       y = "Colonies lost",
       title = "Percent of total colonies lost due to pesticides") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_y_continuous(labels = label_percent(scale = 1)) +
  scale_x_continuous(labels = label_percent(scale = 1)) +
  facet_wrap(~ Year, scales = "free")

## Define the outcome of interest as the number of colony losses per hundred colonies.
colony_full <- colony_full %>%
  mutate(Colony_lost_rate = Colony_lost * 100 / Colony_max) %>%
  select(-Year, -Colony_max, -Colony_lost, -Colony_lost_pct)
library(tidymodels)
set.seed(100)
colony_split <- colony_full %>% initial_split()
colony_test <- testing(colony_split)
colony_train <- training(colony_split)
Data preprocessing and feature engineering
boost_rec <- recipe(Colony_lost_rate ~ ., data = colony_train) %>%
  step_log(all_outcomes()) %>%
  step_dummy(all_nominal())
boost_rec
## Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 9
##
## Operations:
##
## Log transformation on all_outcomes()
## Dummy variables from all_nominal()
boost_prep <- prep(boost_rec)
train_juiced <- juice(boost_prep)
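To confirm the preprocessing behaves as intended (log-transformed outcome, dummy-encoded factor columns), we can glance at the prepared training set:

glimpse(train_juiced)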
Build an XGBoost model and tune the hyperparameters
boost_spec <- boost_tree(
  tree_depth = tune(),
  trees = 1000,
  learn_rate = tune(),
  mtry = tune(),
  min_n = tune(),
  loss_reduction = tune(),
  sample_size = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("regression")
boost_spec
## Boosted Tree Model Specification (regression)
##
## Main Arguments:
## mtry = tune()
## trees = 1000
## min_n = tune()
## tree_depth = tune()
## learn_rate = tune()
## loss_reduction = tune()
## sample_size = tune()
##
## Computational engine: xgboost
boost_grid <- grid_latin_hypercube(
  tree_depth(),
  learn_rate(),
  finalize(mtry(), colony_train),
  min_n(),
  loss_reduction(),
  sample_size = sample_prop(),
  size = 20
)
boost_wf <- workflow() %>%
  add_recipe(boost_rec) %>%
  add_model(boost_spec)

library(doParallel)

set.seed(123)
boost_fold <- vfold_cv(colony_train)

cl <- makeCluster(2)
registerDoParallel(cl)

set.seed(234)
tune_res <- tune_grid(
  boost_wf,
  resamples = boost_fold,
  grid = boost_grid,
  control = control_grid(save_pred = TRUE)
)
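With tuning finished, the parallel workers can be released (a small housekeeping step; `registerDoSEQ()` comes from the foreach package that doParallel loads):

stopCluster(cl)
registerDoSEQ()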
tune_res %>% collect_metrics()
## # A tibble: 40 x 12
## mtry min_n tree_depth learn_rate loss_reduction sample_size .metric
## <int> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 10 37 1 5.08e- 6 0.00000000191 0.548 rmse
## 2 10 37 1 5.08e- 6 0.00000000191 0.548 rsq
## 3 9 11 5 2.84e- 9 0.00000160 0.159 rmse
## 4 9 11 5 2.84e- 9 0.00000160 0.159 rsq
## 5 10 36 3 7.85e- 5 0.0000000122 0.739 rmse
## 6 10 36 3 7.85e- 5 0.0000000122 0.739 rsq
## 7 6 31 15 1.14e-10 0.00000461 0.866 rmse
## 8 6 31 15 1.14e-10 0.00000461 0.866 rsq
## 9 4 20 6 1.53e- 8 0.368 0.796 rmse
## 10 4 20 6 1.53e- 8 0.368 0.796 rsq
## # ... with 30 more rows, and 5 more variables: .estimator <chr>, mean <dbl>,
## # n <int>, std_err <dbl>, .config <chr>
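Another way to digest these results is to plot performance against the sampled hyperparameter values with tune's built-in `autoplot()` method:

autoplot(tune_res, metric = "rmse")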
library(kableExtra)
tune_res %>%
  show_best(metric = "rmse") %>%
  select(-.config) %>%
  mutate_if(is.numeric, round, digits = 4) %>%
  kbl() %>%
  kable_paper("hover", font_size = 13, full_width = F)
| mtry | min_n | tree_depth | learn_rate | loss_reduction | sample_size | .metric | .estimator | mean   | n  | std_err |
|------|-------|------------|------------|----------------|-------------|---------|------------|--------|----|---------|
| 6    | 22    | 12         | 0.0212     | 0.0001         | 0.3479      | rmse    | standard   | 0.6333 | 10 | 0.0276  |
| 8    | 26    | 3          | 0.0638     | 0.0000         | 0.3190      | rmse    | standard   | 0.6356 | 10 | 0.0262  |
| 2    | 12    | 8          | 0.0060     | 8.3842         | 0.1162      | rmse    | standard   | 0.7382 | 10 | 0.0343  |
| 7    | 16    | 9          | 0.0018     | 0.0000         | 0.4127      | rmse    | standard   | 0.7487 | 10 | 0.0242  |
| 1    | 34    | 14         | 0.0008     | 0.0792         | 0.2520      | rmse    | standard   | 1.1155 | 10 | 0.0169  |
rmse_best <- tune_res %>% select_best(metric = "rmse")
final_model <- finalize_model(boost_spec, rmse_best)
final_model
## Boosted Tree Model Specification (regression)
##
## Main Arguments:
## mtry = 6
## trees = 1000
## min_n = 22
## tree_depth = 12
## learn_rate = 0.0211556647988789
## loss_reduction = 5.14459208654843e-05
## sample_size = 0.347938195504248
##
## Computational engine: xgboost
Most important predictors
library(vip)
final_model %>%
  set_engine("xgboost") %>%
  fit(Colony_lost_rate ~ ., data = train_juiced) %>%
  vip(geom = "point")

Evaluate the final XGBoost model
final_wf <- workflow() %>%
  add_recipe(boost_rec) %>%
  add_model(final_model)

xgb_fres <- final_wf %>%
  last_fit(colony_split)

xgb_fres %>%
  collect_metrics() %>%
  select(-.config) %>%
  kbl() %>%
  kable_paper("hover", full_width = F)
| .metric | .estimator | .estimate |
|---------|------------|-----------|
| rmse    | standard   | 0.5981180 |
| rsq     | standard   | 0.4490157 |
- Let’s see some of the predictions and their corresponding true
values:
collect_predictions(xgb_fres)
## # A tibble: 299 x 5
## id .pred .row Colony_lost_rate .config
## <chr> <dbl> <int> <dbl> <chr>
## 1 train/test split 1.86 3 2.37 Preprocessor1_Model1
## 2 train/test split 2.71 5 2.48 Preprocessor1_Model1
## 3 train/test split 2.60 7 2.59 Preprocessor1_Model1
## 4 train/test split 2.18 8 2.63 Preprocessor1_Model1
## 5 train/test split 2.91 15 3.66 Preprocessor1_Model1
## 6 train/test split 1.79 25 2.13 Preprocessor1_Model1
## 7 train/test split 2.94 29 3.29 Preprocessor1_Model1
## 8 train/test split 2.00 33 2.01 Preprocessor1_Model1
## 9 train/test split 2.87 37 3.05 Preprocessor1_Model1
## 10 train/test split 2.35 40 2.54 Preprocessor1_Model1
## # ... with 289 more rows
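Finally, a quick plot of the predictions against the observed (log-scale) loss rates on the test set gives a visual sense of the fit, using the predictions collected above:

collect_predictions(xgb_fres) %>%
  ggplot(aes(x = Colony_lost_rate, y = .pred)) +
  geom_point(alpha = 0.5) +
  geom_abline(linetype = "dashed") +
  labs(x = "Observed (log) colony loss rate",
       y = "Predicted (log) colony loss rate")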