library(randomForest)
library(pROC)
library(caret)
library(palmerpenguins)

6 k-fold Cross Validation Approaches
6.1 Load Packages
6.2 Prepare data
# Drop every row containing a missing value (sex and some
# measurements have NAs in the raw palmerpenguins data)
penguins <- na.omit(penguins)
penguins$species <- factor(penguins$species)

6.3 Create k-fold cross validation
# Fix the RNG seed so the fold assignment below is reproducible
set.seed(123)
# Stratified 5-fold split: createFolds() samples within each species
# level, so every fold roughly preserves the class balance.
# Returns a list of 5 integer vectors of held-out row indices.
folds <- createFolds(penguins$species, k = 5)
# Inspect the row indices held out in the first fold
folds$Fold1
[1] 8 13 16 20 23 27 36 51 56 57 58 60 63 64 87 90 91 94 96
[20] 106 110 112 118 124 125 130 134 138 139 151 152 154 156 166 181 182 190 200
[39] 204 206 211 213 222 234 235 236 238 240 243 244 254 257 263 266 273 277 278
[58] 284 291 302 309 310 312 313 317 324
$Fold2
[1] 10 12 14 19 22 30 33 43 46 47 54 61 68 71 72 74 78 82 92
[20] 97 102 104 107 108 111 119 120 123 132 147 149 158 160 161 169 174 178 180
[39] 191 192 198 199 201 202 215 219 226 231 237 242 248 250 259 267 268 270 274
[58] 280 287 300 316 319 320 322 332 333
$Fold3
[1] 3 4 5 15 17 18 25 29 32 35 37 40 44 45 48 66 70 86 89
[20] 93 99 105 116 122 129 131 136 140 141 144 157 165 172 177 183 184 186 187
[39] 194 195 197 205 207 209 210 212 217 218 220 228 229 232 262 276 281 282 285
[58] 288 289 292 293 298 304 315 323 325 329
$Fold4
[1] 1 6 9 11 24 26 28 38 39 41 52 62 67 69 79 81 83 84 85
[20] 88 98 100 109 115 128 135 143 145 146 153 155 159 164 167 168 175 188 193
[39] 196 208 216 221 223 224 227 230 233 241 247 256 258 261 265 269 271 286 290
[58] 299 301 303 305 306 307 308 311 321 331
$Fold5
[1] 2 7 21 31 34 42 49 50 53 55 59 65 73 75 76 77 80 95 101
[20] 103 113 114 117 121 126 127 133 137 142 148 150 162 163 170 171 173 176 179
[39] 185 189 203 214 225 239 245 246 249 251 252 253 255 260 264 272 275 279 283
[58] 294 295 296 297 314 318 326 327 328 330
6.4 Run random forest on each fold
# Preallocate the per-fold metrics table. NA_real_ (not logical NA)
# keeps the columns numeric from the start.
results <- data.frame(
  Fold = seq_along(folds),
  Accuracy = NA_real_,
  Sensitivity = NA_real_,
  Specificity = NA_real_,
  AUC = NA_real_
)
# Preallocated container for the per-fold ROC objects
roc_list <- vector("list", length(folds))

# The model formula is loop-invariant, so build it once
rf_formula <- species ~ bill_length_mm + bill_depth_mm +
  flipper_length_mm + body_mass_g + sex

for (i in seq_along(folds)) {
  # Hold out fold i for testing; train on all remaining rows
  test_idx <- folds[[i]]
  train_set <- penguins[-test_idx, ]
  test_set <- penguins[test_idx, ]

  # Fit a random forest on the training folds
  rf_model <- randomForest(
    rf_formula,
    data = train_set,
    ntree = 500,
    mtry = 3,
    importance = TRUE
  )

  # Hard class predictions (for the confusion matrix) and
  # class-probability predictions (for the ROC/AUC)
  pred_class <- predict(rf_model, test_set)
  pred_prob <- predict(rf_model, test_set, type = "prob")

  # Confusion matrix vs. the true labels. With three species,
  # byClass is a matrix of one-vs-rest rows, so sensitivity and
  # specificity are macro-averaged across classes.
  cm <- confusionMatrix(pred_class, test_set$species)
  results$Accuracy[i] <- cm$overall["Accuracy"]
  results$Sensitivity[i] <- mean(cm$byClass[, "Sensitivity"], na.rm = TRUE)
  results$Specificity[i] <- mean(cm$byClass[, "Specificity"], na.rm = TRUE)

  # Hand-Till multiclass AUC computed from the probability matrix;
  # coerce the "auc" object explicitly before storing it
  roc_obj <- multiclass.roc(response = test_set$species, predictor = pred_prob)
  results$AUC[i] <- as.numeric(auc(roc_obj))
  roc_list[[i]] <- roc_obj
}
results

  Fold  Accuracy Sensitivity Specificity       AUC
1 1 0.9545455 0.9631226 0.9794849 0.9990790
2 2 0.9848485 0.9885057 0.9937107 0.9986737
3 3 0.9850746 0.9888889 0.9937107 1.0000000
4 4 0.9850746 0.9761905 0.9912281 1.0000000
5 5 0.9701493 0.9523810 0.9824561 0.9958949
6.5 When using Random Forest with Cross-Validation, what is the final model?
During k-fold cross-validation, you train K different random forest models, each on a different subset of the data.
But none of those K models is your final model.
The purpose of Cross Validation in Random Forest
✔️ Evaluate multiple hyperparameter combinations
For example:
mtry (number of variables randomly chosen at each split)
ntree (number of trees)
nodesize
maxnodes
✔️ Measure model performance for each hyperparameter set
Using:
Accuracy
AUC
Sensitivity / Specificity
Kappa
Log-loss (if needed)
✔️ Choose the best-performing hyperparameter combination
(Usually the one giving highest CV accuracy or AUC.)
🎯 Then what?
After identifying the best hyperparameters
👉 You retrain one final random forest model on the entire dataset using ONLY the best hyperparameters.
6.6 Example
6.6.1 Hyperparameter Grid
# All 9 hyperparameter combinations (3 mtry values x 3 ntree values)
# to be evaluated by cross-validation below
grid <- expand.grid(
  mtry = c(2, 3, 4),
  ntree = c(300, 500, 800)
)
grid

  mtry ntree
1 2 300
2 3 300
3 4 300
4 2 500
5 3 500
6 4 500
7 2 800
8 3 800
9 4 800
6.6.2 Set up 5-fold cross-validation
# Re-seed so these folds match the ones created in section 6.3
set.seed(123)
folds <- createFolds(penguins$species, k = 5)

6.6.3 Run cross-validation for each hyperparameter combination
# One row per hyperparameter combination; NA_real_ keeps the
# Accuracy column numeric until it is filled in
cv_results <- data.frame(
  mtry = grid$mtry,
  ntree = grid$ntree,
  Accuracy = NA_real_
)

# Loop-invariant pieces hoisted out of both loops
rf_formula <- species ~ bill_length_mm + bill_depth_mm +
  flipper_length_mm + body_mass_g + sex
n_folds <- length(folds)

for (i in seq_len(nrow(grid))) {
  # Descriptive names; in particular avoid `t`, which would mask base::t()
  m_try <- grid$mtry[i]
  n_trees <- grid$ntree[i]

  # Preallocate the per-fold accuracies instead of growing with c()
  acc_vec <- numeric(n_folds)

  for (k in seq_len(n_folds)) {
    test_idx <- folds[[k]]
    train_set <- penguins[-test_idx, ]
    test_set <- penguins[test_idx, ]

    # Fit a forest with this combination's hyperparameters
    rf_model <- randomForest(
      rf_formula,
      data = train_set,
      mtry = m_try,
      ntree = n_trees
    )

    # Fold accuracy = proportion of correctly classified test rows
    pred <- predict(rf_model, test_set)
    acc_vec[k] <- mean(pred == test_set$species)
  }

  # Mean CV accuracy across the 5 folds for this combination
  cv_results$Accuracy[i] <- mean(acc_vec)
}
cv_results

  mtry ntree  Accuracy
1 2 300 0.9819539
2 3 300 0.9819539
3 4 300 0.9789236
4 2 500 0.9819539
5 3 500 0.9819539
6 4 500 0.9789236
7 2 800 0.9819539
8 3 800 0.9789236
9 4 800 0.9789236
6.6.4 Select the best hyperparameters
# Index of the highest mean CV accuracy (ties resolve to the first
# occurrence, which is what which.max() returns)
best_idx <- which.max(cv_results$Accuracy)
best_row <- cv_results[best_idx, ]
best_row

  mtry ntree  Accuracy
1 2 300 0.9819539