6  k-fold Cross Validation Approaches

6.1 Load Packages

library(randomForest)
library(pROC)
library(caret)
library(palmerpenguins)

6.2 Prepare data

# Drop every row containing a missing value, then make sure the
# outcome column is a factor (required by randomForest / caret)
penguins <- na.omit(penguins)
penguins$species <- factor(penguins$species)

6.3 Create k-fold cross validation

# Fix the RNG so the fold assignment is reproducible, then build a
# 5-fold split stratified on the outcome (createFolds preserves the
# class proportions of `species` within each fold)
set.seed(123)
folds <- createFolds(y = penguins$species, k = 5)
folds
$Fold1
 [1]   8  13  16  20  23  27  36  51  56  57  58  60  63  64  87  90  91  94  96
[20] 106 110 112 118 124 125 130 134 138 139 151 152 154 156 166 181 182 190 200
[39] 204 206 211 213 222 234 235 236 238 240 243 244 254 257 263 266 273 277 278
[58] 284 291 302 309 310 312 313 317 324

$Fold2
 [1]  10  12  14  19  22  30  33  43  46  47  54  61  68  71  72  74  78  82  92
[20]  97 102 104 107 108 111 119 120 123 132 147 149 158 160 161 169 174 178 180
[39] 191 192 198 199 201 202 215 219 226 231 237 242 248 250 259 267 268 270 274
[58] 280 287 300 316 319 320 322 332 333

$Fold3
 [1]   3   4   5  15  17  18  25  29  32  35  37  40  44  45  48  66  70  86  89
[20]  93  99 105 116 122 129 131 136 140 141 144 157 165 172 177 183 184 186 187
[39] 194 195 197 205 207 209 210 212 217 218 220 228 229 232 262 276 281 282 285
[58] 288 289 292 293 298 304 315 323 325 329

$Fold4
 [1]   1   6   9  11  24  26  28  38  39  41  52  62  67  69  79  81  83  84  85
[20]  88  98 100 109 115 128 135 143 145 146 153 155 159 164 167 168 175 188 193
[39] 196 208 216 221 223 224 227 230 233 241 247 256 258 261 265 269 271 286 290
[58] 299 301 303 305 306 307 308 311 321 331

$Fold5
 [1]   2   7  21  31  34  42  49  50  53  55  59  65  73  75  76  77  80  95 101
[20] 103 113 114 117 121 126 127 133 137 142 148 150 162 163 170 171 173 176 179
[39] 185 189 203 214 225 239 245 246 249 251 252 253 255 260 264 272 275 279 283
[58] 294 295 296 297 314 318 326 327 328 330

6.4 Run random forest on each fold

# Derive the fold count from the folds list itself rather than
# hard-coding 5, so changing k in createFolds() needs no edits here.
n_folds <- length(folds)

# Pre-allocate one row of metrics per fold (numeric NA, not logical)
results <- data.frame(
  Fold = seq_len(n_folds),
  Accuracy = NA_real_,
  Sensitivity = NA_real_,
  Specificity = NA_real_,
  AUC = NA_real_
)

# Pre-allocate the list of ROC objects instead of growing it
roc_list <- vector("list", n_folds)

for (i in seq_len(n_folds)) {

  # Hold out the i-th fold for testing; train on the remaining folds
  test_idx  <- folds[[i]]
  train_set <- penguins[-test_idx, ]
  test_set  <- penguins[test_idx, ]

  # Fit a random forest on the training folds
  rf_model <- randomForest(
    species ~ bill_length_mm + bill_depth_mm +
      flipper_length_mm + body_mass_g + sex,
    data = train_set,
    ntree = 500,
    mtry = 3,
    importance = TRUE
  )

  # Predicted classes for the confusion matrix
  pred_class <- predict(rf_model, test_set)

  # Class-membership probabilities for the multiclass AUC
  pred_prob <- predict(rf_model, test_set, type = "prob")

  # Confusion matrix with one-vs-all statistics per class
  cm <- confusionMatrix(pred_class, test_set$species)

  results$Accuracy[i]    <- cm$overall["Accuracy"]
  # Macro-average the per-class (one-vs-all) sensitivity/specificity
  results$Sensitivity[i] <- mean(cm$byClass[, "Sensitivity"], na.rm = TRUE)
  results$Specificity[i] <- mean(cm$byClass[, "Specificity"], na.rm = TRUE)

  # Hand & Till (2001) generalization of the AUC for multiclass outcomes
  roc_obj <- multiclass.roc(response = test_set$species, predictor = pred_prob)
  # Coerce the "auc" object to a plain numeric before storing it
  results$AUC[i] <- as.numeric(auc(roc_obj))

  roc_list[[i]] <- roc_obj
}

results
  Fold  Accuracy Sensitivity Specificity       AUC
1    1 0.9545455   0.9631226   0.9794849 0.9990790
2    2 0.9848485   0.9885057   0.9937107 0.9986737
3    3 0.9850746   0.9888889   0.9937107 1.0000000
4    4 0.9850746   0.9761905   0.9912281 1.0000000
5    5 0.9701493   0.9523810   0.9824561 0.9958949

6.5 When using Random Forest with Cross-Validation, what is the final model?

During k-fold cross-validation, you train k different random forest models, each on a different subset of the data.

But none of those k models is your final model.

The purpose of Cross-Validation in Random Forest

✔️ Evaluate multiple hyperparameter combinations

For example:

mtry (number of variables randomly chosen at each split)

ntree (number of trees)

nodesize

maxnodes

✔️ Measure model performance for each hyperparameter set

Using:

Accuracy

AUC

Sensitivity / Specificity

Kappa

Log-loss (if needed)

✔️ Choose the best-performing hyperparameter combination

(Usually the one giving highest CV accuracy or AUC.)

🎯 Then what?

After identifying the best hyperparameters,

👉 You retrain one final random forest model on the entire dataset using ONLY the best hyperparameters.

6.6 Example

6.6.1 Hyperparameter Grid

# Every combination of the candidate mtry and ntree values (3 x 3 = 9 rows);
# expand.grid varies the first column fastest
grid <- expand.grid(mtry = c(2, 3, 4), ntree = c(300, 500, 800))
grid
  mtry ntree
1    2   300
2    3   300
3    4   300
4    2   500
5    3   500
6    4   500
7    2   800
8    3   800
9    4   800

6.6.2 Set up 5-fold cross-validation

# Same seed as before, so the tuning loop evaluates every
# hyperparameter combination on an identical 5-fold split
set.seed(123)
folds <- createFolds(y = penguins$species, k = 5)

6.6.3 Run cross-validation for each hyperparameter combination

# One row per hyperparameter combination; Accuracy filled in below
# (numeric NA, not logical, so the column type is right from the start)
cv_results <- data.frame(
  mtry = grid$mtry,
  ntree = grid$ntree,
  Accuracy = NA_real_
)

# Derive the fold count instead of hard-coding 5
n_folds <- length(folds)

for (i in seq_len(nrow(grid))) {

  # Renamed from `m`/`t`: a local called `t` masks base::t()
  m_try  <- grid$mtry[i]
  n_tree <- grid$ntree[i]

  # Pre-allocate one accuracy per fold instead of growing with c()
  acc_vec <- numeric(n_folds)

  for (k in seq_len(n_folds)) {

    # Hold out fold k; train on the rest
    test_idx  <- folds[[k]]
    train_set <- penguins[-test_idx, ]
    test_set  <- penguins[test_idx, ]

    # Fit a forest with the current hyperparameter combination
    rf_model <- randomForest(
      species ~ bill_length_mm + bill_depth_mm +
        flipper_length_mm + body_mass_g + sex,
      data = train_set,
      mtry = m_try,
      ntree = n_tree
    )

    # Predicted classes on the held-out fold
    pred <- predict(rf_model, test_set)

    # Fold accuracy = proportion of correctly classified test rows
    acc_vec[k] <- mean(pred == test_set$species)
  }

  # Store the mean cross-validated accuracy for this combination
  cv_results$Accuracy[i] <- mean(acc_vec)
}

cv_results
  mtry ntree  Accuracy
1    2   300 0.9819539
2    3   300 0.9819539
3    4   300 0.9789236
4    2   500 0.9819539
5    3   500 0.9819539
6    4   500 0.9789236
7    2   800 0.9819539
8    3   800 0.9789236
9    4   800 0.9789236

6.6.4 Select the best hyperparameters

# Index of the highest cross-validated accuracy; which.max() breaks
# ties in favor of the earliest row
best_idx <- which.max(cv_results$Accuracy)
best_row <- cv_results[best_idx, ]
best_row
  mtry ntree  Accuracy
1    2   300 0.9819539