The objective of the support vector machine (SVM) algorithm is to find a hyperplane in an N-dimensional space (where N is the number of features) that distinctly classifies the data points.
In-class
Many possible hyperplanes could separate the data points. Our objective is to find the one with the maximum margin, that is, the maximum distance to the nearest data points of each class.
Support vectors are the data points that lie closest to the hyperplane; they determine its position and orientation.
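In standard notation (a textbook formulation, not specific to this post), a linear SVM finds a weight vector $w$ and bias $b$ defining the hyperplane, and maximizing the margin reduces to a constrained minimization:

```latex
% Separating hyperplane
w \cdot x + b = 0

% Classification constraint for labels y_i \in \{-1, +1\}
y_i \,(w \cdot x_i + b) \ge 1

% The margin between the two supporting hyperplanes is 2 / \|w\|,
% so maximizing the margin is equivalent to:
\min_{w,\,b}\; \tfrac{1}{2}\,\|w\|^2
\quad \text{subject to} \quad
y_i \,(w \cdot x_i + b) \ge 1 \;\; \forall i
```

The RBF-kernel SVM used below applies the same idea after implicitly mapping the features into a higher-dimensional space.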
library(palmerpenguins)
data(penguins)
Splitting the data into training and test sets
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 0.2.0 ──
## ✔ broom 0.8.0 ✔ rsample 0.1.1
## ✔ dials 0.1.1 ✔ tune 0.2.0
## ✔ infer 1.0.2 ✔ workflows 0.2.6
## ✔ modeldata 1.0.0 ✔ workflowsets 0.2.1
## ✔ parsnip 0.2.1 ✔ yardstick 1.0.0
## ✔ recipes 1.0.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
set.seed(123)
penguins <- na.omit(penguins)
penguins_split <- initial_split(penguins, strata = species)
penguins_train <- training(penguins_split)
summary(penguins_train)
## species island bill_length_mm bill_depth_mm
## Adelie :109 Biscoe :119 Min. :33.1 Min. :13.10
## Chinstrap: 51 Dream : 95 1st Qu.:39.2 1st Qu.:15.70
## Gentoo : 89 Torgersen: 35 Median :44.1 Median :17.30
## Mean :44.0 Mean :17.22
## 3rd Qu.:48.5 3rd Qu.:18.70
## Max. :59.6 Max. :21.50
## flipper_length_mm body_mass_g sex year
## Min. :172.0 Min. :2700 female:116 Min. :2007
## 1st Qu.:190.0 1st Qu.:3600 male :133 1st Qu.:2007
## Median :197.0 Median :4050 Median :2008
## Mean :201.2 Mean :4245 Mean :2008
## 3rd Qu.:214.0 3rd Qu.:4850 3rd Qu.:2009
## Max. :231.0 Max. :6300 Max. :2009
penguins_test <- testing(penguins_split)
summary(penguins_test)
## species island bill_length_mm bill_depth_mm
## Adelie :37 Biscoe :44 Min. :32.10 Min. :13.60
## Chinstrap:17 Dream :28 1st Qu.:39.50 1st Qu.:15.47
## Gentoo :30 Torgersen:12 Median :45.20 Median :17.25
## Mean :43.99 Mean :17.01
## 3rd Qu.:48.70 3rd Qu.:18.50
## Max. :58.00 Max. :21.10
## flipper_length_mm body_mass_g sex year
## Min. :180.0 Min. :2850 female:49 Min. :2007
## 1st Qu.:189.0 1st Qu.:3550 male :35 1st Qu.:2008
## Median :196.0 Median :3925 Median :2008
## Mean :200.2 Mean :4094 Mean :2008
## 3rd Qu.:210.5 3rd Qu.:4631 3rd Qu.:2009
## Max. :230.0 Max. :5850 Max. :2009
Next, let’s create cross-validation resamples of the training data to evaluate our models.
set.seed(234)
penguins_folds <- vfold_cv(penguins_train, strata = species)
penguins_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [223/26]> Fold01
## 2 <split [224/25]> Fold02
## 3 <split [224/25]> Fold03
## 4 <split [224/25]> Fold04
## 5 <split [224/25]> Fold05
## 6 <split [224/25]> Fold06
## 7 <split [224/25]> Fold07
## 8 <split [224/25]> Fold08
## 9 <split [224/25]> Fold09
## 10 <split [226/23]> Fold10
Next, let’s preprocess our data to get it ready for modeling.
penguins_rec <- recipe(species ~ island + bill_length_mm + bill_depth_mm + flipper_length_mm + body_mass_g + sex, data = penguins_train)
penguins_rec2 <- recipe(species ~ island + bill_length_mm + bill_depth_mm + flipper_length_mm + body_mass_g + sex, data = penguins_train) %>%
step_normalize(body_mass_g)
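For reference, `step_normalize()` centers and scales each selected variable using the standard z-score (this is the generic definition; the mean and standard deviation are estimated from the training data when the recipe is prepped):

```latex
z_i = \frac{x_i - \bar{x}}{s},
\qquad
\bar{x} = \frac{1}{n}\sum_{i=1}^{n} x_i,
\qquad
s = \sqrt{\frac{1}{n-1}\sum_{i=1}^{n} \left(x_i - \bar{x}\right)^2}
```

This is why the normalized `body_mass_g` values shown later are centered near zero with unit spread.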
penguins_prep <- prep(penguins_rec)
penguins_prep
## Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 6
##
## Training data contained 249 data points and no missing data.
juice(penguins_prep)
## # A tibble: 249 × 7
## island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
## <fct> <dbl> <dbl> <int> <int> <fct>
## 1 Torgersen 40.3 18 195 3250 female
## 2 Torgersen 36.7 19.3 193 3450 female
## 3 Torgersen 38.9 17.8 181 3625 female
## 4 Torgersen 39.2 19.6 195 4675 male
## 5 Torgersen 41.1 17.6 182 3200 female
## 6 Torgersen 38.6 21.2 191 3800 male
## 7 Torgersen 34.6 21.1 198 4400 male
## 8 Torgersen 38.7 19 195 3450 female
## 9 Torgersen 42.5 20.7 197 4500 male
## 10 Torgersen 34.4 18.4 184 3325 female
## # … with 239 more rows, and 1 more variable: species <fct>
rf_spec <- rand_forest(trees = 1000) %>%
set_engine("ranger") %>%
set_mode("classification")
rf_spec
## Random Forest Model Specification (classification)
##
## Main Arguments:
## trees = 1000
##
## Computational engine: ranger
svm_spec <- svm_rbf(cost = 0.5) %>%
set_engine("kernlab") %>%
set_mode("classification")
svm_spec
## Radial Basis Function Support Vector Machine Specification (classification)
##
## Main Arguments:
## cost = 0.5
##
## Computational engine: kernlab
penguins_wf <- workflow() %>%
add_recipe(penguins_rec)
penguins_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: None
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
For illustration, we can also prep and inspect the second recipe:
penguins_prep2 <- prep(penguins_rec2)
penguins_prep2
## Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 6
##
## Training data contained 249 data points and no missing data.
##
## Operations:
##
## Centering and scaling for body_mass_g [trained]
juice(penguins_prep2)
## # A tibble: 249 × 7
## island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
## <fct> <dbl> <dbl> <int> <dbl> <fct>
## 1 Torgersen 40.3 18 195 -1.21 female
## 2 Torgersen 36.7 19.3 193 -0.963 female
## 3 Torgersen 38.9 17.8 181 -0.751 female
## 4 Torgersen 39.2 19.6 195 0.520 male
## 5 Torgersen 41.1 17.6 182 -1.27 female
## 6 Torgersen 38.6 21.2 191 -0.539 male
## 7 Torgersen 34.6 21.1 198 0.187 male
## 8 Torgersen 38.7 19 195 -0.963 female
## 9 Torgersen 42.5 20.7 197 0.309 male
## 10 Torgersen 34.4 18.4 184 -1.11 female
## # … with 239 more rows, and 1 more variable: species <fct>
penguins_wf2 <- workflow() %>%
add_recipe(penguins_rec2)
penguins_wf2
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: None
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
##
## • step_normalize()
Now we can add a model and fit it to each of the resamples. First, we fit the random forest model.
set.seed(1234)
rf_rs <- penguins_wf %>%
add_model(rf_spec) %>%
fit_resamples(
resamples = penguins_folds,
metrics = metric_set(roc_auc, accuracy, sens, spec),
control = control_grid(save_pred = TRUE)
)
rf_rs
## # Resampling results
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 5
## splits id .metrics .notes .predictions
## <list> <chr> <list> <list> <list>
## 1 <split [223/26]> Fold01 <tibble [4 × 4]> <tibble [0 × 3]> <tibble [26 × 7]>
## 2 <split [224/25]> Fold02 <tibble [4 × 4]> <tibble [0 × 3]> <tibble [25 × 7]>
## 3 <split [224/25]> Fold03 <tibble [4 × 4]> <tibble [0 × 3]> <tibble [25 × 7]>
## 4 <split [224/25]> Fold04 <tibble [4 × 4]> <tibble [0 × 3]> <tibble [25 × 7]>
## 5 <split [224/25]> Fold05 <tibble [4 × 4]> <tibble [0 × 3]> <tibble [25 × 7]>
## 6 <split [224/25]> Fold06 <tibble [4 × 4]> <tibble [0 × 3]> <tibble [25 × 7]>
## 7 <split [224/25]> Fold07 <tibble [4 × 4]> <tibble [0 × 3]> <tibble [25 × 7]>
## 8 <split [224/25]> Fold08 <tibble [4 × 4]> <tibble [0 × 3]> <tibble [25 × 7]>
## 9 <split [224/25]> Fold09 <tibble [4 × 4]> <tibble [0 × 3]> <tibble [25 × 7]>
## 10 <split [226/23]> Fold10 <tibble [4 × 4]> <tibble [0 × 3]> <tibble [23 × 7]>
svm_rs <- penguins_wf %>%
add_model(svm_spec) %>%
fit_resamples(
resamples = penguins_folds,
metrics = metric_set(roc_auc, accuracy, sens, spec),
control = control_grid(save_pred = TRUE)
)
Random forests
collect_metrics(rf_rs)
## # A tibble: 4 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy multiclass 0.983 10 0.00941 Preprocessor1_Model1
## 2 roc_auc hand_till 0.997 10 0.00198 Preprocessor1_Model1
## 3 sens macro 0.980 10 0.0113 Preprocessor1_Model1
## 4 spec macro 0.992 10 0.00481 Preprocessor1_Model1
conf_mat_resampled(rf_rs)
## # A tibble: 9 × 3
## Prediction Truth Freq
## <fct> <fct> <dbl>
## 1 Adelie Adelie 10.7
## 2 Adelie Chinstrap 0.2
## 3 Adelie Gentoo 0
## 4 Chinstrap Adelie 0.2
## 5 Chinstrap Chinstrap 4.9
## 6 Chinstrap Gentoo 0
## 7 Gentoo Adelie 0
## 8 Gentoo Chinstrap 0
## 9 Gentoo Gentoo 8.9
SVM
collect_metrics(svm_rs)
## # A tibble: 4 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy multiclass 0.992 10 0.00557 Preprocessor1_Model1
## 2 roc_auc hand_till 1 10 0 Preprocessor1_Model1
## 3 sens macro 0.987 10 0.00889 Preprocessor1_Model1
## 4 spec macro 0.995 10 0.00330 Preprocessor1_Model1
conf_mat_resampled(svm_rs)
## # A tibble: 9 × 3
## Prediction Truth Freq
## <fct> <fct> <dbl>
## 1 Adelie Adelie 10.9
## 2 Adelie Chinstrap 0.2
## 3 Adelie Gentoo 0
## 4 Chinstrap Adelie 0
## 5 Chinstrap Chinstrap 4.9
## 6 Chinstrap Gentoo 0
## 7 Gentoo Adelie 0
## 8 Gentoo Chinstrap 0
## 9 Gentoo Gentoo 8.9
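To compare the two models side by side, one option (a sketch using the objects defined above, not part of the original workflow) is to bind the resampled metrics into a single table:

```r
library(dplyr)

# Collect cross-validated metrics from both models, label each row with
# its model, and arrange so that like metrics sit next to each other.
bind_rows(
  collect_metrics(rf_rs) %>% mutate(model = "random forest"),
  collect_metrics(svm_rs) %>% mutate(model = "SVM (RBF)")
) %>%
  select(model, .metric, mean, std_err) %>%
  arrange(.metric, model)
```

Both models perform very well here; the SVM edges out the random forest slightly on every resampled metric.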
rf_fit <- rf_spec %>%
fit(species ~ island + bill_length_mm + bill_depth_mm + flipper_length_mm + body_mass_g + sex, data = penguins_train)
rf_fit
## parsnip model object
##
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, num.trees = ~1000, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE)
##
## Type: Probability estimation
## Number of trees: 1000
## Sample size: 249
## Number of independent variables: 6
## Mtry: 2
## Target node size: 10
## Variable importance mode: none
## Splitrule: gini
## OOB prediction error (Brier s.): 0.02190814
rf_testing_pred <-
predict(rf_fit, penguins_test, type = "prob")
rf_testing_pred
## # A tibble: 84 × 3
## .pred_Adelie .pred_Chinstrap .pred_Gentoo
## <dbl> <dbl> <dbl>
## 1 0.990 0.0100 0
## 2 0.975 0.0251 0
## 3 0.993 0.00726 0
## 4 0.992 0.00757 0
## 5 0.996 0.00358 0.00025
## 6 0.998 0.00208 0.000167
## 7 0.966 0.0339 0
## 8 0.679 0.311 0.00953
## 9 0.978 0.0221 0
## 10 0.997 0.00336 0
## # … with 74 more rows
rf_training_pred <-
predict(rf_fit, penguins_train) %>%
bind_cols(predict(rf_fit, penguins_train, type = "prob")) %>%
# Add the true outcome data back in
bind_cols(penguins_train %>%
select(species))
library(yardstick)
rf_training_pred %>% # training set predictions
accuracy(truth = species, .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.996
Resampling allows us to estimate how well our model will perform on new data, while the test set acts as the final, unbiased check of model performance.
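The training-set accuracy above is optimistic, since the model has already seen those rows. The honest final check uses the held-out test set; a sketch of that evaluation, reusing the objects defined above:

```r
# Class predictions on the held-out test set, joined back to the
# true species so that yardstick can score them.
rf_test_results <- predict(rf_fit, penguins_test) %>%
  bind_cols(penguins_test %>% select(species))

# Test-set accuracy: the unbiased estimate of generalization performance.
rf_test_results %>%
  accuracy(truth = species, estimate = .pred_class)
```

If the test-set accuracy lands close to the cross-validated estimate, that is good evidence the resampling procedure gave a reliable picture of performance.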