{torch} with {tidymodels} and {brulee}

Solving logistic regression


Introduction

The idea here is to explore logistic regression with {torch}, using the {brulee} package.

library(torch)
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.1.0 ──
✔ broom        1.0.5     ✔ recipes      1.0.7
✔ dials        1.2.0     ✔ rsample      1.1.1
✔ dplyr        1.1.2     ✔ tibble       3.2.1
✔ ggplot2      3.4.2     ✔ tidyr        1.3.0
✔ infer        1.0.4     ✔ tune         1.1.1
✔ modeldata    1.2.0     ✔ workflows    1.1.3
✔ parsnip      1.1.1     ✔ workflowsets 1.0.1
✔ purrr        1.0.2     ✔ yardstick    1.2.0
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ purrr::discard() masks scales::discard()
✖ dplyr::filter()  masks stats::filter()
✖ dplyr::lag()     masks stats::lag()
✖ recipes::step()  masks stats::step()
• Use tidymodels_prefer() to resolve common conflicts.
library(brulee)

Example

We revisit the logistic regression example detailed last year, available at https://stateofther.github.io/finistR2022/autodiff.html.

set.seed(45)
n <- 100
p <- 3
# Simulate a design matrix and true coefficients
X <- matrix(rnorm(n = n*p), ncol = p, nrow = n)
theta <- rnorm(p) %>% round(digits = 2)
# Linear predictor (log-odds); rbernoulli() expects a probability, so values
# outside [0, 1] yield deterministic draws. Note that rbernoulli() is
# deprecated since purrr 1.0.0 (see warning below).
probs <- (X %*% theta) %>% as.vector()
Y <- purrr::rbernoulli(n = n, p = probs) + 0. # + 0. coerces logical to numeric
Warning: `rbernoulli()` was deprecated in purrr 1.0.0.
# Move the data to torch tensors
x <- torch_tensor(X)
y <- torch_tensor(Y)
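
The loss coded below is the negative log-likelihood of the logistic model without intercept:

$$
\ell(\theta) = y^\top X \theta - \sum_{i=1}^{n} \log\left(1 + e^{x_i^\top \theta}\right),
$$

and logistic_loss() returns $-\ell(\theta)$.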

logistic_loss <- function(theta, x, y) {
  if (!inherits(theta, "torch_tensor")) {
    stop("theta must be a torch tensor")
  }
  # Linear predictor (log-odds), then negative log-likelihood
  log_odds <- torch_matmul(x, theta)
  log_lik <- torch_dot(y, log_odds) - torch_sum(torch_log(1 + torch_exp(log_odds)))
  return(-log_lik)
}
logistic_loss(theta = torch_tensor(theta), x = x, y = y)
torch_tensor
50.4573
[ CPUFloatType{} ]
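
As a quick sanity check (a sketch in base R on the same data, so identical up to float32 rounding), the same value can be computed without torch:

# Sanity check: the negative log-likelihood computed in base R
eta <- as.vector(X %*% theta)
-(sum(Y * eta) - sum(log(1 + exp(eta))))
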
# Wrapper: evaluate the loss, optionally printing theta and the loss value
eval_loss <- function(theta, verbose = TRUE) {
  loss <- logistic_loss(theta, x, y)
  if (verbose) {
    cat(paste(theta |> as.numeric(), collapse = ", "), ": ", as.numeric(loss), "\n")
  }
  return(loss)
}
eval_loss(torch_tensor(theta), verbose = FALSE)
torch_tensor
50.4573
[ CPUFloatType{} ]
# Start from theta = 0 and use the Rprop optimizer
theta_current <- torch_tensor(rep(0, length(theta)), requires_grad = TRUE)
theta_optimizer <- optim_rprop(theta_current)
# A single optimization step: evaluate the loss, backpropagate, update
loss <- eval_loss(theta_current, verbose = FALSE)
loss$backward()
theta_optimizer$step()
NULL
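
Before iterating, one can check that the autodiff gradient matches the closed form $X^\top(\sigma(X\theta) - y)$; a minimal sketch on a fresh tensor initialized at zero:

# Sketch: autodiff gradient vs closed-form gradient at theta = 0
theta_check <- torch_tensor(rep(0, p), requires_grad = TRUE)
logistic_loss(theta_check, x, y)$backward()
grad_autodiff <- as.numeric(theta_check$grad)
grad_manual   <- as.vector(t(X) %*% (plogis(rep(0, n)) - Y))
all.equal(grad_autodiff, grad_manual, tolerance = 1e-4)
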
num_iterations <- 100
loss_vector <- vector("numeric", length = num_iterations)
for (i in 1:num_iterations) {
  theta_optimizer$zero_grad() # reset the gradients accumulated so far
  loss <- eval_loss(theta_current, verbose = FALSE)
  loss$backward()             # backpropagate
  theta_optimizer$step()      # update the parameters
  loss_vector[i] <- loss %>% as.numeric()
}
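
The stored losses give a quick view of convergence:

# Sketch: monitor the optimizer's convergence
plot(loss_vector, type = "l", xlab = "iteration", ylab = "negative log-likelihood")
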
# Compare the torch estimates with glm's
dplyr::tibble(
  torch = theta_current |> as.numeric(),
  glm   = glm(Y ~ 0 + X, family = "binomial") |> coefficients()
)
# A tibble: 3 × 2
   torch    glm
   <dbl>  <dbl>
1  1.79   1.79 
2 -1.04  -1.04 
3 -0.973 -0.973

Tidymodels

Logistic regression can be fitted with several engines in {tidymodels}:

show_engines("logistic_reg")
# A tibble: 7 × 2
  engine    mode          
  <chr>     <chr>         
1 glm       classification
2 glmnet    classification
3 LiblineaR classification
4 spark     classification
5 keras     classification
6 stan      classification
7 brulee    classification
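
For instance, the engine is chosen when defining the parsnip model specification; a minimal sketch (this only prints the specification, nothing is fitted):

# Sketch: select the "brulee" engine in a parsnip specification
logistic_reg() %>% set_engine("brulee")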

We check that we recover the same coefficients when using the glm engine in {tidymodels} for our logistic regression:

set.seed(20)
data_df <- data.frame(Y = as.factor(Y), X = X)
logistic_reg(engine = "glm") %>% 
  fit(Y ~ 0 + X.1 + X.2 + X.3, family = "binomial", data = data_df) %>% 
  extract_fit_engine() %>% # we need to extract the underlying glm object
  coef()
       X.1        X.2        X.3 
 1.7914697 -1.0379660 -0.9728552 

The {brulee} package, from the tidymodels universe, provides several classical models (neural networks, logistic regression, linear regression, multinomial regression) built on top of the torch infrastructure. The losses available in the {torch} namespace:

ls(pattern = "loss$", envir = asNamespace("torch"))
 [1] "nn_adaptive_log_softmax_with_loss"    
 [2] "nn_bce_loss"                          
 [3] "nn_bce_with_logits_loss"              
 [4] "nn_cosine_embedding_loss"             
 [5] "nn_cross_entropy_loss"                
 [6] "nn_ctc_loss"                          
 [7] "nn_hinge_embedding_loss"              
 [8] "nn_kl_div_loss"                       
 [9] "nn_l1_loss"                           
[10] "nn_loss"                              
[11] "nn_margin_ranking_loss"               
[12] "nn_mse_loss"                          
[13] "nn_multi_margin_loss"                 
[14] "nn_multilabel_margin_loss"            
[15] "nn_multilabel_soft_margin_loss"       
[16] "nn_nll_loss"                          
[17] "nn_poisson_nll_loss"                  
[18] "nn_smooth_l1_loss"                    
[19] "nn_soft_margin_loss"                  
[20] "nn_triplet_margin_loss"               
[21] "nn_triplet_margin_with_distance_loss" 
[22] "nn_weighted_loss"                     
[23] "nnf_cosine_embedding_loss"            
[24] "nnf_ctc_loss"                         
[25] "nnf_hinge_embedding_loss"             
[26] "nnf_l1_loss"                          
[27] "nnf_margin_ranking_loss"              
[28] "nnf_mse_loss"                         
[29] "nnf_multi_margin_loss"                
[30] "nnf_multilabel_margin_loss"           
[31] "nnf_multilabel_soft_margin_loss"      
[32] "nnf_nll_loss"                         
[33] "nnf_poisson_nll_loss"                 
[34] "nnf_smooth_l1_loss"                   
[35] "nnf_soft_margin_loss"                 
[36] "nnf_triplet_margin_loss"              
[37] "nnf_triplet_margin_with_distance_loss"
[38] "torch__ctc_loss"                      
[39] "torch__cudnn_ctc_loss"                
[40] "torch__use_cudnn_ctc_loss"            
[41] "torch_cosine_embedding_loss"          
[42] "torch_cross_entropy_loss"             
[43] "torch_ctc_loss"                       
[44] "torch_hinge_embedding_loss"           
[45] "torch_huber_loss"                     
[46] "torch_l1_loss"                        
[47] "torch_margin_ranking_loss"            
[48] "torch_mse_loss"                       
[49] "torch_multi_margin_loss"              
[50] "torch_multilabel_margin_loss"         
[51] "torch_nll_loss"                       
[52] "torch_poisson_nll_loss"               
[53] "torch_smooth_l1_loss"                 
[54] "torch_soft_margin_loss"               
[55] "torch_triplet_margin_loss"            

We now show how to fit the same logistic regression as above. The model can be specified either with the data in a data.frame (formula interface) or directly with matrices (x/y interface). Two optimizers are available: 'LBFGS' and 'SGD'.

Note that it is not possible to specify a model without an intercept, neither with 0 + nor with -1 in the formula.

# Matrix (x/y) interface
reg_log_brulee2 <- brulee_logistic_reg(x = as.matrix(X), y = as.factor(Y),
                         epochs = num_iterations, optimizer = "SGD", validation = 0)

# Formula interface
reg_log_brulee1 <- brulee_logistic_reg(Y ~ X.1 + X.2 + X.3, data = data_df,
                         epochs = num_iterations, optimizer = "SGD", validation = 0)

In theory, the coefficients of the fitted model can be retrieved with the coef method by specifying the desired epoch; with epoch = NULL, the best epoch is chosen.

reg_log_brulee2 %>% coef()
# The per-epoch estimates are also stored in the fitted object;
# here, the parameters after the last (100th) epoch
reg_log_brulee2$estimates[[100]]
$fc1.weight
          [,1]       [,2]      [,3]
[1,] -1.096883  0.5906312  1.178145
[2,]  1.788497 -0.9432783 -1.099946

$fc1.bias
[1]  1.451596 -1.311797
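
Note that fc1.weight has one row per class (and fc1.bias one intercept per class): the binary model is seemingly parameterized with two output units rather than a single logit, so the rows are not directly comparable to the glm coefficients. As a final sketch, assuming the epoch argument of coef and the predict method documented in {brulee} (both untested here):

# Sketch: parameters half-way through training, and class probabilities
# (assumes coef(epoch = ) and predict(type = "prob") from {brulee})
coef(reg_log_brulee2, epoch = 50)
predict(reg_log_brulee1, new_data = data_df, type = "prob") %>% head()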