Exercices autour de data.table

L’atelier du jour avait pour objectif la découvert du package data.table au travers d’exercises proposés lors de la dernière édition de UseR. Les slides avec les énoncés sont disponibles ici.

L’apport de notre atelier est d’avoir systématiquement essayé de traiter les questions avec - les fonctionalités R de base - les fonctionalités offertes par le tidyverse (principalement dplyr) - les fonctionnalités du package data.table

Pour les animateurs : vous êtes libre de compléter le post de blog. Nous ajouteroins des comparatifs en terme de temps de calcul plus tard (peut-être).

Préliminaires

C’est parti, on charge oles packages requis

library(data.table)
library(tidyverse)

Les exercices seront traités à l’aide de deux tables jouets, dont on propose des contreparties en - object data.frame - object tibble - object data.table

set.seed(20170703L)
DF1 = data.frame(id = sample(1:2, 9L, TRUE), 
                 code = sample(letters[1:3], 9, TRUE), 
                 valA = 1:9, valB = 10:18, 
                 stringsAsFactors = FALSE)
DF2 = data.frame(id = c(3L, 1L, 1L, 2L, 3L), 
                 code = c("b", "a", "c", "c", "d"), 
                 mul = 5:1, stringsAsFactors = FALSE)
## corresponding data tibble
TB1 <- as.tibble(DF1)
TB2 <- as.tibble(DF2)
## corresponding data tables
DT1 <- as.data.table(DF1)
DT2 <- as.data.table(DF2)

Question 1

Subset all rows where id column equals 1 & code column is not equal to “c”

base

base::subset(DF1, id == 1 & code != "c")
##   id code valA valB
## 2  1    b    2   11
## 7  1    b    7   16
with(DF1, DF1[id == 1 & code != "c",])
##   id code valA valB
## 2  1    b    2   11
## 7  1    b    7   16

dplyr

filter(TB1, id == 1 & code != "c")
## # A tibble: 2 x 4
##      id code   valA  valB
##   <int> <chr> <int> <int>
## 1     1 b         2    11
## 2     1 b         7    16

data.table

DT1[id == 1 & code != "c"] 
##    id code valA valB
## 1:  1    b    2   11
## 2:  1    b    7   16

Question 2

Select valA and valB columns from DF1

base R

DF1[, c("valA", "valB")]
##   valA valB
## 1    1   10
## 2    2   11
## 3    3   12
## 4    4   13
## 5    5   14
## 6    6   15
## 7    7   16
## 8    8   17
## 9    9   18

dplyr

select(TB1, valA, valB)
## # A tibble: 9 x 2
##    valA  valB
##   <int> <int>
## 1     1    10
## 2     2    11
## 3     3    12
## 4     4    13
## 5     5    14
## 6     6    15
## 7     7    16
## 8     8    17
## 9     9    18

data.table

DT1[ , .(valA,valB)]
##    valA valB
## 1:    1   10
## 2:    2   11
## 3:    3   12
## 4:    4   13
## 5:    5   14
## 6:    6   15
## 7:    7   16
## 8:    8   17
## 9:    9   18

Question 3

Get sum(valA) and sum(valB) for id > 1 as a 1-row, 2-col data.frame

base R

colSums(DF1[ DF1$id > 1, c("valA", "valB")])
## valA valB 
##   19   46

dplyr

TB1 %>% filter(id > 1) %>% select(valA, valB) %>% summarise_all(sum)
## # A tibble: 1 x 2
##    valA  valB
##   <int> <int>
## 1    19    46

data.table

DT1[id > 1, .(sum(valA), sum(valB))]
##    V1 V2
## 1: 19 46

Question 4

Replace valB with valB+1 for all rows where code == “c”

base R

DF1$valB[DF1$code=="c"] = DF1$valB[DF1$code=="c"] + 1
DF1
##   id code valA valB
## 1  1    c    1   11
## 2  1    b    2   11
## 3  1    c    3   13
## 4  1    c    4   14
## 5  2    a    5   14
## 6  2    a    6   15
## 7  1    b    7   16
## 8  2    a    8   17
## 9  1    c    9   19

dplyr

TB1 <- TB1 %>% mutate(valB = ifelse(code == "c", valB + 1,valB))
## TB1[TB1$code=="c", ] <- filter(TB1, code=="c") %>% mutate(valB= valB+1) # alternative solution
TB1
## # A tibble: 9 x 4
##      id code   valA  valB
##   <int> <chr> <int> <dbl>
## 1     1 c         1  11.0
## 2     1 b         2  11.0
## 3     1 c         3  13.0
## 4     1 c         4  14.0
## 5     2 a         5  14.0
## 6     2 a         6  15.0
## 7     1 b         7  16.0
## 8     2 a         8  17.0
## 9     1 c         9  19.0

data.table

DT1[code == "c", valB := valB + 1L]
DT1
##    id code valA valB
## 1:  1    c    1   11
## 2:  1    b    2   11
## 3:  1    c    3   13
## 4:  1    c    4   14
## 5:  2    a    5   14
## 6:  2    a    6   15
## 7:  1    b    7   16
## 8:  2    a    8   17
## 9:  1    c    9   19

Question 5

Add a new column valC column with values equal to valB^2 - valA^2

base R

DF1 <- transform(DF1, valC = valB^2 - valA^2)
## DF1$valC <- DF1$valB^2 - DF1$valA^2 # alternate solution
DF1
##   id code valA valB valC
## 1  1    c    1   11  120
## 2  1    b    2   11  117
## 3  1    c    3   13  160
## 4  1    c    4   14  180
## 5  2    a    5   14  171
## 6  2    a    6   15  189
## 7  1    b    7   16  207
## 8  2    a    8   17  225
## 9  1    c    9   19  280

dplyr

TB1 <- mutate(TB1, valC = valB^2 - valA^2)
TB1
## # A tibble: 9 x 5
##      id code   valA  valB  valC
##   <int> <chr> <int> <dbl> <dbl>
## 1     1 c         1  11.0   120
## 2     1 b         2  11.0   117
## 3     1 c         3  13.0   160
## 4     1 c         4  14.0   180
## 5     2 a         5  14.0   171
## 6     2 a         6  15.0   189
## 7     1 b         7  16.0   207
## 8     2 a         8  17.0   225
## 9     1 c         9  19.0   280

data.table

DT1[, valC := valB^2 - valA^2]
DT1
##    id code valA valB valC
## 1:  1    c    1   11  120
## 2:  1    b    2   11  117
## 3:  1    c    3   13  160
## 4:  1    c    4   14  180
## 5:  2    a    5   14  171
## 6:  2    a    6   15  189
## 7:  1    b    7   16  207
## 8:  2    a    8   17  225
## 9:  1    c    9   19  280

Question 6

Get sum(valA) and sum(valB) grouped by id and code (i.e., for each unique combination of id,code)

base

aggregate(.~ id + code, DF1, sum)
##   id code valA valB valC
## 1  2    a   19   46  585
## 2  1    b    9   27  324
## 3  1    c   17   57  740
aggregate(DF1[, c("valA", "valB")], list(DF1$id, DF1$code), sum)
##   Group.1 Group.2 valA valB
## 1       2       a   19   46
## 2       1       b    9   27
## 3       1       c   17   57

dplyr

TB1 %>% group_by(id, code) %>% summarise_all(sum)
## # A tibble: 3 x 5
## # Groups:   id [?]
##      id code   valA  valB  valC
##   <int> <chr> <int> <dbl> <dbl>
## 1     1 b         9  27.0   324
## 2     1 c        17  57.0   740
## 3     2 a        19  46.0   585

data.table

DT1[, .(SumA=sum(valA), SumB = sum(valB)), by=list(id,code) ] 
##    id code SumA SumB
## 1:  1    c   17   57
## 2:  1    b    9   27
## 3:  2    a   19   46

Question 7

Get sum(valA) and sum(valB) grouped by id for id >= 2 & code %in% c(“a”, “c”)

base

aggregate(.~ id , subset(DF1, id >=2 & code %in% c("a","c"), -code), sum)
##   id valA valB valC
## 1  2   19   46  585

dplyr

TB1 %>% 
  group_by(id) %>% 
  filter(id >=2, code %in% c("a", "c")) %>% 
  select(-code, -valC) %>% 
  summarise_all(sum)
## # A tibble: 1 x 3
##      id  valA  valB
##   <int> <int> <dbl>
## 1     2    19  46.0

data.table

DT1[ (id>=2) & code %in% c("a", "c") , .(SumA=sum(valA), SumB = sum(valB)), by=id ] 
##    id SumA SumB
## 1:  2   19   46

Question 8

Replace valA with max(valA)-min(valA) grouped by code

base

DF1 <- transform(DF1, valA = rep(tapply(valA, code, function(x) diff(range(x)))[code]))
DF1
##   id code valA valB valC
## 1  1    c    8   11  120
## 2  1    b    5   11  117
## 3  1    c    8   13  160
## 4  1    c    8   14  180
## 5  2    a    3   14  171
## 6  2    a    3   15  189
## 7  1    b    5   16  207
## 8  2    a    3   17  225
## 9  1    c    8   19  280

dplyr

TB1 <- TB1 %>% group_by(code) %>% mutate(valA= max(valA)-min(valA))
TB1
## # A tibble: 9 x 5
## # Groups:   code [3]
##      id code   valA  valB  valC
##   <int> <chr> <dbl> <dbl> <dbl>
## 1     1 c      8.00  11.0   120
## 2     1 b      5.00  11.0   117
## 3     1 c      8.00  13.0   160
## 4     1 c      8.00  14.0   180
## 5     2 a      3.00  14.0   171
## 6     2 a      3.00  15.0   189
## 7     1 b      5.00  16.0   207
## 8     2 a      3.00  17.0   225
## 9     1 c      8.00  19.0   280

data.table

DT1[, "valA" := max(valA)-min(valA), by=code] 
DT1
##    id code valA valB valC
## 1:  1    c    8   11  120
## 2:  1    b    5   11  117
## 3:  1    c    8   13  160
## 4:  1    c    8   14  180
## 5:  2    a    3   14  171
## 6:  2    a    3   15  189
## 7:  1    b    5   16  207
## 8:  2    a    3   17  225
## 9:  1    c    8   19  280

Question 9

Create a new col named valD with max(valB)-min(valA) grouped by code

base

DF1 <- transform(DF1, valD = by(DF1, code, function(x) max(x$valB) - min(x$valA))[code])
DF1
##   id code valA valB valC valD
## 1  1    c    8   11  120   11
## 2  1    b    5   11  117   11
## 3  1    c    8   13  160   11
## 4  1    c    8   14  180   11
## 5  2    a    3   14  171   14
## 6  2    a    3   15  189   14
## 7  1    b    5   16  207   11
## 8  2    a    3   17  225   14
## 9  1    c    8   19  280   11

dplyr

TB1 <- TB1 %>% group_by(code) %>% mutate(valD= max(valB)-min(valA))
TB1
## # A tibble: 9 x 6
## # Groups:   code [3]
##      id code   valA  valB  valC  valD
##   <int> <chr> <dbl> <dbl> <dbl> <dbl>
## 1     1 c      8.00  11.0   120  11.0
## 2     1 b      5.00  11.0   117  11.0
## 3     1 c      8.00  13.0   160  11.0
## 4     1 c      8.00  14.0   180  11.0
## 5     2 a      3.00  14.0   171  14.0
## 6     2 a      3.00  15.0   189  14.0
## 7     1 b      5.00  16.0   207  11.0
## 8     2 a      3.00  17.0   225  14.0
## 9     1 c      8.00  19.0   280  11.0

data.table

DT1[, "valD" := max(valB)-min(valA), by=code] 
DT1
##    id code valA valB valC valD
## 1:  1    c    8   11  120   11
## 2:  1    b    5   11  117   11
## 3:  1    c    8   13  160   11
## 4:  1    c    8   14  180   11
## 5:  2    a    3   14  171   14
## 6:  2    a    3   15  189   14
## 7:  1    b    5   16  207   11
## 8:  2    a    3   17  225   14
## 9:  1    c    8   19  280   11

Question 10

Subset DF1 by DF2 on id,code column. That is, for each row of DF2$id, DF2$code, get valA and valB cols from DF1. Include rows that have no matches as well.

base

merge(DF1, DF2[, c("id", "code")], by = c("id", "code"), all.y=TRUE)
##   id code valA valB valC valD
## 1  1    a   NA   NA   NA   NA
## 2  1    c    8   11  120   11
## 3  1    c    8   13  160   11
## 4  1    c    8   14  180   11
## 5  1    c    8   19  280   11
## 6  2    c   NA   NA   NA   NA
## 7  3    b   NA   NA   NA   NA
## 8  3    d   NA   NA   NA   NA

dplyr

right_join(TB1, TB2 %>% select(id, code), by = c("id", "code")) 
## # A tibble: 8 x 6
## # Groups:   code [?]
##      id code   valA  valB  valC  valD
##   <int> <chr> <dbl> <dbl> <dbl> <dbl>
## 1     3 b     NA     NA      NA  NA  
## 2     1 a     NA     NA      NA  NA  
## 3     1 c      8.00  11.0   120  11.0
## 4     1 c      8.00  13.0   160  11.0
## 5     1 c      8.00  14.0   180  11.0
## 6     1 c      8.00  19.0   280  11.0
## 7     2 c     NA     NA      NA  NA  
## 8     3 d     NA     NA      NA  NA
## inner_join to keep only rows present in tibbles, left_join to keep all rows from TB1
## anti_join to keep only rows from TB1 with no matching rows in TB2

data.table

merge(DT1, DT2[ , .(id, code)], all.y = TRUE)
##    id code valA valB valC valD
## 1:  1    a   NA   NA   NA   NA
## 2:  1    c    8   11  120   11
## 3:  1    c    8   13  160   11
## 4:  1    c    8   14  180   11
## 5:  1    c    8   19  280   11
## 6:  2    c   NA   NA   NA   NA
## 7:  3    b   NA   NA   NA   NA
## 8:  3    d   NA   NA   NA   NA

Question 11

Same as (10), but fetch just the first matching row of DF1 for each row of DF2$id, DF2$code. Exclude non-matching rows.

base

merge(DF1[!duplicated(DF1[, c("id", "code")]), ],
      DF2[, c("id", "code")], by = c("id", "code"))
##   id code valA valB valC valD
## 1  1    c    8   11  120   11

dplyr

TB1 %>% group_by(id, code) %>% slice(1) %>% ## first entry in each (id, code) group
  inner_join(select(TB2, id, code), by = c("id", "code"))
## # A tibble: 1 x 6
## # Groups:   id, code [?]
##      id code   valA  valB  valC  valD
##   <int> <chr> <dbl> <dbl> <dbl> <dbl>
## 1     1 c      8.00  11.0   120  11.0

data.table

merge(DT1[, .SD[1], by = list(id, code)], ## first entry in each (id, code) group
      DT2[ , .(id, code)])
##    id code valA valB valC valD
## 1:  1    c    8   11  120   11

Question 12

For every row of DF2$id, DF2$code that matches with DF1’s, update valA with valA*mul.

base

transform(merge(DF1, DF2, by = c("id", "code"), all.y=TRUE), valA = valA * mul)
##   id code valA valB valC valD mul
## 1  1    a   NA   NA   NA   NA   4
## 2  1    c   24   11  120   11   3
## 3  1    c   24   13  160   11   3
## 4  1    c   24   14  180   11   3
## 5  1    c   24   19  280   11   3
## 6  2    c   NA   NA   NA   NA   2
## 7  3    b   NA   NA   NA   NA   5
## 8  3    d   NA   NA   NA   NA   1

dplyr

right_join(TB1, TB2, by = c("id", "code")) %>% mutate(valA = valA * mul)
## # A tibble: 8 x 7
## # Groups:   code [4]
##      id code   valA  valB  valC  valD   mul
##   <int> <chr> <dbl> <dbl> <dbl> <dbl> <int>
## 1     3 b      NA    NA      NA  NA       5
## 2     1 a      NA    NA      NA  NA       4
## 3     1 c      24.0  11.0   120  11.0     3
## 4     1 c      24.0  13.0   160  11.0     3
## 5     1 c      24.0  14.0   180  11.0     3
## 6     1 c      24.0  19.0   280  11.0     3
## 7     2 c      NA    NA      NA  NA       2
## 8     3 d      NA    NA      NA  NA       1

data.table

DT3 <- merge(DT1, DT2, all.y = TRUE)[ , "valA" := valA * mul]
show(DT3) ## je ne comprends pas pourquoi il faut utiliser show pour l'affichage
##    id code valA valB valC valD mul
## 1:  1    a   NA   NA   NA   NA   4
## 2:  1    c   24   11  120   11   3
## 3:  1    c   24   13  160   11   3
## 4:  1    c   24   14  180   11   3
## 5:  1    c   24   19  280   11   3
## 6:  2    c   NA   NA   NA   NA   2
## 7:  3    b   NA   NA   NA   NA   5
## 8:  3    d   NA   NA   NA   NA   1

Question 13

Add a new column val to DF1 with values from DF2$mul where DF2$id, DF2$code matches with DF1’s. Rows that don’t match should have NA.

base

transform(merge(DF1, DF2, by = c("id", "code"), all.x = TRUE, sort = FALSE), mul = mul)
##   id code valA valB valC valD mul
## 1  1    c    8   11  120   11   3
## 2  1    c    8   13  160   11   3
## 3  1    c    8   14  180   11   3
## 4  1    c    8   19  280   11   3
## 5  2    a    3   14  171   14  NA
## 6  1    b    5   11  117   11  NA
## 7  1    b    5   16  207   11  NA
## 8  2    a    3   17  225   14  NA
## 9  2    a    3   15  189   14  NA

dplyr

left_join(TB1, TB2, by = c("id", "code"))
## # A tibble: 9 x 7
## # Groups:   code [?]
##      id code   valA  valB  valC  valD   mul
##   <int> <chr> <dbl> <dbl> <dbl> <dbl> <int>
## 1     1 c      8.00  11.0   120  11.0     3
## 2     1 b      5.00  11.0   117  11.0    NA
## 3     1 c      8.00  13.0   160  11.0     3
## 4     1 c      8.00  14.0   180  11.0     3
## 5     2 a      3.00  14.0   171  14.0    NA
## 6     2 a      3.00  15.0   189  14.0    NA
## 7     1 b      5.00  16.0   207  11.0    NA
## 8     2 a      3.00  17.0   225  14.0    NA
## 9     1 c      8.00  19.0   280  11.0     3

data.table

merge(DT1, DT2, all.x = TRUE)
##    id code valA valB valC valD mul
## 1:  1    b    5   11  117   11  NA
## 2:  1    b    5   16  207   11  NA
## 3:  1    c    8   11  120   11   3
## 4:  1    c    8   13  160   11   3
## 5:  1    c    8   14  180   11   3
## 6:  1    c    8   19  280   11   3
## 7:  2    a    3   14  171   14  NA
## 8:  2    a    3   15  189   14  NA
## 9:  2    a    3   17  225   14  NA

Related