library(tidyverse)
library(rvest)
theme_set(theme_bw())
df <- 
  "https://fr.wikipedia.org/wiki/Liste_des_pays_par_population" %>% 
  read_html() %>% 
  html_node('table') %>% 
  html_table() %>% 
  as_tibble()
df
## # A tibble: 204 x 6
##    Rang  `Pays ou territo… `Population[Note…  Date Source Commentaires    
##    <chr> <chr>             <chr>             <int> <chr>  <chr>           
##  1 1     Chine             1 415 045 928      2018 Offic… Pays le plus pe…
##  2 2     Inde              1 355 621 800      2018 Offic… Pays le plus pe…
##  3 •     Union européenne  512 596 403        2018 Offic… L'Union europée…
##  4 3     États-Unis        328 286 400        2018 Offic… Pays le plus pe…
##  5 4     Indonésie         266 471 000        2018 Offic… Archipel le plu…
##  6 5     Pakistan          207 774 520        2017 Offic… ""              
##  7 6     Brésil            207 096 196        2017 Offic… Pays le plus pe…
##  8 7     Nigeria           190 632 261        2017 CIA W… Pays le plus pe…
##  9 8     Bangladesh        160 339 154        2016 Offic… ""              
## 10 9     Russie            146 544 710        2016 Offic… La population r…
## # ... with 194 more rows
pop <- pull(df, `Population[Note 2]`)
head(pop)
## [1] "1 415 045 928" "1 355 621 800" "512 596 403"   "328 286 400"  
## [5] "266 471 000"   "207 774 520"
premier_chiffre <- str_sub(pop, 1, 1)
head(premier_chiffre)
## [1] "1" "1" "5" "3" "2" "2"
qplot(premier_chiffre) + geom_bar(fill = "grey50", color = "black")

frequences <- 
  tibble(premier_chiffre) %>% 
  group_by(premier_chiffre) %>% 
  summarise(eff_obs = n()) %>% 
  mutate(freq_obs = eff_obs/sum(eff_obs)) %>% 
  mutate(freq_unif = 1/9,
         freq_benford = c(0.301,    0.176,  0.125,  0.097, 0.079,   0.067, 0.058,   0.051, 0.046))
frequences
## # A tibble: 9 x 5
##   premier_chiffre eff_obs freq_obs freq_unif freq_benford
##   <chr>             <int>    <dbl>     <dbl>        <dbl>
## 1 1                    62   0.304      0.111        0.301
## 2 2                    30   0.147      0.111        0.176
## 3 3                    28   0.137      0.111        0.125
## 4 4                    17   0.0833     0.111        0.097
## 5 5                    20   0.0980     0.111        0.079
## 6 6                    15   0.0735     0.111        0.067
## 7 7                     8   0.0392     0.111        0.058
## 8 8                    10   0.0490     0.111        0.051
## 9 9                    14   0.0686     0.111        0.046
frequences %>% 
  select(premier_chiffre, freq_obs, freq_unif, freq_benford) %>% 
  gather(-premier_chiffre, key = type, value = freq) %>% 
  mutate(type = factor(type, 
                       levels = c("freq_obs", "freq_benford", "freq_unif"),
                       labels = c("Observé", "Benford", "Uniforme"))) %>% 
  print() %>% 
  ggplot() +
  aes(x = premier_chiffre, y = freq, fill = type) +
  geom_col(position = "dodge", color = "black") +
  labs(x = "Premier chiffre", y = "Fréquence", 
       fill = "Type", title = "Barplot des différences fréquences")
## # A tibble: 27 x 3
##    premier_chiffre type       freq
##    <chr>           <fct>     <dbl>
##  1 1               Observé  0.304 
##  2 2               Observé  0.147 
##  3 3               Observé  0.137 
##  4 4               Observé  0.0833
##  5 5               Observé  0.0980
##  6 6               Observé  0.0735
##  7 7               Observé  0.0392
##  8 8               Observé  0.0490
##  9 9               Observé  0.0686
## 10 1               Uniforme 0.111 
## # ... with 17 more rows