# Summarizing with dplyr

## 2018/12/20

## Start from a clean workspace:
## remove (almost) everything in the working environment.
## You will get no warning, so don't do this unless you are really sure.
## NOTE(review): this line does run whenever the file is executed; restarting
## the R session is a safer way to get a clean state.
rm(list = ls())
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
##     filter, lag
## The following objects are masked from 'package:base':
##
##     intersect, setdiff, setequal, union
library(dslabs)
# Load both example datasets used in the rest of this file.
data("heights", "murders")

### dplyr

We are going to compute the average and the standard deviation of the heights for males.

# Mean and standard deviation of `height`, restricted to rows where sex is
# "Male". The result is stored in `s` and then printed.
s <- heights %>%
  filter(sex == "Male") %>%
  summarise(
    average = mean(height),
    standard_deviation = sd(height)
  )
s
##   average standard_deviation
## 1   69.31              3.611

### The dot placeholder

# Add a per-row murder_rate column: `total` per 100,000 of `population`.
murders <- murders %>%
  mutate(murder_rate = total / population * 1e5)

# Attach the overall rate (sum of `total` over sum of `population`, per
# 100,000) as a `rate` column; the same value is repeated on every row.
us_murder_rate <- murders %>%
  mutate(rate = sum(total) / sum(population) * 1e5)

# Collapse the `rate` column to a single one-row summary (still a data frame).
us_murder_rate %>%
  summarise(mean(rate))
##   mean(rate)
## 1      3.035

We can't pass the us_murder_rate object to a function that requires a numeric value, because it is a data frame rather than a number. Below we show a useful trick for this situation: inside a pipe, the dot (.) stands for the data being piped, so we can use it to extract a single value stored in that data.

# Collapse to a one-row summary, then use the dot placeholder to extract the
# `rate` column from the piped data frame as a plain numeric value.
# (The original `.\$rate` carried a markdown escape backslash, which is a
# syntax error in R.)
us_murder_rate %>%
  summarise(rate = sum(total) / sum(population) * 100000) %>%
  .$rate
## [1] 3.035

### Group By

# Mean and standard deviation of `height`, computed separately for each sex.
heights %>%
  group_by(sex) %>%
  summarise(
    average = mean(height),
    standard_deviation = sd(height)
  )
## # A tibble: 2 x 3
##   sex    average standard_deviation
##   <fct>    <dbl>              <dbl>
## 1 Female    64.9               3.76
## 2 Male      69.3               3.61
# Median of `murder_rate` within each region (one output row per region).
murders %>%
  group_by(region) %>%
  summarise(median_rate = median(murder_rate))
## # A tibble: 4 x 2
##   region        median_rate
##   <fct>               <dbl>
## 1 Northeast            1.80
## 2 South                3.40
## 3 North Central        1.97
## 4 West                 1.29

### sorting data tables

# Sort rows by ascending population and show only the first rows of the result.
murders %>%
  arrange(population) %>%
  head()
##                  state abb        region population total murder_rate
## 1              Wyoming  WY          West     563626     5      0.8871
## 2 District of Columbia  DC         South     601723    99     16.4528
## 3              Vermont  VT     Northeast     625741     2      0.3196
## 4         North Dakota  ND North Central     672591     4      0.5947
## 5               Alaska  AK          West     710231    19      2.6752
## 6         South Dakota  SD North Central     814180     8      0.9826

### top_n()

# Keep the top 10 rows ranked by murder_rate. top_n() only filters; it does
# not sort, so the surviving rows keep their original order.
murders %>%
  top_n(n = 10, wt = murder_rate)
##                   state abb        region population total murder_rate
## 1               Arizona  AZ          West    6392017   232       3.630
## 2              Delaware  DE         South     897934    38       4.232
## 3  District of Columbia  DC         South     601723    99      16.453
## 4               Georgia  GA         South    9920000   376       3.790
## 5             Louisiana  LA         South    4533372   351       7.743
## 6              Maryland  MD         South    5773552   293       5.075
## 7              Michigan  MI North Central    9883640   413       4.179
## 8           Mississippi  MS         South    2967297   120       4.044
## 9              Missouri  MO North Central    5988927   321       5.360
## 10       South Carolina  SC         South    4625364   207       4.475

### Order output with top_n()

# Sort by descending murder_rate first so the result comes out ordered, then
# keep the top 10 rows by that same column.
# `wt` is given explicitly: without it top_n() falls back to the last column
# of the table (printing "Selecting by murder_rate"), which would silently
# rank by a different variable if another column were ever appended.
murders %>%
  arrange(desc(murder_rate)) %>%
  top_n(10, murder_rate)
##                   state abb        region population total murder_rate
## 1  District of Columbia  DC         South     601723    99      16.453
## 2             Louisiana  LA         South    4533372   351       7.743
## 3              Missouri  MO North Central    5988927   321       5.360
## 4              Maryland  MD         South    5773552   293       5.075
## 5        South Carolina  SC         South    4625364   207       4.475
## 6              Delaware  DE         South     897934    38       4.232
## 7              Michigan  MI North Central    9883640   413       4.179
## 8           Mississippi  MS         South    2967297   120       4.044
## 9               Georgia  GA         South    9920000   376       3.790
## 10              Arizona  AZ          West    6392017   232       3.630
# Print the R version, platform, locale and package versions for this session.
utils::sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.1 LTS
##
## Matrix products: default
## BLAS: /home/michael/anaconda3/lib/R/lib/libRblas.so
## LAPACK: /home/michael/anaconda3/lib/R/lib/libRlapack.so
##
## locale:
##  [1] LC_CTYPE=en_CA.UTF-8       LC_NUMERIC=C
##  [3] LC_TIME=en_CA.UTF-8        LC_COLLATE=en_CA.UTF-8
##  [5] LC_MONETARY=en_CA.UTF-8    LC_MESSAGES=en_CA.UTF-8
##  [7] LC_PAPER=en_CA.UTF-8       LC_NAME=C
## [11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base
##
## other attached packages:
## [1] bindrcpp_0.2.2       dslabs_0.3.3         dplyr_0.7.6
## [4] RevoUtils_11.0.1     RevoUtilsMath_11.0.0
##
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.18     knitr_1.20       bindr_0.1.1      magrittr_1.5
##  [5] tidyselect_0.2.4 R6_2.2.2         rlang_0.2.1      fansi_0.2.3
##  [9] stringr_1.3.1    tools_3.5.1      xfun_0.4.11      utf8_1.1.4
## [13] cli_1.0.0        htmltools_0.3.6  yaml_2.2.0       rprojroot_1.3-2
## [17] digest_0.6.15    assertthat_0.2.0 tibble_1.4.2     crayon_1.3.4
## [21] bookdown_0.7     purrr_0.2.5      codetools_0.2-15 glue_1.3.0
## [25] evaluate_0.11    rmarkdown_1.10   blogdown_0.9.8   stringi_1.2.4
## [29] pillar_1.3.0     compiler_3.5.1   backports_1.1.2  pkgconfig_2.0.1

# References

R Core Team. 2018. R: A Language and Environment for Statistical Computing. Vienna, Austria: R Foundation for Statistical Computing. https://www.R-project.org/.

Wickham, Hadley, Romain François, Lionel Henry, and Kirill Müller. 2018. Dplyr: A Grammar of Data Manipulation. https://CRAN.R-project.org/package=dplyr.