Basics of ggplot2

Michael Taylor

2018/12/19

library(dslabs)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Load the US gun-murder data set used by every plot below.
data("murders")

# Scatter plot of total murders against population (in millions), with both
# axes on a log10 scale. nudge_x = 0.05 shifts each state abbreviation
# slightly to the right of its point so the label does not sit on top of it.
p <- ggplot(murders, aes(x = population / 10^6, y = total, label = abb)) +
  geom_point(size = 3) +
  geom_text(nudge_x = 0.05) +
  scale_x_log10() +
  scale_y_log10()

p

# Same scatter plot, now with blue points, descriptive axis labels and a
# title. The blue colour is a fixed setting (outside aes()), so it applies to
# every point. nudge_x = 0.05 keeps the labels just to the right of the points.
p <- ggplot(murders, aes(x = population / 10^6, y = total, label = abb)) +
  geom_point(size = 3, colour = "blue") +
  geom_text(nudge_x = 0.05) +
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(trans = "log10") +
  labs(
    x = "Populations in millions (log scale)",
    y = "Total number of murders (log scale)",
    title = "US Gun Murders in US 2010"
  )

p

Because the color of each point now depends on a category (the region each state belongs to), we have to use a mapping rather than a fixed setting. To map each point to a color, we set the color aesthetic inside aes.

# Colour is now an aesthetic mapping: putting color = region inside aes()
# colours each point (and its label) by the state's region and adds a legend.
# nudge_x = 0.05 keeps the labels just to the right of the points.
p <- ggplot(
  murders,
  aes(x = population / 10^6, y = total, label = abb, colour = region)
) +
  geom_point(size = 3) +
  geom_text(nudge_x = 0.05) +
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(trans = "log10") +
  labs(
    x = "Populations in millions (log scale)",
    y = "Total number of murders (log scale)",
    title = "US Gun Murders in US 2010"
  )

p

# National murder rate per million people, pooled over all states.
r <- murders %>%
  summarise(rate = sum(total) / sum(population) * 10^6) %>%
  pull(rate)
# Print the value.
r
## [1] 30.35
# Add a dashed reference line for the national average rate.
# geom_abline() defaults to slope 1 and intercept 0. On these log10-log10 axes
# a slope-1 line with intercept log10(r) corresponds to
# total = r * (population in millions).
# The line is added before the points and labels so they are drawn on top of
# it. scale_colour_discrete() only renames the legend title to "Region".
p <- ggplot(
  murders,
  aes(x = population / 10^6, y = total, label = abb, colour = region)
) +
  geom_abline(intercept = log10(r), linetype = "dashed", colour = "darkgrey") +
  geom_point(size = 3) +
  geom_text(nudge_x = 0.05) +
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(trans = "log10") +
  labs(
    x = "Populations in millions (log scale)",
    y = "Total number of murders (log scale)",
    title = "US Gun Murders in US 2010"
  ) +
  scale_colour_discrete(name = "Region")

p

library(ggthemes)
library(ggrepel)
# Same plot as before (region colours, dashed average-rate line, labels nudged
# right by 0.05), with the ggthemes "economist" theme applied as the last layer.
# geom_abline() defaults to slope 1; the intercept log10(r) places the line at
# the national average rate on the log10-log10 axes.
p <- ggplot(
  murders,
  aes(x = population / 10^6, y = total, label = abb, colour = region)
) +
  geom_abline(intercept = log10(r), linetype = "dashed", colour = "darkgrey") +
  geom_point(size = 3) +
  geom_text(nudge_x = 0.05) +
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(trans = "log10") +
  labs(
    x = "Populations in millions (log scale)",
    y = "Total number of murders (log scale)",
    title = "US Gun Murders in US 2010"
  ) +
  scale_colour_discrete(name = "Region") +
  theme_economist()

p

# Replace geom_text() with ggrepel's geom_text_repel(), which positions the
# labels itself so that no manual nudge_x is needed.
# geom_abline() defaults to slope 1; the intercept log10(r) places the dashed
# line at the national average rate on the log10-log10 axes.
# The theme is deliberately left out of p so different themes can be tried.
p <- ggplot(
  murders,
  aes(x = population / 10^6, y = total, label = abb, colour = region)
) +
  geom_abline(intercept = log10(r), linetype = "dashed", colour = "darkgrey") +
  geom_point(size = 3) +
  geom_text_repel() +
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(trans = "log10") +
  labs(
    x = "Populations in millions (log scale)",
    y = "Total number of murders (log scale)",
    title = "US Gun Murders in US 2010"
  ) +
  scale_colour_discrete(name = "Region")

# Draw the plot with the economist theme without modifying p.
p + theme_economist()

# ds_theme_set() is meant to be called for its side effect: it changes the
# session's default ggplot2 theme. It is not a theme layer, so it should not be
# added to a plot with `+`. Set the theme first, then print p so it is drawn
# with the new default theme.
ds_theme_set()
p

# Load the heights data set.
data(heights)

# Base plot: male heights only, mapped to the x aesthetic. The geometries
# below are added to this object.
p <- ggplot(filter(heights, sex == "Male"), aes(x = height))

# Histogram with a bin width of 1, blue bars outlined in black.
p +
  geom_histogram(binwidth = 1, fill = "blue", colour = "black") +
  labs(x = "Male heights in inches", title = "Histogram")

# Smoothed density estimate of the same data.
p + geom_density(fill = "blue")

We now have to redefine p, because a Q-Q plot needs a different aesthetic: instead of x, geom_qq uses sample.

By default, the Q-Q plot is compared to the normal distribution with average zero and standard deviation one.

# Redefine the base plot for Q-Q plots: geom_qq() reads the data from the
# `sample` aesthetic instead of `x`.
p <- ggplot(filter(heights, sex == "Male"), aes(sample = height))

# Q-Q plot of male heights against the default theoretical distribution.
p + geom_qq()

To change this, we learn from the help file that we need to use the dparams argument. We define an object params that holds the mean and standard deviation of our data, computed with some dplyr functions, and we add the geometry with this new object assigned to the dparams argument. Now the Q-Q plot is drawn against a normal distribution with the same mean and standard deviation as our data. We can then add an identity line to see how well the normal approximation works. In this case, we simply add the layer geom_abline, whose default line is the identity line.

# Print the mean and standard deviation of the male heights.
summarise(
  filter(heights, sex == "Male"),
  mean = mean(height),
  sd = sd(height)
)
##    mean    sd
## 1 69.31 3.611
# Store the mean and standard deviation of the male heights; the column names
# (mean, sd) are what geom_qq() receives as distribution parameters.
params <- summarise(
  filter(heights, sex == "Male"),
  mean = mean(height),
  sd = sd(height)
)

# Q-Q plot against a distribution with those parameters. geom_abline() with
# its defaults (slope 1, intercept 0) adds the identity line.
p +
  geom_qq(dparams = params) +
  geom_abline()

Another option here is to first scale the data so that we have them in standard units and plot them against the standard normal distribution. This saves us the step of having to compute the mean and standard deviation.

# Alternative: standardise the heights with scale() inside the aesthetic, so
# the default geom_qq() can be used directly, then add the identity line with
# geom_abline().
ggplot(filter(heights, sex == "Male"), aes(sample = scale(height))) +
  geom_qq() +
  geom_abline()

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Base plot of male heights, reused for three histograms that differ only in
# bin width (1, 2 and 3).
p <- ggplot(filter(heights, sex == "Male"), aes(x = height))
p1 <- p + geom_histogram(binwidth = 1, fill = "blue", colour = "black")
p2 <- p + geom_histogram(binwidth = 2, fill = "blue", colour = "black")
p3 <- p + geom_histogram(binwidth = 3, fill = "blue", colour = "black")

# Show the three histograms side by side in a single row.
grid.arrange(grobs = list(p1, p2, p3), ncol = 3)

sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.1 LTS
## 
## Matrix products: default
## BLAS: /home/michael/anaconda3/lib/R/lib/libRblas.so
## LAPACK: /home/michael/anaconda3/lib/R/lib/libRlapack.so
## 
## locale:
## [1] en_CA.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] gridExtra_2.3        ggrepel_0.8.0        ggthemes_4.0.0      
## [4] bindrcpp_0.2.2       ggplot2_3.0.0        dplyr_0.7.6         
## [7] dslabs_0.3.3         RevoUtils_11.0.1     RevoUtilsMath_11.0.0
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.18     compiler_3.5.1   pillar_1.3.0     plyr_1.8.4      
##  [5] bindr_0.1.1      tools_3.5.1      digest_0.6.15    evaluate_0.11   
##  [9] tibble_1.4.2     gtable_0.2.0     pkgconfig_2.0.1  rlang_0.2.1     
## [13] yaml_2.2.0       blogdown_0.9.8   xfun_0.4.11      withr_2.1.2     
## [17] stringr_1.3.1    knitr_1.20       rprojroot_1.3-2  grid_3.5.1      
## [21] tidyselect_0.2.4 glue_1.3.0       R6_2.2.2         rmarkdown_1.10  
## [25] bookdown_0.7     purrr_0.2.5      magrittr_1.5     backports_1.1.2 
## [29] scales_0.5.0     codetools_0.2-15 htmltools_0.3.6  assertthat_0.2.0
## [33] colorspace_1.3-2 labeling_0.3     stringi_1.2.4    lazyeval_0.2.1  
## [37] munsell_0.5.0    crayon_1.3.4

References

Irizarry, Rafael A. 2017. Dslabs: Data Science Labs. https://CRAN.R-project.org/package=dslabs.

R Core Team. 2018. R: A Language and Environment for Statistical Computing. Vienna, Austria: R Foundation for Statistical Computing. https://www.R-project.org/.

Wickham, Hadley, Winston Chang, Lionel Henry, Thomas Lin Pedersen, Kohske Takahashi, Claus Wilke, and Kara Woo. 2018. Ggplot2: Create Elegant Data Visualisations Using the Grammar of Graphics. https://CRAN.R-project.org/package=ggplot2.

Wickham, Hadley, Romain François, Lionel Henry, and Kirill Müller. 2018. Dplyr: A Grammar of Data Manipulation. https://CRAN.R-project.org/package=dplyr.