Cleaning Data in R

Michael Taylor

2018/05/22

library(downloader)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
# include all packages used in the post
knitr::opts_chunk$set(cache = TRUE)
url <- "https://assets.datacamp.com/production/course_723/datasets/bmi_clean.csv"
filename <- basename(url)
if (!file.exists(filename)) download(url,destfile=filename)
bmi <- read.csv(filename)
glimpse(bmi)
## Observations: 199
## Variables: 30
## $ Country <fct> Afghanistan, Albania, Algeria, Andorra, Angola, Antigu...
## $ Y1980   <dbl> 21.48678, 25.22533, 22.25703, 25.66652, 20.94876, 23.3...
## $ Y1981   <dbl> 21.46552, 25.23981, 22.34745, 25.70868, 20.94371, 23.3...
## $ Y1982   <dbl> 21.45145, 25.25636, 22.43647, 25.74681, 20.93754, 23.4...
## $ Y1983   <dbl> 21.43822, 25.27176, 22.52105, 25.78250, 20.93187, 23.5...
## $ Y1984   <dbl> 21.42734, 25.27901, 22.60633, 25.81874, 20.93569, 23.6...
## $ Y1985   <dbl> 21.41222, 25.28669, 22.69501, 25.85236, 20.94857, 23.7...
## $ Y1986   <dbl> 21.40132, 25.29451, 22.76979, 25.89089, 20.96030, 23.8...
## $ Y1987   <dbl> 21.37679, 25.30217, 22.84096, 25.93414, 20.98025, 23.9...
## $ Y1988   <dbl> 21.34018, 25.30450, 22.90644, 25.98477, 21.01375, 24.0...
## $ Y1989   <dbl> 21.29845, 25.31944, 22.97931, 26.04450, 21.05269, 24.1...
## $ Y1990   <dbl> 21.24818, 25.32357, 23.04600, 26.10936, 21.09007, 24.2...
## $ Y1991   <dbl> 21.20269, 25.28452, 23.11333, 26.17912, 21.12136, 24.3...
## $ Y1992   <dbl> 21.14238, 25.23077, 23.18776, 26.24017, 21.14987, 24.4...
## $ Y1993   <dbl> 21.06376, 25.21192, 23.25764, 26.30356, 21.13938, 24.5...
## $ Y1994   <dbl> 20.97987, 25.22115, 23.32273, 26.36793, 21.14186, 24.6...
## $ Y1995   <dbl> 20.91132, 25.25874, 23.39526, 26.43569, 21.16022, 24.6...
## $ Y1996   <dbl> 20.85155, 25.31097, 23.46811, 26.50769, 21.19076, 24.7...
## $ Y1997   <dbl> 20.81307, 25.33988, 23.54160, 26.58255, 21.22621, 24.7...
## $ Y1998   <dbl> 20.78591, 25.39116, 23.61592, 26.66337, 21.27082, 24.8...
## $ Y1999   <dbl> 20.75469, 25.46555, 23.69486, 26.75078, 21.31954, 24.9...
## $ Y2000   <dbl> 20.69521, 25.55835, 23.77659, 26.83179, 21.37480, 24.9...
## $ Y2001   <dbl> 20.62643, 25.66701, 23.86256, 26.92373, 21.43664, 25.0...
## $ Y2002   <dbl> 20.59848, 25.77167, 23.95294, 27.02525, 21.51765, 25.1...
## $ Y2003   <dbl> 20.58706, 25.87274, 24.05243, 27.12481, 21.59924, 25.2...
## $ Y2004   <dbl> 20.57759, 25.98136, 24.15957, 27.23107, 21.69218, 25.2...
## $ Y2005   <dbl> 20.58084, 26.08939, 24.27001, 27.32827, 21.80564, 25.3...
## $ Y2006   <dbl> 20.58749, 26.20867, 24.38270, 27.43588, 21.93881, 25.5...
## $ Y2007   <dbl> 20.60246, 26.32753, 24.48846, 27.53363, 22.08962, 25.6...
## $ Y2008   <dbl> 20.62058, 26.44657, 24.59620, 27.63048, 22.25083, 25.7...

Getting a feel for your data

# Check the class of bmi
class(bmi)
## [1] "data.frame"
dim(bmi)
## [1] 199  30
names(bmi)
##  [1] "Country" "Y1980"   "Y1981"   "Y1982"   "Y1983"   "Y1984"   "Y1985"  
##  [8] "Y1986"   "Y1987"   "Y1988"   "Y1989"   "Y1990"   "Y1991"   "Y1992"  
## [15] "Y1993"   "Y1994"   "Y1995"   "Y1996"   "Y1997"   "Y1998"   "Y1999"  
## [22] "Y2000"   "Y2001"   "Y2002"   "Y2003"   "Y2004"   "Y2005"   "Y2006"  
## [29] "Y2007"   "Y2008"

Viewing the structure of your data

Load dplyr

library(dplyr)

View a summary of bmi

summary(bmi)
##                 Country        Y1980           Y1981           Y1982      
##  Afghanistan        :  1   Min.   :19.01   Min.   :19.04   Min.   :19.07  
##  Albania            :  1   1st Qu.:21.27   1st Qu.:21.31   1st Qu.:21.36  
##  Algeria            :  1   Median :23.31   Median :23.39   Median :23.46  
##  Andorra            :  1   Mean   :23.15   Mean   :23.21   Mean   :23.26  
##  Angola             :  1   3rd Qu.:24.82   3rd Qu.:24.89   3rd Qu.:24.94  
##  Antigua and Barbuda:  1   Max.   :28.12   Max.   :28.36   Max.   :28.58  
##  (Other)            :193                                                  
##      Y1983           Y1984           Y1985           Y1986      
##  Min.   :19.10   Min.   :19.13   Min.   :19.16   Min.   :19.20  
##  1st Qu.:21.42   1st Qu.:21.45   1st Qu.:21.47   1st Qu.:21.49  
##  Median :23.57   Median :23.64   Median :23.73   Median :23.82  
##  Mean   :23.32   Mean   :23.37   Mean   :23.42   Mean   :23.48  
##  3rd Qu.:25.02   3rd Qu.:25.06   3rd Qu.:25.11   3rd Qu.:25.20  
##  Max.   :28.82   Max.   :29.05   Max.   :29.28   Max.   :29.52  
##                                                                 
##      Y1987           Y1988           Y1989           Y1990      
##  Min.   :19.23   Min.   :19.27   Min.   :19.31   Min.   :19.35  
##  1st Qu.:21.50   1st Qu.:21.52   1st Qu.:21.55   1st Qu.:21.57  
##  Median :23.87   Median :23.93   Median :24.03   Median :24.14  
##  Mean   :23.53   Mean   :23.59   Mean   :23.65   Mean   :23.71  
##  3rd Qu.:25.27   3rd Qu.:25.34   3rd Qu.:25.37   3rd Qu.:25.39  
##  Max.   :29.75   Max.   :29.98   Max.   :30.20   Max.   :30.42  
##                                                                 
##      Y1991           Y1992           Y1993           Y1994      
##  Min.   :19.40   Min.   :19.45   Min.   :19.51   Min.   :19.59  
##  1st Qu.:21.60   1st Qu.:21.65   1st Qu.:21.74   1st Qu.:21.76  
##  Median :24.20   Median :24.19   Median :24.27   Median :24.36  
##  Mean   :23.76   Mean   :23.82   Mean   :23.88   Mean   :23.94  
##  3rd Qu.:25.42   3rd Qu.:25.48   3rd Qu.:25.54   3rd Qu.:25.62  
##  Max.   :30.64   Max.   :30.85   Max.   :31.04   Max.   :31.23  
##                                                                 
##      Y1995           Y1996           Y1997           Y1998      
##  Min.   :19.67   Min.   :19.71   Min.   :19.74   Min.   :19.77  
##  1st Qu.:21.83   1st Qu.:21.89   1st Qu.:21.94   1st Qu.:22.00  
##  Median :24.41   Median :24.42   Median :24.50   Median :24.49  
##  Mean   :24.00   Mean   :24.07   Mean   :24.14   Mean   :24.21  
##  3rd Qu.:25.70   3rd Qu.:25.78   3rd Qu.:25.85   3rd Qu.:25.94  
##  Max.   :31.41   Max.   :31.59   Max.   :31.77   Max.   :31.95  
##                                                                 
##      Y1999           Y2000           Y2001           Y2002      
##  Min.   :19.80   Min.   :19.83   Min.   :19.86   Min.   :19.84  
##  1st Qu.:22.04   1st Qu.:22.12   1st Qu.:22.22   1st Qu.:22.29  
##  Median :24.61   Median :24.66   Median :24.73   Median :24.81  
##  Mean   :24.29   Mean   :24.36   Mean   :24.44   Mean   :24.52  
##  3rd Qu.:26.01   3rd Qu.:26.09   3rd Qu.:26.19   3rd Qu.:26.30  
##  Max.   :32.13   Max.   :32.32   Max.   :32.51   Max.   :32.70  
##                                                                 
##      Y2003           Y2004           Y2005           Y2006      
##  Min.   :19.81   Min.   :19.79   Min.   :19.79   Min.   :19.80  
##  1st Qu.:22.37   1st Qu.:22.45   1st Qu.:22.54   1st Qu.:22.63  
##  Median :24.89   Median :25.00   Median :25.11   Median :25.24  
##  Mean   :24.61   Mean   :24.70   Mean   :24.79   Mean   :24.89  
##  3rd Qu.:26.38   3rd Qu.:26.47   3rd Qu.:26.53   3rd Qu.:26.59  
##  Max.   :32.90   Max.   :33.10   Max.   :33.30   Max.   :33.49  
##                                                                 
##      Y2007           Y2008      
##  Min.   :19.83   Min.   :19.87  
##  1st Qu.:22.73   1st Qu.:22.83  
##  Median :25.36   Median :25.50  
##  Mean   :24.99   Mean   :25.10  
##  3rd Qu.:26.66   3rd Qu.:26.82  
##  Max.   :33.69   Max.   :33.90  
## 
# Histogram of BMIs from 2008
hist(bmi$Y2008)

# Scatter plot comparing BMIs from 1980 to those from 2008
plot(bmi$Y1980, bmi$Y2008)

Tidying data

head(bmi)
##               Country    Y1980    Y1981    Y1982    Y1983    Y1984
## 1         Afghanistan 21.48678 21.46552 21.45145 21.43822 21.42734
## 2             Albania 25.22533 25.23981 25.25636 25.27176 25.27901
## 3             Algeria 22.25703 22.34745 22.43647 22.52105 22.60633
## 4             Andorra 25.66652 25.70868 25.74681 25.78250 25.81874
## 5              Angola 20.94876 20.94371 20.93754 20.93187 20.93569
## 6 Antigua and Barbuda 23.31424 23.39054 23.45883 23.53735 23.63584
##      Y1985    Y1986    Y1987    Y1988    Y1989    Y1990    Y1991    Y1992
## 1 21.41222 21.40132 21.37679 21.34018 21.29845 21.24818 21.20269 21.14238
## 2 25.28669 25.29451 25.30217 25.30450 25.31944 25.32357 25.28452 25.23077
## 3 22.69501 22.76979 22.84096 22.90644 22.97931 23.04600 23.11333 23.18776
## 4 25.85236 25.89089 25.93414 25.98477 26.04450 26.10936 26.17912 26.24017
## 5 20.94857 20.96030 20.98025 21.01375 21.05269 21.09007 21.12136 21.14987
## 6 23.73109 23.83449 23.93649 24.05364 24.16347 24.26782 24.36568 24.45644
##      Y1993    Y1994    Y1995    Y1996    Y1997    Y1998    Y1999    Y2000
## 1 21.06376 20.97987 20.91132 20.85155 20.81307 20.78591 20.75469 20.69521
## 2 25.21192 25.22115 25.25874 25.31097 25.33988 25.39116 25.46555 25.55835
## 3 23.25764 23.32273 23.39526 23.46811 23.54160 23.61592 23.69486 23.77659
## 4 26.30356 26.36793 26.43569 26.50769 26.58255 26.66337 26.75078 26.83179
## 5 21.13938 21.14186 21.16022 21.19076 21.22621 21.27082 21.31954 21.37480
## 6 24.54096 24.60945 24.66461 24.72544 24.78714 24.84936 24.91721 24.99158
##      Y2001    Y2002    Y2003    Y2004    Y2005    Y2006    Y2007    Y2008
## 1 20.62643 20.59848 20.58706 20.57759 20.58084 20.58749 20.60246 20.62058
## 2 25.66701 25.77167 25.87274 25.98136 26.08939 26.20867 26.32753 26.44657
## 3 23.86256 23.95294 24.05243 24.15957 24.27001 24.38270 24.48846 24.59620
## 4 26.92373 27.02525 27.12481 27.23107 27.32827 27.43588 27.53363 27.63048
## 5 21.43664 21.51765 21.59924 21.69218 21.80564 21.93881 22.08962 22.25083
## 6 25.05857 25.13039 25.20713 25.29898 25.39965 25.51382 25.64247 25.76602

Column headers are values not variables.

Gathering columns into key-value pairs

  • Apply the gather() function to bmi, saving the result to bmi_long. This will create two new columns:
    • year, containing as values what are currently column headers
    • bmi_val, the actual BMI values
bmi_long <- bmi %>% 
  gather(year, bmi_val, -Country)
  • View the first 20 rows of bmi_long
head(bmi_long)
##               Country  year  bmi_val
## 1         Afghanistan Y1980 21.48678
## 2             Albania Y1980 25.22533
## 3             Algeria Y1980 22.25703
## 4             Andorra Y1980 25.66652
## 5              Angola Y1980 20.94876
## 6 Antigua and Barbuda Y1980 23.31424

Spreading key-value pairs into columns

  • Use spread() to reverse the operation that you performed in the last exercise with gather(). In other words, make bmi_long wide again, saving the result to bmi_wide
bmi_long %>% 
  spread(year, bmi_val) %>% head()
##               Country    Y1980    Y1981    Y1982    Y1983    Y1984
## 1         Afghanistan 21.48678 21.46552 21.45145 21.43822 21.42734
## 2             Albania 25.22533 25.23981 25.25636 25.27176 25.27901
## 3             Algeria 22.25703 22.34745 22.43647 22.52105 22.60633
## 4             Andorra 25.66652 25.70868 25.74681 25.78250 25.81874
## 5              Angola 20.94876 20.94371 20.93754 20.93187 20.93569
## 6 Antigua and Barbuda 23.31424 23.39054 23.45883 23.53735 23.63584
##      Y1985    Y1986    Y1987    Y1988    Y1989    Y1990    Y1991    Y1992
## 1 21.41222 21.40132 21.37679 21.34018 21.29845 21.24818 21.20269 21.14238
## 2 25.28669 25.29451 25.30217 25.30450 25.31944 25.32357 25.28452 25.23077
## 3 22.69501 22.76979 22.84096 22.90644 22.97931 23.04600 23.11333 23.18776
## 4 25.85236 25.89089 25.93414 25.98477 26.04450 26.10936 26.17912 26.24017
## 5 20.94857 20.96030 20.98025 21.01375 21.05269 21.09007 21.12136 21.14987
## 6 23.73109 23.83449 23.93649 24.05364 24.16347 24.26782 24.36568 24.45644
##      Y1993    Y1994    Y1995    Y1996    Y1997    Y1998    Y1999    Y2000
## 1 21.06376 20.97987 20.91132 20.85155 20.81307 20.78591 20.75469 20.69521
## 2 25.21192 25.22115 25.25874 25.31097 25.33988 25.39116 25.46555 25.55835
## 3 23.25764 23.32273 23.39526 23.46811 23.54160 23.61592 23.69486 23.77659
## 4 26.30356 26.36793 26.43569 26.50769 26.58255 26.66337 26.75078 26.83179
## 5 21.13938 21.14186 21.16022 21.19076 21.22621 21.27082 21.31954 21.37480
## 6 24.54096 24.60945 24.66461 24.72544 24.78714 24.84936 24.91721 24.99158
##      Y2001    Y2002    Y2003    Y2004    Y2005    Y2006    Y2007    Y2008
## 1 20.62643 20.59848 20.58706 20.57759 20.58084 20.58749 20.60246 20.62058
## 2 25.66701 25.77167 25.87274 25.98136 26.08939 26.20867 26.32753 26.44657
## 3 23.86256 23.95294 24.05243 24.15957 24.27001 24.38270 24.48846 24.59620
## 4 26.92373 27.02525 27.12481 27.23107 27.32827 27.43588 27.53363 27.63048
## 5 21.43664 21.51765 21.59924 21.69218 21.80564 21.93881 22.08962 22.25083
## 6 25.05857 25.13039 25.20713 25.29898 25.39965 25.51382 25.64247 25.76602
patient <- c('X', 'Y', 'X', 'Y', 'X', 'Y')
patient <- factor(patient)
treatment <- c("A", "A", "B", "B", "C", "C")
year_mo <- c("2010-10", "2010-10", "2012-08", "2012-08", "2014-12", "2014-12")
response <- c(1, 4, 2, 5, 3, 6)
treatments <- tibble(patient, treatment, year_mo, response)

treatments
## # A tibble: 6 x 4
##   patient treatment year_mo response
##   <fct>   <chr>     <chr>      <dbl>
## 1 X       A         2010-10        1
## 2 Y       A         2010-10        4
## 3 X       B         2012-08        2
## 4 Y       B         2012-08        5
## 5 X       C         2014-12        3
## 6 Y       C         2014-12        6

*Apply the separate() function to treatments

  • Separate year_mo into two columns: year and month
  • Be sure to specify the correct separator with the sep argument
treatments_cc <- treatments %>% 
  separate(year_mo, into = c("year", "month"), sep = "-")
treatments_cc
## # A tibble: 6 x 5
##   patient treatment year  month response
##   <fct>   <chr>     <chr> <chr>    <dbl>
## 1 X       A         2010  10           1
## 2 Y       A         2010  10           4
## 3 X       B         2012  08           2
## 4 Y       B         2012  08           5
## 5 X       C         2014  12           3
## 6 Y       C         2014  12           6
  • Apply the unite() function to treatment
    • Reunite the year and monthy columns into a single column called year_mo
    • Separate each year and month with a dash(-)
treatments<- treatments_cc %>% 
  unite(year_mo, year, month, sep = "-")
treatments
## # A tibble: 6 x 4
##   patient treatment year_mo response
##   <fct>   <chr>     <chr>      <dbl>
## 1 X       A         2010-10        1
## 2 Y       A         2010-10        4
## 3 X       B         2012-08        2
## 4 Y       B         2012-08        5
## 5 X       C         2014-12        3
## 6 Y       C         2014-12        6

Column headers are values, not variable names

url <- "https://assets.datacamp.com/production/course_723/datasets/census-retail.csv"
filename <- basename(url)
if (!file.exists(filename)) download(url,destfile=filename)
census <- read.csv(filename)
glimpse(census)
## Observations: 24
## Variables: 13
## $ YEAR <int> 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 200...
## $ JAN  <int> 146913, 157525, 167504, 182423, 189167, 202414, 209684, 2...
## $ FEB  <int> 147270, 156292, 169652, 179472, 192269, 204273, 209532, 2...
## $ MAR  <int> 146831, 154774, 172775, 180996, 193993, 204965, 210792, 2...
## $ APR  <int> 148082, 158996, 173099, 181702, 194712, 203372, 213623, 2...
## $ MAY  <int> 149015, 160624, 172340, 183543, 196210, 201676, 214619, 2...
## $ JUN  <int> 149821, 160171, 174307, 186088, 196127, 204666, 216324, 2...
## $ JUL  <int> 150809, 162832, 174801, 185470, 196229, 207049, 214853, 2...
## $ AUG  <int> 151064, 162491, 177289, 186814, 196215, 207643, 213669, 2...
## $ SEP  <int> 152595, 163285, 178776, 187338, 198843, 208298, 215712, 2...
## $ OCT  <int> 153577, 164711, 180569, 186546, 200488, 208064, 219465, 2...
## $ NOV  <int> 153605, 166593, 180695, 189052, 200200, 208982, 221150, 2...
## $ DEC  <int> 155504, 168101, 181492, 190809, 201191, 209379, 223226, 2...
  • View the head of census.
head(census)
##   YEAR    JAN    FEB    MAR    APR    MAY    JUN    JUL    AUG    SEP
## 1 1992 146913 147270 146831 148082 149015 149821 150809 151064 152595
## 2 1993 157525 156292 154774 158996 160624 160171 162832 162491 163285
## 3 1994 167504 169652 172775 173099 172340 174307 174801 177289 178776
## 4 1995 182423 179472 180996 181702 183543 186088 185470 186814 187338
## 5 1996 189167 192269 193993 194712 196210 196127 196229 196215 198843
## 6 1997 202414 204273 204965 203372 201676 204666 207049 207643 208298
##      OCT    NOV    DEC
## 1 153577 153605 155504
## 2 164711 166593 168101
## 3 180569 180695 181492
## 4 186546 189052 190809
## 5 200488 200200 201191
## 6 208064 208982 209379
  • Gather the month columns, creating two new columns (month and amount), saving the result to census2.
census2 <- census %>% 
  gather(month, amount, -YEAR)
  • Run the code given to arrange() the rows of census2 by the YEAR column.
census2 <- census2 %>% arrange(YEAR)
  • View the first 20 rows of the result.
head(census2)
##   YEAR month amount
## 1 1992   JAN 146913
## 2 1992   FEB 147270
## 3 1992   MAR 146831
## 4 1992   APR 148082
## 5 1992   MAY 149015
## 6 1992   JUN 149821

Variables are stored in both rows and columns

owner <- c("Jason","Jason", "Jason", "Lisa", "Lisa", "Lisa", "Terrence", "Terrence", "Terrence")
type <- c("dog", "cat", "bird", "dog", "cat", "bird", "dog", "cat", "bird")
num <- c(2, 4, 3, 7, 10, 9, 8, 5, 1)
pets <- tibble(owner, type, num)
pets
## # A tibble: 9 x 3
##   owner    type    num
##   <chr>    <chr> <dbl>
## 1 Jason    dog       2
## 2 Jason    cat       4
## 3 Jason    bird      3
## 4 Lisa     dog       7
## 5 Lisa     cat      10
## 6 Lisa     bird      9
## 7 Terrence dog       8
## 8 Terrence cat       5
## 9 Terrence bird      1
spread(pets, type, num)
## # A tibble: 3 x 4
##   owner     bird   cat   dog
##   <chr>    <dbl> <dbl> <dbl>
## 1 Jason        3     4     2
## 2 Lisa         9    10     7
## 3 Terrence     1     5     8
census_long <- census2 %>% 
  group_by(YEAR) %>% 
  mutate(type = cut(amount,
                    breaks = 3,
                    labels = c('low', 'med', 'high')
                    )
         )
head(census_long, 10)
## # A tibble: 10 x 4
## # Groups:   YEAR [1]
##     YEAR month amount type 
##    <int> <chr>  <int> <fct>
##  1  1992 JAN   146913 low  
##  2  1992 FEB   147270 low  
##  3  1992 MAR   146831 low  
##  4  1992 APR   148082 low  
##  5  1992 MAY   149015 low  
##  6  1992 JUN   149821 med  
##  7  1992 JUL   150809 med  
##  8  1992 AUG   151064 med  
##  9  1992 SEP   152595 med  
## 10  1992 OCT   153577 high
# Spread the type column
census_long2 <- spread(census_long,type, amount) 

# View first 20 rows of census_long2
head(census_long2, 20)
## # A tibble: 20 x 6
## # Groups:   YEAR [2]
##     YEAR month    low    med   high `<NA>`
##    <int> <chr>  <int>  <int>  <int>  <int>
##  1  1992 APR   148082     NA     NA     NA
##  2  1992 AUG       NA 151064     NA     NA
##  3  1992 DEC       NA     NA 155504     NA
##  4  1992 FEB   147270     NA     NA     NA
##  5  1992 JAN   146913     NA     NA     NA
##  6  1992 JUL       NA 150809     NA     NA
##  7  1992 JUN       NA 149821     NA     NA
##  8  1992 MAR   146831     NA     NA     NA
##  9  1992 MAY   149015     NA     NA     NA
## 10  1992 NOV       NA     NA 153605     NA
## 11  1992 OCT       NA     NA 153577     NA
## 12  1992 SEP       NA 152595     NA     NA
## 13  1993 APR   158996     NA     NA     NA
## 14  1993 AUG       NA 162491     NA     NA
## 15  1993 DEC       NA     NA 168101     NA
## 16  1993 FEB   156292     NA     NA     NA
## 17  1993 JAN   157525     NA     NA     NA
## 18  1993 JUL       NA 162832     NA     NA
## 19  1993 JUN       NA 160171     NA     NA
## 20  1993 MAR   154774     NA     NA     NA

Preparing data for analysis

Working with dates using lubridate() functions

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
url <- "https://assets.datacamp.com/production/course_723/datasets/students_with_dates.csv"
filename <- basename(url)
if (!file.exists(filename)) download(url,destfile=filename)
students <- read.csv(filename)
glimpse(students)
## Observations: 395
## Variables: 33
## $ X           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ school      <fct> GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP...
## $ sex         <fct> F, F, F, F, F, M, M, F, M, M, F, F, M, M, M, F, F,...
## $ dob         <fct> 2000-06-05, 1999-11-25, 1998-02-02, 1997-12-20, 19...
## $ address     <fct> U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,...
## $ famsize     <fct> GT3, GT3, LE3, GT3, GT3, LE3, LE3, GT3, LE3, GT3, ...
## $ Pstatus     <fct> A, T, T, T, T, T, T, A, A, T, T, T, T, T, A, T, T,...
## $ Medu        <int> 4, 1, 1, 4, 3, 4, 2, 4, 3, 3, 4, 2, 4, 4, 2, 4, 4,...
## $ Fedu        <int> 4, 1, 1, 2, 3, 3, 2, 4, 2, 4, 4, 1, 4, 3, 2, 4, 4,...
## $ Mjob        <fct> at_home, at_home, at_home, health, other, services...
## $ Fjob        <fct> teacher, other, other, services, other, other, oth...
## $ reason      <fct> course, course, other, home, home, reputation, hom...
## $ guardian    <fct> mother, father, mother, mother, father, mother, mo...
## $ traveltime  <int> 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1, 1,...
## $ studytime   <int> 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 3, 1, 3,...
## $ failures    <int> 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ schoolsup   <fct> yes, no, yes, no, no, no, no, yes, no, no, no, no,...
## $ famsup      <fct> no, yes, no, yes, yes, yes, no, yes, yes, yes, yes...
## $ paid        <fct> no, no, yes, yes, yes, yes, no, no, yes, yes, yes,...
## $ activities  <fct> no, no, no, yes, no, yes, no, no, no, yes, no, yes...
## $ nursery     <fct> yes, no, yes, yes, yes, yes, yes, yes, yes, yes, y...
## $ higher      <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, ...
## $ internet    <fct> no, yes, yes, yes, no, yes, yes, no, yes, yes, yes...
## $ romantic    <fct> no, no, no, yes, no, no, no, no, no, no, no, no, n...
## $ famrel      <int> 4, 5, 4, 3, 4, 5, 4, 4, 4, 5, 3, 5, 4, 5, 4, 4, 3,...
## $ freetime    <int> 3, 3, 3, 2, 3, 4, 4, 1, 2, 5, 3, 2, 3, 4, 5, 4, 2,...
## $ goout       <int> 4, 3, 2, 2, 2, 2, 4, 4, 2, 1, 3, 2, 3, 3, 2, 4, 3,...
## $ Dalc        <int> 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ Walc        <int> 1, 1, 3, 1, 2, 2, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2, 2,...
## $ health      <int> 3, 3, 3, 5, 5, 5, 3, 1, 1, 5, 2, 4, 5, 3, 3, 2, 2,...
## $ nurse_visit <fct> 2014-04-10 14:59:54, 2015-03-12 14:59:54, 2015-09-...
## $ absences    <int> 6, 4, 10, 2, 4, 10, 0, 6, 0, 0, 0, 4, 2, 2, 0, 4, ...
## $ Grades      <fct> 5/6/6, 5/5/6, 7/8/10, 15/14/15, 6/10/10, 15/15/15,...
  • Coerce the following columns:
    • Grades to character
    • Medu to factor (categorical variable representing mother’s education level)
    • Fedu to factor (categorical variable representing father’s education level)
# Coerce Grades to character
students$Grades <- as.character(students$Grades)

# Coerce Medu to factor
students$Medu <- as.factor(students$Medu)

# Coerce Fedu to factor
students$Fedu <- as.factor(students$Fedu)
    
# Look at students once more with s
  • Print “17 Sep 2015” as a date
dmy("17 Sep 2015")
## [1] "2015-09-17"
  • Print “July 15, 2012 12:56” as a date and time (note there are hours and minutes, but no seconds!)
mdy_hm("July 15, 2012 12:56")
## [1] "2012-07-15 12:56:00 UTC"
  • Coerce dob to a date (with no time)
students2 <- students
students$dob <- ymd(students2$dob)
  • Coerce nurse_visit to a date and time
students2$nurse_visit <- ymd_hms(students2$nurse_visit)
  • Use str() to see the changes to students2
glimpse(students2)
## Observations: 395
## Variables: 33
## $ X           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ school      <fct> GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP...
## $ sex         <fct> F, F, F, F, F, M, M, F, M, M, F, F, M, M, M, F, F,...
## $ dob         <fct> 2000-06-05, 1999-11-25, 1998-02-02, 1997-12-20, 19...
## $ address     <fct> U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,...
## $ famsize     <fct> GT3, GT3, LE3, GT3, GT3, LE3, LE3, GT3, LE3, GT3, ...
## $ Pstatus     <fct> A, T, T, T, T, T, T, A, A, T, T, T, T, T, A, T, T,...
## $ Medu        <fct> 4, 1, 1, 4, 3, 4, 2, 4, 3, 3, 4, 2, 4, 4, 2, 4, 4,...
## $ Fedu        <fct> 4, 1, 1, 2, 3, 3, 2, 4, 2, 4, 4, 1, 4, 3, 2, 4, 4,...
## $ Mjob        <fct> at_home, at_home, at_home, health, other, services...
## $ Fjob        <fct> teacher, other, other, services, other, other, oth...
## $ reason      <fct> course, course, other, home, home, reputation, hom...
## $ guardian    <fct> mother, father, mother, mother, father, mother, mo...
## $ traveltime  <int> 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1, 1,...
## $ studytime   <int> 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 3, 1, 3,...
## $ failures    <int> 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ schoolsup   <fct> yes, no, yes, no, no, no, no, yes, no, no, no, no,...
## $ famsup      <fct> no, yes, no, yes, yes, yes, no, yes, yes, yes, yes...
## $ paid        <fct> no, no, yes, yes, yes, yes, no, no, yes, yes, yes,...
## $ activities  <fct> no, no, no, yes, no, yes, no, no, no, yes, no, yes...
## $ nursery     <fct> yes, no, yes, yes, yes, yes, yes, yes, yes, yes, y...
## $ higher      <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, ...
## $ internet    <fct> no, yes, yes, yes, no, yes, yes, no, yes, yes, yes...
## $ romantic    <fct> no, no, no, yes, no, no, no, no, no, no, no, no, n...
## $ famrel      <int> 4, 5, 4, 3, 4, 5, 4, 4, 4, 5, 3, 5, 4, 5, 4, 4, 3,...
## $ freetime    <int> 3, 3, 3, 2, 3, 4, 4, 1, 2, 5, 3, 2, 3, 4, 5, 4, 2,...
## $ goout       <int> 4, 3, 2, 2, 2, 2, 4, 4, 2, 1, 3, 2, 3, 3, 2, 4, 3,...
## $ Dalc        <int> 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ Walc        <int> 1, 1, 3, 1, 2, 2, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2, 2,...
## $ health      <int> 3, 3, 3, 5, 5, 5, 3, 1, 1, 5, 2, 4, 5, 3, 3, 2, 2,...
## $ nurse_visit <dttm> 2014-04-10 14:59:54, 2015-03-12 14:59:54, 2015-09...
## $ absences    <int> 6, 4, 10, 2, 4, 10, 0, 6, 0, 0, 0, 4, 2, 2, 0, 4, ...
## $ Grades      <chr> "5/6/6", "5/5/6", "7/8/10", "15/14/15", "6/10/10",...

Trimming and padding strings

# Load the stringr package
library(stringr)
  • Trim all leading and trailing white space from the first set of strings
# Trim all leading and trailing whitespace

str_trim(c("   Filip ", "Nick  ", " Jonathan"))
## [1] "Filip"    "Nick"     "Jonathan"
  • Pad the second set of strings with leading zeros such that all are 9 characters in length
# Pad these strings with leading zeros

str_pad(c("23485W", "8823453Q", "994Z"), width = 9, pad = 0)
## [1] "00023485W" "08823453Q" "00000994Z"

Upper and lower case

R currently contains the following state.abb from (R Core Team 2018) data sets.

state.abb: character vector of 2-letter abbreviations for the state names.

(states <- state.abb)
##  [1] "AL" "AK" "AZ" "AR" "CA" "CO" "CT" "DE" "FL" "GA" "HI" "ID" "IL" "IN"
## [15] "IA" "KS" "KY" "LA" "ME" "MD" "MA" "MI" "MN" "MS" "MO" "MT" "NE" "NV"
## [29] "NH" "NJ" "NM" "NY" "NC" "ND" "OH" "OK" "OR" "PA" "RI" "SC" "SD" "TN"
## [43] "TX" "UT" "VT" "VA" "WA" "WV" "WI" "WY"
# Make states all uppercase and save result to states_upper
(states_upper <- toupper(states))
##  [1] "AL" "AK" "AZ" "AR" "CA" "CO" "CT" "DE" "FL" "GA" "HI" "ID" "IL" "IN"
## [15] "IA" "KS" "KY" "LA" "ME" "MD" "MA" "MI" "MN" "MS" "MO" "MT" "NE" "NV"
## [29] "NH" "NJ" "NM" "NY" "NC" "ND" "OH" "OK" "OR" "PA" "RI" "SC" "SD" "TN"
## [43] "TX" "UT" "VT" "VA" "WA" "WV" "WI" "WY"
# Make states_upper all lowercase again
tolower(states_upper)
##  [1] "al" "ak" "az" "ar" "ca" "co" "ct" "de" "fl" "ga" "hi" "id" "il" "in"
## [15] "ia" "ks" "ky" "la" "me" "md" "ma" "mi" "mn" "ms" "mo" "mt" "ne" "nv"
## [29] "nh" "nj" "nm" "ny" "nc" "nd" "oh" "ok" "or" "pa" "ri" "sc" "sd" "tn"
## [43] "tx" "ut" "vt" "va" "wa" "wv" "wi" "wy"

Finding and replacing strings

  • Detect all dates of birth (dob) in 1997 using str_detect(). This should return a vector of TRUE and FALSE values.
# Detect all dates of birth (dob) in 1997
head(str_detect(students2$dob, '1997'), 50)
##  [1] FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE
## [12] FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [23]  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE
## [45] FALSE FALSE  TRUE  TRUE  TRUE  TRUE
  • Replace all instances of "F" with "Female" in students2$sex
# In the sex column, replace "F" with "Female"...
students2$sex <- str_replace(students2$sex, "F", "Female")
  • Replace all instances of "M" with "Male" in students2$sex
# ...And "M" with "Male"
students2$sex <- str_replace(students2$sex, "M", "Male")
  • View the head() of students2 to see the result of these replacements
head(students2, 10)
##     X school    sex        dob address famsize Pstatus Medu Fedu     Mjob
## 1   1     GP Female 2000-06-05       U     GT3       A    4    4  at_home
## 2   2     GP Female 1999-11-25       U     GT3       T    1    1  at_home
## 3   3     GP Female 1998-02-02       U     LE3       T    1    1  at_home
## 4   4     GP Female 1997-12-20       U     GT3       T    4    2   health
## 5   5     GP Female 1998-10-04       U     GT3       T    3    3    other
## 6   6     GP   Male 1999-06-16       U     LE3       T    4    3 services
## 7   7     GP   Male 1997-09-20       U     LE3       T    2    2    other
## 8   8     GP Female 1998-07-05       U     GT3       A    4    4    other
## 9   9     GP   Male 2000-03-18       U     LE3       A    3    2 services
## 10 10     GP   Male 1997-03-18       U     GT3       T    3    4    other
##        Fjob     reason guardian traveltime studytime failures schoolsup
## 1   teacher     course   mother          2         2        0       yes
## 2     other     course   father          1         2        0        no
## 3     other      other   mother          1         2        3       yes
## 4  services       home   mother          1         3        0        no
## 5     other       home   father          1         2        0        no
## 6     other reputation   mother          1         2        0        no
## 7     other       home   mother          1         2        0        no
## 8   teacher       home   mother          2         2        0       yes
## 9     other       home   mother          1         2        0        no
## 10    other       home   mother          1         2        0        no
##    famsup paid activities nursery higher internet romantic famrel freetime
## 1      no   no         no     yes    yes       no       no      4        3
## 2     yes   no         no      no    yes      yes       no      5        3
## 3      no  yes         no     yes    yes      yes       no      4        3
## 4     yes  yes        yes     yes    yes      yes      yes      3        2
## 5     yes  yes         no     yes    yes       no       no      4        3
## 6     yes  yes        yes     yes    yes      yes       no      5        4
## 7      no   no         no     yes    yes      yes       no      4        4
## 8     yes   no         no     yes    yes       no       no      4        1
## 9     yes  yes         no     yes    yes      yes       no      4        2
## 10    yes  yes        yes     yes    yes      yes       no      5        5
##    goout Dalc Walc health         nurse_visit absences   Grades
## 1      4    1    1      3 2014-04-10 14:59:54        6    5/6/6
## 2      3    1    1      3 2015-03-12 14:59:54        4    5/5/6
## 3      2    2    3      3 2015-09-21 14:59:54       10   7/8/10
## 4      2    1    1      5 2015-09-03 14:59:54        2 15/14/15
## 5      2    1    2      5 2015-04-07 14:59:54        4  6/10/10
## 6      2    1    2      5 2013-11-15 14:59:54       10 15/15/15
## 7      4    1    1      3 2015-09-20 14:59:54        0 12/12/11
## 8      4    1    1      1 2015-02-01 14:59:54        6    6/5/6
## 9      2    1    1      1 2015-04-12 14:59:54        0 16/18/19
## 10     1    1    1      5 2014-12-30 14:59:54        0 14/15/15

Finding missing values

name <- factor(c('Sarah', 'Tom', 'David', 'Alice'))
n_friends <- c(244, NA, 145,  43)
status <- factor(c("   Going out!", "","Movie night...", ""))
(social_df <- tibble(name, n_friends, status))
## # A tibble: 4 x 3
##   name  n_friends status         
##   <fct>     <dbl> <fct>          
## 1 Sarah       244 "   Going out!"
## 2 Tom          NA ""             
## 3 David       145 Movie night... 
## 4 Alice        43 ""
  • Call is.na() on social_df to spot all NA values.
is.na(social_df)
##       name n_friends status
## [1,] FALSE     FALSE  FALSE
## [2,] FALSE      TRUE  FALSE
## [3,] FALSE     FALSE  FALSE
## [4,] FALSE     FALSE  FALSE
  • Wrap the above with the any() function to ask the question “Are there any NA values in my dataset?”.
any(is.na(social_df))
## [1] TRUE
  • View a summary() of the dataset to see how missing values are broken out.
summary(social_df)
##     name     n_friends                status 
##  Alice:1   Min.   : 43.0                 :2  
##  David:1   1st Qu.: 94.0      Going out! :1  
##  Sarah:1   Median :145.0   Movie night...:1  
##  Tom  :1   Mean   :144.0                     
##            3rd Qu.:194.5                     
##            Max.   :244.0                     
##            NA's   :1
  • Use table to identify odd values of the status variable.
table(social_df$status)
## 
##                    Going out! Movie night... 
##              2              1              1

Dealing with missing values

  • Replace all empty strings (i.e. “”) with NA in the status column of social_df.
social_df$status[social_df$status == ""] <- NA
  • Print the updated version of social_df to confirm your changes.
social_df
## # A tibble: 4 x 3
##   name  n_friends status         
##   <fct>     <dbl> <fct>          
## 1 Sarah       244 "   Going out!"
## 2 Tom          NA <NA>           
## 3 David       145 Movie night... 
## 4 Alice        43 <NA>
  • Use complete.cases() to return a vector containing TRUE and FALSE to see which rows have NO missing values.
complete.cases(social_df)
## [1]  TRUE FALSE  TRUE FALSE
  • Use na.omit() to remove all rows with one or more missing values (without saving the result).
na.omit(social_df)
## # A tibble: 2 x 3
##   name  n_friends status         
##   <fct>     <dbl> <fct>          
## 1 Sarah       244 "   Going out!"
## 2 David       145 Movie night...
social_df[complete.cases(social_df),]
## # A tibble: 2 x 3
##   name  n_friends status         
##   <fct>     <dbl> <fct>          
## 1 Sarah       244 "   Going out!"
## 2 David       145 Movie night...

Dealing with outliers and obvious errors

students3 <- select(students, dob, absences)
span <- interval(students3$dob, Sys.Date())
students3 <- students3 %>% 
  mutate(age = year(as.period(span, unit = "year")
                    )
         )
head(students3)
##          dob absences age
## 1 2000-06-05        6  17
## 2 1999-11-25        4  18
## 3 1998-02-02       10  20
## 4 1997-12-20        2  20
## 5 1998-10-04        4  19
## 6 1999-06-16       10  18
  • Call summary() on the full students3 dataset to expose the concerning values of age and absences.
# Look at a summary() of students3
summary(students3)
##       dob                absences           age       
##  Min.   :1996-11-02   Min.   : 0.000   Min.   :17.00  
##  1st Qu.:1997-11-04   1st Qu.: 0.000   1st Qu.:18.00  
##  Median :1998-12-16   Median : 4.000   Median :19.00  
##  Mean   :1998-10-30   Mean   : 5.709   Mean   :19.06  
##  3rd Qu.:1999-10-29   3rd Qu.: 8.000   3rd Qu.:20.00  
##  Max.   :2000-10-25   Max.   :75.000   Max.   :21.00
  • View a histogram (using hist()) of the age variable.
# View a histogram of the age variable
hist(students3$age)

  • View a histogram of the absences variable.
# View a histogram of the absences variable
hist(students3$absences)

  • View another histogram of absences, but force values of zero to be bucketed to the right of zero on the x-axis with right = FALSE (see ?hist for more info).
# View a histogram of absences, but force zeros to be bucketed to the right of zero
hist(students3$absences, right=FALSE)

Putting it all together

con <- "https://assets.datacamp.com/production/course_723/datasets/weather.rds"
weather <- readRDS(gzcon(url(con)))
head(weather)
##   X year month           measure X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## 1 1 2014    12  Max.TemperatureF 64 42 51 43 42 45 38 29 49  48  39  39
## 2 2 2014    12 Mean.TemperatureF 52 38 44 37 34 42 30 24 39  43  36  35
## 3 3 2014    12  Min.TemperatureF 39 33 37 30 26 38 21 18 29  38  32  31
## 4 4 2014    12    Max.Dew.PointF 46 40 49 24 37 45 36 28 49  45  37  28
## 5 5 2014    12    MeanDew.PointF 40 27 42 21 25 40 20 16 41  39  31  27
## 6 6 2014    12     Min.DewpointF 26 17 24 13 12 36 -3  3 28  37  27  25
##   X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30
## 1  42  45  42  44  49  44  37  36  36  44  47  46  59  50  52  52  41  30
## 2  37  39  37  40  45  40  33  32  33  39  45  44  52  44  45  46  36  26
## 3  32  33  32  35  41  36  29  27  30  33  42  41  44  37  38  40  30  22
## 4  28  29  33  42  46  34  25  30  30  39  45  46  58  31  34  42  26  10
## 5  26  27  29  36  41  30  22  24  27  34  42  44  43  29  31  35  20   4
## 6  24  25  27  30  32  26  20  20  25  25  37  41  29  28  29  27  10  -6
##   X31
## 1  30
## 2  25
## 3  20
## 4   8
## 5   5
## 6   1
  • Check that it’s a data.frame using the function class()
  • Look at the dimensions
  • View the column names
class(weather)
## [1] "data.frame"
# View the column names
names(weather)
##  [1] "X"       "year"    "month"   "measure" "X1"      "X2"      "X3"     
##  [8] "X4"      "X5"      "X6"      "X7"      "X8"      "X9"      "X10"    
## [15] "X11"     "X12"     "X13"     "X14"     "X15"     "X16"     "X17"    
## [22] "X18"     "X19"     "X20"     "X21"     "X22"     "X23"     "X24"    
## [29] "X25"     "X26"     "X27"     "X28"     "X29"     "X30"     "X31"
# Check the dimensions
dim(weather)
## [1] 286  35

Summarize the data

# Look at the structure using dplyr's glimpse()
glimpse(weather)
## Observations: 286
## Variables: 35
## $ X       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ year    <int> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, ...
## $ month   <int> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12...
## $ measure <chr> "Max.TemperatureF", "Mean.TemperatureF", "Min.Temperat...
## $ X1      <chr> "64", "52", "39", "46", "40", "26", "74", "63", "52", ...
## $ X2      <chr> "42", "38", "33", "40", "27", "17", "92", "72", "51", ...
## $ X3      <chr> "51", "44", "37", "49", "42", "24", "100", "79", "57",...
## $ X4      <chr> "43", "37", "30", "24", "21", "13", "69", "54", "39", ...
## $ X5      <chr> "42", "34", "26", "37", "25", "12", "85", "66", "47", ...
## $ X6      <chr> "45", "42", "38", "45", "40", "36", "100", "93", "85",...
## $ X7      <chr> "38", "30", "21", "36", "20", "-3", "92", "61", "29", ...
## $ X8      <chr> "29", "24", "18", "28", "16", "3", "92", "70", "47", "...
## $ X9      <chr> "49", "39", "29", "49", "41", "28", "100", "93", "86",...
## $ X10     <chr> "48", "43", "38", "45", "39", "37", "100", "95", "89",...
## $ X11     <chr> "39", "36", "32", "37", "31", "27", "92", "87", "82", ...
## $ X12     <chr> "39", "35", "31", "28", "27", "25", "85", "75", "64", ...
## $ X13     <chr> "42", "37", "32", "28", "26", "24", "75", "65", "55", ...
## $ X14     <chr> "45", "39", "33", "29", "27", "25", "82", "68", "53", ...
## $ X15     <chr> "42", "37", "32", "33", "29", "27", "89", "75", "60", ...
## $ X16     <chr> "44", "40", "35", "42", "36", "30", "96", "85", "73", ...
## $ X17     <chr> "49", "45", "41", "46", "41", "32", "100", "85", "70",...
## $ X18     <chr> "44", "40", "36", "34", "30", "26", "89", "73", "57", ...
## $ X19     <chr> "37", "33", "29", "25", "22", "20", "69", "63", "56", ...
## $ X20     <chr> "36", "32", "27", "30", "24", "20", "89", "79", "69", ...
## $ X21     <chr> "36", "33", "30", "30", "27", "25", "85", "77", "69", ...
## $ X22     <chr> "44", "39", "33", "39", "34", "25", "89", "79", "69", ...
## $ X23     <chr> "47", "45", "42", "45", "42", "37", "100", "91", "82",...
## $ X24     <chr> "46", "44", "41", "46", "44", "41", "100", "98", "96",...
## $ X25     <chr> "59", "52", "44", "58", "43", "29", "100", "75", "49",...
## $ X26     <chr> "50", "44", "37", "31", "29", "28", "70", "60", "49", ...
## $ X27     <chr> "52", "45", "38", "34", "31", "29", "70", "60", "50", ...
## $ X28     <chr> "52", "46", "40", "42", "35", "27", "76", "65", "53", ...
## $ X29     <chr> "41", "36", "30", "26", "20", "10", "64", "51", "37", ...
## $ X30     <chr> "30", "26", "22", "10", "4", "-6", "50", "38", "26", "...
## $ X31     <chr> "30", "25", "20", "8", "5", "1", "57", "44", "31", "30...
# View a summary of the data
summary(weather)
##        X               year          month          measure         
##  Min.   :  1.00   Min.   :2014   Min.   : 1.000   Length:286        
##  1st Qu.: 72.25   1st Qu.:2015   1st Qu.: 4.000   Class :character  
##  Median :143.50   Median :2015   Median : 7.000   Mode  :character  
##  Mean   :143.50   Mean   :2015   Mean   : 6.923                     
##  3rd Qu.:214.75   3rd Qu.:2015   3rd Qu.:10.000                     
##  Max.   :286.00   Max.   :2015   Max.   :12.000                     
##       X1                 X2                 X3           
##  Length:286         Length:286         Length:286        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##       X4                 X5                 X6           
##  Length:286         Length:286         Length:286        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##       X7                 X8                 X9           
##  Length:286         Length:286         Length:286        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      X10                X11                X12           
##  Length:286         Length:286         Length:286        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      X13                X14                X15           
##  Length:286         Length:286         Length:286        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      X16                X17                X18           
##  Length:286         Length:286         Length:286        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      X19                X20                X21           
##  Length:286         Length:286         Length:286        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      X22                X23                X24           
##  Length:286         Length:286         Length:286        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      X25                X26                X27           
##  Length:286         Length:286         Length:286        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      X28                X29                X30           
##  Length:286         Length:286         Length:286        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      X31           
##  Length:286        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

Take a closer look

head(weather)
##   X year month           measure X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## 1 1 2014    12  Max.TemperatureF 64 42 51 43 42 45 38 29 49  48  39  39
## 2 2 2014    12 Mean.TemperatureF 52 38 44 37 34 42 30 24 39  43  36  35
## 3 3 2014    12  Min.TemperatureF 39 33 37 30 26 38 21 18 29  38  32  31
## 4 4 2014    12    Max.Dew.PointF 46 40 49 24 37 45 36 28 49  45  37  28
## 5 5 2014    12    MeanDew.PointF 40 27 42 21 25 40 20 16 41  39  31  27
## 6 6 2014    12     Min.DewpointF 26 17 24 13 12 36 -3  3 28  37  27  25
##   X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30
## 1  42  45  42  44  49  44  37  36  36  44  47  46  59  50  52  52  41  30
## 2  37  39  37  40  45  40  33  32  33  39  45  44  52  44  45  46  36  26
## 3  32  33  32  35  41  36  29  27  30  33  42  41  44  37  38  40  30  22
## 4  28  29  33  42  46  34  25  30  30  39  45  46  58  31  34  42  26  10
## 5  26  27  29  36  41  30  22  24  27  34  42  44  43  29  31  35  20   4
## 6  24  25  27  30  32  26  20  20  25  25  37  41  29  28  29  27  10  -6
##   X31
## 1  30
## 2  25
## 3  20
## 4   8
## 5   5
## 6   1
tail(weather)
##       X year month            measure   X1   X2   X3   X4   X5   X6   X7
## 281 281 2015    12 Mean.Wind.SpeedMPH    6 <NA> <NA> <NA> <NA> <NA> <NA>
## 282 282 2015    12  Max.Gust.SpeedMPH   17 <NA> <NA> <NA> <NA> <NA> <NA>
## 283 283 2015    12    PrecipitationIn 0.14 <NA> <NA> <NA> <NA> <NA> <NA>
## 284 284 2015    12         CloudCover    7 <NA> <NA> <NA> <NA> <NA> <NA>
## 285 285 2015    12             Events Rain <NA> <NA> <NA> <NA> <NA> <NA>
## 286 286 2015    12     WindDirDegrees  109 <NA> <NA> <NA> <NA> <NA> <NA>
##       X8   X9  X10  X11  X12  X13  X14  X15  X16  X17  X18  X19  X20  X21
## 281 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 282 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 283 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 284 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 285 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 286 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
##      X22  X23  X24  X25  X26  X27  X28  X29  X30  X31
## 281 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 282 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 283 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 284 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 285 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 286 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>

Column names are values

  • Call gather() on the weather data to gather columns X1-X31. The two columns created as a result should be called day and value. Save the result as weather2
weather2 <- weather %>% 
  gather(day, value, X1:X31)
head(weather2)
##   X year month           measure day value
## 1 1 2014    12  Max.TemperatureF  X1    64
## 2 2 2014    12 Mean.TemperatureF  X1    52
## 3 3 2014    12  Min.TemperatureF  X1    39
## 4 4 2014    12    Max.Dew.PointF  X1    46
## 5 5 2014    12    MeanDew.PointF  X1    40
## 6 6 2014    12     Min.DewpointF  X1    26

Values are variable names

  • Drop the first column.
# First remove column of row names
weather2 <- select(weather2, -1)
  • Spread the measure column of weather2 and save the result to weather3
# Spread the data
weather3 <- spread(weather2, measure, value)
# View the head
head(weather3)
##   year month day CloudCover    Events Max.Dew.PointF Max.Gust.SpeedMPH
## 1 2014    12  X1          6      Rain             46                29
## 2 2014    12 X10          8      Rain             45                29
## 3 2014    12 X11          8 Rain-Snow             37                28
## 4 2014    12 X12          7      Snow             28                21
## 5 2014    12 X13          5                       28                23
## 6 2014    12 X14          4                       29                20
##   Max.Humidity Max.Sea.Level.PressureIn Max.TemperatureF
## 1           74                    30.45               64
## 2          100                    29.58               48
## 3           92                    29.81               39
## 4           85                    29.88               39
## 5           75                    29.86               42
## 6           82                    29.91               45
##   Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity
## 1                  10                22            63
## 2                  10                23            95
## 3                  10                21            87
## 4                  10                16            75
## 5                  10                17            65
## 6                  10                15            68
##   Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
## 1                     30.13                52                   10
## 2                      29.5                43                    3
## 3                     29.61                36                    7
## 4                     29.85                35                   10
## 5                     29.82                37                   10
## 6                     29.83                39                   10
##   Mean.Wind.SpeedMPH MeanDew.PointF Min.DewpointF Min.Humidity
## 1                 13             40            26           52
## 2                 13             39            37           89
## 3                 13             31            27           82
## 4                 11             27            25           64
## 5                 12             26            24           55
## 6                 10             27            25           53
##   Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
## 1                    30.01               39                  10
## 2                    29.43               38                   1
## 3                    29.44               32                   1
## 4                    29.81               31                   7
## 5                    29.78               32                  10
## 6                    29.78               33                  10
##   PrecipitationIn WindDirDegrees
## 1            0.01            268
## 2            0.28            357
## 3            0.02            230
## 4               T            286
## 5               T            298
## 6            0.00            306

Clean up dates

  • Load the stringr and lubridate packages
  • Use stringr’s str_replace() to remove the Xs from the day column of weather3
# Remove X's from day column
weather3$day <- str_replace(weather3$day, "X","")
  • Create a new column called date. Use the unite() function from tidyr to paste together the year, month, and day columns in order, using - as a separator (see ?unite if you need help)
# Unite the year, month, and day columns
weather4 <- unite(weather3, date, year, month, day, sep = "-")
  • Coerce the date column using the appropriate function from lubridate
# Convert date column to proper date format using lubridates's ymd()
weather4$date <- ymd(weather4$date)
## Warning: 7 failed to parse.
  • Use the code provided (select()) to reorder columns, saving the result to weather5
# Rearrange columns using dplyr's select()
weather5 <- select(weather4, date, Events, CloudCover:WindDirDegrees)
  • View the head of weather5
head(weather5)
##         date    Events CloudCover Max.Dew.PointF Max.Gust.SpeedMPH
## 1 2014-12-01      Rain          6             46                29
## 2 2014-12-10      Rain          8             45                29
## 3 2014-12-11 Rain-Snow          8             37                28
## 4 2014-12-12      Snow          7             28                21
## 5 2014-12-13                    5             28                23
## 6 2014-12-14                    4             29                20
##   Max.Humidity Max.Sea.Level.PressureIn Max.TemperatureF
## 1           74                    30.45               64
## 2          100                    29.58               48
## 3           92                    29.81               39
## 4           85                    29.88               39
## 5           75                    29.86               42
## 6           82                    29.91               45
##   Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity
## 1                  10                22            63
## 2                  10                23            95
## 3                  10                21            87
## 4                  10                16            75
## 5                  10                17            65
## 6                  10                15            68
##   Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
## 1                     30.13                52                   10
## 2                      29.5                43                    3
## 3                     29.61                36                    7
## 4                     29.85                35                   10
## 5                     29.82                37                   10
## 6                     29.83                39                   10
##   Mean.Wind.SpeedMPH MeanDew.PointF Min.DewpointF Min.Humidity
## 1                 13             40            26           52
## 2                 13             39            37           89
## 3                 13             31            27           82
## 4                 11             27            25           64
## 5                 12             26            24           55
## 6                 10             27            25           53
##   Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
## 1                    30.01               39                  10
## 2                    29.43               38                   1
## 3                    29.44               32                   1
## 4                    29.81               31                   7
## 5                    29.78               32                  10
## 6                    29.78               33                  10
##   PrecipitationIn WindDirDegrees
## 1            0.01            268
## 2            0.28            357
## 3            0.02            230
## 4               T            286
## 5               T            298
## 6            0.00            306

A closer look at column types

  • Use str() to see how variables are stored in weather5
str(weather5)
## 'data.frame':    403 obs. of  23 variables:
##  $ date                     : Date, format: "2014-12-01" "2014-12-10" ...
##  $ Events                   : chr  "Rain" "Rain" "Rain-Snow" "Snow" ...
##  $ CloudCover               : chr  "6" "8" "8" "7" ...
##  $ Max.Dew.PointF           : chr  "46" "45" "37" "28" ...
##  $ Max.Gust.SpeedMPH        : chr  "29" "29" "28" "21" ...
##  $ Max.Humidity             : chr  "74" "100" "92" "85" ...
##  $ Max.Sea.Level.PressureIn : chr  "30.45" "29.58" "29.81" "29.88" ...
##  $ Max.TemperatureF         : chr  "64" "48" "39" "39" ...
##  $ Max.VisibilityMiles      : chr  "10" "10" "10" "10" ...
##  $ Max.Wind.SpeedMPH        : chr  "22" "23" "21" "16" ...
##  $ Mean.Humidity            : chr  "63" "95" "87" "75" ...
##  $ Mean.Sea.Level.PressureIn: chr  "30.13" "29.5" "29.61" "29.85" ...
##  $ Mean.TemperatureF        : chr  "52" "43" "36" "35" ...
##  $ Mean.VisibilityMiles     : chr  "10" "3" "7" "10" ...
##  $ Mean.Wind.SpeedMPH       : chr  "13" "13" "13" "11" ...
##  $ MeanDew.PointF           : chr  "40" "39" "31" "27" ...
##  $ Min.DewpointF            : chr  "26" "37" "27" "25" ...
##  $ Min.Humidity             : chr  "52" "89" "82" "64" ...
##  $ Min.Sea.Level.PressureIn : chr  "30.01" "29.43" "29.44" "29.81" ...
##  $ Min.TemperatureF         : chr  "39" "38" "32" "31" ...
##  $ Min.VisibilityMiles      : chr  "10" "1" "1" "7" ...
##  $ PrecipitationIn          : chr  "0.01" "0.28" "0.02" "T" ...
##  $ WindDirDegrees           : chr  "268" "357" "230" "286" ...
  • View the first 20 rows of weather5. Keep an eye out for strange values!
head(weather5, 20)
##          date    Events CloudCover Max.Dew.PointF Max.Gust.SpeedMPH
## 1  2014-12-01      Rain          6             46                29
## 2  2014-12-10      Rain          8             45                29
## 3  2014-12-11 Rain-Snow          8             37                28
## 4  2014-12-12      Snow          7             28                21
## 5  2014-12-13                    5             28                23
## 6  2014-12-14                    4             29                20
## 7  2014-12-15                    2             33                21
## 8  2014-12-16      Rain          8             42                10
## 9  2014-12-17      Rain          8             46                26
## 10 2014-12-18      Rain          7             34                30
## 11 2014-12-19                    4             25                23
## 12 2014-12-02 Rain-Snow          7             40                29
## 13 2014-12-20      Snow          6             30                26
## 14 2014-12-21      Snow          8             30                20
## 15 2014-12-22      Rain          7             39                22
## 16 2014-12-23      Rain          8             45                25
## 17 2014-12-24  Fog-Rain          8             46                15
## 18 2014-12-25      Rain          6             58                40
## 19 2014-12-26                    1             31                25
## 20 2014-12-27                    3             34                21
##    Max.Humidity Max.Sea.Level.PressureIn Max.TemperatureF
## 1            74                    30.45               64
## 2           100                    29.58               48
## 3            92                    29.81               39
## 4            85                    29.88               39
## 5            75                    29.86               42
## 6            82                    29.91               45
## 7            89                    30.15               42
## 8            96                    30.17               44
## 9           100                    29.91               49
## 10           89                    29.87               44
## 11           69                    30.15               37
## 12           92                    30.71               42
## 13           89                    30.31               36
## 14           85                    30.37               36
## 15           89                     30.4               44
## 16          100                    30.31               47
## 17          100                    30.13               46
## 18          100                    29.96               59
## 19           70                    30.16               50
## 20           70                    30.22               52
##    Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity
## 1                   10                22            63
## 2                   10                23            95
## 3                   10                21            87
## 4                   10                16            75
## 5                   10                17            65
## 6                   10                15            68
## 7                   10                15            75
## 8                   10                 8            85
## 9                   10                20            85
## 10                  10                23            73
## 11                  10                17            63
## 12                  10                24            72
## 13                  10                21            79
## 14                  10                16            77
## 15                  10                18            79
## 16                  10                20            91
## 17                   2                13            98
## 18                  10                28            75
## 19                  10                18            60
## 20                  10                17            60
##    Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
## 1                      30.13                52                   10
## 2                       29.5                43                    3
## 3                      29.61                36                    7
## 4                      29.85                35                   10
## 5                      29.82                37                   10
## 6                      29.83                39                   10
## 7                      30.05                37                   10
## 8                      30.09                40                    9
## 9                      29.75                45                    6
## 10                     29.78                40                   10
## 11                     29.98                33                   10
## 12                     30.59                38                    8
## 13                     30.26                32                   10
## 14                     30.32                33                    9
## 15                     30.35                39                   10
## 16                     30.23                45                    5
## 17                      29.9                44                    1
## 18                     29.63                52                    8
## 19                     30.11                44                   10
## 20                     30.14                45                   10
##    Mean.Wind.SpeedMPH MeanDew.PointF Min.DewpointF Min.Humidity
## 1                  13             40            26           52
## 2                  13             39            37           89
## 3                  13             31            27           82
## 4                  11             27            25           64
## 5                  12             26            24           55
## 6                  10             27            25           53
## 7                   6             29            27           60
## 8                   4             36            30           73
## 9                  11             41            32           70
## 10                 14             30            26           57
## 11                 11             22            20           56
## 12                 15             27            17           51
## 13                 10             24            20           69
## 14                  9             27            25           69
## 15                  8             34            25           69
## 16                 13             42            37           82
## 17                  6             44            41           96
## 18                 14             43            29           49
## 19                 11             29            28           49
## 20                  9             31            29           50
##    Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
## 1                     30.01               39                  10
## 2                     29.43               38                   1
## 3                     29.44               32                   1
## 4                     29.81               31                   7
## 5                     29.78               32                  10
## 6                     29.78               33                  10
## 7                     29.91               32                  10
## 8                     29.92               35                   5
## 9                     29.69               41                   1
## 10                    29.71               36                  10
## 11                    29.86               29                  10
## 12                     30.4               33                   2
## 13                    30.17               27                   7
## 14                    30.28               30                   6
## 15                     30.3               33                   4
## 16                    30.16               42                   1
## 17                    29.55               41                   0
## 18                    29.47               44                   1
## 19                    29.99               37                  10
## 20                    30.03               38                  10
##    PrecipitationIn WindDirDegrees
## 1             0.01            268
## 2             0.28            357
## 3             0.02            230
## 4                T            286
## 5                T            298
## 6             0.00            306
## 7             0.00            324
## 8                T             79
## 9             0.43            311
## 10            0.01            281
## 11            0.00            305
## 12            0.10             62
## 13               T            350
## 14               T              2
## 15            0.05             24
## 16            0.25             63
## 17            0.56             12
## 18            0.14            250
## 19            0.00            255
## 20            0.00            251
  • Try coercing the PrecipitationIn column of weather5 to numeric without saving the result
as.numeric(weather5$PrecipitationIn)
## Warning: NAs introduced by coercion
##   [1] 0.01 0.28 0.02   NA   NA 0.00 0.00   NA 0.43 0.01 0.00 0.10   NA   NA
##  [15] 0.05 0.25 0.56 0.14 0.00 0.00 0.01 0.00 0.44 0.00 0.00 0.00 0.11 1.09
##  [29] 0.13 0.03 2.90 0.00 0.00 0.00 0.20 0.00   NA 0.12 0.00 0.00 0.15 0.00
##  [43] 0.00 0.00 0.00   NA 0.00 0.71 0.00 0.10 0.95 0.01   NA 0.62 0.06 0.05
##  [57] 0.57 0.00 0.02   NA 0.00 0.01 0.00 0.05 0.01 0.03 0.00 0.23 0.39 0.00
##  [71] 0.02 0.01 0.06 0.78 0.00 0.17 0.11 0.00   NA 0.07 0.02 0.00 0.00   NA
##  [85] 0.00   NA   NA 0.00 0.09   NA 0.07 0.37 0.88 0.17 0.06 0.01 0.00 0.00
##  [99] 0.80 0.27 0.00 0.14 0.00 0.00 0.01 0.05 0.09 0.00 0.00 0.00 0.04 0.80
## [113] 0.21 0.12 0.00 0.26   NA 0.00 0.02   NA 0.00 0.00   NA 0.00 0.00 0.09
## [127] 0.00 0.00 0.00 0.01 0.00 0.00 0.06 0.00 0.00 0.00 0.61 0.54   NA 0.00
## [141]   NA 0.00 0.00 0.10 0.07 0.00 0.03 0.00   NA 0.39 0.00 0.00 0.03 0.26
## [155] 0.09 0.00 0.00 0.00 0.02 0.00 0.00 0.00   NA 0.00 0.00 0.27 0.00 0.00
## [169] 0.00   NA 0.00 0.00   NA 0.00 0.00   NA 0.00 0.00 0.00 0.91 0.00 0.02
## [183] 0.00 0.00 0.00 0.00 0.38 0.00 0.00 0.00   NA 0.00 0.40   NA 0.00 0.00
## [197] 0.00 0.74 0.04 1.72 0.00 0.01 0.00 0.00   NA 0.20 1.43   NA 0.00 0.00
## [211]   NA 0.00   NA 0.09 0.00   NA   NA 0.50 1.12 0.00 0.00 0.00 0.03   NA
## [225] 0.00   NA 0.14   NA 0.00   NA   NA 0.00 0.00 0.01 0.00   NA 0.06 0.00
## [239] 0.00 0.00 0.02 0.00   NA 0.00 0.00 0.02   NA 0.15   NA 0.00 0.83 0.00
## [253] 0.00 0.00 0.08 0.00 0.00 0.14 0.00 0.00 0.00 0.63   NA 0.02   NA 0.00
## [267]   NA 0.00 0.00 0.00 0.00 0.00 0.00 0.49 0.00 0.00 0.00 0.00 0.00 0.00
## [281] 0.17 0.66 0.01 0.38 0.00 0.00 0.00 0.00 0.00 0.00 0.00   NA 0.00 0.00
## [295] 0.00 0.00 0.00 0.00 0.00 0.00 0.04 0.01 2.46   NA   NA 0.00 0.00 0.00
## [309] 0.20 0.00   NA 0.00 0.00 0.00 0.12 0.00 0.00   NA   NA   NA 0.00 0.08
## [323]   NA 0.07   NA 0.00 0.00 0.03 0.00 0.00 0.36 0.73 0.01 0.00 0.00 0.00
## [337] 0.00 0.00 0.00 0.00 0.34   NA 0.07 0.54 0.04 0.01 0.00 0.00 0.00 0.00
## [351] 0.00   NA 0.00 0.86 0.00 0.30 0.04 0.00 0.00 0.00 0.00 0.21 0.00 0.00
## [365] 0.00   NA 0.00 0.00 0.00 0.00 0.00 0.00 0.14   NA   NA   NA   NA   NA
## [379]   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## [393]   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA

Column type conversions

Use str_replace() from stringr to make the proper replacements in the PrecipitationIn column of weather5

# Replace T with 0 (T = trace)
weather5$PrecipitationIn <- str_replace(weather5$PrecipitationIn, "T", "0")

Run the call to mutate_each as-is to conveniently apply as.numeric() to all columns from CloudCover through WindDirDegrees (reading left to right in the data), saving the result to weather6

# Convert characters to numerics
weather6 <- weather5 %>% mutate_at( .vars=vars(CloudCover:WindDirDegrees), funs(as.numeric) )

View the structure of weather6 to confirm the coercions were successful

# Look at result
str(weather6)
## 'data.frame':    403 obs. of  23 variables:
##  $ date                     : Date, format: "2014-12-01" "2014-12-10" ...
##  $ Events                   : chr  "Rain" "Rain" "Rain-Snow" "Snow" ...
##  $ CloudCover               : num  6 8 8 7 5 4 2 8 8 7 ...
##  $ Max.Dew.PointF           : num  46 45 37 28 28 29 33 42 46 34 ...
##  $ Max.Gust.SpeedMPH        : num  29 29 28 21 23 20 21 10 26 30 ...
##  $ Max.Humidity             : num  74 100 92 85 75 82 89 96 100 89 ...
##  $ Max.Sea.Level.PressureIn : num  30.4 29.6 29.8 29.9 29.9 ...
##  $ Max.TemperatureF         : num  64 48 39 39 42 45 42 44 49 44 ...
##  $ Max.VisibilityMiles      : num  10 10 10 10 10 10 10 10 10 10 ...
##  $ Max.Wind.SpeedMPH        : num  22 23 21 16 17 15 15 8 20 23 ...
##  $ Mean.Humidity            : num  63 95 87 75 65 68 75 85 85 73 ...
##  $ Mean.Sea.Level.PressureIn: num  30.1 29.5 29.6 29.9 29.8 ...
##  $ Mean.TemperatureF        : num  52 43 36 35 37 39 37 40 45 40 ...
##  $ Mean.VisibilityMiles     : num  10 3 7 10 10 10 10 9 6 10 ...
##  $ Mean.Wind.SpeedMPH       : num  13 13 13 11 12 10 6 4 11 14 ...
##  $ MeanDew.PointF           : num  40 39 31 27 26 27 29 36 41 30 ...
##  $ Min.DewpointF            : num  26 37 27 25 24 25 27 30 32 26 ...
##  $ Min.Humidity             : num  52 89 82 64 55 53 60 73 70 57 ...
##  $ Min.Sea.Level.PressureIn : num  30 29.4 29.4 29.8 29.8 ...
##  $ Min.TemperatureF         : num  39 38 32 31 32 33 32 35 41 36 ...
##  $ Min.VisibilityMiles      : num  10 1 1 7 10 10 10 5 1 10 ...
##  $ PrecipitationIn          : num  0.01 0.28 0.02 0 0 0 0 0 0.43 0.01 ...
##  $ WindDirDegrees           : num  268 357 230 286 298 306 324 79 311 281 ...

Find missing values

Use sum() and s.na() to count the number of NA values in weather6

# Count missing values
sum(is.na(weather6))
## [1] 827

Look at a summary() of weather6 to figure out how the missings are distributed among the different variables

# Find missing values
summary(weather6)
##       date               Events            CloudCover    Max.Dew.PointF 
##  Min.   :2014-12-01   Length:403         Min.   :0.000   Min.   :-6.00  
##  1st Qu.:2015-03-09   Class :character   1st Qu.:3.000   1st Qu.:32.00  
##  Median :2015-06-16   Mode  :character   Median :5.000   Median :47.50  
##  Mean   :2015-06-16                      Mean   :4.708   Mean   :45.48  
##  3rd Qu.:2015-09-23                      3rd Qu.:7.000   3rd Qu.:61.00  
##  Max.   :2015-12-31                      Max.   :8.000   Max.   :75.00  
##  NA's   :7                               NA's   :37      NA's   :37     
##  Max.Gust.SpeedMPH  Max.Humidity     Max.Sea.Level.PressureIn
##  Min.   : 0.00     Min.   :  39.00   Min.   :29.58           
##  1st Qu.:21.00     1st Qu.:  73.25   1st Qu.:30.00           
##  Median :25.50     Median :  86.00   Median :30.14           
##  Mean   :26.99     Mean   :  85.69   Mean   :30.16           
##  3rd Qu.:31.25     3rd Qu.:  93.00   3rd Qu.:30.31           
##  Max.   :94.00     Max.   :1000.00   Max.   :30.88           
##  NA's   :43        NA's   :37        NA's   :37              
##  Max.TemperatureF Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity  
##  Min.   :18.00    Min.   : 2.000      Min.   : 8.00     Min.   :28.00  
##  1st Qu.:42.00    1st Qu.:10.000      1st Qu.:16.00     1st Qu.:56.00  
##  Median :60.00    Median :10.000      Median :20.00     Median :66.00  
##  Mean   :58.93    Mean   : 9.907      Mean   :20.62     Mean   :66.02  
##  3rd Qu.:76.00    3rd Qu.:10.000      3rd Qu.:24.00     3rd Qu.:76.75  
##  Max.   :96.00    Max.   :10.000      Max.   :38.00     Max.   :98.00  
##  NA's   :37       NA's   :37          NA's   :37        NA's   :37     
##  Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
##  Min.   :29.49             Min.   : 8.00     Min.   :-1.000      
##  1st Qu.:29.87             1st Qu.:36.25     1st Qu.: 8.000      
##  Median :30.03             Median :53.50     Median :10.000      
##  Mean   :30.04             Mean   :51.40     Mean   : 8.861      
##  3rd Qu.:30.19             3rd Qu.:68.00     3rd Qu.:10.000      
##  Max.   :30.77             Max.   :84.00     Max.   :10.000      
##  NA's   :37                NA's   :37        NA's   :37          
##  Mean.Wind.SpeedMPH MeanDew.PointF   Min.DewpointF     Min.Humidity  
##  Min.   : 4.00      Min.   :-11.00   Min.   :-18.00   Min.   :16.00  
##  1st Qu.: 8.00      1st Qu.: 24.00   1st Qu.: 16.25   1st Qu.:35.00  
##  Median :10.00      Median : 41.00   Median : 35.00   Median :46.00  
##  Mean   :10.68      Mean   : 38.96   Mean   : 32.25   Mean   :48.31  
##  3rd Qu.:13.00      3rd Qu.: 56.00   3rd Qu.: 51.00   3rd Qu.:60.00  
##  Max.   :22.00      Max.   : 71.00   Max.   : 68.00   Max.   :96.00  
##  NA's   :37         NA's   :37       NA's   :37       NA's   :37     
##  Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
##  Min.   :29.16            Min.   :-3.00    Min.   : 0.000     
##  1st Qu.:29.76            1st Qu.:30.00    1st Qu.: 2.000     
##  Median :29.94            Median :46.00    Median :10.000     
##  Mean   :29.93            Mean   :43.33    Mean   : 6.716     
##  3rd Qu.:30.09            3rd Qu.:60.00    3rd Qu.:10.000     
##  Max.   :30.64            Max.   :74.00    Max.   :10.000     
##  NA's   :37               NA's   :37       NA's   :37         
##  PrecipitationIn  WindDirDegrees 
##  Min.   :0.0000   Min.   :  1.0  
##  1st Qu.:0.0000   1st Qu.:113.0  
##  Median :0.0000   Median :222.0  
##  Mean   :0.1016   Mean   :200.1  
##  3rd Qu.:0.0400   3rd Qu.:275.0  
##  Max.   :2.9000   Max.   :360.0  
##  NA's   :37       NA's   :37

Use which() to identify the indices (i.e. row numbers) where Max.Gust.SpeedMPH is NA and save the result to ind (for indices)

# Find indices of NAs in Max.Gust.SpeedMPH
ind <- which(is.na(weather6$Max.Gust.SpeedMPH))

Use ind to look at the full rows of weather6 for which Max.Gust.SpeedMPH is missing

# Look at the full rows for records missing Max.Gust.SpeedMPH
weather6[ind, ]
##           date Events CloudCover Max.Dew.PointF Max.Gust.SpeedMPH
## 84        <NA>   <NA>         NA             NA                NA
## 86        <NA>   <NA>         NA             NA                NA
## 87        <NA>   <NA>         NA             NA                NA
## 149       <NA>   <NA>         NA             NA                NA
## 165 2015-05-18    Fog          6             52                NA
## 209 2015-06-03                 7             48                NA
## 211       <NA>   <NA>         NA             NA                NA
## 278 2015-08-08                 4             61                NA
## 280 2015-09-01                 1             63                NA
## 304       <NA>   <NA>         NA             NA                NA
## 314 2015-10-12                 0             56                NA
## 364 2015-11-03                 1             44                NA
## 366       <NA>   <NA>         NA             NA                NA
## 374 2015-12-10   <NA>         NA             NA                NA
## 375 2015-12-11   <NA>         NA             NA                NA
## 376 2015-12-12   <NA>         NA             NA                NA
## 377 2015-12-13   <NA>         NA             NA                NA
## 378 2015-12-14   <NA>         NA             NA                NA
## 379 2015-12-15   <NA>         NA             NA                NA
## 380 2015-12-16   <NA>         NA             NA                NA
## 381 2015-12-17   <NA>         NA             NA                NA
## 382 2015-12-18   <NA>         NA             NA                NA
## 383 2015-12-19   <NA>         NA             NA                NA
## 384 2015-12-02   <NA>         NA             NA                NA
## 385 2015-12-20   <NA>         NA             NA                NA
## 386 2015-12-21   <NA>         NA             NA                NA
## 387 2015-12-22   <NA>         NA             NA                NA
## 388 2015-12-23   <NA>         NA             NA                NA
## 389 2015-12-24   <NA>         NA             NA                NA
## 390 2015-12-25   <NA>         NA             NA                NA
## 391 2015-12-26   <NA>         NA             NA                NA
## 392 2015-12-27   <NA>         NA             NA                NA
## 393 2015-12-28   <NA>         NA             NA                NA
## 394 2015-12-29   <NA>         NA             NA                NA
## 395 2015-12-03   <NA>         NA             NA                NA
## 396 2015-12-30   <NA>         NA             NA                NA
## 397 2015-12-31   <NA>         NA             NA                NA
## 398 2015-12-04   <NA>         NA             NA                NA
## 399 2015-12-05   <NA>         NA             NA                NA
## 400 2015-12-06   <NA>         NA             NA                NA
## 401 2015-12-07   <NA>         NA             NA                NA
## 402 2015-12-08   <NA>         NA             NA                NA
## 403 2015-12-09   <NA>         NA             NA                NA
##     Max.Humidity Max.Sea.Level.PressureIn Max.TemperatureF
## 84            NA                       NA               NA
## 86            NA                       NA               NA
## 87            NA                       NA               NA
## 149           NA                       NA               NA
## 165          100                    30.30               58
## 209           93                    30.31               56
## 211           NA                       NA               NA
## 278           87                    30.02               76
## 280           78                    30.06               79
## 304           NA                       NA               NA
## 314           89                    29.86               76
## 364           82                    30.25               73
## 366           NA                       NA               NA
## 374           NA                       NA               NA
## 375           NA                       NA               NA
## 376           NA                       NA               NA
## 377           NA                       NA               NA
## 378           NA                       NA               NA
## 379           NA                       NA               NA
## 380           NA                       NA               NA
## 381           NA                       NA               NA
## 382           NA                       NA               NA
## 383           NA                       NA               NA
## 384           NA                       NA               NA
## 385           NA                       NA               NA
## 386           NA                       NA               NA
## 387           NA                       NA               NA
## 388           NA                       NA               NA
## 389           NA                       NA               NA
## 390           NA                       NA               NA
## 391           NA                       NA               NA
## 392           NA                       NA               NA
## 393           NA                       NA               NA
## 394           NA                       NA               NA
## 395           NA                       NA               NA
## 396           NA                       NA               NA
## 397           NA                       NA               NA
## 398           NA                       NA               NA
## 399           NA                       NA               NA
## 400           NA                       NA               NA
## 401           NA                       NA               NA
## 402           NA                       NA               NA
## 403           NA                       NA               NA
##     Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity
## 84                   NA                NA            NA
## 86                   NA                NA            NA
## 87                   NA                NA            NA
## 149                  NA                NA            NA
## 165                  10                16            79
## 209                  10                14            82
## 211                  NA                NA            NA
## 278                  10                14            68
## 280                  10                15            65
## 304                  NA                NA            NA
## 314                  10                15            65
## 364                  10                16            57
## 366                  NA                NA            NA
## 374                  NA                NA            NA
## 375                  NA                NA            NA
## 376                  NA                NA            NA
## 377                  NA                NA            NA
## 378                  NA                NA            NA
## 379                  NA                NA            NA
## 380                  NA                NA            NA
## 381                  NA                NA            NA
## 382                  NA                NA            NA
## 383                  NA                NA            NA
## 384                  NA                NA            NA
## 385                  NA                NA            NA
## 386                  NA                NA            NA
## 387                  NA                NA            NA
## 388                  NA                NA            NA
## 389                  NA                NA            NA
## 390                  NA                NA            NA
## 391                  NA                NA            NA
## 392                  NA                NA            NA
## 393                  NA                NA            NA
## 394                  NA                NA            NA
## 395                  NA                NA            NA
## 396                  NA                NA            NA
## 397                  NA                NA            NA
## 398                  NA                NA            NA
## 399                  NA                NA            NA
## 400                  NA                NA            NA
## 401                  NA                NA            NA
## 402                  NA                NA            NA
## 403                  NA                NA            NA
##     Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
## 84                         NA                NA                   NA
## 86                         NA                NA                   NA
## 87                         NA                NA                   NA
## 149                        NA                NA                   NA
## 165                     30.23                54                    8
## 209                     30.24                52                   10
## 211                        NA                NA                   NA
## 278                     29.99                69                   10
## 280                     30.02                74                   10
## 304                        NA                NA                   NA
## 314                     29.80                64                   10
## 364                     30.13                60                   10
## 366                        NA                NA                   NA
## 374                        NA                NA                   NA
## 375                        NA                NA                   NA
## 376                        NA                NA                   NA
## 377                        NA                NA                   NA
## 378                        NA                NA                   NA
## 379                        NA                NA                   NA
## 380                        NA                NA                   NA
## 381                        NA                NA                   NA
## 382                        NA                NA                   NA
## 383                        NA                NA                   NA
## 384                        NA                NA                   NA
## 385                        NA                NA                   NA
## 386                        NA                NA                   NA
## 387                        NA                NA                   NA
## 388                        NA                NA                   NA
## 389                        NA                NA                   NA
## 390                        NA                NA                   NA
## 391                        NA                NA                   NA
## 392                        NA                NA                   NA
## 393                        NA                NA                   NA
## 394                        NA                NA                   NA
## 395                        NA                NA                   NA
## 396                        NA                NA                   NA
## 397                        NA                NA                   NA
## 398                        NA                NA                   NA
## 399                        NA                NA                   NA
## 400                        NA                NA                   NA
## 401                        NA                NA                   NA
## 402                        NA                NA                   NA
## 403                        NA                NA                   NA
##     Mean.Wind.SpeedMPH MeanDew.PointF Min.DewpointF Min.Humidity
## 84                  NA             NA            NA           NA
## 86                  NA             NA            NA           NA
## 87                  NA             NA            NA           NA
## 149                 NA             NA            NA           NA
## 165                 10             48            43           57
## 209                  7             45            43           71
## 211                 NA             NA            NA           NA
## 278                  6             57            54           49
## 280                  9             62            59           52
## 304                 NA             NA            NA           NA
## 314                  8             51            48           41
## 364                  8             42            40           31
## 366                 NA             NA            NA           NA
## 374                 NA             NA            NA           NA
## 375                 NA             NA            NA           NA
## 376                 NA             NA            NA           NA
## 377                 NA             NA            NA           NA
## 378                 NA             NA            NA           NA
## 379                 NA             NA            NA           NA
## 380                 NA             NA            NA           NA
## 381                 NA             NA            NA           NA
## 382                 NA             NA            NA           NA
## 383                 NA             NA            NA           NA
## 384                 NA             NA            NA           NA
## 385                 NA             NA            NA           NA
## 386                 NA             NA            NA           NA
## 387                 NA             NA            NA           NA
## 388                 NA             NA            NA           NA
## 389                 NA             NA            NA           NA
## 390                 NA             NA            NA           NA
## 391                 NA             NA            NA           NA
## 392                 NA             NA            NA           NA
## 393                 NA             NA            NA           NA
## 394                 NA             NA            NA           NA
## 395                 NA             NA            NA           NA
## 396                 NA             NA            NA           NA
## 397                 NA             NA            NA           NA
## 398                 NA             NA            NA           NA
## 399                 NA             NA            NA           NA
## 400                 NA             NA            NA           NA
## 401                 NA             NA            NA           NA
## 402                 NA             NA            NA           NA
## 403                 NA             NA            NA           NA
##     Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
## 84                        NA               NA                  NA
## 86                        NA               NA                  NA
## 87                        NA               NA                  NA
## 149                       NA               NA                  NA
## 165                    30.12               49                   0
## 209                    30.19               47                  10
## 211                       NA               NA                  NA
## 278                    29.95               61                  10
## 280                    29.96               69                  10
## 304                       NA               NA                  NA
## 314                    29.74               51                  10
## 364                    30.06               47                  10
## 366                       NA               NA                  NA
## 374                       NA               NA                  NA
## 375                       NA               NA                  NA
## 376                       NA               NA                  NA
## 377                       NA               NA                  NA
## 378                       NA               NA                  NA
## 379                       NA               NA                  NA
## 380                       NA               NA                  NA
## 381                       NA               NA                  NA
## 382                       NA               NA                  NA
## 383                       NA               NA                  NA
## 384                       NA               NA                  NA
## 385                       NA               NA                  NA
## 386                       NA               NA                  NA
## 387                       NA               NA                  NA
## 388                       NA               NA                  NA
## 389                       NA               NA                  NA
## 390                       NA               NA                  NA
## 391                       NA               NA                  NA
## 392                       NA               NA                  NA
## 393                       NA               NA                  NA
## 394                       NA               NA                  NA
## 395                       NA               NA                  NA
## 396                       NA               NA                  NA
## 397                       NA               NA                  NA
## 398                       NA               NA                  NA
## 399                       NA               NA                  NA
## 400                       NA               NA                  NA
## 401                       NA               NA                  NA
## 402                       NA               NA                  NA
## 403                       NA               NA                  NA
##     PrecipitationIn WindDirDegrees
## 84               NA             NA
## 86               NA             NA
## 87               NA             NA
## 149              NA             NA
## 165               0             72
## 209               0             90
## 211              NA             NA
## 278               0             45
## 280               0             54
## 304              NA             NA
## 314               0            199
## 364               0            281
## 366              NA             NA
## 374              NA             NA
## 375              NA             NA
## 376              NA             NA
## 377              NA             NA
## 378              NA             NA
## 379              NA             NA
## 380              NA             NA
## 381              NA             NA
## 382              NA             NA
## 383              NA             NA
## 384              NA             NA
## 385              NA             NA
## 386              NA             NA
## 387              NA             NA
## 388              NA             NA
## 389              NA             NA
## 390              NA             NA
## 391              NA             NA
## 392              NA             NA
## 393              NA             NA
## 394              NA             NA
## 395              NA             NA
## 396              NA             NA
## 397              NA             NA
## 398              NA             NA
## 399              NA             NA
## 400              NA             NA
## 401              NA             NA
## 402              NA             NA
## 403              NA             NA

An obvious error

  • View a summary() of weather6
# Review distributions for all variables
summary(weather6)
##       date               Events            CloudCover    Max.Dew.PointF 
##  Min.   :2014-12-01   Length:403         Min.   :0.000   Min.   :-6.00  
##  1st Qu.:2015-03-09   Class :character   1st Qu.:3.000   1st Qu.:32.00  
##  Median :2015-06-16   Mode  :character   Median :5.000   Median :47.50  
##  Mean   :2015-06-16                      Mean   :4.708   Mean   :45.48  
##  3rd Qu.:2015-09-23                      3rd Qu.:7.000   3rd Qu.:61.00  
##  Max.   :2015-12-31                      Max.   :8.000   Max.   :75.00  
##  NA's   :7                               NA's   :37      NA's   :37     
##  Max.Gust.SpeedMPH  Max.Humidity     Max.Sea.Level.PressureIn
##  Min.   : 0.00     Min.   :  39.00   Min.   :29.58           
##  1st Qu.:21.00     1st Qu.:  73.25   1st Qu.:30.00           
##  Median :25.50     Median :  86.00   Median :30.14           
##  Mean   :26.99     Mean   :  85.69   Mean   :30.16           
##  3rd Qu.:31.25     3rd Qu.:  93.00   3rd Qu.:30.31           
##  Max.   :94.00     Max.   :1000.00   Max.   :30.88           
##  NA's   :43        NA's   :37        NA's   :37              
##  Max.TemperatureF Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity  
##  Min.   :18.00    Min.   : 2.000      Min.   : 8.00     Min.   :28.00  
##  1st Qu.:42.00    1st Qu.:10.000      1st Qu.:16.00     1st Qu.:56.00  
##  Median :60.00    Median :10.000      Median :20.00     Median :66.00  
##  Mean   :58.93    Mean   : 9.907      Mean   :20.62     Mean   :66.02  
##  3rd Qu.:76.00    3rd Qu.:10.000      3rd Qu.:24.00     3rd Qu.:76.75  
##  Max.   :96.00    Max.   :10.000      Max.   :38.00     Max.   :98.00  
##  NA's   :37       NA's   :37          NA's   :37        NA's   :37     
##  Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
##  Min.   :29.49             Min.   : 8.00     Min.   :-1.000      
##  1st Qu.:29.87             1st Qu.:36.25     1st Qu.: 8.000      
##  Median :30.03             Median :53.50     Median :10.000      
##  Mean   :30.04             Mean   :51.40     Mean   : 8.861      
##  3rd Qu.:30.19             3rd Qu.:68.00     3rd Qu.:10.000      
##  Max.   :30.77             Max.   :84.00     Max.   :10.000      
##  NA's   :37                NA's   :37        NA's   :37          
##  Mean.Wind.SpeedMPH MeanDew.PointF   Min.DewpointF     Min.Humidity  
##  Min.   : 4.00      Min.   :-11.00   Min.   :-18.00   Min.   :16.00  
##  1st Qu.: 8.00      1st Qu.: 24.00   1st Qu.: 16.25   1st Qu.:35.00  
##  Median :10.00      Median : 41.00   Median : 35.00   Median :46.00  
##  Mean   :10.68      Mean   : 38.96   Mean   : 32.25   Mean   :48.31  
##  3rd Qu.:13.00      3rd Qu.: 56.00   3rd Qu.: 51.00   3rd Qu.:60.00  
##  Max.   :22.00      Max.   : 71.00   Max.   : 68.00   Max.   :96.00  
##  NA's   :37         NA's   :37       NA's   :37       NA's   :37     
##  Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
##  Min.   :29.16            Min.   :-3.00    Min.   : 0.000     
##  1st Qu.:29.76            1st Qu.:30.00    1st Qu.: 2.000     
##  Median :29.94            Median :46.00    Median :10.000     
##  Mean   :29.93            Mean   :43.33    Mean   : 6.716     
##  3rd Qu.:30.09            3rd Qu.:60.00    3rd Qu.:10.000     
##  Max.   :30.64            Max.   :74.00    Max.   :10.000     
##  NA's   :37               NA's   :37       NA's   :37         
##  PrecipitationIn  WindDirDegrees 
##  Min.   :0.0000   Min.   :  1.0  
##  1st Qu.:0.0000   1st Qu.:113.0  
##  Median :0.0000   Median :222.0  
##  Mean   :0.1016   Mean   :200.1  
##  3rd Qu.:0.0400   3rd Qu.:275.0  
##  Max.   :2.9000   Max.   :360.0  
##  NA's   :37       NA's   :37
  • Use which() to find the index of the erroneous element of weather6$Max.Humidity, saving the result to ind
# Find row with Max.Humidity of 1000
ind <- which(weather6$Max.Humidity == 1000)
  • Use ind to look at the full row of weather6 for that day You discover an extra zero was accidentally added to this value. Correct it in the data
# Look at the data for that day
weather6[ind, ]
##           date                Events CloudCover Max.Dew.PointF
## 138 2015-04-21 Fog-Rain-Thunderstorm          6             57
##     Max.Gust.SpeedMPH Max.Humidity Max.Sea.Level.PressureIn
## 138                94         1000                    29.75
##     Max.TemperatureF Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity
## 138               65                  10                20            71
##     Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
## 138                      29.6                56                    5
##     Mean.Wind.SpeedMPH MeanDew.PointF Min.DewpointF Min.Humidity
## 138                 10             49            36           42
##     Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
## 138                    29.53               46                   0
##     PrecipitationIn WindDirDegrees
## 138            0.54            184
# Change 1000 to 100
weather6$Max.Humidity[ind] <- 100

Another obvious error

  • Use summary() to look at the value of only the Mean.VisibilityMiles variable of weather6
# Look at summary of Mean.VisibilityMiles
summary(weather6$Mean.VisibilityMiles)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -1.000   8.000  10.000   8.861  10.000  10.000      37

Determine the element of the value that is clearly erroneous in this column, saving the result to ind

# Get index of row with -1 value
ind <- which(weather6$Mean.VisibilityMiles == -1)

Use ind to look at the full row of weather6 for this day

# Look at full row
weather6[ind, ]
##           date Events CloudCover Max.Dew.PointF Max.Gust.SpeedMPH
## 196 2015-06-18                 5             54                23
##     Max.Humidity Max.Sea.Level.PressureIn Max.TemperatureF
## 196           72                    30.14               76
##     Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity
## 196                  10                17            59
##     Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
## 196                     30.04                67                   -1
##     Mean.Wind.SpeedMPH MeanDew.PointF Min.DewpointF Min.Humidity
## 196                 10             49            45           46
##     Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
## 196                    29.93               57                  10
##     PrecipitationIn WindDirDegrees
## 196               0            189

Inspect the values of other variables for this day to determine the correct value of Mean.VisibilityMiles, then make the appropriate fix

# Set Mean.VisibilityMiles to the appropriate value
weather6$Mean.VisibilityMiles[ind] <- 10

Check other extreme values

  • Check a summary() of weather6 one more time for extreme or unexpected values
# Review summary of full data once more
summary(weather6)
##       date               Events            CloudCover    Max.Dew.PointF 
##  Min.   :2014-12-01   Length:403         Min.   :0.000   Min.   :-6.00  
##  1st Qu.:2015-03-09   Class :character   1st Qu.:3.000   1st Qu.:32.00  
##  Median :2015-06-16   Mode  :character   Median :5.000   Median :47.50  
##  Mean   :2015-06-16                      Mean   :4.708   Mean   :45.48  
##  3rd Qu.:2015-09-23                      3rd Qu.:7.000   3rd Qu.:61.00  
##  Max.   :2015-12-31                      Max.   :8.000   Max.   :75.00  
##  NA's   :7                               NA's   :37      NA's   :37     
##  Max.Gust.SpeedMPH  Max.Humidity    Max.Sea.Level.PressureIn
##  Min.   : 0.00     Min.   : 39.00   Min.   :29.58           
##  1st Qu.:21.00     1st Qu.: 73.25   1st Qu.:30.00           
##  Median :25.50     Median : 86.00   Median :30.14           
##  Mean   :26.99     Mean   : 83.23   Mean   :30.16           
##  3rd Qu.:31.25     3rd Qu.: 93.00   3rd Qu.:30.31           
##  Max.   :94.00     Max.   :100.00   Max.   :30.88           
##  NA's   :43        NA's   :37       NA's   :37              
##  Max.TemperatureF Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity  
##  Min.   :18.00    Min.   : 2.000      Min.   : 8.00     Min.   :28.00  
##  1st Qu.:42.00    1st Qu.:10.000      1st Qu.:16.00     1st Qu.:56.00  
##  Median :60.00    Median :10.000      Median :20.00     Median :66.00  
##  Mean   :58.93    Mean   : 9.907      Mean   :20.62     Mean   :66.02  
##  3rd Qu.:76.00    3rd Qu.:10.000      3rd Qu.:24.00     3rd Qu.:76.75  
##  Max.   :96.00    Max.   :10.000      Max.   :38.00     Max.   :98.00  
##  NA's   :37       NA's   :37          NA's   :37        NA's   :37     
##  Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
##  Min.   :29.49             Min.   : 8.00     Min.   : 1.000      
##  1st Qu.:29.87             1st Qu.:36.25     1st Qu.: 8.000      
##  Median :30.03             Median :53.50     Median :10.000      
##  Mean   :30.04             Mean   :51.40     Mean   : 8.891      
##  3rd Qu.:30.19             3rd Qu.:68.00     3rd Qu.:10.000      
##  Max.   :30.77             Max.   :84.00     Max.   :10.000      
##  NA's   :37                NA's   :37        NA's   :37          
##  Mean.Wind.SpeedMPH MeanDew.PointF   Min.DewpointF     Min.Humidity  
##  Min.   : 4.00      Min.   :-11.00   Min.   :-18.00   Min.   :16.00  
##  1st Qu.: 8.00      1st Qu.: 24.00   1st Qu.: 16.25   1st Qu.:35.00  
##  Median :10.00      Median : 41.00   Median : 35.00   Median :46.00  
##  Mean   :10.68      Mean   : 38.96   Mean   : 32.25   Mean   :48.31  
##  3rd Qu.:13.00      3rd Qu.: 56.00   3rd Qu.: 51.00   3rd Qu.:60.00  
##  Max.   :22.00      Max.   : 71.00   Max.   : 68.00   Max.   :96.00  
##  NA's   :37         NA's   :37       NA's   :37       NA's   :37     
##  Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
##  Min.   :29.16            Min.   :-3.00    Min.   : 0.000     
##  1st Qu.:29.76            1st Qu.:30.00    1st Qu.: 2.000     
##  Median :29.94            Median :46.00    Median :10.000     
##  Mean   :29.93            Mean   :43.33    Mean   : 6.716     
##  3rd Qu.:30.09            3rd Qu.:60.00    3rd Qu.:10.000     
##  Max.   :30.64            Max.   :74.00    Max.   :10.000     
##  NA's   :37               NA's   :37       NA's   :37         
##  PrecipitationIn  WindDirDegrees 
##  Min.   :0.0000   Min.   :  1.0  
##  1st Qu.:0.0000   1st Qu.:113.0  
##  Median :0.0000   Median :222.0  
##  Mean   :0.1016   Mean   :200.1  
##  3rd Qu.:0.0400   3rd Qu.:275.0  
##  Max.   :2.9000   Max.   :360.0  
##  NA's   :37       NA's   :37
  • View a histogram for MeanDew.PointF
# Look at histogram for MeanDew.PointF
hist(weather6$MeanDew.PointF)

  • Do the same for Min.TemperatureF
# Look at histogram for Min.TemperatureF
hist(weather6$Min.TemperatureF)

  • And once more for Mean.TemperatureF to compare distributions
# Compare to histogram for Mean.TemperatureF
hist(weather6$Mean.TemperatureF)

Finishing touches

We’ve created a vector of column names in your workspace called new_colnames, all of which obey the conventions described above. Clean up the column names of weather6 by assigning new_colnames to names(weather6) Replace all empty strings in the events column of weather6 with “None” One last time, print out the first 6 rows of the weather6 data frame to see the changes

new_colnames<- c( 
 "date",                       "events",                    
 "cloud_cover",                "max_dew_point_f",           
 "max_gust_speed_mph",         "max_humidity",              
 "max_sea_level_pressure_in",  "max_temperature_f",         
 "max_visibility_miles",       "max_wind_speed_mph",        
 "mean_humidity",              "mean_sea_level_pressure_in",
 "mean_temperature_f",         "mean_visibility_miles",     
 "mean_wind_speed_mph",        "mean_dew_point_f",          
 "min_dew_point_f",            "min_humidity",              
 "min_sea_level_pressure_in",  "min_temperature_f",         
 "min_visibility_miles",       "precipitation_in",          
 "wind_dir_degrees")
  • Clean up the column names of weather6 by assigning new_colnames to names(weather6)
# Clean up column names
names(weather6) <- new_colnames
  • Replace all empty strings in the events column of weather6 with "None"
# Replace empty cells in events column
weather6$events[weather6$events == ""] <- "None"
  • One last time, print out the first 6 rows of the `weather6 data frame to see the changes
# Print the first 6 rows of weather6
head(weather6, 6)
##         date    events cloud_cover max_dew_point_f max_gust_speed_mph
## 1 2014-12-01      Rain           6              46                 29
## 2 2014-12-10      Rain           8              45                 29
## 3 2014-12-11 Rain-Snow           8              37                 28
## 4 2014-12-12      Snow           7              28                 21
## 5 2014-12-13      None           5              28                 23
## 6 2014-12-14      None           4              29                 20
##   max_humidity max_sea_level_pressure_in max_temperature_f
## 1           74                     30.45                64
## 2          100                     29.58                48
## 3           92                     29.81                39
## 4           85                     29.88                39
## 5           75                     29.86                42
## 6           82                     29.91                45
##   max_visibility_miles max_wind_speed_mph mean_humidity
## 1                   10                 22            63
## 2                   10                 23            95
## 3                   10                 21            87
## 4                   10                 16            75
## 5                   10                 17            65
## 6                   10                 15            68
##   mean_sea_level_pressure_in mean_temperature_f mean_visibility_miles
## 1                      30.13                 52                    10
## 2                      29.50                 43                     3
## 3                      29.61                 36                     7
## 4                      29.85                 35                    10
## 5                      29.82                 37                    10
## 6                      29.83                 39                    10
##   mean_wind_speed_mph mean_dew_point_f min_dew_point_f min_humidity
## 1                  13               40              26           52
## 2                  13               39              37           89
## 3                  13               31              27           82
## 4                  11               27              25           64
## 5                  12               26              24           55
## 6                  10               27              25           53
##   min_sea_level_pressure_in min_temperature_f min_visibility_miles
## 1                     30.01                39                   10
## 2                     29.43                38                    1
## 3                     29.44                32                    1
## 4                     29.81                31                    7
## 5                     29.78                32                   10
## 6                     29.78                33                   10
##   precipitation_in wind_dir_degrees
## 1             0.01              268
## 2             0.28              357
## 3             0.02              230
## 4             0.00              286
## 5             0.00              298
## 6             0.00              306
sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17134)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_Canada.1252  LC_CTYPE=English_Canada.1252   
## [3] LC_MONETARY=English_Canada.1252 LC_NUMERIC=C                   
## [5] LC_TIME=English_Canada.1252    
## 
## attached base packages:
## [1] methods   stats     graphics  grDevices utils     datasets  base     
## 
## other attached packages:
## [1] stringr_1.3.0   lubridate_1.7.4 bindrcpp_0.2.2  tidyr_0.8.0    
## [5] dplyr_0.7.4     downloader_0.4 
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.16     knitr_1.20       bindr_0.1.1      magrittr_1.5    
##  [5] R6_2.2.2         rlang_0.2.0      tools_3.4.4      xfun_0.1        
##  [9] htmltools_0.3.6  yaml_2.1.19      rprojroot_1.3-2  digest_0.6.15   
## [13] assertthat_0.2.0 tibble_1.4.2     bookdown_0.7     purrr_0.2.4     
## [17] glue_1.2.0       evaluate_0.10.1  rmarkdown_1.9    blogdown_0.6    
## [21] stringi_1.1.7    compiler_3.4.4   pillar_1.2.2     backports_1.1.2 
## [25] pkgconfig_2.0.1
knitr::write_bib(.packages(), "packages.bib") 

References

R Core Team. 2018. R: A Language and Environment for Statistical Computing. Vienna, Austria: R Foundation for Statistical Computing. https://www.R-project.org/.

Spinu, Vitalie, Garrett Grolemund, and Hadley Wickham. 2018. Lubridate: Make Dealing with Dates a Little Easier. https://CRAN.R-project.org/package=lubridate.

Wickham, Hadley. 2018. Stringr: Simple, Consistent Wrappers for Common String Operations. https://CRAN.R-project.org/package=stringr.

Wickham, Hadley, Romain Francois, Lionel Henry, and Kirill Müller. 2017. Dplyr: A Grammar of Data Manipulation. https://CRAN.R-project.org/package=dplyr.

Wickham, Hadley, and Lionel Henry. 2018. Tidyr: Easily Tidy Data with ’Spread()’ and ’Gather()’ Functions. https://CRAN.R-project.org/package=tidyr.