layout: true <!-- <div class="my-footer"><span>arm.rbind.io/slides/xaringan</span></div> --> <!-- this adds the link footer to all slides, depends on my-footer class in css--> --- name: xaringan-title class: left, middle background-image: url(img/background.jpg) background-size: cover # Meet Tidyverse <img src="https://tidyverse.tidyverse.org/articles/tidyverse-logo.png" alt="tidyverse" width="180" /> ### .fancy[Introduction to data wrangling with tidyverse] .large[Winson Yang | Texas Tech University | 2020-06-17] <!-- this ends up being the title slide since seal = FALSE--> --- # Acknowledgements .pull-left[ .pull-left[ <img style="border-radius: 50%;" src="img/ttu_dept.jpg" width="150px"/> [TTU Department of Psychological Sciences](https://www.depts.ttu.edu/psy/) ] .pull-right[ <img style="border-radius: 50%;" src="img/rmorgan.jpg" width="150px"/> [Dr Robert Morgan, Department Chair](https://www.depts.ttu.edu/psy/people/rmorgan/index.php) ] ] .pull-right[ .pull-left[ <img style="border-radius: 50%;" src="img/Sean_Obryan2.jpg" width="150px"/> [Sean O'Bryan, Psychological Sciences Graduate Student Advisory Council](https://www.depts.ttu.edu/psy/caprockscience/people.php) ] .pull-right[ <img style="border-radius: 50%;" src="img/tran.jpg" width="150px"/> [Tran Le, Psychological Sciences Graduate Student Advisory Council](https://www.depts.ttu.edu/psy/sharc/People.php) ] ] <img style="border-radius: 50%;" src="img/allisonhorst.jpg" width="150px"/> [Artwork by @Allisonhorst](https://twitter.com/allison_horst) --- class: inverse, center, middle # Let's Get Started --- class: right, middle <img class="rounded-circle" src="https://github.com/winsonfzyang.png" width="150px"/> # Find me at... 
[<svg style="height:0.8em;top:.04em;position:relative;" viewBox="0 0 512 512"><path d="M459.37 151.716c.325 4.548.325 9.097.325 13.645 0 138.72-105.583 298.558-298.558 298.558-59.452 0-114.68-17.219-161.137-47.106 8.447.974 16.568 1.299 25.34 1.299 49.055 0 94.213-16.568 130.274-44.832-46.132-.975-84.792-31.188-98.112-72.772 6.498.974 12.995 1.624 19.818 1.624 9.421 0 18.843-1.3 27.614-3.573-48.081-9.747-84.143-51.98-84.143-102.985v-1.299c13.969 7.797 30.214 12.67 47.431 13.319-28.264-18.843-46.781-51.005-46.781-87.391 0-19.492 5.197-37.36 14.294-52.954 51.655 63.675 129.3 105.258 216.365 109.807-1.624-7.797-2.599-15.918-2.599-24.04 0-57.828 46.782-104.934 104.934-104.934 30.213 0 57.502 12.67 76.67 33.137 23.715-4.548 46.456-13.32 66.599-25.34-7.798 24.366-24.366 44.833-46.132 57.827 21.117-2.273 41.584-8.122 60.426-16.243-14.292 20.791-32.161 39.308-52.628 54.253z"/></svg> @winsonfzyang](http://twitter.com/winsonfzyang) [<svg style="height:0.8em;top:.04em;position:relative;" viewBox="0 0 496 512"><path d="M165.9 397.4c0 2-2.3 3.6-5.2 3.6-3.3.3-5.6-1.3-5.6-3.6 0-2 2.3-3.6 5.2-3.6 3-.3 5.6 1.3 5.6 3.6zm-31.1-4.5c-.7 2 1.3 4.3 4.3 4.9 2.6 1 5.6 0 6.2-2s-1.3-4.3-4.3-5.2c-2.6-.7-5.5.3-6.2 2.3zm44.2-1.7c-2.9.7-4.9 2.6-4.6 4.9.3 2 2.9 3.3 5.9 2.6 2.9-.7 4.9-2.6 4.6-4.6-.3-1.9-3-3.2-5.9-2.9zM244.8 8C106.1 8 0 113.3 0 252c0 110.9 69.8 205.8 169.5 239.2 12.8 2.3 17.3-5.6 17.3-12.1 0-6.2-.3-40.4-.3-61.4 0 0-70 15-84.7-29.8 0 0-11.4-29.1-27.8-36.6 0 0-22.9-15.7 1.6-15.4 0 0 24.9 2 38.6 25.8 21.9 38.6 58.6 27.5 72.9 20.9 2.3-16 8.8-27.1 16-33.7-55.9-6.2-112.3-14.3-112.3-110.5 0-27.5 7.6-41.3 23.6-58.9-2.6-6.5-11.1-33.3 2.6-67.9 20.9-6.5 69 27 69 27 20-5.6 41.5-8.5 62.8-8.5s42.8 2.9 62.8 8.5c0 0 48.1-33.6 69-27 13.7 34.7 5.2 61.4 2.6 67.9 16 17.7 25.8 31.5 25.8 58.9 0 96.5-58.9 104.2-114.8 110.5 9.2 7.9 17 22.9 17 46.4 0 33.7-.3 75.4-.3 83.6 0 6.5 4.6 14.4 17.3 12.1C428.2 457.8 496 362.9 496 252 496 113.3 383.5 8 244.8 8zM97.2 352.9c-1.3 1-1 3.3.7 5.2 1.6 1.6 3.9 2.3 5.2 1 
1.3-1 1-3.3-.7-5.2-1.6-1.6-3.9-2.3-5.2-1zm-10.8-8.1c-.7 1.3.3 2.9 2.3 3.9 1.6 1 3.6.7 4.3-.7.7-1.3-.3-2.9-2.3-3.9-2-.6-3.6-.3-4.3.7zm32.4 35.6c-1.6 1.3-1 4.3 1.3 6.2 2.3 2.3 5.2 2.6 6.5 1 1.3-1.3.7-4.3-1.3-6.2-2.2-2.3-5.2-2.6-6.5-1zm-11.4-14.7c-1.6 1-1.6 3.6 0 5.9 1.6 2.3 4.3 3.3 5.6 2.3 1.6-1.3 1.6-3.9 0-6.2-1.4-2.3-4-3.3-5.6-2z"/></svg> @winsonfzyang](http://github.com/winsonfzyang) [<svg style="height:0.8em;top:.04em;position:relative;" viewBox="0 0 512 512"><path d="M326.612 185.391c59.747 59.809 58.927 155.698.36 214.59-.11.12-.24.25-.36.37l-67.2 67.2c-59.27 59.27-155.699 59.262-214.96 0-59.27-59.26-59.27-155.7 0-214.96l37.106-37.106c9.84-9.84 26.786-3.3 27.294 10.606.648 17.722 3.826 35.527 9.69 52.721 1.986 5.822.567 12.262-3.783 16.612l-13.087 13.087c-28.026 28.026-28.905 73.66-1.155 101.96 28.024 28.579 74.086 28.749 102.325.51l67.2-67.19c28.191-28.191 28.073-73.757 0-101.83-3.701-3.694-7.429-6.564-10.341-8.569a16.037 16.037 0 0 1-6.947-12.606c-.396-10.567 3.348-21.456 11.698-29.806l21.054-21.055c5.521-5.521 14.182-6.199 20.584-1.731a152.482 152.482 0 0 1 20.522 17.197zM467.547 44.449c-59.261-59.262-155.69-59.27-214.96 0l-67.2 67.2c-.12.12-.25.25-.36.37-58.566 58.892-59.387 154.781.36 214.59a152.454 152.454 0 0 0 20.521 17.196c6.402 4.468 15.064 3.789 20.584-1.731l21.054-21.055c8.35-8.35 12.094-19.239 11.698-29.806a16.037 16.037 0 0 0-6.947-12.606c-2.912-2.005-6.64-4.875-10.341-8.569-28.073-28.073-28.191-73.639 0-101.83l67.2-67.19c28.239-28.239 74.3-28.069 102.325.51 27.75 28.3 26.872 73.934-1.155 101.96l-13.087 13.087c-4.35 4.35-5.769 10.79-3.783 16.612 5.864 17.194 9.042 34.999 9.69 52.721.509 13.906 17.454 20.446 27.294 10.606l37.106-37.106c59.271-59.259 59.271-155.699.001-214.959z"/></svg> winsonfzyang.github.io](https://winsonfzyang.rbind.io) [<svg style="height:0.8em;top:.04em;position:relative;" viewBox="0 0 512 512"><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 
32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> winson.yang@ttu.edu](mailto:winson.yang@ttu.edu) --- # Who am I -- Second year Experimental (Cognitive) Psychology student. - Projects include Neurofeedback, Neuroscience of Meditation, cognitive flexibility, Neurodegeneration -- Other projects - data visualization, UX/UI, software development for neuroscientists and psychologists - Programming education --- <center> <img src="img/r_first_then.png" width="500"/> </center> --- # Workshop aims Introduce the main components of the Tidyverse - readr (read files) - dplyr, tidyr (manipulate data) - ggplot2 (make awesome graphs) I have to assume you have a basic knowledge of R We don't really have time to cover all of the tidyverse (it is a huge universe!) --- class: inverse, middle, center # Day I: Data wrangling with tidyverse --- background-image: url(img/messymeme.jpeg) background-position: 50% 50% background-size: cover class: center, bottom, inverse --- background-image: url(img/tidyversepackages.jpg) background-position: 50% 50% background-size: cover class: center, bottom, inverse --- # The tidyverse workflow!! ![](img/workflow.jpg) --- # Functions we will cover today -- .pull-left[ - read_csv() - skim() - filter() - select() ] -- .pull-right[ - arrange() - mutate() - group_by() %>% summarize() ] -- There are more functions in the tidyverse package, but this should be enough to get you going with data analysis! If you have not done so already, please install all the packages in the tidyverse by running `install.packages("tidyverse")` in RStudio. We will then load the package using `library(tidyverse)`. 
--- # Importing data -- With the `readr`, `haven`, `readxl` packages, we can load various types of data -- typical usage: `read_*()` where * can be csv, excel, spss -- ```r library(tidyverse) # Load data df_csv <- read_csv("./../data/sample_data1.csv") *df_spss <- haven::read_spss("./../data/sample_data3.sav") df_excel <- readxl::read_excel("./../data/sample_data3_datadictionary.xlsx") ``` -- *package*`::`*function* calls out a function from a package. -- ```r df_spss <- haven::read_spss("./../data/sample_data3.sav") ``` is the same as: -- ```r library(haven) # Load data df_spss <- read_spss("./../data/sample_data3.sav") ``` --- # Understanding your data
--- # Selecting variables: **`select()`** -- We use `select()` to select certain variables/columns to work with (your data may be huge). -- .pull-left[ ```r df_csv %>% select(ID, Dx, Sex) %>% head() ``` ``` # # A tibble: 6 x 3 # ID Dx Sex # <dbl> <chr> <chr> # 1 1 nfvPPA Male # 2 2 bvFTD Male # 3 3 PSP Male # 4 4 PSP Male # 5 5 bvFTD Male # 6 6 svPPA Male ``` ] -- .pull-right[ ![](img/fx_select.JPG) ] --- # Filtering variables: **`filter()`** -- We use `filter()` to remove or select rows depending on their values. -- .pull-left[ ```r df_csv %>% select(ID, Dx, Sex) %>% filter(Dx == "CONTROL" & Sex == "Male") ``` ``` # # A tibble: 13 x 3 # ID Dx Sex # <dbl> <chr> <chr> # 1 20 CONTROL Male # 2 22 CONTROL Male # 3 24 CONTROL Male # 4 37 CONTROL Male # 5 38 CONTROL Male # 6 44 CONTROL Male # 7 50 CONTROL Male # 8 56 CONTROL Male # 9 62 CONTROL Male # 10 91 CONTROL Male # 11 144 CONTROL Male # 12 179 CONTROL Male # 13 197 CONTROL Male ``` ] -- .pull-right[ ![](img/fx_filter.JPG) ] --- # Arranging variables: **`arrange()`** -- We use `arrange()` to change the order of our data. -- .pull-left[ ```r df_csv %>% select(ID, Dx, Sex, MMSE) %>% filter(Dx == "CONTROL") %>% arrange(desc(MMSE)) ``` ``` # # A tibble: 42 x 4 # ID Dx Sex MMSE # <dbl> <chr> <chr> <dbl> # 1 20 CONTROL Male 30 # 2 51 CONTROL Female 30 # 3 60 CONTROL Female 30 # 4 98 CONTROL Female 30 # 5 101 CONTROL Female 30 # 6 197 CONTROL Male 30 # 7 17 CONTROL Female 29 # 8 19 CONTROL Female 29 # 9 38 CONTROL Male 29 # 10 48 CONTROL Female 29 # # ... with 32 more rows ``` ] -- .pull-right[ ![](img/fx_arrange.JPG) ] --- # Creating variables: **`mutate()`** -- The job of `mutate()` is to add new columns that are functions of existing columns. 
-- .pull-left[ ```r df_csv %>% filter(Dx == "CONTROL") %>% select(ID, q1:q6) %>% mutate(q_total = q1 + q2 + q3 + q4 + q5 + q6) ``` ``` # # A tibble: 42 x 8 # ID q1 q2 q3 q4 q5 q6 q_total # <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> # 1 16 3 2 1 4 1 3 14 # 2 17 2 2 2 3 5 2 16 # 3 19 4 5 5 2 1 2 19 # 4 20 4 1 5 2 3 2 17 # 5 21 3 1 3 5 1 5 18 # 6 22 3 2 2 3 1 3 14 # 7 23 3 5 2 2 4 3 19 # 8 24 4 2 4 1 4 5 20 # 9 25 4 4 5 1 3 5 22 # 10 37 1 2 2 3 4 2 14 # # ... with 32 more rows ``` ] -- .pull-right[ ![](img/fx_mutate.JPG) ] --- # Renaming variables: **`rename()`** -- The job of `rename()` is to rename existing columns. -- .pull-left[ ```r df_csv %>% filter(Dx == "CONTROL") %>% select(ID, q1:q6) %>% rename(WB_1 = q1, WN_2 = q2) ``` ``` # # A tibble: 42 x 7 # ID WB_1 WN_2 q3 q4 q5 q6 # <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> # 1 16 3 2 1 4 1 3 # 2 17 2 2 2 3 5 2 # 3 19 4 5 5 2 1 2 # 4 20 4 1 5 2 3 2 # 5 21 3 1 3 5 1 5 # 6 22 3 2 2 3 1 3 # 7 23 3 5 2 2 4 3 # 8 24 4 2 4 1 4 5 # 9 25 4 4 5 1 3 5 # 10 37 1 2 2 3 4 2 # # ... with 32 more rows ``` ] -- .pull-right[ ![](img/fx_rename.JPG) ] --- # Grouping variables and summarizing data: **`group_by()`** and **`summarize()`** -- These two are usually a couple since we want to use them to create group summaries. 
-- .pull-left[ ```r df_csv %>% group_by(Dx, Sex) %>% summarize(Edu_mean = mean(Education), MMSE_mean = mean(MMSE)) %>% rename(Education = Edu_mean, MMSE = MMSE_mean) ``` ``` # # A tibble: 12 x 4 # # Groups: Dx [6] # Dx Sex Education MMSE # <chr> <chr> <dbl> <dbl> # 1 AD Female 17.0 19.0 # 2 AD Male 17.7 19.6 # 3 bvFTD Female 17.5 21.5 # 4 bvFTD Male 17.5 21.3 # 5 CONTROL Female 16.3 27.1 # 6 CONTROL Male 16.4 27.9 # 7 nfvPPA Female 17.4 19.8 # 8 nfvPPA Male 16.9 19.1 # 9 PSP Female 16.7 19.6 # 10 PSP Male 17 18 # 11 svPPA Female 17.8 20.2 # 12 svPPA Male 16.9 15.8 ``` ] -- .pull-right[ ![](img/fx_group_summarize.JPG) ] --- # Transforming between wide and long data: **`gather()`** -- `gather()` turns wide data into long format -- .pull-left[ ```r df_csv %>% filter(Dx %in% c("AD", "bvFTD", "svPPA")) %>% group_by(Dx) %>% summarize(Edu_mean = mean(Education), MMSE_mean = mean(MMSE)) ``` ``` # # A tibble: 3 x 3 # Dx Edu_mean MMSE_mean # <chr> <dbl> <dbl> # 1 AD 17.3 19.2 # 2 bvFTD 17.5 21.4 # 3 svPPA 17.4 18.2 ``` <img src="img/fx_gather.JPG" width="300"> ] -- .pull-right[ ```r df_csv %>% filter(Dx %in% c("AD", "bvFTD", "svPPA")) %>% group_by(Dx) %>% summarize(Edu_mean = mean(Education), MMSE_mean = mean(MMSE)) %>% gather(key = "Cog", value = "Score", Edu_mean, MMSE_mean) ``` ``` # # A tibble: 6 x 3 # Dx Cog Score # <chr> <chr> <dbl> # 1 AD Edu_mean 17.3 # 2 bvFTD Edu_mean 17.5 # 3 svPPA Edu_mean 17.4 # 4 AD MMSE_mean 19.2 # 5 bvFTD MMSE_mean 21.4 # 6 svPPA MMSE_mean 18.2 ``` ] --- # Transforming between wide and long data: **`spread()`** -- `spread()` turns long format into wide format -- .pull-left[ ```r df_csv %>% filter(Dx %in% c("AD", "bvFTD", "svPPA")) %>% group_by(Dx) %>% summarize(Edu_mean = mean(Education), MMSE_mean = mean(MMSE)) %>% gather(key = "Cog", value = "Score", Edu_mean, MMSE_mean) ``` ``` # # A tibble: 6 x 3 # Dx Cog Score # <chr> <chr> <dbl> # 1 AD Edu_mean 17.3 # 2 bvFTD Edu_mean 17.5 # 3 svPPA Edu_mean 17.4 # 4 AD MMSE_mean 19.2 # 5 bvFTD 
MMSE_mean 21.4 # 6 svPPA MMSE_mean 18.2 ``` ] -- .pull-right[ ```r df_csv %>% filter(Dx %in% c("AD", "bvFTD", "svPPA")) %>% group_by(Dx) %>% summarize(Edu_mean = mean(Education), MMSE_mean = mean(MMSE)) %>% gather(key = "Cog", value = "Score", Edu_mean, MMSE_mean) %>% spread(key = "Cog", value = "Score") ``` ``` # # A tibble: 3 x 3 # Dx Edu_mean MMSE_mean # <chr> <dbl> <dbl> # 1 AD 17.3 19.2 # 2 bvFTD 17.5 21.4 # 3 svPPA 17.4 18.2 ``` <img src="img/fx_spread.JPG" width="300"> ] --- class: center, middle # Thanks! # Now let's get hands-on! Slides created via the R package [**xaringan**](https://github.com/yihui/xaringan).