##############################################################################################
# Tidyverse workshop | 19 August 2022 | Author: Nick Roxburgh
##############################################################################################

##############################################################################################
# Installing and loading tidyverse
##############################################################################################

#################################
# Install the required packages #
#################################

# The following packages will be used during the workshop: tidyverse, readxl, magrittr,
# lubridate, sf, rmapshaper, starwarsdb, ggfx, and units.

# Individual packages can be installed as follows: install.packages("").
# Alternatively, a vector of package names can be passed to the function:
# install.packages(c("", "")).

##############################
# Load the required packages #
##############################

# Packages can be loaded once installed.
library(tidyverse)
library(readxl)
library(sf)
library(magrittr)
library(lubridate)
library(starwarsdb)

##############################################################################################

##############################################################################################
# Importing data
##############################################################################################

# Save this script in a folder of its own, then set your working directory to this location
# using the following command. The call relies on the RStudio API, so it is only attempted
# when that API is available; outside RStudio, set the working directory yourself.
if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
}

# We can check it has worked as expected using getwd():
getwd()

# The script works with two subfolders titled `RawData` and `ProcessedData`. They are created
# here if they do not exist yet, so that the write steps below cannot fail on a missing
# folder. dir.create() leaves an existing folder untouched.
dir.create("RawData", showWarnings = FALSE)
dir.create("ProcessedData", showWarnings = FALSE)

# Save the following datasets to the `RawData` subfolder:
# 1. .csv file: https://ourairports.com/countries/MW/airports.csv
#    [https://data.humdata.org/dataset/ourairports-mwi "mw-airports.csv"]
# 2. .xlsx file: https://map.ipcinfo.org/api/public/population-tracking-tool/data/2017,2022/?export=true&condition=A&country=MW
#    [https://data.humdata.org/dataset/malawi-acute-food-insecurity-country-data, "Malawi - IPC Analysis 2017-2022.xlsx"]
# 3. .gpkg file: https://geodata-eu-central-1-kontur-public.s3.eu-central-1.amazonaws.com/kontur_datasets/kontur_boundaries_MW_20220407.gpkg.gz
#    [https://data.humdata.org/dataset/kontur-boundaries-malawi "kontur_boundaries_MW_20220407.gpkg"]

############################
# Read and write .csv data #
############################

# To read in the .csv data, we use the read_csv() function from the readr package, and specify
# the path to our .csv file:
MW_Airports <- read_csv("RawData/mw-airports.csv")

# We can save it to our `ProcessedData` folder using write_csv():
write_csv(MW_Airports, "ProcessedData/MW_Airports.csv")

##########################
# Read in the .xlsx data #
##########################

# To read in the .xlsx data, we use the read_xlsx() function from the readxl package, and
# specify the path to our .xlsx file, along with the sheet name or number and cell range:
MW_FoodInsecurity <- read_xlsx("RawData/Malawi - IPC Analysis 2017-2022.xlsx",
                               sheet = 1,
                               range = "A10:AK45")

# There are no functions in readxl for saving Excel file formats.
# The food insecurity table is not used again in this script, so we remove it.
rm(MW_FoodInsecurity)

##################################
# Read in the .gpkg spatial data #
##################################

# To read in the .gpkg data, we use the st_read() function from the sf package:
MW_AdminBoundaries <- st_read("RawData/kontur_boundaries_MW_20220407.gpkg")

# We can save it to our `ProcessedData` folder using st_write. If we wished, we could
# save it as another format (e.g., .shp) simply by changing the file extension.
# `append = FALSE` replaces the layer if it already exists, so the script can be re-run
# without st_write() stopping because the output was already written by an earlier run.
st_write(MW_AdminBoundaries, "ProcessedData/MW_AdminBoundaries.gpkg", append = FALSE)

##################################################
# Convert the MW_Airports tibble to an sf object #
##################################################

# The latitude and longitude columns can be converted to an sf geometry column.
# NOTE(review): the airports are given the CRS of MW_AdminBoundaries, which assumes that
# layer uses a geographic CRS matching the longitude/latitude degrees - confirm.
MW_Airports <- st_as_sf(MW_Airports,
                        coords = c("longitude_deg", "latitude_deg"),
                        crs = st_crs(MW_AdminBoundaries))

# For further functionality, see the readr cheatsheet which is available here:
# https://raw.githubusercontent.com/rstudio/cheatsheets/main/data-import.pdf

##############################################################################################

##############################################################################################
# Tidying data
##############################################################################################

# We'll create a wide format table to work with here.
dat <- tibble(country = c("AT", "BE", "CR"),
              `2020` = c(79, 102, 32),
              `2021` = c(92, 99, 37))

# Convert the data from wide to long format. Column names move to the names_to column and
# values to the values_to column.
dat <- pivot_longer(dat, cols = 2:3, names_to = "year", values_to = "cases")

# Now we will widen the data once more. Typically, one column provides the new column names,
# the other the values. We use this if we have multiple variables stacked in a column.
dat <- pivot_wider(dat, names_from = "year", values_from = "cases")

rm(dat)

# Let's use drop_na() to drop all rows that lack a location identifier code (i.e. iata_code)
# from our MW_Airports tibble.
MW_Airports <- MW_Airports %>%
  # Keep only the airports that have an iata_code.
  drop_na(iata_code) %>%
  # Unite the continent, country_name, and iso_country columns into a single column...
  unite(col = "UnitedCol", c(continent, country_name, iso_country), sep = "_") %>%
  # ...and separate them once more into the three original columns.
  separate(col = "UnitedCol",
           into = c("continent", "country_name", "iso_country"),
           sep = "_") %>%
  # Finish by making sure the result is an sf object again after the tidyr steps.
  st_as_sf()

# The tidyr cheatsheet can be found here:
# https://raw.githubusercontent.com/rstudio/cheatsheets/main/tidyr.pdf

##############################################################################################

##############################################################################################
# Transforming data with dplyr, purrr, forcats, stringr, and lubridate
##############################################################################################

# We'll use the Star Wars dataset here.
starwars

################################
# Use dplyr to manipulate data #
################################

# Choose rows based on column values
filter(starwars, homeworld == "Alderaan")

# Select rows based on location (here: from the fifth row to the last)
slice(starwars, 5:n())

# Order rows by the values of selected columns (tallest first)
arrange(starwars, desc(height))

# Select columns to retain
select(starwars, name, homeworld, species)

# Change the name of columns
rename(starwars, homeland = homeworld)

# Change the location of a column
relocate(starwars, species, .after = name)

# Create new variables or apply changes to existing ones. Height is multiplied by 0.01
# before squaring, i.e. the BMI is computed from mass and height / 100.
mutate(starwars, bmi = mass / (height * 0.01)^2)

# Collapse groups of rows into a single-row summary. TRUE is spelled out because the
# shorthand T is an ordinary variable that can be reassigned.
summarise(starwars,
          minHeight = min(height, na.rm = TRUE),
          maxHeight = max(height, na.rm = TRUE))

# The dplyr cheatsheet can be found here:
# https://raw.githubusercontent.com/rstudio/cheatsheets/main/data-transformation.pdf

################################################
# Use purrr to work with functions and vectors #
################################################

# Use map() to apply a function to each element of a list or vector. Here, we return the number
# of distinct values in each column of the Star Wars data
map(starwars, n_distinct)

# The purrr cheatsheet can be found here:
# https://raw.githubusercontent.com/rstudio/cheatsheets/main/purrr.pdf

####################################
# Use forcats to work with factors #
####################################

# Reorder factor levels by sorting along another variable - sort the characters' factor
# levels by height.
StarwarsReordered <- fct_reorder(starwars$name, starwars$height, .fun = min)
levels(StarwarsReordered)

# Collapse the least frequent values of a factor into an "Other" category, keeping the three
# most frequent species. fct_lump_n() is the specific variant that fct_lump() dispatches to
# when `n` is supplied.
starwars %>%
  mutate(species = fct_lump_n(species, n = 3)) %>%
  count(species)

# The forcats cheatsheet can be found here:
# https://raw.githubusercontent.com/rstudio/cheatsheets/main/factors.pdf

####################################
# Use stringr to work with strings #
####################################

# Sort character names alphabetically:
str_sort(starwars$name, decreasing = FALSE)

# Seek the presence of The Force...
str_detect(starwars$name, "Yod")

# Correct the record:
str_replace_all(starwars$name, "Anakin Skywalker", "Darth Vader")

# The stringr cheatsheet can be found here:
# https://raw.githubusercontent.com/rstudio/cheatsheets/main/strings.pdf

####################################
# Use lubridate to work with dates #
####################################

# Create a POSIXct object:
ymd_hms("2022-08-19 10:00:00", tz = "UTC")

# Round date-times:
floor_date(ymd("2022-08-19"), unit = "year")

# Perform math on date-times
ymd("2022-08-19") + days(5)

# Define a time interval:
interval(start = ymd("2022-01-05"), end = ymd("2022-01-31"))

# The lubridate cheatsheet can be found here:
# https://raw.githubusercontent.com/rstudio/cheatsheets/main/lubridate.pdf

##############################################################################################
# Pipes
##############################################################################################

# Calculate the mean height by planet of blue eyed, female characters in Star Wars by using
# a series of piped commands. Missing heights are ignored (na.rm = TRUE) so that a single
# NA does not turn the mean of a whole planet into NA.
starwars %>%
  filter(eye_color == "blue" & sex == "female") %>%
  group_by(homeworld) %>%
  summarise(mean_height = mean(height, na.rm = TRUE))

##############################################################################################
# Visualising data with ggplot2
##############################################################################################

# Create a barplot. The three most frequent species are kept, the rest are lumped into
# "Other", and the levels are ordered by frequency.
dat <- starwars %>%
  mutate(species = fct_lump_n(species, n = 3, other_level = "Other"),
         species = fct_infreq(species)) %>%
  count(species) %>%
  drop_na()

ggplot(dat, aes(x = species, y = n)) +
  geom_bar(stat = "identity")

# And a column plot:
dat <- starwars %>%
  mutate(species = fct_lump_n(species, n = 3, other_level = "Other"),
         species = fct_infreq(species)) %>%
  count(species) %>%
  drop_na()

ggplot(dat, aes(x = species, y = n)) +
  geom_col()

# Create a step plot:
dat <- tibble(ReleaseDate = c(1977, 1980, 1983, 1999, 2002, 2005, 2015, 2017, 2019),
              Episode = c("IV", "V", "VI", "I", "II", "III", "VII", "VIII", "IX"),
              TotalEpisodes = 1:9)

ggplot(dat, aes(x = ReleaseDate, y = TotalEpisodes)) +
  geom_step()

# Create a scatter plot. Characters whose standardised BMI lies outside +/- 2.5 are dropped.
# scale() returns a one-column matrix, so it is wrapped in as.numeric() to store zBmi as an
# ordinary numeric column.
dat <- starwars %>%
  drop_na(mass, height) %>%
  mutate(bmi = mass / (height * 0.01)^2,
         zBmi = as.numeric(scale(bmi))) %>%
  filter(between(zBmi, -2.5, +2.5))

ggplot(dat, aes(x = mass, y = height)) +
  geom_point()

# Now colour the points by species, tweak their aesthetics, and specify scales:
dat <- starwars %>%
  drop_na(mass, height, species) %>%
  mutate(species = fct_lump_n(species, n = 3, other_level = "Other"),
         species = fct_infreq(species)) %>%
  mutate(bmi = mass / (height * 0.01)^2, .after = mass) %>%
  mutate(zBmi = as.numeric(scale(bmi))) %>%
  filter(between(zBmi, -2.5, +2.5))

ggplot(dat, aes(x = mass, y = height, colour = species)) +
  geom_point(alpha = 0.5, shape = "triangle", size = 3) +
  scale_x_continuous(name = "Mass (kg)",
                     limits = c(0, 165),
                     breaks = seq(0, 160, 40),
                     expand = c(0, 0)) +
  scale_y_continuous(name = "Height (cm)",
                     limits = c(0, 255),
                     breaks = seq(0, 250, 50))

# Define a theme. theme_cleanGrey() takes no arguments and returns theme_bw() combined with
# the overrides below, so it can be added to a plot like any built-in theme.
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for line elements; `size`
# is kept here so the script keeps working with older ggplot2 versions.
theme_cleanGrey <- function() {
  theme_bw() +
    theme(
      plot.background = element_rect(fill = "#F5F5F5"),
      plot.margin = unit(c(.5, .5, .5, .5), "cm"),
      panel.grid.minor = element_blank(),
      panel.grid.major = element_line(linetype = "dashed", colour = "black", size = 0.2),
      panel.border = element_blank(),
      panel.background = element_blank(),
      axis.line = element_line(colour = "black"),
      axis.text = element_text(face = "italic", colour = "#606060"),
      axis.title = element_text(colour = "black"),
      legend.title = element_text(face = "italic"),
      legend.background = element_blank(),
      legend.key = element_blank(),
      legend.position = "right"
    )
}

# Apply the theme to our plot and add titles:
dat <- starwars %>%
  drop_na(mass, height, species) %>%
  mutate(species = fct_lump_n(species, n = 3, other_level = "Other"),
         species = fct_infreq(species)) %>%
  mutate(bmi = mass / (height * 0.01)^2, .after = mass) %>%
  mutate(zBmi = as.numeric(scale(bmi))) %>%
  filter(between(zBmi, -2.5, +2.5))

ggplot(dat, aes(x = mass, y = height, colour = species)) +
  geom_point(alpha = 0.5, shape = "triangle", size = 3) +
  scale_x_continuous(name = "Mass (kg)",
                     limits = c(0, 165),
                     breaks = seq(0, 160, 40),
                     expand = c(0, 0)) +
  scale_y_continuous(name = "Height (cm)",
                     limits = c(0, 255),
                     breaks = seq(0, 250, 50),
                     expand = c(0, 0)) +
  theme_cleanGrey() +
  scale_colour_discrete("Species") +
  ggtitle(label = "Starwars character height and weight",
          subtitle = "Data: Aden-Buie (2020)")

# The ggplot2 cheatsheet can be found here:
# https://raw.githubusercontent.com/rstudio/cheatsheets/main/data-visualization.pdf

##############################################################################################
# Working with & visualising spatial data using sf & ggplot2
##############################################################################################

# Load the additional packages required for this exercise:
library(rmapshaper)
library(ggfx)
library(units)

# Let's start by plotting the Malawi data we worked with earlier.

# We'll first disable scientific notation so it is not used in the plot.
options(scipen = 999)

# Next, we will select the administrative level we wish to plot using filter from the dplyr
# package.
MW_AdminBoundaries <- MW_AdminBoundaries %>%
  filter(admin_level == 5) %>%
  mutate(
    # We'll now use st_area from the sf package to calculate the area of each polygon. The
    # value returned will be in m^2.
    area = st_area(.),
    # Convert from m^2 to km^2 using set_units from the units package.
    area = set_units(area, km^2),
    # Units does not play well with certain functions, e.g. ms_simplify from the
    # rmapshaper package. Where this is an issue, we can convert from units to numeric.
    area = as.numeric(area),
    # Now we can calculate population per km^2.
    population_density = population / area
  ) %>%
  # The MW_AdminBoundaries data is large, so will be slow to plot. We can simplify it first
  # using ms_simplify() from the rmapshaper package, specifying the proportion of vertices
  # to retain as 5%.
  ms_simplify(keep = 0.05)

# Our data is now ready to plot.
ggplot() +
  # We'll begin by creating a choropleth map showing the population density we just
  # calculated. To make the map visually "pop", we will add a drop shadow which we can do by
  # wrapping the with_shadow() function from the ggfx package around the geom_sf(). We can
  # adjust the blur, colour, and offset of the shadow with the arguments after the geom_sf().
  with_shadow(
    geom_sf(data = MW_AdminBoundaries,
            aes(fill = population_density),
            color = "white",
            lwd = 0.25),
    sigma = 1,
    colour = "darkgrey",
    x_offset = 0,
    y_offset = 0
  ) +
  # Here we specify the colour scale we want to use for our choropleth. The legend title
  # states that the filled value is a density, i.e. population per km^2.
  scale_fill_continuous(low = "#CFFFE6",
                        high = "#008252",
                        na.value = "#E9E9E9",
                        name = "Population per km^2",
                        labels = scales::comma) +
  # Now we add an additional layer, showing our filtered down airport locations.
  geom_sf(data = MW_Airports, colour = "black", size = 1, shape = 17) +
  # Now we specify one of ggplot2's default themes, and then some manual tweaks to the theme.
  theme_void() +
  theme(
    legend.key.width = unit(0.2, "cm"),
    legend.spacing.x = unit(0.08, "cm"),
    legend.title = element_text(size = 10, face = "italic")
  )

# The sf cheatsheet can be found here:
# https://raw.githubusercontent.com/rstudio/cheatsheets/main/sf.pdf