# Dean Attali
# July 11, 2017
#
# This script scrapes data from the useR 2017 scheduling website to see the
# attendance preferences of conference attendees.
# There were 1,161 attendees in total, and 966 of them used the Sched app.
# This code was written in a hurry while on a train, and it is far
# from being "great and robust" scraping code, so beware trying to copy any of
# this code :)
library(rvest)
base_url <- "https://user2017.sched.com"
# Scrape one event page and return its details as a one-row tibble.
#
# event_id: the identifier appended to "<base_url>/event/" to form the page URL.
# Returns a tibble with the columns type, title, attendance, speaker, time,
# room and url. Relies on the file-level `base_url`.
get_event_data <- function(event_id) {
  full_url <- paste0(base_url, "/event/", event_id)
  page <- read_html(full_url)

  # Trimmed text of the first node matching a CSS selector
  node_text <- function(css) {
    html_text(html_node(page, css), trim = TRUE)
  }

  title <- node_text(".event a.name")

  # The attendance count is whatever sits inside the parentheses of the
  # attendees heading
  attendance_text <- node_text("#sched-page-event-attendees h2")
  attendance <- as.numeric(
    sub(pattern = ".*\\((.*)\\).*", replacement = "\\1", x = attendance_text)
  )

  speaker <- node_text(".sched-person h2 a")

  # Keep only the part starting at "July" and ending before " -", then parse
  # it as a CET timestamp
  when_text <- node_text(".sched-event-details-timeandplace")
  start_text <- sub(pattern = ".*(July.*) -.*", replacement = "\\1", x = when_text)
  time <- as.POSIXct(strptime(start_text, "%B %d, %Y %I:%M%p", tz = "CET"))

  room <- node_text(".sched-event-details-timeandplace a")
  type <- node_text(".sched-event-type a")

  tibble::tibble(
    type = type,
    title = title,
    attendance = attendance,
    speaker = speaker,
    time = time,
    room = room,
    url = full_url
  )
}
# Read the main schedule page, pull the href of every event link, reduce each
# href to its event ID, then scrape every event page into one data frame
content <- read_html(base_url)
event_ids <- sub(
  "/event/(.*)/.*",
  "\\1",
  html_attr(html_nodes(content, ".event a.name"), "href")
)
all_talks <- purrr::map_df(event_ids, get_event_data)
# Remove RIOT SESSION because it's the only session that is not the same type
# of session as everything else in its time slot. It's also the lowest
# attendance with 21, so not too interesting as far as this dataset is concerned.
# A logical mask is used instead of -which(): when no title matches, -which()
# evaluates to integer(0), which selects zero rows and would silently drop the
# whole dataset. %in% never returns NA, so rows with a missing title are kept.
all_talks <- all_talks[!(all_talks$title %in% "RIOT SESSION"), ]
# Sanity check: within every time slot, all sessions must share a single type
stopifnot(
  all(
    dplyr::summarize(
      dplyr::group_by(all_talks, time),
      num_types = length(unique(type))
    )$num_types == 1
  )
)
# Add some aggregate attendance data.
# For every time slot compute the number of concurrent sessions, their combined
# attendance, and the attendance each session would get if total_attendance
# were split evenly across the slot's sessions (`expected`).
total_attendance <- 966
aggregate_data <- all_talks %>%
  dplyr::group_by(time) %>%
  dplyr::summarize(
    # n() must be namespace-qualified: dplyr is never attached in this script,
    # so a bare n() is not found by current dplyr versions
    concurrent_sessions = dplyr::n(),
    concurrent_attendance = sum(attendance),
    type = type[1]
  ) %>%
  dplyr::mutate(expected = round(total_attendance / concurrent_sessions))
# Attach the per-slot aggregates to each talk and express its attendance
# relative to the even-split expectation
all_talks <- all_talks %>%
  dplyr::left_join(aggregate_data, by = c("time", "type")) %>%
  dplyr::mutate(attendance_ratio = round(attendance / expected, 2))
# Use the row names as a unique id column, then write everything to CSV
all_talks[["id"]] <- rownames(all_talks)
write.csv(x = all_talks, file = "all_talks.csv", row.names = FALSE)