# Note: when aggregating data, 'attendance' is the total attendance, while all
# other numeric columns are the mean of the respective variable.
# Dean Attali
# July 11, 2017
#
# This script scrapes data from the useR 2017 scheduling website to see the
# attendance preferences of conference attendees.
# This code was written in a hurry while on a train, and it is far
# from being "great and robust" scraping code, so beware trying to copy any of
# this code :)

library(rvest)

base_url <- "https://user2017.sched.com"

# Scrape one event page and return its details as a one-row tibble.
#
# event_id: identifier appended to base_url as "/event/<event_id>".
# Returns a tibble with the columns type, title, attendance, speaker, time,
# room and url (the full URL that was fetched).
get_event_data <- function(event_id) {
  full_url <- paste0(base_url, "/event/", event_id)
  page <- read_html(full_url)

  # Trimmed text of the first node matching a CSS selector
  node_text <- function(css) {
    html_text(html_node(page, css), trim = TRUE)
  }

  event_title <- node_text(".event a.name")

  # The attendee heading holds the count in parentheses; keep only that part
  attendee_heading <- node_text("#sched-page-event-attendees h2")
  attendee_count <- as.numeric(
    sub(pattern = ".*\\((.*)\\).*", replacement = "\\1", x = attendee_heading)
  )

  first_speaker <- node_text(".sched-person h2 a")

  # Pull the "July ..." start timestamp out of the time-and-place text and
  # parse it as a CET date-time
  when_where <- node_text(".sched-event-details-timeandplace")
  start_text <- sub(pattern = ".*(July.*) -.*", replacement = "\\1",
                    x = when_where)
  start_time <- as.POSIXct(
    strptime(start_text, "%B %d, %Y %I:%M%p", tz = "CET")
  )

  room_name <- node_text(".sched-event-details-timeandplace a")
  event_type <- node_text(".sched-event-type a")

  tibble::tibble(
    type = event_type,
    title = event_title,
    attendance = attendee_count,
    speaker = first_speaker,
    time = start_time,
    room = room_name,
    url = full_url
  )
}

# Read the schedule's main page, pull the event ID out of every event link,
# then scrape each event page and row-bind the one-row results
content <- read_html(base_url)
event_ids <- sub(
  "/event/(.*)/.*",
  "\\1",
  html_attr(html_nodes(content, ".event a.name"), "href")
)
all_talks <- purrr::map_df(event_ids, get_event_data)


# Remove RIOT SESSION because it's the only session that is not the same type
# of session as everything else in its time slot. It's also the lowest
# attendance with 21, so not too interesting as far as this dataset is concerned
#
# A logical mask is used instead of `-which(...)`: when no title matches,
# which() returns integer(0) and negative indexing with it would drop every
# row. `%in%` never yields NA, so rows with a missing title are kept.
all_talks <- all_talks[!(all_talks$title %in% "RIOT SESSION"), ]

# Make sure all concurrent sessions are of the same type.
# Talks are grouped by start time and the number of distinct `type` values in
# each group is counted; the script stops here unless every group has exactly
# one type. The aggregation further down takes the first `type` of each time
# slot.
stopifnot(
  all(dplyr::group_by(all_talks, time) %>%
        dplyr::summarize(num_types = length(unique(type))) %>%
        .$num_types == 1)
)

# Add some aggregate attendance data.
# NOTE(review): 966 is hard-coded; presumably the total number of conference
# attendees -- confirm the source of this figure.
total_attendance <- 966

# One row per time slot: how many sessions ran at that time, their combined
# attendance, and the slot's session type (first value in the group).
# `expected` is total_attendance split evenly across the concurrent sessions.
# n() is namespace-qualified because dplyr is never attached in this script;
# a bare n() is not found when only rvest is loaded.
aggregate_data <- all_talks %>%
  dplyr::group_by(time) %>%
  dplyr::summarize(
    concurrent_sessions = dplyr::n(),
    concurrent_attendance = sum(attendance),
    type = type[1]
  ) %>%
  dplyr::mutate(expected = round(total_attendance / concurrent_sessions))

# Attach the per-slot figures to every talk and express each talk's attendance
# as a ratio of the even-split expectation, rounded to two decimals
all_talks <- all_talks %>%
  dplyr::left_join(aggregate_data, by = c("time", "type")) %>%
  dplyr::mutate(attendance_ratio = round(attendance / expected, 2))

# Use each row's name as its unique id, then write the table to a CSV file
# without a separate row-name column
all_talks[["id"]] <- row.names(all_talks)
write.csv(x = all_talks, file = "all_talks.csv", row.names = FALSE)
# More apps by Dean