RSelenium Data Collection
The captions were collected from DownSub using the RSelenium package. For those looking for a tutorial on web scraping with this package, you can find one here. Several tools are needed to get this specific script to work:
- Chromium Browser for Automation.
- Chrome WebDriver for RSelenium.
- Adblock needed for blocking popups.
- Java must be installed and the PATH variable must be updated with its location.
This demo script only goes through five examples; change ‘stop_row’ to adjust this. Each scrape takes at least three seconds, so with 48,000 videos the full run should take nearly two complete days.
library(RSelenium) # R Scraping
library(tidyverse) # Data Manipulation
library(netstat) # Network Statistics
library(base64enc) # Base64 Encoding
library(wdman) # Webdriver Manager
# Locations used by the browser session. All three are machine-specific and
# must be edited before running the script elsewhere.
chrome_path <- "/Google/Chrome/Chromium/chrome.exe" # Chromium binary to automate
download_directory <- "D:\\files" # Folder where the caption .txt files land
filepath <- "Extensions/ohahllgiabjaoigichmmfljhkcfikeof/3_8_7_0.crx" # Packed extension (.crx)

# Given a file name, base64encode() reads that file and encodes its contents,
# so this holds the encoded extension itself rather than the encoded path.
extension_base64 <- base64enc::base64encode(what = filepath)
Creating DF
# Build the work list: one row per video published on or after the cutoff
# date, holding the page URL to visit, a status slot filled in by the scraping
# loop, and the file name the download is expected to have.
new_vids <- read.csv("videos_complete.csv") # Loading in Videos

# URL of the page the scraper opens for each video.
new_vids2 <- new_vids %>%
  mutate(URL = paste0("https://www.subtitle.to/youtube.com/watch?v=", videoId))

threshold_date <- as.Date("2017-01-03")

# Parse the dates explicitly instead of relying on the implicit conversion in
# the comparison. Assumes publishedAt holds ISO-style date strings - TODO
# confirm against the CSV.
published_on <- as.Date(new_vids2$publishedAt)

# which() drops rows whose date is missing; indexing with the raw logical
# vector would insert an all-NA row for every NA date.
videos_after_threshold <- new_vids2[which(published_on >= threshold_date), ]

# Size the placeholder columns explicitly: a bare NA cannot be recycled to
# zero rows, so data.frame() would error if no video passed the date filter.
n_videos <- nrow(videos_after_threshold)
download_status <- data.frame(
  URL = videos_after_threshold$URL,
  videoId = videos_after_threshold$videoId,
  Status = rep(NA_character_, n_videos),
  title = videos_after_threshold$videoTitle,
  file_name = rep(NA_character_, n_videos)
)

# paste0() returns a length-one string for zero-length input, which cannot be
# assigned into a 0-row data frame, so only build names when there are rows.
if (n_videos > 0) {
  download_status$file_name <- paste0(
    "[English (auto-generated)] ", download_status$title, " [DownSub.com].txt"
  )
  # Colons in the title are swapped for underscores in the expected file name.
  download_status$file_name <- gsub(
    ":", "_", download_status$file_name,
    fixed = TRUE
  )
}

start_row <- 1 # First row of download_status to process
stop_row <- 5 # Last row to process (the demo only covers five videos)
save_interval <- 2000 # Write a checkpoint every this many rows
downloaded_count <- 0
no_captions_count <- 0
Setting up RSelenium
# Chrome preferences: the pop-up content setting and the folder that
# downloads are written to.
chrome_prefs <- list(
  "profile.default_content_settings.popups" = 0,
  "download.default_directory" = download_directory
)

# Capabilities handed to the driver: the preferences above, the base64-encoded
# extension to install, and the browser binary to launch.
cprof <- list(
  chromeOptions = list(
    prefs = chrome_prefs,
    extensions = list(extension_base64),
    binary = chrome_path
  )
)

# Start the Selenium server and a Chrome session on a port that is currently
# free. The chromever value must match the installed browser build.
driver <- rsDriver(
  browser = "chrome",
  chromever = "114.0.5735.90",
  port = free_port(),
  extraCapabilities = cprof,
  verbose = FALSE
)

# Client handle used to drive the browser.
remDr <- driver$client
Loop that starts the process
# Visit each pending video page, click the download button if it is present,
# and record the outcome in download_status$Status.

# CSS path of the download button; it does not change between iterations.
selector <- "#app > div > main > div > div.container.ds-info.outlined > div > div.row.no-gutters > div.pr-1.col-sm-7.col-md-6.col-12 > div.flex.mt-5.text-center > div.layout.justify-start.align-center > button:nth-child(2)"

# Clamp the range to the rows that exist. start_row:last_row would count
# backwards (e.g. 1:0) when there is nothing to process, so build it explicitly.
last_row <- min(stop_row, nrow(download_status))
rows_to_process <- if (last_row >= start_row) {
  seq(start_row, last_row)
} else {
  integer(0)
}

for (i in rows_to_process) {
  # Rows that already have a status were handled in an earlier run.
  if (!is.na(download_status$Status[i])) {
    next
  }

  url <- download_status$URL[i]
  remDr$navigate(url)
  Sys.sleep(3) # Give the page time to render before looking for the button

  # findElement() raises an error when nothing matches, and that is treated as
  # "this video has no captions".
  # NOTE(review): any other lookup error is recorded as "No Captions" as well.
  button <- tryCatch({
    remDr$findElement(using = "css selector", value = selector)
  }, error = function(e) {
    NULL
  })

  if (is.null(button)) {
    download_status$Status[i] <- "No Captions"
    no_captions_count <- no_captions_count + 1
  } else {
    button$clickElement()
    Sys.sleep(3) # Wait after the click before moving to the next page
    # NOTE(review): the status is set after the click; the file on disk is
    # never checked.
    download_status$Status[i] <- "Downloaded"
    downloaded_count <- downloaded_count + 1
  }

  # Print progress
  cat(sprintf("Processed %d rows. Downloaded: %d, No Captions: %d\n", i, downloaded_count, no_captions_count))

  # Periodic checkpoint so a long run can be resumed.
  if (i %% save_interval == 0) {
    saveRDS(download_status, paste0("download_status_", i, ".rds"))
  }
}

# Final checkpoint: without it, rows processed after the last multiple of
# save_interval (or the whole run, when it is shorter than one interval)
# would exist only in memory.
if (length(rows_to_process) > 0 && last_row %% save_interval != 0) {
  saveRDS(download_status, paste0("download_status_", last_row, ".rds"))
}
## Processed 1 rows. Downloaded: 1, No Captions: 0
## Processed 2 rows. Downloaded: 2, No Captions: 0
## Processed 3 rows. Downloaded: 3, No Captions: 0
## Processed 4 rows. Downloaded: 4, No Captions: 0
## Processed 5 rows. Downloaded: 5, No Captions: 0
Closing the RSelenium process
# Tear down in order: end the browser session, then stop the Selenium server
# that rsDriver() started.
remDr$close()
selenium_server <- driver[["server"]]
selenium_server$stop()
## [1] TRUE