Replication of Figures (Thesis)

Replication of Figures (Thesis)

Preparation

Packages to Load.

library(dplyr) # For Data Manipulation
library(lme4)  # For Linear Mixed-effects Modeling
library(lmerTest) # For Type III Anova
library(ggplot2) # For Creating Customized Figures
library(gridExtra) # For Two ggplots Next to Each Other
library(corrplot) # Correlation plot

Encoding and Data.

# Input settings. The CSV file must be located in the working directory.
encoding <- "UTF-8"
csv_file <- "sen_youtube_data.csv"

# Read the data set that the figures below are drawn from.
df <- read.csv(file = csv_file, fileEncoding = encoding)

Figure 5.1 Histogram of View Count

# Histogram of log_viewCount with unit-wide bins aligned to a boundary at -1.
figure_5_1 <- ggplot(data = df, mapping = aes(x = log_viewCount)) +
  geom_histogram(
    binwidth = 1,
    boundary = -1,
    fill = "gray70",
    color = "black",
    alpha = 0.7
  ) +
  labs(x = "View Count (Log)", y = "Frequency") +
  theme_minimal() +
  # One theme() call: 12pt text, padded axis titles, bold tick labels,
  # and no grid lines.
  theme(
    plot.title = element_text(hjust = 0.5),
    text = element_text(size = 12),
    axis.title.x = element_text(size = 12, margin = margin(t = 10)),
    axis.title.y = element_text(size = 12, margin = margin(r = 10)),
    axis.text = element_text(size = 12, face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
figure_5_1

Helper function that applies the shared styling to each sentiment histogram, so the customization is not written twice

# Draw a styled histogram of a single column of `data`.
#
# Arguments:
#   data             Data frame holding the column to plot.
#   sentiment_column Name of the column to plot, given as a string.
#   binwidth         Width of each histogram bin (default 1/10).
#   title            Plot title (default: empty string).
#   x_label          Label of the x axis (default "Sentiment Score").
#
# Returns a ggplot object. Stops with an error if `sentiment_column` does not
# name exactly one column of `data`.
plot_sentiment_histogram <- function(data,
                                     sentiment_column,
                                     binwidth = 1/10,
                                     title = "",
                                     x_label = "Sentiment Score") {
  # Validate up front so a misspelled column fails here with a clear message
  # rather than later, when the plot is rendered.
  column_name <- as.character(sentiment_column)
  if (length(column_name) != 1L || !column_name %in% names(data)) {
    stop(
      "Column '", paste(column_name, collapse = ", "),
      "' not found in `data`.",
      call. = FALSE
    )
  }

  # `.data[[...]]` looks the column up by its string name inside aes().
  ggplot(data, aes(x = .data[[column_name]])) +
    geom_histogram(
      binwidth = binwidth,
      fill = "gray90",
      color = "black",
      alpha = 0.7,
      boundary = 0  # bin edges are aligned to 0
    ) +
    labs(title = title, x = x_label, y = "Frequency") +
    theme_minimal() +
    theme(
      plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
      axis.title.x = element_text(size = 16, margin = margin(t = 10)),
      axis.title.y = element_text(size = 16, margin = margin(r = 10)),
      axis.text = element_text(size = 16, margin = margin(t = 10), face = "bold"),
      axis.text.y.left = element_text(size = 16, margin = margin(r = 10)),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank()
    )
}

Figure 5.2 Histogram of Individual Sentiments

# One histogram per sentiment column, both built with the shared helper.
pos_sent <- plot_sentiment_histogram(
  data = df,
  sentiment_column = "pos_prob",
  title = "Positive"
)
neg_sent <- plot_sentiment_histogram(
  data = df,
  sentiment_column = "neg_prob",
  title = "Negative"
)

# Place the two panels next to each other (gridExtra).
grid.arrange(pos_sent, neg_sent, ncol = 2)

Figure 5.3 Histogram of Composite Score

# Histogram of compound_score with 0.2-wide bins aligned to a boundary at -1.
figure_5_3 <- ggplot(data = df, mapping = aes(x = compound_score)) +
  geom_histogram(
    binwidth = 0.2,
    boundary = -1,
    fill = "gray70",
    color = "black",
    alpha = 0.7
  ) +
  labs(x = "Sentiment Score", y = "Frequency") +
  theme_minimal() +
  # One theme() call: 12pt text, padded axis titles, bold tick labels,
  # and no grid lines.
  theme(
    plot.title = element_text(hjust = 0.5),
    text = element_text(size = 12),
    axis.title.x = element_text(size = 12, margin = margin(t = 10)),
    axis.title.y = element_text(size = 12, margin = margin(r = 10)),
    axis.text = element_text(size = 12, face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

figure_5_3

Figure 5.4 Correlation Matrix (Video Level)

# Keep only the columns that enter the correlation matrix.
video_df <- df[, c(
  "neg_prob", "pos_prob", "compound_score", "log_viewCount", "durationMin"
)]

# Correlations between the selected columns.
corr_matrix <- cor(video_df)

# Display labels, listed in the same order as the selected columns.
custom_labels1 <- c(
  "Negative Prob.", "Positive Prob.", "Composite Score", "View Count (Log)",
  "Duration (Min)"
)
dimnames(corr_matrix) <- list(custom_labels1, custom_labels1)

# Show the coefficients as numbers on a gray palette with slanted black labels.
corrplot(
  corr_matrix,
  method = "number",
  col = gray.colors(50),
  tl.col = "black",
  tl.srt = 45,
  tl.cex = 0.8
)

Figure 6.1 Coefficient Plot (Model 8)

# Mixed model of log_viewCount with a random intercept and a random neg_prob
# slope for each channelId.
negative_model3 <- lmer(
  log_viewCount ~ neg_prob + abs_nom + durationMin +
    seniority + gender + log_subscriber +
    (1 + neg_prob | channelId),
  data = df
)

# NOTE(review): these labels are applied to the x axis by position, not by
# name. They appear to follow the alphabetical order of the coefficient
# names -- confirm against the rendered figure.
label_vector <- c(
  "(Intercept)", "Ideology", "Duration", "Gender (M)",
  "Subscribers (Log)", "Negativity", "Seniority"
)

# Fixed-effect table (estimates, standard errors, ...) as a data frame.
coefficients <- as.data.frame(coef(summary(negative_model3)))

# Point estimates with bars spanning one standard error on either side.
figure_6_1 <- ggplot(
  data = coefficients,
  mapping = aes(x = rownames(coefficients), y = Estimate)
) +
  geom_point() +
  geom_errorbar(
    aes(
      ymin = Estimate - `Std. Error`,
      ymax = Estimate + `Std. Error`
    )
  ) +
  scale_x_discrete(labels = label_vector) +
  labs(x = "Coefficients", y = "Estimate") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),
    text = element_text(size = 12),
    axis.title.x = element_text(size = 12, margin = margin(t = 10)),
    axis.title.y = element_text(size = 12, margin = margin(r = 10)),
    axis.text = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.minor = element_blank()
  )
print(figure_6_1) # Display plot

Figure 6.2 Histograms of Random Components

# Two panels side by side, with adjusted margins.
par(mar = c(5, 4, 1, 2) + 0.1, mfrow = c(1, 2))

# Fit the model (Model 4) whose random effects are plotted.
negative_model3 <- lmer(
  log_viewCount ~ neg_prob + abs_nom + durationMin +
    seniority + gender + log_subscriber +
    (1 + neg_prob | channelId),
  data = df
)

# Random effects of the model, grouped by channelId.
random_components <- ranef(negative_model3)
channel_effects <- random_components[["channelId"]]

# Both histograms share the same bins: width 0.5 from -2 to 3.
effect_breaks <- seq(-2, 3, by = 0.5)

# Left panel: random intercepts.
hist(
  channel_effects[["(Intercept)"]],
  breaks = effect_breaks,
  main = "Distribution of Random Intercepts",
  xlab = "Intercept Random Effect"
)

# Right panel: random slopes on neg_prob.
hist(
  channel_effects[["neg_prob"]],
  breaks = effect_breaks,
  main = "Distribution of Random Slopes",
  xlab = "Negative Sentiment Random Effect"
)

Figure 6.3 Random Components for Each Senator

# Fit the mixed model and extract the random effects grouped by channelId.
negative_model3 <- lmer(log_viewCount ~ neg_prob + abs_nom + durationMin +
                          seniority  + gender  + log_subscriber +
                          (1 + neg_prob| channelId ), data = df)
random_components <- ranef(negative_model3)

par(mar = c(5, 4, 1, 2) + 0.1)  # Adjust the margins

par(mfrow = c(1, 1)) # Single panel (one scatter plot)

# Scatter plot of the random intercepts (x) against the random neg_prob
# slopes (y).
#
# The grid is supplied through `panel.first`, which plot.default() evaluates
# after the axes are set up but before the points are drawn. Passing `grid()`
# as a bare unnamed argument instead matches it positionally to the `type`
# parameter of plot.default(), which is not a valid plot type.
plot(random_components$channelId[, "(Intercept)"],
     random_components$channelId[, "neg_prob"],
     xlab = "Intercept",
     ylab = "Negative Slope",
     xlim = range(random_components$channelId[, "(Intercept)"]),
     ylim = range(random_components$channelId[, "neg_prob"]),
     pch = 16,
     col = "black",
     cex = 1.2,
     panel.first = grid()
)

Figure A.1 Histogram of Video Duration (Minutes)

# Single panel with adjusted margins.
par(mfrow = c(1, 1), mar = c(5, 4, 1, 2) + 0.1)

# Histogram of durationMin using 30 equal-width bins between 0 and 60.
duration_breaks <- seq(0, 60, by = 2)
hist(
  df$durationMin,
  breaks = duration_breaks,
  main = "",
  xlab = "Duration (in Minutes)"
)