Regression Analysis
Do Won Kim
2025-02-12
# Load packages
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
conflicted, # Avoid conflict of functions with same names
tidyverse, # Tidyverse umbrella package
readr, # read_csv()
arrow, # Load parquet files
car, # vif()
betareg, # betareg()
sjPlot, # tab_xtab()
kableExtra, # kbl()
broom, # tidy()
DT # datatable()
)
conflict_prefer("select", "dplyr")
conflict_prefer("filter", "dplyr")
1 Data Prep
# Set the base directory (When you replicate this, change this to your working directory)
base_dir <- "/Users/dowonkim/Desktop/domain_audience_diversity/data"
# Define subdirectories for different data categories
sub_dirs <- c("age", "gender", "meta_stats", "party", "race")
# Generate full paths for each subdirectory
dirs <- file.path(base_dir, sub_dirs)
# Function to read all Parquet files in a directory and store them in a named list
read_all_parquet <- function(dir) {
files <- list.files(dir, pattern = "\\.parquet$", full.names = TRUE)
dfs <- lapply(files, read_parquet)
names(dfs) <- tools::file_path_sans_ext(basename(files)) # Assign file names as list names
return(dfs)
}
# Read all Parquet files from the defined directories
df_list <- lapply(dirs, read_all_parquet)
df_list <- do.call(c, df_list) # Flatten the nested list
# Remove raw_counts
df_list <- df_list[!names(df_list) %in% c("gender_raw_count_shares", "gender_raw_count_users",
"race_raw_count_shares", "race_raw_count_users")]
# Merge all data frames based on the 'domain' key
merged_df <- Reduce(function(x, y) merge(x, y, by = "domain", all = TRUE), df_list)
# Now bring the NewsGuard scores
newsguard_path <- file.path(base_dir, "reliability", "newsguard_2022_01.csv")
newsguard_scores <- read_csv(newsguard_path)
# Merge NewsGuard data with the main dataset
final_df <- merge(merged_df, newsguard_scores, by = "domain", all.x = TRUE)
# Check the final dataset
final_df |> datatable(filter = 'top')
2 Regression Analysis
First, check the distribution of our outcome (newsguard_score).
Why use Beta Regression?
- Y (newsguard_score_perc) is a proportion bounded between 0 and 1 (or 0 and 100 for original scle).
- Linear regression (OLS) assumes normally distributed residuals, which is violated in this case (skewed distribution).
- Beta regression models data that follows a Beta distribution, allowing for flexible variance (heteroskedasticity).
2.2 Model 2. Users
model2 <- betareg(newsguard_score_perc ~
age_deviation_score_users + average_yob_users +
average_gender_score_users + variance_gender_score_users +
average_party_score_users + variance_party_score_users +
race_deviation_score_users +
shares + users,
data = final_df)
vif(model2)
## age_deviation_score_users average_yob_users
## 1.4432276 -0.9046236
## average_gender_score_users variance_gender_score_users
## 1.5235995 0.8788422
## average_party_score_users variance_party_score_users
## 0.9174253 1.1935640
## race_deviation_score_users shares
## 1.2459435 1.1459011
## users
## 1.0545576
- VIFs looking good
##
## Call:
## betareg(formula = newsguard_score_perc ~ age_deviation_score_users +
## average_yob_users + average_gender_score_users + variance_gender_score_users +
## average_party_score_users + variance_party_score_users + race_deviation_score_users +
## shares + users, data = final_df)
##
## Randomized quantile residuals:
## Min 1Q Median 3Q Max
## -4.2946 -2.2297 -1.6704 -0.1789 4.3902
##
## Coefficients (mu model with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.077e+02 4.760e-01 226.245 < 2e-16 ***
## age_deviation_score_users -3.397e+00 1.320e-01 -25.744 < 2e-16 ***
## average_yob_users -5.553e-02 5.622e-05 -987.768 < 2e-16 ***
## average_gender_score_users 2.753e+00 2.991e-01 9.203 < 2e-16 ***
## variance_gender_score_users 2.661e+01 9.422e-01 28.240 < 2e-16 ***
## average_party_score_users -8.224e+00 2.771e-01 -29.675 < 2e-16 ***
## variance_party_score_users 1.814e+01 2.374e+00 7.640 2.17e-14 ***
## race_deviation_score_users -3.259e+00 3.865e-01 -8.433 < 2e-16 ***
## shares 9.433e-09 8.364e-06 0.001 0.999
## users 2.217e-06 1.128e-05 0.197 0.844
##
## Phi coefficients (phi model with identity link):
## Estimate Std. Error z value Pr(>|z|)
## (phi) 5.1143 0.1231 41.54 <2e-16 ***
##
## Exceedence parameter (extended-support xbetax model):
## Estimate Std. Error z value Pr(>|z|)
## Log(nu) -1.2859 0.0487 -26.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Exceedence parameter nu: 0.2764
## Type of estimator: ML (maximum likelihood)
## Log-likelihood: -5732 on 12 Df
## Number of iterations in BFGS optimization: 1
✔ Higher age diversity (age_deviation_score_shares) is associated with lower NewsGuard scores
✔ Younger users (higher average_yob_users) are associated with lower NewsGuard scores, opposite to the shares-based model!
✔ More male-dominated sharing (average_gender_score_shares) is associated with higher NewsGuard scores
✔ Greater gender diversity (variance_gender_score_shares) is associated with higher NewsGuard scores
✔ Politically skewed audiences (average_party_score_shares) are associated with lower NewsGuard scores
✔ More political diversity (variance_party_score_shares) is associated with higher NewsGuard scores
✔ More racially diverse user bases (race_deviation_score_users) correlate with lower NewsGuard scores. (This was not significant in shares-based models!)
Just to check, OLS:
# Just to check: lm()
model2_lm <- lm(newsguard_score_perc ~
age_deviation_score_users +
average_yob_users +
average_gender_score_users + variance_gender_score_users +
average_party_score_users + variance_party_score_users +
race_deviation_score_users +
shares + users,
data = final_df)
vif(model2_lm)
## age_deviation_score_users average_yob_users
## 6.118724 5.606263
## average_gender_score_users variance_gender_score_users
## 1.240989 1.166220
## average_party_score_users variance_party_score_users
## 1.941844 1.559644
## race_deviation_score_users shares
## 1.081159 3.184314
## users
## 3.466397
- High VIF between
age_deviation_score_users
andaverage_yob_users
- Let’s run the OLS regression in two versions: without
age_deviation_score_users
, and withoutaverage_yob_users
lm(newsguard_score_perc ~
# age_deviation_score_users +
average_yob_users +
average_gender_score_users + variance_gender_score_users +
average_party_score_users + variance_party_score_users +
race_deviation_score_users +
shares + users,
data = final_df) |> summary()
##
## Call:
## lm(formula = newsguard_score_perc ~ average_yob_users + average_gender_score_users +
## variance_gender_score_users + average_party_score_users +
## variance_party_score_users + race_deviation_score_users +
## shares + users, data = final_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.12865 -0.07086 0.03692 0.11132 0.69719
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.158e+01 1.498e+00 -21.085 < 2e-16 ***
## average_yob_users 1.614e-02 7.585e-04 21.278 < 2e-16 ***
## average_gender_score_users 3.160e-01 3.090e-02 10.226 < 2e-16 ***
## variance_gender_score_users 1.849e+00 1.296e-01 14.267 < 2e-16 ***
## average_party_score_users -9.123e-01 2.648e-02 -34.450 < 2e-16 ***
## variance_party_score_users 2.907e+00 1.413e-01 20.570 < 2e-16 ***
## race_deviation_score_users -1.071e-01 3.556e-02 -3.010 0.00263 **
## shares 1.813e-08 1.624e-08 1.116 0.26446
## users -5.552e-07 2.055e-07 -2.702 0.00691 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1845 on 4014 degrees of freedom
## Multiple R-squared: 0.4708, Adjusted R-squared: 0.4697
## F-statistic: 446.3 on 8 and 4014 DF, p-value: < 2.2e-16
lm(newsguard_score_perc ~
age_deviation_score_users +
# average_yob_users +
average_gender_score_users + variance_gender_score_users +
average_party_score_users + variance_party_score_users +
race_deviation_score_users +
shares + users,
data = final_df) |> summary()
##
## Call:
## lm(formula = newsguard_score_perc ~ age_deviation_score_users +
## average_gender_score_users + variance_gender_score_users +
## average_party_score_users + variance_party_score_users +
## race_deviation_score_users + shares + users, data = final_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.93816 -0.06539 0.03577 0.10275 0.80962
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.117e-01 3.693e-02 13.857 < 2e-16 ***
## age_deviation_score_users -3.080e-01 1.158e-02 -26.604 < 2e-16 ***
## average_gender_score_users 3.059e-01 2.939e-02 10.410 < 2e-16 ***
## variance_gender_score_users 1.375e+00 1.222e-01 11.248 < 2e-16 ***
## average_party_score_users -7.196e-01 2.853e-02 -25.222 < 2e-16 ***
## variance_party_score_users 1.956e+00 1.472e-01 13.284 < 2e-16 ***
## race_deviation_score_users -9.524e-02 3.455e-02 -2.757 0.00587 **
## shares 1.075e-08 1.571e-08 0.684 0.49387
## users -4.120e-07 1.963e-07 -2.098 0.03593 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1794 on 4014 degrees of freedom
## Multiple R-squared: 0.4994, Adjusted R-squared: 0.4984
## F-statistic: 500.5 on 8 and 4014 DF, p-value: < 2.2e-16
- Basically the same results!