Regression Analysis
# Load packages
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  conflicted,    # Avoid conflict of functions with same names
  tidyverse,     # Tidyverse umbrella package
  readr,         # read_csv()
  arrow,         # Load parquet files
  car,           # vif()
  betareg,       # betareg()
  sjPlot,        # tab_xtab()
  kableExtra,    # kbl()
  broom,         # tidy()
  DT             # datatable()
)

conflict_prefer("select", "dplyr")
conflict_prefer("filter", "dplyr")

1 Data Prep

# Set the base directory (When you replicate this, change this to your working directory)
base_dir <- "/Users/dowonkim/Desktop/domain_audience_diversity/data"

# Define subdirectories for different data categories
sub_dirs <- c("age", "gender", "meta_stats", "party", "race")

# Generate full paths for each subdirectory
dirs <- file.path(base_dir, sub_dirs)

# Function to read all Parquet files in a directory and store them in a named list
read_all_parquet <- function(dir) {
  files <- list.files(dir, pattern = "\\.parquet$", full.names = TRUE)
  dfs <- lapply(files, read_parquet)
  names(dfs) <- tools::file_path_sans_ext(basename(files))  # Assign file names as list names
  return(dfs)
}

# Read all Parquet files from the defined directories
df_list <- lapply(dirs, read_all_parquet)
df_list <- do.call(c, df_list)  # Flatten the nested list

# Remove raw_counts
df_list <- df_list[!names(df_list) %in% c("gender_raw_count_shares", "gender_raw_count_users",
                                          "race_raw_count_shares", "race_raw_count_users")]

# Merge all data frames based on the 'domain' key
merged_df <- Reduce(function(x, y) merge(x, y, by = "domain", all = TRUE), df_list)

# Now bring the NewsGuard scores
newsguard_path <- file.path(base_dir, "reliability", "newsguard_2022_01.csv")
newsguard_scores <- read_csv(newsguard_path)

# Merge NewsGuard data with the main dataset
final_df <- merge(merged_df, newsguard_scores, by = "domain", all.x = TRUE)

# Check the final dataset
final_df |> datatable(filter = 'top')

2 Regression Analysis

First, check the distribution of our outcome (newsguard_score).

# Check the distribution of our outcome (newsguard_score)
hist(final_df$newsguard_score)

final_df |> mutate(
  newsguard_score_perc = newsguard_score/100
) -> final_df

Why use Beta Regression?

  • Y (newsguard_score_perc) is a proportion bounded between 0 and 1 (or 0 and 100 for original scle).
  • Linear regression (OLS) assumes normally distributed residuals, which is violated in this case (skewed distribution).
  • Beta regression models data that follows a Beta distribution, allowing for flexible variance (heteroskedasticity).

2.1 Model 1. Shares

model1 <- betareg(newsguard_score_perc ~
               age_deviation_score_shares +  average_yob_shares +
                 # gender_leaning_score_shares +
                 average_gender_score_shares + variance_gender_score_shares +
                 # political_leaning_score_shares +
                 average_party_score_shares + variance_party_score_shares +
                 race_deviation_score_shares +
                 shares + users,
               data = final_df)
vif(model1)
##   age_deviation_score_shares           average_yob_shares 
##                    0.6664758                   -0.1433808 
##  average_gender_score_shares variance_gender_score_shares 
##                    1.1292099                    1.3267294 
##   average_party_score_shares  variance_party_score_shares 
##                    0.6806386                    1.3892623 
##  race_deviation_score_shares                       shares 
##                    1.1397561                    1.1442298 
##                        users 
##                    1.0826498
  • VIFs are looking fine here (VIF < 5)
  • If we include gender_leaning_score_shares or political_leaning_score_shares in the model, VIF gets too high so I removed them.
summary(model1)
## 
## Call:
## betareg(formula = newsguard_score_perc ~ age_deviation_score_shares + 
##     average_yob_shares + average_gender_score_shares + variance_gender_score_shares + 
##     average_party_score_shares + variance_party_score_shares + race_deviation_score_shares + 
##     shares + users, data = final_df)
## 
## Randomized quantile residuals:
##     Min      1Q  Median      3Q     Max 
## -4.2265 -2.1817 -1.6565 -0.3150  5.0063 
## 
## Coefficients (mu model with logit link):
##                                Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)                  -1.755e+02  2.836e-01 -618.832  < 2e-16 ***
## age_deviation_score_shares   -2.390e+00  1.117e-01  -21.392  < 2e-16 ***
## average_yob_shares            9.054e-02  4.321e-05 2095.144  < 2e-16 ***
## average_gender_score_shares   2.102e+00  2.325e-01    9.039  < 2e-16 ***
## variance_gender_score_shares  6.507e+00  8.724e-01    7.459 8.69e-14 ***
## average_party_score_shares   -4.602e+00  1.929e-01  -23.855  < 2e-16 ***
## variance_party_score_shares   3.578e+00  1.499e+00    2.387    0.017 *  
## race_deviation_score_shares  -1.028e-01  2.005e-01   -0.513    0.608    
## shares                        1.096e-07  8.164e-06    0.013    0.989    
## users                         5.138e-06  1.112e-05    0.462    0.644    
## 
## Phi coefficients (phi model with identity link):
##       Estimate Std. Error z value Pr(>|z|)    
## (phi)   4.5764     0.1073   42.66   <2e-16 ***
## 
## Exceedence parameter (extended-support xbetax model):
##         Estimate Std. Error z value Pr(>|z|)    
## Log(nu) -1.28587    0.04756  -27.04   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Exceedence parameter nu: 0.2764
## Type of estimator: ML (maximum likelihood)
## Log-likelihood: -5631 on 12 Df
## Number of iterations in BFGS optimization: 1

✔ Higher age diversity (age_deviation_score_shares) is associated with lower NewsGuard scores

✔ Older audiences (average_yob_shares) are associated with higher NewsGuard scores

✔ More male-dominated sharing (average_gender_score_shares) is associated with higher NewsGuard scores

✔ Greater gender diversity (variance_gender_score_shares) is associated with higher NewsGuard scores

✔ Politically skewed audiences (average_party_score_shares) are associated with lower NewsGuard scores

✔ More political diversity (variance_party_score_shares) is associated with higher NewsGuard scores

✔ Racial diversity (race_deviation_score_shares), the total number of shares (shares) and unique users (users) do not significantly impact NewsGuard scores (p > 0.05).

Just to check, I ran OLS as well:

# Just to check: lm()
model1_lm <- lm(newsguard_score_perc ~
               age_deviation_score_shares +  average_yob_shares +
                 # gender_leaning_score_shares +
                 average_gender_score_shares + variance_gender_score_shares +
                 # political_leaning_score_shares +
                 average_party_score_shares + variance_party_score_shares +
                 race_deviation_score_shares +
                 shares + users,
               data = final_df)
vif(model1_lm)
##   age_deviation_score_shares           average_yob_shares 
##                     2.158451                     1.731195 
##  average_gender_score_shares variance_gender_score_shares 
##                     1.091407                     1.355816 
##   average_party_score_shares  variance_party_score_shares 
##                     1.508645                     1.359835 
##  race_deviation_score_shares                       shares 
##                     1.115171                     3.124619 
##                        users 
##                     3.217145
summary(model1_lm)
## 
## Call:
## lm(formula = newsguard_score_perc ~ age_deviation_score_shares + 
##     average_yob_shares + average_gender_score_shares + variance_gender_score_shares + 
##     average_party_score_shares + variance_party_score_shares + 
##     race_deviation_score_shares + shares + users, data = final_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.95327 -0.08057  0.04030  0.11897  1.19685 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -2.117e+01  1.275e+00 -16.610  < 2e-16 ***
## age_deviation_score_shares   -2.125e-01  1.563e-02 -13.592  < 2e-16 ***
## average_yob_shares            1.118e-02  6.423e-04  17.403  < 2e-16 ***
## average_gender_score_shares   2.364e-01  2.063e-02  11.461  < 2e-16 ***
## variance_gender_score_shares -2.904e-03  8.984e-02  -0.032    0.974    
## average_party_score_shares   -3.985e-01  2.223e-02 -17.923  < 2e-16 ***
## variance_party_score_shares   4.898e-01  1.130e-01   4.336 1.49e-05 ***
## race_deviation_score_shares   1.466e-02  1.475e-02   0.994    0.320    
## shares                        1.396e-08  1.720e-08   0.811    0.417    
## users                         7.225e-08  2.118e-07   0.341    0.733    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1971 on 4013 degrees of freedom
## Multiple R-squared:  0.3959, Adjusted R-squared:  0.3945 
## F-statistic: 292.2 on 9 and 4013 DF,  p-value: < 2.2e-16
  • Same result except for the gender diversity (variance_gender_score_shares): OLS X vs. Beta reg O
  • OLS shrinks extreme effects because it assumes a constant variance. If gender diversity (variance_gender_score_shares) has a non-linear effect or interacts with other variables, OLS may not capture its full impact. vs. Beta Regression accounts for heteroskedasticity and handles non-linear relationships better.

2.2 Model 2. Users

model2 <- betareg(newsguard_score_perc ~
               age_deviation_score_users +  average_yob_users +
                 average_gender_score_users + variance_gender_score_users +
                 average_party_score_users + variance_party_score_users +
                 race_deviation_score_users +
                 shares + users,
               data = final_df)
vif(model2)
##   age_deviation_score_users           average_yob_users 
##                   1.4432276                  -0.9046236 
##  average_gender_score_users variance_gender_score_users 
##                   1.5235995                   0.8788422 
##   average_party_score_users  variance_party_score_users 
##                   0.9174253                   1.1935640 
##  race_deviation_score_users                      shares 
##                   1.2459435                   1.1459011 
##                       users 
##                   1.0545576
  • VIFs looking good
summary(model2)
## 
## Call:
## betareg(formula = newsguard_score_perc ~ age_deviation_score_users + 
##     average_yob_users + average_gender_score_users + variance_gender_score_users + 
##     average_party_score_users + variance_party_score_users + race_deviation_score_users + 
##     shares + users, data = final_df)
## 
## Randomized quantile residuals:
##     Min      1Q  Median      3Q     Max 
## -4.2946 -2.2297 -1.6704 -0.1789  4.3902 
## 
## Coefficients (mu model with logit link):
##                               Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)                  1.077e+02  4.760e-01  226.245  < 2e-16 ***
## age_deviation_score_users   -3.397e+00  1.320e-01  -25.744  < 2e-16 ***
## average_yob_users           -5.553e-02  5.622e-05 -987.768  < 2e-16 ***
## average_gender_score_users   2.753e+00  2.991e-01    9.203  < 2e-16 ***
## variance_gender_score_users  2.661e+01  9.422e-01   28.240  < 2e-16 ***
## average_party_score_users   -8.224e+00  2.771e-01  -29.675  < 2e-16 ***
## variance_party_score_users   1.814e+01  2.374e+00    7.640 2.17e-14 ***
## race_deviation_score_users  -3.259e+00  3.865e-01   -8.433  < 2e-16 ***
## shares                       9.433e-09  8.364e-06    0.001    0.999    
## users                        2.217e-06  1.128e-05    0.197    0.844    
## 
## Phi coefficients (phi model with identity link):
##       Estimate Std. Error z value Pr(>|z|)    
## (phi)   5.1143     0.1231   41.54   <2e-16 ***
## 
## Exceedence parameter (extended-support xbetax model):
##         Estimate Std. Error z value Pr(>|z|)    
## Log(nu)  -1.2859     0.0487   -26.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Exceedence parameter nu: 0.2764
## Type of estimator: ML (maximum likelihood)
## Log-likelihood: -5732 on 12 Df
## Number of iterations in BFGS optimization: 1

✔ Higher age diversity (age_deviation_score_shares) is associated with lower NewsGuard scores

✔ Younger users (higher average_yob_users) are associated with lower NewsGuard scores, opposite to the shares-based model!

✔ More male-dominated sharing (average_gender_score_shares) is associated with higher NewsGuard scores

✔ Greater gender diversity (variance_gender_score_shares) is associated with higher NewsGuard scores

✔ Politically skewed audiences (average_party_score_shares) are associated with lower NewsGuard scores

✔ More political diversity (variance_party_score_shares) is associated with higher NewsGuard scores

✔ More racially diverse user bases (race_deviation_score_users) correlate with lower NewsGuard scores. (This was not significant in shares-based models!)

Just to check, OLS:

# Just to check: lm()
model2_lm <- lm(newsguard_score_perc ~
                  age_deviation_score_users +
                    average_yob_users +
                    average_gender_score_users + variance_gender_score_users +
                    average_party_score_users + variance_party_score_users +
                    race_deviation_score_users +
                    shares + users,
               data = final_df)
vif(model2_lm)
##   age_deviation_score_users           average_yob_users 
##                    6.118724                    5.606263 
##  average_gender_score_users variance_gender_score_users 
##                    1.240989                    1.166220 
##   average_party_score_users  variance_party_score_users 
##                    1.941844                    1.559644 
##  race_deviation_score_users                      shares 
##                    1.081159                    3.184314 
##                       users 
##                    3.466397
  • High VIF between age_deviation_score_users and average_yob_users
  • Let’s run the OLS regression in two versions: without age_deviation_score_users, and without average_yob_users
lm(newsguard_score_perc ~
     # age_deviation_score_users +
       average_yob_users +
       average_gender_score_users + variance_gender_score_users +
       average_party_score_users + variance_party_score_users +
       race_deviation_score_users +
       shares + users,
   data = final_df) |> summary()
## 
## Call:
## lm(formula = newsguard_score_perc ~ average_yob_users + average_gender_score_users + 
##     variance_gender_score_users + average_party_score_users + 
##     variance_party_score_users + race_deviation_score_users + 
##     shares + users, data = final_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.12865 -0.07086  0.03692  0.11132  0.69719 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -3.158e+01  1.498e+00 -21.085  < 2e-16 ***
## average_yob_users            1.614e-02  7.585e-04  21.278  < 2e-16 ***
## average_gender_score_users   3.160e-01  3.090e-02  10.226  < 2e-16 ***
## variance_gender_score_users  1.849e+00  1.296e-01  14.267  < 2e-16 ***
## average_party_score_users   -9.123e-01  2.648e-02 -34.450  < 2e-16 ***
## variance_party_score_users   2.907e+00  1.413e-01  20.570  < 2e-16 ***
## race_deviation_score_users  -1.071e-01  3.556e-02  -3.010  0.00263 ** 
## shares                       1.813e-08  1.624e-08   1.116  0.26446    
## users                       -5.552e-07  2.055e-07  -2.702  0.00691 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1845 on 4014 degrees of freedom
## Multiple R-squared:  0.4708, Adjusted R-squared:  0.4697 
## F-statistic: 446.3 on 8 and 4014 DF,  p-value: < 2.2e-16
lm(newsguard_score_perc ~
     age_deviation_score_users +
       # average_yob_users +
       average_gender_score_users + variance_gender_score_users +
       average_party_score_users + variance_party_score_users +
       race_deviation_score_users +
       shares + users,
   data = final_df) |> summary()
## 
## Call:
## lm(formula = newsguard_score_perc ~ age_deviation_score_users + 
##     average_gender_score_users + variance_gender_score_users + 
##     average_party_score_users + variance_party_score_users + 
##     race_deviation_score_users + shares + users, data = final_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.93816 -0.06539  0.03577  0.10275  0.80962 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  5.117e-01  3.693e-02  13.857  < 2e-16 ***
## age_deviation_score_users   -3.080e-01  1.158e-02 -26.604  < 2e-16 ***
## average_gender_score_users   3.059e-01  2.939e-02  10.410  < 2e-16 ***
## variance_gender_score_users  1.375e+00  1.222e-01  11.248  < 2e-16 ***
## average_party_score_users   -7.196e-01  2.853e-02 -25.222  < 2e-16 ***
## variance_party_score_users   1.956e+00  1.472e-01  13.284  < 2e-16 ***
## race_deviation_score_users  -9.524e-02  3.455e-02  -2.757  0.00587 ** 
## shares                       1.075e-08  1.571e-08   0.684  0.49387    
## users                       -4.120e-07  1.963e-07  -2.098  0.03593 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1794 on 4014 degrees of freedom
## Multiple R-squared:  0.4994, Adjusted R-squared:  0.4984 
## F-statistic: 500.5 on 8 and 4014 DF,  p-value: < 2.2e-16
  • Basically the same results!