Attrition Analysis
# Load packages ================================================================
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  conflicted,    # Avoid conflict of functions with same names
  tidyverse,     # Tidyverse umbrella package
  haven,         # read_dta()
  infer,         # chisq_test() 
  sjPlot,        # tab_xtab()
  kableExtra,    # kbl()
  DT             # datatable() 
)  

conflict_prefer("select", "dplyr")
conflict_prefer("filter", "dplyr")


# Import data ==================================================================
# Load survey data
df_w3 <- readRDS("survey_data/2023-095c_client_wave-3.rds")    # wave 3 
df_w2 <- read_dta("survey_data/2023-095b_client_wave-2.dta")   # wave 2
df_w1_1 <- readRDS("survey_data/2023-095a_client_wave-1.rds")  
df_w1_2 <- readRDS("survey_data/2023-095a_client_wave-1_supp.rds") 
df_w1 <- rowid_to_column(rbind(df_w1_1, df_w1_2), "ID") # merge and add ID

# Users who were eligible for eligibility test: ================================
users_for_elig_test_final <- read_csv("backend_data/users_for_elig_test_final.csv", 
                                      col_types = cols(user_id = col_character()))

# Wave 1 data: filter & select =================================================
wave1 <- df_w1 |>  
  filter(Progress==100) |> # who finished wave 1 till the end
  select(ID, userid, vs_age:vs_recruitment) |> 
  rename(user_id = userid) |> # rename userid for consistency
  merge(users_for_elig_test_final, by=c("user_id"), all.y=TRUE) |>
  select(ID, user_id, screen_name, vsid, vs_recruitment,
         vs_pid_base, vs_pid_lean, vs_raceeth, vs_age, vs_education, vs_gender)
  
# Eligibility criteria: follow OR exposure =====================================
eligibility_results <- read_csv("backend_data/eligibility_results.csv", 
                                col_types = cols(user_id = col_character(), 
                                                 account_created = col_skip(), 
                                                 following_us = col_skip()))

wave1 |> 
  merge(eligibility_results, by="user_id") |>
  relocate(user_id, following_NG, hometimeline) |>
  rename(follow = following_NG,
         exposure = hometimeline) -> wave1_merge

# Pre-treatment engagement =====================================================
pre_engagement_final <- read_csv("backend_data/pre_engagement_final.csv", 
                                 col_types = cols(user_id = col_character(), 
                                                  target_user_id = col_skip(), 
                                                  type = col_skip()))
pre_likes_final <- read_csv("backend_data/pre_likes_final.csv", 
                            col_types = cols(user_id = col_character(), 
                                             target_user_id = col_skip(), 
                                             match_type = col_skip()))
pre_engagement <- rbind(pre_engagement_final, pre_likes_final) |> 
  unique() |> 
  mutate(pre_total_engage = 1) 

wave1_merge |> 
  merge(pre_engagement, by="user_id", all = TRUE) |> 
  mutate_at("pre_total_engage", ~replace(., is.na(.), 0)) -> wave1_merge


# Wave 2 =======================================================================
df_w2 |> 
  filter(userid!="") |> 
  select(userid, randomized_group) |> 
  mutate(Y2 = 0) |> # a new variable indicating took wave 2 (attrition = 1)
  rename(user_id = userid) -> user_w2

# Merge ========================================================================
wave2_merge <- wave1_merge |>
  merge(user_w2, by="user_id", all = TRUE) |>
  mutate_at("Y2", ~replace(., is.na(.), 1)) # attrition to wave 2 = 1 

# Wave 3 =======================================================================
df_w3 |> 
  select(userid) |> 
  mutate(Y3 = 0) |> 
  rename(user_id = userid) -> wave3

wave3_merge <- wave2_merge |>
  merge(wave3, by="user_id", all = TRUE) |>
  mutate_at("Y3", ~replace(., is.na(.), 1)) # attrition to wave 3 = 1

# Data preprocess ==============================================================
wave3_merge |>
  relocate(user_id, follow, exposure, vs_recruitment, pre_total_engage, 
           randomized_group, vs_pid_base:vs_pid_lean, vs_raceeth, vs_age,
           vs_education, vs_gender, Y2, Y3) |>
  mutate(
    party_id = as.factor(case_when(
      vs_pid_base == "Democrat" | vs_pid_lean == "The Democratic Party" ~ "Democrat",
      vs_pid_base == "Republican" | vs_pid_lean == "The Republican Party" ~ "Republican",
      vs_pid_lean == "Neither" ~ "Neither")),
    
    age_group = as.factor(case_when(
      vs_age < 35 ~ "18-34",
      vs_age >= 35 & vs_age < 45 ~ "35-44",
      vs_age >= 45 & vs_age < 55 ~ "45-54", 
      vs_age >= 55 & vs_age < 65 ~ "55-64",
      vs_age >= 65 & vs_age < 75 ~ "65-74", 
      vs_age >= 75 & vs_age < 85 ~ "75-84", 
      vs_age >= 85 ~ "85+")),
    
    college_grad = as.factor(case_when(
      vs_education == "2-year or associate degree" ~ 1, 
      vs_education == "4-year or bachelor degree" ~ 1, 
      vs_education == "Post-graduate degree" ~ 1, 
      .default = 0)),
    
    white = as.factor(case_when(vs_raceeth == "White" ~ 1, .default = 0)),
    
    male = as.factor(case_when(vs_gender == "Male" ~ 1, .default = 0)),
    
    Y2 = as.factor(Y2),
    Y3 = as.factor(Y3),
    
    pre_total_engage = as.factor(pre_total_engage),
    
    randomized = as.factor(case_when(
      randomized_group == "control" ~ "Control",
      randomized_group == "drop" | randomized_group == "muting_treatment2_drop" ~ "Drop",
      randomized_group == "media_literacy" ~ "Media Literacy",
      randomized_group == "muting_treatment1" | randomized_group == "muting_treatment2" ~ "Muting",
      is.na(randomized_group) ~ NA)),
    
    eligibility_criteria = as.factor(case_when(
      follow == TRUE & exposure == TRUE ~ "Both",
      follow == TRUE & exposure == FALSE ~ "Follow",
      follow == FALSE & exposure == TRUE ~ "Exposure"))
    ) |>
  rename(recruit_mode = vs_recruitment) |> 
  select(user_id, follow, exposure, eligibility_criteria, recruit_mode, 
         pre_total_engage, randomized, party_id, 
         white, age_group, college_grad, male,
         Y2, Y3) -> attrit_dta

1 Data description

  • Y2 (1 if attrit in Wave 2 vs. 1)

  • Y3 (1 if attrit in Wave 3 vs. 1)

  • Y23 (1 if attrit in Wave 3 vs. 2)

DT::datatable(attrit_dta)

2 Attrition Analysis

2.1 Attrition by recruitment mode

2.1.1 Twitter vs. New Panelist vs. Existing Panelist

  • Wave 1 vs. 2 (1: Attrit in Wave 2 vs. 1)
attrit_dta |>
  select(recruit_mode, Y2) |> # wave 1 vs. wave 2 (1: attrit in Wave 2 vs. 1)
  infer::chisq_test(Y2~recruit_mode) # Pearson's Chi-squared test
## # A tibble: 1 × 3
##   statistic chisq_df p_value
##       <dbl>    <int>   <dbl>
## 1      5.95        2  0.0510
tab_xtab(attrit_dta$recruit_mode, attrit_dta$Y2, show.row.prc = TRUE) # viewer
recruit_modeY2Total
01
Twitter recruitment19
48.7 %
20
51.3 %
39
100 %
New panelist
recruited after
January 1, 2024
28
27.5 %
74
72.5 %
102
100 %
Existing panelist204
31.9 %
435
68.1 %
639
100 %
Total251
32.2 %
529
67.8 %
780
100 %
χ2=5.952 · df=2 · Cramer’s V=0.087 · p=0.051

  • Wave 1 vs. 3 (1: Attrit in Wave 3 vs. 1)
attrit_dta |>
  select(recruit_mode, Y3) |> # wave 1 vs. wave 3 (1: attrit in Wave 3 vs. 1)
  infer::chisq_test(Y3~recruit_mode) # Pearson's Chi-squared test
## # A tibble: 1 × 3
##   statistic chisq_df p_value
##       <dbl>    <int>   <dbl>
## 1     0.744        2   0.689
#* statistic  chisq_df  p_value
#* 0.744      2         0.689
tab_xtab(attrit_dta$recruit_mode, attrit_dta$Y3, show.row.prc = TRUE) # viewer
recruit_modeY3Total
01
Twitter recruitment7
17.9 %
32
82.1 %
39
100 %
New panelist
recruited after
January 1, 2024
13
12.7 %
89
87.3 %
102
100 %
Existing panelist99
15.5 %
540
84.5 %
639
100 %
Total119
15.3 %
661
84.7 %
780
100 %
χ2=0.744 · df=2 · Cramer’s V=0.031 · p=0.689

  • Wave 2 vs. 3 (1: Attrit in Wave 3 vs. 2)
attrit_dta |>
  filter(Y2 != 1) |> # among those took wave 2
  mutate(Y23 = as.factor(case_when( 
    Y2 == 0 & Y3 == 1 ~ 1, .default = 0))) |>
  select(recruit_mode, Y23) |> # wave 2 vs. wave 3 (1: attrit in Wave 3 vs. 2)
  infer::chisq_test(Y23~recruit_mode) # Pearson's Chi-squared test
## # A tibble: 1 × 3
##   statistic chisq_df p_value
##       <dbl>    <int>   <dbl>
## 1     0.964        2   0.617
attrit_dta |>
  mutate(recruit_mode2 = case_when(recruit_mode=="Twitter recruitment"~"Twitter",
                                   .default = "Panelist")) -> attrit_dta2
attrit_dta2 |>
  filter(Y2 !=1) |>
  mutate(Y23 = as.factor(case_when(Y2 == 0 & Y3 == 1 ~ 1, 
                                   .default = 0))) -> attrit_dta3
tab_xtab(attrit_dta3$recruit_mode, attrit_dta3$Y23, show.row.prc = TRUE) 
recruit_modeY23Total
01
Twitter recruitment7
36.8 %
12
63.2 %
19
100 %
New panelist
recruited after
January 1, 2024
13
46.4 %
15
53.6 %
28
100 %
Existing panelist99
48.5 %
105
51.5 %
204
100 %
Total119
47.4 %
132
52.6 %
251
100 %
χ2=0.964 · df=2 · Cramer’s V=0.062 · p=0.617

2.1.2 Twitter vs. Panelist (New + Existing)

  • Wave 1 vs. 2 (1: Attrit in Wave 2 vs. 1)
attrit_dta |>
  # what if recruitment mode = twitter vs. panelist (existing + new)? 
  mutate(recruit_mode2 = case_when(recruit_mode=="Twitter recruitment"~"Twitter",
                                   .default = "Panelist")) |>
  chisq_test(Y2~recruit_mode2) 
## # A tibble: 1 × 3
##   statistic chisq_df p_value
##       <dbl>    <int>   <dbl>
## 1      4.38        1  0.0364
tab_xtab(attrit_dta2$recruit_mode2, attrit_dta2$Y2, show.row.prc = TRUE) 
recruit_mode2Y2Total
01
Panelist232
31.3 %
509
68.7 %
741
100 %
Twitter19
48.7 %
20
51.3 %
39
100 %
Total251
32.2 %
529
67.8 %
780
100 %
χ2=4.378 · df=1 · φ=0.081 · p=0.036
# Wave 2 attrit (vs. wave 1): Twitter < Panelist 

  • Wave 1 vs. 3 (1: Attrit in Wave 3 vs. 1)
attrit_dta2 |> 
  chisq_test(Y3~recruit_mode2)
## # A tibble: 1 × 3
##   statistic chisq_df p_value
##       <dbl>    <int>   <dbl>
## 1    0.0632        1   0.802
# No diff 

  • Wave 2 vs. 3 (1: Attrit in Wave 3 vs. 2)
# recruitment mode: two (twitter vs. panelist) 
attrit_dta3 |> 
  chisq_test(Y23~recruit_mode2)
## # A tibble: 1 × 3
##   statistic chisq_df p_value
##       <dbl>    <int>   <dbl>
## 1     0.519        1   0.471

✅ No differences in attrition between recruitment mode, except for Twitter vs. Panelist (binary) (wave 2 vs. 1).


2.2 Attrition by eligibility criteria

# (wave 1 vs. wave 2)
tab_xtab(attrit_dta$eligibility_criteria, attrit_dta$Y2, show.row.prc = TRUE) 
eligibility_criteriaY2Total
01
Both154
69.1 %
69
30.9 %
223
100 %
Exposure93
68.4 %
43
31.6 %
136
100 %
Follow4
57.1 %
3
42.9 %
7
100 %
Total251
68.6 %
115
31.4 %
366
100 %
χ2=0.451 · df=2 · Cramer’s V=0.035 · Fisher’s p=0.823
# (wave 1 vs. wave 3)
tab_xtab(attrit_dta$eligibility_criteria, attrit_dta$Y3, show.row.prc = TRUE) 
eligibility_criteriaY3Total
01
Both65
29.1 %
158
70.9 %
223
100 %
Exposure52
38.2 %
84
61.8 %
136
100 %
Follow2
28.6 %
5
71.4 %
7
100 %
Total119
32.5 %
247
67.5 %
366
100 %
χ2=3.230 · df=2 · Cramer’s V=0.094 · Fisher’s p=0.204
# (wave 2 vs. wave 3)
tab_xtab(attrit_dta3$eligibility_criteria, attrit_dta3$Y23, show.row.prc = TRUE) 
eligibility_criteriaY23Total
01
Both65
42.2 %
89
57.8 %
154
100 %
Exposure52
55.9 %
41
44.1 %
93
100 %
Follow2
50 %
2
50 %
4
100 %
Total119
47.4 %
132
52.6 %
251
100 %
χ2=4.380 · df=2 · Cramer’s V=0.132 · Fisher’s p=0.112

✅ No differences in attrition between eligibility criteria

2.3 Attrition by pre-treatment engagement

  • Wave 1 vs. 2 (1: Attrit in Wave 2 vs. 1)
# (wave 1 vs. wave 2)
tab_xtab(attrit_dta$pre_total_engage, attrit_dta$Y2, show.row.prc = TRUE) 
pre_total_engageY2Total
01
0119
19.5 %
491
80.5 %
610
100 %
1132
77.6 %
38
22.4 %
170
100 %
Total251
32.2 %
529
67.8 %
780
100 %
χ2=203.254 · df=1 · φ=0.514 · p=0.000
attrit_dta |>
  select(pre_total_engage, Y2) |> # wave 1 vs. wave 2 (1: attrit in Wave 2 vs. 1)
  infer::chisq_test(Y2~pre_total_engage)  
## # A tibble: 1 × 3
##   statistic chisq_df  p_value
##       <dbl>    <int>    <dbl>
## 1      203.        1 4.07e-46

  • Wave 1 vs. 3 (1: Attrit in Wave 3 vs. 1)
# (wave 1 vs. wave 3)
tab_xtab(attrit_dta$pre_total_engage, attrit_dta$Y3, show.row.prc = TRUE) 
pre_total_engageY3Total
01
057
9.3 %
553
90.7 %
610
100 %
162
36.5 %
108
63.5 %
170
100 %
Total119
15.3 %
661
84.7 %
780
100 %
χ2=73.583 · df=1 · φ=0.311 · p=0.000
attrit_dta |>
  select(pre_total_engage, Y3) |> # wave 1 vs. wave 3 (1: attrit in Wave 3 vs. 1)
  infer::chisq_test(Y3~pre_total_engage) 
## # A tibble: 1 × 3
##   statistic chisq_df  p_value
##       <dbl>    <int>    <dbl>
## 1      73.6        1 9.65e-18

  • Wave 2 vs. 3 (1: Attrit in Wave 3 vs. 2)
# (wave 2 vs. wave 3)
tab_xtab(attrit_dta3$pre_total_engage, attrit_dta3$Y23, show.row.prc = TRUE) 
pre_total_engageY23Total
01
057
47.9 %
62
52.1 %
119
100 %
162
47 %
70
53 %
132
100 %
Total119
47.4 %
132
52.6 %
251
100 %
χ2=0.000 · df=1 · φ=0.009 · p=0.984
attrit_dta3 |>
  select(pre_total_engage, Y23) |> # wave 2 vs. wave 3 (1: attrit in Wave 3 vs. 2)
  infer::chisq_test(Y23~pre_total_engage) 
## # A tibble: 1 × 3
##   statistic chisq_df p_value
##       <dbl>    <int>   <dbl>
## 1  0.000428        1   0.984
#* statistic  chisq_df  p_value
#* 0.000428   1         0.984
# ========= No differences in attrition between wave 2 vs. wave 3 ==============

✅ Different attrition by pre-treatment total engagement: Ppl with non-zero pre-treatment engagement attrit less in Wave 3 or 2 vs. 1. (No differences in attrition between wave 2 vs. wave 3)


2.4 [Differential Attrition] By randomized group

  • Wave 2 vs. 3 (1: Attrit in Wave 3 vs. 2) only since the treatment was in Wave 2
# ==============================================================================
# 4. Randomized group (Differential attrition) ================================= 
# ==============================================================================

# (wave 2 vs. wave 3) (Y23: 1 = attrit in Wave 3 vs. 2)
attrit_dta4 <- attrit_dta3 |>
  filter(randomized != "Drop") |> # Among randomized participants 
  mutate(randomized = factor(randomized,
                             levels = c("Control", "Media Literacy", "Muting"))) 
attrit_dta4 |> 
  infer::chisq_test(Y23~randomized) 
## Warning in stats::chisq.test(table(x), ...): Chi-squared approximation may be
## incorrect
## # A tibble: 1 × 3
##   statistic chisq_df p_value
##       <dbl>    <int>   <dbl>
## 1     0.636        2   0.728
tab_xtab(attrit_dta4$randomized, attrit_dta4$Y23, show.row.prc = TRUE) 
randomizedY23Total
01
Control42
87.5 %
6
12.5 %
48
100 %
Media Literacy36
85.7 %
6
14.3 %
42
100 %
Muting41
91.1 %
4
8.9 %
45
100 %
Total119
88.1 %
16
11.9 %
135
100 %
χ2=0.636 · df=2 · Cramer’s V=0.069 · p=0.728
# ========= No differential attrition between wave 2 vs. wave 3 ================

✅ No differential attrition between wave 2 vs. wave 3


2.5 [Selective attrition] By observable characteristics

Test null of no difference in party_id, white, age_group, college_grad, male between people who attrit and those who don’t (Y2; Y3)

  1. T-tests with unequal variances for binary (white, college_grad, male, democrat, republican) and continuous variables
  • White (non-white - white)
# ==============================================================================
# 5. Observable characteristics (Selective attrition) ========================== 
#* ========================================================================== *#
#* Selective attrition: Wave 1 vs. Wave 2; Wave 1 vs. Wave 3 
#* Test null of no difference in party_id, white, age_group, college_grad, male
#* between people who attrit and those who don’t (Y2; Y3)
#* ========================================================================== *#
#* T-tests with unequal variances for binary and continuous variables 
#* - white, college_grad, male (binary); 
#* - democrat, republican (as binary)
#* ========================================================================== *#
attrit_dta |> 
  mutate(white = as.numeric(white)-1) |> 
  infer::t_test(white ~ Y2, order = c("0", "1"), var.equal=FALSE) # mean(non-white) - mean(white)
## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1      1.99  514.  0.0469 two.sided     0.0715 0.000996    0.142
#* statistic  t_df  p_value alternative   estimate  lower_ci  upper_ci
#* 1.99       514.  0.0469  two.sided     0.0715    0.000996  0.142
# white less attrit (wave 2 vs. 1) 

attrit_dta |> 
  mutate(white = as.numeric(white)-1) |> 
  infer::t_test(white ~ Y3, order = c("0", "1"), var.equal=FALSE)
## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1      2.40  173.  0.0173 two.sided      0.107   0.0192    0.195
#* statistic  t_df  p_value alternative   estimate  lower_ci  upper_ci
#* 2.40       173.  0.0173  two.sided     0.107     0.0192    0.195
# white less attrit (wave 3 vs. 1)

# FYI: no diff in attrit between white vs. non-white (wave 3 vs. 2)

✅ white less attrit (wave 2 or 3 vs. 1) ; no diff in attrit between white vs. non-white (wave 3 vs. 2)


  • College grad (non-college grad - college grad)
# ============================================================================ #
attrit_dta |> 
  mutate(college_grad = as.numeric(college_grad)-1) |> 
  infer::t_test(college_grad ~ Y2, order = c("0", "1"), var.equal=FALSE)
## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1      3.10  493. 0.00206 two.sided      0.118   0.0431    0.193
#* statistic  t_df  p_value alternative   estimate  lower_ci  upper_ci
#* 3.10       493.  0.00206 two.sided     0.118     0.0431    0.193
# college graduate less attrit (wave 2 vs. 1)

attrit_dta |> 
  mutate(college_grad = as.numeric(college_grad)-1) |> 
  infer::t_test(college_grad ~ Y3, order = c("0", "1"), var.equal=FALSE)
## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1      2.36  164.  0.0196 two.sided      0.116   0.0189    0.214
#* statistic  t_df  p_value alternative   estimate  lower_ci  upper_ci
#* 2.36       164.  0.0196  two.sided     0.116     0.0189    0.214
# college graduate less attrit (wave 3 vs. 1)

# FYI: no diff in attrit between college grad vs. non-college grad (wave 3 vs. 2)

✅ college graduate less attrit (wave 2 or 3 vs. 1) ; no diff in attrit between college grad vs. non-college grad (wave 3 vs. 2)


  • Male (non-male - male)
# ============================================================================ #
attrit_dta |> 
  mutate(male = as.numeric(male)-1) |> 
  infer::t_test(male ~ Y2, order = c("0", "1"), var.equal=FALSE)
## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1    -0.629  492.   0.529 two.sided    -0.0241  -0.0993   0.0511
#* statistic  t_df  p_value alternative   estimate  lower_ci  upper_ci
#* -0.629     492.  0.529   two.sided     -0.0241   -0.0993   0.0511
# no sig diff (wave 2 vs. 1)

attrit_dta |> 
  mutate(male = as.numeric(male)-1) |> 
  infer::t_test(male ~ Y3, order = c("0", "1"), var.equal=FALSE)
## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1     -2.45  166.  0.0153 two.sided     -0.119   -0.215  -0.0231
#* statistic  t_df  p_value alternative   estimate  lower_ci  upper_ci
#* -2.45      166.  0.0153  two.sided     -0.119    -0.215    -0.0231
# male attrit more (wave 3 vs. 1)
# FYI: no diff in attrit between male vs. non-male (wave 3 vs. 2)

✅ male attrit more (wave 3 vs. 1) ; no diff in attrit between male vs. non-male grad (wave 3 vs. 2; wave 2 vs. 1)


  • Democrat (non-democrat - democrat)
# ============================================================================ #
attrit_dta |>
  mutate(democrat = ifelse(party_id == "Democrat", 1, 0)) |>
  infer::t_test(democrat ~ Y2, order = c("0", "1"), var.equal=FALSE)
## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1     -3.08  491. 0.00216 two.sided     -0.118   -0.192  -0.0426
#* statistic  t_df  p_value alternative   estimate  lower_ci  upper_ci
#* -3.08      491.  0.00216 two.sided     -0.118    -0.192    -0.0426
# democrat more attrit (wave 2 vs. 1)

attrit_dta |>
  mutate(democrat = ifelse(party_id == "Democrat", 1, 0)) |>
  infer::t_test(democrat ~ Y3, order = c("0", "1"), var.equal=FALSE)
## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1    -0.325  163.   0.746 two.sided    -0.0162   -0.115   0.0824
#* statistic  t_df  p_value alternative   estimate  lower_ci  upper_ci
#* -0.325     163.  0.746   two.sided     -0.0162   -0.115    0.0824
# no sig diff (wave 3 vs. 1)

attrit_dta3 |>
  mutate(democrat = ifelse(party_id == "Democrat", 1, 0)) |>
  infer::t_test(democrat ~ Y23, order = c("0", "1"), var.equal=FALSE)
## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1      2.00  245.  0.0461 two.sided      0.125  0.00217    0.249
#* statistic  t_df  p_value alternative   estimate  lower_ci  upper_ci
#* 2.00       245.  0.0461  two.sided     0.125     0.00217   0.249
# democrat less attrit (wave 3 vs. 2)

✅ democrat more attrit (wave 2 vs. 1) ; no sig diff (wave 3 vs. 1) ; democrat less attrit (wave 3 vs. 2)


  • Republican (non-republican - republican)
# ============================================================================ #
attrit_dta |>
  mutate(republican = ifelse(party_id == "Republican", 1, 0)) |>
  infer::t_test(republican ~ Y2, order = c("0", "1"), var.equal=FALSE)
## # A tibble: 1 × 7
##   statistic  t_df   p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>     <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1      3.97  453. 0.0000840 two.sided      0.147   0.0742    0.220
#* statistic  t_df  p_value   alternative   estimate  lower_ci  upper_ci
#* 3.97       453.  0.000084  two.sided     0.147     0.0742    0.220
# republican less attrit (wave 2 vs. 1)

attrit_dta |>
  mutate(republican = ifelse(party_id == "Republican", 1, 0)) |>
  infer::t_test(republican ~ Y3, order = c("0", "1"), var.equal=FALSE)
## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1      1.07  160.   0.288 two.sided     0.0514  -0.0439    0.147
#* statistic  t_df  p_value alternative   estimate  lower_ci  upper_ci
#* 1.07       160.  0.288   two.sided     0.0514    -0.0439   0.147
# no sig diff (wave 3 vs. 1)

attrit_dta3 |>
  mutate(republican = ifelse(party_id == "Republican", 1, 0)) |>
  infer::t_test(republican ~ Y23, order = c("0", "1"), var.equal=FALSE)
## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1     -1.71  248.  0.0888 two.sided     -0.107   -0.230   0.0163
#* statistic  t_df  p_value alternative   estimate  lower_ci  upper_ci
#* -3.83      179.  0.00018 two.sided     -0.181    -0.274    -0.0876
# republican more attrit (wave 3 vs. 2)

✅ republican less attrit (wave 2 vs. 1) ; no sig diff (wave 3 vs. 1) ; republican more attrit (wave 3 vs. 2) (=Opposite of democrat)

  1. Chi-squared tests using the Benjamini-Hochberg procedure for factors
  • factors: party_id, age_group
#* ========================================================================== *#
#* Chi-squared tests using the Benjamini-Hochberg procedure for factors
#* - party_id, age_group (factors) 
#* ========================================================================== *#
attrit_dta |>
  infer::chisq_test(party_id ~ Y2)
## # A tibble: 1 × 3
##   statistic chisq_df  p_value
##       <dbl>    <int>    <dbl>
## 1      16.5        2 0.000259
tab_xtab(attrit_dta$party_id, attrit_dta$Y2, show.row.prc = TRUE) 
party_idY2Total
01
Democrat110
27.2 %
294
72.8 %
404
100 %
Neither32
27.8 %
83
72.2 %
115
100 %
Republican109
41.8 %
152
58.2 %
261
100 %
Total251
32.2 %
529
67.8 %
780
100 %
χ2=16.520 · df=2 · Cramer’s V=0.146 · p=0.000
#*   statistic chisq_df  p_value
#*   16.5      2         0.000259

attrit_dta |>
  infer::chisq_test(age_group ~ Y2)
## Warning in stats::chisq.test(table(x), ...): Chi-squared approximation may be
## incorrect
## # A tibble: 1 × 3
##   statistic chisq_df    p_value
##       <dbl>    <int>      <dbl>
## 1      38.1        6 0.00000107
#*   statistic chisq_df  p_value
#*   38.1        6       0.00000107 
#*   (with warning: chi-squared approximation may be incorrect)
tab_xtab(attrit_dta$age_group, attrit_dta$Y2, show.row.prc = TRUE) 
age_groupY2Total
01
18-3445
19.9 %
181
80.1 %
226
100 %
35-4465
28.6 %
162
71.4 %
227
100 %
45-5467
42.1 %
92
57.9 %
159
100 %
55-6448
42.9 %
64
57.1 %
112
100 %
65-7420
43.5 %
26
56.5 %
46
100 %
75-845
62.5 %
3
37.5 %
8
100 %
85+1
100 %
0
0 %
1
100 %
Total251
32.2 %
528
67.8 %
779
100 %
χ2=38.111 · df=6 · Cramer’s V=0.221 · Fisher’s p=0.000
  • To be updated (currently figuring our what I should show)

OR following this paper (I used their R script for attrition tables):

  • % of respondents by each category in each wave

    • e.g., In Wave 1, 51.8% were self-identified democrat, 33.5% republican, and 14.7% as neither. In Wave 2 (among those who took Wave 2; didn’t attrit), 43.8% were democrat, 43.4% republican, and 12.7% neither.

    • p-value: result of chi-squared tests

      • For factor variables (Party ID and Age Group), p-values are adjusted using BH => 👀 not sure I did this right though
# OR 
# SETUP ========================================================================
data_w1 <- attrit_dta # started from wave 1 
data_w12 <- attrit_dta |> filter(Y2 == 0) # drop the attrit in wave 2 (vs. 1)
data_w123 <- attrit_dta |> filter(Y2 == 0 & Y3 == 0) # drop the attrit in wave 3 (vs. 2)

# Attrition Table
wave1_bind <- rbind(
  data.frame(prop.table(table(data_w1$party_id))*100, variable = "Party ID"),
  data.frame(prop.table(table(data_w1$white))*100, variable = "White"), 
  data.frame(prop.table(table(data_w1$age_group))*100, variable = "Age Group"), 
  data.frame(prop.table(table(data_w1$college_grad))*100, variable = "College Grad"),
  data.frame(prop.table(table(data_w1$male))*100, variable = "Male")
) |> rename(value = Var1, wave1 = Freq) 

wave2_bind <- rbind(
  data.frame(prop.table(table(data_w12$party_id))*100),
  data.frame(prop.table(table(data_w12$white))*100), 
  data.frame(prop.table(table(data_w12$age_group))*100), 
  data.frame(prop.table(table(data_w12$college_grad))*100),
  data.frame(prop.table(table(data_w12$male))*100)
) |> rename(value = Var1, wave2 = Freq) |> select(!value)

wave3_bind <- rbind(
  data.frame(prop.table(table(data_w123$party_id))*100),
  data.frame(prop.table(table(data_w123$white))*100), 
  data.frame(prop.table(table(data_w123$age_group))*100), 
  data.frame(prop.table(table(data_w123$college_grad))*100),
  data.frame(prop.table(table(data_w123$male))*100)
) |> rename(value = Var1, wave3 = Freq) |> select(!value)

cbind(wave1_bind, wave2_bind, wave3_bind) |>
  relocate(variable, value, wave1, wave2, wave3) |>
  mutate(
    wave1 = round(wave1, 1),
    wave2 = round(wave2, 1),
    wave3 = round(wave3, 1)
  ) -> attrition_table

# Add significance tests
p_pid <- round(chisq.test(rbind(
  (table(data_w1$party_id)),
  (table(data_w12$party_id)),
  (table(data_w123$party_id))))$p.value, 3)
p_white <- round(chisq.test(rbind(
  (table(data_w1$white)),
  (table(data_w12$white)),
  (table(data_w123$white))))$p.value, 3)
p_age <- round(chisq.test(rbind(
  (table(data_w1$age_group)),
  (table(data_w12$age_group)),
  (table(data_w123$age_group))))$p.value, 3)
## Warning in chisq.test(rbind((table(data_w1$age_group)),
## (table(data_w12$age_group)), : Chi-squared approximation may be incorrect
p_collgrad <- round(chisq.test(rbind(
  (table(data_w1$college_grad)),
  (table(data_w12$college_grad)),
  (table(data_w123$college_grad))))$p.value, 3)
p_male <- round(chisq.test(rbind(
  (table(data_w1$male)),
  (table(data_w12$male)),
  (table(data_w123$male))))$p.value, 3)

# BH procedure for factor variables (***Is this right?)
# (https://www.r-bloggers.com/2023/07/the-benjamini-hochberg-procedure-fdr-and-p-value-adjusted-explained/)
p.adjust(c(p_pid, p_age), method="BH")
## [1] 0.067 0.062
attrition_table <- attrition_table |> 
  mutate(sig = case_when(
    variable == "Party ID" ~ p_pid,
    variable == "White" ~ p_white,
    variable == "Age Group" ~ 0.062,
    variable == "College Grad" ~ p_collgrad,
    variable == "Male" ~ p_male)) 

attrition_table |> 
  kbl(caption = "Attrition by demographics", 
      format = "html", booktabs = T, escape = F, linesep = "", row.names = F, 
      col.names = c("Variable", "Value", "Wave1", "Wave2", "Wave3", "p-value")) |>
  collapse_rows(columns = c(1, 6), valign = "middle", 
                row_group_label_position = "stack",
                latex_hline = "major") |>
  kable_classic_2("hover", full_width = F)
Attrition by demographics
VariableValueWave1Wave2Wave3p-value
Party IDDemocrat51.843.850.40.067
Neither14.712.711.8
Republican33.543.437.8
White035.130.326.10.081
164.969.773.9
Age Group18-3429.017.920.20.062
35-4429.125.928.6
45-5420.426.723.5
55-6414.419.119.3
65-745.98.07.6
75-841.02.00.8
85+0.10.40.0
College Grad051.043.041.20.023
149.057.058.8
Male052.954.663.00.120
147.145.437.0

✅ College graduates attrited less