Attrition Analysis

# Load packages ================================================================
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  conflicted,    # Avoid conflict of functions with same names
  tidyverse,     # Tidyverse umbrella package
  haven,         # read_dta()
  infer,         # chisq_test() 
  sjPlot,        # tab_xtab()
  kableExtra,    # kbl()
  DT             # datatable() 
)  

conflict_prefer("select", "dplyr")
conflict_prefer("filter", "dplyr")


# Import data ==================================================================
# Load survey data
df_w3 <- readRDS("survey_data/2023-095c_client_wave-3.rds")    # wave 3 
df_w2 <- read_dta("survey_data/2023-095b_client_wave-2.dta")   # wave 2
df_w1_1 <- readRDS("survey_data/2023-095a_client_wave-1.rds")  
df_w1_2 <- readRDS("survey_data/2023-095a_client_wave-1_supp.rds") 
df_w1 <- rowid_to_column(rbind(df_w1_1, df_w1_2), "ID") # merge and add ID

# Users who were eligible for eligibility test: ================================
users_for_elig_test_final <- read_csv("backend_data/users_for_elig_test_final.csv", 
                                      col_types = cols(user_id = col_character()))

# Wave 1 data: filter & select =================================================
wave1 <- df_w1 |>  
  filter(Progress==100) |> # who finished wave 1 till the end
  select(ID, userid, vs_age:vs_recruitment) |> 
  rename(user_id = userid) |> # rename userid for consistency
  merge(users_for_elig_test_final, by=c("user_id"), all.y=TRUE) |>
  select(ID, user_id, screen_name, vsid, vs_recruitment,
         vs_pid_base, vs_pid_lean, vs_raceeth, vs_age, vs_education, vs_gender)
  
# Eligibility criteria: follow OR exposure =====================================
eligibility_results <- read_csv("backend_data/eligibility_results.csv", 
                                col_types = cols(user_id = col_character(), 
                                                 account_created = col_skip(), 
                                                 following_us = col_skip()))

wave1 |> 
  merge(eligibility_results, by="user_id") |>
  relocate(user_id, following_NG, hometimeline) |>
  rename(follow = following_NG,
         exposure = hometimeline) -> wave1_merge

# Pre-treatment engagement =====================================================
pre_engagement_final <- read_csv("backend_data/pre_engagement_final.csv", 
                                 col_types = cols(user_id = col_character(), 
                                                  target_user_id = col_skip(), 
                                                  type = col_skip()))
pre_likes_final <- read_csv("backend_data/pre_likes_final.csv", 
                            col_types = cols(user_id = col_character(), 
                                             target_user_id = col_skip(), 
                                             match_type = col_skip()))
pre_engagement <- rbind(pre_engagement_final, pre_likes_final) |> 
  unique() |> 
  mutate(pre_total_engage = 1) 

wave1_merge |> 
  merge(pre_engagement, by="user_id", all = TRUE) |> 
  mutate_at("pre_total_engage", ~replace(., is.na(.), 0)) -> wave1_merge


# Wave 2 =======================================================================
df_w2 |> 
  filter(userid!="") |> 
  select(userid, randomized_group) |> 
  mutate(Y2 = 0) |> # a new variable indicating took wave 2 (attrition = 1)
  rename(user_id = userid) -> user_w2

# Merge ========================================================================
wave2_merge <- wave1_merge |>
  merge(user_w2, by="user_id", all = TRUE) |>
  mutate_at("Y2", ~replace(., is.na(.), 1)) # attrition to wave 2 = 1 

# Wave 3 =======================================================================
df_w3 |> 
  select(userid) |> 
  mutate(Y3 = 0) |> 
  rename(user_id = userid) -> wave3

wave3_merge <- wave2_merge |>
  merge(wave3, by="user_id", all = TRUE) |>
  mutate_at("Y3", ~replace(., is.na(.), 1)) # attrition to wave 3 = 1

# Data preprocess ==============================================================
wave3_merge |>
  relocate(user_id, follow, exposure, vs_recruitment, pre_total_engage, 
           randomized_group, vs_pid_base:vs_pid_lean, vs_raceeth, vs_age,
           vs_education, vs_gender, Y2, Y3) |>
  mutate(
    party_id = as.factor(case_when(
      vs_pid_base == "Democrat" | vs_pid_lean == "The Democratic Party" ~ "Democrat",
      vs_pid_base == "Republican" | vs_pid_lean == "The Republican Party" ~ "Republican",
      vs_pid_lean == "Neither" ~ "Neither")),
    
    age_group = as.factor(case_when(
      vs_age < 35 ~ "18-34",
      vs_age >= 35 & vs_age < 45 ~ "35-44",
      vs_age >= 45 & vs_age < 55 ~ "45-54", 
      vs_age >= 55 & vs_age < 65 ~ "55-64",
      vs_age >= 65 & vs_age < 75 ~ "65-74", 
      vs_age >= 75 & vs_age < 85 ~ "75-84", 
      vs_age >= 85 ~ "85+")),
    
    college_grad = as.factor(case_when(
      vs_education == "2-year or associate degree" ~ 1, 
      vs_education == "4-year or bachelor degree" ~ 1, 
      vs_education == "Post-graduate degree" ~ 1, 
      .default = 0)),
    
    white = as.factor(case_when(vs_raceeth == "White" ~ 1, .default = 0)),
    
    male = as.factor(case_when(vs_gender == "Male" ~ 1, .default = 0)),
    
    Y2 = as.factor(Y2),
    Y3 = as.factor(Y3),
    
    pre_total_engage = as.factor(pre_total_engage),
    
    randomized = as.factor(case_when(
      randomized_group == "control" ~ "Control",
      randomized_group == "drop" | randomized_group == "muting_treatment2_drop" ~ "Drop",
      randomized_group == "media_literacy" ~ "Media Literacy",
      randomized_group == "muting_treatment1" | randomized_group == "muting_treatment2" ~ "Muting",
      is.na(randomized_group) ~ NA)),
    
    eligibility_criteria = as.factor(case_when(
      follow == TRUE & exposure == TRUE ~ "Both",
      follow == TRUE & exposure == FALSE ~ "Follow",
      follow == FALSE & exposure == TRUE ~ "Exposure"))
    ) |>
  rename(recruit_mode = vs_recruitment) |> 
  select(user_id, follow, exposure, eligibility_criteria, recruit_mode, 
         pre_total_engage, randomized, party_id, 
         white, age_group, college_grad, male,
         Y2, Y3) -> attrit_dta

1 Data description

Y2 (1 if attrit in Wave 2 vs. 1)
Y3 (1 if attrit in Wave 3 vs. 1)
Y23 (1 if attrit in Wave 3 vs. 2)

DT::datatable(attrit_dta)

2 Attrition Analysis

2.1 Attrition by recruitment mode

2.1.1 Twitter vs. New Panelist vs. Existing Panelist

Wave 1 vs. 2 (1: Attrit in Wave 2 vs. 1)

attrit_dta |>
  select(recruit_mode, Y2) |> # wave 1 vs. wave 2 (1: attrit in Wave 2 vs. 1)
  infer::chisq_test(Y2~recruit_mode) # Pearson's Chi-squared test

## # A tibble: 1 × 3
##   statistic chisq_df p_value
##       <dbl>    <int>   <dbl>
## 1      5.95        2  0.0510

tab_xtab(attrit_dta$recruit_mode, attrit_dta$Y2, show.row.prc = TRUE) # viewer

recruit_mode	Y2		Total
recruit_mode	0	1	Total
Twitter recruitment	19 48.7 %	20 51.3 %	39 100 %
New panelist recruited after January 1, 2024	28 27.5 %	74 72.5 %	102 100 %
Existing panelist	204 31.9 %	435 68.1 %	639 100 %
Total	251 32.2 %	529 67.8 %	780 100 %
χ²=5.952 · df=2 · Cramer’s V=0.087 · p=0.051

Wave 1 vs. 3 (1: Attrit in Wave 3 vs. 1)

attrit_dta |>
  select(recruit_mode, Y3) |> # wave 1 vs. wave 3 (1: attrit in Wave 3 vs. 1)
  infer::chisq_test(Y3~recruit_mode) # Pearson's Chi-squared test

## # A tibble: 1 × 3
##   statistic chisq_df p_value
##       <dbl>    <int>   <dbl>
## 1     0.744        2   0.689

#* statistic  chisq_df  p_value
#* 0.744      2         0.689
tab_xtab(attrit_dta$recruit_mode, attrit_dta$Y3, show.row.prc = TRUE) # viewer

recruit_mode	Y3		Total
recruit_mode	0	1	Total
Twitter recruitment	7 17.9 %	32 82.1 %	39 100 %
New panelist recruited after January 1, 2024	13 12.7 %	89 87.3 %	102 100 %
Existing panelist	99 15.5 %	540 84.5 %	639 100 %
Total	119 15.3 %	661 84.7 %	780 100 %
χ²=0.744 · df=2 · Cramer’s V=0.031 · p=0.689

Wave 2 vs. 3 (1: Attrit in Wave 3 vs. 2)

attrit_dta |>
  filter(Y2 != 1) |> # among those took wave 2
  mutate(Y23 = as.factor(case_when( 
    Y2 == 0 & Y3 == 1 ~ 1, .default = 0))) |>
  select(recruit_mode, Y23) |> # wave 2 vs. wave 3 (1: attrit in Wave 3 vs. 2)
  infer::chisq_test(Y23~recruit_mode) # Pearson's Chi-squared test

## # A tibble: 1 × 3
##   statistic chisq_df p_value
##       <dbl>    <int>   <dbl>
## 1     0.964        2   0.617

attrit_dta |>
  mutate(recruit_mode2 = case_when(recruit_mode=="Twitter recruitment"~"Twitter",
                                   .default = "Panelist")) -> attrit_dta2
attrit_dta2 |>
  filter(Y2 !=1) |>
  mutate(Y23 = as.factor(case_when(Y2 == 0 & Y3 == 1 ~ 1, 
                                   .default = 0))) -> attrit_dta3
tab_xtab(attrit_dta3$recruit_mode, attrit_dta3$Y23, show.row.prc = TRUE)

recruit_mode	Y23		Total
recruit_mode	0	1	Total
Twitter recruitment	7 36.8 %	12 63.2 %	19 100 %
New panelist recruited after January 1, 2024	13 46.4 %	15 53.6 %	28 100 %
Existing panelist	99 48.5 %	105 51.5 %	204 100 %
Total	119 47.4 %	132 52.6 %	251 100 %
χ²=0.964 · df=2 · Cramer’s V=0.062 · p=0.617

2.1.2 Twitter vs. Panelist (New + Existing)

Wave 1 vs. 2 (1: Attrit in Wave 2 vs. 1)

attrit_dta |>
  # what if recruitment mode = twitter vs. panelist (existing + new)? 
  mutate(recruit_mode2 = case_when(recruit_mode=="Twitter recruitment"~"Twitter",
                                   .default = "Panelist")) |>
  chisq_test(Y2~recruit_mode2)

## # A tibble: 1 × 3
##   statistic chisq_df p_value
##       <dbl>    <int>   <dbl>
## 1      4.38        1  0.0364

tab_xtab(attrit_dta2$recruit_mode2, attrit_dta2$Y2, show.row.prc = TRUE)

recruit_mode2	Y2		Total
recruit_mode2	0	1	Total
Panelist	232 31.3 %	509 68.7 %	741 100 %
Twitter	19 48.7 %	20 51.3 %	39 100 %
Total	251 32.2 %	529 67.8 %	780 100 %
χ²=4.378 · df=1 · φ=0.081 · p=0.036

# Wave 2 attrit (vs. wave 1): Twitter < Panelist

Wave 1 vs. 3 (1: Attrit in Wave 3 vs. 1)

attrit_dta2 |> 
  chisq_test(Y3~recruit_mode2)

## # A tibble: 1 × 3
##   statistic chisq_df p_value
##       <dbl>    <int>   <dbl>
## 1    0.0632        1   0.802

# No diff

Wave 2 vs. 3 (1: Attrit in Wave 3 vs. 2)

# recruitment mode: two (twitter vs. panelist) 
attrit_dta3 |> 
  chisq_test(Y23~recruit_mode2)

## # A tibble: 1 × 3
##   statistic chisq_df p_value
##       <dbl>    <int>   <dbl>
## 1     0.519        1   0.471

✅ No differences in attrition between recruitment mode, except for Twitter vs. Panelist (binary) (wave 2 vs. 1).

2.2 Attrition by eligibility criteria

# (wave 1 vs. wave 2)
tab_xtab(attrit_dta$eligibility_criteria, attrit_dta$Y2, show.row.prc = TRUE)

eligibility_criteria	Y2		Total
eligibility_criteria	0	1	Total
Both	154 69.1 %	69 30.9 %	223 100 %
Exposure	93 68.4 %	43 31.6 %	136 100 %
Follow	4 57.1 %	3 42.9 %	7 100 %
Total	251 68.6 %	115 31.4 %	366 100 %
χ²=0.451 · df=2 · Cramer’s V=0.035 · Fisher’s p=0.823

# (wave 1 vs. wave 3)
tab_xtab(attrit_dta$eligibility_criteria, attrit_dta$Y3, show.row.prc = TRUE)

eligibility_criteria	Y3		Total
eligibility_criteria	0	1	Total
Both	65 29.1 %	158 70.9 %	223 100 %
Exposure	52 38.2 %	84 61.8 %	136 100 %
Follow	2 28.6 %	5 71.4 %	7 100 %
Total	119 32.5 %	247 67.5 %	366 100 %
χ²=3.230 · df=2 · Cramer’s V=0.094 · Fisher’s p=0.204

# (wave 2 vs. wave 3)
tab_xtab(attrit_dta3$eligibility_criteria, attrit_dta3$Y23, show.row.prc = TRUE)

eligibility_criteria	Y23		Total
eligibility_criteria	0	1	Total
Both	65 42.2 %	89 57.8 %	154 100 %
Exposure	52 55.9 %	41 44.1 %	93 100 %
Follow	2 50 %	2 50 %	4 100 %
Total	119 47.4 %	132 52.6 %	251 100 %
χ²=4.380 · df=2 · Cramer’s V=0.132 · Fisher’s p=0.112

✅ No differences in attrition between eligibility criteria

2.3 Attrition by pre-treatment engagement

Wave 1 vs. 2 (1: Attrit in Wave 2 vs. 1)

# (wave 1 vs. wave 2)
tab_xtab(attrit_dta$pre_total_engage, attrit_dta$Y2, show.row.prc = TRUE)

pre_total_engage	Y2		Total
pre_total_engage	0	1	Total
0	119 19.5 %	491 80.5 %	610 100 %
1	132 77.6 %	38 22.4 %	170 100 %
Total	251 32.2 %	529 67.8 %	780 100 %
χ²=203.254 · df=1 · φ=0.514 · p=0.000

attrit_dta |>
  select(pre_total_engage, Y2) |> # wave 1 vs. wave 2 (1: attrit in Wave 2 vs. 1)
  infer::chisq_test(Y2~pre_total_engage)

## # A tibble: 1 × 3
##   statistic chisq_df  p_value
##       <dbl>    <int>    <dbl>
## 1      203.        1 4.07e-46

Wave 1 vs. 3 (1: Attrit in Wave 3 vs. 1)

# (wave 1 vs. wave 3)
tab_xtab(attrit_dta$pre_total_engage, attrit_dta$Y3, show.row.prc = TRUE)

pre_total_engage	Y3		Total
pre_total_engage	0	1	Total
0	57 9.3 %	553 90.7 %	610 100 %
1	62 36.5 %	108 63.5 %	170 100 %
Total	119 15.3 %	661 84.7 %	780 100 %
χ²=73.583 · df=1 · φ=0.311 · p=0.000

attrit_dta |>
  select(pre_total_engage, Y3) |> # wave 1 vs. wave 3 (1: attrit in Wave 3 vs. 1)
  infer::chisq_test(Y3~pre_total_engage)

## # A tibble: 1 × 3
##   statistic chisq_df  p_value
##       <dbl>    <int>    <dbl>
## 1      73.6        1 9.65e-18

Wave 2 vs. 3 (1: Attrit in Wave 3 vs. 2)

# (wave 2 vs. wave 3)
tab_xtab(attrit_dta3$pre_total_engage, attrit_dta3$Y23, show.row.prc = TRUE)

pre_total_engage	Y23		Total
pre_total_engage	0	1	Total
0	57 47.9 %	62 52.1 %	119 100 %
1	62 47 %	70 53 %	132 100 %
Total	119 47.4 %	132 52.6 %	251 100 %
χ²=0.000 · df=1 · φ=0.009 · p=0.984

attrit_dta3 |>
  select(pre_total_engage, Y23) |> # wave 2 vs. wave 3 (1: attrit in Wave 3 vs. 2)
  infer::chisq_test(Y23~pre_total_engage)

## # A tibble: 1 × 3
##   statistic chisq_df p_value
##       <dbl>    <int>   <dbl>
## 1  0.000428        1   0.984

#* statistic  chisq_df  p_value
#* 0.000428   1         0.984
# ========= No differences in attrition between wave 2 vs. wave 3 ==============

✅ Different attrition by pre-treatment total engagement: Ppl with non-zero pre-treatment engagement attrit less in Wave 3 or 2 vs. 1. (No differences in attrition between wave 2 vs. wave 3)

2.4 [Differential Attrition] By randomized group

Wave 2 vs. 3 (1: Attrit in Wave 3 vs. 2) only since the treatment was in Wave 2

# ==============================================================================
# 4. Randomized group (Differential attrition) ================================= 
# ==============================================================================

# (wave 2 vs. wave 3) (Y23: 1 = attrit in Wave 3 vs. 2)
attrit_dta4 <- attrit_dta3 |>
  filter(randomized != "Drop") |> # Among randomized participants 
  mutate(randomized = factor(randomized,
                             levels = c("Control", "Media Literacy", "Muting"))) 
attrit_dta4 |> 
  infer::chisq_test(Y23~randomized)

## Warning in stats::chisq.test(table(x), ...): Chi-squared approximation may be
## incorrect

## # A tibble: 1 × 3
##   statistic chisq_df p_value
##       <dbl>    <int>   <dbl>
## 1     0.636        2   0.728

tab_xtab(attrit_dta4$randomized, attrit_dta4$Y23, show.row.prc = TRUE)

randomized	Y23		Total
randomized	0	1	Total
Control	42 87.5 %	6 12.5 %	48 100 %
Media Literacy	36 85.7 %	6 14.3 %	42 100 %
Muting	41 91.1 %	4 8.9 %	45 100 %
Total	119 88.1 %	16 11.9 %	135 100 %
χ²=0.636 · df=2 · Cramer’s V=0.069 · p=0.728

# ========= No differential attrition between wave 2 vs. wave 3 ================

✅ No differential attrition between wave 2 vs. wave 3

2.5 [Selective attrition] By observable characteristics

Test null of no difference in party_id, white, age_group, college_grad, male between people who attrit and those who don’t (Y2; Y3)

T-tests with unequal variances for binary (white, college_grad, male, democrat, republican) and continuous variables

White (non-white - white)

# ==============================================================================
# 5. Observable characteristics (Selective attrition) ========================== 
#* ========================================================================== *#
#* Selective attrition: Wave 1 vs. Wave 2; Wave 1 vs. Wave 3 
#* Test null of no difference in party_id, white, age_group, college_grad, male
#* between people who attrit and those who don’t (Y2; Y3)
#* ========================================================================== *#
#* T-tests with unequal variances for binary and continuous variables 
#* - white, college_grad, male (binary); 
#* - democrat, republican (as binary)
#* ========================================================================== *#
attrit_dta |> 
  mutate(white = as.numeric(white)-1) |> 
  infer::t_test(white ~ Y2, order = c("0", "1"), var.equal=FALSE) # mean(non-white) - mean(white)

## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1      1.99  514.  0.0469 two.sided     0.0715 0.000996    0.142

#* statistic  t_df  p_value alternative   estimate  lower_ci  upper_ci
#* 1.99       514.  0.0469  two.sided     0.0715    0.000996  0.142
# white less attrit (wave 2 vs. 1) 

attrit_dta |> 
  mutate(white = as.numeric(white)-1) |> 
  infer::t_test(white ~ Y3, order = c("0", "1"), var.equal=FALSE)

## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1      2.40  173.  0.0173 two.sided      0.107   0.0192    0.195

#* statistic  t_df  p_value alternative   estimate  lower_ci  upper_ci
#* 2.40       173.  0.0173  two.sided     0.107     0.0192    0.195
# white less attrit (wave 3 vs. 1)

# FYI: no diff in attrit between white vs. non-white (wave 3 vs. 2)

✅ white less attrit (wave 2 or 3 vs. 1) ; no diff in attrit between white vs. non-white (wave 3 vs. 2)

College grad (non-college grad - college grad)

# ============================================================================ #
attrit_dta |> 
  mutate(college_grad = as.numeric(college_grad)-1) |> 
  infer::t_test(college_grad ~ Y2, order = c("0", "1"), var.equal=FALSE)

## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1      3.10  493. 0.00206 two.sided      0.118   0.0431    0.193

#* statistic  t_df  p_value alternative   estimate  lower_ci  upper_ci
#* 3.10       493.  0.00206 two.sided     0.118     0.0431    0.193
# college graduate less attrit (wave 2 vs. 1)

attrit_dta |> 
  mutate(college_grad = as.numeric(college_grad)-1) |> 
  infer::t_test(college_grad ~ Y3, order = c("0", "1"), var.equal=FALSE)

## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1      2.36  164.  0.0196 two.sided      0.116   0.0189    0.214

#* statistic  t_df  p_value alternative   estimate  lower_ci  upper_ci
#* 2.36       164.  0.0196  two.sided     0.116     0.0189    0.214
# college graduate less attrit (wave 3 vs. 1)

# FYI: no diff in attrit between college grad vs. non-college grad (wave 3 vs. 2)

✅ college graduate less attrit (wave 2 or 3 vs. 1) ; no diff in attrit between college grad vs. non-college grad (wave 3 vs. 2)

Male (non-male - male)

# ============================================================================ #
attrit_dta |> 
  mutate(male = as.numeric(male)-1) |> 
  infer::t_test(male ~ Y2, order = c("0", "1"), var.equal=FALSE)

## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1    -0.629  492.   0.529 two.sided    -0.0241  -0.0993   0.0511

#* statistic  t_df  p_value alternative   estimate  lower_ci  upper_ci
#* -0.629     492.  0.529   two.sided     -0.0241   -0.0993   0.0511
# no sig diff (wave 2 vs. 1)

attrit_dta |> 
  mutate(male = as.numeric(male)-1) |> 
  infer::t_test(male ~ Y3, order = c("0", "1"), var.equal=FALSE)

## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1     -2.45  166.  0.0153 two.sided     -0.119   -0.215  -0.0231

#* statistic  t_df  p_value alternative   estimate  lower_ci  upper_ci
#* -2.45      166.  0.0153  two.sided     -0.119    -0.215    -0.0231
# male attrit more (wave 3 vs. 1)
# FYI: no diff in attrit between male vs. non-male (wave 3 vs. 2)

✅ male attrit more (wave 3 vs. 1) ; no diff in attrit between male vs. non-male grad (wave 3 vs. 2; wave 2 vs. 1)

Democrat (non-democrat - democrat)

# ============================================================================ #
attrit_dta |>
  mutate(democrat = ifelse(party_id == "Democrat", 1, 0)) |>
  infer::t_test(democrat ~ Y2, order = c("0", "1"), var.equal=FALSE)

## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1     -3.08  491. 0.00216 two.sided     -0.118   -0.192  -0.0426

#* statistic  t_df  p_value alternative   estimate  lower_ci  upper_ci
#* -3.08      491.  0.00216 two.sided     -0.118    -0.192    -0.0426
# democrat more attrit (wave 2 vs. 1)

attrit_dta |>
  mutate(democrat = ifelse(party_id == "Democrat", 1, 0)) |>
  infer::t_test(democrat ~ Y3, order = c("0", "1"), var.equal=FALSE)

## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1    -0.325  163.   0.746 two.sided    -0.0162   -0.115   0.0824

#* statistic  t_df  p_value alternative   estimate  lower_ci  upper_ci
#* -0.325     163.  0.746   two.sided     -0.0162   -0.115    0.0824
# no sig diff (wave 3 vs. 1)

attrit_dta3 |>
  mutate(democrat = ifelse(party_id == "Democrat", 1, 0)) |>
  infer::t_test(democrat ~ Y23, order = c("0", "1"), var.equal=FALSE)

## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1      2.00  245.  0.0461 two.sided      0.125  0.00217    0.249

#* statistic  t_df  p_value alternative   estimate  lower_ci  upper_ci
#* 2.00       245.  0.0461  two.sided     0.125     0.00217   0.249
# democrat less attrit (wave 3 vs. 2)

✅ democrat more attrit (wave 2 vs. 1) ; no sig diff (wave 3 vs. 1) ; democrat less attrit (wave 3 vs. 2)

Republican (non-republican - republican)

# ============================================================================ #
attrit_dta |>
  mutate(republican = ifelse(party_id == "Republican", 1, 0)) |>
  infer::t_test(republican ~ Y2, order = c("0", "1"), var.equal=FALSE)

## # A tibble: 1 × 7
##   statistic  t_df   p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>     <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1      3.97  453. 0.0000840 two.sided      0.147   0.0742    0.220

#* statistic  t_df  p_value   alternative   estimate  lower_ci  upper_ci
#* 3.97       453.  0.000084  two.sided     0.147     0.0742    0.220
# republican less attrit (wave 2 vs. 1)

attrit_dta |>
  mutate(republican = ifelse(party_id == "Republican", 1, 0)) |>
  infer::t_test(republican ~ Y3, order = c("0", "1"), var.equal=FALSE)

## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1      1.07  160.   0.288 two.sided     0.0514  -0.0439    0.147

#* statistic  t_df  p_value alternative   estimate  lower_ci  upper_ci
#* 1.07       160.  0.288   two.sided     0.0514    -0.0439   0.147
# no sig diff (wave 3 vs. 1)

attrit_dta3 |>
  mutate(republican = ifelse(party_id == "Republican", 1, 0)) |>
  infer::t_test(republican ~ Y23, order = c("0", "1"), var.equal=FALSE)

## # A tibble: 1 × 7
##   statistic  t_df p_value alternative estimate lower_ci upper_ci
##       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
## 1     -1.71  248.  0.0888 two.sided     -0.107   -0.230   0.0163

#* statistic  t_df  p_value alternative   estimate  lower_ci  upper_ci
#* -3.83      179.  0.00018 two.sided     -0.181    -0.274    -0.0876
# republican more attrit (wave 3 vs. 2)

✅ republican less attrit (wave 2 vs. 1) ; no sig diff (wave 3 vs. 1) ; republican more attrit (wave 3 vs. 2) (=Opposite of democrat)

Chi-squared tests using the Benjamini-Hochberg procedure for factors

factors: party_id, age_group

#* ========================================================================== *#
#* Chi-squared tests using the Benjamini-Hochberg procedure for factors
#* - party_id, age_group (factors) 
#* ========================================================================== *#
attrit_dta |>
  infer::chisq_test(party_id ~ Y2)

## # A tibble: 1 × 3
##   statistic chisq_df  p_value
##       <dbl>    <int>    <dbl>
## 1      16.5        2 0.000259

tab_xtab(attrit_dta$party_id, attrit_dta$Y2, show.row.prc = TRUE)

party_id	Y2		Total
party_id	0	1	Total
Democrat	110 27.2 %	294 72.8 %	404 100 %
Neither	32 27.8 %	83 72.2 %	115 100 %
Republican	109 41.8 %	152 58.2 %	261 100 %
Total	251 32.2 %	529 67.8 %	780 100 %
χ²=16.520 · df=2 · Cramer’s V=0.146 · p=0.000

#*   statistic chisq_df  p_value
#*   16.5      2         0.000259

attrit_dta |>
  infer::chisq_test(age_group ~ Y2)

## Warning in stats::chisq.test(table(x), ...): Chi-squared approximation may be
## incorrect

## # A tibble: 1 × 3
##   statistic chisq_df    p_value
##       <dbl>    <int>      <dbl>
## 1      38.1        6 0.00000107

#*   statistic chisq_df  p_value
#*   38.1        6       0.00000107 
#*   (with warning: chi-squared approximation may be incorrect)
tab_xtab(attrit_dta$age_group, attrit_dta$Y2, show.row.prc = TRUE)

age_group	Y2		Total
age_group	0	1	Total
18-34	45 19.9 %	181 80.1 %	226 100 %
35-44	65 28.6 %	162 71.4 %	227 100 %
45-54	67 42.1 %	92 57.9 %	159 100 %
55-64	48 42.9 %	64 57.1 %	112 100 %
65-74	20 43.5 %	26 56.5 %	46 100 %
75-84	5 62.5 %	3 37.5 %	8 100 %
85+	1 100 %	0 0 %	1 100 %
Total	251 32.2 %	528 67.8 %	779 100 %
χ²=38.111 · df=6 · Cramer’s V=0.221 · Fisher’s p=0.000

To be updated (currently figuring our what I should show)

OR following this paper (I used their R script for attrition tables):

% of respondents by each category in each wave
- e.g., In Wave 1, 51.8% were self-identified democrat, 33.5% republican, and 14.7% as neither. In Wave 2 (among those who took Wave 2; didn’t attrit), 43.8% were democrat, 43.4% republican, and 12.7% neither.
- p-value: result of chi-squared tests
  - For factor variables (Party ID and Age Group), p-values are adjusted using BH => 👀 not sure I did this right though

# OR 
# SETUP ========================================================================
data_w1 <- attrit_dta # started from wave 1 
data_w12 <- attrit_dta |> filter(Y2 == 0) # drop the attrit in wave 2 (vs. 1)
data_w123 <- attrit_dta |> filter(Y2 == 0 & Y3 == 0) # drop the attrit in wave 3 (vs. 2)

# Attrition Table
wave1_bind <- rbind(
  data.frame(prop.table(table(data_w1$party_id))*100, variable = "Party ID"),
  data.frame(prop.table(table(data_w1$white))*100, variable = "White"), 
  data.frame(prop.table(table(data_w1$age_group))*100, variable = "Age Group"), 
  data.frame(prop.table(table(data_w1$college_grad))*100, variable = "College Grad"),
  data.frame(prop.table(table(data_w1$male))*100, variable = "Male")
) |> rename(value = Var1, wave1 = Freq) 

wave2_bind <- rbind(
  data.frame(prop.table(table(data_w12$party_id))*100),
  data.frame(prop.table(table(data_w12$white))*100), 
  data.frame(prop.table(table(data_w12$age_group))*100), 
  data.frame(prop.table(table(data_w12$college_grad))*100),
  data.frame(prop.table(table(data_w12$male))*100)
) |> rename(value = Var1, wave2 = Freq) |> select(!value)

wave3_bind <- rbind(
  data.frame(prop.table(table(data_w123$party_id))*100),
  data.frame(prop.table(table(data_w123$white))*100), 
  data.frame(prop.table(table(data_w123$age_group))*100), 
  data.frame(prop.table(table(data_w123$college_grad))*100),
  data.frame(prop.table(table(data_w123$male))*100)
) |> rename(value = Var1, wave3 = Freq) |> select(!value)

cbind(wave1_bind, wave2_bind, wave3_bind) |>
  relocate(variable, value, wave1, wave2, wave3) |>
  mutate(
    wave1 = round(wave1, 1),
    wave2 = round(wave2, 1),
    wave3 = round(wave3, 1)
  ) -> attrition_table

# Add significance tests
p_pid <- round(chisq.test(rbind(
  (table(data_w1$party_id)),
  (table(data_w12$party_id)),
  (table(data_w123$party_id))))$p.value, 3)
p_white <- round(chisq.test(rbind(
  (table(data_w1$white)),
  (table(data_w12$white)),
  (table(data_w123$white))))$p.value, 3)
p_age <- round(chisq.test(rbind(
  (table(data_w1$age_group)),
  (table(data_w12$age_group)),
  (table(data_w123$age_group))))$p.value, 3)

## Warning in chisq.test(rbind((table(data_w1$age_group)),
## (table(data_w12$age_group)), : Chi-squared approximation may be incorrect

p_collgrad <- round(chisq.test(rbind(
  (table(data_w1$college_grad)),
  (table(data_w12$college_grad)),
  (table(data_w123$college_grad))))$p.value, 3)
p_male <- round(chisq.test(rbind(
  (table(data_w1$male)),
  (table(data_w12$male)),
  (table(data_w123$male))))$p.value, 3)

# BH procedure for factor variables (***Is this right?)
# (https://www.r-bloggers.com/2023/07/the-benjamini-hochberg-procedure-fdr-and-p-value-adjusted-explained/)
p.adjust(c(p_pid, p_age), method="BH")

## [1] 0.067 0.062

attrition_table <- attrition_table |> 
  mutate(sig = case_when(
    variable == "Party ID" ~ p_pid,
    variable == "White" ~ p_white,
    variable == "Age Group" ~ 0.062,
    variable == "College Grad" ~ p_collgrad,
    variable == "Male" ~ p_male)) 

attrition_table |> 
  kbl(caption = "Attrition by demographics", 
      format = "html", booktabs = T, escape = F, linesep = "", row.names = F, 
      col.names = c("Variable", "Value", "Wave1", "Wave2", "Wave3", "p-value")) |>
  collapse_rows(columns = c(1, 6), valign = "middle", 
                row_group_label_position = "stack",
                latex_hline = "major") |>
  kable_classic_2("hover", full_width = F)

Attrition by demographics
Variable	Value	Wave1	Wave2	Wave3	p-value
Party ID	Democrat	51.8	43.8	50.4	0.067
	Neither	14.7	12.7	11.8
	Republican	33.5	43.4	37.8
White	0	35.1	30.3	26.1	0.081
White	1	64.9	69.7	73.9	0.081
Age Group	18-34	29.0	17.9	20.2	0.062
	35-44	29.1	25.9	28.6
	45-54	20.4	26.7	23.5
	55-64	14.4	19.1	19.3
	65-74	5.9	8.0	7.6
	75-84	1.0	2.0	0.8
	85+	0.1	0.4	0.0
College Grad	0	51.0	43.0	41.2	0.023
College Grad	1	49.0	57.0	58.8	0.023
Male	0	52.9	54.6	63.0	0.120
Male	1	47.1	45.4	37.0	0.120

✅ College graduates attrited less