Attrition Analysis
Do Won Kim
2024-05-16
# Load packages ================================================================
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
conflicted, # Avoid conflict of functions with same names
tidyverse, # Tidyverse umbrella package
haven, # read_dta()
infer, # chisq_test()
sjPlot, # tab_xtab()
kableExtra, # kbl()
DT # datatable()
)
conflict_prefer("select", "dplyr")
conflict_prefer("filter", "dplyr")
# Import data ==================================================================
# Load survey data
df_w3 <- readRDS("survey_data/2023-095c_client_wave-3.rds") # wave 3
df_w2 <- read_dta("survey_data/2023-095b_client_wave-2.dta") # wave 2
df_w1_1 <- readRDS("survey_data/2023-095a_client_wave-1.rds")
df_w1_2 <- readRDS("survey_data/2023-095a_client_wave-1_supp.rds")
df_w1 <- rowid_to_column(rbind(df_w1_1, df_w1_2), "ID") # merge and add ID
# Users who were eligible for eligibility test: ================================
users_for_elig_test_final <- read_csv("backend_data/users_for_elig_test_final.csv",
col_types = cols(user_id = col_character()))
# Wave 1 data: filter & select =================================================
wave1 <- df_w1 |>
filter(Progress==100) |> # who finished wave 1 till the end
select(ID, userid, vs_age:vs_recruitment) |>
rename(user_id = userid) |> # rename userid for consistency
merge(users_for_elig_test_final, by=c("user_id"), all.y=TRUE) |>
select(ID, user_id, screen_name, vsid, vs_recruitment,
vs_pid_base, vs_pid_lean, vs_raceeth, vs_age, vs_education, vs_gender)
# Eligibility criteria: follow OR exposure =====================================
eligibility_results <- read_csv("backend_data/eligibility_results.csv",
col_types = cols(user_id = col_character(),
account_created = col_skip(),
following_us = col_skip()))
wave1 |>
merge(eligibility_results, by="user_id") |>
relocate(user_id, following_NG, hometimeline) |>
rename(follow = following_NG,
exposure = hometimeline) -> wave1_merge
# Pre-treatment engagement =====================================================
pre_engagement_final <- read_csv("backend_data/pre_engagement_final.csv",
col_types = cols(user_id = col_character(),
target_user_id = col_skip(),
type = col_skip()))
pre_likes_final <- read_csv("backend_data/pre_likes_final.csv",
col_types = cols(user_id = col_character(),
target_user_id = col_skip(),
match_type = col_skip()))
pre_engagement <- rbind(pre_engagement_final, pre_likes_final) |>
unique() |>
mutate(pre_total_engage = 1)
wave1_merge |>
merge(pre_engagement, by="user_id", all = TRUE) |>
mutate_at("pre_total_engage", ~replace(., is.na(.), 0)) -> wave1_merge
# Wave 2 =======================================================================
df_w2 |>
filter(userid!="") |>
select(userid, randomized_group) |>
mutate(Y2 = 0) |> # a new variable indicating took wave 2 (attrition = 1)
rename(user_id = userid) -> user_w2
# Merge ========================================================================
wave2_merge <- wave1_merge |>
merge(user_w2, by="user_id", all = TRUE) |>
mutate_at("Y2", ~replace(., is.na(.), 1)) # attrition to wave 2 = 1
# Wave 3 =======================================================================
df_w3 |>
select(userid) |>
mutate(Y3 = 0) |>
rename(user_id = userid) -> wave3
wave3_merge <- wave2_merge |>
merge(wave3, by="user_id", all = TRUE) |>
mutate_at("Y3", ~replace(., is.na(.), 1)) # attrition to wave 3 = 1
# Data preprocess ==============================================================
wave3_merge |>
relocate(user_id, follow, exposure, vs_recruitment, pre_total_engage,
randomized_group, vs_pid_base:vs_pid_lean, vs_raceeth, vs_age,
vs_education, vs_gender, Y2, Y3) |>
mutate(
party_id = as.factor(case_when(
vs_pid_base == "Democrat" | vs_pid_lean == "The Democratic Party" ~ "Democrat",
vs_pid_base == "Republican" | vs_pid_lean == "The Republican Party" ~ "Republican",
vs_pid_lean == "Neither" ~ "Neither")),
age_group = as.factor(case_when(
vs_age < 35 ~ "18-34",
vs_age >= 35 & vs_age < 45 ~ "35-44",
vs_age >= 45 & vs_age < 55 ~ "45-54",
vs_age >= 55 & vs_age < 65 ~ "55-64",
vs_age >= 65 & vs_age < 75 ~ "65-74",
vs_age >= 75 & vs_age < 85 ~ "75-84",
vs_age >= 85 ~ "85+")),
college_grad = as.factor(case_when(
vs_education == "2-year or associate degree" ~ 1,
vs_education == "4-year or bachelor degree" ~ 1,
vs_education == "Post-graduate degree" ~ 1,
.default = 0)),
white = as.factor(case_when(vs_raceeth == "White" ~ 1, .default = 0)),
male = as.factor(case_when(vs_gender == "Male" ~ 1, .default = 0)),
Y2 = as.factor(Y2),
Y3 = as.factor(Y3),
pre_total_engage = as.factor(pre_total_engage),
randomized = as.factor(case_when(
randomized_group == "control" ~ "Control",
randomized_group == "drop" | randomized_group == "muting_treatment2_drop" ~ "Drop",
randomized_group == "media_literacy" ~ "Media Literacy",
randomized_group == "muting_treatment1" | randomized_group == "muting_treatment2" ~ "Muting",
is.na(randomized_group) ~ NA)),
eligibility_criteria = as.factor(case_when(
follow == TRUE & exposure == TRUE ~ "Both",
follow == TRUE & exposure == FALSE ~ "Follow",
follow == FALSE & exposure == TRUE ~ "Exposure"))
) |>
rename(recruit_mode = vs_recruitment) |>
select(user_id, follow, exposure, eligibility_criteria, recruit_mode,
pre_total_engage, randomized, party_id,
white, age_group, college_grad, male,
Y2, Y3) -> attrit_dta
1 Data description
Y2 (1 if attrit in Wave 2 vs. 1)
Y3 (1 if attrit in Wave 3 vs. 1)
Y23 (1 if attrit in Wave 3 vs. 2)
2 Attrition Analysis
2.1 Attrition by recruitment mode
2.1.1 Twitter vs. New Panelist vs. Existing Panelist
- Wave 1 vs. 2 (1: Attrit in Wave 2 vs. 1)
attrit_dta |>
select(recruit_mode, Y2) |> # wave 1 vs. wave 2 (1: attrit in Wave 2 vs. 1)
infer::chisq_test(Y2~recruit_mode) # Pearson's Chi-squared test
## # A tibble: 1 × 3
## statistic chisq_df p_value
## <dbl> <int> <dbl>
## 1 5.95 2 0.0510
recruit_mode | Y2 | Total | |
---|---|---|---|
0 | 1 | ||
Twitter recruitment | 19 48.7 % | 20 51.3 % | 39 100 % |
New panelist recruited after January 1, 2024 | 28 27.5 % | 74 72.5 % | 102 100 % |
Existing panelist | 204 31.9 % | 435 68.1 % | 639 100 % |
Total | 251 32.2 % | 529 67.8 % | 780 100 % | χ2=5.952 · df=2 · Cramer’s V=0.087 · p=0.051 |
- Wave 1 vs. 3 (1: Attrit in Wave 3 vs. 1)
attrit_dta |>
select(recruit_mode, Y3) |> # wave 1 vs. wave 3 (1: attrit in Wave 3 vs. 1)
infer::chisq_test(Y3~recruit_mode) # Pearson's Chi-squared test
## # A tibble: 1 × 3
## statistic chisq_df p_value
## <dbl> <int> <dbl>
## 1 0.744 2 0.689
#* statistic chisq_df p_value
#* 0.744 2 0.689
tab_xtab(attrit_dta$recruit_mode, attrit_dta$Y3, show.row.prc = TRUE) # viewer
recruit_mode | Y3 | Total | |
---|---|---|---|
0 | 1 | ||
Twitter recruitment | 7 17.9 % | 32 82.1 % | 39 100 % |
New panelist recruited after January 1, 2024 | 13 12.7 % | 89 87.3 % | 102 100 % |
Existing panelist | 99 15.5 % | 540 84.5 % | 639 100 % |
Total | 119 15.3 % | 661 84.7 % | 780 100 % | χ2=0.744 · df=2 · Cramer’s V=0.031 · p=0.689 |
- Wave 2 vs. 3 (1: Attrit in Wave 3 vs. 2)
attrit_dta |>
filter(Y2 != 1) |> # among those took wave 2
mutate(Y23 = as.factor(case_when(
Y2 == 0 & Y3 == 1 ~ 1, .default = 0))) |>
select(recruit_mode, Y23) |> # wave 2 vs. wave 3 (1: attrit in Wave 3 vs. 2)
infer::chisq_test(Y23~recruit_mode) # Pearson's Chi-squared test
## # A tibble: 1 × 3
## statistic chisq_df p_value
## <dbl> <int> <dbl>
## 1 0.964 2 0.617
attrit_dta |>
mutate(recruit_mode2 = case_when(recruit_mode=="Twitter recruitment"~"Twitter",
.default = "Panelist")) -> attrit_dta2
attrit_dta2 |>
filter(Y2 !=1) |>
mutate(Y23 = as.factor(case_when(Y2 == 0 & Y3 == 1 ~ 1,
.default = 0))) -> attrit_dta3
tab_xtab(attrit_dta3$recruit_mode, attrit_dta3$Y23, show.row.prc = TRUE)
recruit_mode | Y23 | Total | |
---|---|---|---|
0 | 1 | ||
Twitter recruitment | 7 36.8 % | 12 63.2 % | 19 100 % |
New panelist recruited after January 1, 2024 | 13 46.4 % | 15 53.6 % | 28 100 % |
Existing panelist | 99 48.5 % | 105 51.5 % | 204 100 % |
Total | 119 47.4 % | 132 52.6 % | 251 100 % | χ2=0.964 · df=2 · Cramer’s V=0.062 · p=0.617 |
2.1.2 Twitter vs. Panelist (New + Existing)
- Wave 1 vs. 2 (1: Attrit in Wave 2 vs. 1)
attrit_dta |>
# what if recruitment mode = twitter vs. panelist (existing + new)?
mutate(recruit_mode2 = case_when(recruit_mode=="Twitter recruitment"~"Twitter",
.default = "Panelist")) |>
chisq_test(Y2~recruit_mode2)
## # A tibble: 1 × 3
## statistic chisq_df p_value
## <dbl> <int> <dbl>
## 1 4.38 1 0.0364
recruit_mode2 | Y2 | Total | |
---|---|---|---|
0 | 1 | ||
Panelist | 232 31.3 % | 509 68.7 % | 741 100 % |
19 48.7 % | 20 51.3 % | 39 100 % | |
Total | 251 32.2 % | 529 67.8 % | 780 100 % | χ2=4.378 · df=1 · φ=0.081 · p=0.036 |
- Wave 1 vs. 3 (1: Attrit in Wave 3 vs. 1)
## # A tibble: 1 × 3
## statistic chisq_df p_value
## <dbl> <int> <dbl>
## 1 0.0632 1 0.802
- Wave 2 vs. 3 (1: Attrit in Wave 3 vs. 2)
## # A tibble: 1 × 3
## statistic chisq_df p_value
## <dbl> <int> <dbl>
## 1 0.519 1 0.471
✅ No differences in attrition between recruitment mode, except for Twitter vs. Panelist (binary) (wave 2 vs. 1).
2.2 Attrition by eligibility criteria
eligibility_criteria | Y2 | Total | |
---|---|---|---|
0 | 1 | ||
Both | 154 69.1 % | 69 30.9 % | 223 100 % |
Exposure | 93 68.4 % | 43 31.6 % | 136 100 % |
Follow | 4 57.1 % | 3 42.9 % | 7 100 % |
Total | 251 68.6 % | 115 31.4 % | 366 100 % | χ2=0.451 · df=2 · Cramer’s V=0.035 · Fisher’s p=0.823 |
eligibility_criteria | Y3 | Total | |
---|---|---|---|
0 | 1 | ||
Both | 65 29.1 % | 158 70.9 % | 223 100 % |
Exposure | 52 38.2 % | 84 61.8 % | 136 100 % |
Follow | 2 28.6 % | 5 71.4 % | 7 100 % |
Total | 119 32.5 % | 247 67.5 % | 366 100 % | χ2=3.230 · df=2 · Cramer’s V=0.094 · Fisher’s p=0.204 |
# (wave 2 vs. wave 3)
tab_xtab(attrit_dta3$eligibility_criteria, attrit_dta3$Y23, show.row.prc = TRUE)
eligibility_criteria | Y23 | Total | |
---|---|---|---|
0 | 1 | ||
Both | 65 42.2 % | 89 57.8 % | 154 100 % |
Exposure | 52 55.9 % | 41 44.1 % | 93 100 % |
Follow | 2 50 % | 2 50 % | 4 100 % |
Total | 119 47.4 % | 132 52.6 % | 251 100 % | χ2=4.380 · df=2 · Cramer’s V=0.132 · Fisher’s p=0.112 |
✅ No differences in attrition between eligibility criteria
2.3 Attrition by pre-treatment engagement
- Wave 1 vs. 2 (1: Attrit in Wave 2 vs. 1)
pre_total_engage | Y2 | Total | |
---|---|---|---|
0 | 1 | ||
0 | 119 19.5 % | 491 80.5 % | 610 100 % |
1 | 132 77.6 % | 38 22.4 % | 170 100 % |
Total | 251 32.2 % | 529 67.8 % | 780 100 % | χ2=203.254 · df=1 · φ=0.514 · p=0.000 |
attrit_dta |>
select(pre_total_engage, Y2) |> # wave 1 vs. wave 2 (1: attrit in Wave 2 vs. 1)
infer::chisq_test(Y2~pre_total_engage)
## # A tibble: 1 × 3
## statistic chisq_df p_value
## <dbl> <int> <dbl>
## 1 203. 1 4.07e-46
- Wave 1 vs. 3 (1: Attrit in Wave 3 vs. 1)
pre_total_engage | Y3 | Total | |
---|---|---|---|
0 | 1 | ||
0 | 57 9.3 % | 553 90.7 % | 610 100 % |
1 | 62 36.5 % | 108 63.5 % | 170 100 % |
Total | 119 15.3 % | 661 84.7 % | 780 100 % | χ2=73.583 · df=1 · φ=0.311 · p=0.000 |
attrit_dta |>
select(pre_total_engage, Y3) |> # wave 1 vs. wave 3 (1: attrit in Wave 3 vs. 1)
infer::chisq_test(Y3~pre_total_engage)
## # A tibble: 1 × 3
## statistic chisq_df p_value
## <dbl> <int> <dbl>
## 1 73.6 1 9.65e-18
- Wave 2 vs. 3 (1: Attrit in Wave 3 vs. 2)
pre_total_engage | Y23 | Total | |
---|---|---|---|
0 | 1 | ||
0 | 57 47.9 % | 62 52.1 % | 119 100 % |
1 | 62 47 % | 70 53 % | 132 100 % |
Total | 119 47.4 % | 132 52.6 % | 251 100 % | χ2=0.000 · df=1 · φ=0.009 · p=0.984 |
attrit_dta3 |>
select(pre_total_engage, Y23) |> # wave 2 vs. wave 3 (1: attrit in Wave 3 vs. 2)
infer::chisq_test(Y23~pre_total_engage)
## # A tibble: 1 × 3
## statistic chisq_df p_value
## <dbl> <int> <dbl>
## 1 0.000428 1 0.984
#* statistic chisq_df p_value
#* 0.000428 1 0.984
# ========= No differences in attrition between wave 2 vs. wave 3 ==============
✅ Different attrition by pre-treatment total engagement: Ppl with non-zero pre-treatment engagement attrit less in Wave 3 or 2 vs. 1. (No differences in attrition between wave 2 vs. wave 3)
2.4 [Differential Attrition] By randomized group
- Wave 2 vs. 3 (1: Attrit in Wave 3 vs. 2) only since the treatment was in Wave 2
# ==============================================================================
# 4. Randomized group (Differential attrition) =================================
# ==============================================================================
# (wave 2 vs. wave 3) (Y23: 1 = attrit in Wave 3 vs. 2)
attrit_dta4 <- attrit_dta3 |>
filter(randomized != "Drop") |> # Among randomized participants
mutate(randomized = factor(randomized,
levels = c("Control", "Media Literacy", "Muting")))
attrit_dta4 |>
infer::chisq_test(Y23~randomized)
## Warning in stats::chisq.test(table(x), ...): Chi-squared approximation may be
## incorrect
## # A tibble: 1 × 3
## statistic chisq_df p_value
## <dbl> <int> <dbl>
## 1 0.636 2 0.728
randomized | Y23 | Total | |
---|---|---|---|
0 | 1 | ||
Control | 42 87.5 % | 6 12.5 % | 48 100 % |
Media Literacy | 36 85.7 % | 6 14.3 % | 42 100 % |
Muting | 41 91.1 % | 4 8.9 % | 45 100 % |
Total | 119 88.1 % | 16 11.9 % | 135 100 % | χ2=0.636 · df=2 · Cramer’s V=0.069 · p=0.728 |
✅ No differential attrition between wave 2 vs. wave 3
2.5 [Selective attrition] By observable characteristics
Test null of no difference in party_id, white, age_group, college_grad, male between people who attrit and those who don’t (Y2; Y3)
- T-tests with unequal variances for binary (white, college_grad, male, democrat, republican) and continuous variables
- White
(non-white - white)
# ==============================================================================
# 5. Observable characteristics (Selective attrition) ==========================
#* ========================================================================== *#
#* Selective attrition: Wave 1 vs. Wave 2; Wave 1 vs. Wave 3
#* Test null of no difference in party_id, white, age_group, college_grad, male
#* between people who attrit and those who don’t (Y2; Y3)
#* ========================================================================== *#
#* T-tests with unequal variances for binary and continuous variables
#* - white, college_grad, male (binary);
#* - democrat, republican (as binary)
#* ========================================================================== *#
attrit_dta |>
mutate(white = as.numeric(white)-1) |>
infer::t_test(white ~ Y2, order = c("0", "1"), var.equal=FALSE) # mean(non-white) - mean(white)
## # A tibble: 1 × 7
## statistic t_df p_value alternative estimate lower_ci upper_ci
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 1.99 514. 0.0469 two.sided 0.0715 0.000996 0.142
#* statistic t_df p_value alternative estimate lower_ci upper_ci
#* 1.99 514. 0.0469 two.sided 0.0715 0.000996 0.142
# white less attrit (wave 2 vs. 1)
attrit_dta |>
mutate(white = as.numeric(white)-1) |>
infer::t_test(white ~ Y3, order = c("0", "1"), var.equal=FALSE)
## # A tibble: 1 × 7
## statistic t_df p_value alternative estimate lower_ci upper_ci
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 2.40 173. 0.0173 two.sided 0.107 0.0192 0.195
#* statistic t_df p_value alternative estimate lower_ci upper_ci
#* 2.40 173. 0.0173 two.sided 0.107 0.0192 0.195
# white less attrit (wave 3 vs. 1)
# FYI: no diff in attrit between white vs. non-white (wave 3 vs. 2)
✅ white less attrit (wave 2 or 3 vs. 1) ; no diff in attrit between white vs. non-white (wave 3 vs. 2)
- College grad
(non-college grad - college grad)
# ============================================================================ #
attrit_dta |>
mutate(college_grad = as.numeric(college_grad)-1) |>
infer::t_test(college_grad ~ Y2, order = c("0", "1"), var.equal=FALSE)
## # A tibble: 1 × 7
## statistic t_df p_value alternative estimate lower_ci upper_ci
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 3.10 493. 0.00206 two.sided 0.118 0.0431 0.193
#* statistic t_df p_value alternative estimate lower_ci upper_ci
#* 3.10 493. 0.00206 two.sided 0.118 0.0431 0.193
# college graduate less attrit (wave 2 vs. 1)
attrit_dta |>
mutate(college_grad = as.numeric(college_grad)-1) |>
infer::t_test(college_grad ~ Y3, order = c("0", "1"), var.equal=FALSE)
## # A tibble: 1 × 7
## statistic t_df p_value alternative estimate lower_ci upper_ci
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 2.36 164. 0.0196 two.sided 0.116 0.0189 0.214
#* statistic t_df p_value alternative estimate lower_ci upper_ci
#* 2.36 164. 0.0196 two.sided 0.116 0.0189 0.214
# college graduate less attrit (wave 3 vs. 1)
# FYI: no diff in attrit between college grad vs. non-college grad (wave 3 vs. 2)
✅ college graduate less attrit (wave 2 or 3 vs. 1) ; no diff in attrit between college grad vs. non-college grad (wave 3 vs. 2)
- Male
(non-male - male)
# ============================================================================ #
attrit_dta |>
mutate(male = as.numeric(male)-1) |>
infer::t_test(male ~ Y2, order = c("0", "1"), var.equal=FALSE)
## # A tibble: 1 × 7
## statistic t_df p_value alternative estimate lower_ci upper_ci
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 -0.629 492. 0.529 two.sided -0.0241 -0.0993 0.0511
#* statistic t_df p_value alternative estimate lower_ci upper_ci
#* -0.629 492. 0.529 two.sided -0.0241 -0.0993 0.0511
# no sig diff (wave 2 vs. 1)
attrit_dta |>
mutate(male = as.numeric(male)-1) |>
infer::t_test(male ~ Y3, order = c("0", "1"), var.equal=FALSE)
## # A tibble: 1 × 7
## statistic t_df p_value alternative estimate lower_ci upper_ci
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 -2.45 166. 0.0153 two.sided -0.119 -0.215 -0.0231
#* statistic t_df p_value alternative estimate lower_ci upper_ci
#* -2.45 166. 0.0153 two.sided -0.119 -0.215 -0.0231
# male attrit more (wave 3 vs. 1)
# FYI: no diff in attrit between male vs. non-male (wave 3 vs. 2)
✅ male attrit more (wave 3 vs. 1) ; no diff in attrit between male vs. non-male grad (wave 3 vs. 2; wave 2 vs. 1)
- Democrat
(non-democrat - democrat)
# ============================================================================ #
attrit_dta |>
mutate(democrat = ifelse(party_id == "Democrat", 1, 0)) |>
infer::t_test(democrat ~ Y2, order = c("0", "1"), var.equal=FALSE)
## # A tibble: 1 × 7
## statistic t_df p_value alternative estimate lower_ci upper_ci
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 -3.08 491. 0.00216 two.sided -0.118 -0.192 -0.0426
#* statistic t_df p_value alternative estimate lower_ci upper_ci
#* -3.08 491. 0.00216 two.sided -0.118 -0.192 -0.0426
# democrat more attrit (wave 2 vs. 1)
attrit_dta |>
mutate(democrat = ifelse(party_id == "Democrat", 1, 0)) |>
infer::t_test(democrat ~ Y3, order = c("0", "1"), var.equal=FALSE)
## # A tibble: 1 × 7
## statistic t_df p_value alternative estimate lower_ci upper_ci
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 -0.325 163. 0.746 two.sided -0.0162 -0.115 0.0824
#* statistic t_df p_value alternative estimate lower_ci upper_ci
#* -0.325 163. 0.746 two.sided -0.0162 -0.115 0.0824
# no sig diff (wave 3 vs. 1)
attrit_dta3 |>
mutate(democrat = ifelse(party_id == "Democrat", 1, 0)) |>
infer::t_test(democrat ~ Y23, order = c("0", "1"), var.equal=FALSE)
## # A tibble: 1 × 7
## statistic t_df p_value alternative estimate lower_ci upper_ci
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 2.00 245. 0.0461 two.sided 0.125 0.00217 0.249
#* statistic t_df p_value alternative estimate lower_ci upper_ci
#* 2.00 245. 0.0461 two.sided 0.125 0.00217 0.249
# democrat less attrit (wave 3 vs. 2)
✅ democrat more attrit (wave 2 vs. 1) ; no sig diff (wave 3 vs. 1) ; democrat less attrit (wave 3 vs. 2)
- Republican
(non-republican - republican
)
# ============================================================================ #
attrit_dta |>
mutate(republican = ifelse(party_id == "Republican", 1, 0)) |>
infer::t_test(republican ~ Y2, order = c("0", "1"), var.equal=FALSE)
## # A tibble: 1 × 7
## statistic t_df p_value alternative estimate lower_ci upper_ci
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 3.97 453. 0.0000840 two.sided 0.147 0.0742 0.220
#* statistic t_df p_value alternative estimate lower_ci upper_ci
#* 3.97 453. 0.000084 two.sided 0.147 0.0742 0.220
# republican less attrit (wave 2 vs. 1)
attrit_dta |>
mutate(republican = ifelse(party_id == "Republican", 1, 0)) |>
infer::t_test(republican ~ Y3, order = c("0", "1"), var.equal=FALSE)
## # A tibble: 1 × 7
## statistic t_df p_value alternative estimate lower_ci upper_ci
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 1.07 160. 0.288 two.sided 0.0514 -0.0439 0.147
#* statistic t_df p_value alternative estimate lower_ci upper_ci
#* 1.07 160. 0.288 two.sided 0.0514 -0.0439 0.147
# no sig diff (wave 3 vs. 1)
attrit_dta3 |>
mutate(republican = ifelse(party_id == "Republican", 1, 0)) |>
infer::t_test(republican ~ Y23, order = c("0", "1"), var.equal=FALSE)
## # A tibble: 1 × 7
## statistic t_df p_value alternative estimate lower_ci upper_ci
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 -1.71 248. 0.0888 two.sided -0.107 -0.230 0.0163
#* statistic t_df p_value alternative estimate lower_ci upper_ci
#* -3.83 179. 0.00018 two.sided -0.181 -0.274 -0.0876
# republican more attrit (wave 3 vs. 2)
✅ republican less attrit (wave 2 vs. 1) ; no sig diff (wave 3 vs. 1) ; republican more attrit (wave 3 vs. 2) (=Opposite of democrat)
- Chi-squared tests using the Benjamini-Hochberg procedure for factors
- factors:
party_id
,age_group
#* ========================================================================== *#
#* Chi-squared tests using the Benjamini-Hochberg procedure for factors
#* - party_id, age_group (factors)
#* ========================================================================== *#
attrit_dta |>
infer::chisq_test(party_id ~ Y2)
## # A tibble: 1 × 3
## statistic chisq_df p_value
## <dbl> <int> <dbl>
## 1 16.5 2 0.000259
party_id | Y2 | Total | |
---|---|---|---|
0 | 1 | ||
Democrat | 110 27.2 % | 294 72.8 % | 404 100 % |
Neither | 32 27.8 % | 83 72.2 % | 115 100 % |
Republican | 109 41.8 % | 152 58.2 % | 261 100 % |
Total | 251 32.2 % | 529 67.8 % | 780 100 % | χ2=16.520 · df=2 · Cramer’s V=0.146 · p=0.000 |
## Warning in stats::chisq.test(table(x), ...): Chi-squared approximation may be
## incorrect
## # A tibble: 1 × 3
## statistic chisq_df p_value
## <dbl> <int> <dbl>
## 1 38.1 6 0.00000107
#* statistic chisq_df p_value
#* 38.1 6 0.00000107
#* (with warning: chi-squared approximation may be incorrect)
tab_xtab(attrit_dta$age_group, attrit_dta$Y2, show.row.prc = TRUE)
age_group | Y2 | Total | |
---|---|---|---|
0 | 1 | ||
18-34 | 45 19.9 % | 181 80.1 % | 226 100 % |
35-44 | 65 28.6 % | 162 71.4 % | 227 100 % |
45-54 | 67 42.1 % | 92 57.9 % | 159 100 % |
55-64 | 48 42.9 % | 64 57.1 % | 112 100 % |
65-74 | 20 43.5 % | 26 56.5 % | 46 100 % |
75-84 | 5 62.5 % | 3 37.5 % | 8 100 % |
85+ | 1 100 % | 0 0 % | 1 100 % |
Total | 251 32.2 % | 528 67.8 % | 779 100 % | χ2=38.111 · df=6 · Cramer’s V=0.221 · Fisher’s p=0.000 |
- To be updated (currently figuring our what I should show)
OR following this paper (I used their R script for attrition tables):
% of respondents by each category in each wave
e.g., In Wave 1, 51.8% were self-identified democrat, 33.5% republican, and 14.7% as neither. In Wave 2 (among those who took Wave 2; didn’t attrit), 43.8% were democrat, 43.4% republican, and 12.7% neither.
p-value: result of chi-squared tests
- For factor variables (Party ID and Age Group), p-values are adjusted using BH => 👀 not sure I did this right though
# OR
# SETUP ========================================================================
data_w1 <- attrit_dta # started from wave 1
data_w12 <- attrit_dta |> filter(Y2 == 0) # drop the attrit in wave 2 (vs. 1)
data_w123 <- attrit_dta |> filter(Y2 == 0 & Y3 == 0) # drop the attrit in wave 3 (vs. 2)
# Attrition Table
wave1_bind <- rbind(
data.frame(prop.table(table(data_w1$party_id))*100, variable = "Party ID"),
data.frame(prop.table(table(data_w1$white))*100, variable = "White"),
data.frame(prop.table(table(data_w1$age_group))*100, variable = "Age Group"),
data.frame(prop.table(table(data_w1$college_grad))*100, variable = "College Grad"),
data.frame(prop.table(table(data_w1$male))*100, variable = "Male")
) |> rename(value = Var1, wave1 = Freq)
wave2_bind <- rbind(
data.frame(prop.table(table(data_w12$party_id))*100),
data.frame(prop.table(table(data_w12$white))*100),
data.frame(prop.table(table(data_w12$age_group))*100),
data.frame(prop.table(table(data_w12$college_grad))*100),
data.frame(prop.table(table(data_w12$male))*100)
) |> rename(value = Var1, wave2 = Freq) |> select(!value)
wave3_bind <- rbind(
data.frame(prop.table(table(data_w123$party_id))*100),
data.frame(prop.table(table(data_w123$white))*100),
data.frame(prop.table(table(data_w123$age_group))*100),
data.frame(prop.table(table(data_w123$college_grad))*100),
data.frame(prop.table(table(data_w123$male))*100)
) |> rename(value = Var1, wave3 = Freq) |> select(!value)
cbind(wave1_bind, wave2_bind, wave3_bind) |>
relocate(variable, value, wave1, wave2, wave3) |>
mutate(
wave1 = round(wave1, 1),
wave2 = round(wave2, 1),
wave3 = round(wave3, 1)
) -> attrition_table
# Add significance tests
p_pid <- round(chisq.test(rbind(
(table(data_w1$party_id)),
(table(data_w12$party_id)),
(table(data_w123$party_id))))$p.value, 3)
p_white <- round(chisq.test(rbind(
(table(data_w1$white)),
(table(data_w12$white)),
(table(data_w123$white))))$p.value, 3)
p_age <- round(chisq.test(rbind(
(table(data_w1$age_group)),
(table(data_w12$age_group)),
(table(data_w123$age_group))))$p.value, 3)
## Warning in chisq.test(rbind((table(data_w1$age_group)),
## (table(data_w12$age_group)), : Chi-squared approximation may be incorrect
p_collgrad <- round(chisq.test(rbind(
(table(data_w1$college_grad)),
(table(data_w12$college_grad)),
(table(data_w123$college_grad))))$p.value, 3)
p_male <- round(chisq.test(rbind(
(table(data_w1$male)),
(table(data_w12$male)),
(table(data_w123$male))))$p.value, 3)
# BH procedure for factor variables (***Is this right?)
# (https://www.r-bloggers.com/2023/07/the-benjamini-hochberg-procedure-fdr-and-p-value-adjusted-explained/)
p.adjust(c(p_pid, p_age), method="BH")
## [1] 0.067 0.062
attrition_table <- attrition_table |>
mutate(sig = case_when(
variable == "Party ID" ~ p_pid,
variable == "White" ~ p_white,
variable == "Age Group" ~ 0.062,
variable == "College Grad" ~ p_collgrad,
variable == "Male" ~ p_male))
attrition_table |>
kbl(caption = "Attrition by demographics",
format = "html", booktabs = T, escape = F, linesep = "", row.names = F,
col.names = c("Variable", "Value", "Wave1", "Wave2", "Wave3", "p-value")) |>
collapse_rows(columns = c(1, 6), valign = "middle",
row_group_label_position = "stack",
latex_hline = "major") |>
kable_classic_2("hover", full_width = F)
Variable | Value | Wave1 | Wave2 | Wave3 | p-value |
---|---|---|---|---|---|
Party ID | Democrat | 51.8 | 43.8 | 50.4 | 0.067 |
Neither | 14.7 | 12.7 | 11.8 | ||
Republican | 33.5 | 43.4 | 37.8 | ||
White | 0 | 35.1 | 30.3 | 26.1 | 0.081 |
1 | 64.9 | 69.7 | 73.9 | ||
Age Group | 18-34 | 29.0 | 17.9 | 20.2 | 0.062 |
35-44 | 29.1 | 25.9 | 28.6 | ||
45-54 | 20.4 | 26.7 | 23.5 | ||
55-64 | 14.4 | 19.1 | 19.3 | ||
65-74 | 5.9 | 8.0 | 7.6 | ||
75-84 | 1.0 | 2.0 | 0.8 | ||
85+ | 0.1 | 0.4 | 0.0 | ||
College Grad | 0 | 51.0 | 43.0 | 41.2 | 0.023 |
1 | 49.0 | 57.0 | 58.8 | ||
Male | 0 | 52.9 | 54.6 | 63.0 | 0.120 |
1 | 47.1 | 45.4 | 37.0 |
✅ College graduates attrited less