print(paste("Started at", Sys.time()))
## [1] "Started at 2020-07-20 14:45:27"
surv_ver <- "20200707"
date_max <- as.Date("2020-07-11")

dt_raw <- f_readhb("2020_update", geo_rep=T)
## [1] "Rows without original county FIPS: 1370621"
## [1] "Rows without improved county FIPS: 1119573"
dt_resp <- data.table(readRDS(paste0(dir_clean, "survey_worker/homebase_worker_survey_hours_",surv_ver,".rds")))[row_sel==1,]
dt_firm <- readRDS(paste0(dir_clean,"homebase_sel_firm_ind_geo_2020.rds"))
dt_userv <- data.table(readRDS(paste0(dir_clean, "survey_worker/worker_userid_var_sel.rds")))
dt_st <- readRDS(paste0(dir_clean,"cw/cw_geo_nber_state.rds"))

dt_firm[, ftec_2wk:=cut(hours_2wk/80,c(1,5,10,20,50,Inf),include.lowest = T)]
dt_raw <- merge(dt_raw,dt_firm[,c("firmid","ind","st","msa","firm_base_2wk","st_sel_2wk","ftec_2wk")], by=c("firmid","ind","st","msa"), all.x=T)
dt_raw <- dt_raw[firm_base_2wk==1 & st_sel_2wk==1,]

dt_raw <- merge(dt_raw, dt_st[,c("st","state_div","state_reg")], by=c("st"), all.x=T)

dt_baseu <- unique(dt_raw[date>=as.Date("2020-01-19") & date<=as.Date("2020-02-01"),c("userid")])
dt_raw[, in_base:=ifelse(userid %in% unique(dt_baseu[,userid]),1,0)] # Whether user is in base
dt_raw[, in_surv:=ifelse(userid %in% unique(dt_resp[,userid]),1,0)] # Whether user is in survey
rm(dt_baseu)

dt_raw[, wkd:=as.POSIXlt(date)$wday]

#-------------------------------------------------------------------------------
# Keep users in base period

# Restrict to in_base and states + DC
dt_sel <- dt_raw[in_base == 1 & stfips <= 56,]
# Restrict to only one firm (note that dt_userv already restrict to selected firms)
dt_sel <- dt_sel[userid %in% dt_userv[user_nfirme==1,userid]]

print(paste0("All workers with only 1 firm: ",nrow(unique(dt_sel[,c("userid")]))))
## [1] "All workers with only 1 firm: 427731"
print(nrow(unique(dt_sel[in_surv==1,c("userid")]))) # Check that this is of the right size
## [1] 1688
dt_sel[, in_survf:=factor(in_surv)]
setattr(dt_sel$in_survf, "levels", c("Non-Respondent","Respondent"))
dt_sel[, in_survf:=relevel(in_survf,"Respondent")]

nrow(dt_sel[firm_base_2wk==0|is.na(firm_base_2wk)])
## [1] 0
# dt_view <- dt_sel[firm_base_2wk==0|is.na(firm_base_2wk)]

dt_base <- dt_sel[week %in% f_wk(c("2020-01-19","2020-02-01"))]

lvfirm <- c("firmid","ind","st","msa")

Some worker characteristics in base period

#-------------------------------------------------------------------------------
# nest, nfirm, nind, hours by userid

dt_sum <- copy(dt_base[stfips<=56])
dt_sum[dt_sum[,.I[1],by=c("userid","estid")]$V1,nest:=1]
dt_sum[dt_sum[,.I[1],by=c("userid",lvfirm)]$V1,nfirm:=1]
dt_sum[dt_sum[,.I[1],by=c("userid","ind")]$V1,nind:=1]
dt_sum_emp <- dt_sum[,lapply(.SD, sum, na.rm=T),by=c("userid","in_surv","in_survf"),.SDcols=c("hours","nest","nfirm","nind")]

# Add firm size
dt_sum_emp <- merge(dt_sum_emp, dt_userv[,c("userid","userf","userf_ind","userf_st","userf_msa")], by=c("userid"), all.x=T)
dt_sum_emp <- merge(dt_sum_emp, dt_firm[,c("firmid","ind","st","msa","ftec_2wk")], 
                    by.x=c("userf","userf_ind","userf_st","userf_msa"),
                    by.y=c("firmid","ind","st","msa"), all.x=T)
dt_sum_emp <- merge(dt_sum_emp,dt_st[,c("st","state_div","state_reg")],by.x=c("userf_st"),by.y=c("st"), all.x=T)

Distribution of workers

#-------------------------------------------------------------------------------

gt_tab <- gtsummary::tbl_summary(dt_sum_emp[,c("in_survf","userf_ind","state_div","ftec_2wk")], by=c("in_survf"),
                       list(vars(userf_ind) ~ "Industry","state_div"~"Census Division","ftec_2wk"~"Firm Size"),
                       statistic = list(all_categorical() ~ "{p}%"))
gt_tab <- gtsummary::as_gt(gt_tab)
gt_tabl <- gt::as_latex(gt_tab)

writeLines(gt_tabl,paste0(dir_otab,"P2_resp_vs_nresp_dist.txt"))

dt_tabl <- data.table(as.data.frame(gt_tab))
dt_tabl <- dt_tabl[,c("label","stat_1","stat_2")]
dt_tabl <- rbind(dt_tabl,data.table(label="N",stat_1=nrow(dt_sum_emp[in_survf=="Respondent"]),stat_2=nrow(dt_sum_emp[in_survf=="Non-Respondent"])))
setnames(dt_tabl,names(dt_tabl),c("Characteristics","Respondent","Non-Respondent"))
fwrite(dt_tabl,paste0(dir_otab,"P2_resp_vs_nresp_dist.csv"))
rm(dt_tabl)

Number of hours the worker worked

gg_out <- ggplot(dt_sum_emp, aes(x=hours, color=in_survf, linetype=in_survf)) + ggtheme +
  geom_density(size=1.2) + 
  scale_color_brewer(palette = "Set1") +
  scale_linetype_manual(values=c("solid","longdash")) +
  labs(x="Total Hours in Base Period", y="Density",color="",linetype="")
print(gg_out)

ggsave(plot=gg_out, paste0(dir_ofig,"P2_resp_vs_nresp_hoursb_dist.png"),width=9, height=6, dpi=300)

lresp <- factor(c(1:2))
levels(lresp) <- c("Respondent","Non-Respondent")
dt_den1 <- density(dt_sum_emp[in_survf=="Respondent",hours])
dt_den2 <- density(dt_sum_emp[in_survf=="Non-Respondent",hours])
dt_den <- rbind(data.table(in_survf=lresp[1],x=dt_den1$x,y=dt_den1$y),
                data.table(in_survf=lresp[2],x=dt_den2$x,y=dt_den2$y))
ggplot(dt_den, aes(x=x,y=y, color=in_survf, linetype=in_survf)) + ggtheme +
  geom_line(size=1.2)

fwrite(dt_den,paste0(dir_ofigd,"P2_resp_vs_nresp_hoursb_dist.csv"))

Hours trend for respondents and non-respondents

source(paste0(dir_proj,"0_function/f_ratio.R"))
dt_ratio <- f_ratio(copy(dt_sel),var_sum="hours",var_by="in_survf")
dt_ratio[, r_var:=r_hours]
dt_ratio[,lnr_var:=log(r_var)]
dt_ratio <- dt_ratio[week>=f_wk("2020-03-01")]

ggplot(dt_ratio[date<=date_max,], aes(x=date,y=r_var,color=in_survf,linetype=in_survf)) + ggtheme +
  geom_line(size=1.2) +
  scale_color_brewer(palette = "Set1") +
  scale_linetype_manual(values=c("solid","longdash")) +
  labs(x="Date",y="Share of hours relative to baseline",color="",linetype="") +
  theme(legend.key.width=unit(0.6,"inch"))

ggsave(paste0(dir_ofig,"P2_resp_vs_nresp_hours.png"),width=9, height=6, dpi=300)
fwrite(dt_ratio[date<=date_max,],paste0(dir_ofigd,"P2_resp_vs_nresp_hours.csv"))

Note: The trend for all is different from the trend for firm, because we are restricting employees to those active in the base period.

print(paste("Ended at", Sys.time()))
## [1] "Ended at 2020-07-20 14:51:43"
# End of R script