print(paste("Started at", Sys.time()))
## [1] "Started at 2020-07-20 14:45:27"
surv_ver <- "20200707"
date_max <- as.Date("2020-07-11")
dt_raw <- f_readhb("2020_update", geo_rep=T)
## [1] "Rows without original county FIPS: 1370621"
## [1] "Rows without improved county FIPS: 1119573"
dt_resp <- data.table(readRDS(paste0(dir_clean, "survey_worker/homebase_worker_survey_hours_",surv_ver,".rds")))[row_sel==1,]
dt_firm <- readRDS(paste0(dir_clean,"homebase_sel_firm_ind_geo_2020.rds"))
dt_userv <- data.table(readRDS(paste0(dir_clean, "survey_worker/worker_userid_var_sel.rds")))
dt_st <- readRDS(paste0(dir_clean,"cw/cw_geo_nber_state.rds"))
dt_firm[, ftec_2wk:=cut(hours_2wk/80,c(1,5,10,20,50,Inf),include.lowest = T)]
dt_raw <- merge(dt_raw,dt_firm[,c("firmid","ind","st","msa","firm_base_2wk","st_sel_2wk","ftec_2wk")], by=c("firmid","ind","st","msa"), all.x=T)
dt_raw <- dt_raw[firm_base_2wk==1 & st_sel_2wk==1,]
dt_raw <- merge(dt_raw, dt_st[,c("st","state_div","state_reg")], by=c("st"), all.x=T)
dt_baseu <- unique(dt_raw[date>=as.Date("2020-01-19") & date<=as.Date("2020-02-01"),c("userid")])
dt_raw[, in_base:=ifelse(userid %in% unique(dt_baseu[,userid]),1,0)] # Whether user is in base
dt_raw[, in_surv:=ifelse(userid %in% unique(dt_resp[,userid]),1,0)] # Whether user is in survey
rm(dt_baseu)
dt_raw[, wkd:=as.POSIXlt(date)$wday]
#-------------------------------------------------------------------------------
# Keep users in base period
# Restrict to in_base and states + DC
dt_sel <- dt_raw[in_base == 1 & stfips <= 56,]
# Restrict to only one firm (note that dt_userv already restrict to selected firms)
dt_sel <- dt_sel[userid %in% dt_userv[user_nfirme==1,userid]]
print(paste0("All workers with only 1 firm: ",nrow(unique(dt_sel[,c("userid")]))))
## [1] "All workers with only 1 firm: 427731"
print(nrow(unique(dt_sel[in_surv==1,c("userid")]))) # Check that this is of the right size
## [1] 1688
dt_sel[, in_survf:=factor(in_surv)]
setattr(dt_sel$in_survf, "levels", c("Non-Respondent","Respondent"))
dt_sel[, in_survf:=relevel(in_survf,"Respondent")]
nrow(dt_sel[firm_base_2wk==0|is.na(firm_base_2wk)])
## [1] 0
# dt_view <- dt_sel[firm_base_2wk==0|is.na(firm_base_2wk)]
dt_base <- dt_sel[week %in% f_wk(c("2020-01-19","2020-02-01"))]
lvfirm <- c("firmid","ind","st","msa")
#-------------------------------------------------------------------------------
# nest, nfirm, nind, hours by userid
dt_sum <- copy(dt_base[stfips<=56])
dt_sum[dt_sum[,.I[1],by=c("userid","estid")]$V1,nest:=1]
dt_sum[dt_sum[,.I[1],by=c("userid",lvfirm)]$V1,nfirm:=1]
dt_sum[dt_sum[,.I[1],by=c("userid","ind")]$V1,nind:=1]
dt_sum_emp <- dt_sum[,lapply(.SD, sum, na.rm=T),by=c("userid","in_surv","in_survf"),.SDcols=c("hours","nest","nfirm","nind")]
# Add firm size
dt_sum_emp <- merge(dt_sum_emp, dt_userv[,c("userid","userf","userf_ind","userf_st","userf_msa")], by=c("userid"), all.x=T)
dt_sum_emp <- merge(dt_sum_emp, dt_firm[,c("firmid","ind","st","msa","ftec_2wk")],
by.x=c("userf","userf_ind","userf_st","userf_msa"),
by.y=c("firmid","ind","st","msa"), all.x=T)
dt_sum_emp <- merge(dt_sum_emp,dt_st[,c("st","state_div","state_reg")],by.x=c("userf_st"),by.y=c("st"), all.x=T)
#-------------------------------------------------------------------------------
gt_tab <- gtsummary::tbl_summary(dt_sum_emp[,c("in_survf","userf_ind","state_div","ftec_2wk")], by=c("in_survf"),
list(vars(userf_ind) ~ "Industry","state_div"~"Census Division","ftec_2wk"~"Firm Size"),
statistic = list(all_categorical() ~ "{p}%"))
gt_tab <- gtsummary::as_gt(gt_tab)
gt_tabl <- gt::as_latex(gt_tab)
writeLines(gt_tabl,paste0(dir_otab,"P2_resp_vs_nresp_dist.txt"))
dt_tabl <- data.table(as.data.frame(gt_tab))
dt_tabl <- dt_tabl[,c("label","stat_1","stat_2")]
dt_tabl <- rbind(dt_tabl,data.table(label="N",stat_1=nrow(dt_sum_emp[in_survf=="Respondent"]),stat_2=nrow(dt_sum_emp[in_survf=="Non-Respondent"])))
setnames(dt_tabl,names(dt_tabl),c("Characteristics","Respondent","Non-Respondent"))
fwrite(dt_tabl,paste0(dir_otab,"P2_resp_vs_nresp_dist.csv"))
rm(dt_tabl)
gg_out <- ggplot(dt_sum_emp, aes(x=hours, color=in_survf, linetype=in_survf)) + ggtheme +
geom_density(size=1.2) +
scale_color_brewer(palette = "Set1") +
scale_linetype_manual(values=c("solid","longdash")) +
labs(x="Total Hours in Base Period", y="Density",color="",linetype="")
print(gg_out)
ggsave(plot=gg_out, paste0(dir_ofig,"P2_resp_vs_nresp_hoursb_dist.png"),width=9, height=6, dpi=300)
lresp <- factor(c(1:2))
levels(lresp) <- c("Respondent","Non-Respondent")
dt_den1 <- density(dt_sum_emp[in_survf=="Respondent",hours])
dt_den2 <- density(dt_sum_emp[in_survf=="Non-Respondent",hours])
dt_den <- rbind(data.table(in_survf=lresp[1],x=dt_den1$x,y=dt_den1$y),
data.table(in_survf=lresp[2],x=dt_den2$x,y=dt_den2$y))
ggplot(dt_den, aes(x=x,y=y, color=in_survf, linetype=in_survf)) + ggtheme +
geom_line(size=1.2)
fwrite(dt_den,paste0(dir_ofigd,"P2_resp_vs_nresp_hoursb_dist.csv"))
source(paste0(dir_proj,"0_function/f_ratio.R"))
dt_ratio <- f_ratio(copy(dt_sel),var_sum="hours",var_by="in_survf")
dt_ratio[, r_var:=r_hours]
dt_ratio[,lnr_var:=log(r_var)]
dt_ratio <- dt_ratio[week>=f_wk("2020-03-01")]
ggplot(dt_ratio[date<=date_max,], aes(x=date,y=r_var,color=in_survf,linetype=in_survf)) + ggtheme +
geom_line(size=1.2) +
scale_color_brewer(palette = "Set1") +
scale_linetype_manual(values=c("solid","longdash")) +
labs(x="Date",y="Share of hours relative to baseline",color="",linetype="") +
theme(legend.key.width=unit(0.6,"inch"))
ggsave(paste0(dir_ofig,"P2_resp_vs_nresp_hours.png"),width=9, height=6, dpi=300)
fwrite(dt_ratio[date<=date_max,],paste0(dir_ofigd,"P2_resp_vs_nresp_hours.csv"))
Note: The trend for all is different from the trend for firm, because we are restricting employees to those active in the base period.
print(paste("Ended at", Sys.time()))
## [1] "Ended at 2020-07-20 14:51:43"
# End of R script