# cd /accounts/projects/jrothst/homebase/data/bpea_replication_archive/code/
# sbatch --cpus-per-task=6 --mem=96g --export=ALL,script=bpea_1_worker_hours.R,param="T" --partition=high zb_r.sh
# cd /accounts/projects/jrothst/homebase/data/1_do_bpea/
# sbatch --cpus-per-task=6 --mem=96g --export=ALL,script=bpea_1_worker_hours.R,param="F" --partition=high zb_r.sh

# Full command line of the R process and the user-supplied trailing arguments.
sinfo <- commandArgs(trailingOnly = FALSE)
args <- commandArgs(trailingOnly = TRUE)

print(args)

# Replication switch. Precedence: a `bpea_rep` that already exists (e.g. set by
# a calling script) wins; otherwise the first command-line argument is used;
# with no argument it defaults to TRUE.
# NOTE(review): the argument is evaluated as R code ("T"/"F" in the sbatch
# calls above). Kept for backward compatibility; pass only trusted values.
bpea_rep <- if (exists("bpea_rep")) {
  bpea_rep
} else if (length(args) == 0) {
  TRUE
} else {
  eval(parse(text = args[1]))
}

# Current user: USERNAME if it is set, otherwise USER.
sys_user <- Sys.getenv("USERNAME")
if (sys_user == "") {
  sys_user <- Sys.getenv("USER")
}
sys_cluster <- Sys.getenv("SLURM_CLUSTER_NAME")
# 1 when running non-interactively (batch), 0 in an interactive session.
sys_batch <- if (interactive()) 0 else 1

if (sys_user == "homebase" && bpea_rep == TRUE) { # For replication: Replace "homebase" with current username
  dir_func <- "/accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_function/"
  # For replication: Replace with the path of the "code_function/" folder.
} else if (sys_user == "homebase" && bpea_rep == FALSE) {
  dir_func <- "/accounts/projects/jrothst/homebase/data/0_function/"
} else if (!exists("dir_func")) {
  # Without this guard the script would continue and fail later with an
  # opaque "object 'dir_func' not found" when sourcing the helper files.
  stop(
    "dir_func is not set for user '", sys_user, "' (bpea_rep = ", bpea_rep, "). ",
    "For replication, replace \"homebase\" with your username and point ",
    "dir_func to the \"code_function/\" folder.",
    call. = FALSE
  )
}

# Directory definitions: the shared one first, then the project-specific one,
# whose location depends on the replication switch.
# NOTE(review): dir_proj is not defined in this file; presumably it is created
# by 0_directory.R -- confirm.
source(paste0(dir_func, "0_directory.R"))
source(paste0(
  dir_proj,
  if (bpea_rep == TRUE) "code/bpea_0_directory.R" else "1_do_bpea/bpea_0_directory.R"
))

print(paste("Started at", Sys.time()))

library(data.table)

# Record the session in the log and work from the project directory.
print(sessionInfo())
setwd(dir_proj)

#===============================================================================

# Helper scripts, sourced in this order. source() evaluates them in the global
# environment, so everything they define is available below.
invisible(lapply(
  c("f_readhb.R", "f_wk.R", "f_ratio.R"),
  function(helper_file) source(paste0(dir_func, helper_file))
))

#===============================================================================

# Columns that jointly identify a firm record in the merges below.
lvfirm <- c("firmid", "ind", "st", "msa")

if (!(sys_user == "homebase" && sys_batch == 1)) {
  # Any other setting (e.g. interactive work): load the 2020 update only.
  dt_raw <- f_readhb("2020_update", geo_rep = TRUE)
} else {
  # Batch run as "homebase": pre-2020 history plus the 2020 update.

  # Data before 2020; week / weekd are derived here with the f_wk.R helpers.
  dt_raw0 <- f_readhb("all", geo_rep = TRUE)
  dt_raw0 <- dt_raw0[date <= as.Date("2019-12-31")]
  dt_raw0[, week := f_wk(date)]
  dt_raw0[, weekd := f_wkdate(week)]

  # Data in 2020
  dt_raw <- f_readhb("2020_update", geo_rep = TRUE)

  # Stack both periods (fill = TRUE tolerates columns present in only one),
  # then drop ndaywk and let f_ndaywk() rebuild it on the combined table.
  # f_ndaywk() receives the table by NAME, so the object must stay "dt_raw".
  dt_raw <- rbind(dt_raw0, dt_raw, fill = TRUE)
  dt_raw[, ndaywk := NULL]
  f_ndaywk("dt_raw", "week", "date")

  # Release the pre-2020 copy.
  rm(dt_raw0)
  gc()
}

#===============================================================================
# Share of base workers in 2020

# Firms kept for 2020: both selection flags must equal 1.
dt_firm <- readRDS(paste0(dir_clean, "homebase_sel_firm_ind_geo_2020.rds"))
dt_firm <- dt_firm[firm_base_2wk == 1 & st_sel_2wk == 1] # subset firms

# Total hours per date x firm x worker, restricted to stfips <= 56 and
# ndaywk == 7; nemp marks each such row as one worker.
dt_sel <- dt_raw[
  stfips <= 56 & ndaywk == 7,
  .(hours = sum(hours, na.rm = TRUE)),
  by = c("date", "week", "weekd", lvfirm, "userid")
]
dt_sel[, nemp := 1]

# Attach the firm flags and keep only rows belonging to selected firms.
dt_sel <- merge(
  dt_sel,
  dt_firm[, c(lvfirm, "firm_base_2wk", "st_sel_2wk"), with = FALSE],
  by = lvfirm,
  all.x = TRUE
)
dt_sel <- dt_sel[firm_base_2wk == 1 & st_sel_2wk == 1] # subset firms
print(names(dt_sel))

#-------------------------------------------------------------------------------
# Share of hours from workers in base period

# Workers observed in the base period: any row in the weeks that f_wk()
# returns for 2020-01-19 and 2020-02-01.
dt_ubase <- dt_sel[
  week %in% f_wk(c("2020-01-19", "2020-02-01")),
  .(hours = sum(hours, na.rm = TRUE)),
  by = "userid"
]
dt_ubase[, user_base := 1]

# Flag every row of dt_sel: 1 = base-period worker, 0 = everyone else.
dt_sel <- merge(dt_sel, dt_ubase[, .(userid, user_base)], by = "userid", all.x = TRUE)
dt_sel[is.na(user_base), user_base := 0]
table(dt_sel[["user_base"]])

# Daily hours by base status and each group's share of that day's total.
dt_share <- dt_sel[, .(hours = sum(hours, na.rm = TRUE)), by = c("date", "user_base")]
dt_share[, s_hours := hours / sum(hours), by = "date"]

saveRDS(dt_share, paste0(dir_exp, "worker_hours_shours_base_worker.rds"))

#-------------------------------------------------------------------------------
# Share of hours from worker-firm pairs in base period

# Worker-firm pairs observed in the base period (same weeks as above), keyed
# by worker plus the firm identifier columns in lvfirm.
dt_ufbase <- dt_sel[
  week %in% f_wk(c("2020-01-19", "2020-02-01")),
  .(hours = sum(hours, na.rm = TRUE)),
  by = c("userid", lvfirm)
]
dt_ufbase[, user_base := 1]

# Work on a copy of dt_sel without the worker-level flag, so that dt_sel
# itself keeps its own user_base; the pair-level flag is merged in instead.
dt_seluf <- merge(
  dt_sel[, !"user_base"],
  dt_ufbase[, c("userid", lvfirm, "user_base"), with = FALSE],
  by = c("userid", lvfirm),
  all.x = TRUE
)
dt_seluf[is.na(user_base), user_base := 0]
table(dt_seluf[["user_base"]])

# Daily hours by pair-level base status and each group's share of the day.
dt_share <- dt_seluf[, .(hours = sum(hours, na.rm = TRUE)), by = c("date", "user_base")]
dt_share[, s_hours := hours / sum(hours), by = "date"]

saveRDS(dt_share, paste0(dir_exp, "worker_hours_shours_base_workerf.rds"))

#-------------------------------------------------------------------------------
# Distribution of hours worked for those with positive hours

# Weekly hours per week x firm x worker (user_base is carried along).
# NOTE(review): the section heading says "positive hours", but no hours > 0
# filter is applied here -- confirm that dt_sel never carries zero-hour rows.
dt_selw <- dt_sel[, lapply(.SD, sum, na.rm = TRUE),
                  by = c("week", "weekd", lvfirm, "userid", "user_base"),
                  .SDcols = c("hours")]

# Mean and selected percentiles of weekly hours, by week. The summary is built
# directly instead of pasting together a code string and eval(parse())-ing it;
# the resulting columns are hours_mu, hours_p10, ..., hours_p90.
pctl <- c(10, 25, 50, 75, 90)
dt_dist <- dt_selw[
  ,
  c(
    list(hours_mu = mean(hours)),
    setNames(
      as.list(quantile(hours, pctl / 100, names = FALSE)),
      paste0("hours_p", pctl)
    )
  ),
  by = c("week", "weekd")
]

# Keep the mean as an id column too, so it stays on every row after reshaping.
dt_dist[, hours_mu_copy := hours_mu]
dt_dist <- melt(dt_dist, id.vars = c("week", "weekd", "hours_mu_copy"),
                variable.name = "stats", value.name = "hours",
                variable.factor = FALSE)

# Readable labels: "hours_p10" -> "10%", "hours_mu" -> "Mean".
dt_dist[, stats := sub("hours_p(\\d+)", "\\1%", stats)]
dt_dist[stats == "hours_mu", stats := "Mean"]

saveRDS(dt_dist, paste0(dir_exp, "worker_hours_hours_dist_active_worker.rds"))

#===============================================================================
# Share of base workers in 2018 and 2019 (one pass per base year)

# Start (date1) and end (date2) dates of the base period for each year.
# Only the 2018 and 2019 rows are used by the loop below.
dt_info <- data.table(year  = c(2018, 2019, 2020),
                      date1 = as.Date(c("2018-01-21", "2019-01-20", "2020-01-19")),
                      date2 = as.Date(c("2018-02-03", "2019-02-02", "2020-02-01")))

# Hours per date x firm x worker. This does not depend on the year, so it is
# computed once here rather than re-aggregating dt_raw on every pass.
dt_raw_agg <- dt_raw[stfips <= 56 & ndaywk == 7,
                     lapply(.SD, sum, na.rm = TRUE),
                     by = c("date", "week", "weekd", lvfirm, "userid"),
                     .SDcols = c("hours")][, nemp := 1]

for (iyear in c(2018, 2019)) {

  # Firms selected for this base year: both flags must equal 1.
  dt_firm <- readRDS(paste0(dir_clean, "homebase_sel_firm_ind_geo_", iyear, ".rds"))[firm_base_2wk == 1 & st_sel_2wk == 1] # subset firms

  # Drop the previous selection (the 2020 one on the first pass) to free
  # memory before building the next one.
  rm(dt_sel)
  gc()

  dt_sel <- merge(dt_raw_agg,
                  dt_firm[, .SD, .SDcols = c(lvfirm, "firm_base_2wk", "st_sel_2wk")],
                  by = c(lvfirm), all.x = TRUE)[firm_base_2wk == 1 & st_sel_2wk == 1] # subset firms

  #-----------------------------------------------------------------------------
  # Share of hours from workers in base period

  # Workers with any row in the weeks f_wk() returns for this year's
  # base-period start and end dates.
  dt_ubase <- dt_sel[week %in% f_wk(dt_info[year == iyear, c(date1, date2)]),
                     lapply(.SD, sum, na.rm = TRUE),
                     by = c("userid"), .SDcols = c("hours")][, user_base := 1]
  dt_sel <- merge(dt_sel, dt_ubase[, c("userid", "user_base")],
                  by = c("userid"), all.x = TRUE)[is.na(user_base), user_base := 0]
  # Values are not auto-printed inside a loop, so print() is required for
  # this diagnostic to reach the log.
  print(table(dt_sel$user_base))

  # Daily hours by base status and each group's share of that day's total.
  dt_share <- dt_sel[, lapply(.SD, sum, na.rm = TRUE),
                     by = c("date", "user_base"), .SDcols = c("hours")]
  dt_share[, s_hours := hours / sum(hours), by = c("date")]

  saveRDS(dt_share, paste0(dir_exp, "worker_hours_shours_base_worker_", iyear, ".rds"))

}

# The shared aggregate is no longer needed.
rm(dt_raw_agg)

print(paste("Ended at", Sys.time()))
# End of R script
