# cd /accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_data/
# sbatch --cpus-per-task=2 --mem=32g --export=ALL,script=ws_2_hours_match.R zb_r.sh

# Command-line information for this run: `sinfo` holds the full invocation
# returned by commandArgs(), `args` only the trailing (user-supplied) part.
# NOTE(review): `sinfo` is not used in this script; it is kept because files
# sourced later run in the global environment and may read it -- confirm.
sinfo <- commandArgs(trailingOnly = FALSE)
args <- commandArgs(trailingOnly = TRUE)

print(args)

# Current user name: try USERNAME first and fall back to USER.
# Sys.getenv() returns "" for an unset variable.
sys_user <- Sys.getenv("USERNAME")
if (sys_user == "") {
  sys_user <- Sys.getenv("USER")
}
# Value of SLURM_CLUSTER_NAME ("" when the variable is not set).
sys_cluster <- Sys.getenv("SLURM_CLUSTER_NAME")
# 1 when running non-interactively (e.g. via Rscript), 0 in an interactive session.
sys_batch <- if (interactive()) 0 else 1

# Point dir_func at the folder holding the shared function files.
if (sys_user == "homebase") { # For replication: Replace "homebase" with current username
  dir_func <- "/accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_function/"
  # For replication: Replace with the path of the "code_function/" folder
}

# Fail early with an actionable message instead of the cryptic
# "object 'dir_func' not found" that source() would otherwise raise when the
# username check above does not match. exists() is used (rather than an else
# branch) so that a dir_func already defined in the session is still honoured.
if (!exists("dir_func")) {
  stop(
    "dir_func is not set for user '", sys_user, "'. ",
    "Edit the username check and the 'code_function/' path near the top of this script.",
    call. = FALSE
  )
}

# Project directory configuration.
# NOTE(review): presumably defines dir_proj and dir_clean used below -- confirm.
source(paste0(dir_func, "0_directory.R"))
print(paste("Started at", Sys.time()))

library(data.table)

# Record the R version and loaded packages in the job log.
print(sessionInfo())

# Switch the working directory to dir_proj.
setwd(paste0(dir_proj))

# Load the project helper files from dir_func, in this order.
for (helper_file in c("f_readhb.R", "f_wk.R")) {
  source(paste0(dir_func, helper_file))
}

#===============================================================================

# Survey version tag; it is part of both the input file name read here and
# the output file name written at the end of the script.
surv_ver <- "20200707"

# Raw worker survey for the chosen version.
dt_surv <- readRDS(paste0(dir_clean, "survey_worker/homebase_worker_survey_raw_", surv_ver, ".rds"))
# Homebase records read through the project helper f_readhb().
# TRUE is spelled out: `T` is an ordinary variable that sourced code could
# have reassigned.
dt_raw <- f_readhb("2020_update", geo_rep = TRUE)
# Firm-level table (supplies industry/geography keys and selection flags below).
dt_firm <- readRDS(paste0(dir_clean, "homebase_sel_firm_ind_geo_2020.rds"))
# State crosswalk (supplies state_div / state_reg below).
dt_st <- readRDS(paste0(dir_clean, "cw/cw_geo_nber_state.rds"))

# Subset users that are in the survey and flag them with user_surv = 1
dt_sel <- dt_raw[userid %in% dt_surv[["userid"]], ][, user_surv := 1]
# Number of distinct survey users found in the hours data
print(uniqueN(dt_sel, by = "userid"))

# Subset users that are active in our base period (both end dates inclusive)
# Note that this is the only restriction imposed in this file
base_period_start <- as.Date("2020-01-19")
base_period_end <- as.Date("2020-02-01")
dt_base <- unique(
  dt_sel[date >= base_period_start & date <= base_period_end, c("userid")]
)[, user_base := 1]
print(nrow(dt_base))
# Left join: users without base-period activity are kept, with user_base = NA
dt_sel <- merge(dt_sel, dt_base, by = c("userid"), all.x = TRUE)

# All merges below are left joins (all.x = TRUE): every row of dt_sel is kept
# and unmatched rows receive NA in the added columns.

# Merge with selected firms
dt_sel <- merge(
  dt_sel,
  dt_firm[, c("firmid", "ind", "st", "msa", "firm_base_2wk", "st_sel_2wk")],
  by = c("firmid", "ind", "st", "msa"),
  all.x = TRUE
)

# Merge with demographics: survey columns whose names match the pattern
# NOTE(review): if dt_surv has more than one row per userid this join
# multiplies rows; row_sel is checked for this further down -- confirm.
dt_sel <- merge(
  dt_sel,
  dt_surv[, .SD, .SDcols = patterns("userid|user_|rowid|row_dup|row_sel")],
  by = c("userid"),
  all.x = TRUE
)

# Merge with state info
dt_sel <- merge(
  dt_sel,
  dt_st[, c("st", "state_div", "state_reg")],
  by = c("st"),
  all.x = TRUE
)

# Merge with user characteristics (geo + firm) with most hours
dt_sel <- merge(
  dt_sel,
  readRDS(paste0(dir_clean, "survey_worker/worker_userid_var_sel.rds"))[, c("userid", "user_nfirm", "user_nfirme")],
  by = c("userid"),
  all.x = TRUE
)

# Add week variables to dt_sel by reference: `week` is computed from `date`
# with f_wk(), then `weekd` from `week` with f_wkdate().
# NOTE(review): f_wkdate() is presumably defined in f_wk.R -- confirm.
dt_sel[, week := f_wk(date)][, weekd := f_wkdate(week)]

# Make sure that row_sel now solves the duplicate issue: among the rows with
# row_sel == 1, each (userid, estid, date) combination should appear once.
# The key columns are extracted once and reused for both counts.
dt_keys_sel <- dt_sel[row_sel == 1, c("userid", "estid", "date")]
n_dup_sel <- nrow(dt_keys_sel) - uniqueN(dt_keys_sel)
rm(dt_keys_sel)

if (n_dup_sel != 0) {
  print("ERROR: ISSUE REMAINS")
  # Also raise a warning so the problem appears in R's warning summary and is
  # not lost among the printed output; the script still continues and exports.
  warning(
    n_dup_sel, " duplicated (userid, estid, date) row(s) remain among row_sel == 1",
    call. = FALSE
  )
}

# Write the matched survey/hours table to disk; the file name carries the
# survey version tag.
out_file <- paste0(dir_clean, "survey_worker/homebase_worker_survey_hours_", surv_ver, ".rds")
saveRDS(dt_sel, file = out_file)

# Log the finishing time.
print(paste("Ended at", Sys.time()))
# End of R script
