# cd /accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_data/
# sbatch --cpus-per-task=2 --mem=16g --export=ALL,script=ws_1_userid_var_sel.R zb_r.sh


# Full command line of the R process, and the trailing (user-supplied) arguments.
# These globals are not used further in this file; they are kept because the
# files sourced below are evaluated in the same environment and may read them.
sinfo <- commandArgs(trailingOnly = FALSE)
args <- commandArgs(trailingOnly = TRUE)

print(args)

# Current user: the USERNAME environment variable if non-empty, otherwise USER.
sys_user <- if (Sys.getenv("USERNAME") != "") Sys.getenv("USERNAME") else Sys.getenv("USER")
sys_cluster <- Sys.getenv("SLURM_CLUSTER_NAME")
# 0 when R runs interactively, 1 otherwise.
sys_batch <- if (interactive()) 0 else 1

if (sys_user=="homebase") { # For replication: Replace "homebase" with current username
  dir_func <- "/accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_function/"
  # For replication: Replace with the path of the "code_function/" folder
}

# Fail early with an actionable message instead of an opaque
# "object 'dir_func' not found" when the username check above did not match.
if (!exists("dir_func")) {
  stop("dir_func is not defined for user '", sys_user, "'. ",
       "Edit the username check and the \"code_function/\" path at the top of this script.",
       call. = FALSE)
}

# NOTE(review): 0_directory.R is presumably where dir_proj and dir_clean (used
# below) are defined -- confirm against that file.
source(paste0(dir_func, "0_directory.R"))
print(paste("Started at", Sys.time()))

library(data.table)
library(ggplot2)
library(knitr)

# Record package versions in the log.
print(sessionInfo())

setwd(paste0(dir_proj))

#===============================================================================

# Helper files; f_readhb(), f_wk() and f_wkdate() are called later in this script
# and are presumably defined here -- confirm.
source(paste0(dir_func, "f_readhb.R"))
source(paste0(dir_func, "f_wk.R"))

#===============================================================================

# Count, per userid, the rows of dt_in that are currently flagged nemp == 1.
#
# dt_in: data.table with (at least) the columns userid and nemp.
# vvar:  character vector; used only to label the printed message.
#
# Prints how many userids have a count different from 1 and returns a
# data.table with one row per userid and the count in column ndup.
f_dup <- function(dt_in, vvar) {
  flagged <- dt_in[nemp == 1]
  per_user <- flagged[, .(ndup = .N), by = "userid"]

  label <- paste(vvar, collapse = " ")
  n_multiple <- sum(per_user[["ndup"]] != 1)
  print(paste0("Multiple ", label, ": ", n_multiple))

  return(per_user)
}

# Select, for every userid, a single value (combination) of the columns in vvar.
#
# dt_in: data.table with the columns userid, hours and every column in vvar.
# vvar:  character vector of column names identifying the attribute to select.
# rseed: seed for the random tie-break.
#
# Steps:
#   1. Sum hours by (userid, vvar).
#   2. Keep as candidates (nemp == 1) the rows with the user's maximum hours.
#   3. If several candidates remain for a user, keep the one with the largest
#      random draw.
#
# Returns the aggregated table with one row per (userid, vvar): nemp is 1 on the
# selected row and NA elsewhere. The number of distinct vvar values per user
# before any selection is attached as user_nfirm when vvar is the set
# {firmid, ind, st, msa}, and as user_n<vvar> otherwise.
#
# Side effect: calls set.seed(rseed), i.e. resets the global RNG state.
f_seldup <- function(dt_in, vvar, rseed=60637) {

  # Total hours per (userid, vvar); every combination starts as a candidate
  dti_in <- dt_in[, lapply(.SD, sum, na.rm = TRUE), by = c("userid", vvar), .SDcols = c("hours")][, nemp := 1]

  # Check workers associated with multiple vvar
  dt_multo <- f_dup(dti_in, vvar)
  if (setequal(vvar, c("firmid", "ind", "st", "msa"))) {
    setnames(dt_multo, "ndup", "user_nfirm")
  } else {
    # Collapse so that a multi-column vvar still yields exactly one new name
    # (identical to paste0("user_n", vvar) when vvar has length one)
    setnames(dt_multo, "ndup", paste0("user_n", paste(vvar, collapse = "_")))
  }

  # When counting number of workers, only use the vvar with most hours
  dti_in[, hours_max := max(hours), by = c("userid")]
  dti_in[hours != hours_max, nemp := NA_real_]

  # Check workers associated with multiple vvar after change
  dt_mult <- f_dup(dti_in, vvar)

  # Further choose by random numbers (sorted first so the draws are reproducible)
  dt_rnorm <- unique(dti_in[, .SD, .SDcols = c("userid", vvar)])
  setorderv(dt_rnorm, c("userid", vvar))
  set.seed(rseed)
  dt_rnorm[, rnorm := rnorm(.N)]

  dti_in <- merge(dti_in, dt_mult, by = c("userid"))
  dti_in <- merge(dti_in, dt_rnorm, by = c("userid", vvar))

  # The maximum draw must be taken over the remaining candidates only. Taking it
  # over all of a user's rows lets a row already excluded for lower hours hold
  # the maximum, in which case every tied candidate would be dropped and the
  # user would end up with no selected row at all.
  dti_in[, rnorm_max := max(rnorm[!is.na(nemp)]), by = c("userid")]
  dti_in[!is.na(nemp) & ndup != 1 & rnorm != rnorm_max, nemp := NA_real_]

  # Check workers associated with multiple vvar after change
  dt_mult <- f_dup(dti_in, vvar)

  # Merge with number of duplicates
  dti_in <- merge(dti_in, dt_multo, by = c("userid"), all.x = TRUE)

  dti_in[, ndup := NULL]
  return(dti_in)
}

#===============================================================================

# ---- Load inputs -------------------------------------------------------------
dt_raw <- f_readhb("2020_update", geo_rep = TRUE)
dt_firm <- readRDS(paste0(dir_clean, "homebase_sel_firm_ind_geo_2020.rds"))

# Columns identifying a firm record, and the names they take once they describe
# a user's selected firm.
firm_key <- c("firmid", "ind", "st", "msa")
userf_names <- c("userf", "userf_ind", "userf_st", "userf_msa")

# Dates passed to f_wk() to define the base period.
base_dates <- c("2020-01-19", "2020-02-01")

# ---- Restrict to the selected firm sample ------------------------------------
dt_firm_flags <- dt_firm[, c("firmid", "ind", "st", "msa", "firm_base_2wk", "st_sel_2wk")]
dt_sel <- merge(dt_raw, dt_firm_flags, by = firm_key, all.x = TRUE)
dt_sel <- dt_sel[firm_base_2wk == 1 & st_sel_2wk == 1 & stfips <= 56]
dt_sel[, week := f_wk(date)]
dt_sel[, weekd := f_wkdate(week)]

# ---- Determine userid's state, ind and level ---------------------------------
# Hours per user and (st, ind, level) over the base-period weeks.
dt_base <- dt_sel[
  week %in% f_wk(base_dates),
  .(hours = sum(hours, na.rm = TRUE)),
  by = c("userid", "st", "ind", "level")
]

dt_uind <- f_seldup(dt_base, vvar = "ind", rseed = 60637)[nemp == 1]
setnames(dt_uind, "ind", "user_ind")

dt_ust <- f_seldup(dt_base, vvar = "st", rseed = 5807)[nemp == 1]
setnames(dt_ust, "st", "user_st")

dt_ulvl <- f_seldup(dt_base, vvar = "level", rseed = 2020)[nemp == 1]
setnames(dt_ulvl, "level", "user_level")

# ---- Determine userid's firm based on base period ----------------------------
dt_base2 <- dt_sel[
  week %in% f_wk(base_dates),
  .(hours = sum(hours, na.rm = TRUE)),
  by = c("userid", firm_key)
]
dt_ufirm <- f_seldup(dt_base2, vvar = firm_key, rseed = 2020)[nemp == 1]
setnames(dt_ufirm, firm_key, userf_names)

# ---- Determine userid's number of firms based on 2020-01-19-on ---------------
# NOTE(review): ndaywk == 7 presumably keeps complete weeks only -- confirm.
dt_base2 <- dt_sel[
  week >= f_wk(c("2020-01-19")) & ndaywk == 7,
  .(hours = sum(hours, na.rm = TRUE)),
  by = c("userid", firm_key)
]
dt_ufirme <- f_seldup(dt_base2, vvar = firm_key, rseed = 2020)[nemp == 1]
setnames(dt_ufirme, c(firm_key, "user_nfirm"), c(userf_names, "user_nfirme"))
# Note that this data set also includes userid not in base period.

# ---- Merge together ----------------------------------------------------------
# Full outer joins on userid across the base-period tables, folded left to right.
base_parts <- list(
  dt_uind[nemp == 1, c("userid", "user_ind", "user_nind")],
  dt_ust[nemp == 1, c("userid", "user_st")],
  dt_ulvl[nemp == 1, c("userid", "user_level", "user_nlevel")],
  dt_ufirm[nemp == 1, c("userid", "userf", "userf_ind", "userf_st", "userf_msa", "user_nfirm")]
)
dt_out <- Reduce(
  function(lhs, rhs) merge(lhs, rhs, by = c("userid"), all = TRUE),
  base_parts
)
# Left join: users that appear only after the base period are not added.
dt_out <- merge(
  dt_out,
  dt_ufirme[nemp == 1, c("userid", "user_nfirme")],
  by = c("userid"),
  all.x = TRUE
)

# Every base-period table should contribute exactly the same set of users, so
# all row counts must agree with the merged result.
n_out <- nrow(dt_out)
n_parts <- c(nrow(dt_uind), nrow(dt_ust), nrow(dt_ulvl), nrow(dt_ufirm))
if (any(n_parts != n_out)) {
  print("ERROR: ISSUE WITH MERGE")
}

saveRDS(dt_out, paste0(dir_clean, "survey_worker/worker_userid_var_sel.rds"))

print(paste("Ended at", Sys.time()))
# End of R script
