# cd /accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_data/
# sbatch --cpus-per-task=8 --mem=32g --export=ALL,script=data_2_firm_ind_geo.R --partition=high zb_r.sh

# ---- Runtime context ---------------------------------------------------------
# Full command line of the R process, and the user-supplied trailing arguments.
# Neither is used again in this file; presumably kept for the sourced scripts
# or for debugging -- TODO confirm.
sinfo <- commandArgs(trailingOnly = FALSE)
args <- commandArgs(trailingOnly = TRUE)

# Login name: USERNAME when it is set to a non-empty value, otherwise USER.
sys_user <- if (Sys.getenv("USERNAME") != "") {
  Sys.getenv("USERNAME")
} else {
  Sys.getenv("USER")
}
# Empty string when SLURM_CLUSTER_NAME is not set.
sys_cluster <- Sys.getenv("SLURM_CLUSTER_NAME")
# 1 when the script runs non-interactively, 0 in an interactive session.
sys_batch <- if (interactive()) 0 else 1

if (sys_user == "homebase") { # For replication: Replace "homebase" with current username
  dir_func <- "/accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_function/"
  # For replication: Replace with the path of the "code_function/" folder
}

# Fail early with an actionable message instead of the opaque
# "object 'dir_func' not found" that source() would otherwise raise when the
# username branch above did not run. A dir_func that already exists in the
# session is still honoured.
if (!exists("dir_func")) {
  stop(
    "dir_func is not defined for user '", sys_user, "'. ",
    "Edit the username check and the 'code_function/' path near the top of ",
    "this script.",
    call. = FALSE
  )
}

# 0_directory.R is expected to define the project directories used below
# (dir_proj, dir_clean are not assigned anywhere in this file).
source(paste0(dir_func, "0_directory.R"))
print(paste("Started at", Sys.time()))

library(data.table)
library(haven)

# Record the R version and attached packages in the job log.
print(sessionInfo())

# Work from the project root. dir_proj is not assigned in this file;
# presumably it comes from 0_directory.R -- TODO confirm.
setwd(dir_proj)

# Load the reader f_readhb(), echoing the sourced code to the log.
source(paste0(dir_func, "f_readhb.R"), echo = TRUE, verbose = TRUE)

#===============================================================================


# Sample window (both ends inclusive) and the suffix appended to file names.
date_min <- as.Date("2020-01-01")
date_max <- as.Date("2020-12-31")
fout_suf <- "_2020"

# A window starting on or after 2020-01-01 reads the "2020_update" input;
# anything reaching further back reads "ALL".
dt_raw <- f_readhb(
  if (date_min >= as.Date("2020-01-01")) "2020_update" else "ALL",
  geo_rep = TRUE
)

# Keep only the rows whose date falls inside the window.
dt_sel <- dt_raw[date_min <= date & date <= date_max]

# A "firm" is a firmid x industry x MSA x state cell.

# Geography columns that enter the firm definition.
igeo <- c("msa", "st")
print(igeo)

# Columns that jointly identify a firm.
lvfirm <- c("firmid", "ind", igeo)

# hours: total hours per firm-date. No na.rm here, so a missing value in
# `hours` makes the whole firm-date total NA.
dt_hours <- dt_sel[, .(hours = sum(hours)), by = c(lvfirm, "date")]

# twage: total wage per firm-date; missing wages are dropped from the sum
# (na.rm = TRUE), so a firm-date with only missing wages gets 0.
# hwage: does it make sense to do something here?
#
# Aggregating with `[` returns a new table and does not modify dt_sel, so the
# previous deep copy() of the full filtered input was only memory overhead and
# has been removed.
dt_wage <- dt_sel[, .(twage = sum(twage, na.rm = TRUE)), by = c(lvfirm, "date")]

# Each count below works the same way: reduce the data to distinct
# (group, id) rows, then count the rows per group. Counts are stored as
# numeric (double).

# emp: distinct userid per firm-date.
# A userid seen in the same firmid under a different MSA/ind is counted again
# there (for now, users are treated as different across locations).
dt_emp <- unique(dt_sel[, c(lvfirm, "date", "userid"), with = FALSE])[
  , .(emp = as.numeric(.N)), by = c(lvfirm, "date")
]

# est: distinct estid per firm-date.
dt_est <- unique(dt_sel[, c(lvfirm, "date", "estid"), with = FALSE])[
  , .(est = as.numeric(.N)), by = c(lvfirm, "date")
]

# este: distinct estid per firm over the whole window (ever active).
dt_este <- unique(dt_sel[, c(lvfirm, "estid"), with = FALSE])[
  , .(este = as.numeric(.N)), by = lvfirm
]

#-------------------------------------------------------------------------------
# Combine the aggregates into one firm-date table

# Full outer joins of the firm-date tables, folded left to right:
# hours, then emp, then est, then twage.
dt_firm <- Reduce(
  function(dt_left, dt_right) {
    merge(dt_left, dt_right, by = c(lvfirm, "date"), all = TRUE)
  },
  list(dt_hours, dt_emp, dt_est, dt_wage)
)

# este varies only by firm, so it is joined on the firm keys alone.
dt_firm <- merge(dt_firm, dt_este, by = lvfirm, all = TRUE)

# Firm-level table saved under dir_clean (contents not visible from here).
# NOTE(review): all = TRUE also keeps firms that appear only in dt_size; those
# rows would have a missing date -- confirm this is intended.
dt_size <- readRDS(paste0(dir_clean, "homebase_sel_firm_ind_geo", fout_suf, ".rds"))
dt_firm <- merge(dt_firm, dt_size, by = c("firmid", "ind", igeo), all = TRUE)

# Add calendar variables and write the panel to disk
source(paste0(dir_func, "f_wk.R"), verbose = TRUE)

# week and weekd come from the helpers f_wk() / f_wkdate(), which are not
# defined in this file (presumably loaded by f_wk.R just above).
dt_firm[, week := f_wk(date)]
dt_firm[, weekd := f_wkdate(week)]
# Day of week as reported by POSIXlt: 0 = Sunday, ..., 6 = Saturday.
dt_firm[, wkd := as.POSIXlt(date)$wday]
# f_ndaywk() receives the table and column names as strings; it is expected to
# add the `ndaywk` column that setcolorder() needs below -- TODO confirm.
f_ndaywk("dt_firm", "week", "date")

# Put the identifiers and calendar variables first; remaining columns follow.
setcolorder(dt_firm, c(lvfirm, "date", "week", "weekd", "wkd", "ndaywk"))

# Same table in two formats: RDS for R and DTA for Stata.
fout_stem <- paste0(dir_clean, "homebase_firm_ind_geo_date", fout_suf)
saveRDS(dt_firm, paste0(fout_stem, ".rds"))
write_dta(dt_firm, paste0(fout_stem, ".dta"))

print(paste("Ended at", Sys.time()))
# End of R script
