# cd /accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_data/
# sbatch --cpus-per-task=8 --mem=64g --export=ALL,script=data_1_sel_firm_year.R --partition=high zb_r.sh

# ---- Run-time context --------------------------------------------------------
# `sinfo` is the full command line used to start R; `args` holds only the
# trailing (user-supplied) arguments. Neither is used below in this file; they
# are kept as globals in case the sourced files read them -- TODO confirm.
sinfo <- commandArgs(trailingOnly = FALSE)
args <- commandArgs(trailingOnly = TRUE)

# User name: USERNAME if it is set, otherwise USER.
sys_user <- Sys.getenv("USERNAME")
if (sys_user == "") {
  sys_user <- Sys.getenv("USER")
}
sys_cluster <- Sys.getenv("SLURM_CLUSTER_NAME")
# 0 when R runs interactively, 1 otherwise.
sys_batch <- if (interactive()) 0 else 1

if (sys_user == "homebase") { # For replication: Replace "homebase" with current username
  dir_func <- "/accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_function/"
  # For replication: Replace with the path of the "code_function/" folder
}

# Fail with an actionable message instead of "object 'dir_func' not found"
# when the user name above was not adapted for the current machine.
if (!exists("dir_func")) {
  stop("`dir_func` is not defined for user '", sys_user,
       "'. Edit the user name and the path of the \"code_function/\" folder ",
       "at the top of this script.", call. = FALSE)
}

# 0_directory.R is expected to define the project directories used below
# (`dir_proj`, and `dir_clean` for the outputs) -- verify against that file.
source(paste0(dir_func, "0_directory.R"))
print(paste("Started at", Sys.time()))

library(data.table)
library(haven)

# Record the R and package versions in the log.
print(sessionInfo())

setwd(dir_proj)

# f_readhb.R is expected to define f_readhb(), which is called in the main loop.
source(paste0(dir_func, "f_readhb.R"), echo = TRUE, verbose = TRUE)

#===============================================================================
# Firm size based on employment in January
# Firms are defined at the level of firmid-ind-msa-state
# Firm size is defined as the number of unique userid values associated with the given firm in the period

# Names of the geography columns (MSA and state) that, together with firmid
# and ind, identify a firm.
vmsa <- "msa"
vstate <- "st"

# Script-level copies of the values used for the 2020 two-week window:
# geography columns, output-column suffix, and first / last day of the window.
vgeo <- c(vmsa, vstate)
vsize <- "_2wk"
date_min <- as.Date("2020-01-19")
date_max <- as.Date("2020-02-01")

# f_firmsize: firm size, hours and establishment counts over a date window.
#
# Keeps the rows of `dt` whose `date` lies in [date_min, date_max] (inclusive)
# and, for every firm (a unique combination of firmid, ind and the columns
# named in `vgeo`), computes:
#   size  - number of distinct userid values observed in the window
#   sizec - size class as a factor with levels "1-5", "6-20", "21-50", "50+"
#           ("50+" holds firms with 51 or more users)
#   hours - total of `hours` in the window (missing values are ignored)
#   esta  - number of distinct estid values observed in the window
#
# Arguments:
#   vgeo     - character vector of geography column names that are part of the
#              firm definition
#   vsize    - suffix appended to the four output column names (e.g. "_2wk")
#   date_min - first day of the window (Date)
#   date_max - last day of the window (Date)
#   dt       - data.table to summarise; defaults to the global `dt_raw`. Must
#              contain the columns date, hours, userid, estid, firmid, ind and
#              those named in `vgeo`.
#
# Returns a data.table with one row per firm: the firm key columns followed by
# size<vsize>, sizec<vsize>, hours<vsize> and esta<vsize>.
# Side effect: prints the date range and `vgeo` to the log.
f_firmsize <- function(vgeo=c(vmsa,vstate), vsize="",
                       date_min=as.Date("2020-01-19"),
                       date_max=as.Date("2020-02-01"),
                       dt=dt_raw) {

  print(paste0("Date Range: ", date_min, " - ", date_max))
  print(vgeo)

  lvfirm <- c("firmid", "ind", vgeo)
  size_labels <- c("1-5", "6-20", "21-50", "50+")

  dt_sel <- dt[date >= date_min & date <= date_max, ]

  # Firm size: collapse to one row per firm x user first, so that counting
  # rows per firm afterwards counts distinct users.
  dt_emp <- dt_sel[, lapply(.SD, sum, na.rm = TRUE),
                   by = c(lvfirm, "userid"), .SDcols = "hours"]
  dt_emp[, size := 1]
  dt_emp <- dt_emp[, lapply(.SD, sum), by = c(lvfirm), .SDcols = c("size", "hours")]

  # Size class. The levels are fixed explicitly so that labels stay attached to
  # the right class even when a class is absent from the window; relabelling
  # the levels of as.factor(sizec) would shift the labels in that case.
  # findInterval() maps 1-5 -> 1, 6-20 -> 2, 21-50 -> 3, 51+ -> 4.
  dt_emp[, sizec := factor(findInterval(size, c(1, 6, 21, 51)),
                           levels = 1:4, labels = size_labels)]

  setcolorder(dt_emp, c(lvfirm, "size", "sizec", "hours"))

  # Firm establishments: number of distinct estid per firm
  dt_est <- unique(dt_sel[, .SD, .SDcols = c(lvfirm, "estid")])
  dt_est <- dt_est[, .(esta = .N), by = c(lvfirm)]

  dt_out <- merge(dt_emp, dt_est, by = c(lvfirm), all = TRUE)

  # Tag the measures with the window suffix
  vout <- c("size", "sizec", "hours", "esta")
  setnames(dt_out, vout, paste0(vout, vsize))

  dt_out
}

#===============================================================================

# Window dates by year:
#   date1 - first day of the 4-week window
#   date2 - first day of the 2-week window
#   date3 - last day of both windows
dt_info <- data.table(
  year  = c(2018, 2019, 2020),
  date1 = as.Date(c("2018-01-07", "2019-01-06", "2020-01-05")),
  date2 = as.Date(c("2018-01-21", "2019-01-20", "2020-01-19")),
  date3 = as.Date(c("2018-02-03", "2019-02-02", "2020-02-01"))
)

# Columns that identify a firm: firm id, industry, MSA and state
lvfirm <- c("firmid", "ind", vmsa, vstate)

#===============================================================================

# Build one firm-selection file per year, once with and once without replaced
# geography variables (geo_rep is passed through to f_readhb()).
# Each output has one row per firm (firmid-ind-msa-st) with:
#   size/sizec/hours/esta for the 2-week and 4-week windows,
#   firm_base_2wk / firm_base_4wk : 1 if the firm is observed in the window
#                                   with at least 80 / 160 hours, else 0,
#   <geo>_sel_2wk / <geo>_sel_4wk : 0 if the firm's msa / st has fewer than 50
#                                   base firms in that window, else 1.
for (geo_rep in c(TRUE, FALSE)) {

  print(paste0("Replacing geographical variables: ", geo_rep))
  # f_firmsize() reads the global dt_raw by default.
  dt_raw <- f_readhb("all", geo_rep = geo_rep)
  print(nrow(dt_raw))

  for (iyear in c(2018, 2019, 2020)) {

    # Window dates for this year
    dt_win <- dt_info[year == iyear, ]
    print(dt_win)

    dt_size_2wk <- f_firmsize(vgeo = c(vmsa, vstate), vsize = "_2wk",
                              date_min = dt_win$date2, date_max = dt_win$date3)
    dt_size_4wk <- f_firmsize(vgeo = c(vmsa, vstate), vsize = "_4wk",
                              date_min = dt_win$date1, date_max = dt_win$date3)
    dt_size <- merge(dt_size_2wk, dt_size_4wk, by = lvfirm, all = TRUE)
    print(nrow(dt_size))

    # Subset firms: a firm is in the base sample of a window if it is observed
    # in that window
    dt_size[, firm_base_2wk := 0]
    dt_size[, firm_base_4wk := 0]
    dt_size[!is.na(size_2wk), firm_base_2wk := 1]
    dt_size[!is.na(size_4wk), firm_base_4wk := 1]

    # Drop firms that do not have enough hours (hours equivalent to 1 full
    # time employee)
    dt_size[hours_2wk < 80, firm_base_2wk := 0]
    dt_size[hours_4wk < 160, firm_base_4wk := 0]

    # Flag msa and state with too few firms (<50)
    for (vgeo in c(vmsa, vstate)) {
      vsel1 <- paste0(vgeo, "_sel_2wk")
      vsel2 <- paste0(vgeo, "_sel_4wk")
      # Number of selected firms by geo
      dt_geo <- dt_size[, lapply(.SD, sum, na.rm = TRUE), by = c(vgeo),
                        .SDcols = c("firm_base_2wk", "firm_base_4wk")]
      dt_geo <- dt_geo[firm_base_2wk < 50 | firm_base_4wk < 50, ]
      # Mark geo regions to exclude
      dt_geo[firm_base_2wk < 50, (vsel1) := 0]
      dt_geo[firm_base_4wk < 50, (vsel2) := 0]
      dt_geo <- dt_geo[, .SD, .SDcols = c(vgeo, vsel1, vsel2)]
      # Merge the exclusion flags back onto the firm-level table
      dt_size <- merge(dt_size, dt_geo, by = c(vgeo), all.x = TRUE)
      # Mark geo regions to keep (everything not flagged for exclusion)
      dt_size[!is.na(firm_base_2wk) & is.na(get(vsel1)), (vsel1) := 1]
      dt_size[!is.na(firm_base_4wk) & is.na(get(vsel2)), (vsel2) := 1]
    }

    # Files built without geography replacement carry an "o" suffix
    fout <- if (geo_rep) {
      paste0("homebase_sel_firm_ind_geo_", iyear)
    } else {
      paste0("homebase_sel_firm_ind_geo_", iyear, "o")
    }
    saveRDS(dt_size, file = paste0(dir_clean, fout, ".rds"))
    write_dta(dt_size, paste0(dir_clean, fout, ".dta"))

  } # End of for (iyear in c(2018, 2019, 2020))

} # End of for (geo_rep in c(TRUE, FALSE))

print(paste("Ended at", Sys.time()))
# End of R script
