# cd /accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_data/
# sbatch --cpus-per-task=8 --mem=64g --export=ALL,script=data_1_cw_geo_raw.R,param="" --partition=high zb_r.sh

# ---- Session setup -----------------------------------------------------------
# Capture the command line: `sinfo` is the full invocation, `args` holds only
# the trailing (user-supplied) arguments.
sinfo <- commandArgs(trailingOnly = FALSE)
args <- commandArgs(trailingOnly = TRUE)

print(args)

# Current user: USERNAME is checked first, USER is the fallback when it is empty.
sys_user <- Sys.getenv("USERNAME")
if (sys_user == "") {
  sys_user <- Sys.getenv("USER")
}
sys_cluster <- Sys.getenv("SLURM_CLUSTER_NAME")
# 1 when run non-interactively, 0 in an interactive session.
sys_batch <- if (interactive()) 0 else 1

if (sys_user == "homebase") { # For replication: Replace "homebase" with current username
  dir_func <- "/accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_function/"
  # For replication: Replace with the path of the "code_function/" folder
}

# Fail with an actionable message instead of "object 'dir_func' not found"
# when the user block above has not been adapted to the current account.
if (!exists("dir_func")) {
  stop(
    "dir_func is not defined for user '", sys_user, "'. ",
    "Edit the username and the \"code_function/\" path near the top of this script.",
    call. = FALSE
  )
}

# 0_directory.R is expected to define the project paths used below
# (dir_proj, dir_raw, dir_clean) -- NOTE(review): confirm against that file.
source(paste0(dir_func, "0_directory.R"))
print(paste("Started at", Sys.time()))

library(data.table)

# Record R and package versions in the job log.
print(sessionInfo())

setwd(paste0(dir_proj))

#===============================================================================
# Raw data

# List every file under dir_raw (recursively) and keep, in sorted order, the
# paths that contain the extract tag "20180101_to_20200704".
lcsv <- list.files(dir_raw, recursive = TRUE)
lcsv <- sort(lcsv[grepl("20180101_to_20200704", lcsv, fixed = TRUE)])

# Stop early: without any input file the steps below cannot produce output.
if (length(lcsv) == 0) {
  stop(
    "No raw file matching '20180101_to_20200704' found under ", dir_raw,
    call. = FALSE
  )
}

# Read each file (logging its name) and stack them in a single pass.
# rbindlist() on the full list avoids recopying the accumulated table on every
# iteration; use.names = TRUE matches columns by name, as rbind() did.
dt_raw <- rbindlist(
  lapply(lcsv, function(icsv) {
    print(icsv)
    fread(paste0(dir_raw, icsv))
  }),
  use.names = TRUE
)

print(paste0("Date Range: ", min(dt_raw$event_date), " - ", max(dt_raw$event_date)))

#-------------------------------------------------------------------------------
# MSA

# Number of raw observations for each (state, msa) pair.
# The count is stored as a double, exactly as a sum of 1's would be.
dt_msa <- dt_raw[, .(n_obs = as.numeric(.N)), by = c("state", "msa")]

# Flag potentially erroneous state (value longer than two characters)
dt_msa[nchar(state) > 2, flag_wstate := 1]

# Flag msa with multiple states: flag_nstate is the number of distinct states
# an msa appears with; only msa's with more than one state (and other than
# "All other") carry the flag into dt_msa.
dt_msas <- dt_msa[, .(flag_nstate = as.numeric(.N)), by = "msa"]
dt_msa <- merge(
  dt_msa,
  dt_msas[flag_nstate > 1 & !(msa %in% "All other"), ],
  by = "msa",
  all.x = TRUE
)

# Correction columns start out as copies of the raw values; `change` is
# initialised to NA.
dt_msa[, `:=`(state_cor = state, msa_cor = msa, change = NA)]

setorderv(dt_msa, c("msa", "state"))
setcolorder(dt_msa, c("state", "msa", "state_cor", "msa_cor"))
fwrite(dt_msa, file = paste0(dir_clean, "cw_geo_msa_raw.csv"))

#-------------------------------------------------------------------------------
# state, msa, fips, zip -- all geo variables

# One row per distinct (state, msa, county_code, zip) combination found in the
# raw data, with `nobs` holding the number of raw rows in that combination.
dt_geo <- dt_raw[, list(nobs = .N), by = .(state, msa, county_code, zip)]

fwrite(dt_geo, file = paste0(dir_clean, "cw_geo_raw.csv"))

print(paste("Ended at", Sys.time()))
# End of R script
