# cd /accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_data/
# sbatch --cpus-per-task=8 --mem=16g --export=ALL,script=cbp_1_raw.R --partition=high zb_r.sh

# Session bootstrap: record how the script was launched, locate the shared
# function folder, source the directory definitions, load packages and move to
# the project directory.

sinfo <- commandArgs(trailingOnly = FALSE) # complete command line
args <- commandArgs(trailingOnly = TRUE)   # only the trailing (user) arguments

# Use USERNAME when it is set, otherwise fall back to USER
sys_user <- Sys.getenv("USERNAME")
if (sys_user == "") {
  sys_user <- Sys.getenv("USER")
}
sys_cluster <- Sys.getenv("SLURM_CLUSTER_NAME")
sys_batch <- if (interactive()) 0 else 1 # 1 when run non-interactively

if (sys_user=="homebase") { # For replication: Replace "homebase" with current username
  dir_func <- "/accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_function/"
  # For replication: Replace with the path of the "code_function/" folder
}

# Without this guard an unrecognised user only gets "object 'dir_func' not
# found" from source() below. A dir_func defined beforehand (e.g. in an
# interactive session) is still accepted.
if (!exists("dir_func")) {
  stop("dir_func is not defined for user '", sys_user, "'. ",
       "Edit the username check and the code_function/ path at the top of this script.",
       call. = FALSE)
}

# 0_directory.R is expected to define the dir_* paths used below
# (dir_proj, dir_raw, dir_clean)
source(paste0(dir_func, "0_directory.R"))
print(paste("Started at", Sys.time()))

library(data.table)

print(sessionInfo())
setwd(paste0(dir_proj))

#===============================================================================

# Load the state crosswalk and the CBP geography reference, derive a state
# name for every CBP record, and cross-check the two sources by state FIPS.

dir_cbpr <- paste0(dir_raw,"census/cbp2018/")
dir_cen <- paste0(dir_clean,"census/")

dt_geo <- readRDS(paste0(dir_clean,"cw/cw_geo_nber_state.rds"))
dt_geor <- fread(paste0(dir_cbpr,"georef17.txt"))
setnames(dt_geor, "st","stfips")
dt_geor[,fips:=stfips*1000+cty]
# State name = trailing run of letters/spaces that follows a ", " in ctyname,
# upper-cased. sub() returns its input unchanged when the pattern does not
# match, so an unparsable name shows up in the mismatch check, not as NA.
dt_geor[,st2_str:=toupper(sub("^.*?, ([[:alpha:][:space:]]+)$","\\1",ctyname))]

# Double check the state FIPS codes are the same
dt_str <- unique(dt_geor[,c("stfips","st2_str")])
if (nrow(dt_str[is.na(st2_str)])!=0) { print("ERROR: EMPTY STATE NAME") }
# Both tables carry st2_str, so the merge yields st2_str.x (crosswalk) and
# st2_str.y (CBP)
dt_view <- merge(dt_geo, dt_str, by=c("stfips"), all=TRUE)

dt_mismatch <- dt_view[st2_str.x!=toupper(st2_str.y) & !is.na(st2_str.y),
                       c("stfips","st2_str.x","st2_str.y")]
if (nrow(dt_mismatch)!=0) {
  print("ERROR: MISMATCHED STATES")
  print(dt_mismatch)
}

# A CBP state with no crosswalk name gives NA in the comparison above and
# would otherwise pass unnoticed
dt_unmatched <- dt_view[is.na(st2_str.x) & !is.na(st2_str.y), c("stfips","st2_str.y")]
if (nrow(dt_unmatched)!=0) {
  print("ERROR: CBP STATES NOT IN CROSSWALK")
  print(dt_unmatched)
}

print("States not in CBP")
print(dt_view[is.na(st2_str.y),c("stfips","st2")])

#===============================================================================

# Extract and load the 2018 state-level CBP file, tidy its column names and
# attach the state identifier from the crosswalk.

unzip(paste0(dir_cbpr,"cbp18st.zip"), exdir = dir_cbpr, setTimes = TRUE)
# Read from the extraction folder itself so the two locations cannot diverge
dt_raw <- fread(paste0(dir_cbpr,"cbp18st.txt"))

# Replace "<5" in column names with "1_5"
lvar <- names(dt_raw)
lvarn <- gsub("<5","1_5",lvar)
setnames(dt_raw,lvar,lvarn)
setnames(dt_raw, "fipstate", "stfips")

# Remember the raw row order; it is restored from rowid at export
dt_raw[,rowid:=.I]
lvar <- names(dt_raw)

#-------------------------------------------------------------------------------
# geo and naics

# all.y keeps every CBP row, including states absent from the crosswalk
dt_raw <- merge(dt_geo[,c("st","stfips")], dt_raw, by=c("stfips"), all.y=TRUE)
# A repeated stfips in the crosswalk would silently duplicate CBP rows
if (anyDuplicated(dt_raw[["rowid"]]) > 0) {
  warning("Crosswalk has repeated stfips: CBP rows were duplicated by the merge",
          call. = FALSE)
}
# dt_view <- unique(dt_raw[,c("naics")])
# dt_raw[, naics:=gsub("[/-]","0",naics)]
# dt_view <- unique(dt_raw[,c("naics")])

#-------------------------------------------------------------------------------
# Factorize selected variables

# Recode one coded column of the global data.table `dt_raw` into descriptive
# text labels.
#
# Arguments:
#   vfac  - name (character scalar) of the column of `dt_raw` to recode.
#   nlfac - named list mapping label -> raw code, e.g. list("Government"="G").
#
# Side effects: `dt_raw` is modified by reference. The recoded column keeps the
# name `vfac`, holds the labels as character strings (not an R factor) and is
# moved to the last column position. Raw values without an entry in `nlfac`
# become NA; a warning lists them. Frequency tables of the column are printed
# before and after recoding.
f_fac <- function(vfac, nlfac) {

  if (length(nlfac) == 0 || is.null(names(nlfac))) {
    stop("f_fac: 'nlfac' must be a non-empty named list of label = code pairs",
         call. = FALSE)
  }
  fac_labels <- names(nlfac)
  # levels(lfac) <- unlist(unname(nlfac)) # If name and list are reversed

  print(vfac)
  print(table(dt_raw[[vfac]], useNA = "always"))

  # Report raw codes the mapping does not cover instead of dropping them silently
  fac_seen <- unique(dt_raw[[vfac]])
  fac_unmapped <- setdiff(fac_seen[!is.na(fac_seen)],
                          unlist(nlfac, use.names = FALSE))
  if (length(fac_unmapped) > 0) {
    warning("f_fac: values of '", vfac, "' without a label are set to NA: ",
            paste(fac_unmapped, collapse = ", "), call. = FALSE)
  }

  # Park the raw codes under a temporary name so the labelled column can take
  # over the original one
  setnames(dt_raw, vfac, "vraw")
  for (ifac in seq_along(nlfac)) {
    fac_code <- nlfac[[ifac]]
    fac_label <- fac_labels[ifac]
    dt_raw[vraw == fac_code, (vfac) := fac_label]
  }
  print(table(dt_raw[[vfac]], useNA = "always"))
  dt_raw[, vraw := NULL]
}

#---------------------------------------
# lfo
# '-' - All Establishments                        
# C - C-Corporations and other corporate legal forms of organization
# Z - S-Corporations
# S - Sole Proprietorships
# P - Partnerships
# N - Non-Profits
# G - Government
# O - Other

# Label -> raw code mapping for the legal form of organization, then recode
# the lfo column in place
nlfac <- as.list(c(
  "C-Corporations and other corporate legal forms of organization" = "C",
  "S-Corporations" = "Z",
  "Sole Proprietorships" = "S",
  "Partnerships" = "P",
  "Non-Profits" = "N",
  "Government" = "G",
  "Other" = "O",
  "All Establishments" = "-"
))
f_fac(vfac = "lfo", nlfac = nlfac)

#---------------------------------------
# Noise fields
# G       0 to < 2% noise (low noise)
# H       2 to < 5% noise (medium noise)
# J       >= 5% noise (high noise)
# N       Not available or not comparable. Employment or payroll field set to zero.

# Label -> raw code mapping shared by all noise-flag columns.
# NOTE(review): the layout comment above gives flag J as
# ">= 5% noise (high noise)"; the label here omits "(high noise)" and is left
# unchanged because it is stored in the output data.
nlfac <- as.list(c(
  "0 to < 2% noise (low noise)" = "G",
  "2 to < 5% noise (medium noise)" = "H",
  ">= 5% noise" = "J",
  "Not available or not comparable. Employment or payroll field set to zero." = "N"
))

# Noise-flag columns are the ones whose name ends in "nf"
lvnf <- grep("nf$", lvar, value = TRUE)
cat(lvnf)

# Recode each noise-flag column in turn
for (ivnf in lvnf) {
  f_fac(vfac = ivnf, nlfac = nlfac)
}

#-------------------------------------------------------------------------------
# Export

# State identifier first, then the columns in the order recorded in lvar
setcolorder(dt_raw, c("st", lvar))
# Back to the row order of the raw file
setorder(dt_raw, rowid)
saveRDS(dt_raw, file = paste0(dir_cen, "cbp_2018_st.rds"))

print(paste("Ended at", Sys.time()))
# End of R script
