# cd /accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_data/
# sbatch --cpus-per-task=6 --mem=8g --export=ALL,script=cw_geo_nber.R --partition=high zb_r.sh

# Full command line of the R process and the user-supplied trailing arguments.
sinfo <- commandArgs(trailingOnly = FALSE)
args <- commandArgs(trailingOnly = TRUE)

print(args)

# Runtime context: user name (USERNAME if set, otherwise USER), the SLURM
# cluster name (empty string when the variable is unset) and a 0/1 flag that
# is 1 when the session is not interactive.
sys_user <- Sys.getenv("USERNAME")
if (sys_user == "") {
  sys_user <- Sys.getenv("USER")
}
sys_cluster <- Sys.getenv("SLURM_CLUSTER_NAME")
sys_batch <- if (interactive()) 0 else 1

if (sys_user == "homebase") { # For replication: Replace "homebase" with current username
  dir_func <- "/accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_function/"
  # For replication: Replace with the path of the "code_function/" folder
}

# dir_func is only assigned for the user matched above. Stop with an
# actionable message instead of the opaque "object 'dir_func' not found"
# that source() would otherwise raise.
if (!exists("dir_func")) {
  stop(
    "dir_func is not set for user '", sys_user, "'. ",
    "Edit the user check in this script so that dir_func points to the ",
    "\"code_function/\" folder.",
    call. = FALSE
  )
}

# dir_proj, dir_clean and dir_raw are used below but not defined in this
# file; presumably 0_directory.R defines them -- confirm against that script.
source(paste0(dir_func, "0_directory.R"))
print(paste("Started at", Sys.time()))

library(data.table)
library(haven)

print(sessionInfo())

setwd(dir_proj)

# Output folder for the cleaned crosswalk files.
dir_cw <- paste0(dir_clean, "cw/")

#===============================================================================

# Raw NBER CBSA-to-county crosswalk.
dt_nber <- fread(paste0(dir_raw, "cw/cbsatocountycrosswalk.csv"))
# Show rows with msa < 100 whose msa code differs from the state SSA code.
print(dt_nber[msa < 100 & msa != ssast, .(msaname, msa, ssast, fipst)])
# MSA codes for residual of states are based on ssast codes

#-------------------------------------------------------------------------------
# Clean States

# One row per distinct (state, FIPS state code, SSA state code) combination.
dt_st <- unique(dt_nber[, .(state, stfips = fipst, stssa = ssast)])

# Drop the PR row from the crosswalk and append the territories (including PR)
# plus two catch-all categories with hand-assigned codes.
dt_st <- rbind(
  dt_st[state != "PR"],
  data.table(
    state = c("AS", "GU", "MP", "PR", "VI", "NOT USA", "UNCLASSIFIED"),
    stfips = c(60L, 66L, 69L, 72L, 78L, 98L, 99L),
    stssa = c(64L, 65L, 97L, 40L, 48L, 98L, 99L)
  )
)
# fips 98, 99 not original codes
# ssa 98, 99 modified definition

# msaname values without a comma are attached as the long state name
# (presumably these are the rest-of-state rows -- confirm against the raw file).
dt_st <- merge(
  dt_st,
  unique(dt_nber[!grepl(",", msaname), .(stfips = fipst, st2_str = msaname)]),
  by = "stfips",
  all.x = TRUE
)
setnames(dt_st, "state", "st_str")

# Fix missing or duplicates state strings
print(dt_st[is.na(st2_str) | st2_str == "" | grepl("[^[:alpha:] ]", st2_str), ])
# Overwrite the long name for the listed state codes with an upper-cased
# hand-written name, using an update join on st_str.
dt_st[
  data.table(
    st_str = c("AK", "CT", "DC", "NJ", "RI", "AS", "GU", "MP", "PR", "VI",
               "NOT USA", "UNCLASSIFIED"),
    fix_str = toupper(c("ALASKA", "CONNECTICUT", "DISTRICT OF COLUMBIA",
                        "NEW JERSEY", "RHODE ISLAND", "American Samoa", "Guam",
                        "Northern Mariana Islands", "Puerto Rico",
                        "Virgin Islands", "Not USA", "Unclassified"))
  ),
  on = "st_str",
  st2_str := i.fix_str
]
dt_st <- unique(dt_st)

# Make factors
# st and st2 are created from stfips and then relabelled in place: the factor
# codes follow ascending stfips, the labels come from st_str / st2_str in row
# order. This relies on dt_st being sorted by stfips with one row per stfips.
setorder(dt_st, stfips)
dt_st[, st := factor(stfips)]
dt_st[, st2 := factor(stfips)]
setattr(dt_st$st, "levels", dt_st$st_str)
setattr(dt_st$st2, "levels", dt_st$st2_str)
# Check if all matched: each count is the number of rows whose factor label
# differs from the corresponding string column, so both should print 0.
# (st must be compared with st_str; comparing st with itself is always equal.)
print(nrow(dt_st[as.character(st) != st_str]))
print(nrow(dt_st[as.character(st2) != st2_str]))

# Organize state order
setcolorder(dt_st, c("st", "st2", "stfips", "stssa", "st_str", "st2_str"))

# Add Census regions/divisions
# State-to-region/division table, renamed to the naming used in this script.
dt_div <- fread(paste0(dir_raw, "cw/cw_state_census_division.csv"))
setnames(
  dt_div,
  old = c("State", "State Code", "Region", "Division"),
  new = c("state_str", "state", "state_reg_str", "state_div_str")
)

# Lookup from region/division names to their numeric codes, sorted by code so
# that the name vectors below line up with the factor levels.
dt_fac <- fread(paste0(dir_raw, "cw/cw_state_census_division_fac.csv"))
setorderv(dt_fac, c("state_reg", "state_div"))

dt_div <- merge(
  dt_div,
  dt_fac,
  by = c("state_reg_str", "state_div_str"),
  all = TRUE
)

# Turn the numeric codes into factors, then relabel the levels in place with
# the region/division names.
dt_div[, `:=`(
  state_reg = factor(state_reg),
  state_div = factor(state_div)
)]
setattr(dt_div$state_reg, "levels", unique(dt_fac[["state_reg_str"]]))
setattr(dt_div$state_div, "levels", unique(dt_fac[["state_div_str"]]))
setcolorder(
  dt_div,
  c("state", "state_str", "state_reg", "state_reg_str", "state_div", "state_div_str")
)

# Attach region/division to every state row (left join on the state code);
# the division table's own state name column is left out.
dt_sto <- merge(
  dt_st,
  dt_div[, !"state_str"],
  by.x = "st_str",
  by.y = "state",
  all.x = TRUE
)
setcolorder(dt_sto, c("st", "st2", "stfips", "stssa", "st_str", "st2_str"))
setorder(dt_sto, stfips)

# RDS and Stata outputs are written while the factor columns are still factors.
saveRDS(dt_sto, paste0(dir_cw, "cw_geo_nber_state.rds"))
haven::write_dta(dt_sto, paste0(dir_cw, "cw_geo_nber_state.dta"))

# The CSV output stores the factor columns as their integer codes.
dt_sto[, `:=`(
  st = as.integer(st),
  st2 = as.integer(st2),
  state_reg = as.integer(state_reg),
  state_div = as.integer(state_div)
)]
fwrite(dt_sto, paste0(dir_cw, "cw_geo_nber_state.csv"))
rm(dt_sto, dt_div, dt_fac)

#-------------------------------------------------------------------------------
# Clean fips and MSA

# Distinct county rows from the crosswalk, renamed on selection.
dt_fips <- unique(
  dt_nber[, .(
    fips_str = countyname,
    fips = fipscounty,
    msa,
    msa_str = msaname,
    st_str = state
  )]
)

# Show rows that have no MSA name.
dt_view <- dt_fips[is.na(msa_str) | msa_str == ""]
print(dt_view)
# Name the statewide row with msa code 31.
dt_fips[fips_str == "STATEWIDE" & msa == 31, msa_str := "NEW JERSEY"]

# Add additional ones from AS, GU, MP, and VI
# PR rows are excluded from this file.
dt_fipsa <- fread(paste0(dir_raw, "cw/cw_fips_nrcs.csv"))
dt_fipsa <- dt_fipsa[!(st_str %in% "PR")]
dt_fipsa[, fips_str := toupper(fips_str)]
# Hand-set names (and, for 66010, the state code) for individual FIPS codes.
dt_fipsa[fips == 60020, fips_str := "MANUA-OFU-OLOSEGA"]
dt_fipsa[fips == 66010, `:=`(fips_str = "GUAM/COCOS ISLAND", st_str = "GU")]
dt_fipsa[fips == 69120, fips_str := "AGUIJAN-TINIAN"]
# Columns missing from the appended rows are filled with NA.
dt_fips <- rbind(dt_fips, unique(dt_fipsa), fill = TRUE)

# Full outer join with the state table, so state rows without any county
# row are kept as well.
dt_fips <- merge(dt_fips, dt_st, by = "st_str", all = TRUE)
setcolorder(
  dt_fips,
  c("fips", "fips_str", "msa", "msa_str", "st", "st2", "stfips", "stssa", "st_str", "st2_str")
)
setorder(dt_fips, fips, stfips)

# Rows without an MSA code take the long state name as MSA name and the state
# SSA code as MSA code.
dt_fips[is.na(msa), `:=`(msa_str = as.character(st2), msa = stssa)]
# The two catch-all state codes use the state FIPS code as county FIPS code.
dt_fips[stfips %in% c(98, 99), fips := stfips]

# RDS and Stata outputs are written while st and st2 are still factors.
saveRDS(dt_fips, paste0(dir_cw, "cw_geo_nber.rds"))
haven::write_dta(dt_fips, paste0(dir_cw, "cw_geo_nber.dta"))

# The CSV output stores the factor columns as their integer codes.
dt_fips[, `:=`(st = as.integer(st), st2 = as.integer(st2))]
fwrite(dt_fips, paste0(dir_cw, "cw_geo_nber.csv"))

print(paste("Ended at", Sys.time()))
# End of R script
