# cd /accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_data/
# sbatch --cpus-per-task=1 --mem=4g --export=ALL,script=data_1_cw_geo.R --partition=high zb_r.sh

# ---- Session setup -----------------------------------------------------------
# Command-line arguments: `sinfo` holds the full invocation, `args` only the
# trailing (user-supplied) arguments.
# NOTE(review): sinfo / sys_cluster / sys_batch are not referenced again in this
# file; presumably they are consumed by the sourced 0_directory.R -- confirm.
sinfo <- commandArgs(trailingOnly = FALSE)
args <- commandArgs(trailingOnly = TRUE)

print(args)

# Current user: take USERNAME when it is set, otherwise fall back to USER.
sys_user <- Sys.getenv("USERNAME")
if (sys_user == "") {
  sys_user <- Sys.getenv("USER")
}
sys_cluster <- Sys.getenv("SLURM_CLUSTER_NAME")
# 1 when the script runs non-interactively (batch), 0 in an interactive session.
sys_batch <- if (interactive()) 0 else 1

if (sys_user == "homebase") { # For replication: Replace "homebase" with current username
  dir_func <- "/accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_function/"
  # For replication: Replace with the path of the "code_function/" folder
}

# Fail with an actionable message instead of "object 'dir_func' not found"
# when the username block above has not been adapted to the current user.
if (!exists("dir_func")) {
  stop("dir_func is not defined for user '", sys_user,
       "'. Edit the username/path block above before running this script.",
       call. = FALSE)
}

source(paste0(dir_func, "0_directory.R"))
print(paste("Started at", Sys.time()))

library(data.table)
library(haven)

print(sessionInfo())

# dir_proj is not defined in this file; presumably set by 0_directory.R.
setwd(paste0(dir_proj))

#===============================================================================
# Fix MSA crosswalk

# Raw crosswalk: discard its own correction columns so that the corrected
# values come only from the previously fixed file merged in below.
dt_msa <- fread(paste0(dir_clean, "cw_geo_msa_raw.csv"))
dt_msa[, c("state_cor", "msa_cor") := NULL]

# Previously fixed crosswalk, source of state_cor / msa_cor.
dt_msao <- fread(paste0(dir_clean, "cw_geo_msa_fixed_orig.csv"))

# Full outer join on (state, msa): pairs present in only one file are kept.
dt_out <- merge(
  dt_msao[, .(state, msa, state_cor, msa_cor)],
  dt_msa,
  by = c("state", "msa"),
  all = TRUE
)

# List the pairs that have no corrected state or MSA yet.
print(dt_out[is.na(state_cor) | is.na(msa_cor)])

# Manually correct the row(s) of the global crosswalk `dt_out` identified by
# (state, msa).
#   istate  : value of `state` selecting the row(s) to correct
#   imsa    : value of `msa` selecting the row(s); also written to `msa_cor`
#   istatec : corrected state, written to `state_cor`
# `dt_out` is updated by reference. When no row matches, a message is logged so
# that a stale or mistyped correction is visible rather than silently ignored
# (message() rather than warning() so the run is never aborted by it).
f_rev <- function(istate, imsa, istatec) {
  n_match <- dt_out[state == istate & msa == imsa, .N]
  if (n_match == 0L) {
    message("f_rev: no row with state='", istate, "' and msa='", imsa,
            "'; nothing changed")
  }
  dt_out[state == istate & msa == imsa,
         c("state_cor", "msa_cor") := list(istatec, imsa)]
}

# Append a new (state, msa) pair to the crosswalk `dt_out` in the enclosing
# environment, with `state_cor` set to `istatec` and `msa_cor` set to `imsa`.
# Columns of `dt_out` that the new row lacks are filled with NA.
f_add <- function(istate, imsa, istatec) {
  new_row <- data.table(
    state = istate,
    msa = imsa,
    state_cor = istatec,
    msa_cor = imsa
  )
  dt_out <<- rbindlist(list(dt_out, new_row), use.names = TRUE, fill = TRUE)
}

# Hand corrections: each call sets the corrected state for one (state, msa)
# pair and keeps the MSA name as the corrected MSA.
f_rev(istate = "al", imsa = "Cleveland", istatec = "oh")
f_rev(istate = "co", imsa = "Las Vegas", istatec = "nv")
f_rev(istate = "co", imsa = "San Jose", istatec = "ca")
f_rev(istate = "oh", imsa = "Philadelphia", istatec = "pa")
f_rev(istate = "ri", imsa = "Houston", istatec = "tx")

f_rev(istate = "oh", imsa = "Washington DC", istatec = "va")
f_rev(istate = "Unclassified", imsa = "Kansas City", istatec = "mo")
f_rev(istate = "Unclassified", imsa = "Cleveland", istatec = "oh")

# Rows with corrected state "dc" and the residual MSA "All other" are
# reassigned to the Washington DC MSA.
dt_out[state_cor == "dc" & msa_cor == "All other", msa_cor := "Washington DC"]

# Re-list the pairs that still lack a corrected state or MSA.
print(dt_out[is.na(state_cor) | is.na(msa_cor)])

setorder(dt_out, msa, state)

# Drop the `change` column before writing the fixed crosswalk.
dt_out[, change := NULL]
fwrite(dt_out, file = paste0(dir_clean, "cw_geo_msa_fixed.csv"))


#===============================================================================
# Factorize geo variables

#-------------------------------------------------------------------------------
# State from NBER

# State table from the NBER crosswalk, without the region/division labels.
dt_st <- readRDS(paste0(dir_clean, "cw/cw_geo_nber_state.rds"))
dt_st <- dt_st[, !c("state_reg_str", "state_div_str")]

#-------------------------------------------------------------------------------
# MSA
# msa:  factor labelled with the Homebase MSA name
# msa2: factor labelled with the NBER MSA name
# msac: MSA code from NBER

# MSA from data: distinct corrected (state, msa) pairs of the fixed crosswalk.
dt_msad <- fread(paste0(dir_clean, "cw_geo_msa_fixed.csv"))
dt_msad <- unique(dt_msad[, .(state = state_cor, msa = msa_cor)])
# The residual bucket "All other" is replaced by the upper-cased state value.
dt_msad[msa == "All other", msa := toupper(state)]
dt_msad <- unique(dt_msad[, .(msa)])

# Residual MSAs are the two-character names plus the two special labels;
# everything else is treated as an actual MSA.
is_resid <- dt_msad[, nchar(msa) == 2 | msa %in% c("NOT USA", "UNCLASSIFIED")]
dt_msast <- dt_msad[is_resid] # Residual MSA
dt_msad <- dt_msad[!is_resid] # Actual MSA

#-------------------
# For residual MSAs: attach the NBER state name and code by matching the
# residual name against st_str (left join, unmatched rows get NA).
dt_msast <- merge(
  dt_msast,
  dt_st[, .(st_str, st2_str, stssa)],
  by.x = "msa",
  by.y = "st_str",
  all.x = TRUE
)
setnames(
  dt_msast,
  old = c("msa", "st2_str", "stssa"),
  new = c("msa_str", "msa_nberstr", "msa_nberc")
)
dt_msast[, msa_nberstr := toupper(msa_nberstr)]

#-------------------
# For actual MSAs: msa2 is the upper-cased name used as the match key against
# NBER; msa_str keeps the name as it appears in the data.
dt_msad <- dt_msad[, .(msa2 = toupper(msa), msa_str = msa)]

# MSA from nber
# Distinct (code, name) pairs from the NBER crosswalk.
dt_nber <- readRDS(paste0(dir_clean, "cw/cw_geo_nber.rds"))
dt_msan <- unique(dt_nber[, .(msa, msa_str)])
# Keep codes of at least 100 plus the codes 40, 60 and 80.
dt_msan <- dt_msan[msa >= 100 | msa %in% c(40, 60, 80)]
# msa2 = the part of the NBER name before the comma (name unchanged when the
# pattern does not match); code and name columns get their NBER-specific names.
dt_msan <- dt_msan[, .(
  msa_nberc = msa,
  msa_nberstr = msa_str,
  msa2 = sub("^([[:upper:][:space:]\\.\\-]+),[[:upper:][:space:]\\-]+$", "\\1", msa_str)
)]

# Count NBER rows per short name and show the names that are not unique.
dt_sum <- dt_msan[, .(nobs = .N), by = "msa2"]
print(dt_sum[nobs > 1])
dt_msan <- merge(dt_msan, dt_sum, by = "msa2", all.x = TRUE)

# Attach the NBER name and code to the data MSAs, using only short names that
# identify exactly one NBER row (left join).
dt_msad <- merge(
  dt_msad,
  dt_msan[nobs == 1, .(msa2, msa_nberstr, msa_nberc)],
  by = "msa2",
  all.x = TRUE
)
setorder(dt_msad, msa_nberc, msa_str)

# Manually fix some MSA names
# Manually assign an NBER MSA name to one MSA of the global table `dt_msad`.
#   msao : upper-cased MSA name from the data, matched against `msa2`
#   msan : NBER MSA name written to `msa_nberstr`
# `dt_msad` is updated by reference; only `msa_nberstr` is touched. When no row
# matches, a message is logged so that a mistyped or obsolete override is
# visible rather than silently ignored (message() rather than warning() so the
# run is never aborted by it).
f_msan <- function(msao, msan) {
  n_match <- dt_msad[msa2 == msao, .N]
  if (n_match == 0L) {
    message("f_msan: no row with msa2='", msao, "'; nothing changed")
  }
  dt_msad[msa2 == msao, msa_nberstr := msan]
}
# f_msan("","")
# Manual overrides: msao is the upper-cased MSA name from the data, msan the
# NBER MSA name assigned to it.
f_msan(msao = "AUSTIN", msan = "AUSTIN-SAN MARCOS, TX")
f_msan(msao = "BOSTON", msan = "BOSTON-WORCESTER-LAWRENCE-LOWELL-BROCKTON, MA")
f_msan(msao = "BUFFALO", msan = "BUFFALO-NIAGARA FALLS, NY")
f_msan(msao = "CHARLOTTE", msan = "CHARLOTTE-GASTONIA-ROCK HILL, NC-SC")
f_msan(msao = "CLEVELAND", msan = "CLEVELAND-LORAIN-ELYRIA, OH")
f_msan(msao = "COLUMBUS", msan = "COLUMBUS, OH")
f_msan(msao = "JACKSONVILLE", msan = "JACKSONVILLE, FL")
f_msan(msao = "LOS ANGELES", msan = "LOS ANGELES-LONG BEACH, CA")
f_msan(msao = "MILWAUKEE", msan = "MILWAUKEE-WAUKESHA, WI")
f_msan(msao = "MINNEAPOLIS", msan = "MINNEAPOLIS-ST. PAUL, MN-WI")
f_msan(msao = "PHOENIX", msan = "PHOENIX-MESA, AZ")
f_msan(msao = "PROVIDENCE", msan = "PROVIDENCE-WARWICK-PAWTUCKET, RI")
f_msan(msao = "RALEIGH", msan = "RALEIGH-DURHAM-CHAPEL HILL, NC")
f_msan(msao = "RICHMOND", msan = "RICHMOND-PETERSBURG, VA")
f_msan(msao = "RIVERSIDE", msan = "RIVERSIDE-SAN BERNADINO, CA")
f_msan(msao = "SALT LAKE CITY", msan = "SALT LAKE CITY-OGDEN, UT")
f_msan(msao = "SEATTLE", msan = "SEATTLE-BELLEVUE-EVERETT, WA")
f_msan(msao = "ST. LOUIS", msan = "ST. LOUIS, MO-IL")
f_msan(msao = "TAMPA", msan = "TAMPA-ST. PETERSBURG-CLEARWATER, FL")
f_msan(msao = "VIRGINIA BEACH", msan = "NORFOLK-VIRGINIA BEACH-NEWPORT NEWS, VA-NC")
f_msan(msao = "WASHINGTON DC", msan = "WASHINGTON, DC-MD-VA-WV")
f_msan(msao = "NEW YORK", msan = "NEW YORK-NEWARK, NY-NJ-PA")

# Build the final MSA lookup: one row per MSA with two factor columns (Homebase
# label, NBER label), their string versions, and the NBER code.
print("Now Export MSA")

# Re-attach the NBER code by NBER name, so that names set manually via f_msan()
# pick up their code. No `by` is given; the only column the two inputs share is
# msa_nberstr. Left join: data MSAs without an NBER match keep an NA code.
dt_msa <- merge(dt_msad[,c("msa_str","msa_nberstr")], dt_msan[,c("msa_nberstr","msa_nberc")], all.x=T)
# Stack the residual MSAs on top of the actual MSAs.
dt_msa <- rbind(dt_msast, dt_msa)
setorder(dt_msa, msa_nberc)

# msa: factor built from the NBER code whose level labels are then overwritten,
# by reference, with the Homebase names in current row order.
# NOTE(review): this relabelling lines up only if, after the sort above, there
# is exactly one row per distinct non-missing msa_nberc -- confirm for the data.
dt_msa[, msa:=factor(msa_nberc)]
setattr(dt_msa$msa, "levels", dt_msa$msa_str)
# Sanity check: number of rows whose factor label differs from msa_str
# (presumably expected to be 0).
print(nrow(dt_msa[as.character(msa)!=msa_str]))

# msa2: same construction, labelled with the NBER names.
dt_msa[, msa2:=factor(msa_nberc)]
setattr(dt_msa$msa2, "levels", dt_msa$msa_nberstr)
# Sanity check: number of rows whose factor label differs from msa_nberstr.
print(nrow(dt_msa[as.character(msa2)!=msa_nberstr]))

# msac: the NBER code itself; only the two factors and the code are kept.
dt_msa[, msac:=msa_nberc]
dt_msa <- dt_msa[, c("msa","msa2","msac")]

# String versions are re-derived from the factor labels.
dt_msa[, msa_str:=as.character(msa)]
dt_msa[, msa2_str:=as.character(msa2)]
setcolorder(dt_msa, c("msa","msa_str","msa2","msa2_str","msac"))

# Export the lookup with factor columns as RDS and Stata files.
saveRDS(dt_msa, paste0(dir_clean, "cw_geo_msa.rds"))
write_dta(dt_msa, paste0(dir_clean, "cw_geo_msa.dta"))

# CSV version: the factor columns are replaced by their integer codes, since a
# CSV would otherwise only repeat the labels already stored in msa_str/msa2_str.
dt_msac <- copy(dt_msa)
dt_msac[, msa := as.integer(msa)]
dt_msac[, msa2 := as.integer(msa2)]

# Fix: write the integer-coded copy. The original wrote dt_msa here, which left
# dt_msac unused and exported the factor labels instead of the codes.
# NOTE(review): confirm that downstream readers of this CSV expect integer codes.
fwrite(dt_msac, paste0(dir_cleanc, "cw_geo_msa.csv"))

print(paste("Ended at", Sys.time()))
# End of R script
