# cd /accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_data/
# sbatch --cpus-per-task=6 --mem=8g --export=ALL,script=data_1_cw_geo_improved.R --partition=high zb_r.sh

sinfo <- commandArgs(trailingOnly = F)
args <- commandArgs(trailingOnly = T)

print(args)

sys_user <- ifelse(Sys.getenv("USERNAME")!="", Sys.getenv("USERNAME"), Sys.getenv("USER"))
sys_cluster <- Sys.getenv("SLURM_CLUSTER_NAME")
sys_batch <- ifelse(interactive(), 0, 1)

if (sys_user=="homebase") { # For replication: Replace "homebase" with current username
  dir_func <- "/accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_function/"
  # For replication: Replace with the path of the "code_function/" folder
}

source(paste0(dir_func, "0_directory.R"))
print(paste("Started at", Sys.time()))

library(data.table)
library(haven)

print(sessionInfo())

setwd(paste0(dir_proj))

dir_cw <- paste0(dir_clean, "cw/")

#===============================================================================

#-------------------------------------------------------------------------------
# External crosswalks

# NBER crosswalk
dt_fips <- readRDS(paste0(dir_cw, "cw_geo_nber.rds"))
dt_fips[, dup:=.N, by=c("fips")]
print("Duplicated FIPS")
print(dt_fips[dup!=1,])

# HUD crosswalk
dt_zipq <- readRDS(paste0(dir_cw,"cw_geo_zip_year_qtr.rds"))[year==2020 & qtr==1]
dt_zipa <- readRDS(paste0(dir_cw,"cw_geo_zip.rds"))
dt_zipq[,fips:=as.integer(fips)]
dt_zipa[,fips:=as.integer(fips)]

dt_zipe <- dt_zipa[! as.integer(fips) %in% dt_fips$fips, ]
print("FIPS in HUD but not NBER")
print(dt_zipe)


#-------------------------------------------------------------------------------
# Raw fips

dt_geo <- fread(paste0(dir_clean, "cw_geo_raw.csv"))
# dt_geo <- dt_geo[! state %in% c("Not USA","Unclassified")]
dt_view <- dt_geo[grepl("[[:alpha:]]", zip)]
dt_view <- dt_view[, lapply(.SD, sum, na.rm=T), by=c("state"), .SDcols=c("nobs")]
print("Non-numbers in zip codes:")
print(dt_view)
dt_geo <- dt_geo[!grepl("[[:alpha:]]", zip)]
setnames(dt_geo, c("county_code"),c("fips"))

# Fixed state-MSA using our manual crosswalk
dt_msaf <- fread(paste0(dir_clean, "cw_geo_msa_fixed.csv"))
dt_msaf <- dt_msaf[state!=state_cor | msa!=msa_cor, ]

if (nrow(dt_msaf)>0) {
  for (irow in c(1:nrow(dt_msaf))) {
    vstate <- dt_msaf[irow, state]
    vmsa <- dt_msaf[irow, msa]
    vstate_cor <- dt_msaf[irow, state_cor]
    vmsa_cor <- dt_msaf[irow, msa_cor]
    
    dt_geo[state==vstate & msa==vmsa, c("state", "msa"):=list(vstate_cor, vmsa_cor)]
  }
}

dt_geo <- dt_geo[, lapply(.SD, sum, na.rm=T), by=c("state","msa","fips","zip"), .SDcols=c("nobs")]
dt_view <- dt_geo[state %in% c("Not USA","Unclassified")]
# Some of these observations are assigned fips codes
rm(dt_msaf)

#-------------------------------------------------------------------------------
# Match fips with Homebase fips

dt_geom <- merge(dt_geo[, lapply(.SD, sum, na.rm=T), by=c("fips","state","msa"), .SDcols=c("nobs")], dt_fips, by=c("fips"), all.x=T)
print(dt_geom[!is.na(fips) & is.na(fips_str),])
# The rest missings are those with zip codes but not fips codes

dt_geom <- merge(dt_geo, dt_fips, by=c("fips"), all.x=T)
print(nrow(dt_geom[toupper(st)!=st_str])) # Check if state matches: very few of them
dt_view <- dt_geom[toupper(st)!=st_str] # It seems that the fips codes are more accurate?

# dt_geoz <- dt_geom[is.na(fips)]

#-------------------------------------------------------------------------------
# Match fips with HUD fips

dt_geoh <- copy(dt_geo)

# Check and remove duplicated zip-fips
dt_view <- copy(dt_geoh)
dt_view[, ndup:=.N, by=c("fips","zip")]
dt_view <- dt_view[ndup!=1]
setorderv(dt_view, c("zip","fips"))
nrow(dt_view[is.na(fips)])
# Most have missing fips, some may have been assigned wrong states (which causes duplicates)

dt_geoh <- dt_geoh[, lapply(.SD, sum, na.rm=T), by=c("fips","zip"), .SDcols=c("nobs")] # Remove duplicates
setnames(dt_geoh, "fips","fipshb")
dt_geoh[,ziphb:=zip]
dt_geoh[,zip:=sub("^(\\d{5}).*?$","\\1",trimws(zip))]
dt_geoh <- merge(dt_geoh, dt_zipq, by=c("zip"), all.x=T) # Merge with zip from 2020Q1 since this is our focus

# no fipshb, but fipshud
print("Rows with no FIPS-HB but FIPS-HUD")
dt_view <- dt_geoh[is.na(fipshb) & !is.na(fips)]
print(nrow(dt_view))

# still no fips
print("Rows with no FIPS-HB or FIPS-HUD")
dt_view <- dt_geoh[is.na(fipshb) & is.na(fips)]
print(nrow(dt_view))

# Let's see if any of these zip codes are from the previous years
sum(dt_view$zip %in% dt_zipa$zip) # Not many of them
dt_view <- dt_view[zip %in% dt_zipa$zip]
print(dt_view)
# These seems to be all in the US. So let's use HUD fips
for (irow in c(1:nrow(dt_view))) {
  izip <- dt_view[irow, zip]
  dt_geoh[zip==izip, fips:=dt_zipa[zip==izip, fips]]
}
dt_view <- dt_geoh[zip %in% dt_view$zip]
print(dt_view)

#-------------------------------------------------------------------------------
# Quality control

# zips that are still not mapped to fips
print("zip codes that are still not mapped to fips")
print(nrow(dt_geoh[is.na(fips)]))
# Take a closer look at them
dt_view <- dt_geoh[is.na(fips)]
dt_view <- merge(dt_view, dt_geo, by.x=c("ziphb"), by.y=c("zip"), all.x=T)
print(dt_view[,lapply(.SD, sum, na.rm=T), by=c("state","msa"), .SDcols=c("nobs.y")])
print(paste(sum(dt_view$nobs.y), sum(dt_geo$nobs), sum(dt_view$nobs.y)/sum(dt_geo$nobs)))
# In total about 0.6% of rows have this problem, so maybe not too bad

# Take a look at zips codes that were categorized as not in USA but now matched
dt_view <- dt_geoh[ziphb %in% dt_geo[state %in% c("Unclassified","Not USA"),zip] & !is.na(fips)]
dt_view <- merge(dt_view, dt_geo[,-c("fips")], by.x=c("ziphb"), by.y=c("zip"), all.x=T)
dt_view <- merge(dt_view[, c("ziphb","zip","fipshb","fips","state")], dt_fips[,c("fips","st","msa_str")], by=c("fips"), all.x=T)
print(dt_view[state %in% c("Not USA")])

# different fips from HB and HUD
dt_view <- dt_geoh[fipshb!=fips]
print(nrow(dt_view))
print(dt_view)
# Manual choose alternativ fips from HB if more proper
f_repfips <- function(izip, ifips) {
  dt_geoh[zip==izip, fips:=ifips]
}
f_repfips(23173,51760) # Before 2020 it has always been associated with 51760
# 23867: Seems to be on the boundary

# Check if any fips from the ones not mapped to NBER
nrow(dt_geoh[fips %in% dt_zipe$fips])
print(dt_geoh[fips %in% dt_zipe$fips])
# Seems to be VI and PR (Now seems to be all mapped)

# Check if any ziphb has more than one row
print("ziphb with more than one row")
dt_view <- copy(dt_geoh)[,nrow:=.N, by=c("ziphb")]
print(dt_view[nrow>1,])
# drop 45251 - NA
dt_geoh <- dt_geoh[! (ziphb=="45251" & is.na(fipshb)),]
dt_view <- copy(dt_geoh)[,nrow:=.N, by=c("ziphb")]
print(dt_view[nrow>1,])

#-------------------------------------------------------------------------------
# Export

# Merge with FIPS information
dt_geoh <- merge(dt_geoh[, c("ziphb","zip","fipshb","fips")], dt_fips, by=c("fips"), all.x=T)
setcolorder(dt_geoh, c("ziphb","zip","fipshb","fips"))

saveRDS(dt_geoh, paste0(dir_clean, "cw_geo_improved.rds"))
haven::write_dta(dt_geoh, paste0(dir_clean, "cw_geo_improved.dta"))
fwrite(dt_geoh, paste0(dir_cleanc, "cw_geo_improved.csv"))

print(paste("Ended at", Sys.time()))
# End of R script
