# cd /accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_data/
# sbatch --cpus-per-task=6 --mem=8g --export=ALL,script=cw_geo_zip.R --partition=high zb_r.sh

# Command-line information: sinfo holds the full command line, args only the
# arguments supplied after the script name.
sinfo <- commandArgs(trailingOnly = FALSE)
args <- commandArgs(trailingOnly = TRUE)

print(args)

# Current user: USERNAME is preferred, USER is the fallback when it is empty.
sys_user <- Sys.getenv("USERNAME")
if (sys_user == "") {
  sys_user <- Sys.getenv("USER")
}
sys_cluster <- Sys.getenv("SLURM_CLUSTER_NAME")
# 1 when run non-interactively (e.g. through a batch submission), 0 otherwise
sys_batch <- if (interactive()) 0 else 1

if (sys_user == "homebase") { # For replication: Replace "homebase" with current username
  dir_func <- "/accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_function/"
  # For replication: Replace with the path of the "code_function/" folder
}

# Stop with an actionable message instead of an "object not found" error when
# the username check above did not match and dir_func was not set beforehand.
if (!exists("dir_func")) {
  stop(
    "dir_func is not defined for user '", sys_user, "'. ",
    "Edit the username check in this script and point dir_func to the ",
    "\"code_function/\" folder.",
    call. = FALSE
  )
}

# 0_directory.R presumably defines the project paths used below
# (dir_proj, dir_raw, dir_clean) -- they are not set anywhere in this script.
source(paste0(dir_func, "0_directory.R"))
print(paste("Started at", Sys.time()))

library(data.table)
library(haven)

print(sessionInfo())

setwd(dir_proj)

#===============================================================================
# zip-county crosswalk from HUD

# Input folder with the HUD xlsx files and output folder for the crosswalks
dir_hud <- paste0(dir_raw, "cw_hud/")
dir_cw <- paste0(dir_clean, "cw/")

lxlsx <- list.files(dir_hud, pattern = "xlsx")
if (length(lxlsx) == 0) {
  # Without this check the script fails later with an unrelated setnames() error
  stop("No xlsx files found in ", dir_hud, call. = FALSE)
}

# File names are parsed as ZIP_COUNTY_<2 digits><4 digits>.xlsx; the first
# group is divided by 3 to obtain the quarter and the second is the year.
re_fname <- "ZIP_COUNTY_(\\d{2})(\\d{4})\\.xlsx"

# Read every file into its own table and stack them once at the end, rather
# than growing dt_raw with rbind() on every iteration.
dt_raw <- rbindlist(lapply(lxlsx, function(ixlsx) {

  print(ixlsx)

  # Everything is read as text; the ratio columns are converted further below
  dti_raw <- data.table(readxl::read_excel(paste0(dir_hud, ixlsx), col_types = "text"))
  setnames(dti_raw, names(dti_raw), tolower(names(dti_raw)))
  # Keep the first six columns only
  dti_raw <- dti_raw[, c(1:6)]

  # Extract year and quarter information
  dti_raw[, year := as.integer(sub(re_fname, "\\2", ixlsx))]
  dti_raw[, qtr := as.integer(sub(re_fname, "\\1", ixlsx)) / 3]
  dti_raw
}))

v_key <- c("year", "qtr", "zip", "fips")
v_ratio <- c("res_ratio", "bus_ratio", "oth_ratio", "tot_ratio")

setnames(dt_raw, "county", "fips")
setorderv(dt_raw, v_key)
# Keep the key and ratio columns (in that order) and convert the ratios to
# numeric by reference. The rows are already sorted by the key, so this gives
# the same rows in the same order as a grouped conversion, without the cost of
# forming one group per key combination.
dt_raw <- dt_raw[, c(v_key, v_ratio), with = FALSE]
dt_raw[, (v_ratio) := lapply(.SD, as.numeric), .SDcols = v_ratio]
saveRDS(dt_raw, paste0(dir_cw, "cw_geo_zip_raw.rds"))
haven::write_dta(dt_raw, paste0(dir_cw, "cw_geo_zip_raw.dta"))

#-------------------------------------------------------------------------------
# Define a consistent crosswalk
# A zip can appear more than once because a zip code can span several counties (fips)

# First make basic selection within each year-quarter-zip
# First choose by business address, if there are duplicates, then by total address
# dt_raw <- readRDS(paste0(dir_cw, "cw_geo_zip_raw.rds"))

# Work on a copy so the by-reference updates below leave dt_raw untouched
dt_zip <- copy(dt_raw)
print(paste0("nrow(dt_raw) = ", nrow(dt_raw)))

# Largest business and total ratios within each zip-year-quarter, followed by
# 0/1 markers for the rows that attain them
dt_zip[, `:=`(max_bus = max(bus_ratio), max_tot = max(tot_ratio)),
       by = .(zip, year, qtr)]
dt_zip[, `:=`(
  sel_bus = ifelse(bus_ratio == max_bus, 1, 0),
  sel_tot = ifelse(tot_ratio == max_tot, 1, 0)
)]
# Number of rows on which the two criteria disagree
print(paste0("nrow(dt_zip[sel_bus!=sel_tot]): ", dt_zip[sel_bus != sel_tot, .N]))

# Keep the rows with the highest business ratio; among those, keep the rows
# with the highest total ratio
dt_zip <- dt_zip[max_bus == bus_ratio]
dt_zip[, max_tot2 := max(tot_ratio), by = .(zip, year, qtr)]
dt_zip <- dt_zip[max_tot2 == tot_ratio]

# One random draw per zip-fips pair, used later to break remaining ties.
# The draw is fixed across year and quarter so that the same pair is picked
# whenever the same tie occurs.
set.seed(60637)
dt_tie <- unique(dt_zip[, .(zip, fips)])
dt_tie[, rand := rnorm(.N)]
dt_zip <- merge(dt_zip, dt_tie, by = c("zip", "fips"), all.x = TRUE)
rm(dt_tie)

#---------------------------------------
# Function to deal with duplicates
# First use the mode, and then random number

# Collapse a zip-fips table to one fips per zip, optionally per time period.
#
# Arguments:
#   vtime  Character vector of time columns defining the level of the output
#          (e.g. c("year","qtr"), c("year"), or empty for no time dimension).
#   dt     data.table to collapse. It must contain "zip", "fips", "rand" and
#          every column named in vtime. Defaults to the script-level dt_zip,
#          so calls that pass only vtime behave exactly as before.
#
# Returns a data.table with the columns zip, fips and vtime, intended to hold
# one fips per zip(-time) cell:
#   * cells with a single fips are kept as they are;
#   * otherwise the fips occurring in the most rows of dt wins (the mode);
#   * ties that remain go to the fips with the largest value of rand.
# Side effect: prints three frequency tables of the number of candidate fips
# per cell (before the mode step, after it, and after the random tie-break).
f_zipfips <- function(vtime=c(), dt=dt_zip) {
  
  lvall <- c("zip","fips",vtime)
  lvby <- c("zip",vtime)
  
  # Fail early with a readable message if a required column is absent
  missing_cols <- setdiff(c(lvall, "rand"), names(dt))
  if (length(missing_cols) > 0) {
    stop("f_zipfips: input is missing column(s): ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }
  
  # Try to collapse to zip-fips-time level and identify duplicates
  dt_zipa <- unique(dt[,.SD,.SDcols=c(lvall)])
  dt_zipa[, dup:=.N, by=c(lvby)]
  print(table(dt_zipa$dup))
  dt_out1 <- dt_zipa[dup==1] # The ones without duplicates
  dt_out1[, dup:=NULL]
  
  # Rows of dt that belong to cells with more than one candidate fips
  dt_out2 <- merge(dt, dt_zipa[dup>1], by=c(lvall))
  # First use the mode: count the rows behind each candidate and keep the
  # candidate(s) with the highest count within the cell
  dt_out2[, fipsn:=.N, by=c(lvall)]
  dt_out2[, fipsn_max:=max(fipsn), by=c(lvby)]
  dt_out2s <- dt_out2[fipsn==fipsn_max]
  dt_out2s <- unique(dt_out2s[, .SD,.SDcols=c(lvall,"rand")])
  dt_out2s[, dup:=.N, by=c(lvby)]
  print(table(dt_out2s$dup))
  # Then use random number: keep the candidate with the largest rand
  dt_out2s[, max_rand:=max(rand), by=c(lvby)]
  dt_out2s <- dt_out2s[rand==max_rand]
  dt_out2s[, dup:=.N, by=c(lvby)]
  print(table(dt_out2s$dup))
  
  # Stack the unambiguous cells and the resolved ones
  dt_out <- rbind(dt_out1,dt_out2s[,.SD,.SDcols=lvall])
  return(dt_out)
}

#---------------------------------------
# zip-county by year-quarter

# Build and save the crosswalk at each level of time detail. The work is done
# inside local() so that no intermediate object is left in the workspace.
local({
  
  # Print the label, collapse with f_zipfips() and write both output formats
  f_write_cw <- function(label, vtime, file_rds, file_dta) {
    print(label)
    dt_cw <- f_zipfips(vtime=vtime)
    saveRDS(dt_cw, paste0(dir_cw, file_rds))
    haven::write_dta(dt_cw, paste0(dir_cw, file_dta))
    invisible(NULL)
  }
  
  #---------------------------------------
  # zip-county by year-quarter
  f_write_cw("zip-fips-year-quarter", c("year","qtr"),
             "cw_geo_zip_year_qtr.rds", "cw_geo_zip_year_qtr.dta")
  
  #---------------------------------------
  # zip-county by year
  f_write_cw("zip-fips-years", c("year"),
             "cw_geo_zip_year.rds", "cw_geo_zip_year.dta")
  
  #---------------------------------------
  # zip-county for all
  f_write_cw("zip-fips", c(),
             "cw_geo_zip.rds", "cw_geo_zip.dta")
})

print(paste("Ended at", Sys.time()))
# End of R script
