# cd /accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_data/
# sbatch --cpus-per-task=8 --mem=16g --export=ALL,script=ppp_1_code.R --partition=high zb_r.sh

# Capture how this R process was launched: `sinfo` is the full command line
# (interpreter options included), `args` only the trailing user-supplied part.
sinfo <- commandArgs(trailingOnly = FALSE)
args <- commandArgs(trailingOnly = TRUE)

# Current user name: take USERNAME when it is set and non-empty, else USER.
sys_user <- if (Sys.getenv("USERNAME") != "") Sys.getenv("USERNAME") else Sys.getenv("USER")
# SLURM cluster name; "" when the environment variable is not set.
sys_cluster <- Sys.getenv("SLURM_CLUSTER_NAME")
# 0 in an interactive session, 1 otherwise (e.g. when run through a batch job).
sys_batch <- if (interactive()) 0 else 1

# Pick the folder holding the shared helper scripts for the current user.
if (sys_user=="homebase") { # For replication: Replace "homebase" with current username
  dir_func <- "/accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_function/"
  # For replication: Replace with the path of the "code_function/" folder
}

# Fail early with an actionable message instead of an "object not found" error
# from the source() call below when no branch above matched the current user.
if (!exists("dir_func")) {
  stop(
    "`dir_func` is not defined for user '", sys_user, "'. ",
    "Edit the user check above and point `dir_func` to the \"code_function/\" folder.",
    call. = FALSE
  )
}

# Load the project directory configuration.
# NOTE(review): dir_proj, dir_raw and dir_clean are not defined in this script;
# presumably 0_directory.R sets them -- confirm.
source(paste0(dir_func, "0_directory.R"))
print(paste("Started at", Sys.time()))

library(data.table)

# Record the R and package versions in the log.
print(sessionInfo())
setwd(dir_proj)

#===============================================================================

# Input folder with the raw PPP files and output folder for the cleaned files.
dir_pppr <- paste0(dir_raw,"ppp/")
dir_ppp <- paste0(dir_clean,"ppp/")

# Every raw "foia_*.csv" file found in the input folder.
lcsv <- list.files(dir_pppr, pattern="foia_.*?\\.csv", full.names = TRUE)
if (length(lcsv) == 0L) {
  # Without input files the column renaming further down would fail with a
  # much less helpful message.
  stop("No foia_*.csv files found in ", dir_pppr, call. = FALSE)
}

# Read all files first and stack them once: growing a table with rbind() inside
# a loop re-copies the accumulated rows on every iteration.
# Zip is forced to character; fill = TRUE pads columns missing from a file with NA.
dt_raw <- rbindlist(
  lapply(lcsv, function(icsv) {
    print(sub(".*?/(foia_.*?\\.csv)","\\1",icsv)) # log the file being read
    fread(icsv, colClasses = list("character"="Zip"))
  }),
  fill = TRUE
)

# Rename the raw columns to short lower-case names. `lvar` lists the original
# column names and `lvarn` the replacements, matched by position.
# Variables specific to above 150k: LoanRange, BusinessName, Address
# Variables specific to below 150k: LoanAmount
lvar <- c("LoanRange","LoanAmount","BusinessName","Address","City","State","Zip","NAICSCode","BusinessType",
          "RaceEthnicity","Gender","Veteran","NonProfit","JobsRetained","DateApproved","Lender","CD")
lvarn <- c("loanr","loan","bus","address","city","st","zip","naics","bustype",
           "race","gender","veteran","nprofit","njob","date_approval","lender","cd")
# setnames() renames in place (by reference).
setnames(dt_raw, lvar, lvarn)

# Columns whose distinct values are collected into the raw code table.
lvarc <- c("loanr","st","bustype","race","gender","veteran","nprofit","cd")

# One (var, val) table per column, each sorted by value, stacked in a single
# rbindlist() call rather than grown with rbind() inside a loop.
dt_code <- rbindlist(lapply(lvarc, function(ivar) {
  dti_code <- unique(dt_raw[, ivar, with = FALSE])
  dti_code[, var := ivar]
  setnames(dti_code, ivar, "val")
  setcolorder(dti_code, c("var","val"))
  setorderv(dti_code, "val")
  dti_code
}))

# table(dt_raw[is.na(nprofit),st])
# All missing nprofit seems to be from "Other", so this should be fine
# Drop missing and empty-string values before saving.
dt_code <- dt_code[!is.na(val) & val!="",]
saveRDS(dt_code, paste0(dir_ppp,"ppp_code_raw.rds"))
fwrite(dt_code, paste0(dir_ppp,"ppp_code_raw.csv"))

# TODO: `cd` values that lack a district code still need to be handled later.

# Accumulator for the cleaned code table; each section below appends its rows.
dt_out <- data.table()

#---------------------------------------
# Loan Range
# valc: the value with a leading "<non-digit><space>" prefix removed.
# valf: position of the value when sorted by the raw value.
loanr_codes <- dt_code[var == "loanr"]
setorderv(loanr_codes, "val")
loanr_codes[, `:=`(valc = sub("^\\D ","",val), valf = .I)]
dti_out <- loanr_codes
dt_out <- rbind(dt_out, dti_out)

#---------------------------------------
# Business type
# valc is the raw value, except for the labels listed here, which get a space
# before the parenthesis or a completed "(ROBS)" suffix.
# NOTE(review): the LLC replacement keeps the double space of the raw label --
# confirm whether that is intended.
bustype_relabel <- c(
  "Employee Stock Ownership Plan(ESOP)" = "Employee Stock Ownership Plan (ESOP)",
  "Limited  Liability Company(LLC)" = "Limited  Liability Company (LLC)",
  "Rollover as Business Start-Ups (ROB" = "Rollover as Business Start-Ups (ROBS)"
)
dti_out <- dt_code[var == "bustype"]
dti_out[, valc := val]
dti_out[val %in% names(bustype_relabel), valc := unname(bustype_relabel[val])]
# valf: position of the value when sorted by the raw value.
setorderv(dti_out, "val")
dti_out[, valf := .I]
dt_out <- rbind(dt_out, dti_out)

#---------------------------------------
# Race / ethnicity
dti_out <- dt_code[var=="race"]
dti_out[,valc:=val]
# Give "Unanswered" a sort key so it lands after every other value: rows whose
# valf is still NA are placed first by setorderv() (default na.last = FALSE).
dti_out[val=="Unanswered",valf:=1]
setorderv(dti_out, c("valf","val"))
# Replace the temporary sort key with the final position in that order.
dti_out[,valf:=.I]
# "Unanswered" carries no information: blank out both its label and its code.
dti_out[val=="Unanswered",c("valc","valf"):=list(as.character(NA),as.integer(NA))]
dt_out <- rbind(dt_out, dti_out)

#---------------------------------------
# Gender
# Temporary sort key: position in `gender_rank`; values not listed get NA and
# therefore sort first under setorderv()'s default NA placement.
gender_rank <- c("Male Owned", "Female Owned", "Unanswered")
dti_out <- dt_code[var == "gender"]
dti_out[, `:=`(valc = val, valf = match(val, gender_rank))]
setorderv(dti_out, c("valf","val"))
# Final code: position in the sorted table.
dti_out[, valf := .I]
# "Unanswered" ends up with a missing label and a missing code.
dti_out[val == "Unanswered", `:=`(valc = NA_character_, valf = NA_integer_)]
dt_out <- rbind(dt_out, dti_out)

#---------------------------------------
# Veteran
# Temporary sort key: position in `veteran_rank`; values not listed get NA and
# therefore sort first under setorderv()'s default NA placement.
veteran_rank <- c("Veteran", "Non-Veteran", "Unanswered")
dti_out <- dt_code[var == "veteran"]
dti_out[, `:=`(valc = val, valf = match(val, veteran_rank))]
setorderv(dti_out, c("valf","val"))
# Final code: position in the sorted table.
dti_out[, valf := .I]
# "Unanswered" ends up with a missing label and a missing code.
dti_out[val == "Unanswered", `:=`(valc = NA_character_, valf = NA_integer_)]
dt_out <- rbind(dt_out, dti_out)

#---------------------------------------
# Non-Profit
# Temporary sort key: 1 for "Y", NA for anything else (NA sorts first under
# setorderv()'s default NA placement).
dti_out <- dt_code[var == "nprofit"]
dti_out[, `:=`(valc = val, valf = match(val, "Y"))]
setorderv(dti_out, c("valf","val"))
# Final code: position in the sorted table.
dti_out[, valf := .I]
# Any "Unanswered" value ends up with a missing label and a missing code.
dti_out[val == "Unanswered", `:=`(valc = NA_character_, valf = NA_integer_)]
dt_out <- rbind(dt_out, dti_out)

# Write the cleaned code table in both RDS and CSV form, then log the end time.
clean_stub <- paste0(dir_ppp, "ppp_code_clean")
saveRDS(dt_out, paste0(clean_stub, ".rds"))
fwrite(dt_out, paste0(clean_stub, ".csv"))

print(paste("Ended at", Sys.time()))
# End of R script
