# cd /accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_data/
# sbatch --cpus-per-task=8 --mem=16g --export=ALL,script=ppp_1_raw.R --partition=high zb_r.sh

# Command line of this R process: sinfo is the full invocation, args holds
# only the trailing (user-supplied) arguments.
sinfo <- commandArgs(trailingOnly = FALSE)
args <- commandArgs(trailingOnly = TRUE)

# Runtime context: user name (USERNAME, falling back to USER), SLURM cluster
# name, and a batch flag (0 when interactive, 1 otherwise).
sys_user <- if (Sys.getenv("USERNAME") != "") {
  Sys.getenv("USERNAME")
} else {
  Sys.getenv("USER")
}
sys_cluster <- Sys.getenv("SLURM_CLUSTER_NAME")
sys_batch <- if (interactive()) 0 else 1

if (sys_user == "homebase") { # For replication: Replace "homebase" with current username
  dir_func <- "/accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_function/"
  # For replication: Replace with the path of the "code_function/" folder
}

# Fail with an actionable message instead of "object 'dir_func' not found"
# when the user name above was not adapted. A dir_func that already exists in
# the session is still accepted.
if (!exists("dir_func")) {
  stop(
    "dir_func is not set for user '", sys_user, "': edit the user name and ",
    "the code_function/ path near the top of this script.",
    call. = FALSE
  )
}

# NOTE(review): dir_proj, dir_raw and dir_clean used by this script are
# presumably defined by 0_directory.R -- confirm.
source(paste0(dir_func, "0_directory.R"))
print(paste("Started at", Sys.time()))

library(data.table)

print(sessionInfo())
setwd(paste0(dir_proj))

#===============================================================================

# Input folder with the raw FOIA csv files and output folder for cleaned data.
dir_pppr <- paste0(dir_raw, "ppp/")
dir_ppp <- paste0(dir_clean, "ppp/")

lcsv <- list.files(dir_pppr, pattern = "foia_.*?\\.csv", full.names = TRUE)
if (length(lcsv) == 0L) {
  stop("No foia_*.csv files found in ", dir_pppr, call. = FALSE)
}

# Read every file (Zip forced to character), then stack them in a single
# rbindlist() call. Binding once avoids recopying the accumulated table for
# each file. fill = TRUE pads columns that exist only in some files with NA.
dt_raw <- rbindlist(
  lapply(lcsv, function(icsv) {
    print(sub(".*?/(foia_.*?\\.csv)", "\\1", icsv))
    fread(icsv, colClasses = list("character" = "Zip"))
  }),
  use.names = TRUE,
  fill = TRUE
)

# Raw FOIA column name -> short name, listed in the desired column order.
# Variables specific to above 150k: LoanRange, BusinessName, Address
# Variables specific to below 150k: LoanAmount
col_map <- c(
  LoanRange     = "loanr",
  LoanAmount    = "loan",
  BusinessName  = "bus",
  Address       = "address",
  City          = "city",
  State         = "st",
  Zip           = "zip",
  NAICSCode     = "naics",
  BusinessType  = "bustype",
  RaceEthnicity = "race",
  Gender        = "gender",
  Veteran       = "veteran",
  NonProfit     = "nprofit",
  JobsRetained  = "njob",
  DateApproved  = "date_approval",
  Lender        = "lender",
  CD            = "cd"
)
lvar <- names(col_map)
lvarn <- unname(col_map)

# Rename in place, then move the renamed columns to the front in that order.
setnames(dt_raw, old = lvar, new = lvarn)
setcolorder(dt_raw, neworder = lvarn)

# dt_rawc <- copy(dt_raw)
# dt_raw <- copy(dt_rawc)

# Lookup tables: the state crosswalk (dt_st) and the value-code table (dt_code)
# used to recode the categorical columns.
dt_st <- readRDS(file = paste0(dir_clean, "cw/cw_geo_nber_state.rds"))
dt_code <- readRDS(file = paste0(dir_ppp, "ppp_code_clean.rds"))

#===============================================================================

# NOTE(review): these look like leftover values for stepping through the body
# of f_fac by hand in an interactive session -- confirm. Each assignment
# overwrites the previous one, so only "race" remains, and the for loop over
# f_fac further down reassigns ivar anyway.
ivar <- "loanr"
ivar <- "bustype"
ivar <- "race"

# Recode column `ivar` of the global dt_raw into a labelled factor, in place.
#
# The mapping is taken from the rows of the global dt_code with var == ivar:
#   val  - raw value as it appears in dt_raw
#   valf - code, expected to equal the factor's integer code (checked below)
#   valc - label, used as the factor level
# Raw values that have no row in the code table end up as NA. A tabulation of
# the column is printed before and after recoding.
#
# ivar: name of the column to recode (character scalar).
# Used for its side effect on dt_raw.
f_fac <- function(ivar) {
  dti_code <- dt_code[var == ivar]
  # Without any codes the column would be recoded to nothing and the original
  # values dropped, so stop instead of silently losing the variable.
  if (nrow(dti_code) == 0L) {
    stop("No rows in dt_code for variable '", ivar, "'", call. = FALSE)
  }

  # Build the factor from the codes, then swap its levels for the labels.
  dti_code[, valf2 := as.factor(valf)]
  setattr(dti_code$valf2, "levels", dti_code$valc)
  # Check factor: labels must line up with valc and integer codes with valf
  labels_differ <- any(dti_code$valf2 != dti_code$valc, na.rm = TRUE)
  codes_differ <- any(as.integer(dti_code$valf2) != dti_code$valf, na.rm = TRUE)
  if (labels_differ || codes_differ) {
    print("ERROR: ISSUES WITH FACTOR")
  }

  print(table(dt_raw[[ivar]], useNA = "always"))
  # Park the raw values under "varo" so the factor can take the original name.
  setnames(dt_raw, ivar, "varo")
  # seq_len() rather than 1:nrow() so the loop can never run over c(1, 0).
  for (irow in seq_len(nrow(dti_code))) {
    ival <- dti_code[irow, val]
    ivalf <- dti_code[irow, valf2]
    dt_raw[varo == ival, (ivar) := ivalf]
  }
  print(table(dt_raw[[ivar]], useNA = "always"))
  dt_raw[, varo := NULL]
}

# Recode each coded categorical column of dt_raw with f_fac.
lvar_fac <- c("loanr", "bustype", "race", "gender", "veteran", "nprofit")
for (ivar in lvar_fac) {
  f_fac(ivar)
}

# Get state factor and state fips. The state value from the raw files is kept
# under the name "sto" and matched against st_str in the state crosswalk.
setnames(dt_raw, old = "st", new = "sto")
# merge() sorts rows by the key, so number the rows first in order to restore
# the original order afterwards.
dt_raw[, rowid := seq_len(.N)]
dt_out <- merge(
  x = dt_raw,
  y = dt_st[, c("st", "stfips", "st_str")],
  by.x = "sto",
  by.y = "st_str",
  all.x = TRUE
)
setorder(dt_out, rowid)

setcolorder(
  dt_out,
  neworder = c(
    "loanr", "loan", "bus", "address", "city", "st", "stfips", "sto", "zip",
    "naics", "bustype", "race", "gender", "veteran", "nprofit", "njob",
    "date_approval", "lender", "cd", "rowid"
  )
)

# Parse the approval date from month/day/year text.
dt_out[, date_approval := as.Date(date_approval, format = "%m/%d/%Y")]

# Write the cleaned table in both R and Stata formats.
saveRDS(dt_out, file = paste0(dir_ppp, "ppp_clean.rds"))
haven::write_dta(dt_out, path = paste0(dir_ppp, "ppp_clean.dta"))

print(paste("Ended at", Sys.time()))
# End of R script
# End of R script
