# cd /accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_data/
# sbatch --cpus-per-task=1 --mem=8g --export=ALL,script=cw_naics.R zb_r.sh

# Capture the command line: the full invocation and the user-supplied
# trailing arguments only.
sinfo <- commandArgs(trailingOnly = FALSE)
args <- commandArgs(trailingOnly = TRUE)

print(args)

# Current user name: USERNAME when it is set to a non-empty value,
# otherwise USER.
sys_user <- if (nzchar(Sys.getenv("USERNAME"))) {
  Sys.getenv("USERNAME")
} else {
  Sys.getenv("USER")
}
# Empty string when SLURM_CLUSTER_NAME is not set.
sys_cluster <- Sys.getenv("SLURM_CLUSTER_NAME")
# 0 for an interactive session, 1 otherwise.
sys_batch <- if (interactive()) 0 else 1

if (sys_user=="homebase") { # For replication: Replace "homebase" with current username
  dir_func <- "/accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_function/"
  # For replication: Replace with the path of the "code_function/" folder
}

# Fail fast with an actionable message when no branch above matched the
# current user; otherwise source() below stops with an opaque
# "object 'dir_func' not found" error.
if (!exists("dir_func")) {
  stop("dir_func is not set for user '", sys_user,
       "'. Edit the user check in this script to point at the ",
       "\"code_function/\" folder.", call. = FALSE)
}

# Load the shared directory configuration. The dir_proj, dir_raw and
# dir_clean paths used below are not defined in this script, so they are
# presumably created by 0_directory.R -- confirm against that file.
source(paste0(dir_func, "0_directory.R"))
print(paste("Started at", Sys.time()))

library(data.table)

# Record R and package versions in the job log.
print(sessionInfo())

setwd(dir_proj)

#===============================================================================

# NAICS vintage to build the crosswalk for (2017 or 2012).
ver_naics <- 2017

# Pick the raw code list and its column layout for the requested vintage.
# The 2017 workbook is read with three extra columns (v4-v6) that are
# discarded; the 2012 workbook is read with the first three columns only.
if (ver_naics == 2017) {
  file_naics <- "naics/2017/2-6 digit_2017_Codes.xlsx"
  cols_naics <- c("seq", "naics", "naicss", "v4", "v5", "v6")
} else if (ver_naics == 2012) {
  file_naics <- "naics/2012/2-digit_2012_Codes.xls"
  cols_naics <- c("seq", "naics", "naicss")
} else {
  # Without this guard an unknown vintage would leave dt_naics undefined.
  stop("Unsupported NAICS version: ", ver_naics, call. = FALSE)
}

# skip = 2 drops the first two rows of the sheet before applying col_names.
dt_naics <- readxl::read_excel(paste0(dir_raw, file_naics),
                               skip = 2, col_names = cols_naics)
dt_naics <- as.data.table(dt_naics)

# Keep only the code (naics) and title (naicss) columns. The drop list is
# derived from the layout actually read, so the 2012 branch no longer asks
# data.table to delete v4-v6 columns that were never created.
dt_naics[, (setdiff(cols_naics, c("naics", "naicss"))) := NULL]

# 2 dig
# Attach the 2-digit code (naics2) and its title (naicss2) to every row.
# Some 2-digit entries are hyphenated ranges ("lo-hi"); each range is
# expanded into one row per code in the range, all sharing the range's title.
i <- 2
cols_i <- paste0(c("naics", "naicss"), i)

dt_sector <- dt_naics[nchar(naics) == i | grepl("-", naics), .(naics, naicss)]
dt_expanded <- lapply(which(grepl("-", dt_sector$naics)), function(idx) {
  code_range <- dt_sector$naics[idx]
  code_lo <- sub("(\\d+)-(\\d+)", "\\1", code_range)
  code_hi <- sub("(\\d+)-(\\d+)", "\\2", code_range)
  data.table(naics = as.character(code_lo:code_hi),
             naicss = dt_sector$naicss[idx])
})
dt_naicsi <- rbindlist(c(list(dt_sector), dt_expanded))

# Drop the hyphenated originals from the lookup; only true 2-digit codes stay.
dt_naicsi <- dt_naicsi[nchar(naics) == i, .(naics, naicss)]
setnames(dt_naicsi, c("naics", "naicss"), cols_i)

# Prefix of every code, then left-join the matching title.
dt_naics[nchar(naics) >= i, (cols_i[1]) := substr(naics, 1, i)]
dt_naics <- merge(dt_naics, dt_naicsi, by = cols_i[1], all.x = TRUE)

# The range rows themselves are not codes; remove them from the crosswalk.
dt_naics <- dt_naics[!grepl("-", naics), ]

# 3-5 dig
# For each level, add the i-digit prefix (naics<i>) of every code that has at
# least i digits, and left-join the title of that prefix (naicss<i>). Rows
# with fewer than i digits keep NA in both columns.
for (i in 3:5) {
  col_code <- paste0("naics", i)
  col_name <- paste0("naicss", i)

  # Lookup table: codes that are exactly i digits long, with their titles.
  dt_naicsi <- dt_naics[nchar(naics) == i, .(naics, naicss)]
  setnames(dt_naicsi, c("naics", "naicss"), c(col_code, col_name))

  dt_naics[nchar(naics) >= i, (col_code) := substr(naics, 1, i)]
  dt_naics <- merge(dt_naics, dt_naicsi, by = col_code, all.x = TRUE)
}

# Fill naics to 6 dig
# naicsd records the original number of digits of each code; it is computed
# before padding and is not updated afterwards.
dt_naics[, naicsd := nchar(naics)]
# Right-pad 2- to 5-digit codes with zeros up to 6 characters.
dt_naics[naicsd %in% 2:5, naics := paste0(naics, strrep("0", 6 - naicsd))]

# Final layout: padded code, title, original digit count, then the
# prefix/title pair for each of the 2- to 5-digit levels.
col_order <- c("naics", "naicss", "naicsd",
               paste0(c("naics", "naicss"), rep(2:5, each = 2)))
setcolorder(dt_naics, col_order)
setorderv(dt_naics, "naics")

saveRDS(dt_naics, paste0(dir_clean, "cw_ind_naics_", ver_naics, ".rds"))

print(paste("Ended at", Sys.time()))
# End of R script
