# cd /accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_data/
# sbatch --cpus-per-task=8 --mem=96g --export=ALL,script=data_1_raw.R,param="ALL F" --partition=high zb_r.sh

# Full command line of the R process and the user-supplied trailing arguments.
sinfo <- commandArgs(trailingOnly = FALSE)
args <- commandArgs(trailingOnly = TRUE)

print(args)

# Positional parameters, each with a fallback when the argument is absent:
#   args[1] -> sys_year : year selector, "ALL" by default
#   args[2] -> sys_mem  : logical flag forwarded to the cleaning step,
#                         TRUE by default
sys_year <- if (is.na(args[1])) "ALL" else args[1]
sys_mem <- if (is.na(args[2])) TRUE else as.logical(args[2])

# Current user: USERNAME when it is non-empty, otherwise USER.
sys_user <- Sys.getenv("USERNAME")
if (sys_user == "") {
  sys_user <- Sys.getenv("USER")
}

# SLURM cluster name (empty string when the variable is not set).
sys_cluster <- Sys.getenv("SLURM_CLUSTER_NAME")

# 0 for an interactive session, 1 for a non-interactive (batch) run.
sys_batch <- if (interactive()) 0 else 1

# Point dir_func at the folder holding the shared helper scripts.
if (sys_user == "homebase") { # For replication: Replace "homebase" with current username
  dir_func <- "/accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_function/"
  # For replication: Replace with the path of the "code_function/" folder
}

# Fail early with an actionable message instead of the opaque
# "object 'dir_func' not found" raised by source() when the user name above
# does not match. exists() is used so that a dir_func defined beforehand
# (e.g. in an interactive session) is still honoured.
if (!exists("dir_func")) {
  stop(
    "dir_func is not defined for user \"", sys_user, "\". ",
    "Edit the user name / path at the top of this script so that dir_func ",
    "points to the \"code_function/\" folder.",
    call. = FALSE
  )
}

# Project directory configuration; the remainder of the script uses dir_proj,
# dir_raw, dir_dohb, dir_clean and dir_cleanl, which are presumably defined
# by this file -- TODO confirm.
source(paste0(dir_func, "0_directory.R"))
print(paste("Started at", Sys.time()))

library(data.table)
library(haven)

# Record package versions in the job log.
print(sessionInfo())

setwd(dir_proj)

#===============================================================================
# Load raw data

#-------------------------------------------------------------------------------
# The original large files

# Bulk extract files: every file under dir_raw whose relative path contains
# the "20180101_to_20200704" tag, processed in sorted order.
lcsv <- list.files(dir_raw, recursive = TRUE)
lcsv <- sort(lcsv[grepl("20180101_to_20200704", lcsv)])
if (length(lcsv) == 0L) {
  stop("No raw file matching '20180101_to_20200704' found under ", dir_raw,
       call. = FALSE)
}

# Read each file into its own list slot and bind once at the end. Calling
# rbind() inside the loop would re-copy the accumulated table on every
# iteration.
l_raw <- setNames(vector("list", length(lcsv)), lcsv)
for (icsv in lcsv) {
  print(icsv)
  # Identifier-like columns are forced to character so they are not parsed
  # as numbers.
  l_raw[[icsv]] <- fread(
    paste0(dir_raw, icsv),
    colClasses = list(character = c("company_id", "location_id", "user_id",
                                    "zip", "county_code"))
  )
}
# use.names = TRUE matches columns by name, as rbind() on data.tables does.
dt_raw <- rbindlist(l_raw, use.names = TRUE)
rm(l_raw)

# Tag every row of the bulk extract with a fixed update date.
dt_raw[, update_date := as.Date("2020-07-06")]

# Check uniqueness: total rows vs. number of distinct rows. uniqueN() counts
# distinct rows without materialising a de-duplicated copy of the table.
print(nrow(dt_raw))
print(uniqueN(dt_raw))

print(paste0("Date Range: ", min(dt_raw$event_date), " - ", max(dt_raw$event_date)))

#-------------------------------------------------------------------------------
# Daily update until a given time

# Disabled block: the condition is the constant F, so nothing in here runs.
# When enabled, it reads the dated (non-bulk) csv files one by one, lets each
# newer file replace the rows of previously read files for the event dates it
# covers, and finally overlays the result on the bulk table dt_raw.
if (F) { # As of 7/7/2020, this part is not needed

# List raw data
# (only files whose name starts with eight digits and contains ".csv")
lcsv <- list.files(dir_raw, recursive = T, pattern = "^\\d{8}.+?\\.csv")

# Extract variable names
# (read a single row of the first bulk file just to obtain its column names)
dt_var <- fread(paste0(dir_raw, lcsv[grepl("20180101_to_20200704", lcsv)][1]), nrows = 1)
lvar <- names(dt_var)

# Keep the non-bulk files, parse the first eight-digit run of each path as a
# date, and retain only files dated on or before 2020-05-31.
lcsv <- sort(lcsv[!grepl("20180101_to_20200704", lcsv)])
dt_csv <- data.table(csv=lcsv)[,date:=as.Date(sub(".*?(\\d{8}).*?","\\1",csv),format="%Y%m%d")]
lcsv <- dt_csv[date<=as.Date("2020-05-31"),csv]

# Variables that uniquely identify a row
lvuniq <- c("location_id","user_id","event_date")

# NOTE(review): looks like a leftover from interactive debugging -- the value
# is overwritten by the loop variable below. Confirm before removing.
icsv <- lcsv[3]
dt_rawu <- data.table()
for (icsv in lcsv) {
  print(icsv)
  print(nrow(dt_rawu))
  # Date of the update file, taken from the eight digits that start the file
  # name (after any directory prefix).
  date_src <- as.Date(sub("(^|.+?/)(\\d{8}).+?$", "\\2", icsv), format="%Y%m%d")
  print(date_src)
  
  
  # The 20200415 file is addressed through fread's default V<n> column names,
  # so it presumably has no header row (TODO confirm). Its columns are named
  # after the bulk file, then three measure names are rotated -- presumably
  # because those columns are stored in a different order in this file
  # (TODO confirm).
  if (grepl("20200415",icsv)) {
    dti_raw <- fread(paste0(dir_raw, icsv),
                     colClasses=list("character"="V2", "character"="V3", "character"="V4",
                                     "character"="V8","character"="V9"))
    setnames(dti_raw, names(dti_raw), lvar)
    setnames(dti_raw, c("avg_hourly_wage_rate", "total_wages_earned", "hours_worked"), 
             c("total_wages_earned", "hours_worked", "avg_hourly_wage_rate"))
  } else {
    # NOTE(review): unlike the bulk load, county_code is not forced to
    # character here -- confirm whether that is intentional.
    dti_raw <- fread(paste0(dir_raw, icsv),
                     colClasses=list("character"="company_id", "character"="location_id", "character"="user_id",
                                     "character"="zip"))
  }
  
  # Check row uniqueness
  # (prints TRUE when no two rows share the same lvuniq key)
  print(nrow(dti_raw)==nrow(unique(dti_raw, by=lvuniq)))
  
  # print date range
  ldate <- as.Date(unique(dti_raw$event_date))
  print(sort(ldate))
  
  # Fix "\\N"
  # Numeric measures: coercion turns any non-numeric text into NA.
  for (var in c("total_wages_earned", "avg_hourly_wage_rate")) {
    dti_raw[, eval(var):=as.numeric(get(var))] # \\N is forced to NA
  }
  
  # Text columns: the literal "\N" marker is replaced by a character NA.
  for (ivar in c("county_code","zip","job_created_date","job_archived_date")) {
    dti_raw[get(ivar)=="\\N", eval(ivar):=as.character(NA)]
  }

  # Append with dt_rawu and update
  # Now we replace overlapping dates using new data
  # (rows already accumulated for any event_date present in the current file
  # are dropped, so the most recently read file wins for those dates)
  if (nrow(dt_rawu)!=0) {
    dt_rawu <- dt_rawu[! event_date %in% unique(dti_raw$event_date), ]
  }
  
  # Record which update file each row came from.
  dti_raw[, update_date:=date_src]
  dt_rawu <- rbind(dt_rawu, dti_raw)
  print(nrow(dt_rawu))
}
rm(dti_raw)

# Append dt_raw and dt_rawu
# (bulk rows are kept only for event dates not covered by the daily updates)
dt_raw <- rbind(dt_raw[! event_date %in% unique(dt_rawu$event_date), ], dt_rawu)
rm(dt_rawu)
gc()

print(paste0("Date Range: ", min(dt_raw$event_date), " - ", max(dt_raw$event_date)))
} # End of if(T/F)

print("Finish loading raw data.")

#-------------------------------------------------------------------------------
# Clean the raw data and write the outputs

# Load the cleaning routine; presumably this file defines f_raw_clean(),
# which is called right below -- TODO confirm.
source(paste0(dir_dohb, "data_0_raw_clean.R"), verbose = TRUE)

dt_raw <- f_raw_clean(dt_raw, dir_clean, FALSE, sys_mem)

# The full cleaned table is written unless the run was requested for "2020".
# NOTE(review): this file goes under dir_cleanl, whereas the 2020 subset below
# goes under dir_clean -- confirm the different folder is intentional.
if (sys_year != "2020") {
  saveRDS(dt_raw, paste0(dir_cleanl, "homebase_raw.rds"), compress = TRUE)
  # write_dta(dt_raw, paste0(dir_clean,"homebase_raw.dta"))
}

# Keep only rows with a non-missing event_date on or after 2020-01-01, then
# release the full table before writing the subset.
dt_raw20 <- dt_raw[(!is.na(event_date)) & event_date >= as.Date("2020-01-01"), ]
rm(dt_raw)
gc()
saveRDS(dt_raw20, paste0(dir_clean, "homebase_raw_2020.rds"), compress = TRUE)
# write_dta(dt_raw20, paste0(dir_clean,"homebase_raw_2020.dta"))

print(paste("Ended at", Sys.time()))
# End of R script
# End of R script
