# cd /accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_data/
# sbatch --cpus-per-task=6 --mem=32g --export=ALL,script=data_1_raw_update.R zb_r.sh

# Run context ------------------------------------------------------------------
# sinfo: the full command line of the R process.
# args:  only the arguments supplied after --args.
sinfo <- commandArgs(trailingOnly = FALSE)
args <- commandArgs(trailingOnly = TRUE)

# Use USERNAME when it is set, otherwise fall back to USER. The test is a
# scalar, so a plain if is used instead of the vectorised ifelse().
sys_user <- Sys.getenv("USERNAME")
if (sys_user == "") {
  sys_user <- Sys.getenv("USER")
}
# Empty string when SLURM_CLUSTER_NAME is not set.
sys_cluster <- Sys.getenv("SLURM_CLUSTER_NAME")
# 0 for an interactive session, 1 otherwise.
sys_batch <- if (interactive()) 0 else 1

# Locate the shared function folder for the current user.
if (sys_user=="homebase") { # For replication: Replace "homebase" with current username
  dir_func <- "/accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_function/"
  # For replication: Replace with the path of the "code_function/" folder
}

# Fail with an actionable message instead of a bare "object 'dir_func' not
# found" when the user name above has not been adapted.
if (!exists("dir_func")) {
  stop("dir_func is not defined for user '", sys_user,
       "': edit the user name and path at the top of this script.",
       call. = FALSE)
}

# 0_directory.R presumably defines the dir_* paths used later in this script
# (dir_proj, dir_dohb, dir_raw, dir_clean); it lives outside this file.
source(paste0(dir_func, "0_directory.R"))
print(paste("Started at", Sys.time()))

library(data.table)
library(haven)

# Log the R version and attached packages for this run.
print(sessionInfo())

# Work from dir_proj for the remainder of the script.
setwd(dir_proj)

#===============================================================================

#-------------------------------------------------------------------------------
# Load the raw-data cleaning code (presumably provides f_raw_clean(), which is
# called later in this script), echoing each expression as it is evaluated.
source(file = paste0(dir_dohb, "data_0_raw_clean.R"), verbose = TRUE)

# List raw data: every CSV below dir_raw whose file name starts with an
# 8-digit stamp. The returned paths are relative to dir_raw.
lcsv <- list.files(dir_raw, recursive = TRUE, pattern = "^\\d{8}.+?\\.csv")

# Extract variable names from the header of the 20180101_to_20200704 extract.
# Stop early with a clear message if that file is missing, rather than letting
# fread() try to open a path ending in "NA".
ref_csv <- grep("20180101_to_20200704", lcsv, fixed = TRUE, value = TRUE)
if (length(ref_csv) == 0) {
  stop("No '20180101_to_20200704' CSV found under dir_raw; ",
       "cannot determine the raw variable names.", call. = FALSE)
}
dt_var <- fread(paste0(dir_raw, ref_csv[1]), nrows = 1)
lvar <- names(dt_var)

# Keep only the update files: drop the reference extract and anything whose
# path mentions scheduling/scheduled, then sort the remaining paths.
is_ref <- grepl("20180101_to_20200704", lcsv, fixed = TRUE)
is_sched <- grepl("scheduling|scheduled", lcsv)
lcsv <- sort(lcsv[!is_ref & !is_sched])

# Variables that uniquely identify a row
lvuniq <- c("location_id","user_id","event_date")

# Stack the update files in sorted path order. When a later file delivers an
# event_date that is already loaded, the earlier rows for that date are
# dropped, so each event_date ends up sourced from the last file (in lcsv
# order) that contains it.
dt_raw <- data.table()
for (icsv in lcsv) {
  # Snapshot date: the 8-digit stamp at the start of the file name.
  date_src <- as.Date(sub("(^|.+?/)(\\d{8}).+?$", "\\2", icsv), format="%Y%m%d")
  if (is.na(date_src)) {
    stop("Cannot parse a date from file name: ", icsv, call. = FALSE)
  }
  if (date_src<=as.Date("2020-07-06")) {
    next
  } # These dates are now in raw
  
  print(icsv)
  print(nrow(dt_raw))
  print(date_src)
  
  if (grepl("20200415",icsv)) {
    # This file is read with fread's default V1, V2, ... column names and then
    # labelled with the reference names in lvar.
    dti_raw <- fread(paste0(dir_raw, icsv),
                     colClasses=list("character"="V2", "character"="V3", "character"="V4"))
    setnames(dti_raw, names(dti_raw), lvar)
    # Rotate three labels: presumably this file stores these columns in a
    # different order than the reference extract -- confirm against the data.
    setnames(dti_raw, c("avg_hourly_wage_rate", "total_wages_earned", "hours_worked"), 
             c("total_wages_earned", "hours_worked", "avg_hourly_wage_rate"))
  } else {
    dti_raw <- fread(paste0(dir_raw, icsv),
                     colClasses=list("character"="company_id", "character"="location_id", "character"="user_id"))
  }
  
  # Check row uniqueness (prints TRUE when lvuniq identifies every row)
  print(nrow(dti_raw)==nrow(unique(dti_raw, by=lvuniq)))
  
  # print date range
  new_dates <- unique(dti_raw$event_date)
  ldate <- as.Date(new_dates)
  print(sort(ldate))
  
  # Append with dt_raw and update
  # Now we replace overlapping dates using new data
  if (nrow(dt_raw)!=0) {
    dt_raw <- dt_raw[! event_date %in% new_dates, ]
  }
  
  # Tag every row with the snapshot it came from.
  dti_raw[, update_date:=date_src]
  dt_raw <- rbind(dt_raw, dti_raw)
  print(nrow(dt_raw))
}

# Nothing was appended (no file newer than the cut-off): stop here with a clear
# message instead of failing later on a missing column.
if (ncol(dt_raw) == 0L) {
  stop("No update file dated after 2020-07-06 was loaded from dir_raw.",
       call. = FALSE)
}
rm(dti_raw)
# Coerce the wage columns to numeric; entries that are not numbers (such as
# the "\\N" marker) become NA.
for (var in c("total_wages_earned", "avg_hourly_wage_rate")) {
  dt_raw[, (var) := as.numeric(get(var))]
}

# Fix "\\N": store a character NA in its place
for (ivar in c("county_code","zip","job_created_date","job_archived_date")) {
  dt_raw[get(ivar) == "\\N", (ivar) := NA_character_]
}

# Clean the stacked updates. The meaning of the two logical flags is defined
# by f_raw_clean(), which lives outside this file.
dt_raw <- f_raw_clean(dt_raw, dir_clean, TRUE, FALSE)

# Append with 2020 data and update

dt_rawo <- readRDS(paste0(dir_clean, "homebase_raw_2020.rds"))
# dt_rawo[, update_date:=as.Date("2020-04-11")] # Now more daily updates are in raw

# Drop archived rows for every event_date present in the updates, logging the
# row count before and after.
print(nrow(dt_rawo))
updated_dates <- unique(dt_raw$event_date)
dt_rawo <- dt_rawo[!event_date %in% updated_dates]
print(nrow(dt_rawo))

# Archive rows first, update rows after.
dt_out <- rbind(dt_rawo, dt_raw)
print(nrow(dt_out))

# Free the two inputs, which are no longer needed.
rm(dt_raw, dt_rawo)
gc()

# Additional variables

# Days since 2019-12-31
dt_out[, day := as.integer(event_date - as.Date("2019-12-31"))]

# Week Number
# Week i spans sun0 + 7*(i-1) through sat0 + 7*(i-1); week 1 is
# 2019-12-29 .. 2020-01-04. Dates outside every window keep weeki = NA.
dt_out[, weeki := NA_integer_]
sun0 <- as.Date("2019-12-29")
sat0 <- as.Date("2020-01-04")
week_min <- 1
# Five weeks are added beyond the latest event_date (reason not stated here).
week_max <- as.integer(max(ceiling((dt_out$event_date-sun0)/7))) + 5

# Week labels, built for all weeks at once: "Jan 5-11" when both ends share a
# month, "Jan 26-Feb 1" otherwise. Leading zeros are stripped from the day.
week_idx <- week_min:week_max
week_start <- sun0 + 7*(week_idx-1)
week_end <- sat0 + 7*(week_idx-1)
mon_start <- month.abb[month(week_start)]
mon_end <- month.abb[month(week_end)]
day_start <- sub("^0", "", format(week_start, format="%d"))
day_end <- sub("^0", "", format(week_end, format="%d"))
lweek <- ifelse(mon_start == mon_end,
                paste0(mon_start, " ", day_start, "-", day_end),
                paste0(mon_start, " ", day_start, "-", mon_end, " ", day_end))

# Stamp each row with the index of the week window containing its event_date.
for (i in week_idx) {
  suni <- sun0 + 7*(i-1)
  sati <- sat0 + 7*(i-1)
  dt_out[event_date>=suni & event_date<=sati, weeki:=i]
}
# Lookup table from week index to a labelled factor.
week_ids <- c(week_min:week_max)
dt_date <- data.table(weeki = week_ids, week = factor(week_ids), weeks = lweek)
setorder(dt_date, weeki)
# Overwrite the factor's levels with the text labels, by reference.
setattr(dt_date$week, "levels", dt_date$weeks)
# Consistency checks on the lookup table; both counts are expected to be 0.
print(dt_date[as.character(week) != weeks, .N])
print(dt_date[as.integer(week) != weeki, .N])

# Attach the labelled week to every row (rows without a weeki get week = NA).
dt_out <- merge(dt_out, dt_date[, c("weeki","week")], by = "weeki", all.x = TRUE)
# Number of rows left without a week label.
print(dt_out[is.na(week), .N])
dt_out[, weeki := NULL]
# Number of rows without an update_date.
print(dt_out[is.na(update_date), .N])

# Add numdaysinweek: the number of distinct event_date values observed in each
# week.
dt_nd <- unique(dt_out[, c("week","event_date")])
dt_nd <- dt_nd[, .(numdaysinweek=.N), by=c("week")]
# Attach the counts with one update join instead of a full-table scan per
# week. The NA-week entry is left out of the lookup so that rows without a
# week keep numdaysinweek = NA, exactly as the per-week loop left them.
dt_out[dt_nd[!is.na(week)], on = "week", numdaysinweek := i.numdaysinweek]

# Merge with owner_id
# all.y keeps every dt_out row; rows without a crosswalk match get NA owner
# columns, and the crosswalk columns come first in the result.
dt_own <- readRDS(paste0(dir_clean, "cw_owner.rds"))
dt_out <- merge(x = dt_own, y = dt_out,
                by = c("company_id","location_id"), all.y = TRUE)
n_no_owner <- dt_out[is.na(owner_id), .N]
print(paste0("Missing Owner ID: ", n_no_owner))

if (sys_user=="homebase") {
  # Work on a copy: setnames() renames by reference and dt_out must keep its
  # own column names for the steps after this block.
  dt_out2 <- copy(dt_out)
  dt_geo <- readRDS(paste0(dir_clean,"cw_geo_improved.rds"))[ziphb!="",]
  # The crosswalk should hold one row per ziphb; this is only reported.
  if (nrow(dt_geo) != nrow(unique(dt_geo, by = "ziphb"))) {
    print("ERROR: CHECK GEO CROSSWALK")
  }
  
  print(paste0("Rows without original county FIPS: ", dt_out2[is.na(fips), .N]))
  n_before <- nrow(dt_out2)
  
  # Rename selected variables: original geography gets the suffix "hb"
  lvren <- c("zip","st","st2","stfips","stssa","msa","msa2","msac","fips")
  setnames(dt_out2, lvren, paste0(lvren,"hb"))
  
  # Crosswalk geography gets the suffix "hud" (msa is first renamed to msac)
  setnames(dt_geo, "msa", "msac")
  lvreng <- c("zip","st","stfips","msac","fips")
  lvhud <- paste0(lvreng,"hud")
  setnames(dt_geo, lvreng, lvhud)
  
  # Keep every dt_out2 row; crosswalk columns are NA where ziphb has no match.
  dt_out2 <- merge(dt_geo[, c("ziphb", lvhud), with = FALSE], dt_out2,
                   by = "ziphb", all.y = TRUE)
  
  print(paste0("Rows without improved county FIPS: ", dt_out2[is.na(fipshud), .N]))
  # The merge must not change the number of rows.
  if (n_before != nrow(dt_out2)) {
    print("ERROR: MERGE ISSUES")
  }
  dt_out2[, ziphb:=NULL]
  write_dta(dt_out2, paste0(dir_clean,"homebase_raw_2020_update.dta"))
  rm(dt_out2)
}

# Rebuild the week variables with the helpers from f_wk.R and compare them
# with the ones computed earlier in this script.
source(file = paste0(dir_func, "f_wk.R"), verbose = TRUE)
# Keep the earlier versions under *_matt names so both can be compared.
setnames(dt_out, old = c("numdaysinweek","week"), new = c("ndaywk_matt","week_matt"))
dt_out[, week := f_wk(event_date)]
dt_out[, weekd := f_wkdate(week)]
# f_ndaywk() is given the table by name; it presumably adds ndaywk to dt_out
# (defined outside this file -- confirm).
f_ndaywk("dt_out","week","event_date")
# Drop the earlier day count only when it agrees with ndaywk on every row.
n_mismatch <- dt_out[ndaywk != ndaywk_matt, .N]
if (n_mismatch == 0) {
  dt_out[, ndaywk_matt := NULL]
} else {
  print("ERROR: ndaywk INCONSISTENT")
}
saveRDS(dt_out, paste0(dir_clean,"homebase_raw_2020_update.rds"))

print(paste("Ended at", Sys.time()))
# End of R script
