
# source(paste0(dir_box, "0_do/data_0_raw_clean.R"), verbose=T)

#===============================================================================
# Function to export (update = F) or apply (update = T) the crosswalk for factor variables

f_factor <- function(dt_raw, lvar, fout, update) {
  
  if (update==F) {
    
    dt_cw <- unique(dt_raw[, .SD, .SDcols=lvar])
    setorderv(dt_cw, lvar)
    for (var in lvar) {
      dt_cw[, eval(paste0(var, "_str")):=as.character(get(var))]
    }
    saveRDS(dt_cw, paste0(dir_clean, fout, ".rds"))
    write_dta(dt_cw, paste0(dir_clean, fout, ".dta"))
    for (var in lvar) {
      dt_cw[, eval(paste0(var)):=as.integer(get(var))]
    }
    fwrite(dt_cw, paste0(dir_cleanc, fout, ".csv"))
    
    return(NULL)
    
  } else if (update==T) {
    lvars <- paste(lvar, "_str", sep="")
    
    dt_cw <- readRDS(paste0(dir_clean, fout, ".rds"))
    setnames(dt_raw, lvar, lvars)
    dt_raw <- merge(dt_raw, dt_cw, by=lvars, all.X=T)
    print("Potential Issues for:")
    for (var in lvar) {
      print(var)
      dt_issue <- dt_raw[is.na(get(var)), ]
      dt_issue <- unique(dt_issue[, .SD, .SDcols=c(lvar, lvars)])
      print(dt_issue)
    }
    dt_raw[, c(eval(lvars)):=NULL]
    
    return(dt_raw)
  }
}

#===============================================================================
# Main function

f_raw_clean <- function(dt_raw, dir_clean, update, savemem=T) {
# update = T or F

#-------------------------------------------------------------------------------
# Fix state and MSA
# In the future this part will be improved by using zipcodes.

print("State and MSA")

dt_msa <- fread(paste0(dir_clean, "cw_geo_msa_fixed.csv"))
dt_msa <- dt_msa[state!=state_cor | msa!=msa_cor, ]

if (nrow(dt_msa)>0) {
for (irow in c(1:nrow(dt_msa))) {
  vstate <- dt_msa[irow, state]
  vmsa <- dt_msa[irow, msa]
  vstate_cor <- dt_msa[irow, state_cor]
  vmsa_cor <- dt_msa[irow, msa_cor]
  
  dt_raw[state==vstate & msa==vmsa, c("state", "msa"):=list(vstate_cor, vmsa_cor)]
}
}

# Check if there is any new combination not covered by cw_msa_fixed.csv
print("Check any new combination")
dt_msa <- fread(paste0(dir_clean, "cw_geo_msa_fixed.csv"))
dt_msac <- unique(dt_raw[, c("state","msa")])
dt_msa <- unique(dt_msa[, c("state_cor","msa_cor")])
dt_msa[, in_fix:=1]
dt_msac <- merge(dt_msac, dt_msa, by.x=c("state","msa"), by.y=c("state_cor","msa_cor"), all.x=T)
print(dt_msac[is.na(in_fix),])

# Merge with geo crosswalk
dt_raw[, state:=toupper(state)]
dt_raw[msa=="All other", msa:=state] # Residual MSA

dt_st <- readRDS(paste0(dir_clean, "cw/cw_geo_nber_state.rds"))[, c("st","st2","stfips","stssa")]
dt_st[, state:=as.character(st)]
dt_msa <- readRDS(paste0(dir_clean, "cw_geo_msa.rds"))[, c("msa","msa2","msac")]
dt_msa[, msa_str:=as.character(msa)]

dt_raw <- merge(dt_raw, dt_st, by=c("state"))
setnames(dt_raw, c("msa"),c("msa_str"))
dt_raw <- merge(dt_raw, dt_msa, by=c("msa_str"), all.x=T)

# Check if any missing
print("Check missing codes")
dt_view <- dt_raw[, .(nobs=.N), by=c("st","state")]
print(dt_view[is.na(st),])
dt_view <- dt_raw[, .(nobs=.N), by=c("msa","msa_str")]
print(dt_view[is.na(msa),])

# Remove state and msa strings
dt_raw[, c("state","msa_str"):=NULL]

# Make county codes integer
print("County codes")
print(nrow(dt_raw[grepl("\\D|^0", county_code),]))
dt_raw[, county_code:=as.integer(county_code)]
setnames(dt_raw, "county_code", "fips")

#-------------------------------------------------------------------------------
# Factorize selected variables

print("Industry")

if (update==F) {
  dt_ind <- unique(dt_raw[, .SD, .SDcols=c("industry")])
  dt_ind[, ind:=as.integer(factor(industry))]
  n_ind <- max(dt_ind[, ind])
  dt_ind[industry=="Other", ind:=n_ind+1]
  dt_ind[industry=="Unknown", ind:=n_ind+2]
  dt_ind[, ind:=as.factor(ind)]
  setorder(dt_ind, ind)
  setattr(dt_ind$ind, "levels", dt_ind$industry)
  
  # Check the labels are correct
  print("Check if industry labels are correct:")
  print(nrow(dt_ind[as.character(ind)!=industry,]))
  
  dt_raw <- merge(dt_raw, dt_ind, by=c("industry"), all.x=T)
  dt_raw[, industry:=NULL]
  
  f_factor(dt_raw, c("ind"), paste0("cw_ind"), update)
} else if (update==T) {
  setnames(dt_raw, c("industry"), c("ind"))
  dt_raw <- f_factor(dt_raw, c("ind"), paste0("cw_ind"), update)
}

print("Level")

dt_raw[, level:=trimws(level)]
dt_raw[level=="", level:=as.character(NA)]

if (update==F) {
  dt_raw[level=="Employee",level:="LV1"]
  dt_raw[level=="Manager",level:="LV2"]
  dt_raw[level=="General Manager",level:="LV3"]
  dt_raw[, level:=factor(level)]
  llevelold <- levels(dt_raw$level)
  llevelnew <- llevelold
  llevelnew <- sub("^LV1$","Employee",llevelnew)
  llevelnew <- sub("^LV2$","Manager",llevelnew)
  llevelnew <- sub("^LV3$","General Manager",llevelnew)
  print(llevelold)
  print(llevelnew)
  setattr(dt_raw$level, "levels", llevelnew)
  f_factor(dt_raw, c("level"), paste0("cw_level"), update)
} else if (update==T) {
  dt_raw <- f_factor(dt_raw, c("level"), paste0("cw_level"), update)
}

#-------------------------------------------------------------------------------
# Keep only date and not time

print("Improve Date")
pyyyymmdd <- "^(\\d{4}-\\d{2}-\\d{2}).*?$"
for (ivar in c("location_created_date", "job_created_date", "job_archived_date","event_date")) {
  print(paste0("Clean ",ivar))
  dt_raw[, eval(ivar):=trimws(gsub(pyyyymmdd, "\\1", get(ivar)))]
  dt_raw[get(ivar)=="", eval(ivar):=as.character(NA)]
  if (savemem==T) {
    dt_raw[, eval(ivar):=as.Date(get(ivar))] # This is very slow
  }
}

if (savemem==F) {
ldate <- sort(unique(c(dt_raw$location_created_date, dt_raw$job_created_date, dt_raw$job_archived_date, dt_raw$event_date)))
dt_date <- data.table(sdate=ldate)
dt_date[, date:=as.Date(sdate)]

for (ivar in c("location_created_date", "job_created_date", "job_archived_date","event_date")) {
  print(paste0("Merge with ",ivar))
  
  setnames(dt_raw, ivar, "sdate")
  dt_raw <- merge(dt_raw, dt_date, by=c("sdate"), all.x=T)
  setnames(dt_raw, "date", ivar)
  dt_raw[, sdate:=NULL]
}
}

#-------------------------------------------------------------------------------
# Save output

setcolorder(dt_raw, c("company_id","ind","st","st2","stfips","stssa","msa","msa2","msac","fips","zip",
                      "location_id","location_created_date",
                      "user_id","level","job_created_date","job_archived_date",
                      "event_date","hours_worked","avg_hourly_wage_rate","total_wages_earned"))


return(dt_raw)
}

# End of R script