# cd /accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_data/
# sbatch --cpus-per-task=4 --mem=64g --export=ALL,script=ws_1_raw.R zb_r.sh

sinfo <- commandArgs(trailingOnly = F)
args <- commandArgs(trailingOnly = T)

print(args)

sys_user <- ifelse(Sys.getenv("USERNAME")!="", Sys.getenv("USERNAME"), Sys.getenv("USER"))
sys_cluster <- Sys.getenv("SLURM_CLUSTER_NAME")
sys_batch <- ifelse(interactive(), 0, 1)

if (sys_user=="homebase") { # For replication: Replace "homebase" with current username
  dir_func <- "/accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_function/"
  # For replication: Replace with the path of the "code_function/" folder
}

source(paste0(dir_func, "0_directory.R"))
print(paste("Started at", Sys.time()))

library(data.table)

print(sessionInfo())

setwd(paste0(dir_proj))

#===============================================================================

lfile <-  list.files(paste0(dir_raw,"survey_worker/"), pattern = "employeesurvey")

for (ifile in lfile) {

print(ifile)

iupdate <- paste0("2020",sub("employeesurvey(\\d{4})\\.csv","\\1",ifile))

dt_surv <- fread(paste0(dir_raw,"survey_worker/", ifile), encoding = "UTF-8")
dt_val <- readRDS(paste0(dir_clean,"survey_worker/survey_worker_var_val.rds"))

lvorder <- names(dt_surv)
lvorder <- lvorder[grepl("^Q", lvorder)]

#-------------------------------------------------------------------------------
# Filer out non-complete, survey preview, before 5/1/2020, no response
dt_surv <- dt_surv[Progress=="100",] # Unnecessary condition: & !Status %in% c("Survey Preview","Spam")
if (sum(grepl("/",dt_surv$StartDate))>0) {
  dt_surv[,c("date_start","date_end"):=.(as.Date(StartDate,"%m/%d/%Y %H:%M"),as.Date(EndDate,"%m/%d/%Y %H:%M"))]
  dt_surv[,c("time_start","time_end"):=.(strptime(StartDate,"%m/%d/%Y %H:%M"),strptime(EndDate,"%m/%d/%Y %H:%M"))]
} else {
  dt_surv[,c("date_start","date_end"):=.(as.Date(StartDate),as.Date(EndDate))]
  dt_surv[,c("time_start","time_end"):=.(strptime(StartDate,"%Y-%m-%d %H:%M:%S"),strptime(EndDate,"%Y-%m-%d %H:%M:%S"))] 
}
dt_surv <- dt_surv[date_start>=as.Date("2020-05-01")]
dt_surv <- dt_surv[Q1!="I do NOT agree to participate in the research",]

# Drop Qualtric variables
dt_surv <- dt_surv[, .SD, .SDcols=patterns("date_|time_|Q|employee_ID|linksource")]

# Rename and reorder
dt_surv[, rowid:=.I]
setnames(dt_surv, c("employee_ID","linksource"),c("userid","linksrc"))
setcolorder(dt_surv, c("userid","linksrc","date_start","date_end","rowid"))

#-------------------------------------------------------------------------------
# Code demographics

# Function that summarizes a variable and remove na
f_na <- function(vvar) {
  print(table(dt_surv[[vvar]]))
  dt_surv[get(vvar)=="", (vvar):=NA]
}

# Function that cleans dt with factors
f_fac <- function(dt_in, vvar_str, vvar_fac) {
  setorderv(dt_in, c(vvar_fac))
  dt_in[, (vvar_fac):=factor(get(vvar_fac))]
  dt_in[, (vvar_str):=gsub("\u2019","'",get(vvar_str))] # Fix curly quote
  setattr(dt_in[[vvar_fac]], "levels", dt_in[, get(vvar_str)])
  if (nrow(dt_in[get(vvar_fac)!=as.character(get(vvar_str))])!=0) {
    print("ERROR: CHECK FACTOR")
  }
  return(dt_in)
}

# Function that merges dt_surv with dt_fac
f_facm <- function(vquest,vdemogr,lvalue) {
  print(vdemogr)
  
  # Fix curly quote first and remove ws
  dt_surv[, (vquest):=gsub("\u2019","'",get(vquest))] 
  dt_surv[, (vquest):=trimws(get(vquest))]
  lvalue <- gsub("\u2019","'",lvalue)
  
  f_na(vquest)
  
  # Make sure that actual values are in lvalue
  ldiff <- setdiff(dt_surv[[vquest]], c(lvalue,NA))
  if (length(ldiff)!=0) {
    print("ERROE: CHECK VALUES")
    print(ldiff) 
  }
  
  dt_fac <- data.table()[,c(vquest,vdemogr):=.(lvalue,c(1:length(lvalue)))]
  dt_fac <- f_fac(dt_fac, vquest, vdemogr)
  
  dt_surv <<- merge(dt_surv, dt_fac, by=c(vquest), all.x=T)
  
  lna1 <- dt_surv[is.na(get(vquest)), which=T]
  lna2 <- dt_surv[is.na(get(vdemogr)), which=T]
  if (sum(lna1!=lna2)!=0 | length(lna1)!=length(lna2)) {
    print("ERROR: CHECK NA")
  }
  print("")
}

# Gender
f_na("Q34")
dt_surv[, user_gender:=as.factor(Q34)]

# Age
f_na("Q35")
dt_surv[, user_age:=as.factor(Q35)]

# Income
vquest <- "Q36"
vdemogr <- "user_income"
f_facm(vquest,vdemogr,c("Less than $15,000",
                        "$15,000-$24,999",
                        "$25,000-$34,999",
                        "$35,000-$44,999",
                        "$45,000-$54,999",
                        "$55,000-$64,999",
                        "$65,000-$74,999",
                        "$75,000-$84,999",
                        "More than $85,000"))

# Marital status
vquest <- "Q37"
vdemogr <- "user_marital"
f_facm(vquest,vdemogr,c("Single",
                        "Married",
                        "Living with partner",
                        "Separated",
                        "Divorced",
                        "Widowed"))

# Education
vquest <- "Q38"
vdemogr <- "user_educ"
f_facm(vquest,vdemogr,c("Some high school",
                        "High school graduate",
                        "Two-year degree/some college",
                        "Bachelor's degree",
                        "Master's degree or more"))

# Race (can choose multiple options)
vquest <- "Q39"
vdemogr <- "user_race"
f_na("Q39")
dt_surv[grepl("Hispanic",get(vquest)), user_race_h:=1]
dt_surv[grepl("American Indian",get(vquest)), user_race_i:=1]
dt_surv[grepl("Asian",get(vquest)), user_race_a:=1]
dt_surv[grepl("Hawaiian",get(vquest)), user_race_p:=1]
dt_surv[grepl("Black",get(vquest)), user_race_b:=1]
dt_surv[grepl("White",get(vquest)), user_race_w:=1]

# Create a harmonized race variable
# with White, Black, Hispanic, Asian, Native American, Pacific Islander
lrace <- factor(c(1,2,3,4,5,6))
levels(lrace) <- c("White","Black","Hispanic","Asian","Native American","Pacific Islander")
dt_surv[user_race_w==1 & is.na(user_race_b) & is.na(user_race_h) & is.na(user_race_i) & is.na(user_race_a) & is.na(user_race_p),user_race:=lrace[1]]
dt_surv[is.na(user_race) & user_race_b==1,user_race:=lrace[2]]
dt_surv[is.na(user_race) & user_race_h==1,user_race:=lrace[3]]
dt_surv[is.na(user_race) & user_race_a==1,user_race:=lrace[4]]
dt_surv[is.na(user_race) & user_race_i==1,user_race:=lrace[5]]
dt_surv[is.na(user_race) & user_race_p==1,user_race:=lrace[6]]
print(table(dt_surv$user_race))

# with White, Black, Hispanic, Other
lrace <- factor(c(1,2,3,4))
levels(lrace) <- c("White","Black","Hispanic","Other")
dt_surv[user_race_w==1 & is.na(user_race_b) & is.na(user_race_h) & is.na(user_race_i) & is.na(user_race_a) & is.na(user_race_p),user_race2:=lrace[1]]
dt_surv[is.na(user_race2) & user_race_b==1,user_race2:=lrace[2]]
dt_surv[is.na(user_race2) & user_race_h==1,user_race2:=lrace[3]]
dt_surv[is.na(user_race2) & (user_race_i==1 | user_race_a==1 | user_race_p==1),user_race2:=lrace[4]]
print(table(dt_surv$user_race2))

# Children
vquest <- "Q40"
vdemogr <- "user_child"
f_facm(vquest,vdemogr,c("Yes","No"))

# Children under 18
vquest <- "Q41"
vdemogr <- "user_child18"
print(vdemogr)
f_na(vquest)
dt_surv[, (vdemogr):=factor(get(vquest))]

#-------------------------------------------------------------------------------
# Code questions

# Replace Q18 with standard responses
if (any(grep("1-",dt_surv$Q18))) {
  print("Q18 has to be improved")
  print(table(dt_surv$Q18))
  
  dt_surv[Q18=="1-May",Q18:="May 1"]
  dt_surv[Q18=="1-Jun",Q18:="June 1"]
  dt_surv[Q18=="1-Jul",Q18:="July 1"]
  dt_surv[Q18=="1-Aug",Q18:="August 1"]
  dt_surv[Q18=="1-Sep",Q18:="September 1"]
  dt_surv[Q18=="1-Oct",Q18:="October 1"]
  dt_surv[Q18=="1-Nov",Q18:="November 1"]
  dt_surv[Q18=="1-Dec",Q18:="December 1"]
}

# Automated
lvquest <- c(paste0("Q",c(1:5,7:12,14,17:20,25,27,28,31)),
             paste0("Q15_",c(1:4)),paste0("Q16_",c(1:4)),
             paste0("Q32_",c(1:3)),paste0("Q33_",c(1:3)))
for (vquest in lvquest) {
  vdemogr <- paste0("f",vquest)
  dti_val <- dt_val[var==vquest]
  # lvalue <- dti_val$lab
  f_facm(vquest,vdemogr,dti_val$lab)
}

# Multiple choices

vin <- "Q13"
vout <- "fQ13"
ival <- 1

f_multi <- function(vin,vout,lval=c()) {
  dti_val <- dt_val[var==vin]
  
  # Create a temp variable to avoid modification or original variable
  dt_surv[get(vin)=="", (vin):=NA]
  dt_surv[, vintmp:=get(vin)]
  if (length(lval)==0) {
    lval <- dti_val$val
  }
  for (ival in lval) {
    sval <- dti_val[val==ival,lab]
    dt_surv[!is.na(get(vin)), (paste0(vout,"_",ival)):=0]
    dt_surv[grepl(sval,vintmp,fixed = T), (paste0(vout,"_",ival)):=1]
    dt_surv[, vintmp:=sub(sval,"",vintmp,fixed=T)]
  }
  
  # Check if any unmatched
  if (nrow(dt_surv[grepl("[^, ]",trimws(vintmp))])!=0) {
    print("ERROR: REMAINING STRINGS")
    print(unique(dt_surv[grepl("[^, ]",trimws(vintmp)), c("vintmp")]))
  }
  dt_surv[, vintmp:=NULL]
  
}

f_multi("Q13","fQ13")
f_multi("Q23","fQ23")
f_multi("Q24","fQ24")
f_multi("Q26","fQ26")
f_multi("Q29","fQ29")

# Q30: Needs to assign some codes first because of special patterns
f_multi("Q30","fQ30",c(8,9,2,10,1,3:7))

dt_val[, var_new:=ifelse(var%in%c("Q13","Q23","Q24","Q26","Q29","Q30"), paste0("f",var,"_",val), paste0("f",var))]
lvarnf <- unique(dt_val$var_new)
lvarnf <- lvarnf[lvarnf %in% names(dt_surv)]

#-------------------------------------------------------------------------------
# Deal with users with multiple rows

# Add a flag variable for duplicates
dt_flag <- dt_surv[,.(ndup=.N),by=c("userid")]
print(dt_flag[ndup>1,])
dt_surv[userid %in% dt_flag[ndup>1,userid], row_dup:=1]

# Pick the later one for duplicates, except for 1855438 (the later one does not have demogr)
dt_view <- dt_surv[userid %in% dt_flag[ndup>1,userid]]
dt_view[,time_end_max:=max(time_end), by=c("userid")]
dt_view <- dt_view[(time_end==time_end_max & userid!="1855438") | (rowid==764)]
dt_surv[is.na(row_dup), row_sel:=1]
dt_surv[!is.na(row_dup) & rowid %in% dt_view[,rowid], row_sel:=1]

if (nrow(dt_surv[row_sel==1,]) != nrow(unique(dt_surv[row_sel==1,c("userid")]))) {
  print("ERROR: ISSUE REMAINS")
}

#-------------------------------------------------------------------------------
# Export cleaned data

# Add wage as a demographics
dt_surv[, user_wage:=fQ14]

setorderv(dt_surv,c("rowid"))

setcolorder(dt_surv, c("userid","linksrc","date_start","date_end","time_start","time_end","rowid","row_sel",
                       "user_gender","user_age","user_income","user_marital","user_educ","user_wage",
                       "user_race_h","user_race_i","user_race_a","user_race_p","user_race_b","user_race_w","user_race",
                       "user_child","user_child18",lvarnf,lvorder))

saveRDS(dt_surv, paste0(dir_clean, "survey_worker/homebase_worker_survey_raw_",iupdate,".rds"))
haven::write_dta(dt_surv, paste0(dir_clean, "survey_worker/homebase_worker_survey_raw_",iupdate,".dta"))
try(saveRDS(dt_surv, paste0(dir_projws, "Data/Data_clean/homebase_worker_survey_raw_",iupdate,".rds")))
try(haven::write_dta(dt_surv, paste0(dir_projws, "Data/Data_clean/homebase_worker_survey_raw_",iupdate,".dta")))

}

print(paste("Ended at", Sys.time()))
# End of R script
