# cd /accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_data/
# sbatch --cpus-per-task=8 --mem=8g --export=ALL,script=sg_1_visit_sum_stats.R --partition=high zb_r.sh

# Capture how the script was invoked: `sinfo` is the full command line,
# `args` only the user-supplied trailing arguments.
sinfo <- commandArgs(trailingOnly = FALSE)
args <- commandArgs(trailingOnly = TRUE)

print(args)

# Current user name: the USERNAME environment variable when it is non-empty,
# otherwise USER.
sys_user <- if (Sys.getenv("USERNAME") != "") {
  Sys.getenv("USERNAME")
} else {
  Sys.getenv("USER")
}
# Name of the SLURM cluster ("" when the variable is not set).
sys_cluster <- Sys.getenv("SLURM_CLUSTER_NAME")
# 0 when running interactively, 1 otherwise.
sys_batch <- if (interactive()) 0 else 1

# Locate the folder holding the shared project scripts for the current user.
if (sys_user == "homebase") { # For replication: Replace "homebase" with current username
  dir_func <- "/accounts/projects/jrothst/homebase/data/bpea_replication_archive/code_function/"
  # For replication: Replace with the path of the "code_function/" folder
}

# Fail with an actionable message instead of "object 'dir_func' not found"
# when no branch above matched the current user.
if (!exists("dir_func")) {
  stop(
    "dir_func is not set for user '", sys_user, "'. ",
    "Edit the user check above and point dir_func to the \"code_function/\" folder.",
    call. = FALSE
  )
}

# NOTE(review): 0_directory.R presumably defines dir_proj, dir_raw and
# dir_clean, which are used further down but never assigned in this file.
source(paste0(dir_func, "0_directory.R"))
print(paste("Started at", Sys.time()))

library(data.table)

# Log the R version and loaded packages alongside the job output.
print(sessionInfo())

# Work from the project root.
# NOTE(review): dir_proj is not assigned in this file; presumably it comes
# from the sourced 0_directory.R.
setwd(dir_proj)

#===============================================================================

# SafeGraph sub-folders of the raw and clean data directories.
dir_raws <- paste0(dir_raw, "safegraph/")
dir_cleans <- paste0(dir_clean, "safegraph/")

# NOTE(review): f_wk.R presumably provides f_wk() and f_wkdate(), which are
# called below but not defined in this file.
source(paste0(dir_proj, "0_function/f_wk.R"), verbose = TRUE)

#===============================================================================

#-------------------------------------------------------------------------------
# Normalization stats and Meta data

# Find every normalization-stats file under dir_raws; the file name uses
# either a hyphen or an underscore.
lcsv <- list.files(
  path = dir_raws, recursive = TRUE,
  pattern = "normalization-stats|normalization_stats"
)
# Overlapping week when format changed
lcsv <- lcsv[!grepl("normalization-stats/2020-06-15-normalization-stats.csv", lcsv, fixed = TRUE)]

# Read each file together with its sibling release-metadata file, then stack
# everything once at the end instead of re-copying the table on every pass.
l_norm <- lapply(lcsv, function(icsv) {
  print(icsv)

  # Normalization stats
  dti_norm <- fread(paste0(dir_raws, icsv))

  # Metadata: same relative path with "normalization stats" swapped for
  # "release metadata" (underscored under weekly_patterns_delivery,
  # hyphenated otherwise).
  if (grepl("weekly_patterns_delivery", icsv)) {
    meta_file <- gsub("normalization_stats", "release_metadata", icsv)
  } else {
    meta_file <- gsub("normalization-stats", "release-metadata", icsv)
  }
  dti_meta <- fread(paste0(dir_raws, meta_file))

  # Tag the rows with the core places version, rearranging "NN-NNNN" into
  # "NNNNNN" (presumably MM-YYYY -> YYYYMM; confirm against the metadata).
  dti_norm[, poi_ver := sub(
    "(\\d{2})-(\\d{4})", "\\2\\1",
    dti_meta[metadata_description == "core_places_version_used", metadata_value]
  )]
  dti_norm
})
# use.names = TRUE matches the default of rbind() on data.tables.
dt_norm <- rbindlist(l_norm, use.names = TRUE)

# Build a Date from the year/month/day columns, derive the week variables
# with the project helpers, then drop the raw date parts.
dt_norm[, date := as.Date(paste0(year, "-", month, "-", day))]
dt_norm[, week := f_wk(date)]
dt_norm[, weekd := f_wkdate(week)]
dt_norm[, c("year", "month", "day") := NULL]
setcolorder(dt_norm, c("date", "week", "weekd"))
setorderv(dt_norm, c("date"))

# Check duplicate dates (reported in the log only; the file is still saved)
dt_dup <- dt_norm[, .(nrow = .N), by = c("date")]
if (nrow(dt_dup[nrow != 1]) != 0) {
  print("ERROR: DUPLICATED DATES")
  print(dt_dup[nrow != 1])
}

saveRDS(dt_norm, paste0(dir_cleans, "safegraph_visit_normalization_stats.rds"))

#-------------------------------------------------------------------------------
# Home summary

# Find every file under dir_raws whose name contains "home".
lcsv <- list.files(path = dir_raws, recursive = TRUE, pattern = "home")
# Overlapping week when format changed
lcsv <- lcsv[!grepl("home-summary-file/2020-06-15-home-summary-file.csv", lcsv, fixed = TRUE)]

# Read every file with all columns as character and stack them in a single
# pass instead of growing the table with rbind() inside a loop.
# use.names = TRUE matches the default of rbind() on data.tables.
dt_home <- rbindlist(
  lapply(lcsv, function(icsv) {
    print(icsv)
    fread(paste0(dir_raws, icsv), colClasses = "character")
  }),
  use.names = TRUE
)

# Keep only the first 10 characters (the date part) of the range timestamps
# and convert the device count to integer.
dt_home[, date_range_start := as.Date(substr(date_range_start, 1, 10))]
dt_home[, date_range_end := as.Date(substr(date_range_end, 1, 10))]
dt_home[, number_devices_residing := as.integer(number_devices_residing)]
setcolorder(dt_home, c("date_range_start", "date_range_end"))
setorderv(dt_home, c("date_range_start", "date_range_end"))

# Check duplicate dates within a census block group (reported in the log
# only; the file is still saved)
dt_dup <- dt_home[, .(nrow = .N), by = c("date_range_start", "date_range_end", "census_block_group")]
if (nrow(dt_dup[nrow != 1]) != 0) {
  print("ERROR: DUPLICATED DATES")
  print(dt_dup[nrow != 1])
}

saveRDS(dt_home, paste0(dir_cleans, "safegraph_visit_home_summary.rds"))

print(paste("Ended at", Sys.time()))
# End of R script
