


# source(paste0(dir_func, "data_0_readhb.R"))

# Read a cleaned Homebase extract, standardise column names, and apply the
# baseline row filters (hours range plus a geography filter).
#
# Arguments:
#   sdata   = suffix of data file to use. "all" (case-insensitive) reads
#             homebase_raw.rds; anything else reads homebase_raw_<sdata>.rds.
#   max_hrs = max daily hours allowed for each row. Rows are kept only when
#             0 < hours <= max_hrs.
#   geo_rep = whether to replace the Homebase geographics by the improved ones
#             stored in cw_geo_improved.rds.
#             TRUE : original geographic columns are kept with an "hb" suffix,
#                    the improved ones are merged in by ziphb, and rows
#                    without an improved county FIPS are dropped.
#             FALSE: original geographic columns are used as-is and rows whose
#                    state is "NOT USA" or "UNCLASSIFIED" are dropped.
#
# Returns the filtered table with the renamed columns
# (firmid, estid, userid, date, hours, hwage, twage, and ownerid if present).
#
# Relies on the global `dir_clean` (directory prefix of the .rds files) and on
# data.table being attached (setnames and the DT[i, ] syntax are used).
# NOTE(review): the stored objects are assumed to be data.tables -- confirm.
f_readhb <- function(sdata = "2020", max_hrs = 20, geo_rep = TRUE) {
  # Validate the flag before any file I/O. The previous `geo_rep==F` /
  # `geo_rep==T` chain had no final else, so a value matching neither branch
  # silently returned the *unfiltered* raw data.
  use_geo <- as.logical(geo_rep)
  if (length(use_geo) != 1L || is.na(use_geo)) {
    stop("geo_rep must be a single TRUE or FALSE value", call. = FALSE)
  }

  # Pick the input file from the suffix
  raw_file <- if (toupper(sdata) == "ALL") {
    "homebase_raw.rds"
  } else {
    paste0("homebase_raw_", sdata, ".rds")
  }
  dt_raw <- readRDS(paste0(dir_clean, raw_file))

  # Rename (by reference; dt_raw was just read, so nothing else is affected)
  setnames(
    dt_raw,
    c("company_id", "location_id", "user_id",
      "event_date",
      "hours_worked", "avg_hourly_wage_rate", "total_wages_earned"),
    c("firmid", "estid", "userid",
      "date",
      "hours", "hwage", "twage")
  )

  # owner_id is only renamed when the extract carries it
  if ("owner_id" %in% names(dt_raw)) {
    setnames(dt_raw, "owner_id", "ownerid")
  }

  # Subset hours range and states
  if (!use_geo) {
    # Use original geographical variables
    dt_raw <- dt_raw[
      hours > 0 & hours <= max_hrs &
        !(toupper(as.character(st)) %in% c("NOT USA", "UNCLASSIFIED")),
    ]
  } else {
    # Use improved geographical variables; crosswalk rows with an empty
    # Homebase zip cannot be matched and are discarded up front.
    dt_geo <- readRDS(paste0(dir_clean, "cw_geo_improved.rds"))[ziphb != "", ]

    # Count missing FIPS with sum() instead of building a row subset
    print(paste0("Rows without original county FIPS: ",
                 dt_raw[, sum(is.na(fips))]))

    # Move the original geographics out of the way (suffix "hb") so the
    # improved columns can take over the plain names after the merge.
    lvren <- c("zip", "st", "st2", "stfips", "stssa", "msa", "msa2", "msac", "fips")
    setnames(dt_raw, lvren, paste0(lvren, "hb"))

    geo_cols <- c("ziphb", "zip", "st", "st2", "stfips", "stssa", "msa", "fips")
    dt_raw <- merge(dt_raw, dt_geo[, geo_cols, with = FALSE],
                    by = "ziphb", all.x = TRUE)

    print(paste0("Rows without improved county FIPS: ",
                 dt_raw[, sum(is.na(fips))]))

    dt_raw <- dt_raw[hours > 0 & hours <= max_hrs & !is.na(fips), ]
  }

  dt_raw
}