cd "/Users//`=c(username)'/Dropbox/coronavirus/Code/BPEA_replication_code"
*******************************************************************************
*  NPI_setup_v5.do                     JS 4/18/20 - 7/6/20
*
*  Merger of NPI_construct_v9 and NPI_setup_v2
*   Structure is for BPEA paper (not NBER WP) with NPIs estimated by TV phi function
*   With this version this becomes the master data read file which also creates all 
*   matrices needed for estimation and simulation 
*   Note somewhat complicated data structure: 
*     series by industry with no time dimension are obsno 1-68
*     time series with no industry dimension are by date (21Feb2020 = first obs)
*     Series with industry and time dimensions are denoted by industry number suffix: lshock`i'
*  _v5: final draft of BPEA paper (cleaned)
*******************************************************************************
*
* set paths
global dpath "data-in"

global nage = 5
* Contact parameters
global inf_kids2adults = 0.27 // fraction of beta for transmission, kids to adults (Kong lit review)
global inf_adults2kids = 0.44 // fraction of beta for transmission, adults to kids (Kong lit review)
global o1ns = 0.3 // fraction of <20 contacts made while not at school under normal conditions
global hHH = 0.8 // fraction of home contacts that are household members

global today: dis %td date(c(current_date),"DMY")

* -----------------------------------------------------
*    A. Read in data
* -----------------------------------------------------
*     A1. Industry data 
* -----------------------------------------------------
* Employment count by sector (total) - aligns with CPS
import delimited using "$dpath/sector_empshare_bea.csv", case(lower) varn(1) clear
 keep in 1/68
 gen ncode = _n
 save tmp1.dta, replace

* Age share breakdown by sector from ACS (note: counts don't add to total CPS count, use these for shares)
import delimited using "$dpath/sector_age_wfh_profiles_bea_summary_051820.csv", case(lower) varn(1) clear 
 keep in 1/68
 save tmp2.dta, replace

* Fraction able to work from home (wfw_share - Dingel-Nieman) and fraction with high personal proximity at work (pp_share - Mongey-Weinberg)
import delimited using "$dpath/sector_wfh_profiles_bea_summary_051820.csv", case(lower) varn(1) clear 
 gen wfh_share = wfh_share_dn // preferred measure since they checked by hand per Stephanie
 label var wfh_share "Share of workers who can work at home (Dingel-Nieman)"
 label var pp_share "Share of workers with high personal proximity at work (Mongey-Weinberg)"
 save tmp3.dta, replace

* Hours shock from CPS, Feb (benchmark)
import excel using "$dpath/LaborShock_EPImodel_July2020.xlsx", case(lower) firstrow clear // Feb, March, April, May, July
 rename epi_code iocode
 drop if febtomarch_hoursshock ==.
 save tmp4.dta, replace
 
* Labor income share data
import excel using "$dpath/labor_income_share.xlsx", firstrow case(lower) clear
 save tmp5.dta, replace

* -----------------------------------------------------
*     A2. Time series data
* -----------------------------------------------------
* Google mobility data and school closure data
* update: https://www.google.com/covid19/mobility/
use "$dpath/Google mobility indicators_US", clear
 format date %td
 tsset date
 gen gmio = (retail_and_recreation_percent_ch + transit_stations_percent_change_ + grocery_and_pharmacy_percent_cha)/3
 gen gmiw = workplaces_percent_change_from_b 
  qui replace gmiw = gmiw[_n+1] if tin(25May2020,25May2020) // smooth over Memorial day spike
 foreach vv in "gmio" "gmiw" {
  qui replace `vv' = 1 + `vv'/100
  tssmooth ma `vv'_sm = `vv', window(3 1 3)
  su `vv'_sm if tin(21Feb2020,29Feb2020)
  qui replace `vv'_sm = `vv'_sm/r(mean)
  qui replace `vv' = `vv'/r(mean)
 }
 keep if tin(21Feb2020,)
 global t0_gmi = string(day(date[_N-3])) + "/" + string(month(date[_N-3])) + "/" + string(year(date[_N-3]))
 keep if tin(,$t0_gmi)
 *
 * School closure data (Source: \Data\Clean\Epi|SchoolClosures.xlsx has data and documentation)
 gen school_frac = 1
  qui replace school_frac = .6548 if tin(16Mar2020,16Mar2020)
  qui replace school_frac = .4880 if tin(17Mar2020,17Mar2020)
  qui replace school_frac = .3255 if tin(18Mar2020,18Mar2020)
  qui replace school_frac = .0707 if tin(19Mar2020,19Mar2020)
  qui replace school_frac = .0301 if tin(20Mar2020,22Mar2020)
  qui replace school_frac = .0059 if tin(23Mar2020,23Mar2020)
  qui replace school_frac = 0 if tin(24Mar2020,)
*
 save tmp_t1, replace

 * -------- JHU deaths data ----------------
use "$dpath/JHU_data_us", clear
gen deaths_thou = deaths/1000  // Units of SEIRD output are thousands
gen confirmed_thou = confirmed/1000    // Units of SEIRD output are thousands
label var deaths_thou "Deaths (actual, thous)"
 label var confirmed_thou "Confirmed cases (actual, thous)"
format date %td
tsset date
keep if tin(21Feb2020,)
cap drop deaths_weekly
gen deaths_weekly = deaths_thou - L7.deaths_thou
   label var deaths_weekly "Weekly deaths, actual (thou)"
gen deaths_daily = deaths_thou - L.deaths_thou
   label var deaths_daily "Daily deaths, actual (thou)"
list date deaths deaths_daily deaths_weekly
gen ldeaths_weekly = ln(deaths_weekly)
global t0_deaths = string(day(date[_N])) + "/" + string(month(date[_N])) + "/" + string(year(date[_N]))
save tmp_t2.dta, replace
 
* -----------------------------------------------------
*     A3. Merge
* -----------------------------------------------------
* First merge industry data by iocode
use tmp1, clear
 erase tmp1.dta
forvalues i = 2/5 {
 merge 1:1 iocode using tmp`i'
 drop _merge
 erase tmp`i'.dta
}
sort ncode
list iocode iocode_name
list ncode iocode labor_income_share
* Merge time series data by date
gen date = d(21Feb2020)+_n-1
sort date
format date %td
tsset date
tsappend, last(01Jan2021) tsfmt(td)
forvalues i = 1/2 {
 merge 1:1 date using tmp_t`i'
 drop _merge
 erase tmp_t`i'.dta
}
gen day = day(date)
gen month = month(date)
gen quarter = quarter(date)
gen year = year(date)
gen yrmo = 100*(year-2000) + month
gen yrq = 10*(year-2000) + quarter
global ndates_tot = _N

* ----------------------------------------------------------------------------- 
*    B. Renaming, recoding, create some variables, interpolation, normalizations 
* ----------------------------------------------------------------------------- 
gen obsno = _n
local nind = 66 // industries in IO model = covered industries

* Sectoral Shock time series
*   Sectoral hours shocks are pinned down on 12th of month from Establishment Survey data, interpolated using excess new claims
foreach y of varlist febtomarch_hoursshock marchtoapril_hoursshock apriltomay_hoursshock maytojune_hoursshock {
 qui replace `y' = 0 if iocode=="111CA"
 qui replace `y' = 0 in 67/68
}
gen shock_Marchhrs = (1+febtomarch_hoursshock)
gen shock_Aprilhrs = shock_Marchhrs*(1+marchtoapril_hoursshock)
gen shock_Mayhrs = shock_Aprilhrs*(1+apriltomay_hoursshock)
gen shock_Junhrs = shock_Mayhrs*(1+maytojune_hoursshock)
cap drop y99
gen y99 = .
forvalues i = 1/66 {
 qui replace y99 = .
 qui replace y99 = shock_Marchhrs[`i'] if tin(12Mar2020,12Mar2020)
 qui replace y99 = shock_Aprilhrs[`i'] if tin(12Apr2020,12Apr2020)
 qui replace y99 = shock_Mayhrs[`i'] if tin(12May2020,12May2020)
 qui replace y99 = shock_Junhrs[`i'] if tin(12Jun2020,12Jun2020)
 qui ipolate y99 gmiw_sm, gen(labor_shock`i') e
 qui su labor_shock`i' if tin($t0_gmi,$t0_gmi)
 qui replace labor_shock`i' = r(mean) if tin($t0_gmi,)
 qui replace labor_shock`i' = 1 if tin(,08Mar2020)
}
gen labor_shock67 = 1
gen labor_shock68 = 1

* population shares of covered and noncovered workers by age
*   CPS employment = 158.759m Feb 2020
*   CPS LF = 164.546m Feb 2020
*   Establishment total priv emp = 129.740m Feb 2020
sca N = 328.2e6 // US population
mat Nvec = (0.256\0.334\0.261\0.086\0.063)*N
mat Nvecinv = J(5,1,0)
forvalues a = 1/5 {
 mat Nvecinv[`a',1] = 1/Nvec[`a',1]
}

* Baseline employment shares by age/industry.
* Note need to make adjustment, employment counts by age don't add to total employment count because of BEA/BLS counting discrepency
*  Totals will be BLS values 
rename emp_count emp_count_tot
gen epop = emp_count_tot/N // baseline employment-population ratio by industry
gen emp_count_tot_unadj = emp_count1 + emp_count2 + emp_count3 + emp_count4 + emp_count5
forvalues a = 1/5 {
 rename emp_count`a' emp_count`a'_unadj
 gen emp_count`a' = emp_count_tot*(emp_count`a'_unadj/emp_count_tot_unadj)  // estimated employment by industry by age, adjusted for BEA/BLS counting discrepancy
 gen epop`a' = emp_count`a'/Nvec[`a',1]  // employment-population ratio WITHIN each age group (fraction of all 30 yr olds who work in given industry)
}

* Relative personal proximity measure - normalize so that wtd averages by age group = 1
forvalues a = 1/5 {
 cap drop x99
 qui gen x99 = pp_share*epop`a'
 qui su x99
  local wm = r(mean)
 qui su epop`a'
 qui gen pp_rel`a' = pp_share*r(mean)/(`wm'+1e-20)
}

* wfh time series
*    Bick, Blandin, and Mertens (June 2020): in Feb 2020 8.2% worked from home, in May 35.2% worked from home. 
mkmat wfh_share in 1/68, mat(wfh_share)
mkmat emp_count_tot in 1/68, mat(emp_count_tot)
mat wfh_count = wfh_share'*emp_count_tot
mat Ltot = emp_count_tot'*J(rowsof(emp_count_tot),1,1)
 global Ltot = Ltot[1,1]
global wfh_share_max = wfh_count[1,1]/Ltot[1,1]
dis wfh_count[1,1] "  " Ltot[1,1] "  " $wfh_share_max
gen w99 = .
 qui replace w99 = 0.082/$wfh_share_max if tin(21Feb2020,21Feb2020) // BBM estimate
 qui replace w99 = 0.352/$wfh_share_max if tin(21May2020,21May2020) // BBM estimate
ipolate w99 gmiw_sm, gen(wfh_frac) e

***************************************************************************
*          NPI weights
***************************************************************************
* Construct NPI time series weightws by date
*  There are no good data on fraction of home contacts that are non-HH members. Dorelien et al (2020) Fig 2, 3 suggestion ~85% are HH members (ATUS) 
*  However, their mean number of total home contacts is ~2 for 20-54 while POLYMOD is 3.7, so 2/3.7 = 0.54. Dorelian et al is 
*  especially low at young ages which suggests they are missing play dates from their ATUS calculations.
*  Absent good data, assume provisionally that 80% of home contacts are HH members. For the non-HH 20%, assume proportionality to gmio, scaled

gen school_fac = $o1ns + (1-$o1ns)*school_frac // Other activity contact factor for <20, non-school + school
su gmio_sm, d
 gen gmio_sm_scaled = (gmio_sm-r(min))/(r(max)-r(min))

forvalues a = 1/5 {
  gen cmox`a' = gmio_sm
  gen cmhx`a' = 1
  gen cmwx`a' = 0
  forvalues i = 1/68 {
   qui replace cmwx`a' = cmwx`a' + labor_shock`i'*(1-wfh_frac*wfh_share[`i'])*epop`a'[`i']*pp_rel`a'[`i']
  }
 }
qui replace cmox1 = gmio_sm*school_fac

list epop4 epop5 pp_rel4 pp_rel5 cmwx4 cmwx5 in 1/70

desc
cap drop _merge
save "data\NPI_industry_data", replace

* ------ Misc matrices, odss & ends --------
mkmat deaths, mat(deaths)
mkmat obsno, mat(obsno)
mkmat emp_count1 emp_count2 emp_count3 emp_count4 emp_count5 in 1/68, mat(emp_count)
mat emp_count_tot = emp_count*J(5,1,1)
mat Lvec = emp_count'*J(68,1,1)
mat Ltot = emp_count_tot'*J(rowsof(emp_count_tot),1,1)
global Ltot = Ltot[1,1]
gen psi = labor_income_share
mkmat psi in 1/66, mat(psi)

* labor shocks by sector: historical and maximum until virus
mkmat labor_shock1-labor_shock68, mat(lshock_npi)
mkmat epop in 1/68, mat(epop)
mkmat epop1 epop2 epop3 epop4 epop5 in 1/68, mat(epop_age)
mat lshare_npi = hadamard(lshock_npi,J(rowsof(lshock_npi),1,1)*epop')
 mat lshare_npi = lshare_npi[1...,1..66]
mat lshockmax = J(68,1,0)

forvalues i = 1/68 {
    mat lshockmax[`i',1] = 1 - 0.03*(lshock_npi[113,`i']<0.95) // Sectors for which employment was down by more than 10% as of June empl report are capped at -3%
}

* ---------- SEIQRD parameters (global) -------------------
* IFR profile interpolated from Salje Science May 2020 https://science.sciencemag.org/content/early/2020/05/12/science.abc3517 interpolated 
*  Interpolation from log-linear fit (Salje used log-linear) with US population weights, calculated in ifr_age_population_adj.xlsx
if $nage==5 {
 mat IFR00 =  (0.001    \ 0.020    \ 0.280      \ 1.349   \ 7.177)/100 // Salje Science May 2020 (adjusted for our age groups)
 mat a98 = IFR00'*Nvec/N
 mat IFR00 = (1.2/a98[1,1])*IFR00 // factor of 1.41 to adjust for typical simulated infection rate age distn
 
 * fractional distribution of initially infected
 mat I00 = (1.75 \ 10.15 \ 12.25 \ 4.55 \ 6.30)
 mat a99 = I00'*J(5,1,1)
 mat I00 = I00/a99[1,1]
}
if $nage==1 {
 mat IFR00 = 1
 mat I00 = 1
 forvalues i = 1/$numnpis {
  mat cnormalized = hadamard(c_`i',Nvec*Nvecinv')
  mat eigenvalues r c = cnormalized
  mat c_`i' = r[1,1]
 }
 mat Nvec = N
 mat Nvecinv = 1/N
}

* ----------------------------------------------------------------------------- 
*  Read in POLYMOD contact matrices
*    compute conditional contact matrices
*    reweight for US 
* ----------------------------------------------------------------------------- 
preserve 
 foreach cc in "home_mean" "other_mean" "work_mean" {
  import delimited using "$dpath/contact_`cc'.csv", rowrange(2) colrange(2) clear
  destring v*, replace
  mkmat v*, mat(c)
  mat c`cc' = c
  mat list c`cc'
 }
 * sampling fractions by age in polymod
 import delimited using "$dpath/fractions_by_category.csv", rowrange(2) colrange(2) clear
 mkmat fraction_work fraction_home fraction_other, mat(fpolymod)
 mat list fpolymod
restore

mat list chome_mean
mat list cother_mean
mat list fpolymod

* Create normalize mean polymod contact matrices so they are conditional on being in that state
local j = 0
foreach cc in "work" "home" "other" {
 local j = `j'+1
 mat cn`cc'_mean = J(5,5,0)
 forvalue i = 1/5 {
   mat cn`cc'_mean[`i',1] = c`cc'_mean[`i',1...]/fpolymod[`i',`j']
 }
 mat cn`cc'_tot = cn`cc'_mean*J(5,1,1) // Mean contacts by age by activity
 mat list cn`cc'_tot 
}

* Reweighted mean work polymod matrix, reweighted for baseline US population shares by age
mat lf99 = J(5,1,0)
forvalues i = 1/5 {
 qui su epop`i'
 mat lf99[`i',1] = r(N)*r(mean)
}
mat cwork_USwt = hadamard(lf99*J(1,5,1),cnwork_mean)
mat list cwork_USwt
* Pull out rows of other contact matrix for subsequent individual manipulation 
local m "cother_mean"
forvalues i = 1/5 {
 mat `m'`i' = J(5,5,0)
 mat `m'`i'[`i',1] = `m'[`i',1...]
}
mat `m'23 = `m'2 + `m'3
mat `m'24 = `m'2 + `m'3 + `m'4
mat `m'25 = `m'2 + `m'3 + `m'4 + `m'5
*

* Adjustment matrix for reduced infectiousness of children
*   Note will want to vary this for sensitivity analysis. See Kong-Li memo 060620
mat cmat_adjust_kids = J(5,5,1)
mat cmat_adjust_kids[1,2] = J(1,4,$inf_adults2kids)
mat cmat_adjust_kids[2,1] = J(4,1,$inf_kids2adults)
mat cmat_adjust_kids[1,1] = ($inf_kids2adults+$inf_adults2kids)/2

* Create contact summary measure (maxReval) for use in nage=1 model, no SD measures (phifac=1)
local T = _N
qui gen maxReval_un = .
qui gen maxReval = .
forvalues t = 1/`T' {
 if (cmhx1[`t']~=.)*(cmox1[`t']~=.)*(cmwx1[`t']~=.) {
  mat cwork = J(5,5,0)
  mat cother = J(5,5,0)
  mat chome = J(5,5,0)
  forvalues a = 1/5 {
   mat cwork[`a',1]  = cmwx`a'[`t']*cnwork_mean[`a',1...] 
   mat cother[`a',1] = cmox`a'[`t']*cother_mean[`a',1...]
   mat chome[`a',1]  = cmhx`a'[`t']*chome_mean[`a',1...]
  }
  mat ctot = chome + cother + cwork
  if `t'==11 {
      mat ctot0 = ctot // basline contact matrix is Monday March 2
  }
  if `t'==55 {
      mat ctot_April15 = ctot // contact matrix on April 15
  }
  mat cmat = hadamard(cmat_adjust_kids,ctot)
  mat cnormalized = hadamard(cmat,Nvec*Nvecinv')
  mat eigenvalues r c = cnormalized
  qui replace maxReval = r[1,1] in `t'/`T'
  mat cnormalized = hadamard(ctot,Nvec*Nvecinv')
  mat eigenvalues r c = cnormalized
  qui replace maxReval_un = r[1,1] in `t'/`T'
 }
}
*tsline maxReval maxReval_un in 1/100
su maxReval if tin(22Feb2020,03Mar2020)
global maxReval0 = r(mean)
mkmat maxReval, mat(maxReval)
mat maxReval_input =  J(`T',1,$maxReval0) // required initialization
dis "Baseline maxReval = " $maxReval0

* ----------------------------------------------------------------------------- 
*       Derivatives of maximum eigenvalue for adding workers to sector i
*         Evaluate derivatives for baseline contact matrix
* ----------------------------------------------------------------------------- 
sca numericalderivfac = 1000
gen mderiv = .
mat cmat = hadamard(cmat_adjust_kids,ctot0)
mat cnormalized = hadamard(cmat,Nvec*Nvecinv')
mat eigenvalues r c = cnormalized
local maxrevalmc_0 = r[1,1]
forvalues a = 1/5 {
  gen depop`a' = (numericalderivfac/Nvec[`a',1])*(emp_count`a'/emp_count_tot)  // increment in epop by age for numerical derivative 
}
forvalues i = 1/66 {
 mat dcwork = J(5,5,0)
 forvalues a = 1/5 {
   mat dcwork[`a',1]  = depop`a'[`i']*pp_rel`a'[`i']*cnwork_mean[`a',1...]
 }
 mat ctot1 = ctot0 + dcwork
 mat cmat = hadamard(cmat_adjust_kids,ctot1)
 mat cnormalized = hadamard(cmat,Nvec*Nvecinv')
 mat eigenvalues r c = cnormalized
 local maxrevalmc_1 = r[1,1]
 qui replace mderiv = (`maxrevalmc_1' - `maxrevalmc_0')/numericalderivfac in `i'/`i'
}

* ----------------------------------------------------------------------------- 
*          Create industry contribution index (theta) 
* ----------------------------------------------------------------------------- 
gen theta_gdp_raw = (psi/emp_count_tot)/((0.05539/0.214)*mderiv)
gen psi_per_worker = psi/emp_count_tot
winsor theta_gdp_raw, gen(theta_gdp) h(3)
 su theta_gdp in 1/66, d
 replace theta_gdp = (theta_gdp - r(mean))/r(sd)
 qui replace theta_gdp = 0 in 67/68
 mkmat theta_gdp in 1/68, mat(theta_gdp)
 *
preserve
 keep in 1/68
 keep ncode iocode iocode_name psi mderiv emp_count_tot psi_per_worker theta_gdp_raw theta_gdp
 save "data\theta_gdp", replace
 sort theta_gdp
 export excel iocode ncode iocode_name psi emp_count_tot psi_per_worker mderiv theta_gdp_raw theta_gdp using "data\theta_gdp.xlsx", replace
restore
list theta_gdp in 1/70

* ----------------------------------------------------------------------------- 
*         Initialization odds and ends 
* ----------------------------------------------------------------------------- 
* Initialize removal to quarantine spike to off, overwite for removal experiment
global tremove = "20May2020"
 cap drop e98
 gen e98 = tin(,$tremove)
 qui su e98
  global ndate_remove = r(sum)
global nremove = 0
gen Q2avg = tin(01Apr2020,30Jun2020)
gen Q3avg = tin(01Jul2020,30Sep2020)
gen Q4avg = tin(01Oct2020,31Dec2020)
forvalues q = 2/4 {
	mkmat Q`q'avg, mat(Q`q'avg)
	qui su Q`q'avg, d
	mat Q`q'avg = Q`q'avg/r(sum)
}
format date %td
sort date

/* Figure: Contact matrix heat maps 
* example contact matrix with 7% unemployment rate (non-nuanced reopening), partial sequestering 75+, ages 64+ don't go to work
mat coth_fac_ex = J(5,5,1)
mat coth_fac_ex[1,1] = 0.3*J(1,4,1)
mat coth_fac_ex[1,5] = 0.25*J(5,1,1)
mat coth_fac_ex[5,1] = 0.25*J(1,5,1)
mat ctot_ex = hadamard((J(4,1,1)\0.25)*J(1,5,1),chome_mean) + hadamard(coth_fac_ex,cother_mean) + hadamard((1\1\1\0\0)*J(1,5,1),cwork_USwt)
foreach cmat in "ctot0" "ctot_April15" "ctot_ex" { 
 cap drop contacta contactb cc1 cc2 cc3 cc4 cc5 ctotplot
 svmat `cmat', names(cc)
 gen contacta = .
 gen contactb = .
 gen ctotplot = .
 local j = 0
 forvalues a = 1/5 { 
	forvalues b = 1/5 {
		local j = `j'+1
		qui replace contacta = `a' in `j'/`j'
		qui replace contactb = `b' in `j'/`j'
		qui replace ctotplot = cc`a'[`b'] in `j'/`j'
	}
 }
 twoway contour ctotplot contacta contactb in 1/25, heatmap ecolor(cranberry) crule(int) ccuts(0(0.2)9) xtitle("") ytitle("")
 save "figs/cmat_heatmap_`cmat'.png", replace
 preserve
  keep ctotplot contacta contactb 
  keep in 1/25
  save "figs/cmat_heatmap_`cmat'.dta", replace
 restore
}
*/
*&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
