********************************************************************************
* Program:  Calculate numbers to replicate Erik Hurst's ACS table with CPS data

********************************************************************************

* Working directory: the relative paths used further down resolve against it
cd "/home/buckman/Labor_Disparities/Baseline/"


* Load the matched CPS file
use "/home/buckman/CPS_files/CPS_Match.dta", clear

* Sample window: 1990m1 onward (ernwgt starts in 1982)
drop if ym < tm(1990m1)

***** ONLY USE ORG DATA *************************
* ORG restriction: keep month-in-sample 4 and 8 only
keep if inlist(mis, 4, 8)

* Earnings weight must be present and non-negative
drop if missing(ernwgt) | ernwgt < 0

* Self-employed are excluded; the class variable and its codes differ by period:
*   1990m1-1993m12 -> class2 codes 5 and 6
*   1994m1 onward  -> class3 codes 6 and 7
drop if inlist(class2, 5, 6) & ym > tm(1989m12) & ym < tm(1994m1)
drop if inlist(class3, 6, 7) & ym > tm(1993m12)

* Age restriction: strictly older than 24 and strictly younger than 65
* (a missing age fails the upper bound and is therefore dropped as well)
keep if age > 24 & age < 65


* Gender groups
* Rename sex to gendergroup and attach value labels (1 = Male, 2 = Female).
* The label is defined on a single line so no #delimit switch is needed; the
* documented #delimit syntax expects an explicit "cr" or ";" argument.
rename sex gendergroup
label define gendergroups 1 "Male" 2 "Female", replace
label values gendergroup gendergroups


* Race groups
* Keep an untouched copy of spaneth, then fill system-missing values of the
* working copy from spaneth2, spaneth3 and spaneth4, in that order.
gen spaneth1 = spaneth
foreach src in spaneth2 spaneth3 spaneth4 {
	replace spaneth = `src' if spaneth == .
}

** Hispanic
* The range of spaneth codes mapped to racegrp 3 depends on the survey year.
replace racegrp = 3 if inrange(spaneth, 1, 7) & inrange(year, 1989, 2002)
replace racegrp = 3 if inrange(spaneth, 1, 5) & inrange(year, 2003, 2013)
replace racegrp = 3 if inrange(spaneth, 1, 8) & year >= 2014

* Discard the filled working copy and restore the original values under the
* original name
drop spaneth
rename spaneth1 spaneth

* Race Groups
* Fold racegrp code 5 into code 4, then use the final variable name
recode racegrp (5 = 4)
rename racegrp racegroup


* Harmonize earnings variable
* Build one weekly-earnings variable, ern_all, from several source variables.
* It starts at 0 and each later source overwrites earlier ones wherever that
* source is non-missing and non-negative ("x >= 0 & x < ." excludes missings,
* which Stata ranks above every number).
gen ern_all = 0

* Hourly-paid workers (hourlyind == 1)
replace ern_all = ernwk_hourly if hourlyind == 1 & ernwk_hourly >= 0 & ernwk_hourly < .

* General weekly-earnings variables, applied in this order
foreach src in ernwk_all ernwk_all2 ernwk_all3 {
	replace ern_all = `src' if `src' >= 0 & `src' < .
}

* Workers not flagged as hourly (hourlyind != 1, which includes missing)
replace ern_all = ernwk_salary if hourlyind != 1 & ernwk_salary >= 0 & ernwk_salary < .

* Usual weekly hours: usualhrs from 1994m1 onward, usualhrsf2 before that
gen hr_all = usualhrs
replace hr_all = usualhrsf2 if ym < tm(1994m1)

// Workers whose hours vary (usualhrs == -4) get a representative value for
// the hours range reported in usualhrsrng. The range coding differs before
// and after 1994m1, so each period has its own code -> hours table.

// 1994m1 onward
//   code : 1     2      3      4   5      6    7          8
//   range: 0-20  21-34  35-39  40  41-49  50+  full-time  part-time
local post_codes 1 2 3 4 5 6 7 8
local post_hours 10 27.5 37 40 45 50 40 20
local n_post : word count `post_codes'
forvalues k = 1/`n_post' {
	local code : word `k' of `post_codes'
	local hrs  : word `k' of `post_hours'
	replace hr_all = `hrs' if usualhrs == -4 & usualhrsrng == `code' & ym >= tm(1994m1)
}

// Before 1994m1
//   code : 6    7     8      9      10     11     12  13     14  15     16
//   range: 1-4  5-14  15-21  22-29  30-34  35-39  40  41-47  48  49-59  60+
local pre_codes 6 7 8 9 10 11 12 13 14 15 16
local pre_hours 2.5 9.5 18 25.5 32 37 40 44 48 54 60
local n_pre : word count `pre_codes'
forvalues k = 1/`n_pre' {
	local code : word `k' of `pre_codes'
	local hrs  : word `k' of `pre_hours'
	replace hr_all = `hrs' if usualhrs == -4 & usualhrsrng == `code' & ym < tm(1994m1)
}


* Set the U == 1 and N == 1 observations aside in a temporary file so they do
* not enter the top-code calculations; they are appended back afterwards.
tempfile u_and_n_ppl
preserve
	keep if inlist(1, U, N)
	save `u_and_n_ppl'
restore

* Only observations with E == 1 remain in memory from here on
keep if E == 1

* ------------------------------------------------------------------------------
* Top-code imputation
* For each year, weekly earnings sitting exactly at that year's top-code value
* are replaced by an estimate of mean earnings above the top-code, assuming
* log earnings are normally distributed.
*
* Top-code values by year as used below:
*   <= 1988    :  999
*   1989-1997  : 1923
*   >= 1998    : 2884
* ------------------------------------------------------------------------------
gen newtop = .   // imputed mean above the top-code, filled in year by year
gen tc = .       // top-code indicator, set for one year at a time in the loop

* Top-code value attached to an observation (not used further in this section).
* The 2884 threshold applies from 1998, matching the loop below.
gen topcode = 999 if year <= 1988
replace topcode = 1923 if ern_all == 1923 & year >= 1989 & year <= 1997
replace topcode = 2884 if ern_all == 2884 & year >= 1998

* Log weekly earnings. ern_all is not modified inside the loop, so this is
* computed once instead of once per year.
tempvar lnwke
gen `lnwke' = ln(ern_all) if ern_all ~= .

forvalues i = 1990/2019 {
	* Flag top-coded observations of year `i'; every other observation of the
	* year gets 0, and all other years stay missing
	replace tc = 1 if ern_all == 999 & `i' <= 1988 & year == `i'
	replace tc = 1 if ern_all == 1923 & `i' >= 1989 & `i' <= 1997 & year == `i'
	replace tc = 1 if ern_all == 2884 & `i' >= 1998 & year == `i'
	replace tc = 0 if tc != 1 & year == `i'

	* Log of the top-code value in force in year `i'
	if `i' <= 1988 {
		local T = ln(999)
	}
	else if `i' <= 1997 {
		local T = ln(1923)
	}
	else {
		local T = ln(2884)
	}

	* Weighted share of year-`i' observations below the top-code
	sum tc [aw=ernwgt] if tc ~= .
	local PHI = 1 - r(mean)

	* Weighted mean and sd of log earnings for year `i', computed on the
	* top-coded data (tc is non-missing only for year `i')
	sum `lnwke' [aw=ernwgt] if tc ~= .
	local X = r(mean)
	local SD = r(sd)

	* Estimates of the mean (lmu) and sd (lsigma) of log earnings
	local alpha = invnorm(`PHI')
	local lamda = -normalden(`alpha')/normal(`alpha')
	local lsigma = (`T'-`X')/(`PHI'*(`alpha'-`lamda'))
	local lmu = `T' - `alpha'*`lsigma'

	* Mean of log earnings above the top-code (left-truncated at T),
	* converted back to dollars per week
	local halpha = (`T'-`lmu')/`lsigma'
	local hlamda = normalden(`halpha')/(1 - normal(`halpha'))
	local mtc = `lmu' + `lsigma'*`hlamda'
	replace newtop = exp(`mtc') if year == `i'

	* Reset the indicator so the next year starts clean
	replace tc = .
}

* Swap top-coded earnings for the imputed mean, using the same year ranges as
* the loop. 2884 counts as a top-code only from 1998 onward; in 1990-1997
* newtop is derived from the 1923 top-code and must not be applied to 2884.
replace ern_all = newtop if ern_all == 999 & year <= 1988
replace ern_all = newtop if ern_all == 1923 & year >= 1989 & year <= 1997
replace ern_all = newtop if ern_all == 2884 & year >= 1998


* Add back in U and N ppl
append using `u_and_n_ppl'

* Merge in the monthly PCE price index (one row per ym in the using file)
merge m:1 ym using "./Data_cleaning/Data/pce_index.dta"
rename _merge pce_merge

drop if ym < tm(1990m1)

* PCE Deflator
* Rebase the index so that its 2019 average equals 100. The base is the mean
* of pce_price_index over the distinct months of 2019, taken directly from
* the data in memory:
*   - one observation per ym is tagged so each month counts exactly once;
*   - using the mean (rather than a sum divided by 12) stays correct if a
*     2019 month is absent or has a missing index value;
*   - no preserve/collapse/merge round trip over the full dataset is needed.
* Months that exist only in the index file still contribute to the base,
* because unmatched rows are dropped only further down.
tempvar month_tag
tempname base2019
egen byte `month_tag' = tag(ym)
summarize pce_price_index if `month_tag' == 1 & year(dofm(ym)) == 2019, meanonly
scalar `base2019' = r(mean)
if missing(`base2019') display as error "warning: no 2019 values of pce_price_index found; real earnings will be missing"
gen pce_price_index_2019 = (pce_price_index / `base2019') * 100

* Weekly earnings in 2019 dollars
gen real_ern_all = ern_all / (pce_price_index_2019 / 100)

* Keep only observations whose month was matched in the PCE file
keep if pce_merge == 3

* Real weekly earnings are set to 0 whenever hours are missing or zero, to
* match the earnings calculations from the main code (where missing hours
* lead to missing ahe).
replace real_ern_all = 0 if hr_all == 0 | hr_all >= .

* Calendar year of the observation month (the variable is named date_ym)
gen date_ym = yofd(dofm(ym))

* Per-person population count: sum of the E, U and N indicators
gen group_pop = E + U + N

* Calculate Table 1 and Table 3
* Collapse to one row per year x race group: weighted mean real weekly
* earnings plus weighted sums of the population and employment counts.
collapse (mean) real_ern_all (sum) group_pop E [pweight = ernwgt], by(date_ym racegroup)
gen avg_annual_ern = real_ern_all * 52

* The weighted sums add up monthly observations over the year; dividing by 12
* turns them into average monthly levels. share is the group's fraction of
* the year's total population.
bys date_ym: egen all_pop = total(group_pop)
replace all_pop = all_pop / 12
replace group_pop = group_pop / 12
gen share = (group_pop / all_pop)

* Counterfactual earnings: every group is assigned the racegroup == 1 average
* for the same year, unless its own average is already higher.
qui bys date_ym: gen double tmpbase_ern = avg_annual_ern if racegroup == 1
qui bys date_ym: egen double counter_earnings = max(tmpbase_ern)
* Stata ranks missing above every number, so without the !missing() guard a
* group with missing avg_annual_ern would overwrite the baseline with missing.
replace counter_earnings = avg_annual_ern if avg_annual_ern > counter_earnings & !missing(avg_annual_ern)

* Group totals: actual and counterfactual earnings times group population
gen gdp_group = avg_annual_ern * group_pop
gen gdp_group_counter = counter_earnings * group_pop

save "./Data_cleaning/Data/simple_counterfactual.dta", replace

