cap log close
*clear
set more off
program drop _all

* ---------------------------------------------------------------------------
* Directories. The two placeholder lines below must be edited to real paths
* before the file is run.
* ---------------------------------------------------------------------------
global projdir <The Replication Packet Directory>
global initial_microdatadir <Where The Initial Micro Data Are Stored. Not Included with Replication Packet>
global graphdatadir "$projdir/Graphdata"
global inputdatadir "$projdir/InputData"

* List of raw microdata variables (not referenced by the programs visible in this file)
global vars_to_keep client_code ooid assoc_obj_id emp_pur_c emp_stat_c ///
					emp_hir_dat gen_c addr_st_c last_prc_pay_d last_pyrl_processed_date ///
					addr_zip5_c pay_freq_c std_hrs_q tot_hrs_q std_rt_type_c ///
					std_pay_rt_a prem_hrs_fctr_c gross_ern_a age ///
					naics_code1 stateb clientpaysize company_size
global refweek = 7 // Week number to calculate wage percentiles in and plot indices against

* Define last available week of data. All observations after this date are dropped in
* the plots. Update the global lastpayroll_filename below to the MMDD of the last
* available file; the two globals that follow then hold the Stata week (%tw value) and
* the week-of-year number associated with that file. This is the only place these
* globals are defined. Note that the ADP files report the day of the BEGINNING of the
* week (i.e. Sunday). E.g. the file 0329 covers paychecks from Mar 29 through April 4.
* Stata weeks start on Wednesday. Thus we add 6 to the Stata day of the last file, to
* make it relevant to the subsequent Saturday (the last day of paychecks processed).
global lastpayroll_filename "0628"
global lastweek = wofd(date(string(2020)+"${lastpayroll_filename}","YMD")+6)
global lastweeknum = week(date(string(2020)+"${lastpayroll_filename}","YMD")+6)

program main
	* Entry point: clean the microdata, collapse it to time-series cells, then
	* export every figure/table dataset three times (unweighted, weighted by
	* 2-digit NAICS, and weighted by firm size x 2-digit NAICS).
	clean_data
	collapse_to_TS

	* No weighting
	make_plots, suffix(_unweighted)

	* Industry weights
	make_plots, weightvars(naics2) ///
				weightfile(SUSB_weights_naics2) ///
				suffix(_weighted_naics2)

	* Industry x firm-size weights
	make_plots, weightvars(naics2 size_SUSB) ///
				weightfile(SUSB_weights_size_naics2) ///
				suffix(_weighted_sizeXnaics2)
end

program clean_data
	* Build the cleaned worker-by-week panel from the raw 2020 payroll extract:
	* clean biweekly ("B") and weekly ("W") payees separately via
	* initial_cleaning, stack them, add industry / firm-size / age groupings,
	* trim outlying base wages, and compute paycheck-over-paycheck wage and
	* hours changes.
	*
	* Reads : $initial_microdatadir/wkly_autopay_2020.dta
	* Writes: $initial_microdatadir/wkly_autopay_2020_cleaned.dta
	*         (the file that collapse_to_TS loads)
	local rawfile   "$initial_microdatadir/wkly_autopay_2020.dta"
	local cleanfile "$initial_microdatadir/wkly_autopay_2020_cleaned.dta"
	
	* The cleaned biweekly subset is only an intermediate product, so park it in
	* a tempfile rather than under the name of the final output file.
	tempfile biweekly_cleaned
	
	use "`rawfile'" if pay_freq_c == "B", clear
	initial_cleaning, biweekly
	save "`biweekly_cleaned'", replace
	
	* Do same cleaning for weekly guys
	di "Starting Weekly Guys"
	use "`rawfile'" if pay_freq_c == "W", clear
	initial_cleaning

	* Append adjusted biweekly guys to a weekly dataset
	di "Appending Biweekly guys"
	append using "`biweekly_cleaned'"
	
	* Generate industry subsector variables
	gen naics2 = substr(naics_code1,1,2)
	gen naics3 = substr(naics_code1,1,3)
	gen naics4 = substr(naics_code1,1,4)
	replace naics2 = "44-45" if inlist(naics2,"44","45")
	replace naics2 = "48-49" if inlist(naics2,"49","48")
	replace naics2 = "31-33" if inlist(naics2,"31","32","33")
	
	* Firm-size bins. NOTE(review): the label set defines value 19 while the
	* recode() cut points start at 0, so value 0 carries no label and the 19
	* label is never used -- confirm which was intended against the weight files.
	gen size_SUSB = recode(clientpaysize,0,49,499,999,2499,999999)
	label define size_SUSB 19 "1-19 Employees" 49 "< 50 Employees" 499 "50-499 Employees" ///
				999 "500-999 Employees" 2499 "1000-2499 Employees" 999999 "2500+ Employees"
	label val size_SUSB size_SUSB
	
	* Generate age groups
	gen age_group = recode(age,15,25,30,35,40,45,50,55,60,65,1000)
	label define age_group 	15 "< 16" 25 "16-25" 30 "26-30" 35 "31-35" 40 "36-40" ///
							45 "41-45" 50 "46-50" 55 "51-55" 60 "56-60" 65 "61-65" 1000 "65+"
	label val age_group age_group
	
	* Coarse age groups. The label values must equal the recode() cut points:
	* the lowest bin is coded 16 (ages up to and including 16), the next is 20.
	gen age_group_coarse = recode(age,16,20,30,40,50,60,1000)
	label define age_group_coarse 	16 "<= 16" 20 "17-20" 30 "21-30" 40 "31-40" ///
									50 "41-50" 60 "51-60" 1000 "60+"
	label val age_group_coarse age_group_coarse
	
	* Generate week number variable
	gen week = week(dofw(week_collapse))
	keep if week >= 5
	
	** Trim base wages: set to missing anything outside the 1st-99th percentile
	** range, as well as zeros
	summ std_wage, d
	replace std_wage = . if !inrange(std_wage,r(p1),r(p99)) | std_wage == 0
	
	* Generate employee ID variable
	egen empid = group(client_code emp_pur_c)
	
	* 0.01% of our sample goes from weekly to biweekly or vice versa. Thus they
	* show up multiple times in a week and would be double counted. Drop them
	* at this stage because they're so small.
	duplicates tag empid week_collapse, gen(tag)
	tab tag
	keep if tag == 0
	
	* Declare the weekly panel
	xtset empid week_collapse, w
	sort empid week_collapse
	
	* Generate paycheck-over-paycheck base wage and hours changes (in percent)
	* for selection-adjustment
	bys emp_pur_c client_code (week_collapse): gen delta_wage = 100*(std_wage - std_wage[_n-1])/std_wage[_n-1]
	bys emp_pur_c client_code (week_collapse): gen delta_hrs = 100*(tot_hrs_q - tot_hrs_q[_n-1])/tot_hrs_q[_n-1]
	
	* Inspect the distribution of non-trivial changes, then set extreme changes
	* to missing. Only the two delta variables created above are looped over.
	foreach var in wage hrs {
		summ delta_`var' if abs(delta_`var') > 0.1, d
		}
	replace delta_wage = . if !inrange(delta_wage,-50,50)
	replace delta_hrs = . if !inrange(delta_hrs,-75,150)
	
	* Save pre collapse
	compress 
	save "`cleanfile'", replace
end

program initial_cleaning
	* Turn the paycheck-level data currently in memory (one pay frequency only)
	* into one row per worker x week_collapse, with summed hours/earnings and a
	* base hourly wage (std_wage).
	*
	* Option:
	*   biweekly - the data in memory are biweekly payees. Each worker is then
	*              assigned to the even or odd week of the two-week cycle,
	*              pay_freq_c is recoded to "BE"/"BO", hours are top-coded at
	*              200 instead of 100, and earnings/hours are halved to a
	*              weekly scale.
	syntax, [biweekly]
	* Some firms have missing client codes but non-missing ooids. Use the ooid as
	* an auxiliary client code if missing client_code
	replace client_code = ooid if mi(client_code)

	** Get date of last paycheck
	rename last_prc_pay_d temp
	gen last_prc_pay_d = date(temp,"YMD")
	drop temp
	gen last_proc_wk = wofd(last_prc_pay_d+3) // Add 3 to payroll processed date to make week Sunday-Saturday rather than Wednesday-Tuesday as Stata does by default

	format last_prc_pay_d %td
	format last_proc_wk %tw
	
	* Generate week number variable (used below for the even/odd assignment)
	gen week = week(last_prc_pay_d + 3)

	* Clean up employee hire date
	rename emp_hir_dat temp
	gen emp_hir_dat = date(temp,"YMD")
	drop temp
	
	* Generate tenure variables (in weeks and in months)
	gen tenure_weeks = last_proc_wk - wofd(emp_hir_dat)
	gen tenure_months = mofd(last_prc_pay_d) - mofd(emp_hir_dat)

	* If biweekly, choose which week to assign worker to. Assign worker to even
	* week of the 2 week period if majority of their paychecks are given on
	* even weeks, and to the odd week of the two week period if majority are
	* given on odd weeks. Then designate worker as biweekly even ("BE") or 
	* biweekly odd ("BO")
	if "`biweekly'" != "" {
		gen byte even = mod(week,2) == 0
		bys client_code emp_pur_c (last_proc_wk): egen numEven = total(even)
		bys client_code emp_pur_c (last_proc_wk): gen numobs = _N
		drop even
		gen share_even = numEven/numobs
		gen byte even = share_even > 0.5
		gen week_collapse = 2*floor(last_proc_wk/2) + 1 if even == 1
		replace week_collapse = 2*floor((last_proc_wk+1)/2) if even == 0
		replace pay_freq_c = "BE" if even == 1
		replace pay_freq_c = "BO" if even == 0
		format week_collapse %tw
		}
	else {
		rename last_proc_wk week_collapse
		}

	* Some workers are paid twice in a week. Sum their paychecks. Harmonize
	* residence state, hourly status, payment frequency, industry, etc. to last
	* observed value in the week_collapse period. Sorting on the non-missing
	* flag first makes the last row a non-missing value whenever one exists.
	foreach var in addr_st_c naics_code1 pay_freq_c std_rt_type_c clientpaysize gen_c {
		di "Harmonizing `var'"
		gen byte temp = !mi(`var')
		bys client_code emp_pur_c week_collapse (temp last_prc_pay_d): replace `var' = `var'[_N]
		drop temp
		}
	di "Summing all paychecks within week_collapse together"
	collapse std_hrs std_pay_rt_a (max) tenure_weeks age (sum) tot_hrs_q gross_ern_a, by(emp_pur_c client_code ///
		week_collapse addr_st_c naics_code1 pay_freq_c std_rt_type_c gen_c clientpaysize) fast
	di "All paychecks collapsed, moving onto top coding hours"
	
	* Remove 0 hours guys, then top code hours: 200 for a biweekly pay period,
	* 100 for a weekly one
	replace tot_hrs_q = . if tot_hrs_q == 0
	if "`biweekly'" != "" {
		replace tot_hrs_q = 200 if tot_hrs_q > 200 & !mi(tot_hrs_q)		
		}
	else {
		replace tot_hrs_q = 100 if tot_hrs_q > 100 & !mi(tot_hrs_q)
		}
	
	* Make missing naics codes equal to the unclassified naics code for easy 
	* merging with weights
	replace naics_code1 = "999999" if mi(naics_code1)
	
	** Trim gross earnings: set to missing anything outside the 1st-99th
	** percentile range, as well as zeros
	summ gross_ern_a, d
	replace gross_ern_a = . if !inrange(gross_ern_a,r(p1),r(p99)) | gross_ern_a == 0
		
	* Generate base wage variable. Hourly workers: the stated rate. Salaried
	* workers: stated rate over 40 (weekly) or 80 (biweekly) hours. By this
	* point biweekly workers carry pay_freq_c "BE"/"BO" (recoded above), so those
	* codes must be accepted here or the /80 rule never applies to them.
	gen std_wage = std_pay_rt_a if std_rt_type_c == "H"
	replace std_wage = std_pay_rt_a/40 if std_rt_type_c == "S" & pay_freq_c == "W" & mi(std_wage)
	replace std_wage = std_pay_rt_a/80 if std_rt_type_c == "S" & inlist(pay_freq_c,"B","BE","BO","S") & mi(std_wage)
	replace std_wage = gross_ern_a/tot_hrs_q if mi(std_wage)
	
	* Put lower bound on base wage equal to $2.13; the federal minimum for tipped employees
	replace std_wage = . if std_wage < 2.13
	
	* Put biweekly workers onto a weekly scale by dividing paychecks (earnings
	* and hours) by two if employee hired >= 2 weeks ago. These are the summed
	* variables produced by the collapse above. Done after std_wage is built, so
	* the earnings/hours ratio used there is unaffected.
	if "`biweekly'" != "" {
		foreach var of varlist gross_ern_a tot_hrs_q {
			replace `var' = `var'/2 if tenure_weeks >= 2
			}
		}
end
	
program collapse_to_TS
	* Collapse the cleaned worker-week panel into cell-level time series, one
	* saved dataset per breakdown (aggregate, pay type, wage quintile, firm size,
	* sex, age, state). Every dataset is at the week x pay frequency x naics2 x
	* lagged firm size level (plus the breakdown variable) so that
	* format_variables_index can reweight it later.
	*
	* Reads : $initial_microdatadir/wkly_autopay_2020_cleaned.dta
	* Writes: $inputdatadir/TS_*_byfreq
	local cleanfile "$initial_microdatadir/wkly_autopay_2020_cleaned.dta"
	
	* Both scratch files must be declared. An undeclared tempfile macro expands
	* to nothing, which would turn "save `x', replace" into "save, replace" and
	* overwrite the file most recently loaded with -use-.
	tempfile aggwagedist sizes
	
	* Collapse to wage percentiles
	use std_wage week using "`cleanfile'" if inlist(week,${refweek},${refweek}-1) & !mi(std_wage), clear
	
	* Generate wage quintile as of Reference Week (or reference week minus 1 to account for some biweekly guys)
	xtile wage_quintile = std_wage, nq(5)
	collapse (min) quintile_cutoff =  std_wage, by(wage_quintile) fast
	gen byte ones = 1
	reshape wide quintile_cutoff, i(ones) j(wage_quintile)
	compress
	save "`aggwagedist'", replace
	
	* Lag firm sizes for weights: each client-week gets the size bin the client
	* had in its previous observed week
	use client_code week_collapse size_SUSB using "`cleanfile'", clear
	collapse (max) size_SUSB, by(client_code week_collapse) fast
	rename size_SUSB size_concurrent
	bys client_code (week_collapse): gen size_SUSB = size_concurrent[_n-1]
	keep client_code week_collapse size_SUSB
	compress
	save "`sizes'", replace
	
	* Bring in data, collapse
	use "`cleanfile'" if week >= 5, clear
	
	gen byte ones = 1
	
	* Harmonize wage quintiles over time: assign every observation to a quintile
	* using the fixed reference-week cutoffs
	cap drop wage_quintile wage_quintile_in_age
	merge m:1 ones using "`aggwagedist'", nogen
	gen wage_quintile = 1 if !mi(std_wage)
	forvalues q = 1/5 {
		replace wage_quintile = `q' if std_wage >= quintile_cutoff`q' & !mi(std_wage)
		}
	
	* Merge in lagged sizes. The cleaned file already carries a (concurrent)
	* size_SUSB, and -merge- keeps the master's values for variables present in
	* both datasets, so the concurrent size is first moved to -size-; only then
	* can the lagged size_SUSB come in from the using file. -size- is the
	* breakdown variable of the TS_size_byfreq collapse below.
	rename size_SUSB size
	merge m:1 client_code week_collapse using "`sizes'", nogen keep(1 3)	  
	
	di "Beginning collapses"
	* Weighted Plot Collapses (xcollapse writes each result to disk and leaves
	* the data in memory for the next call)
	local stats (sum) num_emp = ones (mean) std_wage delta_wage delta_hrs
	
	xcollapse `stats', ///
			  by(week_collapse pay_freq_c naics2 size_SUSB) ///
			  saving("$inputdatadir/TS_agg_byfreq", replace)
	
	xcollapse `stats', ///
			  by(week_collapse pay_freq_c std_rt_type_c naics2 size_SUSB) ///
			  saving("$inputdatadir/TS_paytype_byfreq", replace)
			  
	xcollapse `stats', ///
			  by(week_collapse pay_freq_c wage_quintile naics2 size_SUSB) ///
			  saving("$inputdatadir/TS_quintiles_byfreq", replace)
	
	xcollapse `stats', ///
			  by(week_collapse pay_freq_c size naics2 size_SUSB) ///
			  saving("$inputdatadir/TS_size_byfreq", replace)
			  
	xcollapse `stats', ///
			  by(week_collapse pay_freq_c gen_c naics2 size_SUSB) ///
			  saving("$inputdatadir/TS_sex_byfreq", replace)
			  
	xcollapse `stats', ///
			  by(week_collapse pay_freq_c age_group_coarse naics2 size_SUSB) ///
			  saving("$inputdatadir/TS_age_byfreq", replace)
			  
	xcollapse `stats', ///
			  by(week_collapse pay_freq_c addr_st_c naics2 size_SUSB) ///
			  saving("$inputdatadir/TS_state_byfreq", replace)			  
end

program make_plots
	* Export every figure/table dataset for one weighting scheme.
	* Options (all optional):
	*   weightvars - variables identifying a weighting cell
	*   weightfile - name of the weights dataset in $inputdatadir
	*   suffix     - text appended to each exported file name
	* The three options are copied into globals of the same name, which the
	* plot_* / export_* programs and format_variables_index read.
	syntax, [weightvars(string asis) weightfile(string asis) suffix(string asis)]
	foreach opt in weightvars weightfile suffix {
		global `opt' ``opt''
		}
	
	// Figure 5 and A2
	plot_macro_all
	// Tables A3-A7
	export_naics_all, numdigits(2)
	// Figure 4A
	plot_quintile_all
	// Figure 4B
	plot_sex_all
	// Figure 10
	plot_state_by_ind
	// Figure A3
	plot_age_all
end

program plot_macro_all
	* Figure 5 and A2: aggregate series (employment, average base wage, and the
	* wage and hours indices) for all workers, written to
	* $graphdatadir/all${suffix}.xlsx.
	use "$inputdatadir/TS_agg_byfreq" if week_collapse <= ${lastweek}, clear
	
	* No panel variables: aggregate everything to one series per week
	format_variables_index
		
	sort week_collapse
	
	* Keep and arrange the export columns
	local outvars day num_emp std_wage wage hrs
	keep `outvars'
	order `outvars'
	sort day
	export excel using "$graphdatadir/all${suffix}.xlsx", firstrow(varlabels) replace
end

program export_naics_all
	// Tables A3-A7: series by 2-digit NAICS sector, with industry titles
	// attached, written to $graphdatadir/naics2${suffix}.xlsx.
	// The program declares no -syntax-, so any options passed by the caller are
	// ignored.
	
	* Get NAICS titles
	* NOTE(review): this spreadsheet path is hard-coded to a local drive rather
	* than built from the directory globals -- adjust before running elsewhere.
	import excel using "Q:/Sheldon/Data/External/2-6 digit_2017_Codes.xlsx", clear first
	drop SeqNo D-F
	drop if mi(NAICSUSCode)
	rename NAICSUSCode naicscode
	rename NAICSUSTitle indtitle
	tempfile naicscodes
	compress
	save `naicscodes', replace

	* Format indices
	use "$inputdatadir/TS_agg_byfreq" if !mi(naics2) & week_collapse <= ${lastweek}, clear
		
	format_variables_index, panelvars(naics2)
		
	* Merge titles (temporarily rename the key to match the titles file)
	replace naics2 = strtrim(naics2)
	drop if inlist(naics2,"","99")
	rename naics2 naicscode
	merge m:1 naicscode using `naicscodes', keep(1 3) nogen
	label var naicscode "2-digit NAICS"
	label var indtitle "Industry Title"
	rename naicscode naics2
	
	* Keep and arrange the export columns. The default carriage-return delimiter
	* is in effect here, so these lines must not end in a semicolon.
	keep day naics2 indtitle num_emp std_wage wage hrs
	order day naics2 indtitle num_emp std_wage wage hrs
	sort naics2 day
	export excel using "$graphdatadir/naics2${suffix}.xlsx", firstrow(varlabels) replace
end

program plot_state_by_ind
	* Figure 10: series by state group x 2-digit NAICS, written to
	* $graphdatadir/state_by_ind${suffix}.xlsx.
	use "$inputdatadir/TS_state_byfreq" if !mi(addr_st_c) & week_collapse <= ${lastweek}, clear
	
	* Early-state indicator: 1 for FL/GA/TX, 0 for IL/PA/VA/WA, and missing for
	* every other state (those rows are not dropped here)
	gen byte early_state = inlist(addr_st_c,"FL","GA","TX") if inlist(addr_st_c,"FL","GA","TX","IL","PA","VA","WA")
	
	format_variables_index, panelvars(early_state naics2)
	
	* Keep and arrange the export columns
	local outvars day early_state naics2 num_emp std_wage wage hrs
	keep `outvars'
	order `outvars'
	sort early_state naics2 day
	export excel using "$graphdatadir/state_by_ind${suffix}.xlsx", firstrow(varlabels) replace
end

program plot_quintile_all
	* Figure 4 Panel A: series by wage quintile, written to
	* $graphdatadir/wage_quintile${suffix}.xlsx.
	use "$inputdatadir/TS_quintiles_byfreq" if week_collapse <= ${lastweek}, clear
	
	format_variables_index, panelvars(wage_quintile)
	
	* List the quintiles present (the stored local is not used afterwards)
	levelsof wage_quintile, local(quintiles)
	sort wage_quintile week_collapse
	
	* Keep and arrange the export columns
	local outvars day wage_quintile num_emp std_wage wage hrs
	keep `outvars'
	order `outvars'
	sort wage_quintile day
	export excel using "$graphdatadir/wage_quintile${suffix}.xlsx", firstrow(varlabels) replace
end

program plot_sex_all
	* Figure 4 Panel B: series by gen_c, written to
	* $graphdatadir/sex${suffix}.xlsx.
	use "$inputdatadir/TS_sex_byfreq" if week_collapse <= ${lastweek}, clear
	
	format_variables_index, panelvars(gen_c)
	
	sort week_collapse
	
	* Keep and arrange the export columns
	local outvars day gen_c num_emp std_wage wage hrs
	keep `outvars'
	order `outvars'
	sort gen_c day
	export excel using "$graphdatadir/sex${suffix}.xlsx", firstrow(varlabels) replace
end

program plot_age_all
	* Figure A3: series by coarse age group, written to
	* $graphdatadir/age${suffix}.xlsx.
	use "$inputdatadir/TS_age_byfreq" if !mi(age_group_coarse) & week_collapse <= ${lastweek}, clear
	
	format_variables_index, panelvars(age_group_coarse)
	
	* Keep and arrange the export columns
	local outvars day age_group_coarse num_emp std_wage wage hrs
	keep `outvars'
	order `outvars'
	sort age_group_coarse day
	export excel using "$graphdatadir/age${suffix}.xlsx", firstrow(varlabels) replace
end

** Take an input dataset that has series at the payment frequency x characteristic
** (x naics2 x firm size) level, and turn it into characteristic-level indices by
** aggregating the payment frequencies together and weighting appropriately.
**
** Option:
**   panelvars - characteristic variables that define the output series (optional;
**               omit for a single aggregate series)
** Reads the globals weightvars / weightfile (set by make_plots) and refweek.
** When weightvars is empty no external weights are applied.
** Leaves in memory: `panelvars', week_collapse, day, and the index variables
** num_emp, std_wage, wage, hrs. Also sets the globals minday and maxday.
program format_variables_index
	syntax, [Panelvars(varlist)]
	
	* Combine biweekly even and biweekly odd into one category
	replace pay_freq_c = "B" if inlist(pay_freq_c,"BE","BO")
	collapse (sum) num_emp (mean) std_wage delta_wage delta_hrs, by(`panelvars' ${weightvars} pay_freq_c week_collapse)
	
	* Generate 2-week moving averages for biweekly guys to include both even and odd
	egen grp = group(`panelvars' ${weightvars} pay_freq_c), missing
	xtset grp week_collapse, w
	foreach var in num_emp {
		rename `var' `var'_raw
		gen `var' = `var'_raw
		replace `var' = (`var'_raw + l.`var'_raw)/2 if pay_freq_c == "B"
		}
	foreach var in std_wage {
		* Employment-weighted average of this week's and last week's wage
		rename `var' `var'_raw
		gen `var' = `var'_raw
		replace `var' = (num_emp_raw*`var'_raw + l.`var'_raw*l.num_emp_raw)/(num_emp_raw + l.num_emp_raw) if pay_freq_c == "B"
		}
	
	* Generate indices relative to the reference week. Sorting on the refweek
	* flag and then week puts the reference-week row last within each group
	* whenever the group has one.
	gen week = week(dofw(week_collapse))
	gen byte refweek = inlist(week,${refweek})
	
	foreach var in num_emp std_wage {
		bys grp (refweek week): gen `var'_index = `var'/`var'[_N]
		}
	bys grp (refweek week): gen num_emp_refweek = num_emp[_N]
	
	* Produce selection-adjusted indices from wage and hours changes: equal to 1
	* in the reference week, chained forward with the percent changes after it
	* and chained backward (in reverse week order) before it
	gen neg_wk = -week_collapse
	foreach var in wage hrs {
		gen `var'_index = 1
		bys grp (week_collapse): replace `var'_index = `var'_index[_n-1]*(1+delta_`var'/100) if week > ${refweek}
		bys grp (neg_wk): replace `var'_index = `var'_index[_n-1]/(1+delta_`var'[_n-1]/100) if week < ${refweek}
		}
	
	* Aggregate over payment frequencies, weighting by reference-week employment
	collapse *index [aw=num_emp_refweek], by(`panelvars' ${weightvars} week_collapse)
	
	* Apply external weights, only when a weighting scheme was requested. With
	* no weightvars there is nothing to merge on and no share_emp variable, and
	* the collapse above is already at the `panelvars' x week level.
	* keep(match) drops weight cells that have no data, which would otherwise
	* add a row with missing week_collapse to the output; unmatched data rows
	* have no weight and could not contribute to the weighted collapse anyway.
	if "${weightvars}" != "" {
		merge m:1 ${weightvars} using "$inputdatadir/${weightfile}", keep(match) nogen
		collapse *index [aw=share_emp], by(`panelvars' week_collapse)
		}
	
	* Strip the "_index" ending from the variable names
	foreach var of varlist *index {
		local newname = substr("`var'",1,strpos("`var'","index")-2)
		rename `var' `newname'
		}
	
	* Put Week on daily scale
	gen day = dofw(week_collapse)+2
	format day %td
	format week_collapse %tw

	* Record the first and last day in the data
	summ day
	global minday = r(min)
	global maxday = r(max)
	
	* Label variables
	label var day "Week End Date"
	label var num_emp "Number of Employees"
	label var std_wage "Average Base Wage"
	label var wage "Selection-Adjusted Wage"
	label var hrs "Selection-Adjusted Hours"
end

* Run the full pipeline defined above
main

