********************************************************************************
******* Program Create Dataset for CBO feedback regression *******
/*
•	For the dependent variable, we use primary variables (outlays and surplus)
•	For the independent variable, including surplus_exp_cur and surplus_exp, we are using total surplus (based on "total deficit"/ "def" rows in CBO data).
*/
********************************************************************************

clear all

cd "$project_code"

**** Load the complete dataset ****
* Upstream cleaning step, currently not run from here:
// do "code/clean_data.do" 

use "temp/complete_dataset.dta", clear

/*
**** Remove "total" rows so that summing across categories does not double count ****
A row whose description contains "total" is dropped only when its
report / change_type / budget_line cell holds more than one row (presumably the
components of that total). When the total is the only row in the cell it is
kept, since it is then the only figure available.
*/
* Number of rows sharing the same report, change type and budget line
bysort report_year report_month change_type budget_line: gen cell_rows = _N

drop if cell_rows>1 & strpos(descript_var, "total")>0
drop cell_rows

**** Sum the t-prefixed variables within each report / change type / budget line ****
collapse (sum) t*, by (report_month report_year change_type budget_line)

* Discard rows lacking a report month or a budget line
drop if report_month==. | budget_line==""

save "temp/stata_import_file_agg.dta", replace


**** Merge potential GDP data   ****
* pgdp.dta is keyed by report_year (m:1 merge). Unmatched rows from either side
* are kept, so years that appear only in pgdp.dta stay in memory with a missing
* report_month.
merge m:1 report_year using "dta/pgdp.dta"
drop _merge
// drop if report_month==.


**** Generate variable to indicate first or second half (i.e. using equivalent to Feb/August indicator)   ****
* Stata treats a missing value as larger than any number, so a bare
* report_month>6 would also label rows with a missing report_month as second
* half. The explicit guard leaves report_half missing on those rows.
gen report_half = 1 if report_month<=6 
replace report_half = 2 if report_month>6 & !missing(report_month)


********************************************************************************
******* Generate variables for regression (attempting to copy excel formulas here) *******

*** Generate weights -- copied from the excel workbook ***
* Constants are stored as variables (same value on every row).
* disc is not referenced anywhere else in this file.
gen disc =	1.0	
gen a0 = 0.516129032
gen factor_w =	0.5	
gen augshift = 	0.5

* Horizon 0: a0 for first-half reports, scaled by (1-augshift) for second-half reports
gen w_t0 = a0 if report_half==1
replace w_t0 = a0*(1-augshift) if report_half==2

* Horizons 1-3
*   first half : previous horizon's weight times factor_w
*   second half: factor_w times the first-half weight of the previous horizon
*                plus (1-factor_w) times the first-half weight of this horizon
forvalues h = 1/3 {
	local prev = `h' - 1
	gen w_t`h' = w_t`prev'*factor_w if report_half==1

	* Copy the first-half weights onto every row so second-half rows can use them
	egen wprev_sub = max(w_t`prev') if report_half==1
	egen wprev_all = max(wprev_sub)
	* At this point the current horizon's weight is filled on first-half rows only
	egen wcur_all = max(w_t`h')

	replace w_t`h' = wprev_all*factor_w + wcur_all*(1-factor_w) if report_half==2
	drop wprev_sub wprev_all wcur_all
}

* Horizon 4 takes the remainder so that horizons 0-4 sum to one; horizon 5 gets zero
gen w_t4 = 1 - (w_t0 + w_t1 + w_t2 + w_t3)
gen w_t5 = 0

*************** Deal with more than two reports per year ***************

**** Count the distinct report months within each report_year ****
* Flag the first row of every (year, month) pair, then add the flags up by year
bysort report_year report_month: gen report_count_temp = _n == 1
by report_year: egen report_count = sum(report_count_temp)
drop report_count_temp


/*
**** Assign each report to a half-year ****
Attempting to follow what Alan does in excel:
  months 12, 1, 2  -> first half
  months 3 to 11   -> second half
Report-specific exceptions follow below. The December 1995 report is used as a
first-half 1996 report, as Alan does in the spreadsheet.
*/
gen report_half_final = cond(inlist(report_month, 12, 1, 2), 1, cond(report_month>=3 & report_month<12, 2, .))

* April 2018 and May 2022 need to be first half
replace report_half_final = 1 if (report_month==4 & report_year==2018) | (report_month==5 & report_year==2022)

* Pushing forward year for Dec 1995 report
* (pgdp was merged earlier, so the row keeps the pgdp value matched to 1995)
replace report_year = report_year + 1 if report_month==12 & report_year==1995

* Sign / scale adjustments for specific reports, as done in the excel spreadsheet.
* The first rule keys on month 12 of report_year 1996, i.e. after the shift above.
local dec96 report_month==12 & report_year==1996
local jan98 report_month==1 & report_year==1998
local aug98 report_month==8 & report_year==1998
local aug95 report_month==8 & report_year==1995

forvalues k = 0/12 {
	replace t`k' = t`k'*-0.5 if `dec96' & budget_line=="rev"
	replace t`k' = t`k'*0.5 if `jan98' & budget_line=="rev"
	replace t`k' = t`k'*-0.5 if `jan98' & budget_line=="out"
	replace t`k' = t`k'*-1 if `aug98' & budget_line=="out"
	replace t`k' = t`k'*-0.5 if `aug95' & change_type=="leg" & budget_line=="rev"
	replace t`k' = t`k'*0.5 if `aug95' & change_type=="leg" & budget_line=="out"
}

*** IDIOSYNCRATIC CHANGES TO MAKE MATCH EXCEL *** 

* When a report_year contains both a January and a February report, the
* February report is counted as second half.

** Flag the years that contain a January report and those that contain a February report
sort report_year

by report_year: egen has_jan = max(report_month==1)
by report_year: egen has_feb = max(report_month==2)

replace report_half_final=2 if report_month==2 & has_jan==1 & has_feb==1

drop has_jan has_feb

save "temp/complete_dataset_temp.dta", replace


/*
* Sum the non-deficit budget lines within each half-year, separately by
  change type (tech, leg, econ) and budget line
*/
drop if budget_line=="def"

* t-prefixed variables are summed across the reports of a half-year; the
* variables from pgdp through w_t5 are averaged
collapse (sum) t* (mean) pgdp-w_t5, by (report_year report_half_final change_type budget_line)

** Correction the excel makes: flip the sign of every 1997 revenue row ** 
forvalues k = 0/12 {
	replace t`k' = -t`k' if report_year==1997 & budget_line=="rev"
}

save "temp/exclude_def_halfyr.dta", replace


use "temp/complete_dataset_temp.dta", clear

/*
* Deficit rows are not summed across reports like the other categories;
  within each half-year the values of the last report are kept instead.
*/
keep if budget_line=="def"

* collapse (last) takes the last row of each group in the current data order.
* The earlier sort / by / egen steps leave the order of rows within a group
* arbitrary (sort breaks ties randomly), so which report ended up "last" could
* change from run to run. Order the rows explicitly: the last row of each group
* is then the one with the highest report_month.
* NOTE(review): this assumes "last" should mean the latest report month within
* the half -- confirm against the excel, in particular for month-12 reports
* that are assigned to the first half.
sort report_year report_half_final change_type budget_line report_month

collapse (last) t* pgdp-w_t5, by (report_year report_half_final change_type budget_line)

** Starting in 2012, making the correction the excel makes to multiply deficits by (-1) ** 

forvalues i = 0(1)12 {
	replace t`i' = t`i'*-1 if report_year>=2012
}
save "temp/deficits_halfyr.dta", replace


*** Stack the summed non-deficit rows underneath the deficit rows in memory *** 
append using "temp/exclude_def_halfyr.dta"

* From here on report_half holds the final half-year classification
drop report_half
rename report_half_final report_half
order report_year report_half

sort report_year report_half

* Strip leading/trailing blanks and squeeze repeated internal blanks in change_type.
* NOTE(review): this runs after the collapses that group by change_type, so any
* spacing variants were still separate groups at that point -- confirm none exist.
replace change_type = trim(itrim(change_type))

save "temp/regression_data.dta", replace

/*
** Want to add observations of 0's in cases where there aren't any legislative changes **

keep if change_type=="leg" 
keep if budget_line=="rev" | budget_line=="out"

keep if report_year>=1985
* Count how many times we have rev and out in each report year (should have two in each)
by report_year report_half, sort: gen report_count_temp = _n 
by report_year report_half: egen report_count = max(report_count_temp)

* Checking which reports don't have values for rev and out * 
drop if report_count==2
*/


/*
**** Prepare the rows used to generate the regression variables ****
The revenue value used in the regression is sum_{0 to 5}((w_t*rev_t/pgdp_year)).
The variables are built on these rows and collapsed to one row per half-year
further below.
*/
keep if inlist(change_type, "leg", "baseline")
keep if inlist(budget_line, "rev", "out", "def")

* budget_vars.dta is keyed by report_year, outgap_q.dta by report_year and
* report_half; unmatched rows from either side are retained
merge m:1 report_year using "dta/budget_vars.dta", nogenerate
merge m:1 report_year report_half using "dta/outgap_q.dta", nogenerate
keep if report_year>=1983

* Side file holding only the baseline deficit rows; data in memory is left untouched
preserve 
keep if change_type == "baseline" & budget_line == "def"
save "temp/temp_complete_dataset.dta", replace 
restore 


// save "temp/temp_test.dta", replace
//
//
// use "temp/temp_test.dta", clear
//
// keep if report_year >= 2020
//still fine 




//TC: this code chunk is unfortunate
* For every rev / out / def row of report_year y (1984-2024, halves 1 and 2) build
*   <b>      = sum over i=0..4 of  w_t_i * t_i / pgdp(y+i)
*   <b>_ewtd = sum over i=0..4 of  0.2   * t_i / pgdp(y+i)
* where pgdp(y+i) is the largest pgdp found on rows with report_year y+i.
* The def series enters with the opposite sign. Rows outside the year range
* keep the starting value of 0.
//HARDCODED YEAR! report_year
local yr_first 1984
local yr_last  2024

foreach b in rev out def {

	* def terms are subtracted, rev and out terms are added
	local op "+"
	if "`b'" == "def" local op "-"

	gen `b' = 0
	gen `b'_ewtd = 0

	forvalues y = `yr_first'/`yr_last' {

		* pgdp of report_year y+i, copied onto every row
		forvalues i = 0/4 {
			gen pgdp_src = pgdp if report_year == `y' + `i'
			egen pgdp_`i' = max(pgdp_src)
			drop pgdp_src
		}

		forvalues i = 0/4 {
			* Horizon-i value on the rows being updated (stored at default precision,
			* as a plain generated variable)
			gen src_val = t`i' if report_year==`y' & inlist(report_half, 1, 2) & budget_line=="`b'"

			replace `b' = `b' `op' (w_t`i'* src_val)/pgdp_`i' if report_year==`y' & inlist(report_half, 1, 2) & budget_line=="`b'"
			replace `b'_ewtd = `b'_ewtd `op' (.2 * src_val)/pgdp_`i' if report_year==`y' & inlist(report_half, 1, 2) & budget_line=="`b'"

			drop src_val
		}

		drop pgdp_*
	}
}

* Each series only applies to rows of its own budget line; blank it on the other two
foreach b in rev out def {
	replace `b' = . if inlist(budget_line, "rev", "out", "def") & budget_line != "`b'"
	replace `b'_ewtd = . if inlist(budget_line, "rev", "out", "def") & budget_line != "`b'"
}

* Spread each series to every row of its half-year.
* NOTE(review): the max runs over all rows of the half-year, leg and baseline
* alike; if both change types carry rows for the same budget line the larger
* value wins -- confirm this is intended.
bysort report_year report_half: egen revenue = max(rev)
by report_year report_half: egen outlays = max(out)
by report_year report_half: egen surplus_exp_cur = max(def)

by report_year report_half: egen revenue_ewtd = max(rev_ewtd)
by report_year report_half: egen outlays_ewtd = max(out_ewtd)
by report_year report_half: egen surplus_exp_cur_ewtd = max(def_ewtd)

drop rev out def rev_ewtd out_ewtd def_ewtd

/*
** Create lagged surplus_exp: consistent w/ the excel spreadsheet ** 
surplus_exp is the surplus_exp_cur of the preceding half-year:
  first half of year y  <- second half of year y-1
  second half of year y <- first half of year y
Note: this would probably be easier in wide format... keeping the long format
to stay consistent with the excel. If doing a lot of calculations like this,
converting is something to consider.
*/
sort report_year report_half

gen surplus_exp = .
gen surplus_exp_ewtd = .

forvalues y = 1984/2024 {
	forvalues r = 1/2 {

		* Half-year the lagged value is read from
		if `r' == 1 {
			local src_year = `y' - 1
			local src_half 2
		}
		else {
			local src_year `y'
			local src_half 1
		}

		* Pick the source half-year's values and copy them onto every row
		gen src_cur = surplus_exp_cur if report_year==`src_year' & report_half==`src_half'
		gen src_cur_ewtd = surplus_exp_cur_ewtd if report_year==`src_year' & report_half==`src_half'
		egen src_all = max(src_cur)
		egen src_all_ewtd = max(src_cur_ewtd)

		* Write them to the target half-year
		replace surplus_exp = src_all if report_year==`y' & report_half==`r'
		replace surplus_exp_ewtd = src_all_ewtd if report_year==`y' & report_half==`r'

		drop src_cur src_cur_ewtd src_all src_all_ewtd
	}
}


**** Can now collapse 

*** Create other variables needed for regression ***
gen surp_pgdp = surplus_act/pgdp 
gen debt_pgdp = debt_act/pgdp 
gen outgap_pgdp = outgap*-0.01  // TC: should not be negative 
// gen outgap_pgdp = outgap*0.01 // TC: doesn't actually divide by PGDP. Solely for naming/iterating purposes!

gen surplus = revenue - outlays
gen surplus_ewtd = revenue_ewtd - outlays_ewtd


* Lagged versions: value of the same half in the previous report_year.
* NOTE(review): this is a one-year lag, whereas surplus_exp is lagged by one
* half-year -- confirm the difference is intended.
foreach v in surp debt outgap {
	gen lag_`v'_pgdp = .
}

foreach v in surp debt outgap {
	forvalues y = 1984/2024 {
		local prev_year = `y' - 1
		forvalues r = 1/2 {
			* Previous year's value for this half, copied onto every row
			gen prev_val = `v'_pgdp if report_year==`prev_year' & report_half==`r'
			egen prev_all = max(prev_val)
			replace lag_`v'_pgdp = prev_all if report_year==`y' & report_half==`r'
			drop prev_val prev_all
		}
	}
}
*by report_year: egen temp_feb = max(feb_obs)

* One row per half-year: average every variable from outgap through lag_outgap_pgdp
collapse (mean) outgap-lag_outgap_pgdp, by (report_year report_half )

save "dta/leg_regression_final_full.dta", replace


* Regression sample: report years 1984-2024 only
drop if !inrange(report_year, 1984, 2024)
* starting regression w/ second half 1984
drop if report_year==1984 & report_half==1
* the second half of 2022 is excluded as well
drop if report_year==2022 & report_half==2


save "dta/leg_regression_final.dta", replace
cd "$project_code/do/"

