/* This file builds on tables_soi_repl and estimates the number and share
of families between 0-50%, 50-100%, 100-150%, and 150%+ of OPM and SPM
poverty receiving the EITC each year by state.
(Note: the binning code below further splits 150%+ into 150-200% and 200%+.)

Data used: SOI files (NBER).

*/

* Project folders. Only the results folder is referenced in the code shown
* in this file (by the first Excel export).
global data ./EITChistbins/stata/dta
global results ./EITChistbins/stata/out

***
* Start from a clean session: close any open (unnamed) log, clear memory,
* and switch off -more- pauses for this and future sessions.
capture log close
clear all
set more off, permanently

* Fix the random-number seed so the runiform() draws used further below
* start from the same state on every run.
set seed 990
log using "./log/EITChistbins.log", replace
	 
	 
							******************************************************************
*************************** 	PART 1: CREATE ALL NECESSARY SOI DATA 	***************************
							******************************************************************
*Bring in SOI data from NBER server
global soidta /homes/data/soi/dta/

* The main log (./log/EITChistbins.log) is still open at this point, and a
* second unnamed -log using- aborts with r(604) "log file already open".
* Open this log under its own name so both logs run side by side. Any copy
* left open by an earlier run in the same session is closed first so the
* do-file can be re-run. (Close it explicitly with: log close soi_eitc)
cap log close soi_eitc
log using ./log/SOI_EITC.log, replace name(soi_eitc)

*Added 2/12/2018: Bring in SNAP participation rates
* The sheet is read from cell A3 onward with the first row as variable
* names; the code below relies on it providing -year- and -overall-.
import excel using "./raw/snap_participation_rates.xlsx", clear ///
	firstrow cellrange(A3)
	rename _all, lower
*Create years before 1994: copy the 1994 row back to 1990-1993
* (n = 1 stays 1994; n = 2,...,5 become 1993,...,1990)
expand 5 if year == 1994
	bys year: g n = _n
	replace year = 1994 - n + 1 if n > 1 & year == 1994
	tab year, mi
	* n is only a helper for the back-fill; drop it so it is not carried
	* into the SOI data by the later merge on year
	drop n
replace overall = overall/100 //convert from percent to a 0-1 proportion
tempfile snap
	save `snap', replace

*Bring in data, one SOI file per year
* A single pair of locals drives both the load loop and the append loop.
* Previously the load loop saved tempfiles for 1990-2011 while the append
* step started from year1985 and looped over 1986-2011; those tempfiles
* were never created, so the -use- failed. 1990 is the first usable year:
* the SNAP participation series built above only reaches back to 1990 and
* the merge below asserts that every year matches.
local firstyr 1990
local lastyr 2011

forvalues y = `firstyr'/`lastyr' {
	use ${soidta}/x`y'.dta, clear
	*Make sure vars consistent across years
	rename _all, lower
	di "This is year `y'"
	//drop high-income and "other" states
	* drop if state == 0 | state == 52 //don't have state after 2008
	//comment this out so sample consistent across years

	tempfile year`y'
	save `year`y'', replace
}


*Append everything together
use `year`firstyr'', clear
local nextyr = `firstyr' + 1
forvalues y = `nextyr'/`lastyr' {
	append using `year`y''
}

*Make sure everything's here
*tab state, mi //state not available after 2008
tab year, mi
*Merge to takeup
* assert(match) stops the run if any SOI year lacks a SNAP rate or vice versa
merge m:1 year using `snap', assert(match) nogen //make sure everything is here

*** ADD LABELS
* Identification and timing
label variable year "Year"
label variable state "State"
label variable flpdyr "Tax Year Filed"
label variable dweght "Decimal Weight"

* Filing unit characteristics
label variable mars "Filing Status Status"
label variable xtot "Total Exemptions"
label variable xocah "N. of Children"
label variable xocawh "N. of Children Away from Home"
label variable agi "AGI"

* EIC components
label variable eicref "Refundable EIC"
label variable eicoff "EIC Offsetting Taxes 1"
label variable eicrd "EIC Offsetting Taxes 2"

* Value labels for filing status
label define filing ///
	1 "Single" ///
	2 "Married" ///
	3 "Married Filing Separately" ///
	4 "Head of Household" ///
	5 "Widower"
label values mars filing

*** GENERATE MAIN VARIABLES
* dweght is a sampling weight stored with an implied decimal place, so it is
* divided by 100 (as suggested in the tax data file documentation).
capture drop weight
generate weight = dweght/100
table year, c(mean weight)

* One record = one filing unit
generate filers_c = 1

* Total EITC = refundable part (eicref) + the parts that offset tax
* liability (eicrd and eicoff); eic_c flags records with a positive total.
generate eic_d = eicrd + eicoff + eicref
generate eic_c = (eic_d > 0)

* KR added this: CTC = non-refundable part (chtcr = E07220) plus the
* refundable part (addcrd = E11070).
* NOTE(review): ctc is built before addcrd is cleaned of missing/negative
* values further below - confirm that is intended.
generate ctc = chtcr + addcrd

* Number of EIC children
* - blank the count for every year before 1994 and for non-claimants
*   (eic_d == 0); 1991-1993 are the years where it can be zero or two kids
* - before 1991 the credit was the same for 1 child or more, so claimants
*   are recorded as having one EIC child
replace eic = . if year < 1994 | eic_d == 0
replace eic = 1 if year < 1991 & eic_c == 1
tabulate year eic, missing
label variable eic "N. of EIC Children"
rename eic nchild	// From now on nchild will refer to EIC children

* flpdyr (the year for which taxes are filed) is inconsistently defined:
* some records carry a two-digit year. Convert 68-97 to 1968-1997.
tabulate flpdyr, missing
replace flpdyr = 1900 + flpdyr if inrange(flpdyr, 68, 97) & flpdyr == int(flpdyr)
tabulate flpdyr, missing

* Late filers: the filing year differs from the data year
generate late_fil = (year != flpdyr)

// Variable to use for merging thresholds
* kids = the larger of dependent children at home (xocah) and EIC children
* (nchild); it is taken before xocah is top-coded below.
gen kids= xocah 
di "These are tabs before taking max(depend, eitc)"
tab xocah nchild, mi
bys year: tab xocah nchild

*Take max of xocah, eitc kids
	replace kids = nchild if nchild > xocah & nchild != .
*See how many people this affects
di "These are tabs after taking max(depend, eitc)"
tab kids xocah
bys year: tab kids xocah

// Fix number of dependent children variable so that the maximum is 3
*KR changed this from tables_soi_repl in order to account for ARRA
* (cap of 3 from 2009 on, 2 before 2009, 1 before 1991)
replace xocah=3 if xocah>3 & year >= 2009
replace xocah=2 if xocah>2 & year < 2009
replace xocah=1 if xocah>0 & year<1991

*Before dropping ppl without kids, estimate share of EITC going to ppl with kids
*Use published tables from years 1996 on, but keep this in bc still need 1994-5 
*(No childless credit before 1994)
* Both totals are weighted by the sampling weight so the share is a
* population share, consistent with the weighted collapse used for the bin
* tables later on. Previously these were raw, unweighted sample sums.
egen totaleitckid = total(weight*eic_d*(nchild > 0 & nchild != .)), by(year)
egen totaleitc = total(weight*eic_d), by(year)
	g pcteickid = totaleitckid/totaleitc
	*Drop intermediate variables to reduce confusion
	drop totaleitc*

//Just limit to hh w/ kids
keep if xocah > 0

// Add the labels
label variable filers_c "Number of Filers"
label variable weight "Sampling Weight"
label variable eic_c "EIC Counts"
label variable eic_d "EIC Dollars"
label variable ctc "addcrd and CTC"

* Keep a copy of the cleaned, with-children file in a tempfile.
* (It is not read back in the code shown in this file.)
tempfile cleansoi
save `cleansoi', replace

***************************************
*** STATE, YEAR, MARRIED AND EIC CHILDREN CELLS
***************************************
* Cells are defined by marital status, state, year and the number of
* children claimed on the EIC form.

* Flag filers in APO/FPO, Puerto Rico, Virgin Islands, Guam or U.S. citizens
* living abroad (state codes 52 and up). They are only summarized, not
* dropped, because state is not available after 2008.
generate foreign = (state >= 52)
summarize foreign if inrange(year, 1996, 2008) [aw=weight]
*drop if state>=52  //don't have state info after 2008

* Remove late filers
summarize late_fil if year >= 1996 [aw=weight]
keep if late_fil != 1

* Remove married-filing-separately returns (12,484 observations); as seen
* previously, they are not eligible to file for the EITC.
keep if mars != 3

* Marital status indicator
generate married = (mars == 2)

* KR added this: family size = filer + spouse (if married) + kids
generate famsize = 1 + kids + (married == 1)

* Age key for the poverty-threshold merge:
*   -1 for families of 3 or more,
*    0 for smaller families,
*    1 for smaller families where agex is 1 or more (and not missing).
* NOTE(review): the original note reads "people below 62.5"; agex presumably
* is the age-exemption field - confirm the cutoff against the threshold file.
generate under652person = cond(famsize >= 3, -1, cond(agex >= 1 & agex != ., 1, 0))
	
*Merge to poverty line
* calyear is the year key used by the threshold/guideline files merged below
g calyear = year //to match variable name in using

*Merge to SNAP rules
*Indicators for AK, HI
* SOI state codes run alphabetically by state name (1 = Alabama, 2 = Alaska,
* ..., 12 = Hawaii), so Alaska is code 2. The previous test (state == 1)
* flagged Alabama filers and matched them to the Alaska rows of the SNAP
* rules and poverty guidelines. State is missing after 2008, so both flags
* are 0 in those years.
g ak = (state == 2)
g hi = (state == 12)
tab year ak, mi 
tab year hi, mi
merge m:1 year famsize ak hi using "./dta/snap/snaprules.dta", keep(master match) 
	*Make sure everything matched
	assert _merge >= 3 if year >= 1990
	keep if _merge >= 3
	drop _merge

*Merge to SPM
merge m:1 calyear using "./raw/spmthresh14.dta", ///
	keep(master match) //discard years not in SOI
	assert _merge >=3 //make sure all SOI obs are matched
	drop _merge 
*Create thresholds using the SPM three-parameter equivalence scale (Fox, 2017):
/*One and two adults: scale = (adults)^0.5
Single parents: scale = (adults + 0.8 * first child + 0.5 * other children)^0.7
All other families: scale = (adults + 0.5 * children)^0.7

The exponents were lost when the formula was first pasted in, and the code
applied the scale linearly. The 0.7 power is restored below, both when the
reference threshold is normalized and when it is scaled to each family.
*/
*Thresholds we have are for 2-parent, 2kid family, fix so at individual level:
*divide by the reference family's own scale, (2 + 0.5*2)^0.7
replace spmrent = spmrent/((2 + (0.5*2))^0.7)
*Every remaining family has at least one child, so the childless
*"(adults)^0.5" case never applies here
assert kids >= 1 //make sure this is reasonable
g spmthreshold = ((1 + 0.8 + 0.5*(kids-1))^0.7)*spmrent if married == 0 //single parent families
	replace spmthreshold = (((famsize - kids) + 0.5*kids)^0.7)*spmrent if married == 1 // other families

* Attach the official poverty threshold for each family type.
* (The resource measure itself - earned income minus taxes paid, consistent
* with the other forms of income assistance - is built further below.)
merge m:1 calyear famsize kids under652person ///
	using "./raw/pov_thresholds_1980_2014.dta", ///
	keep(master match) //discard years not in SOI

* calyear was only needed as a merge key; drop it so there are no duplicates
drop calyear

* Every family with fewer than 10 members must have found a threshold
assert _merge >=3 if famsize < 10
tabulate famsize _merge

* Unmatched records are a very small share (0.04%) of large families with
* no poverty threshold; remove them
drop if _merge < 3
drop _merge

* The matched threshold becomes fpl; check it by family size, overall and
* within each year (there should be no within-year variation)
rename threshold fpl
tabulate famsize, summarize(fpl)
bysort year: tabulate famsize, summarize(fpl)

*From BHK17: the closest definition of earned income for EIC purposes is
*wagescalc+schedc+farm-setax/2.
*Change 11/15/2017: half of self-employment taxes is NOT taken out, in order
*to mirror the CPS approach.
generate earn_inc = wages + schedc + farm
generate negzero_earn_inc = (earn_inc <= 0)


*Estimate SNAP benefits (take-up is applied at the end of this section)
*Step 1: HHS poverty guidelines, by year and AK/HI
capture generate calyear = year
merge m:1 calyear ak hi using "./raw/pov_guidelines_1977_2015.dta", keep(master match)
*Every record must find a guideline
assert _merge >= 3
drop _merge
*Family-specific guideline: first person plus an increment per extra member
generate fplguideline = fpl1 + (famsize - 1)*(fpladdl)

/*SNAP formula:
  SNAP = 0                                 if gross income > 130% FPL
                                           or net income > 100% FPL
       = max(MaxBenefit - 30%*NetIncome,0) otherwise

  Gross income = earned income + AFDC
  Net income   = gross income - standard deduction - 20% of earned income
                 - excess shelter deduction - med and childcare deduction
*/
*Step 2: monthly gross and net income (AFDC assumed to be 0; SNAP is
*calculated on a monthly basis)
generate snapgross = earn_inc/12
*How to deal w/ childcare deduction? No limit after 2009
generate snapnet = snapgross - disregard - (0.2*(earn_inc/12)) - shelter
replace snapnet = 0 if snapnet < 0

*Step 3: benefit, then the gross (130%) and net (100%) income limits,
*with the poverty guideline put in monthly terms
generate snapben = max(snapmax - (snapnet*0.3),0)
replace snapben = 0 if snapgross > 1.3*(fplguideline/12) | snapnet > (fplguideline/12)

*Step 4 (added 2/12/2018): imperfect take-up. Each potential recipient gets
*a uniform draw; those drawing above the year's participation rate
*(overall) are treated as not taking up the benefit.
*NOTE(review): the draws depend on the current row order as well as the
*seed - confirm the sort order is reproducible across runs.
generate random = runiform() if snapben > 0 & snapben != .
replace snapben = 0 if random > overall

*Step 5: put in annual terms
replace snapben = snapben*12

*Pre-tax concept: earnings relative to the OPM threshold
generate inc_opm = earn_inc/fpl
label variable inc_opm "Earnings:OPM ratio"

*After-tax concept
*Change 1/9/2018: the (estimated) SNAP benefit is added in.
*The additional child tax credit was created in 1998, so missing values
*(and any negative ones) are set to zero first.
replace addcrd = 0 if addcrd == . | addcrd < 0
generate at_earn_inc = earn_inc + snapben - (taxaft + (fica/2)) + (eicref + addcrd)
*Make sure we have this for all years
tabulate year, summarize(at_earn_inc)

*Ratio to the SPM threshold
*NOTE(review): spmthresh relies on variable-name abbreviation (presumably
*resolving to spmthreshold) - confirm no other variable shares the prefix.
generate inc_atti_spm = at_earn_inc/spmthresh
label variable inc_atti_spm "After tax pov using SPM thresh"

*Ratio to the OPM threshold
generate inc_atti_opm = at_earn_inc/fpl
label variable inc_atti_opm "After tax pov using OPM thresh"
	
*Create bins, looping over each income/poverty definition
* For each definition: assign every record to one of five income-to-poverty
* bins, then export each bin's share of the year's weighted EITC / CTC
* dollars to one sheet of the results workbook.
foreach pov in atti_spm {
	* Bins are half-open intervals [lower, upper). Negative income stays in
	* the bottom bin for now; a missing ratio is left unbinned.
	g `pov'bin = 1 if  inc_`pov' < .5 //how to deal w/ neg inc? Keep in bottom bin for now
	replace `pov'bin = 2 if inc_`pov' >= 0.5 & inc_`pov' < 1
	replace `pov'bin = 3 if inc_`pov' >= 1 & inc_`pov' < 1.5
	replace `pov'bin = 4 if inc_`pov' >= 1.5 & inc_`pov' < 2
	* The top bin must include a ratio of exactly 2: bin 4 stops short of 2,
	* so the previous strict "> 2" left such records in no bin and they were
	* dropped from the shares.
	replace `pov'bin = 5 if inc_`pov' >= 2 & inc_`pov' != .

	lab def `pov'bin 1 "<50% `pov'" 2 "50-100% `pov'" 3 "100-150% `pov'" ///
		4 "150-200% `pov'" 5 "200%+ `pov'"
	lab values `pov'bin `pov'bin

	sum `pov'bin, d //make sure everything looks ok
	tab `pov'bin, mi
	tab `pov'bin, su(eic_d)

	*Export to excel
	preserve //for easy re-grouping
	collapse (sum) eic_d ctc chtcr addcrd [pw = weight], by(year `pov'bin) //get total amounts by bin
	foreach cred in eic_d ctc chtcr addcrd {
		egen tot_`cred' = total(`cred'), by(year) //annual total across
			//all FPL groups
		g pct`cred'`pov' = `cred'/tot_`cred' //get percent of total going to each group
	}
	keep year `pov'bin pct*
	*Reshape wide so easier to deal w/
	*Take out missing values (records whose ratio could not be computed)
	drop if `pov'bin == .

	reshape wide pct*, i(year) j(`pov'bin)
	*Change file name slightly so can have open with main spreadsheet
	*to port these results over
	order year pcte* pctctc* pctchtcr* pctaddcrd* //logical order for easy sorting
	export excel using "${results}/timeseries_nber.xlsx", sheet(EITC`pov') ///
		sheetreplace firstrow(variable)
	restore
} //end poverty definition loop

*smaller file for troubleshooting: keep only records with a positive amount
*of at least one of the credits
keep if (eic_d > 0 & eic_d != .) ///
	| (ctc > 0 & ctc != .) ///
	| (chtcr > 0 & chtcr != .) ///
	| (addcrd > 0 & addcrd != .)

*Output the share of EITC going to filers with kids to xlsx
*NOTE(review): this sheet is written under ./out while the bin sheets go to
*the results folder - confirm both are meant to land in the same workbook.
preserve
*One row per year (the share is already constant within year)
collapse (mean) pcteickid, by(year)
*Make sure no childless credit pre-OBRA
replace pcteickid = 1 if year < 1994
*Carry the 2011 value forward through 2015: make five copies of the 2011
*row and renumber them 2011, 2012, ..., 2015
expand 5 if year == 2011
bysort year: generate n = _n
replace year = 2010 + n if year == 2011 & inrange(n, 1, 5)
tabulate year, missing
export excel using "./out/timeseries_nber.xlsx", sheet(pctkid) sheetreplace ///
	firstrow(variables)
restore

capture log close
***
