***********************
* Early Impact of ACA *
* Amanda Kowalski     *
*                     *
* 13 October 2014     *
***********************

****************************************
* PURPOSE: Clean data                  *
*		i.   Annual data       *
*		ii.  Schedule T data   *
*		iii. Quarterly data    *
*		iv.  Population data   *
*		v.   Other data        *
****************************************

* Session setup: empty the data in memory, then set session defaults
clear
set more off, permanently
set type double
set matsize 2000

* Named log for this script: close a leftover log of the same name if one
* is open (errors ignored), then open it again, overwriting the old file
local tag "clean_data"
capture log close `tag'
log using "logs/`tag'.log", name(`tag') replace

******************
* i. Annual data *
******************
forvalues year = 2008/2013 {
	* Load the raw annual extract with header rows read as data,
	* then discard the first row
	insheet using "data/raw/`year'_nd.csv", comma nonames clear
	drop if _n == 1

	* Name every column from the header text now held in rows 1-4
	foreach col of varlist _all {
		* n1..n4 = this column's entries in rows 1 through 4
		forvalues r = 1/4 {
			local n`r' = `col'[`r']
		}

		* Measure prefix, picked by keyword in the row-1 label.
		* s1 is left at its previous value when no keyword matches.
		if regexm("`n1'", "Enrollment") {
			local s1 "enr"
		}
		else if regexm("`n1'", "Months") {
			local s1 "mmonths"
		}
		else if regexm("`n1'", "Premiums") {
			local s1 "premium"
		}
		else if regexm("`n1'", "Paid") {
			local s1 "cost"
		}

		* Suffix = last two characters of the row-4 entry
		local s2 = substr("`n4'", -2, .)

		* The four identifier columns get fixed names;
		* every other column becomes prefix + _st + suffix
		if "`n1'" == "Entity Name" {
			local varname "name"
		}
		else if "`n1'" == "SNL Statutory Entity Key" {
			local varname "id"
		}
		else if "`n1'" == "State Domiciled" {
			local varname "domicile"
		}
		else if "`n1'" == "SNL Group Name" {
			local varname "group"
		}
		else {
			local varname "`s1'_st`s2'"
		}

		rename `col' `varname'
	}

	* The header rows have served their purpose
	drop if _n <= 4

	* Wide to long: one row per entity and two-character suffix (state)
	reshape long enr_st mmonths_st premium_st cost_st, ///
		i(name id domicile group) j(state) string

	* Convert the four measures to numeric
	* (enr, mmonths, premium, cost abbreviate the *_st variables)
	destring enr mmonths premium cost, ignore("NA", ",") replace

	* Tag the year of this extract
	gen yr = `year'

	* Sort by id and state, number the rows within id in that order,
	* and move the key variables to the front
	bysort id (state) : gen state_n = _n
	order id state state_n name group domicile

	* Write the cleaned year
	save "data/intermediate/`year'_data.dta", replace
}

***********************
* ii. Schedule T data *
***********************
* Load the raw Schedule T extract with header rows read as data,
* then discard the first row
insheet using "data/raw/scheduleT.csv", comma nonames clear
drop if _n == 1

* Name every column from the header text now held in rows 1-4
foreach col of varlist _all {
	* n1..n4 = this column's entries in rows 1 through 4
	forvalues r = 1/4 {
		local n`r' = `col'[`r']
	}

	* Stub for the premium columns. s1 is left at its previous value
	* when the row-1 label does not contain Direct Business.
	if regexm("`n1'", "Direct Business") {
		local s1 "prem_t"
	}

	* Suffix = row-3 entry followed by the last two characters of row 4
	local s2 "`n3'"
	local s3 = substr("`n4'", -2, .)

	* The four identifier columns get fixed names;
	* every other column becomes stub + suffix
	if "`n1'" == "Entity Name" {
		local varname "name"
	}
	else if "`n1'" == "SNL Statutory Entity Key" {
		local varname "id"
	}
	else if "`n1'" == "State Domiciled" {
		local varname "domicile"
	}
	else if "`n1'" == "SNL Group Name" {
		local varname "group"
	}
	else {
		local varname "`s1'`s2'`s3'"
	}

	rename `col' `varname'
}

* The header rows have served their purpose
drop if _n <= 4

* Wide to long: one row per entity and column suffix
reshape long prem_t, i(name id domicile group) j(yrqtr_st) string

* Split the suffix into its parts. Suffixes containing MRQ get an empty
* yr and qtr and a three-character yrqtr; all others get characters 1-4
* as yr, character 6 as qtr and characters 1-6 as yrqtr. The last two
* characters are always the state.
gen state = substr(yrqtr_st, -2, .)
gen yr    = cond(regexm(yrqtr_st, "MRQ"), "", substr(yrqtr_st, 1, 4))
gen qtr   = cond(regexm(yrqtr_st, "MRQ"), "", substr(yrqtr_st, 6, 1))
gen yrqtr = cond(regexm(yrqtr_st, "MRQ"), substr(yrqtr_st, 1, 3), substr(yrqtr_st, 1, 6))
drop yrqtr_st

* Convert premiums, year and quarter to numeric
destring prem_t yr qtr, ignore("NA", ",") replace

* Arrange columns, sort by id, descending yrqtr, then state
order id yrqtr state yr qtr name group domicile prem_t
gsort id -yrqtr state

* Row counter within id. The name yrqtr_st is reused here for a numeric
* index; the string variable of that name was dropped above.
bysort id : gen yrqtr_st = _n
order id yrqtr_st

* Write the cleaned Schedule T data
save "data/intermediate/schedulet_data.dta", replace

***********************
* iii. Quarterly data *
***********************
* Load the raw quarterly extract with header rows read as data,
* then discard the first row
insheet using "data/raw/quarterly_data_nd.csv", comma nonames clear
drop if _n == 1

* Name every column from the header text now held in rows 1-4
foreach col of varlist _all {
	* n1..n4 = this column's entries in rows 1 through 4
	forvalues r = 1/4 {
		local n`r' = `col'[`r']
	}

	* Measure prefix, picked by keyword in the row-1 label.
	* s1 is left at its previous value when no keyword matches.
	if regexm("`n1'", "Enrollment") {
		local s1 "enr"
	}
	else if regexm("`n1'", "Months") {
		local s1 "mmonths"
	}
	else if regexm("`n1'", "Premiums") {
		local s1 "premium"
	}
	else if regexm("`n1'", "Paid") {
		local s1 "cost"
	}

	* Suffix = the row-3 entry
	local s2 "`n3'"

	* The four identifier columns get fixed names;
	* every other column becomes prefix + suffix
	if "`n1'" == "Entity Name" {
		local varname "name"
	}
	else if "`n1'" == "SNL Statutory Entity Key" {
		local varname "id"
	}
	else if "`n1'" == "State Domiciled" {
		local varname "domicile"
	}
	else if "`n1'" == "SNL Group Name" {
		local varname "group"
	}
	else {
		local varname "`s1'`s2'"
	}

	rename `col' `varname'
}

* The header rows have served their purpose
drop if _n <= 4

* Wide to long: one row per entity and column suffix (yrqtr)
reshape long enr mmonths premium cost, ///
	i(name id domicile group) j(yrqtr) string

* Year = characters 1-4 and quarter = character 6 of yrqtr;
* both are left empty for suffixes containing MRQ
gen yr  = cond(regexm(yrqtr, "MRQ"), "", substr(yrqtr, 1, 4))
gen qtr = cond(regexm(yrqtr, "MRQ"), "", substr(yrqtr, 6, 1))

* Convert the measures, year and quarter to numeric
destring enr mmonths premium cost yr qtr, ignore("NA", ",") replace

* Arrange columns; sort by id, then descending yrqtr
order id yrqtr yr qtr name group domicile enr mmonths premium cost
gsort id -yrqtr

* Write the cleaned quarterly data
save "data/intermediate/quarterly_data_nd.dta", replace

***********************
* iv. Population data *
***********************

*2000-2010 state population data (only use 2000-2009)
insheet using "data/raw/pop2000-2010.csv", clear

*Clean: drop rows whose third column is empty, then the first remaining
*row, then the national, regional and Puerto Rico rows
drop if v3 == ""
drop if _n == 1
drop if inlist(v1, "United States",	///
		   "Northeast",		///
		   "Midwest",		///
		   "South",		///
		   "West",		///
		   "Puerto Rico")
*Columns not carried forward (presumably the base/census and 2010
*columns, given the 2000-2009 note above; confirm against the raw file)
drop v2 v13 v14
compress

*Rename vars: each remaining column becomes pop + the value in its first
*row; that label row is dropped afterwards
rename v1 state_name
foreach var of varlist v3-v12 {
	local newname = `var'[1]
	rename `var' pop`newname'
}
drop if _n == 1

*Strip a leading period from the state name. The pattern is anchored and
*escaped: an unescaped . is a regex wildcard and would delete the first
*character of every name, period or not.
replace state_name = regexr(state_name, "^\.", "")

*Hold the 2000-2009 columns for the merge below
tempfile pop
save `pop'

*2010-2013 state population data
insheet using "data/raw/pop2010-2013.csv", clear

*Clean: drop rows whose third column is empty and the national, regional
*and Puerto Rico rows
drop if v3 == ""
drop if inlist(v1, "United States",	///
		   "Northeast",		///
		   "Midwest",		///
		   "South",		///
		   "West",		///
		   "Puerto Rico")
drop v2 v3
compress

*Rename vars: each remaining column becomes pop + the value in its first
*row; that label row is dropped afterwards
rename v1 state_name
foreach var of varlist v4-v7 {
	local newname = `var'[1]
	rename `var' pop`newname'
}
drop if _n == 1

*Strip a leading period from the state name (same anchored, escaped
*pattern as for the 2000-2010 file so the merge keys agree)
replace state_name = regexr(state_name, "^\.", "")

*Merge 2000-2009; every state must appear in both files
merge 1:1 state_name using `pop'
assert _merge == 3
drop _merge

*Reshape to one row per state and year
reshape long pop, i(state_name) j(yr) string

*Destring
destring yr pop, ignore(",") replace

*Use 2013 data for 2014: duplicate the 2013 rows and relabel the copies
expand 2 if yr == 2013, gen(is2014)
replace yr = 2014 if is2014
drop is2014

*Save
isid state_name yr // Verifies (state_name, yr) identifies unique obs
compress
save "data/intermediate/popdata.dta", replace

*****************
* v. Other data *
*****************

** Data on State Policies **
	* (state exchange, Medicaid expansion, direct enforcement, glitches)
** Data on Firms Listed on Exchanges and HMOs **
	* Both raw CSVs have variable names in the first row and are converted
	* to .dta as they are, apart from one rename in the firms file
	foreach src in grouping firms_exch {
		insheet using "data/raw/`src'.csv", comma names clear

		* Firms file only: upper-case the HMO variable name
		if "`src'" == "firms_exch" {
			rename hmo HMO
		}

		compress
		save "data/intermediate/`src'.dta", replace
	}

** ASPE Data on Exchange Enrollment **
	* Build the state-name to abbreviation lookup first and park it in a
	* tempfile, so the enrollment data never has to be set aside
	insheet using "data/raw/usps_state_ab.csv", comma names clear
	rename statepossession state
	rename abbreviation state_ab

	tempfile state_ab
	save `state_ab'

	* Enrollment data
	insheet using "data/raw/aspe_appendix_e.csv", comma names clear

	* Convert the count variables to numeric
	destring elig subselig medicaidelig exenroll, ignore("N/A", ",") replace

	* Prepare the merge key: where the state entry contains a digit, keep
	* only the text ending two characters before the first open parenthesis,
	* then upper-case every entry
	replace state = substr(state, 1, strpos(state, "(") - 2) if regexm(state, "[0-9]")
	replace state = upper(state)

	* Attach abbreviations. Every enrollment row must find a match;
	* lookup rows without enrollment data are discarded.
	merge 1:1 state using `state_ab'
	assert _merge != 1
	keep if _merge == 3
	drop _merge

	* Keep relevant variables, in this column order
	keep state_ab elig subselig medicaidelig exenroll
	order state_ab elig subselig medicaidelig exenroll

	* Save final dataset
	save "data/intermediate/aspe_appendix_e.dta", replace

** Data on Uninsured Population and State Policies **
	* Pass-through: the raw file is already a .dta, so it is loaded and
	* re-saved under data/intermediate without changes
	use "data/raw/unins_cr_gi.dta", clear
	
	save "data/intermediate/unins_cr_gi.dta", replace
	
* Close the named log opened at the top of the script
log close `tag'
