***********************
* Early Impact of ACA *
* Amanda Kowalski     *
*                     *
* 13 October 2014     *
***********************

*********************************************************************
* PURPOSE: Impute quarterly data                                    *
*		i.   Clean quarterly data                           *
*		ii.  Identify non-defunct firm-quarter observations *
*		iii. Imputation routine                             *
*		iv.  Adjustments to imputation                      *
*               v.   Summarize imputation                           *
*********************************************************************

* Session setup: empty memory, store new variables as doubles, and turn off
* output pagination (the -permanently- option saves that preference)
clear
set more off, permanently
set type double
set matsize 2000

* Open a named log, first closing any log of the same name still open
local tag impute_data
cap log close `tag'
log using "logs/`tag'.log", name(`tag') replace

***************************
* i. Clean quarterly data *
***************************
use "data/intermediate/quarterly_data_nd.dta", clear

* Quarterly date: discard the "MRQ" ("most recent quarter") rows, which are
* redundant, then combine yr and qtr into a quarterly date shown as %tq
drop if yrqtr == "MRQ"
generate yq = yq(yr, qtr)
format %tq yq

* Column order: identifiers and dates, then enr, mmonths, premium, cost
order id yrqtr name group domicile yq yr qtr enr mmonths premium cost

* Baseline count and summary statistics (compare to the summary at the end)
count
summarize enr mmonths premium cost

******************************************************
* ii. Identify non-defunct firm-quarter observations *
******************************************************

/* NOTE:
Say a firm is defunct if it is missing any of enrollment, member months,
premiums, or costs for two consecutive quarters

Some observations are very, very small in relative terms
(e.g. enrollment of 2 people when usually in the '000s)
such that they are essentially 0. These observations distort
trend regressions significantly enough to need to be excluded from model.

Flag observations with abs val "< 0.1 * median of positive values"
*/

* Screen each series for negligible values and mark defunct quarters.
* The four series are processed independently of one another.
local series enr mmonths premium cost

foreach var of local series {
	* Treat missing values as zeros
	replace `var' = 0 if `var' == .

	* Median of the strictly positive values within each firm, copied to
	* every row of the firm (max ignores the rows where the median is missing)
	bysort id : egen medpos_`var' = median(`var') if `var' > 0
	bysort id : egen med_`var' = max(medpos_`var')

	* Flag values smaller in absolute value than one tenth of that median
	* and set them to zero. Zeros themselves are flagged as well.
	* NOTE: a firm with no positive value has a missing median; Stata treats
	* missing as larger than any number, so all rows of that firm are flagged.
	gen flag_`var' = abs(`var') < med_`var' / 10
	replace `var' = 0 if flag_`var' == 1

	* A zero is defunct when the neighbouring row of the same firm (sorted
	* by yq) is also zero, or when it is the first or last row of the firm.
	* The series has no missing values at this point, so the result is 0/1.
	bysort id (yq) : gen defunct_`var' = `var' == 0 & ///
		(_n == 1 | _n == _N | `var'[_n-1] == 0 | `var'[_n+1] == 0)
}

* A firm-quarter is defunct if any of the four series is defunct
gen defunct = (defunct_enr + defunct_mmonths + defunct_premium + defunct_cost) > 0

	************************
	* Specific adjustments *
	************************

	* Indicator for a hand-picked list of firms; it is used only to compare
	* their summary statistics with those of the full sample
	gen manual_change = inlist(id, "C299", "C3169", "C3173", "C3174", "C3320", "C3344") ///
		| inlist(id, "C3368", "C3412", "C3448", "C3538", "C3562", "C3663") ///
		| inlist(id, "C3787", "C3797", "C3870", "C4385", "C4711")
	summarize enr mmonths premium cost if manual_change
	summarize enr mmonths premium cost

	* Firms whose observations are all kept
	replace defunct = 0 if inlist(id, "C299", "C3344", "C3448", "C3173", "C3174")

	* Firms kept except for a window of quarters (reset first, then re-mark)
	replace defunct = 0 if id == "C3169"
	replace defunct = 1 if id == "C3169" & inrange(yq, yq(2008,1), yq(2008,2))
	replace defunct = 0 if id == "C3320"
	replace defunct = 1 if id == "C3320" & inrange(yq, yq(2008,1), yq(2010,2))

	* Overrides limited to specific quarters
	replace defunct = 0 if id == "C473"  & inrange(yq, yq(2010,1), yq(2011,4))
	replace defunct = 0 if id == "C2195" & yq == yq(2008,1)
	replace defunct = 0 if id == "C3562" & inrange(yq, yq(2008,1), yq(2008,4))
	replace defunct = 1 if id == "C3797" & yq == yq(2009,2)
	replace defunct = 0 if id == "C3870" & yq == yq(2008,2)
	replace defunct = 0 if id == "C4385" & inrange(yq, yq(2013,2), yq(2013,4))

	* Firms removed from the data altogether
	drop if inlist(id, "C3368", "C3412", "C3538", "C3663", "C3787", "C4711")

	*****************************
	* Drop defunct observations *
	*****************************

	* Report the small-value flags, the defunct indicators, and the four
	* series over the observations that are about to be kept
	summarize flag*
	summarize defunct*
	summarize enr mmonths premium cost if defunct == 0

	* Keep the non-defunct observations and discard the helper variables
	keep if defunct != 1
	drop med* flag* defunct* manual_change

	* Declare a firm-by-quarter panel; tsfill then adds rows for quarters
	* missing "within" a time series so that they can be imputed
	encode id, generate(id_n)
	tsset id_n yq
	tsfill

	* Added rows: the four series are set to zero ...
	foreach var of varlist enr mmonths premium cost {
		replace `var' = 0 if `var' == .
	}
	* ... and the firm descriptors are copied from the preceding quarter
	foreach var of varlist id name group domicile {
		bysort id_n (yq) : replace `var' = `var'[_n-1] if `var' == ""
	}

	* Recompute year and quarter from yq, and build the yrqtr label
	* (year, "Q", quarter) wherever it is empty
	replace yr  = yofd(dofq(yq))
	replace qtr = quarter(dofq(yq))
	replace yrqtr = string(yr) + "Q" + string(qtr) if yrqtr == ""

* Overview of the cleaned panel: summary statistics for every variable,
* the number of observations, and the distinct firm ids (i.e. time series)
summarize
count
codebook id

* Store variables in the smallest type that holds them without loss
compress

***************************
* iii. Imputation routine *
***************************

* Imputed series start as copies of the raw series
foreach var in "enr" "mmonths" "premium" "cost" {
	gen `var'_imp = `var'
}

* Indicators for observations selected for imputation by the routine below
foreach var in "enr" "mmonths" "premium" "cost" {
	gen is_imp_`var' = 0
}

* For every firm and series with more than 5 non-zero observations:
*   1. fit a linear trend with quarter dummies on non-zero, pre-2013Q4 data
*   2. mark as outliers the firm's observations with an absolute studentized
*      residual above 2, plus the firm's non-positive observations
*   3. refit without the outliers and overwrite the outliers with the
*      fitted values wherever a fitted value exists
levelsof id, local(ids)
foreach id of local ids {
	foreach var in "enr" "mmonths" "premium" "cost" {
		count if id == "`id'" & `var' != 0
		if `r(N)' > 5 {
			*Run seasonally-adjusted, linear trend regression
			qui : reg `var' yq i.qtr if yq < yq(2013, 4) & id == "`id'" & `var' != 0
			*Studentized residuals, estimation sample only
			qui : predict pseudo_t if e(sample), rstudent
			*Flag outliers of the current firm only. The firm restriction
			* must cover both arms of the "or": without it the non-positive
			* arm would mark observations of every other firm as well.
			qui : gen is_outlier = 1 if id == "`id'" & ///
				((abs(pseudo_t) > 2 & pseudo_t != .) | `var' <= 0)
			qui : su is_outlier
			if `r(N)' > 0 {
				*Re-run seasonally-adjusted, linear trend regression
				qui : reg `var' yq i.qtr if yq < yq(2013, 4) & id == "`id'" & is_outlier != 1
				qui : predict `var'_hat if id == "`id'"
				*Keep the raw value when no fitted value is available
				qui : replace `var'_imp = `var'_hat if is_outlier == 1 & `var'_hat != .
				qui : replace is_imp_`var' = 1 if is_outlier == 1
				qui : drop `var'_hat
			}
			qui : drop pseudo_t is_outlier
		}
	}
}

*********************************
* iv. Adjustments to imputation *
*********************************

/* NOTE:
Some observations are unable to have a yhat because of lack of data
(cannot predict all quarterly dummies). We replace these observations
with the corresponding non-imputed observation (except when negative)
and with zero, otherwise.
*/

* Inspect the imputed series and list the firms with a negative imputation
summarize *imp
tabulate id if min(enr_imp, mmonths_imp, premium_imp, cost_imp) < 0
/* NOTE:
Some of the imputations are negative.
 Graphical analysis of these ids suggests that they are false positives
 (i.e. observations flagged for imputation for which it is unnecessary).
 Revert to the raw series, except where the raw series is itself negative;
 in that case use 0.
In addition, other observations have been imputed when no imputation was
 necessary. Restore these to raw values.
*/

* Negative imputation: fall back to the raw value, or to 0 if that is negative
foreach var of varlist enr mmonths premium cost {
	replace `var'_imp = cond(`var' < 0, 0, `var') if `var'_imp < 0
}

******************************
* State-specific adjustments *
******************************

************
* ARKANSAS *
************
* id == "C2161" -- error in mmonths because outlier is "masked"
* Two windows are imputed by hand: 2011Q2-Q3 (flag1) and 2013Q2-Q3 (flag2)
gen flag1 = id == "C2161" & inrange(yq, yq(2011,2), yq(2011,3))
gen flag2 = id == "C2161" & inrange(yq, yq(2013,2), yq(2013,3))
replace is_imp_mmonths = 1 if flag1 == 1 | flag2 == 1

* Trend regression on the firm's pre-2013Q4 data outside both windows
reg mmonths yq i.qtr if id == "C2161" & yq < yq(2013, 4) & flag1 == 0 & flag2 == 0

* Fitted values replace the imputed series inside the windows
predict mmonths_hat if flag1 == 1 | flag2 == 1
replace mmonths_imp = mmonths_hat if flag1 == 1 | flag2 == 1
drop mmonths_hat

* Rescale each window so that its imputed values add up to the total of
* the raw mmonths reported over the same window
foreach window in flag1 flag2 {
	qui : su mmonths_imp if `window'
	replace mmonths_imp = mmonths_imp / `r(sum)' if `window'
	qui : su mmonths if `window'
	replace mmonths_imp = mmonths_imp * `r(sum)' if `window'
}

drop flag*

********
* D.C. *
********
* id == "C3663" -- data anomaly; id only appears in 2008Q4, 2009Q4
* Enrollment, member months, and premiums could potentially be allocated
*  to 2008Q1-Q4 and 2009Q1-Q4; however, costs are very unusual.
* Drop observations.

drop if id == "C3663"

***********
* FLORIDA *
***********
drop if id == "C2242"

**********
* HAWAII *
**********
* id "C6170": member months in Q2-Q4 of 2011 and of 2012 are re-imputed
gen flag = id == "C6170" & ///
	(inrange(yq, yq(2011,2), yq(2011,4)) | inrange(yq, yq(2012,2), yq(2012,4)))

* Start from the raw values in those quarters
replace mmonths_imp = mmonths if flag == 1

* Member months: trend fitted on the firm's other pre-2013Q4 quarters
reg mmonths yq i.qtr if id == "C6170" & yq < yq(2013,4) & flag == 0
predict mmonths_fit if flag == 1
replace mmonths_imp = mmonths_fit if flag == 1
drop mmonths_fit flag

**********
* KANSAS *
**********
*Only operates in 2009Q2-Q3; very odd data
drop if id == "C3518"

************
* KENTUCKY *
************
*Has odd data for 2011Q4-2012Q2; those quarters are removed
drop if id == "C2467" & inrange(yq, yq(2011,4), yq(2012,2))

*************
* MINNESOTA *
*************
*C4726 is not imputed: every imputed series is reset to its raw values
foreach var of varlist enr mmonths premium cost {
	replace `var'_imp = `var' if id == "C4726"
}

***************
* MISSISSIPPI *
***************
*C3002 is not imputed: every imputed series is reset to its raw values
foreach var of varlist enr mmonths premium cost {
	replace `var'_imp = `var' if id == "C3002"
}

************
* MISSOURI *
************
*C3518 (Good Health HMO Inc.) has weird data;
* it accounts for very little enr, mm, prem, cost.
* The anomaly is not flagged by the imputation routine
* but is an apparent outlier in the MO state graph.
* (The same id is also dropped in the KANSAS section.)

drop if id == "C3518"

************
* NEBRASKA *
************
*C2195 BCBS NE reports mmonths in bizarre fashion 2008Q1-2010Q2; impute
* Reset the firm to raw values, then overwrite 2008Q1-2010Q2 with fitted
* values from a trend estimated on 2010Q3-2013Q3
replace mmonths_imp = mmonths if id == "C2195"
gen flag = id == "C2195" & inrange(yq, yq(2008,1), yq(2010,2))
reg mmonths yq i.qtr if id == "C2195" & inrange(yq, yq(2010,3), yq(2013,3))
predict mmonths_fit if flag == 1
replace mmonths_imp = mmonths_fit if flag == 1
drop mmonths_fit flag

**************
* NEW JERSEY *
**************
*C3510 (Horizon) disappears after 2009Q4; was on track to be largest provider
*C4711 (Triad), another large provider, is dropped at defunct stage because
* it is lacking premium & cost data

**************
* NEW MEXICO *
**************
*C4493 and C4494 both refer to Presbyterian;
* the data series seems to "switch" at 2013Q1, so that quarter keeps
* its raw values in every series
foreach var of varlist enr mmonths premium cost {
	replace `var'_imp = `var' if yq == yq(2013,1) & (id == "C4493" | id == "C4494")
}

************
* NEW YORK *
************
*Type II error for C3643 (Empire) in 2013Q2-Q3; do not impute!
replace enr_imp = enr if id == "C3643" & inrange(yq, yq(2013,2), yq(2013,3))

********
* OHIO *
********
*C299 2008Q1-2010Q4 get dropped at defunct stage; definitely impute
*C473 missing data in 2010; also, all 2011 appears in 2011Q4; drop these obs
*drop if id == "C473" & yq >= yq(2010,1) & yq <= yq(2011,4)
* NOTE(review): unlike the other trend regressions in this file, the ones
*  below are not restricted to yq < yq(2013,4) -- confirm this is intended.

* Imputation windows
*  flag1: C299, 2008Q1-2010Q4 (all four series)
*  flag2: C473, 2009Q3-2011Q4 (enr, mmonths)
*  flag3: C473, 2008Q1-2011Q4 (premium, cost)
gen flag1 = id == "C299" & inrange(yq, yq(2008,1), yq(2010,4))
gen flag2 = id == "C473" & inrange(yq, yq(2009,3), yq(2011,4))
gen flag3 = id == "C473" & inrange(yq, yq(2008,1), yq(2011,4))

* C299: reset to raw, fit the trend outside the window, fill the window
foreach var of varlist enr mmonths premium cost {
	replace `var'_imp = `var' if id == "C299"
	reg `var' yq i.qtr if id == "C299" & flag1 == 0
	predict `var'_hat if flag1 == 1
	replace `var'_imp = `var'_hat if flag1 == 1
	drop `var'_hat
}

* C473: same steps, with the window depending on the series
foreach var of varlist enr mmonths premium cost {
	local window flag3
	if inlist("`var'", "enr", "mmonths") {
		local window flag2
	}
	replace `var'_imp = `var' if id == "C473"
	reg `var' yq i.qtr if id == "C473" & `window' == 0
	predict `var'_hat if `window' == 1
	replace `var'_imp = `var'_hat if `window' == 1
	drop `var'_hat
}

drop flag1 flag2 flag3

************
* OKLAHOMA *
************
drop if id == "C3870"

****************
* PENNSYLVANIA *
****************
* Never impute for id C3448 (First Priority Life Ins Co.):
* every imputed series is reset to its raw values
foreach var of varlist enr mmonths premium cost {
	replace `var'_imp = `var' if id == "C3448"
}

******************
* SOUTH CAROLINA *
******************
* C300: member months in 2012Q2 are set to the 2013Q2 member months
* multiplied by the ratio of 2012Q2 to 2013Q2 enrollment
summarize enr if id == "C300" & yq == yq(2012,2)
local sc_enr_2012q2 = `r(sum)'
summarize enr if id == "C300" & yq == yq(2013,2)
local sc_enr_2013q2 = `r(sum)'
local sc_ratio = `sc_enr_2012q2' / `sc_enr_2013q2'

summarize mmonths if id == "C300" & yq == yq(2013,2)
replace mmonths_imp = `sc_ratio' * `r(sum)' if id == "C300" & yq == yq(2012,2)

********
* UTAH *
********
*C3337 (Select Health) poor reporting results in masked outlier
* Member months, premium and cost in 2011Q4 are set to their 2012Q4 values
* multiplied by the ratio of 2011Q4 to 2012Q4 enrollment
gen flag = id == "C3337" & yq == yq(2011,4)
summarize enr if flag == 1
local ut_enr_2011q4 = `r(sum)'
summarize enr if id == "C3337" & yq == yq(2012,4)
local ut_enr_2012q4 = `r(sum)'
local ut_ratio = `ut_enr_2011q4' / `ut_enr_2012q4'

foreach var of varlist mmonths premium cost {
	summarize `var' if id == "C3337" & yq == yq(2012,4)
	replace `var'_imp = `ut_ratio' * `r(sum)' if flag == 1
}
drop flag

*****************
* WEST VIRGINIA *
*****************
*C3313 (Coventry) only appears in three quarters and appears anomalous; drop
drop if id == "C3313"

***************************
* v. Summarize imputation *
***************************

* Final count and summary of the raw series, imputed series and indicators
count
summarize enr* mmonths* premium* cost*

* Imputed series on their own, then the correlation of each raw series
* with its imputed counterpart over the quarters before 2013Q4
summarize *imp
foreach var of varlist enr mmonths premium cost {
	correlate `var' `var'_imp if yq < yq(2013, 4)
}

* Write the imputed panel and close the named log
save "data/intermediate/quarterly_data_nd_imputed.dta", replace

log close `tag'
