/*==============================================================================                                                                   
Project: 	Brookings Criminal Justice Symposium	    
			Will Dobbie and Crystal Yang         
			
			This file cleans and imports all relevant files for use in the files
			called by US_PTDetention_RacialGaps_MASTER.do. 
			
			First coded: 1/21/2021
			Last updated: 4/26/2021
			
================================================================================*/

/** Paths set in 0_US_PTDetention_RacialGaps_MASTER.do 

clear all
set more off
cap log close
set maxvar 100000

* Local & Network directories
global Brookings  = "SCPS data folder (Encrypted Drive)"
global local_temp = "$Brookings/Temp"
global local_data = "$Brookings/ SCPS data sub-Folder with main dataset"

global dropbox    = "Network directory with public access datasets"
global do_files   = "$dropbox/Do_files"
global results	  = "$dropbox/Results"

global censuskey  = "739fc5d493d01589c1bdb7a2fd30a2b308b3e893"

cd "$dropbox"
*/

*Install necessary packages
// ssc install egenmore
// net install getcensus, from("https://raw.githubusercontent.com/CenterOnBudget/getcensus/master/src") replace 
// ssc install jsonio

********
**SCPS**
******** 

**Import data from SCPS**
*DS0001 workbook; the first row supplies the variable names
import excel using "$local_data/DS0001/02038-0001-Data-REST.xlsx", firstrow clear

*Released / detained indicators built from RELDET1 (1 -> Release, 2 -> Detained);
*RELDET1 values of 8 or more (which includes missing) leave both flags missing
gen Release  = cond(RELDET1 >= 8, ., RELDET1 == 1)
gen Detained = cond(RELDET1 >= 8, ., RELDET1 == 2)

*Financial conditions: 1 for financial release or held on bail (RELDET2 1 or 4),
*0 otherwise when Release is known, missing when RELDET2 >= 6
gen FinancialConditions = cond(RELDET2 >= 6, ., ///
						  cond(inlist(RELDET2, 1, 4), 1, ///
						  cond(mi(Release), ., 0)))

*Bail denied flag: 1 when RELDET2 == 5, 0 otherwise when Release is known,
*missing when RELDET2 >= 7
gen BailDenied = cond(RELDET2 >= 7, ., ///
				 cond(RELDET2 == 5, 1, ///
				 cond(mi(Release), ., 0)))

*Recode missing-value codes on the covariates
*GENDER becomes a MALE dummy: 2 -> 0, 9 -> missing
rename GENDER MALE
replace MALE = cond(MALE == 9, ., 0) if inlist(MALE, 2, 9)

replace AGE = . if AGE == 999

*99 is treated as missing on the prior-record counts and on OFFENSE1
foreach v in PRIARR PRIFARR PRICONV PRIFCONV OFFENSE1 {
	replace `v' = . if `v' == 99
}
*PRIORFTA: 2 -> 0; 8 and 9 (NA) -> missing
replace PRIORFTA = cond(PRIORFTA == 2, 0, .) if inlist(PRIORFTA, 2, 8, 9)
replace OFFTYPE1 = . if OFFTYPE1 == 9
label variable YEAR "Year"

*String FIPS code, left-padded with a zero when only four digits long
tostring COUNTY, gen(FIPS)
replace FIPS = "0" + FIPS if strlen(FIPS) == 4

*Race indicators and race category (code 9 is treated as missing first)
foreach v in RACE ORIGIN {
	replace `v' = . if `v' == 9
}

gen Black        = (RACE == 2)
gen NonHisWhite  = (RACE == 1 & ORIGIN == 2)
gen HisWhite     = (RACE == 1 & ORIGIN == 1)
gen Race_Other   = inlist(RACE, 3, 4)
gen Race_Missing = mi(RACE)

*1 = non-Hispanic White, 2 = Hispanic White, 3 = Black, missing otherwise
gen RaceCategory = cond(Black == 1, 3, ///
				   cond(HisWhite == 1, 2, ///
				   cond(NonHisWhite == 1, 1, .)))

*Constant, case counter, and row identifier used by later loops and collapses
gen All = 1
gen n_case = 1
gen ID = _n

*Charge-category dummies from OFFTYPE1 codes 1-4; missing when OFFTYPE1 is missing
local code = 0
foreach cat in violent property drug pubOrder {
	local ++code
	gen `cat' = (OFFTYPE1 == `code') if OFFTYPE1 != .
}

*Age-band dummies; missing when AGE is missing
gen age_24U    = (AGE <= 24)             if AGE != .
gen age_25_44  = (AGE > 24 & AGE <= 44)  if AGE != .
gen age_45_64  = (AGE > 44 & AGE <= 64)  if AGE != .
gen age_65over = (AGE >= 65)             if AGE != .

*Attach county names for presentation; rows found only in the names file are not kept
merge m:1 FIPS using "$dropbox/Data/FIPSNames.dta", keep(master match) nogenerate
label variable CountyName "County"
label variable STATE "State"

/*Patch the non-merges: FIPS 12025 is named Miami-Dade by hand, and FIPS 24410
is folded into 24005 (Baltimore FIPS assumed from MD state; 24005 data are
missing for the years in which 24410 is populated) */
replace CountyName = "Miami-Dade" if FIPS == "12025"
replace FIPS = "24005" if FIPS == "24410"
encode CountyName, gen(CountyName_num)

*Save cleaned and prepped data
save "$local_temp/D1_Clean_Prepped.dta", replace

************************
**Opportunity Insights**
************************
**Import data from Opportunity Insights**
*OI codebook: https://opportunityinsights.org/wp-content/uploads/2019/07/Codebook-for-Table-5.pdf
use "$dropbox/Data/Opportunity Insights/OI_county_outcomes_dta.dta", clear

*Build the string FIPS: 2-character state + 3-character county, zero padded
tostring state county, replace
replace state  = "0"  + state  if strlen(state)  == 1
replace county = "00" + county if strlen(county) == 1
replace county = "0"  + county if strlen(county) == 2
gen FIPS = state + county

*Recode the newer Miami-Dade FIPS (12086) to 12025, the code the other files use
replace FIPS = "12025" if FIPS == "12086"

*Label and rename the kir_* outcomes at each parent-income percentile
foreach pctile in 10 25 50 75 {
	label variable kir_pooled_pooled_p`pctile' 		""
	label variable kir_pooled_male_p`pctile' 		"- Male"
	label variable kir_pooled_female_p`pctile' 		"- Female"
	label variable kir_black_pooled_p`pctile'		"- Black"
	label variable kir_hisp_pooled_p`pctile'		"- Hispanic"
	label variable kir_white_pooled_p`pctile'		"- White"

	rename (kir_pooled_pooled_p`pctile'   kir_pooled_male_p`pctile'  ///
			kir_pooled_female_p`pctile'   kir_black_pooled_p`pctile' ///
			kir_hisp_pooled_p`pctile'     kir_white_pooled_p`pctile') ///
		   (interMob_All_p`pctile'        interMob_Male_p`pctile'    ///
			interMob_Female_p`pctile'     interMob_Black_p`pctile'   ///
			interMob_Hispanic_p`pctile'   interMob_NonHisWhite_p`pctile')
}

*Multiply every inter* variable by 100 (same scale as detention and racial gap rates)
foreach mobvar of varlist inter* {
	replace `mobvar' = `mobvar' * 100
}

save "$dropbox/Data/Opportunity Insights/OI_county_outcomes_dta_CLEAN.dta", replace

**Clean and import neighborhood characteristics data**
use "$dropbox/Data/Opportunity Insights/OI_county_NeighborhoodCharacteristics.dta", clear

*Express mean household income in units of $10,000
replace hhinc_mean2000 = hhinc_mean2000 / 10000

*Build the string FIPS: 2-character state + 3-character county, zero padded
tostring state county, replace
replace state  = "0"  + state  if strlen(state)  == 1
replace county = "00" + county if strlen(county) == 1
replace county = "0"  + county if strlen(county) == 2
gen FIPS = state + county

*SCPS uses the 1990 Miami-Dade FIPS, so recode 12086 to 12025
replace FIPS = "12025" if FIPS == "12086"

save "$dropbox/Data/Opportunity Insights/OI_county_NeighborhoodCharacteristics_CLEAN.dta", replace

**Clean and import Causal Place Effects data**
use "$dropbox/Data/Opportunity Insights/OI_county_CausalPlaceEffects.dta", clear

*For each covariate: flag missing values in mi_<name>, then set the missing values to 0
foreach v in dropout_r crime_total crime_violent scap_ski90pcm {
	gen mi_`v' = (`v' == .)
	replace `v' = 0 if mi_`v' == 1
}

*String FIPS from the 1990 county code, left-padded to five characters.
*Only difference between 1990 FIPS and 2000 FIPS is Miami-Dade (1990 county code
*is 12025 and 2000 county code is 12086) and Skagway-Yakutat-Angoon (1990 FIPS is
*2231 and 2000 county code is 2232)
tostring cty1990, replace
replace cty1990 = "0" + cty1990 if strlen(cty1990) == 4
gen FIPS = cty1990

save "$dropbox/Data/Opportunity Insights/OI_county_CausalPlaceEffects_CLEAN.dta", replace

**********
**Census**
**********
**Create annual poverty totals**
*Source: Historical Poverty Tables: People and Families - 1959 to 2019 Table 3
*https://www2.census.gov/programs-surveys/cps/tables/time-series/historical-poverty-people/hstpov3.xlsx
*Each race group is read from its own cell range of the same workbook and
*saved as hstpov_<group>.dta
local pov_xlsx "$dropbox/Data/Census/Historical Poverty/hstpov3.xlsx"
local pov_out  "$dropbox/Data/Census"

*All races: one contiguous range
import excel "`pov_xlsx'", clear cellrange(A9:M40)
gen Race = "All Races"
save "`pov_out'/hstpov_All.dta", replace

*White Non-Hispanic and Black: the 2002+ rows and the 1990-2001 rows sit in
*separate ranges, so read the 2002+ range first, save it, then read the
*1990-2001 range and append the 2002+ rows to it
foreach grp in NonHisWhite Black {
	if "`grp'" == "NonHisWhite" {
		local race_lbl   "White Non-Hisp."
		local rng_recent "A146:M165"
		local rng_early  "A171:M182"
	}
	else {
		local race_lbl   "Black"
		local rng_recent "A229:M248"
		local rng_early  "A254:M265"
	}

	*2002+
	import excel "`pov_xlsx'", clear cellrange(`rng_recent')
	gen Race = "`race_lbl'"
	save "`pov_out'/hstpov_`grp'2002.dta", replace

	*1990-2001, then append the later years
	import excel "`pov_xlsx'", clear cellrange(`rng_early')
	gen Race = "`race_lbl'"
	append using "`pov_out'/hstpov_`grp'2002.dta"
	save "`pov_out'/hstpov_`grp'.dta", replace
}

*Hispanic: one contiguous range
import excel "`pov_xlsx'", clear cellrange(A366:M397)
gen Race = "Hispanic"
save "`pov_out'/hstpov_Hispanic.dta", replace

*Create poverty rates
*For each race group: name the imported spreadsheet columns A-M, then build
*all-ages totals and the all-ages poverty rate (in percent).
*Every variable is referenced by its full name; the earlier version relied on
*Stata's variable-name abbreviation (e.g. AgeU18_All_Pov_Num for
*AgeU18_All_Pov_Num_`x'), which fails under -set varabbrev off- or as soon as
*another variable shares the prefix.
foreach x in All NonHisWhite Black Hispanic{

	use "$dropbox/Data/Census/hstpov_`x'.dta", clear

	*Rename variables
	rename A Year
	rename B AgeU18_All_Total_`x'
	rename C AgeU18_All_Pov_Num_`x'
	rename D AgeU18_All_Pov_Pct_`x'
	rename E AgeU18_Fam_Total_`x'
	rename F AgeU18_Fam_Pov_Num_`x'
	rename G AgeU18_Fam_Pov_Pct_`x'
	rename H Age18_64_Total_`x'
	rename I Age18_64_Pov_Num_`x'
	rename J Age18_64_Pov_Pct_`x'
	rename K Age65o_Total_`x'
	rename L Age65o_Pov_Num_`x'
	rename M Age65o_Pov_Pct_`x'

	*Create total poverty rate: sum the three age bands, then poor / total * 100
	gen All_Ages_Total_`x' 			= AgeU18_All_Total_`x' + ///
									  Age18_64_Total_`x' + ///
									  Age65o_Total_`x'
	gen All_Ages_Pov_Num_`x'		= AgeU18_All_Pov_Num_`x' + ///
									  Age18_64_Pov_Num_`x' + ///
									  Age65o_Pov_Num_`x'
	gen All_Ages_PctPov_`x'			= (All_Ages_Pov_Num_`x' / All_Ages_Total_`x')*100

	save "$dropbox/Data/Census/hstpov_`x'_CLEAN.dta", replace
}

*Merge data
*Start from the all-races file and attach each race file by Year; rows that
*exist only in the race file are not kept
use "$dropbox/Data/Census/hstpov_All_CLEAN.dta", clear

foreach grp in NonHisWhite Black Hispanic {
	merge 1:1 Year using "$dropbox/Data/Census/hstpov_`grp'_CLEAN.dta", ///
		keep(master match) nogenerate
}

*Racial gap in poverty rates: group rate minus the non-Hispanic White rate
foreach grp in Black Hispanic {
	gen RGapPctPov_NonHisWhite_`grp' = All_Ages_PctPov_`grp' - ///
									   All_Ages_PctPov_NonHisWhite
}
gen min_RGapPctPov = min(RGapPctPov_NonHisWhite_Black, RGapPctPov_NonHisWhite_Hispanic)

*Clean year variable: keep the first four characters and convert to numeric
replace Year = substr(Year, 1, 4)
destring Year, replace

*Label variables
label variable Year "Year"
label variable All_Ages_PctPov_All 			"All"
label variable All_Ages_PctPov_Black 		"Black"
label variable All_Ages_PctPov_Hispanic 	"Hispanic"
label variable All_Ages_PctPov_NonHisWhite  "White"

save "$dropbox/Data/Census/CPS_PctPov_1990_2019_CLEAN.dta", replace

**Create total population counts**
*P012_ data used for decennial population totals

*2000
*Codebook: https://api.census.gov/data/2000/dec/sf1/groups/P012.html
*Aggregate (excluding race)
*Import data
import excel "$dropbox/Data/Census/2000 Decennial/P12.xlsx", firstrow clear

*Add in variable labels: the first data row holds the description of each column.
*The macro is closed with exactly one right quote per left quote; an extra
*right quote would be appended to every label as a literal apostrophe.
foreach var of varlist P* {
	label variable `var' "`=`var'[1]'"
}

*Remove label row
drop if GEO_ID == "id"

*Reformat variables (strings -> numeric now that the label row is gone)
destring P*, replace

*Clean FIPS code: the five characters starting at position 10 of GEO_ID.
*The third argument of substr() is a length, so 5 is used instead of 14.
gen FIPS = substr(GEO_ID, 10, 5)

*Fix issue with old Miami-Dade FIPS
replace FIPS = "12025" if FIPS == "12086"

*Total
gen pop_All_2000 	= P012002 + P012026
gen Log_pop_All_2000 = log(pop_All_2000) //defaults to natural log

*Gender
gen pop_Female_2000	= P012026
gen share_Female_2000	= pop_Female_2000 / pop_All_2000

*25 to 44
gen pop_All_25_44_2000	= P012011 + P012012 + P012013 + P012014 + P012035 + P012036 + ///
						  P012037 + P012038

*15 to 64; age distinction at 16 is unavailable in P012
gen pop_All_16_64_2000 	= P012006 + P012007 + P012008 + P012009 + P012010 + P012011 + ///
						  P012012 + P012013 + P012014 + P012015 + P012016 + P012017 + ///
						  P012018 + P012019 + P012030 + P012031 + P012032 + P012033 + ///
						  P012034 + P012035 + P012036 + P012037 + P012038 + P012039 + ///
						  P012040 + P012041 + P012042 + P012043

drop GEO_ID NAME
save "$dropbox/Data/Census/P12.dta", replace

*Cell-level and aggregate race
*Note: I and H have switched race indicators from other Census tables
*(here I -> NonHisWhite and H -> Hispanic)
*All races
foreach x in B H I {

	*Map the table suffix to the race name used in variable names
	if "`x'" == "I" {
		local race = "NonHisWhite"
	}
	else if "`x'" == "B" {
		local race = "Black"
	}
	else if "`x'" == "H" {
		local race = "Hispanic"
	}

	*Import data
	import excel "$dropbox/Data/Census/2000 Decennial/P12`x'.xlsx", firstrow clear

	*Add in variable labels from the first data row. The macro is closed with
	*exactly one right quote per left quote; an extra one would be appended to
	*every label as a literal apostrophe.
	foreach var of varlist P* {
		label variable `var' "`=`var'[1]'"
	}

	*Remove label row
	drop if GEO_ID == "id"

	*Reformat variables
	destring P*, replace

	*Clean FIPS code: five characters starting at position 10 of GEO_ID
	*(third argument of substr() is a length, hence 5 rather than 14)
	gen FIPS = substr(GEO_ID, 10, 5)

	*Fix issue with old Miami-Dade FIPS
	replace FIPS = "12025" if FIPS == "12086"

	*Create aggregate race totals
	gen pop_`race'_2000 			= P012`x'002 + P012`x'026

	*Create race by age totals
	gen pop_`race'_25_44_2000 		= P012`x'011 + P012`x'012 + P012`x'013 + P012`x'014 + ///
									  P012`x'035 + P012`x'036 + P012`x'037 + P012`x'038
	gen pop_`race'_16_64_2000 		= P012`x'006 + P012`x'007 + P012`x'008 + P012`x'009 + ///
									  P012`x'010 + P012`x'011 + P012`x'012 + P012`x'013 + ///
									  P012`x'014 + P012`x'015 + P012`x'016 + P012`x'017 + ///
									  P012`x'018 + P012`x'019 + P012`x'030 + P012`x'031 + ///
									  P012`x'032 + P012`x'033 + P012`x'034 + P012`x'035 + ///
									  P012`x'036 + P012`x'037 + P012`x'038 + P012`x'039 + ///
									  P012`x'040 + P012`x'041 + P012`x'042 + P012`x'043

	drop GEO_ID NAME
	save "$dropbox/Data/Census/P12`x'.dta", replace
}

**Poverty, Unemployment, Employment**
*Download Census data - B17001 poverty; S2301 / employment
*2010 is the earliest year for which 5-year estimates are available

/*Import and clean B17001H (White alone, non hispanic),  B17001B 
(Black or African American alone), B17001I (Hispanic or Latino)
*/
*codebook: https://api.census.gov/data/2010/acs/acs5/groups/B17001H.html

foreach x in h b i {
	*Map the table suffix to the race name (variable names) and race text (labels)
	local race     = cond("`x'" == "h", "NonHisWhite", cond("`x'" == "b", "Black", "Hispanic"))
	local race_txt = cond("`x'" == "h", "White",       cond("`x'" == "b", "Black", "Hispanic"))

	di "`x'"
	di "`race'"

	*Pull the race-specific B17001 table for all counties (2010 ACS 5-year)
	getcensus B17001`x', geography(county) data(5) years(2010) key($censuskey) noerror clear
	gen FIPS = state + county

	*Recode the newer Miami-Dade FIPS to the one used by the other files
	replace FIPS = "12025" if FIPS == "12086"

	*Ages 25 to 44: cells 011/012/025/026 form the poverty count; the total
	*adds cells 040/041/054/055 to them
	gen total_`race'_25_44_2010 = b17001`x'_011e + b17001`x'_012e + ///
								  b17001`x'_025e + b17001`x'_026e + ///
								  b17001`x'_040e + b17001`x'_041e + ///
								  b17001`x'_054e + b17001`x'_055e
	gen pov_`race'_25_44_2010   = b17001`x'_011e + b17001`x'_012e + ///
								  b17001`x'_025e + b17001`x'_026e
	gen PctPov_`race'_25_44_2010 = pov_`race'_25_44_2010 / total_`race'_25_44_2010
	label variable PctPov_`race'_25_44_2010 "`race_txt' - Age 25 to 44 (%)"

	*Express the poverty share in percent (same scale as detention and racial
	*gap rates); the loop index is distinct from the outer table suffix
	foreach pctvar of varlist PctPov* {
		replace `pctvar' = `pctvar' * 100
	}

	cap drop year state county name geo_id
	save "$dropbox/Data/Census/B17001_`race'_2010_5yr.dta", replace
}

*Import B17001 to create aggregate variables as seen in S1701
*codebook: https://api.census.gov/data/2010/acs/acs5/groups/B17001.html
getcensus B17001, geography(county) data(5) years(2010) key($censuskey) noerror clear
gen FIPS = state + county

*Fix issue with old Miami-Dade FIPS
replace FIPS = "12025" if FIPS == "12086"

*25 to 44
*Poverty rate = poor / (poor + non-poor). Cells 011/012/025/026 are the poor
*25-44 counts and 040/041/054/055 the non-poor counts, matching the
*race-specific B17001 tables and the 2000 PCT142 calculation in this file.
*The denominator previously held only the non-poor cells, which produced the
*ratio of poor to non-poor rather than a poverty rate.
gen PctPov_All_25_44_2010 = (b17001_011e + b17001_012e + ///
							 b17001_025e + b17001_026e) / ///
							(b17001_011e + b17001_012e + ///
							 b17001_025e + b17001_026e + ///
							 b17001_040e + b17001_041e + ///
							 b17001_054e + b17001_055e)
label variable PctPov_All_25_44_2010 "Age 25 to 44 (%)"

*Put poverty percentage on same scale as detention and racial gap rates
foreach pctvar of varlist PctPov* {
	replace `pctvar' = `pctvar' * 100
}

cap drop year state county name geo_id
save "$dropbox/Data/Census/B17001_Aggregate_2010_5yr.dta", replace

/*Import and clean C23002H (White alone, non hispanic), 
C23002B (Black or African American alone), C23002I (Hispanic or Latino)
*/
*codebook: https://api.census.gov/data/2010/acs/acs5/groups/C23002H.html
foreach x in h b i {
	*Map the table suffix to the race name (variable names) and race text (labels)
	local race     = cond("`x'" == "h", "NonHisWhite", cond("`x'" == "b", "Black", "Hispanic"))
	local race_txt = cond("`x'" == "h", "White",       cond("`x'" == "b", "Black", "Hispanic"))

	di "`x'"
	di "`race'"

	*Pull the race-specific C23002 table for all counties (2010 ACS 5-year)
	getcensus C23002`x', geography(county) data(5) years(2010) key($censuskey) noerror clear
	gen FIPS = state + county

	*Recode the newer Miami-Dade FIPS to the one used by the other files
	replace FIPS = "12025" if FIPS == "12086"

	*Employment ratio, ages 16 to 64: cells 007 + 020 over
	*(cells 003 + 016) minus (cells 005 + 018)
	gen Emp_`race'_16_64_2010   = (c23002`x'_007e + c23002`x'_020e) / ///
								  ((c23002`x'_003e + c23002`x'_016e) - ///
								   (c23002`x'_005e + c23002`x'_018e))
	label variable Emp_`race'_16_64_2010 "`race_txt' - Age 16 to 64 (%)"

	*Express in percent (same scale as detention and racial gap rates), then
	*blank out values of exactly 100
	*(original note: 100% employment rates occur only for older workers)
	foreach empvar of varlist Emp* {
		replace `empvar' = `empvar' * 100
		replace `empvar' = . if `empvar' == 100
	}

	*Save cleaned and prepped data
	cap drop year state county name geo_id
	save "$dropbox/Data/Census/C23002_`race'_2010_5yr.dta",replace
}

*Import and clean S2301 for 2010
*Codebook: https://api.census.gov/data/2010/acs/acs5/subject/groups/S2301.html
getcensus S2301, geography(county) data(5) years(2010) key($censuskey) clear
gen FIPS = state + county

*Recode the newer Miami-Dade FIPS to the one used by the other files
replace FIPS = "12025" if FIPS == "12086"

*Give the unemployment (c04) and employment (c03) columns descriptive names
rename (s2301_c04_001e        s2301_c03_001e      s2301_c03_004e     s2301_c03_011e) ///
	   (Unemp_All_16Over_2010 Emp_All_16Over_2010 Emp_All_25_44_2010 Emp_Black_2010)

label variable Unemp_All_16Over_2010 "Age 16 and Over (%)"
label variable Emp_All_16Over_2010 	"Age 16 and Over (%)"
label variable Emp_All_25_44_2010	"Age 25 to 44 (%)"
label variable Emp_Black_2010 		"Black (%)"

*Negative values are treated as missing
foreach ratevar of varlist Emp* Unemp* {
	replace `ratevar' = . if `ratevar' < 0
}

save "$dropbox/Data/Census/S2301_2010_5yr.dta", replace

**2000 Decennial Census data**
*Prepare and import decennial data
*For each table: clean the all-races workbook, convert the three by-race
*workbooks to .dta, stack all four, and add FIPS plus a readable race label.
foreach data in PCT142 PCT079 {
	*PCT142 - Poverty; PCT079 - Employment
	*Codebook for all PCT... variables: https://api.census.gov/data/2000/dec/sf4/variables.html
	*All races
	import excel "$dropbox/Data/Census/2000 Decennial/`data'.xlsx", firstrow clear

	*Add in variable labels from the first data row. The macro is closed with
	*exactly one right quote per left quote; an extra one would be appended to
	*every label as a literal apostrophe.
	foreach var of varlist P* {
		label variable `var' "`=`var'[1]'"
	}

	*Remove label row
	drop if GEO_ID == "id"

	*Reformat variables
	foreach var of varlist P* {
		destring `var', replace
	}

	*Save cleaned and prepped data
	save "$dropbox/Data/Census/`data'.dta", replace

	*By race
	*Import and clean
	*NOTE(review): unlike the all-races file, the by-race workbooks are saved
	*without dropping a label row or destringing - confirm they are already
	*numeric so the appends below do not hit a type mismatch.
	foreach x in b h i {
		import excel "$dropbox/Data/Census/2000 Decennial/`data'`x'.xlsx", firstrow clear
		save "$dropbox/Data/Census/`data'`x'.dta", replace
	}

	*Append all race data together
	use "$dropbox/Data/Census/`data'.dta", clear

	*Append by-race breakdowns
	append using "$dropbox/Data/Census/`data'b.dta"
	append using "$dropbox/Data/Census/`data'h.dta"
	append using "$dropbox/Data/Census/`data'i.dta"

	*Clean FIPS code: five characters starting at position 10 of GEO_ID
	*(third argument of substr() is a length, hence 5 rather than 14)
	gen FIPS = substr(GEO_ID, 10, 5)

	*Fix issue with old Miami-Dade FIPS
	replace FIPS = "12025" if FIPS == "12086"

	*Clean race category
	replace POPGROUP_LABEL = "NonHisWhite" if POPGROUP == 2
	replace POPGROUP_LABEL = "Black" if POPGROUP == 4
	replace POPGROUP_LABEL = "Hispanic" if POPGROUP == 400

	save "$dropbox/Data/Census/`data'_Combined.dta", replace
}

**Clean individual poverty and employment decennial datasets**
*PCT142 - Poverty
use "$dropbox/Data/Census/PCT142_Combined.dta", clear

*Age
*25 to 44: poor cells (011/012/025/026) over poor + non-poor cells, computed
*on the all-population rows (POPGROUP == 1)
gen PctPov_All_25_44_2000 	= (PCT142011 + PCT142012 + PCT142025 + PCT142026) / ///
							  (PCT142011 + PCT142012 + PCT142025 + PCT142026 + ///
							   PCT142040 + PCT142041 + PCT142054 + PCT142055) if ///
							   POPGROUP == 1

*Generate cell level race, gender, age group vars
*Same ratio, computed on the rows belonging to each race group
foreach popgroup in Black NonHisWhite Hispanic {
	gen PctPov_`popgroup'_25_44_2000 	= (PCT142011 + PCT142012 + PCT142025 + ///
										   PCT142026) / ///
										  (PCT142011 + PCT142012 + PCT142040 + ///
										   PCT142041 + PCT142025 + PCT142026 + ///
										   PCT142054 + PCT142055) if ///
										   POPGROUP_LABEL == "`popgroup'"
}

*Collapse to one observation
*Each rate is non-missing only on its own population-group row, so the max
*within FIPS picks that value up
collapse (max) PctPov*, by (FIPS)

*Put poverty rate on same scale as detention and racial gap rates
foreach x of varlist PctPov* {
	replace `x' = `x' * 100
}

*Label variables for graph presentation.
*Labels are applied after the collapse because -collapse- replaces variable
*labels with "(max) varname". The if/else chain is fully braced: previously
*the Black branch was never closed and the Hispanic branch never opened, so
*the else-if branches sat inside the Black branch.
label variable PctPov_All_25_44_2000	"Age 25 to 44"
foreach popgroup in Black NonHisWhite Hispanic {
	if "`popgroup'" == "Black" {
		local race_txt = "Black"
	}
	else if "`popgroup'" == "NonHisWhite" {
		local race_txt = "Non-Hispanic White"
	}
	else if "`popgroup'" == "Hispanic" {
		local race_txt = "Hispanic"
	}
	label variable PctPov_`popgroup'_25_44_2000     "`race_txt' - Age 25 to 44 (%)"
}

*Save data
save "$dropbox/Data/Census/PCT142_Clean.dta", replace

*PCT079 - Employment 
*Loads the combined PCT079 file and builds four ratios from hand-enumerated
*PCT079 cells. Every ratio is computed only on rows with POPGROUP == 1 and is
*missing on all other rows.
*NOTE(review): the cell meanings used in the comments below (unemployed,
*employed, civilian labor force, armed forces) are inferred from the variable
*names and the formulas - confirm against the SF4 codebook.
use "$dropbox/Data/Census/PCT079_Combined.dta", clear

*Generate aggregate race, gender, age group vars

*Unemployment		   
*Numerator: cells 008-092 and 100-184 in steps of 7 (26 cells);
*denominator: cells 006-090 and 098-182 in steps of 7 (26 cells)
gen Unemp_All_16Over_2000		= (PCT079008 + PCT079015 + PCT079022 + PCT079029 + ///
								   PCT079036 + PCT079043 + PCT079050 + PCT079057 + ///
								   PCT079064 + PCT079071 + PCT079078 + PCT079085 + ///
								   PCT079092 + PCT079100 + PCT079107 + PCT079114 + ///
								   PCT079121 + PCT079128 + PCT079135 + PCT079142 + ///
								   PCT079149 + PCT079156 + PCT079163 + PCT079170 + ///
								   PCT079177 + PCT079184) / ///
								  (PCT079006 + PCT079013 + PCT079020 + PCT079027 + ///
								   PCT079034 + PCT079041 + PCT079048 + PCT079055 + ///
								   PCT079062 + PCT079069 + PCT079076 + PCT079083 + ///
								   PCT079090 + PCT079098 + PCT079105 + PCT079112 + ///
								   PCT079119 + PCT079126 + PCT079133 + PCT079140 + ///
								   PCT079147 + PCT079154 + PCT079161 + PCT079168 + ///
								   PCT079175 + PCT079182) if POPGROUP == 1	
label variable Unemp_All_16Over_2000	"Age 16 and Over (%)"

*Employment Rate 
*Age 
*Numerator: cells 007-091 and 099-183 in steps of 7;
*denominator: (002 + 094) minus cells 005-089 and 097-181 in steps of 7
gen Emp_All_16Over_2000 	= (PCT079007 + PCT079014 + PCT079021 + PCT079028 + ///
							   PCT079035 + PCT079042 + PCT079049 + PCT079056 + ///
							   PCT079063 + PCT079070 + PCT079077 + PCT079084 + ///
							   PCT079091 + PCT079099 + PCT079106 + PCT079113 + ///
							   PCT079120 + PCT079127 + PCT079134 + PCT079141 + ///
							   PCT079148 + PCT079155 + PCT079162 + PCT079169 + ///
							   PCT079176 + PCT079183) / ///
							 ((PCT079002 + PCT079094) - (PCT079005 + ///
							   PCT079012 + PCT079019 + PCT079026 + PCT079033 + ///
							   PCT079040 + PCT079047 + PCT079054 + PCT079061 + ///
							   PCT079068 + PCT079075 + PCT079082 + PCT079089 + ///
							   PCT079097 + PCT079104 + PCT079111 + PCT079118 + ///
							   PCT079125 + PCT079132 + PCT079139 + PCT079146 + ///
							   PCT079153 + PCT079160 + PCT079167 + PCT079174 + ///
							   PCT079181)) if POPGROUP == 1
								  
*Same construction restricted to six cell groups (three starting at 024/031/038
*and three at 116/123/130), labelled below as ages 25 to 44
gen Emp_All_25_44_2000 		= (PCT079028 + PCT079035 + PCT079042 + PCT079120 + ///
							   PCT079127 + PCT079134) / ((PCT079024 + ///
							   PCT079031 + PCT079038 + PCT079116 + PCT079123 + ///
							   PCT079130) - (PCT079026 + PCT079033 + ///
							   PCT079040 + PCT079118 + PCT079125 + PCT079132)) ///
							   if POPGROUP == 1
label variable Emp_All_16Over_2000	"Age 16 and Over (%)"
label variable Emp_All_25_44_2000	"Age 25 to 44 (%)"

*LFP Rate 							   
*Uses only cells from 094 upward: cells 098-182 in steps of 7 over
*094 minus cells 097-181 in steps of 7. No variable label is assigned.
gen LFP_Female_2000 		= (PCT079098 + PCT079105 + PCT079112 + PCT079119 + ///
							   PCT079126 + PCT079133 + PCT079140 + PCT079147 + ///
							   PCT079154 + PCT079161 + PCT079168 + PCT079175 + ///
							   PCT079182) / ((PCT079094)-(PCT079097 + ///
							   PCT079104 + PCT079111 + PCT079118 + PCT079125 + ///
							   PCT079132 + PCT079139 + PCT079146 + PCT079153 + ///
							   PCT079160 + PCT079167 + PCT079174 + PCT079181)) ///
							   if POPGROUP == 1

*Generate cell level rates
*Employment ratio for ages 16 to 64, computed on the rows of each race group:
*cells 007-070 and 099-162 (steps of 7) over the matching group totals
*(003-066, 095-158) minus cells 005-068 and 097-160
foreach popgroup in Black NonHisWhite Hispanic {
	*Create race by age categories
	gen Emp_`popgroup'_16_64_2000 = (PCT079007 + PCT079014 + PCT079021 + ///
								     PCT079028 + PCT079035 + PCT079042 + ///
									 PCT079049 + PCT079056 + PCT079063 + ///
									 PCT079070 + PCT079099 + PCT079106 + ///
									 PCT079113 + PCT079120 + PCT079127 + ///
									 PCT079134 + PCT079141 + PCT079148 + ///
									 PCT079155 + PCT079162) / ///
									((PCT079003 + PCT079010 + PCT079017 + ///
									 PCT079024 + PCT079031 + PCT079038 + ///
									 PCT079045 + PCT079052 + PCT079059 + ///
								     PCT079066 + PCT079095 + PCT079102 + ///
								     PCT079109 + PCT079116 + PCT079123 + ///
									 PCT079130 + PCT079137 + PCT079144 + ///
								     PCT079151 + PCT079158) - (PCT079005 + ///
									 PCT079012 + PCT079019 + PCT079026 + ///
									 PCT079033 + PCT079040 + PCT079047 + ///
									 PCT079054 + PCT079061 + PCT079068 + ///
									 PCT079097 + PCT079104 + PCT079111 + ///
									 PCT079118 + PCT079125 + PCT079132 + ///
									 PCT079139 + PCT079146 + PCT079153 + ///
									 PCT079160)) if ///
									 POPGROUP_LABEL == "`popgroup'"
}

*Collapse to one observation
*Each rate is non-missing only on its own population-group row, so the max
*within FIPS picks that value up
collapse (max) Unemp* LFP* Emp* , by (FIPS)

*Put unemployment and employment rate on same scale as detention and racial gap rates
foreach x of varlist Unemp* Emp* LFP* {
	replace `x' = `x' * 100
}

*Label variables.
*Labels are (re)applied after the collapse because -collapse- replaces
*variable labels with "(max) varname". The if/else chain is fully braced:
*previously the Black branch was never closed and the Hispanic branch never
*opened, so the else-if branches sat inside the Black branch.
label variable Unemp_All_16Over_2000	"Age 16 and Over (%)"
label variable Emp_All_16Over_2000	"Age 16 and Over (%)"
label variable Emp_All_25_44_2000	"Age 25 to 44 (%)"
foreach popgroup in Black NonHisWhite Hispanic {
	if "`popgroup'" == "Black" {
		local race_txt = "Black"
	}
	else if "`popgroup'" == "NonHisWhite" {
		local race_txt = "Non-Hispanic White"
	}
	else if "`popgroup'" == "Hispanic" {
		local race_txt = "Hispanic"
	}
	label variable Emp_`popgroup'_16_64_2000 	"`race_txt' - Age 16 to 64"
}

save "$dropbox/Data/Census/PCT079_Clean.dta", replace

**Note regarding population weight and emp/pov denominator consistency**
/*2010 population count totals are consistently higher than the denominator used for poverty rate calculations, as provided in Census Tables B17001 for poverty and B01001 for population. The figures are similar, however, and follow a similar trend across counties. This may be a feature of the population for which the data is collected, as this discrepancy is also evident in the US totals for each table (see the top line in both https://data.census.gov/cedsci/table?q=b01001&tid=ACSDT5Y2010.B01001&hidePreview=false and https://data.census.gov/cedsci/table?q=b17001&tid=ACSDT5Y2010.B17001&hidePreview=false).*/
/*Population totals are able to be matched to the denominator for employment rate for the 65+ group, as provided in Census C23002 for employment and B01001 for population. Population totals for the group 16-64 are unable to be matched to employment denominators as the population tables do not disaggregate by age at age 16, (closest age group is 15-17). The population total for individuals age 15-64 is slightly above the denominator for employment rate of individuals age 16-64 and tracks trends across counties.*/

*********************************
**Federal Reserve Economic Data**
*********************************
*Employment / population ratio series; headers are read from the first row of A15:E255
import excel using "$dropbox/Data/Economic Indicators/FRED_EmpPop_1990_2009.xls", clear firstrow cellrange(A15:E255)

*Calendar year of each observation, and a daily display format for the date
gen Year = year(Date)
format Date %td

*Each EmpPop_<group> series is labelled with its group name
foreach grp in Hispanic Black White All {
	label variable EmpPop_`grp' "`grp'"
}

save "$dropbox/Data/Economic Indicators/FRED_EmpPop_1990_2009_CLEAN.dta", replace

*****************************************
**SCPS Detention Rates and Racial Gaps**
*****************************************
*No weights
*Import SCPS D1, includes all years
use "$local_temp/D1_Clean_Prepped.dta", clear

*Use correct denominator for detention rate: a case counts toward n_case only
*when its detention status is known. This matches the all-county totals
*computed later in this file; without it, cases with missing Detained add to
*the denominator but never to the numerator.
replace n_case = . if mi(Detained)

*Collapse to county level obs (one row per county x race group x year)
collapse (sum) Detained n_case, ///
by(CountyName FIPS STATE Black NonHisWhite HisWhite Race_Other YEAR)

*Create detention rate for each race group; each rate is filled only on the
*row belonging to that group
foreach grp in NonHisWhite HisWhite Black {
	gen Det_Rate_`grp' = (Detained)/n_case if `grp' == 1
}

/*Collapse to one observation per county per year; Condenses observations lines into one.
Counts are summed; each race-specific rate is carried over with (firstnm),
because (sum) of an all-missing group returns 0 and would report a 0% rate
for a county-year with no cases of that race.*/
collapse (sum) Detained n_case (firstnm) Det_Rate*, by(CountyName FIPS STATE YEAR)

*Change detention rate scale
foreach x in NonHisWhite HisWhite Black {
	replace Det_Rate_`x' = Det_Rate_`x' *100
}

*Generate overall detention rate (percent, rounded to one decimal)
gen Det_Rate = (Detained)/n_case
replace Det_Rate = Det_Rate *100

*Label variables
label variable Det_Rate "Detention Rate (%)"
replace Det_Rate = round(Det_Rate, .1)

*Add state to county name
replace CountyName = CountyName + ", " + STATE

*Export raw detention rate and racial gap data
save "$local_temp/DetRate_county.dta", replace

			
**Create totals for all counties**
*Start again from the cleaned SCPS D1 file (all years)
use "$local_temp/D1_Clean_Prepped.dta", clear

*A case enters the denominator only when its detention status is known
replace n_case = . if mi(Detained)

*Weighted counts by race group and year
*(original note: these weights applied during this collapse yield identical
*results to: svy: mean Detained using pweights)
collapse (sum) Detained n_case [pweight = TOTALWT], ///
	by(Black NonHisWhite HisWhite Race_Other YEAR)

*Annual detention rate per race group, filled only on that group's row
foreach grp in NonHisWhite HisWhite Black {
	gen Det_Rate_`grp' = (Detained)/n_case if `grp' == 1
}

/*One row per year. The sum over Det_Rate_* does not add rates across races:
each rate is non-missing on a single row, so the sum just carries it over.*/
collapse (sum) Detained n_case Det_Rate*, by(YEAR)

*Race-specific rates in percent
foreach grp in NonHisWhite HisWhite Black {
	replace Det_Rate_`grp' = Det_Rate_`grp' *100
}

*Overall detention rate in percent
gen Det_Rate = Detained/n_case
replace Det_Rate = Det_Rate *100

*Labels used in table presentation
label variable Det_Rate 			"All"
label variable Det_Rate_NonHisWhite "White"
label variable Det_Rate_HisWhite 	"Hispanic"
label variable Det_Rate_Black	 	"Black"

*Save data
save "$local_temp/DetRate_TOTAL_pweight.dta", replace

***************************************************************
**Merge Data for Regressions, Correlations, and Scatter Plots**
***************************************************************
*Import unadjusted detention rate and racial gap
use "$local_temp/DetRate_county.dta",clear

*Reshape data to wide (one row per county name, one Det_Rate column per year)
keep YEAR FIPS CountyName Det_Rate
reshape wide Det_Rate, i(CountyName) j(YEAR)

*Fix issue with Baltimore
di _N //71
replace CountyName = "Baltimore, MD" if CountyName == ", MD"
collapse (firstnm) CountyName Det_Rate* , by(FIPS) //This only collapses the baltimore observation with two duplicate FIPS
di _N //70

*Generate detention rate difference.
*Computed after the rows sharing a FIPS are consolidated: if the 2000 and 2009
*rates sit on different pre-collapse rows, a difference taken before the
*collapse is missing on both rows and stays missing afterwards.
gen Det_Rate_Dif_2000_2009 = Det_Rate2009 - Det_Rate2000

*Reformat and relabel variables
label variable Det_Rate_Dif_2000_2009 "{&Delta} Detention Rate 2000-2009 (%)"
replace Det_Rate_Dif_2000_2009 = round(Det_Rate_Dif_2000_2009, .1)
format Det_Rate_Dif_2000_2009 %9.0f
label variable Det_Rate2000 "Detention Rate 2000 (%)"
label variable Det_Rate1990 "Detention Rate 1990 (%)"

*Label two lowest and two highest counties in terms of detention rate diff
gsort -Det_Rate_Dif_2000_2009
egen Rank_Det_Rate_Dif = rank(Det_Rate_Dif_2000_2009)
sum Rank_Det_Rate_Dif
local max_rank = `r(max)'
local min_rank = `r(min)'
gen Det_Rate_Dif_Labels = CountyName if Rank_Det_Rate_Dif == `max_rank' | Rank_Det_Rate_Dif == `max_rank' - 1
replace Det_Rate_Dif_Labels = CountyName if Rank_Det_Rate_Dif == `min_rank' | Rank_Det_Rate_Dif == `min_rank' + 1

**Merge in 2010 data**
*Every merge below is 1:1 on FIPS and keeps only rows already in memory
*(matched or not); rows found only in the using file are not kept

*Poverty (ACS B17001): aggregate file, then race-specific files
merge 1:1 FIPS using "$dropbox/Data/Census/B17001_Aggregate_2010_5yr.dta", ///
	keep(master match) nogenerate
foreach race in NonHisWhite Black Hispanic {
	merge 1:1 FIPS using "$dropbox/Data/Census/B17001_`race'_2010_5yr.dta", ///
		keep(master match) nogenerate
}

*Employment (ACS S2301 aggregate, then race-specific C23002 files)
merge 1:1 FIPS using "$dropbox/Data/Census/S2301_2010_5yr.dta", ///
	keep(master match) nogenerate
foreach race in NonHisWhite Black Hispanic {
	merge 1:1 FIPS using "$dropbox/Data/Census/C23002_`race'_2010_5yr.dta", ///
		keep(master match) nogenerate
}

**Merge in 2000 data**
*Decennial poverty (PCT142), then employment (PCT079)
foreach tbl in PCT142 PCT079 {
	merge 1:1 FIPS using "$dropbox/Data/Census/`tbl'_Clean.dta", ///
		keep(master match) nogenerate
}

**Merge in OI data**
*Intergenerational mobility, neighborhood characteristics, causal place effects
foreach oi in OI_county_outcomes_dta OI_county_NeighborhoodCharacteristics OI_county_CausalPlaceEffects {
	merge 1:1 FIPS using "$dropbox/Data/Opportunity Insights/`oi'_CLEAN.dta", ///
		keep(master match) nogenerate
}

**Merge in Census population totals**
*Decennial - 2000: aggregate totals (P12), then cell-level totals by race (P12B, P12H, P12I)
foreach tbl in P12 P12B P12H P12I {
	merge 1:1 FIPS using "$dropbox/Data/Census/`tbl'.dta", ///
		keep(master match) nogenerate
}

**Generate differences**
*Each *_DIF variable is the 2010 value minus the 2000 value and takes the
*variable label of its 2010 component

*Poverty, ages 25 to 44
foreach grp in All Black NonHisWhite Hispanic {
	gen PctPov_`grp'_25_44_DIF = PctPov_`grp'_25_44_2010 - PctPov_`grp'_25_44_2000
	label variable PctPov_`grp'_25_44_DIF "`: variable label PctPov_`grp'_25_44_2010'"
}

*Employment rate, ages 16 to 64, by race
foreach grp in Black NonHisWhite Hispanic {
	gen Emp_`grp'_16_64_DIF = Emp_`grp'_16_64_2010 - Emp_`grp'_16_64_2000
	label variable Emp_`grp'_16_64_DIF "`: variable label Emp_`grp'_16_64_2010'"
}

*Employment rate, ages 25 to 44, all races
gen Emp_All_25_44_DIF = Emp_All_25_44_2010 - Emp_All_25_44_2000
label variable Emp_All_25_44_DIF "`: variable label Emp_All_25_44_2010'"

*Save and export data for use in Summary Statistics Table and Pov, Emp, IM regressions
save "$local_temp/All_Vars_Merged_Wide.dta", replace

