
//  susb_read.do -- read in and recode ASCII/XLS downloads from 
//    Statistics of US Businesses (SUSB), US Census Bureau
//  

//=================================================================
//  STATIC DATA: EMPLOYMENT/FIRMS/ESTABS/PAYROLL ...
//=================================================================

//  ASCII downloads from:
//  http://www.census.gov/econ/susb/data/download_susb2002.html
//  to
//  http://www.census.gov/econ/susb/data/download_susb2007.html
//

//enterprise size variable, entrsize, changes from year to year
//http://www.census.gov/econ/susb/data/download_references.html
//
//recode entrsize as size_r
//  01  total
//  02  1-4
//  03  5-9
//  04  10-14
//  05  15-19
//  06  < 20
//  07  20-24
//  08  25-29
//  09  30-34
//  10  35-39
//  11  40-44
//  12  45-49
//  13  50-74
//  14  75-99
//  15  100-149
//  16  150-199
//  17  200-299
//  18  300-399
//  19  400-499
//  20  < 500
//  21  500-749
//  22  750-999
//  23  1000-1499
//  24  1500-2499
//  25  2500+
//2002, 2003 and 2004 use the same entrsize-to-size_r mapping, so the
//three years are read by one loop instead of three copies of the recode.
//each pass reads susb_raw/us_6digitnaics_detailedsizes_<year>.txt and
//saves susb_ind_size_<yy>.dta, where yy is the last two digits of the year
forvalues y = 2002/2004 {
  local yy = substr("`y'", 3, 2)

  clear
  insheet using susb_raw/us_6digitnaics_detailedsizes_`y'.txt
  gen year = `y'
  gen size_r = entrsize
  //every rule is matched against the original entrsize value.
  //codes 02 and 03 both end up as size_r 02; codes 05, 07, 08 and 10
  //are set to missing
  recode size_r ///
    (02 03 = 02) ///
    (04 = 03) ///
    (05 = .) ///
    (06 = 06) ///
    (07 = .) ///
    (08 = .) ///
    (09 = 20) ///
    (10 = .) ///
    (11 = 04) ///
    (12 = 05) ///
    (13 = 07) ///
    (14 = 08) ///
    (15 = 09) ///
    (16 = 10) ///
    (17 = 11) ///
    (18 = 12) ///
    (19 = 13) ///
    (20 = 14) ///
    (21 = 15) ///
    (22 = 16) ///
    (23 = 17) ///
    (24 = 18) ///
    (25 = 19) ///
    (26 = 21) ///
    (27 = 22) ///
    (28 = 23) ///
    (29 = 24) /// 1500 - 2499
    (30 = 25) // 2500 +
  save susb_ind_size_`yy', replace
}

//---- 2005 ----
//read the 2005 detailed-size file, tag it with its year and translate
//entrsize into the common size_r coding listed at the top of this file
clear
insheet using susb_raw/us_6digitnaics_detailedsizes_2005.txt
generate year = 2005
generate size_r = entrsize
//rules are matched against the original entrsize value, so the order
//in which they are listed does not matter
recode size_r ///
  (2 = 2) (3 = 3) ///
  (4 6 7 9 = .) /// set to missing
  (5 = 6) /// < 20
  (8 = 20) /// < 500
  (10 = 4) (11 = 5) ///
  (12 = 7) (13 = 8) (14 = 9) (15 = 10) (16 = 11) (17 = 12) ///
  (18 = 13) (19 = 14) (20 = 15) (21 = 16) (22 = 17) (23 = 18) ///
  (24 = 19) ///
  (25 = 21) (26 = 22) (27 = 23) ///
  (28 = 24) /// 1500 - 2499
  (29 = 25) // 2500 +
save susb_ind_size_05, replace

//---- 2006 ----
//read the 2006 detailed-size file, tag it with its year and translate
//entrsize into the common size_r coding listed at the top of this file.
//the mapping is the same one applied to the 2005 file
clear
insheet using susb_raw/us_6digitnaics_detailedsizes_2006.txt
gen year = 2006
gen size_r = entrsize
recode size_r ///
  (02 = 02) ///
  (03 = 03) ///
  (04 = .) ///
  (05 = 06) ///
  (06 = .) ///
  (07 = .) ///
  (08 = 20) ///
  (09 = .) ///
  (10 = 04) ///
  (11 = 05) ///
  (12 = 07) ///
  (13 = 08) ///
  (14 = 09) ///
  (15 = 10) ///
  (16 = 11) ///
  (17 = 12) ///
  (18 = 13) ///
  (19 = 14) ///
  (20 = 15) ///
  (21 = 16) ///
  (22 = 17) ///
  (23 = 18) ///
  (24 = 19) ///
  (25 = 21) ///
  (26 = 22) ///
  (27 = 23) ///
  (28 = 24) /// 1500 - 2499
  (29 = 25) // 2500 + ; this rule read (39 = 25), which left code 29 unrecoded
save susb_ind_size_06, replace

//---- 2007 ----
//read the 2007 detailed-size file and tag it with its year.
//only the top entrsize codes differ from size_r: 24 and 25 are folded
//into 24 (1500 - 2499), 26 and 27 into 25 (2500 +)
clear
insheet using susb_raw/us_6digitnaics_detailedsizes_2007.txt
generate year = 2007
generate size_r = entrsize
replace size_r = 24 if inlist(entrsize, 24, 25)
replace size_r = 25 if inlist(entrsize, 26, 27)
save susb_ind_size_07, replace

//stack the earlier years underneath the data currently in memory
foreach prior in 06 05 04 03 02 {
  append using susb_ind_size_`prior'
}


//impute employment for suppressed industry-by-size cells.
//at the industry level many of the employment cells for larger firms
//are suppressed and only a range flag is reported. as a rough
//approximation, set employment to the rounded midpoint of the flagged
//range; the open-ended top range is set to 100000

//flag letters with the lower and upper bound of each flagged range
local codes a b c e f g h i j k l
local lows  0 20 100 250 500 1000 2500 5000 10000 25000 50000
local highs 19 99 249 499 999 2499 4999 9999 24999 49999 99999
local ncodes : word count `codes'

//2007+ : upper-case letter in emplfl_r
forvalues k = 1/`ncodes' {
  local code : word `k' of `codes'
  local lo   : word `k' of `lows'
  local hi   : word `k' of `highs'
  local ucode = upper("`code'")
  replace empl = round((`lo'+`hi')/2) if emplfl_r == "`ucode'"
}
replace empl = 100000 if emplfl_r == "M"

//2002-2006 : bracketed lower-case letter in emplfl
forvalues k = 1/`ncodes' {
  local code : word `k' of `codes'
  local lo   : word `k' of `lows'
  local hi   : word `k' of `highs'
  replace empl = round((`lo'+`hi')/2) if emplfl == "[`code']"
}
replace empl = 100000 if emplfl == "[m]"

//1 when either flag variable is non-empty, 0 otherwise
gen empl_imputed = (emplfl != "") | (emplfl_r != "")


save susb_ind_size_0207, replace

//remove the per-year intermediate files.
//erase is Stata's built-in file delete, so this step no longer depends
//on the Windows-only "del" shell command. capture keeps a missing file
//from stopping the do-file
//NOTE(review): the 03-06 files are opened again further down by the
//"use susb_ind_size_<yy>" merge loop -- confirm they should be deleted here
foreach yy in 02 03 04 05 06 07 {
  capture erase susb_ind_size_`yy'.dta
}


//=================================================================
//  ADD DYNAMIC DATA
//=================================================================

//read these in from the excel spreadsheets available from
//http://www.census.gov/econ/susb/historical_data.html

//entered in by hand by copying from each spreadsheet

//2002 to 2003 (NAICS 1997)
//2003 to 2004
//2004 to 2005
//2005 to 2006
//2006 to 2007

//all spreadsheet years have identical formats
//change this for each year 2002 2003 2004 2005 2006
//two-digit year used for the year variable and the temp file name;
//change it and rerun this section for each spreadsheet
local yy 06

clear
edit
//directly pasted in the data from the appropriate excel spreadsheet
//copy all data below the column headings!

gen year = 20`yy'

//name the pasted columns
ren var1 naics
ren var2 naicsdscr
ren var3 datatype
ren var4 datatypedscr
ren var5 e1 //total
ren var6 e2 //1-4
ren var7 e3 //5-9
ren var8 e4 //10-19

//is_level is 1 for every datatype except 3, 8, 9, 12 and 17 or above,
//which the datatype listing further down describes as percent changes
gen is_level = ! (datatype == 3 | datatype == 8 | datatype == 9  ///
  | datatype == 12 | datatype >= 17)

//for each of the four size columns build a numeric data<i> plus a
//data<i>_imputed flag
forvalues i = 1/4 {
  gen data`i' = e`i' if is_level
  //flag cells holding a bracketed range letter such as "[a]".
  //the brackets are escaped: an unescaped "[a-m]" is a character class
  //and would flag any cell that merely contains one of the letters a-m
  gen data`i'_imputed = regexm(e`i', "\[[a-m]\]")

  //replace a range flag with the rounded midpoint of its range;
  //the open-ended top range is set to 100000
  replace data`i' = string(round((0+19)/2)) if e`i' == "[a]"
  replace data`i' = string(round((20+99)/2)) if e`i' == "[b]"
  replace data`i' = string(round((100+249)/2)) if e`i' == "[c]"
  replace data`i' = string(round((250+499)/2)) if e`i' == "[e]"
  replace data`i' = string(round((500+999)/2)) if e`i' == "[f]"
  replace data`i' = string(round((1000+2499)/2)) if e`i' == "[g]"
  replace data`i' = string(round((2500+4999)/2)) if e`i' == "[h]"
  replace data`i' = string(round((5000+9999)/2)) if e`i' == "[i]"
  replace data`i' = string(round((10000+24999)/2)) if e`i' == "[j]"
  replace data`i' = string(round((25000+49999)/2)) if e`i' == "[k]"
  replace data`i' = string(round((50000+99999)/2)) if e`i' == "[l]"
  replace data`i' = string(100000) if e`i' == "[m]"

  //convert the string column to numeric
  //NOTE(review): ignore(,(D)) presumably strips thousands separators and
  //turns "(D)" cells into missing -- confirm against the spreadsheets
  destring data`i', ignore(,(D)) replace
}

//create the entrsize == 6 (<20 emps) category
//data6 is the sum of the 1-4, 5-9 and 10-19 columns, so it is missing
//whenever any one of the three is missing
gen data6 = data2 + data3 + data4 if is_level
//sum of the three 0/1 flags: a count from 0 to 3, not a 0/1 indicator
gen data6_imputed = data2_imputed + data3_imputed + data4_imputed

//only the total (1) and the <20 aggregate (6) are carried forward
drop data2 data3 data4
drop data2_imputed data3_imputed data4_imputed

//give the flags an fdata<n> name so they reshape alongside data<n>
ren data1_imputed fdata1
ren data6_imputed fdata6

//one row per naics x datatype x size class (size_r takes the values 1 and 6)
reshape long fdata data, i(naics datatype) j(size_r) //the recoded entrsize

//only keep the level variables
drop if !is_level

//drop the raw pasted columns that are no longer needed
drop e1 e2 e3 e4
drop var9 var10 var11 var12

drop datatypedscr
//datatype codes; X marks the percent-change types that is_level
//excludes and that were dropped above
//01  Initial year establishments
//02  Change in establishments
//03  Percent change in establishments X
//04  Establishment births
//05  Establishment deaths
//06  Establishment expansions
//07  Establishment contractions
//08  Percent change in establishments due to births X
//09  Percent change in establishments due to deaths X

//10  Initial year employment
//11  Change in employment
//12  Percent change in employment X
//13  Change in employment due to births
//14  Change in employment due to deaths
//15  Change in employment due to expansions
//16  Change in employment due to contractions
//17  Percent change in employment due to births X
//18  Percent change in employment due to deaths X
//19  Percent change in employment due to expansions & births X
//20  Percent change in employment due to contractions & deaths X

//put data types across
//after this reshape the numeric suffix of data<n> / fdata<n> is the
//datatype code, no longer the size class
reshape wide fdata data, i(naics size_r) j(datatype)

//establishment measures (datatypes 1, 2, 4-7)
ren data1 estb_init
ren data2 estb_delta
ren data4 estb_births
ren data5 estb_deaths
ren data6 estb_expand
ren data7 estb_shrink

//employment measures (datatypes 10, 11, 13-16)
ren data10 empl_init
ren data11 empl_delta
ren data13 empl_births
ren data14 empl_deaths
ren data15 empl_expand
ren data16 empl_shrink

//sorted on the merge keys because the old-style merge below needs it
sort year naics size_r
save temp_dyn_`yy', replace

//attach the dynamic data to the static data, one year at a time.
//dyn_data is 1 for rows found in both files, 0 otherwise
//NOTE(review): the susb_ind_size_<yy> files opened here are deleted at
//the end of the static section -- confirm they still exist at this point
foreach yy in 03 04 05 06 {
  use susb_ind_size_`yy'
  //old-style merge needs the master sorted on the keys
  sort year naics size_r

  merge year naics size_r using temp_dyn_`yy'
  gen dyn_data = (_merge == 3)
  drop _merge
  save susb_ind_size_dyn_`yy', replace
}
//the loop was never closed before: the only later "}" belongs to the
//append loop further down

//get rid of the temp data (only after every year has been merged)
!del temp_dyn*.dta

//-----------------------------------------------------------------
//  COMBINE YEARS 2003 TO 2006 (NAICS 2002)
//-----------------------------------------------------------------

//put all the years together
//only use 2003 to 2006, since they all use naics 2002
//start from 2003 and stack 2004, 2005 and 2006 underneath it
use susb_ind_size_dyn_03
append using susb_ind_size_dyn_04
append using susb_ind_size_dyn_05
append using susb_ind_size_dyn_06

//drop 5 and 6 digit industries: keep codes of at most four characters,
//plus any code containing "-" whatever its length
keep if length(naics) <= 4 | regexm(naics,"-")
save susb_ind_size_dyn_0306, replace

//the per-year merged files are no longer needed
foreach yy in 03 04 05 06 {
  shell del susb_ind_size_dyn_`yy'.dta
}
