/*=======================================================================
 * susb1_cleanstatic.do : read in and recode ASCII/xls 
 *  downloads from Statistics of US Businesses (SUSB), US Census Bureau
 * 
 * Inputs:
 *  ./susb_raw/ *.txt  : ASCII downloads (see below)
 *
 * Outputs:
 *  sbh-susb-static.dta   : pooled cross section 2003-2007
 *  sbh-susb-static07.dta : original 2007 file w/ additional EC variables
 *    and more detailed size categories
 *
 * Notes:
 *
 * 050211 [benjamin] : created from old file
 *
 *=======================================================================
 */


// Start from a clean session: close any log left open by a previous run,
// then open a fresh plain-text log for this do-file.
capture log close
log using sbh-data-susb1-cleanstatic.txt, replace text

// Interpret the commands below under Stata 10 rules and wipe all state
// (data, macros, graphs) before touching any settings.
version 10
clear all
macro drop _all
graph drop _all

// Session settings: no -more- pauses, memory/matrix/variable limits,
// no variable-name abbreviation, 80-column log output.
set more off
set memory 500m
set matsize 400
set maxvar 10000
set varabbrev off
set linesize 80

//=================================================================
//  STATIC DATA: EMPLOYMENT/FIRMS/ESTABS/PAYROLL ...
//=================================================================

//  ASCII downloads from:
//  http://www.census.gov/econ/susb/data/download_susb2003.html
//  to
//  http://www.census.gov/econ/susb/data/download_susb2007.html
//

//enterprise size variable, entrsize, changes from year to year
//http://www.census.gov/econ/susb/data/download_references.html
//
// consistent size codes:
// deflzsize: define the value label -lzsize-, the year-consistent
// enterprise-size coding (codes 1-25) that every year's entrsize is
// recoded to further down in this file.
//   - takes no arguments and returns nothing; its only effect is to
//     (re)define the value label lzsize
//   - -replace- lets the program be called again even when lzsize is
//     already defined in memory
// Label text corrected relative to the earlier version:
//   code  3 was "3_4-9",    which overlapped code 2 at 4 employees
//   code 14 was "14_75-79", which left 80-99 uncovered before code 15
// NOTE(review): code 2 is labelled "1-4", but the 2003/2004 recodes pool
// two source codes into it -- it may really be 0-4; confirm against the
// Census size-code reference before relabelling.
cap program drop deflzsize
program define deflzsize
  label def lzsize    ///
    01 "1_Total"      ///
    02 "2_1-4"        ///
    03 "3_5-9"        ///
    04 "4_10-14"      ///
    05 "5_15-19"      ///
    06 "6_<20"        /// subtotal band
    07 "7_20-24"      ///
    08 "8_25-29"      ///
    09 "9_30-34"      ///
    10 "10_35-39"     ///
    11 "11_40-44"     ///
    12 "12_45-49"     ///
    13 "13_50-74"     ///
    14 "14_75-99"     ///
    15 "15_100-149"   ///
    16 "16_150-199"   ///
    17 "17_200-299"   ///
    18 "18_300-399"   ///
    19 "19_400-499"   ///
    20 "20_< 500"     /// subtotal band
    21 "21_500-749"   ///
    22 "22_750-999"   ///
    23 "23_1000-1499" ///
    24 "24_1500-2499" ///
    25 "25_2500+"     ///
    , replace
end
//-----------------------------------------------------------------------
//  READ/CODE 2003 DATA
//-----------------------------------------------------------------------
clear
insheet using susb_raw/us_6digitnaics_detailedsizes_2003.txt
generate year = 2003

// zsize starts as a copy of this year's entrsize and is then mapped onto
// the year-consistent lzsize codes. Every rule below reads a distinct
// source code, so the order in which the rules are listed is immaterial;
// source codes that appear in no rule (e.g. 1) are left as they are.
generate zsize = entrsize
label variable zsize "Longitudinal recode of entrsize"
deflzsize
label values zsize lzsize
recode zsize                                          ///
  (2 3 = 2)                                           /// two source codes pooled
  (4 = 3)   (6 = 6)   (9 = 20)                        ///
  (5 7 8 10 = .)                                      /// no lzsize category
  (11 = 4)  (12 = 5)                                  ///
  (13 = 7)  (14 = 8)  (15 = 9)  (16 = 10)             ///
  (17 = 11) (18 = 12) (19 = 13) (20 = 14)             ///
  (21 = 15) (22 = 16) (23 = 17) (24 = 18) (25 = 19)   ///
  (26 = 21) (27 = 22) (28 = 23)                       ///
  (29 = 24)                                           /// 1500 - 2499
  (30 = 25)                                           // 2500 +
save x-susb_ind_size_03, replace

//-----------------------------------------------------------------------
//  READ/CODE 2004 DATA
//-----------------------------------------------------------------------
clear
insheet using susb_raw/us_6digitnaics_detailedsizes_2004.txt
generate year = 2004

// Same entrsize -> zsize mapping as the 2003 file. The rules read
// disjoint source codes, so their listing order does not matter, and any
// source code that appears in no rule keeps its value.
generate zsize = entrsize
label variable zsize "Longitudinal recode of entrsize"
deflzsize
label values zsize lzsize
recode zsize                                          ///
  (2 3 = 2)                                           /// two source codes pooled
  (4 = 3)   (6 = 6)   (9 = 20)                        ///
  (5 7 8 10 = .)                                      /// no lzsize category
  (11 = 4)  (12 = 5)                                  ///
  (13 = 7)  (14 = 8)  (15 = 9)  (16 = 10)             ///
  (17 = 11) (18 = 12) (19 = 13) (20 = 14)             ///
  (21 = 15) (22 = 16) (23 = 17) (24 = 18) (25 = 19)   ///
  (26 = 21) (27 = 22) (28 = 23)                       ///
  (29 = 24)                                           /// 1500 - 2499
  (30 = 25)                                           // 2500 +
save x-susb_ind_size_04, replace

//-----------------------------------------------------------------------
//  READ/CODE 2005 DATA
//-----------------------------------------------------------------------
clear
insheet using susb_raw/us_6digitnaics_detailedsizes_2005.txt
generate year = 2005

// Map this year's entrsize onto the year-consistent lzsize codes.
// The source coding differs from 2003/2004: there is no pair of codes to
// pool into 2, and the remaining source codes sit one lower. Rules read
// disjoint source codes, so their listing order does not matter.
generate zsize = entrsize
label variable zsize "Longitudinal recode of entrsize"
deflzsize
label values zsize lzsize
recode zsize                                          ///
  (2 = 2)   (3 = 3)   (5 = 6)   (8 = 20)              ///
  (4 6 7 9 = .)                                       /// no lzsize category
  (10 = 4)  (11 = 5)                                  ///
  (12 = 7)  (13 = 8)  (14 = 9)  (15 = 10)             ///
  (16 = 11) (17 = 12) (18 = 13) (19 = 14)             ///
  (20 = 15) (21 = 16) (22 = 17) (23 = 18) (24 = 19)   ///
  (25 = 21) (26 = 22) (27 = 23)                       ///
  (28 = 24)                                           /// 1500 - 2499
  (29 = 25)                                           // 2500 +
save x-susb_ind_size_05, replace

//-----------------------------------------------------------------------
//  READ/CODE 2006 DATA
//-----------------------------------------------------------------------
clear
insheet using susb_raw/us_6digitnaics_detailedsizes_2006.txt
gen year = 2006

//recode entrsize as zsize
// The mapping is the same as the one applied to the 2005 file. The final
// rule previously read (39 = 25); with that typo source code 29 (2500+)
// was never recoded and stayed in zsize as an unlabelled 29 instead of
// lzsize code 25.
gen zsize = entrsize
label var zsize "Longitudinal recode of entrsize"
deflzsize
label val zsize lzsize
recode zsize  ///
  (02 = 02) ///
  (03 = 03) ///
  (04 = .)  /// no lzsize category
  (05 = 06) ///
  (06 = .)  /// no lzsize category
  (07 = .)  /// no lzsize category
  (08 = 20) ///
  (09 = .)  /// no lzsize category
  (10 = 04) ///
  (11 = 05) ///
  (12 = 07) ///
  (13 = 08) ///
  (14 = 09) ///
  (15 = 10) ///
  (16 = 11) ///
  (17 = 12) ///
  (18 = 13) ///
  (19 = 14) ///
  (20 = 15) ///
  (21 = 16) ///
  (22 = 17) ///
  (23 = 18) ///
  (24 = 19) ///
  (25 = 21) ///
  (26 = 22) ///
  (27 = 23) ///
  (28 = 24) /// 1500 - 2499
  (29 = 25) // 2500 + 
save x-susb_ind_size_06, replace

//-----------------------------------------------------------------------
//  READ/CODE 2007 DATA
//-----------------------------------------------------------------------
clear
insheet using susb_raw/us_6digitnaics_detailedsizes_2007.txt
generate year = 2007

// Only the top of the 2007 entrsize scale needs recoding: source codes
// 24-25 are pooled into lzsize 24 and 26-27 into lzsize 25. All other
// codes are carried over unchanged.
generate zsize = entrsize
label variable zsize "Longitudinal recode of entrsize"
deflzsize
label values zsize lzsize
recode zsize      ///
  (24 25 = 24)    /// 1500 - 2499
  (26 27 = 25)    // 2500 +
save x-susb_ind_size_07, replace

//-----------------------------------------------------------------------
//  COMBINE ALL YEARS / ADDITIONAL RECODES
//-----------------------------------------------------------------------

// The 2007 data are still in memory at this point; stack the earlier
// years underneath, most recent first.
foreach yy in 06 05 04 03 {
  append using x-susb_ind_size_`yy'
}

//impute employment for suppressed industry_size cells
//at the industry level many of the employment cells for
//larger firms are suppressed. as a rough approximation
//impute total category employment, by averaging across the
//reported range

//http://www.census.gov/econ/susb/data/download_references.html

// zempl is stored as double. -gen- without a type creates a float, which
// represents integers exactly only up to 16,777,216; larger employment
// counts would be rounded here, before the later collapse (sum).
gen double zempl = empl
label var zempl "Recode of empl with imputed missing data"

// Each flag letter stands for an employment range; zempl is set to the
// rounded midpoint of that range, and to the lower bound (100000) for the
// open-ended top flag.

//2007+ : range flag held in emplfl_r
replace zempl = round((0+19)/2) if emplfl_r == "A"
replace zempl = round((20+99)/2) if emplfl_r == "B"
replace zempl = round((100+249)/2) if emplfl_r == "C"
replace zempl = round((250+499)/2) if emplfl_r == "E"
replace zempl = round((500+999)/2) if emplfl_r == "F"
replace zempl = round((1000+2499)/2) if emplfl_r == "G"
replace zempl = round((2500+4999)/2) if emplfl_r == "H"
replace zempl = round((5000+9999)/2) if emplfl_r == "I"
replace zempl = round((10000+24999)/2) if emplfl_r == "J"
replace zempl = round((25000+49999)/2) if emplfl_r == "K"
replace zempl = round((50000+99999)/2) if emplfl_r == "L"
replace zempl = 100000 if emplfl_r == "M"

//2002-2006 : flag held in emplfl, lower case and in square brackets
replace zempl = round((0+19)/2) if emplfl == "[a]"
replace zempl = round((20+99)/2) if emplfl == "[b]"
replace zempl = round((100+249)/2) if emplfl == "[c]"
replace zempl = round((250+499)/2) if emplfl == "[e]"
replace zempl = round((500+999)/2) if emplfl == "[f]"
replace zempl = round((1000+2499)/2) if emplfl == "[g]"
replace zempl = round((2500+4999)/2) if emplfl == "[h]"
replace zempl = round((5000+9999)/2) if emplfl == "[i]"
replace zempl = round((10000+24999)/2) if emplfl == "[j]"
replace zempl = round((25000+49999)/2) if emplfl == "[k]"
replace zempl = round((50000+99999)/2) if emplfl == "[l]"
replace zempl = 100000 if emplfl == "[m]"

// fzempl is 1 whenever either flag variable is non-empty.
// NOTE(review): this also marks rows whose flag matches none of the
// replace rules above -- confirm no other flag values occur in the data.
gen fzempl = length(emplfl) > 0 | length(emplfl_r) > 0
label var fzempl "1 if zempl imputed" 

// Write out the 2007 rows on their own, without zsize so the original
// entrsize coding is what identifies the size class. preserve/restore
// brings the full pooled data back into memory afterwards.
preserve
  drop if year != 2007
  drop zsize
  save sbh-susb-static07, replace
restore

// ---------------------------------------------------------------------
// re-aggregate to zsize for pooled version
// ---------------------------------------------------------------------
// entrsize coding changes across years. to recode to consistent zsize
// some entrsize categories are combined within the year. need to 
// reaggregate to zsize groups so there aren't duplicate obs

// Drop the year-specific identifiers and flag variables that cannot be
// carried through the aggregation.
drop entrsize entrsizedscr
drop emplfl_r emplfl_n payrfl_n rcptfl_n emplfl payrfl
drop statedscr // all united states

// Rows whose zsize was recoded to missing belong to source size classes
// with no counterpart in the consistent coding. collapse treats missing
// as a by-group of its own, so leaving them in would add one extra row
// per industry-year with zsize == . that sums those unrelated classes.
drop if missing(zsize)

// Sum the counts within each year x industry x zsize cell; fzempl becomes
// 1 if any contributing row had a flag set.
collapse (sum) firm estb empl payr rcpt zempl (max) fzempl, ///
  by(year naics naicsdscr zsize)
order year naics naicsdscr zsize firm estb empl payr rcpt zempl fzempl


label data "created by susb1_cleanstatic.do"
save sbh-susb-static, replace

//erase the temporary per-year files
// Uses Stata's built-in -erase- instead of shelling out to the Windows
// -del- command, so the cleanup works on any operating system. -capture-
// keeps a missing file from stopping the do-file, as the shell call did.
foreach yy in 03 04 05 06 07 {
  capture erase x-susb_ind_size_`yy'.dta
}

cap log close
//exit


