/* ======================================================================
 * susb4_cleandyn.do : clean converted xls SUSB dynamic data 
 *  and merge it with the pooled cross sections. 
 * 
 * Inputs:
 *  ./susb_raw/sbh-susb-raw-dyn*.dta : converted xls files from
 *    susb3_readdyn.do
 *  ./sbh-susb-static.dta : pooled 2003-2007 cross sections
 *
 * Outputs:
 *  ./sbh-susb-dyn.dta : pooled two year panel for years
 *    2003-2006
 *
 * Notes: 
 *
 * 050211 [benjamin] : created
 *
 * ======================================================================
 */

// ---- session setup ---------------------------------------------------
// Close any log still open from an earlier run, then start a fresh
// plain-text log for this script.
capture log close
log using sbh-data-susb4-cleandyn.txt, text replace

// Interpret the rest of the file under Stata 10 rules.
version 10

// Start from an empty session: no data, no macros, no graphs.
clear all
macro drop _all
graph drop _all

// Never pause output waiting for a keypress.
set more off

// Resource limits.
set memory 500m
set maxvar 10000
set matsize 400

// Display / parsing settings: 80-column log, no variable abbreviations.
set linesize 80
set varabbrev off

//=======================================================================
// DYNAMIC DATA: EMPLOYMENT/FIRMS/ESTABS/PAYROLL
//=======================================================================

// Reshape the converted two year panel data
//
// For each initial year yy this loop
//   1. loads the converted spreadsheet susb_raw/sbh-susb-raw-dyn`yy',
//   2. recodes the string cells e1-e4 into numeric data1-data4 (a range
//      flag such as "[a]" becomes the rounded midpoint of its range),
//   3. builds the < 20 employee class (zsize 6) from classes 2-4,
//   4. reshapes to one row per (naics, zsize) with one column per data type,
//   5. saves the result, sorted by year naics zsize, as x-temp_dyn_`yy'.
//
//years 2003-2004 through 2006-2007 all use NAICS 2002 codes

// Range flags and the bounds of the range each one is recoded from.
// "[m]" has no paired bounds here; it is recoded to 100000 below.
local flags a b c e f g h i j k l
local lower 0 20 100 250 500 1000 2500 5000 10000 25000 50000
local upper 19 99 249 499 999 2499 4999 9999 24999 49999 99999
local nflags : word count `flags'

foreach yy in 03 04 05 06 {
  use susb_raw/sbh-susb-raw-dyn`yy', clear

  // Level data = every data type except 3, 8, 9, 12 and 17 or above
  // (the percent-change series in the table further down). A missing
  // datatype is never treated as level data.
  gen level = !(datatype == 3 | datatype == 8 | datatype == 9 ///
    | datatype == 12 | datatype >= 17) & !missing(datatype)
  label var level "1 if observation is level data"

  forvalues i = 1/4 {
    // data`i' starts as the raw string cell (level rows only)
    gen data`i' = e`i' if level
    label var data`i' "Recoded version of e`i' zsize `i'"

    // any lower-case letter a-m in the cell marks it as a range flag
    gen fdata`i' = regexm(e`i',"[a-m]")
    label var fdata`i' "1 if data`i' is imputed"

    // replace each range flag by the rounded midpoint of its range
    forvalues k = 1/`nflags' {
      local flag : word `k' of `flags'
      local lo : word `k' of `lower'
      local hi : word `k' of `upper'
      replace data`i' = string(round((`lo'+`hi')/2)) if e`i' == "[`flag']"
    }
    replace data`i' = string(100000) if e`i' == "[m]"

    // NOTE(review): the ignore() argument is unquoted; newer Stata
    // documents ignore("chars") -- confirm this still strips "," "(" "D" ")"
    destring data`i', ignore(,(D)) replace
  }

  //create the entrsize == 6 (<20 emps) category
  gen data6 = data2 + data3 + data4 if level
  label var data6 "Data for firms with < 20 employees"
  // data6 is imputed as soon as ANY of its components is imputed. Summing
  // the three flags could give 2 or 3, which is not the 0/1 indicator the
  // label promises, so combine them with a logical OR.
  gen fdata6 = (fdata2 | fdata3 | fdata4)
  label var fdata6 "1 if data6 is imputed"

  drop data2 data3 data4
  drop fdata2 fdata3 fdata4

  disp "dyn `yy': reshaping size classes long"
  // stubs data/fdata carry suffixes 1 and 6, which become zsize
  reshape long fdata data, i(naics datatype) j(zsize) //the recoded entrsize

  //only keep the level variables
  drop if !level //may be missing values so avoid keep if

  drop e1 e2 e3 e4 e5 e6 e7 e8

  drop datatypedscr
  //copied from spreadsheet data types
  //X = dropped above because it is not level data (see `level')
  //01  Initial year establishments
  //02  Change in establishments
  //03  Percent change in establishments X
  //04  Establishment births
  //05  Establishment deaths
  //06  Establishment expansions
  //07  Establishment contractions
  //08  Percent change in establishments due to births X
  //09  Percent change in establishments due to deaths X
  //10  Initial year employment
  //11  Change in employment
  //12  Percent change in employment X
  //13  Change in employment due to births
  //14  Change in employment due to deaths
  //15  Change in employment due to expansions
  //16  Change in employment due to contractions
  //17  Percent change in employment due to births X
  //18  Percent change in employment due to deaths X
  //19  Percent change in employment due to expansions & births X
  //20  Percent change in employment due to contractions & deaths X

  disp "dyn `yy': reshaping data types wide"
  //put data types across
  reshape wide fdata data, i(naics zsize) j(datatype)

  // establishment counts and flows
  ren data1 estb_init
  ren data2 estb_delta
  ren data4 estb_births
  ren data5 estb_deaths
  ren data6 estb_expand
  ren data7 estb_shrink

  // employment levels and flows
  ren data10 empl_init
  ren data11 empl_delta
  ren data13 empl_births
  ren data14 empl_deaths
  ren data15 empl_expand
  ren data16 empl_shrink

  // sorted on the merge key used by the next step
  sort year naics zsize
  save x-temp_dyn_`yy', replace
}

//now merge the dynamic data with the static files
//
// For each year: load that year's rows of the pooled static file, match
// them to the reshaped dynamic data on (year naics zsize), flag matched
// rows in fdyn, and save the result as x-susb_ind_size_dyn_`yy'.
foreach yy in 03 04 05 06 {
  // documented subset form of -use-: [if] comes before -using-
  use if year == 20`yy' using "sbh-susb-static", clear
  sort year naics zsize
  // old-style match merge (version 10): both sides must be sorted on the key
  merge year naics zsize using x-temp_dyn_`yy'
  gen fdyn = (_merge == 3)
  label var fdyn "1 if matched to dynamic data"
  drop _merge
  save x-susb_ind_size_dyn_`yy', replace

  //get rid of the temp data
  // -erase- is built into Stata, so this works on any operating system;
  // -capture- keeps the cleanup best-effort, as the shell call was.
  capture erase x-temp_dyn_`yy'.dta
}

//-----------------------------------------------------------------
//  COMBINE YEARS 2003 TO 2006 (NAICS 2002)
//-----------------------------------------------------------------
// Stack the four merged year files into one dataset. Only 2003 to 2006
// are pooled, since they all use naics 2002.
use x-susb_ind_size_dyn_03
append using x-susb_ind_size_dyn_04
append using x-susb_ind_size_dyn_05
append using x-susb_ind_size_dyn_06

//-----------------------------------------------------------------
//  CHECK COMBINED DYNAMIC DATA
//-----------------------------------------------------------------
// Sanity check: employment shares computed from the dynamic data
// (empl_init) should line up with the shares computed from the static
// data (empl).
sort year naics zsize

// employment variable belonging to each source
local src_static empl
local src_dyn empl_init

// yearly totals over the zsize == 1 rows, one per source
foreach s in static dyn {
  egen all_empl_total_`s' = sum((zsize==1)*`src_`s''), by(year)
}

// each row's share of its year's total, one per source
foreach s in static dyn {
  gen all_empl_share_ind_size_`s' = `src_`s'' / all_empl_total_`s'
}

//usually within 1e-4 of each other
by year: summarize all_empl_share_ind_size_static all_empl_share_ind_size_dyn ///
  if naics == "--" & zsize == 6

// the helper variables were only needed for the check
capture drop all_empl_total_static all_empl_total_dyn ///
  all_empl_share_ind_size_static all_empl_share_ind_size_dyn


//=========================================================
//  MEASURES JOB CREATION/DESTRUCTION/REALLOCATION
//=========================================================

//define rates using the Davis & Haltiwanger conventions
//denom is (empl_t + empl_t-1) / 2, which produces a symmetric
//growth rate; empl_t is recovered as empl_init + empl_delta
gen denom = (empl_init + (empl_delta + empl_init))/2

gen net_rate = empl_delta / denom
label var net_rate "Net job creation rate"

//we treat expansions and births separately
gen jc_rate = empl_expand / denom
label var jc_rate "Gross job creation (continuing estb) rate"

gen jb_rate = empl_births / denom
label var jb_rate "Gross job creation (new estb) rate"

gen jd_rate = (empl_shrink + empl_deaths) / denom
label var jd_rate "Gross job destruction rate"

// NOTE(review): subtracting jd_rate adds destruction to creation only if
// empl_shrink / empl_deaths are stored as negative changes -- confirm
// against the raw spreadsheets.
gen excess_realloc_rate = jc_rate + jb_rate - jd_rate - abs(net_rate)
label var excess_realloc_rate "Excess job reallocation rate"

//cleanup
drop denom

//add industry totals to the small firm obs
// Each ind_* variable copies the zsize == 1 value of the rate onto every
// row of the same (year, naics) cell. Stale copies are dropped one name at
// a time: -drop- with a list fails as a whole when any single name is
// absent, so a captured multi-variable drop would silently remove nothing.
capture drop ind_delta_rate  // legacy name for ind_net_rate
foreach r in net_rate jc_rate jd_rate jb_rate excess_realloc_rate {
  capture drop ind_`r'
  egen ind_`r' = sum((zsize == 1)*`r'), by (year naics)
  local lbl : variable label `r'
  label var ind_`r' "Industry (zsize 1): `lbl'"
}

label data "last modified by susb4_cleandyn.do"
save sbh-susb-dyn, replace

//cleanup
// Remove the per-year intermediate files. -erase- is a Stata built-in, so
// this does not depend on a Windows shell; -capture- keeps the cleanup
// best-effort (a file that is already gone does not stop the script).
foreach yy in 03 04 05 06 {
  capture erase x-susb_ind_size_dyn_`yy'.dta
}

cap log close
exit


