/* ======================================================================
 * susb2_cdfs.do : generate the cdf for the susb datasets
 * 
 * Inputs:
 *  sbh-susb-static.dta : from previous step
 *  sbh-susb-static07.dta : from previous step
 *
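 * Outputs (both input files are overwritten in place, with the share,
 *  cdf, rank and moment variables computed here added):
 *  sbh-susb-static.dta : pooled cross sections 2003 to 2007 with
 *    consistent coding
 *  sbh-susb-static07.dta : cross section from 2007 with more detailed
 *    size categories and economic census variables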
 *
 * Notes:
 *
 * 050211 [benjamin] : created
 *
 * ======================================================================
 */

// open a fresh text log; close any log that is still open first
capture log close
log using sbh-data-susb2-cdfs.txt, text replace

// fix the interpreter version and start from an empty session
version 10
clear all
macro drop _all
graph drop _all

// session settings
set more off
set memory 500m
set matsize 400
set maxvar 10000
set varabbrev off   // variable names must be typed in full
set linesize 80

//=======================================================================
// part 1: 2007 specific data
//=======================================================================
// load the 2007 version, which keeps the original entrsize variable
use sbh-susb-static07, clear

// restrict to 2007: the EC variables are available for that year and
// entrsize has more categories
drop if year != 2007

// flag the summary rows (entrsize codes 1, 6 and 20)
generate summary = inlist(entrsize, 1, 6, 20)
label variable summary "1 if observation is a summary row"

// ---------------------------------------------------------------------
// ##1a: size distribution within an industry 
// ---------------------------------------------------------------------
// *note* the naics == "--" group (all industries) gives the marginal
// distribution

// each measure below is used as an alternative "density" over size
// classes: firms, establishments, employment, payroll, receipts
local measures firm estb empl payr rcpt

foreach m of local measures {
  // industry total: take the largest cell within the industry rather
  // than summing the non-summary cells, because some small size
  // categories are missing data. this is meant to pick up the all-sizes
  // cell. at very fine levels of aggregation the shares may then add up
  // to < 1, but the summary row shares should be more accurate.
  egen ind`m'tot = max(`m'), by(naics)
  label variable ind`m'tot "Total `m' within industry"
  // share of every size class in the industry total
  // (aggregate size categories such as < 20 get a share too)
  generate sz`m'shrind = `m' / ind`m'tot
}

// cumulate the shares over size classes to obtain one cdf per measure
sort naics entrsize
foreach m of local measures {
  // running sum over the non-summary rows only
  by naics: generate sz`m'cdfind = sum(sz`m'shrind) if summary == 0
  label variable sz`m'cdfind "size cls cumulative `m' share within industry"
  // position within the industry (all rows, including summary rows,
  // count toward _n, so ranks need not be consecutive)
  // NOTE(review): confirm that counting summary rows here is intended
  by naics: generate sz`m'rankind = _n if summary == 0
  label variable sz`m'rankind "size cls `m' rank within industry"
}

//moments of the firm size distribution (firm freq is density)
//use average employment within size class as center of mass for size class 
// both moments are computed over the non-summary rows only, so they are
// constant within an industry on those rows and missing on summary rows
egen indavgsz = sum(empl/firm * firm/indfirmtot) if !summary, by(naics)
egen indvarsz = sum((empl/firm - indavgsz)^2 * firm/indfirmtot) if !summary, by(naics)
label var indavgsz "Mean firm size (empl per firm) within industry"
label var indvarsz "Variance of firm size (empl per firm) within industry"
//copy to the summary rows
// take the industry-level value from the group maximum (max() ignores
// missing values, and the moment is constant within the industry)
// instead of reading row _n+1: that lookup leaves a summary row missing
// whenever it is the last row of its industry or is followed by another
// summary row, and it depends on the current sort order
tempvar avgfill varfill
egen `avgfill' = max(indavgsz), by(naics)
egen `varfill' = max(indvarsz), by(naics)
replace indavgsz = `avgfill' if summary
replace indvarsz = `varfill' if summary
drop `avgfill' `varfill'

// ---------------------------------------------------------------------
// ##1b: industry distribution within a size class 
// ---------------------------------------------------------------------
// *note* industries are ordered from highest to lowest share

// industries come at several levels of aggregation, and each level
// needs its own distribution. the level is the length of the naics code
generate naicsdig = length(naics)
replace naicsdig = 0 if naics == "--" //all industries
replace naicsdig = 2  if regexm(naics,"[0-9]-[0-9]") //some 2-digits combined

// only the firm and employment "densities" are of interest here
local measures firm empl

foreach m of local measures {
  // size class total: take the largest cell within the size class
  // (expected to be the "--" all-industries row) rather than summing
  // within each aggregation level. as above, the shares may not sum to
  // exactly one, but the shares of the combined size groups, like < 20,
  // will be better
  egen sz`m'tot = max(`m'), by(entrsize)
  label variable sz`m'tot "Total `m' within size class"
  // industry share of the size class total
  generate ind`m'shrsz = `m' / sz`m'tot
  label variable ind`m'shrsz "Industry share of `m' within size class"
}

foreach m of local measures {
  // within aggregation level and size class, largest share first
  gsort +naicsdig +entrsize -ind`m'shrsz
  // running sum of shares within size class
  by naicsdig entrsize: generate ind`m'cdfsz = sum(ind`m'shrsz)
  label variable ind`m'cdfsz "industry cumulative `m' share within size class"
  // position of the industry in that ordering
  by naicsdig entrsize: generate ind`m'ranksz = _n
  label variable ind`m'ranksz "industry rank within size class"
}

//-----------------------------------------------------------------------
// write the 2007 version of the susb data back to disk (overwrites the
// input file)
//-----------------------------------------------------------------------
label data "last modified by sbh-data-susb2-cdfs.do"
save sbh-susb-static07, replace
    
//=======================================================================
// part 2: pooled consistent 2003-2007 cross sections
//=======================================================================
use sbh-susb-static, clear

// flag the summary rows (zsize codes 1, 6 and 20); they stay in the data
generate summary = inlist(zsize, 1, 6, 20)
label variable summary "1 if observation is a summary row"

// ---------------------------------------------------------------------
// ##2a: size distribution within an industry 
// ---------------------------------------------------------------------
// *note* the naics == "--" group (all industries) gives the marginal
// distribution

// each measure below is used as an alternative "density" over size
// classes: firms, establishments, employment, payroll, receipts
local measures firm estb empl payr rcpt

foreach m of local measures {
  // industry-year total: take the largest cell within the industry and
  // year rather than summing the non-summary cells, because some small
  // size categories are missing data. this is meant to pick up the
  // all-sizes cell. at very fine levels of aggregation the shares may
  // then add up to < 1, but the summary row shares should be more
  // accurate.
  egen ind`m'tot = max(`m'), by(year naics)
  label variable ind`m'tot "Total `m' within industry"
  // share of every size class in the industry total
  // (aggregate size categories such as < 20 get a share too)
  generate sz`m'shrind = `m' / ind`m'tot
}

// cumulate the shares over size classes to obtain one cdf per measure
sort year naics zsize
foreach m of local measures {
  // running sum over the non-summary rows only
  by year naics: generate sz`m'cdfind = sum(sz`m'shrind) if summary == 0
  label variable sz`m'cdfind "size cls cumulative `m' share within industry"
  // position within the industry-year (all rows, including summary
  // rows, count toward _n, so ranks need not be consecutive)
  // NOTE(review): confirm that counting summary rows here is intended
  by year naics: generate sz`m'rankind = _n if summary == 0
  label variable sz`m'rankind "size cls `m' rank within industry"
}

//moments of the firm size distribution (firm freq is density)
//use average employment within size class as center of mass for size class
// both moments are computed over the non-summary rows only, so they are
// constant within an industry-year on those rows and missing on summary
// rows
egen indavgsz = sum(empl/firm * firm/indfirmtot) if !summary, by(year naics)
egen indvarsz = sum((empl/firm - indavgsz)^2 * firm/indfirmtot) if !summary, by(year naics)
label var indavgsz "Mean firm size (empl per firm) within industry"
label var indvarsz "Variance of firm size (empl per firm) within industry"
//copy to the summary rows
// take the industry-year value from the group maximum (max() ignores
// missing values, and the moment is constant within the group) instead
// of reading row _n+1: that lookup leaves a summary row missing whenever
// it is the last row of its group or is followed by another summary
// row, and it depends on the current sort order
tempvar avgfill varfill
egen `avgfill' = max(indavgsz), by(year naics)
egen `varfill' = max(indvarsz), by(year naics)
replace indavgsz = `avgfill' if summary
replace indvarsz = `varfill' if summary
drop `avgfill' `varfill'

//check how bad the approximation is as far as hitting average size
// a missing value compares as larger than any number, so a missing ratio
// would satisfy "> 0.20"; exclude those rows from the count of bad
// approximations and report them separately
disp "compare average size within industry to estimate from sample distribution"
count if abs(empl/firm/indavgsz-1) > 0.20 & !missing(empl/firm/indavgsz) & zsize == 1 & length(naics)==6
disp "rows where that comparison is undefined (missing ratio)"
count if missing(empl/firm/indavgsz) & zsize == 1 & length(naics)==6

// ---------------------------------------------------------------------
// ##2b: industry distribution within a size class 
// ---------------------------------------------------------------------
// *note* industries are ordered from highest to lowest share

// industries come at several levels of aggregation, and each level
// needs its own distribution. the level is the length of the naics code
generate naicsdig = length(naics)
replace naicsdig = 0 if naics == "--" //all industries
replace naicsdig = 2  if regexm(naics,"[0-9]-[0-9]") //some 2-digits combined

// only the firm and employment "densities" are of interest here
local measures firm empl

foreach m of local measures {
  // size class total per year: take the largest cell within the size
  // class (expected to be the "--" all-industries row) rather than
  // summing within each aggregation level. as above, the shares may not
  // sum to exactly one, but the shares of the combined size groups,
  // like < 20, will be better
  egen sz`m'tot = max(`m'), by(year zsize)
  label variable sz`m'tot "Total `m' within size class"
  // industry share of the size class total
  generate ind`m'shrsz = `m' / sz`m'tot
  label variable ind`m'shrsz "Industry share of `m' within size class"
}

foreach m of local measures {
  // within year, aggregation level and size class, largest share first
  gsort +year +naicsdig +zsize -ind`m'shrsz
  // running sum of shares within size class
  by year naicsdig zsize: generate ind`m'cdfsz = sum(ind`m'shrsz)
  label variable ind`m'cdfsz "industry cumulative `m' share within size class"
  // position of the industry in that ordering
  by year naicsdig zsize: generate ind`m'ranksz = _n
  label variable ind`m'ranksz "industry rank within size class"
}

//-----------------------------------------------------------------------
// write the pooled 2003 to 2007 consistent version of the susb data back
// to disk (overwrites the input file)
//-----------------------------------------------------------------------
label data "last modified by susb2_cdfs.do"
save sbh-susb-static, replace


// close the log and stop
capture log close
exit
 
