***********************************************************
* Labor Income Dynamics project
* 
***********************************************************
* This program reads in PSID_LabIncHD_Sample.dta, which contains records 
* from the 1987-2009 PSID surveys.  
*
* It then runs the first stage regression of labor income
* on demographics;
*
* It finally computes covariances 
*************************************************************

#delimit;

* ----- Session setup: start from a clean slate ----- ;
clear;
program drop _all;
clear matrix;
set matsize 1000;
* NOTE(review): set memory only matters in older Stata releases - confirm it is still wanted for the version in use ;
set memory 4000m;
set more off;
* Close any log left open by an earlier run and ignore the error if none is open ;
cap log close;


* ----- Paths ----- ;
* All four locations currently point at the same folder, so they are derived ;
* from a single root.  Change rootpath to move everything at once, or override ;
* an individual local to split the folders apart ;
local rootpath "/Users/jasondebacker/Econ/Research/IncomeInequality/PSID_data_file" ;
local datapath   "`rootpath'" ;
local logpath    "`rootpath'" ;
local graphpath  "`rootpath'" ;
local outputpath "`rootpath'" ;


log using "`logpath'/PSID_labincHD_CAL.log", replace;
* The path is quoted so that a folder name containing blanks does not break cd ;
cd "`datapath'" ;

*** Load the estimation sample (file received from Ivan) *** ;
use "`datapath'/PSID_LabIncHD_Sample.dta", clear;
compress;

* Confidence intervals for every variable ;
* NOTE(review): ci without a subcommand is the legacy syntax - confirm it runs on the Stata version in use ;
ci;



* Descriptive statistics for the key variables ;
summarize age year;
ci LabIncHD Wages2HD;

* Participation pattern of the panel ;
* NOTE(review): this presumably relies on panel settings stored in the data file - verify ;
xtdescribe;


***** Perform separate regression for each year ******;
* Dependent variable: natural log of head labor income ;
gen labinc = ln(LabIncHD);

* One indicator per age from 25 to 60.  The regressions further down use ;
* age_26-age_60, which leaves age 25 as the omitted category ;
* The indicators are set to missing when age itself is missing.  Without the ;
* condition the comparison would evaluate to 0 and an observation with unknown ;
* age would silently be treated as belonging to the omitted category ;
* NOTE(review): ages outside 25-60 still get zeros on every indicator - confirm the sample is already restricted to that range ;
forvalues h = 25(1)60 {;
   gen age_`h' = (age==`h') if !missing(age);
};


* Year-by-year first stage ;
* For each year present in the data, regress log labor income on the age ;
* indicators and keep the residual (res) and the fitted value (labincHAT) ;
* Every year is handled by the same loop and the result is written to disk ;
* once at the end, so no year is hard-coded as the starting point and the ;
* output file is not re-read and re-written for each additional year ;

* Observations without a year cannot be assigned to any regression and are ;
* therefore left out of the residual file ;
drop if missing(year);

gen res = .;
gen labincHAT = .;
label variable res "Residuals";
label variable labincHAT "Fitted values";

levelsof year, local(levels);
foreach l of local levels {;

    reg labinc age_26-age_60 if year==`l';

    * Predictions go to temporary variables first and are then copied into ;
    * the rows of the current year only ;
    tempvar uhat yhat;
    predict `uhat' if year==`l', resid;
    predict `yhat' if year==`l', xb;
    replace res       = `uhat' if year==`l';
    replace labincHAT = `yhat' if year==`l';
    drop `uhat' `yhat';

    *hist res, bin(50) yscale(range(0 0.02)) saving(res, replace);

    summ res if year==`l', detail;

};

save "`datapath'/FinalSampleResids_labinc_CAL.dta", replace;


* Reload the pooled residual file and declare it as a panel ;
use "`datapath'/FinalSampleResids_labinc_CAL.dta", clear;

tsset issnm year;

* First difference plus first and second lag of the residual ;
* NOTE(review): the time operators step by one year, so they presumably come out missing wherever the previous calendar year is not in the data - confirm this is intended ;
gen dres = D.res;

gen lagres  = L.res;
gen lag2res = L2.res;

* Autoregressions of the residual on one and on two lags ;
regress res lagres;
regress res lagres lag2res;

* Year-by-year pairwise correlations, first for the level residual and then ;
* for its first difference.  Each pass reshapes a scratch copy to wide form ;
* and restores the long data afterwards ;
foreach v in res dres {;
    preserve;
    keep issnm year `v';
    reshape wide `v', i(issnm) j(year);
    pwcorr `v'*, obs;
    restore;
};

* Store the file again, now including dres and the lag variables ;
save "`datapath'/FinalSampleResids_labinc_CAL.dta", replace;
#delimit cr;


* This part of the do-file does two things: 1. First it computes autocovariances  
* of the regression residuals using the data in dataset InData.  It keeps 
* the computed autocovariances (and the number of observations used to compute
*  each) in scalars with names of the form cov_XXXXX & wgt_XXXXX.
* 2. Then it uses an 'auxiliary' dataset (which provides all values 
* that each relevant variable could possibly take) to turn these 
* autocovariances (and weights) into a separate Stata dataset that will 
* be used to estimate parameters using nonlinear least squares.
*
* NOTE: This version of the program does NOT keep track of calendar year;
* it assumes that only age (or potential experience) matters. Also, it treats
* all individuals as belonging to the same cohort.

* Start the covariance stage with empty data and no leftover scalars
clear
scalar drop _all
set more off



**************************************************************************
* Parameters for the covariance stage
**************************************************************************

* First, second and last calendar year used
local year1 = 1987
local year2 = 1988
local yearT = 2008

* Largest value of PE that is kept
local maxPE = 36
* Largest lag allowed (a large value simply never binds)
local maxJ = 100


* Input: residual file written by the first part of this do-file
local InData   "`datapath'/FinalSampleResids_labinc_CAL.dta"
* Input: auxiliary grid of (year, PE, lag) combinations
local AuxData  "`datapath'/AuxDataCAL_lead_labinc_2009.csv"


**************************************************************************
* Read input data (residuals in LEV & FD).
**************************************************************************

* Load only the needed variables.  The file name comes from the InData local
* defined with the other parameters, so the path is written in one place only.
use  issnm year PE res dres  using "`InData'", clear

		
**************************************************************************
* 1. Put data into right form. 
**************************************************************************

* Keep the calendar-year window and cap PE.  Missing values compare as larger
* than any number in Stata, so rows with missing year or PE are dropped too.
keep if (year>=`year1' & year<=`yearT')
keep if (PE<=`maxPE')

* Scratch files used by the steps that follow
tempfile temp1
tempfile temp2

sort issnm year
save `temp1', replace

*********************************************************
* Create the variable holding PE in the first calendar
* year (local year1), for every individual.
*
* The per-person minima are computed in place with egen,
* so no collapse, second scratch file or join is needed
* and no merge indicator variable is left behind.
*********************************************************

use `temp1', clear

* Earliest year observed and smallest PE observed for each individual
egen double minyear = min(year), by(issnm)
egen double minPE   = min(PE),   by(issnm)

* Number of years between the first year observed and the first calendar year
gen double DiffTo`year1' = minyear - `year1'

* Step PE back by that many years to obtain PE in the first calendar year
* NOTE(review): this assumes PE rises by one per calendar year, so that the
* smallest PE belongs to the earliest year - confirm
gen double PE`year1' = minPE - DiffTo`year1'

sort issnm year
save `temp1', replace

*********************************************************
* Go from long to wide: one row per individual, with one
* level residual and one differenced residual per year.
*********************************************************

keep issnm year res dres PE`year1'

* A trailing underscore separates the stub from the year after reshaping
rename res  res_
rename dres dres_

reshape wide res_ dres_, i(issnm) j(year)


* Roll PE forward one calendar year at a time, from the second year to the
* last one.  A missing value in the previous year stays missing.

local prev = `year1'
forvalues yr = `year2'/`yearT' {
  gen PE`yr' = PE`prev' + 1  if !missing(PE`prev')
  local prev = `yr'
}


* Split each yearly residual by PE: the new variable for a given year and PE
* value h carries the residual of individuals whose PE equals h in that year
* and is missing for everybody else.
* Years run annually through 1996 and every second year from 1998 onwards.

* Levels, annual years
forvalues yr = `year1'/1996 {
  forvalues h = 1/`maxPE' {
    gen res_`yr'_`h' = res_`yr' if (PE`yr'==`h')
  }
}

* First differences, annual years (start at the second year and at PE of 2)
forvalues yr = `year2'/1996 {
  forvalues h = 2/`maxPE' {
    gen dres_`yr'_`h' = dres_`yr' if (PE`yr'==`h')
  }
}

* Levels, every second year from 1998
forvalues yr = 1998(2)`yearT' {
  forvalues h = 1/`maxPE' {
    gen res_`yr'_`h' = res_`yr' if (PE`yr'==`h')
  }
}

* First differences, every second year from 1998
forvalues yr = 1998(2)`yearT' {
  forvalues h = 2/`maxPE' {
    gen dres_`yr'_`h' = dres_`yr' if (PE`yr'==`h')
  }
}

**************************************************************************
* Compute covariances & weights and save as scalars. 
**************************************************************************

tempfile resdata

* LEVELS.
* For every base year yr, PE value h and lag j (j = 0 gives the variance) the
* covariance between the residual at (yr, h) and the residual at (yr+j, h+j)
* is stored in scalar cov_yr_h_j, and the number of observations used is
* stored in scalar wgt_yr_h_j.
*
* Base years are annual through 1996 and every second year from 1998 on.
* The lag is capped by maxJ, by the distance to maxPE and by the distance to
* the last year.
*
* correlate is run under capture.  ANY non-zero return code (no observations,
* a target variable that does not exist because the year is not in the data,
* or anything else) is treated as "no estimate": covariance missing, weight
* zero.  Checking only selected codes would let other failures fall through
* and read r() results that the failed command did not set.

foreach yr of numlist `year1'/1996 1998(2)`yearT' {
forvalues h  = 1/`maxPE' {

	local JJ = min(`maxJ',`maxPE'-`h',`yearT'-`yr')
	forvalues j = 0/`JJ' {

		local hpj = `h' +`j'
		local ypj = `yr'+`j'

		capture correlate  res_`yr'_`h'  res_`ypj'_`hpj', cov

		if _rc {
			scalar cov_`yr'_`h'_`j' = .
			scalar wgt_`yr'_`h'_`j' = 0
		}
		else {
			scalar cov_`yr'_`h'_`j' = r(cov_12)
			scalar wgt_`yr'_`h'_`j' = r(N) 
		}
	}
}
}


* Next, in FIRST DIFFERENCES
* The whole first-difference computation below is commented out, so no
* covfd_ / wgtfd_ scalars are created.  The comment markers are spliced
* around the original inline comments; take care when re-enabling it.
* As written, the disabled code treats only return code 2000 as a failure.
/* forvalues yr = `year2'(1)`yearT' {
forvalues h  = 2(1)`maxPE' {

	local JJ = min(`maxJ',`maxPE'-`h',`yearT'-`yr')   *//* j loops bet. 1 & min(maxJ,H-h,T-t) */
/*		forvalues j = 0(1)`JJ' {            *//* start j at zero to compute variance too */

/*		local hpj = `h' +`j'
		local ypj = `yr'+`j'

		capture correlate  dres_`yr'_`h'  dres_`ypj'_`hpj', cov

		if _rc==2000 {
			scalar covfd_`yr'_`h'_`j' = .
			scalar wgtfd_`yr'_`h'_`j' = 0
		}
		else {
			scalar covfd_`yr'_`h'_`j' = r(cov_12)
			scalar wgtfd_`yr'_`h'_`j' = r(N) 
		}

	}
}
}
*/
* Park the wide residual data in a temporary file, then empty memory.
* Scalars are not affected by dropping the data.
save `resdata'
drop _all


**************************************************************************
* Turn covariances and weights (scalars) into a separate Stata dataset.
**************************************************************************

* Read (auxiliary) dataset (the one used here was created in Matlab).
* The loops below expect it to contain the columns yrvar, hvar and jvar.

*infile yrvar hvar jvar hpjvar  using  `AuxData'
insheet using "`AuxData'", comma

* LEVELS: copy each stored covariance and weight into the row of the
* auxiliary grid with the same year, PE value and lag.
* The scalars are referenced through scalar() so that a variable with the
* same name, or one whose name abbreviates to it, can never be picked up
* instead.
* NOTE(review): covvar and wgtvar use the default storage type, so the
* covariances are presumably kept at float precision - confirm this is enough.
gen covvar = .
gen wgtvar = .

foreach yr of numlist `year1'/1996 1998(2)`yearT' {
forvalues h = 1/`maxPE' {

	local JJ = min(`maxJ',`maxPE'-`h',`yearT'-`yr')
	forvalues j = 0/`JJ' {	     /* start j at zero to keep variances too */
			
		replace covvar = scalar(cov_`yr'_`h'_`j')  if (yrvar==`yr' & hvar==`h' & jvar==`j')

		replace wgtvar = scalar(wgt_`yr'_`h'_`j')  if (yrvar==`yr' & hvar==`h' & jvar==`j')
	}
}
}

* Next, in FIRST DIFFERENCES
* This part is commented out: covvarfd and wgtvarfd are never created, which
* is why the active drop statement further down tests covvar only.
/*gen covvarfd = .
gen wgtvarfd = .

forvalues yr = `year2'(1)`yearT' {
forvalues h  = 2(1)`maxPE' {

	local JJ = min(`maxJ',`maxPE'-`h',`yearT'-`yr')
	forvalues j = 0(1)`JJ' {	     *//* start j at zero to keep variances too */
			
/*		replace covvarfd = covfd_`yr'_`h'_`j'  if (yrvar==`yr' & hvar==`h' & jvar==`j')

		replace wgtvarfd = wgtfd_`yr'_`h'_`j'  if (yrvar==`yr' & hvar==`h' & jvar==`j')
	}
}
}
*/

* Remove grid rows for which no covariance is available, then write the
* final covariance dataset.
*drop if (covvar==. & covvarfd==.)
drop if (covvar==.)

		save "`datapath'/CovsData_PSID_labinc_CAL.dta", replace
		

* Close the log; capture suppresses the error if no log is open.
capture log close 

