/* ======================================================================
 * kfs_read.do : import and recode the kfs data
 * 
 * Inputs:
 *  ./kfs_raw/kfs5_publicuse_21apr10.dta : public-use data file downloaded from the Kauffman Foundation website:
 *    http://www.kauffman.org/kfs/
 *
 * Outputs:
 *  ./sbh-kauffman.dta
 *
 * Notes:
 *
 * 050511 [benjamin] : adapted from erik's file
 *
 * ======================================================================
 */

/* Use ";" as the command delimiter from here on; every command below must
   end with a semicolon. */
#delimit ;

/* Drop whatever is in memory, then load the raw public-use file. */
clear ;
/* NOTE(review): "set mem" is only meaningful on older Stata releases; newer
   releases manage memory automatically -- confirm the target version. */
set mem 500m ;
use "./kfs_raw/kfs5_publicuse_21apr10.dta", clear; 

/* Convert the three NAICS code variables to numeric in place, so that the
   numeric comparisons on these variables work. */
destring naics_code_0 naics_code_2 naics_code_3 , replace ;

#delimit ;

/* ----------------------------------------------------------------------
 * Industry indicators built from naics_code_0.
 *
 * Each indicator is 1 when naics_code_0 equals one of the listed codes,
 * 0 for any other code, and missing (.) when naics_code_0 is missing (.).
 * The "if" qualifier leaves excluded observations at missing, so no
 * separate replace pass is needed.
 * Note that industry_ag_mining also takes code 22, and
 * industry_service_other also takes codes 92 and 56.
 * ---------------------------------------------------------------------- */

gen industry_ag_mining              = inlist(naics_code_0, 11, 21, 22) if naics_code_0 != . ;
gen industry_construction           = naics_code_0 == 23               if naics_code_0 != . ;
gen industry_manufacturing          = inlist(naics_code_0, 31, 32, 33) if naics_code_0 != . ;
gen industry_wholesale              = naics_code_0 == 42               if naics_code_0 != . ;
gen industry_retail                 = inlist(naics_code_0, 44, 45)     if naics_code_0 != . ;
gen industry_transport              = inlist(naics_code_0, 48, 49)     if naics_code_0 != . ;
gen industry_information            = naics_code_0 == 51               if naics_code_0 != . ;
gen industry_fire                   = inlist(naics_code_0, 52, 53)     if naics_code_0 != . ;
gen industry_service_rest_hotel     = naics_code_0 == 72               if naics_code_0 != . ;
gen industry_service_arts           = naics_code_0 == 71               if naics_code_0 != . ;
gen industry_service_prof_nonhealth = inlist(naics_code_0, 54, 55, 61) if naics_code_0 != . ;
gen industry_service_health         = naics_code_0 == 62               if naics_code_0 != . ;
gen industry_service_other          = inlist(naics_code_0, 81, 92, 56) if naics_code_0 != . ;



#delimit ;

/* ----------------------------------------------------------------------
 * Industry indicators built from naics_code_2 (variables suffixed _2).
 *
 * Each indicator is 1 when naics_code_2 equals one of the listed codes,
 * 0 for any other code, and missing (.) when naics_code_2 is missing (.).
 *
 * FIX: industry_service_prof_nonhlth_2 and industry_service_other_2 used
 * to test naics_code_3 for codes 55, 61, 92 and 56, so they mixed codes
 * from two different variables. Every test in this block now reads
 * naics_code_2. Writing the code list with inlist() names the source
 * variable once per line, which keeps this kind of slip from recurring.
 * ---------------------------------------------------------------------- */

gen industry_ag_mining_2            = inlist(naics_code_2, 11, 21, 22) if naics_code_2 != . ;
gen industry_construction_2         = naics_code_2 == 23               if naics_code_2 != . ;
gen industry_manufacturing_2        = inlist(naics_code_2, 31, 32, 33) if naics_code_2 != . ;
gen industry_wholesale_2            = naics_code_2 == 42               if naics_code_2 != . ;
gen industry_retail_2               = inlist(naics_code_2, 44, 45)     if naics_code_2 != . ;
gen industry_transport_2            = inlist(naics_code_2, 48, 49)     if naics_code_2 != . ;
gen industry_information_2          = naics_code_2 == 51               if naics_code_2 != . ;
gen industry_fire_2                 = inlist(naics_code_2, 52, 53)     if naics_code_2 != . ;
gen industry_service_rest_hotel_2   = naics_code_2 == 72               if naics_code_2 != . ;
gen industry_service_arts_2         = naics_code_2 == 71               if naics_code_2 != . ;
gen industry_service_prof_nonhlth_2 = inlist(naics_code_2, 54, 55, 61) if naics_code_2 != . ;
gen industry_service_health_2       = naics_code_2 == 62               if naics_code_2 != . ;
gen industry_service_other_2        = inlist(naics_code_2, 81, 92, 56) if naics_code_2 != . ;


#delimit ;

/* ----------------------------------------------------------------------
 * Industry indicators built from naics_code_3 (variables suffixed _3).
 *
 * Each indicator is 1 when naics_code_3 equals one of the listed codes,
 * 0 for any other code, and missing (.) when naics_code_3 is missing (.).
 * The "if" qualifier leaves excluded observations at missing, so no
 * separate replace pass is needed.
 * ---------------------------------------------------------------------- */

gen industry_ag_mining_3            = inlist(naics_code_3, 11, 21, 22) if naics_code_3 != . ;
gen industry_construction_3         = naics_code_3 == 23               if naics_code_3 != . ;
gen industry_manufacturing_3        = inlist(naics_code_3, 31, 32, 33) if naics_code_3 != . ;
gen industry_wholesale_3            = naics_code_3 == 42               if naics_code_3 != . ;
gen industry_retail_3               = inlist(naics_code_3, 44, 45)     if naics_code_3 != . ;
gen industry_transport_3            = inlist(naics_code_3, 48, 49)     if naics_code_3 != . ;
gen industry_information_3          = naics_code_3 == 51               if naics_code_3 != . ;
gen industry_fire_3                 = inlist(naics_code_3, 52, 53)     if naics_code_3 != . ;
gen industry_service_rest_hotel_3   = naics_code_3 == 72               if naics_code_3 != . ;
gen industry_service_arts_3         = naics_code_3 == 71               if naics_code_3 != . ;
gen industry_service_prof_nonhlth_3 = inlist(naics_code_3, 54, 55, 61) if naics_code_3 != . ;
gen industry_service_health_3       = naics_code_3 == 62               if naics_code_3 != . ;
gen industry_service_other_3        = inlist(naics_code_3, 81, 92, 56) if naics_code_3 != . ;

#delimit ;

/* ----------------------------------------------------------------------
 * Employee counts and size-class indicators.
 *
 * 1. The string values "25+" (c5_num_employees_0, c5_num_employees_2) and
 *    "30+" (c5_num_employees_3) are replaced by the bare number, so these
 *    observations carry the cap value itself from here on.
 * 2. The three count variables are converted to numeric in place.
 * 3. For each of the suffixes 0, 2 and 3, three indicators are created:
 *    count in [0,4], count in [0,9], count in [20,900].
 *
 * inrange() returns 0 when the count is missing, so a missing count gives
 * 0 in all three indicators rather than missing.
 * NOTE(review): the other indicators in this file are set to missing when
 * their source is missing; confirm that 0 is intended here.
 * NOTE(review): no indicator covers counts from 10 to 19 -- confirm that
 * this gap is intended.
 * ---------------------------------------------------------------------- */

replace c5_num_employees_0 = "25" if c5_num_employees_0 == "25+" ;
replace c5_num_employees_2 = "25" if c5_num_employees_2 == "25+" ;
replace c5_num_employees_3 = "30" if c5_num_employees_3 == "30+" ;

destring c5_num_employees_0 c5_num_employees_2 c5_num_employees_3, replace ;

foreach w in 0 2 3 { ;
    gen num_emp_0_4_`w'      = inrange(c5_num_employees_`w', 0, 4) ;
    gen num_emp_0_9_`w'      = inrange(c5_num_employees_`w', 0, 9) ;
    gen num_emp_20_above_`w' = inrange(c5_num_employees_`w', 20, 900) ;
} ;


/* Give the five weight variables shorter names. Only the names change;
   the values are untouched.
   NOTE(review): the numeric suffix and the "_long" tag are carried over
   from the original names; their exact meaning is not visible here --
   confirm against the codebook. */
rename wgt_final_0 weight_0 ; 
rename wgt_final_f2_2 weight_2 ; 
rename wgt_final_f3_3 weight_3 ; 
rename wgt_final_f123_long_3 weight_3_long ; 
rename wgt_final_f12_long_2 weight_2_long ; 

#delimit ;

/* ----------------------------------------------------------------------
 * Change in employee count relative to c5_num_employees_0, and indicators
 * for the change reaching a threshold.
 *
 *   delta_emp_0_3 = c5_num_employees_3 - c5_num_employees_0
 *   delta_emp_0_2 = c5_num_employees_2 - c5_num_employees_0
 *
 * A delta is missing (.) whenever either count is missing. Each indicator
 * is 1 when the delta is at least the threshold, 0 when it is smaller,
 * and missing when the delta is missing.
 *
 * Thresholds are 1, 5 and 10 for delta_emp_0_3 but only 5 and 10 for
 * delta_emp_0_2; no delta_emp_0_2_more1 is created.
 * ---------------------------------------------------------------------- */

gen delta_emp_0_3 = c5_num_employees_3 - c5_num_employees_0 ;
gen delta_emp_0_2 = c5_num_employees_2 - c5_num_employees_0 ;

foreach k in 1 5 10 { ;
    gen delta_emp_0_3_more`k' = delta_emp_0_3 >= `k' if delta_emp_0_3 != . ;
} ;

foreach k in 5 10 { ;
    gen delta_emp_0_2_more`k' = delta_emp_0_2 >= `k' if delta_emp_0_2 != . ;
} ;


#delimit ;

/* ----------------------------------------------------------------------
 * Patent, copyright and trademark counts, and ownership indicators.
 *
 * 1. In the _3 variables the string values "30+" (patents), "200+"
 *    (copyrights) and "25+" (trademarks) are replaced by the bare number.
 *    NOTE(review): no such recode is applied to the _0 and _2 variables.
 *    If they contain similar "N+" strings, destring will leave them as
 *    strings and the numeric comparisons further down will fail --
 *    confirm against the raw data.
 * 2. All nine count variables are converted to numeric in place.
 * 3. For suffixes 0 and 3:
 *      have_patent_W            1 if patent count > 0, else 0
 *      have_copyright_W         1 if copyright count > 0, else 0
 *      have_patent_copyright_W  1 if either of the above is 1, else 0
 *    Each is missing (.) when its source count is missing (.); the
 *    combined indicator is missing when either count is missing, even if
 *    the other count is positive.
 *    No such indicators are created for suffix 2.
 * ---------------------------------------------------------------------- */

replace total_patents_3    = "30"  if total_patents_3    == "30+" ;
replace total_copyrights_3 = "200" if total_copyrights_3 == "200+" ;
replace total_trademarks_3 = "25"  if total_trademarks_3 == "25+" ;

destring total_patents_0 total_copyrights_0 total_trademarks_0
         total_patents_2 total_copyrights_2 total_trademarks_2
         total_patents_3 total_copyrights_3 total_trademarks_3, replace ;

foreach w in 0 3 { ;
    gen have_patent_`w' = total_patents_`w' > 0 & total_patents_`w' < .
        if total_patents_`w' != . ;
    gen have_copyright_`w' = total_copyrights_`w' > 0 & total_copyrights_`w' < .
        if total_copyrights_`w' != . ;
    gen have_patent_copyright_`w' = have_patent_`w' == 1 | have_copyright_`w' == 1
        if total_patents_`w' != . & total_copyrights_`w' != . ;
} ;

#delimit ;

/* ----------------------------------------------------------------------
 * Trademark indicator and combined IP indicator (suffix _3 only).
 *
 *   have_trademark_3       1 if total_trademarks_3 > 0, else 0;
 *                          missing (.) when total_trademarks_3 is missing.
 *   have_pat_trade_copy_3  1 if any of have_patent_3, have_copyright_3,
 *                          have_trademark_3 is 1, else 0; missing when any
 *                          of the three underlying counts is missing.
 *
 * Each flag is compared with 1 explicitly, because a bare variable in a
 * logical expression counts a missing value as true.
 * ---------------------------------------------------------------------- */

gen have_trademark_3 = total_trademarks_3 > 0 & total_trademarks_3 < .
    if total_trademarks_3 != . ;

gen have_pat_trade_copy_3 = have_patent_3 == 1 | have_copyright_3 == 1 | have_trademark_3 == 1
    if total_patents_3 != . & total_copyrights_3 != . & total_trademarks_3 != . ;


/* Write the recoded data set to sbh-kauffman.dta in the current working
   directory, overwriting any existing file of that name. */
save sbh-kauffman, replace;


