clear;

% Raw firm-level panel; everything below reads it from the matrix `data`.
load('alldata98-09');

%% keep only observations dated before 2008 (drops 2008 and 2009)
keep_rows=find(data(:,2)<2008);
data=data(keep_rows,:);

% human capital: columns 3-8 of data_hc are copied into columns 31-36 of data
% NOTE(review): assumes the rows of data_hc line up one-to-one with the raw
% rows of data before the year filter -- TODO confirm
load('data_hc');
data(:,31:36)=data_hc(keep_rows,3:8);

cond_index=1; % 0 all data; 1 drop non-positive va, rk and emp

% note that none of the variables have been deflated
% 1newid 2year 3cic2 4output 5va_tax 6va_input 7rk 8emp 9sales 10cost 11sales2
% 12ckjhz (export) 13tprofit 14gjzbj 15frzbj 16allzbj 17gykgqk (state share)
% 18gzw 19age 20djzclx 21lsgx 22va_defl 23bnzj (depreciation this year) 24ljzj (accumulated depreciation)
% 25gdzcyjhj (fixed asset original value) 26gdzchj (fixed asset) 27va_data 28number 29labor_compensation

% nominal va = output + va_tax - va_input, stored in column 40
data(:,40)=data(:,4)+data(:,5)-data(:,6);

%% order the rows by firm id (col 1), then by year (col 2)
data=sortrows(data, [1 2]);

%% drop the rows in which va, rk or emp is missing
has_missing=isnan(data(:,40)) | isnan(data(:,7)) | isnan(data(:,8));
data=data(~has_missing,:);

%% drop va, rk and emp <= 0
if cond_index == 1
    % keep only rows with strictly positive va, rk and emp
    all_positive=(data(:,40)>0) & (data(:,7)>0) & (data(:,8)>0);
    data=data(all_positive,:);
end

%% redefine variables
% Number of observations (rows of data). size(data,1) is used instead of
% length(data): length() returns the LARGEST dimension, and data has at least
% 40 columns, so a sample with fewer rows than columns would be miscounted.
n=size(data,1);
newid=data(:,1);
year=data(:,2);
cic2=data(:,3);
rk=data(:,7);
emp=data(:,8);
va=data(:,40);          % nominal value added built in column 40
output=data(:,4);
ls=data(:,21);          % lsgx
wl=data(:,29);          % labor compensation
tprofit=data(:,13);
sales=data(:,9);
sprofit=data(:,9)-data(:,10);   % sales minus cost
age=data(:,19);
bk=data(:,25)-data(:,24);       % fixed asset original value minus accumulated depreciation

% adjusted va for computing labor income share:
% labor compensation + va_tax + current depreciation + total profit
va_zxd=data(:,29)+data(:,5)+data(:,23)+data(:,13);

%% define SOE: gykgqk==1 or state capital share>=0.5
gykgqk=data(:,17);  gjzbj=data(:,14);  allzbj=data(:,16);
% the share test only counts when total registered capital is positive
soe= (gykgqk==1 | ((gjzbj./allzbj)>=0.5 & allzbj>0));
% define SOE_div: soe=1 but the registration type (dj) is NOT one of the SOE codes
dj=data(:,20);
soe_div= (soe==1 & dj ~= 110 & dj ~= 141 & dj ~= 151);
% define SOE by registration type (dj equal to 110, 141 or 151)
soe_dj= (dj == 110 | dj == 141 | dj == 151);
% registration capital shares (NaN or Inf wherever allzbj is 0)
frzbj=data(:,15);
fr=frzbj./allzbj;
gj=gjzbj./allzbj;

%% lag index
% lag_index(i)=1 when row i has the same newid as row i-1 (i.e. it is a later
% observation of the same firm), 0 when row i starts a new firm.
lag_index=zeros(n,1);
lag_index(2:n,1)=newid(2:n,1)==newid(1:n-1,1);

%% firm age in the sample
% age_sample(i) = year(i) minus the year of the firm's first row.
% A single forward pass carries the current firm's first year along. This
% also fills the very last row of the file, which the previous nested loop
% (bounded by n-1) never reached and therefore left at 0.
age_sample=zeros(n,1);
year_ini=NaN;
for i=1:n
    if lag_index(i)==0
        year_ini=year(i);   % row i opens a new firm
    end
    age_sample(i)=year(i)-year_ini;
end

% check
% xxx(:,1)=year;
% xxx(:,2)=lag_index;
% xxx(:,3)=age_sample;
% xxx(1:50,:)

%% find firm's location of first sample and last sample
% first(i) / last(i) = row number of the first / last observation of the firm
% that row i belongs to.
% Built without the temp_f/temp_l scan: that scan never assigned temp_l when
% the whole sample was a single firm, which raised an undefined-variable error.
is_start=(lag_index==0);                 % rows that open a firm
start_rows=find(is_start);
end_rows=[start_rows(2:end)-1; n];       % a firm ends right before the next one starts
firm_no=cumsum(is_start);                % running firm counter, one value per row
first=reshape(start_rows(firm_no),n,1);
last=reshape(end_rows(firm_no),n,1);

% dummy variables for a firm's first and last sample
% d_first(i)=1 if row i is some firm's first sample, 0 otherwise;
% d_last(i)=1 if row i is some firm's last sample, 0 otherwise;
% d_exit(i)=1 for exiting firms (the firm's last observed year is before 2007)
row_no=(1:n)';
d_first=double(row_no==first);
d_last=double(row_no==last);
d_exit=double(year(last)<2007);

% values of year, soe and ls taken at the firm's first and last rows,
% replicated on every row of the firm
first_year=double(year(first));
last_year=double(year(last));
first_soe=double(soe(first));
last_soe=double(soe(last));
first_ls=double(ls(first));
last_ls=double(ls(last));

% adjust age * no effect -- the same thing has been done in the original data *
% Rows after a firm's first row get: age at the first row + years elapsed.
% First rows are left untouched.
later_rows=(d_first ~= 1);
age(later_rows)=age(first(later_rows))+year(later_rows)-year(first(later_rows));
% cap age at 30
% NOTE(review): min() skips NaN, so a missing age becomes 30 here -- confirm intended
age=min(age,30);

% age at the firm's last row, replicated on every row of the firm
last_age=double(age(last));

% dummy for the incumbent in 2004 and 2007:
% a firm is flagged on all of its rows when it has a 2004 observation and its
% last observed year is 2007
d_04_07=zeros(n,1);
rows_2004=find(year==2004 & last_year==2007);
for k=rows_2004'
    d_04_07(first(k):last(k))=1;
end

% definition for entry firms
% d_inc        : first year is 1998 and last year is 2007 (incumbent firms)
% d_entry      : first > 1998
% d_entry_o    : first > 1998 and age >= t-1998
% d_entry_n    : first > 1998 and age < t-1998
% d_entry_surv : first > 1998 and active in 2007
% d_entry_exit : first > 1998 and exit before 2007
% d_entry_old  : first > 1998 and active in 2007 and age >= 9 in 2007
% d_entry_new  : first > 1998 and active in 2007 and age < 9 in 2007
% d_entry = d_entry_surv + d_entry_exit
% d_entry_surv = d_entry_old + d_entry_new
% (rows with NaN age / last_age fall in neither the "<" nor the ">=" group)
entrant=(first_year > 1998);
alive_2007=(last_year == 2007);
years_since_1998=year-1998;

d_inc=double(first_year == 1998 & alive_2007);
d_entry=double(entrant);
d_entry_n=double(entrant & (age < years_since_1998));
d_entry_o=double(entrant & (age >= years_since_1998));
d_entry_exit=double(entrant & (last_year < 2007));
d_entry_surv=double(entrant & alive_2007);
d_entry_new=double(entrant & alive_2007 & (last_age < 9));
d_entry_old=double(entrant & alive_2007 & (last_age >= 9));

% dummy for balanced panel: the firm's last row is nine rows after its first
d_bp=double((last-first)==9);

%% ownership conversion
% conv_own:  1-unconverted_soe   2-soe_convert_nosoe  3-unconverted_nosoe  4-nosoe_convert_soe
% Classification compares the SOE status at the firm's first and last rows only.
began_soe=(first_soe ~= 0);
ended_soe=(last_soe ~= 0);
conv_own=zeros(n,1);
conv_own( began_soe &  ended_soe)=1;
conv_own( began_soe & ~ended_soe)=2;
conv_own(~began_soe & ~ended_soe)=3;
conv_own(~began_soe &  ended_soe)=4;

% find the beginning year that soe changed
% For converting firms (conv_own 2 or 4), conv_year holds, on every row of the
% firm, the year of the FIRST row whose soe status differs from the row before.
% Each firm is handled exactly once, from its first row. The earlier version
% tried to skip ahead with "i=last(i)+1", but MATLAB resets the for-loop index
% at every iteration, so later rows of the same firm were rescanned and, for
% firms switching more than once, overwrote conv_year with a later switch year.
conv_year=zeros(n,1);
for i=1:n
    if i~=first(i) || ~(conv_own(i)==2 || conv_own(i)==4)
        continue;   % not the start of a converting firm
    end
    for j=i+1:last(i)
        if soe(j)~=soe(j-1)
            conv_year(i:last(i),1)=year(j);
            break;  % keep only the earliest change
        end
    end
end

%% lsgx conversion
% conv_ls:  1-unconverted_center   2-unconverted_provincial
% (ls equal to 10, resp. 20, at both the firm's first and last rows; 0 otherwise)
conv_ls=zeros(n,1);
conv_ls(first_ls == 10 & last_ls == 10)=1;
conv_ls(first_ls == 20 & last_ls == 20)=2;

% first-period cic2: industry code at the firm's first row
cic2_first=double(cic2(first));

% index for firms in the same industry at the first and last periods
cic2_con=double(cic2(first) == cic2(last));

%% human capital
hc=zeros(n,1);

% employment counts by education level
hc_pos=data(:,32); % postgraduate
hc_uni=data(:,33); % university
hc_col=data(:,34); % college
hc_hig=data(:,35); % high school
hc_mid=data(:,36); % middle school and below

% years of schooling assigned to each level
yrs_pos=19; yrs_uni=16; yrs_col=14; yrs_hig=12; yrs_mid=6;

% efficiency weights: 1 + 0.1 per year of schooling beyond 6
w_pos=1+0.1*(yrs_pos-6);
w_uni=1+0.1*(yrs_uni-6);
w_col=1+0.1*(yrs_col-6);
w_hig=1+0.1*(yrs_hig-6);

% weighted human capital (the lowest level enters with weight 1)
hc_dta=w_pos*hc_pos+w_uni*hc_uni+w_col*hc_col+w_hig*hc_hig+hc_mid;
% total employment by education composition reported
emp_hc=hc_pos+hc_uni+hc_col+hc_hig+hc_mid;
% average years of schooling
school=(yrs_pos*hc_pos+yrs_uni*hc_uni+yrs_col*hc_col+yrs_hig*hc_hig+yrs_mid*hc_mid)./emp_hc;

% check
% index=find(hc>0); xxx(:,1)=year(index); xxx(:,2)=emp(index); xxx(:,3)=emp_hc(index);
% index=find(hc>0 & soe==1); sum(hc(index))/sum(emp_hc(index)) % soe hc index
% index=find(hc>0 & soe~=1); sum(hc(index))/sum(emp_hc(index)) % nso hc index

% human capital aggregates by industry and ownership
cic2_index=[6 7 8 9 10 11 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 39 40 41 42 43 44 45 46]';
n_ind=numel(cic2_index);
hc_cso_ratio=zeros(1,n_ind);
hc_pso_ratio=zeros(1,n_ind);
hc_nso_ratio=zeros(1,n_ind);

% human capital per reported worker over a set of rows
hc_per_worker=@(rows) sum(hc_dta(rows))/sum(emp_hc(rows));

for i=1:n_ind
    in_ind=(cic2==cic2_index(i));
    % rows of this industry in 2004 with non-missing human capital data
    base=in_ind & ~isnan(hc_dta) & year==2004;
    base_cso=base & conv_own==1;   % unconverted SOE
    base_pso=base & conv_own==2;   % SOE converted to non-SOE
    base_nso=base & conv_own>=3;   % conv_own 3 or 4

    % ownership-specific ratio; fall back on the whole-industry ratio when
    % the ownership group has no usable rows
    if any(base_cso)
        hc_cso_ratio(i)=hc_per_worker(base_cso);
    else
        hc_cso_ratio(i)=hc_per_worker(base);
    end
    if any(base_pso)
        hc_pso_ratio(i)=hc_per_worker(base_pso);
    else
        hc_pso_ratio(i)=hc_per_worker(base);
    end
    if any(base_nso)
        hc_nso_ratio(i)=hc_per_worker(base_nso);
    else
        hc_nso_ratio(i)=hc_per_worker(base);
    end

    % generate human capital: ratio times employment, for all rows of the
    % industry/ownership group in every year
    all_cso=in_ind & conv_own==1;
    all_pso=in_ind & conv_own==2;
    all_nso=in_ind & conv_own>=3;
    hc(all_cso)=hc_cso_ratio(i)*emp(all_cso);
    hc(all_pso)=hc_pso_ratio(i)*emp(all_pso);
    hc(all_nso)=hc_nso_ratio(i)*emp(all_nso);
end

% for those with hc data in 2004
% A firm with a positive hc_dta on some row uses its own human-capital-per-
% worker ratio on all of its rows, replacing the industry-level value.
for i=find(hc_dta > 0)'
    firm_rows=first(i):last(i);
    hc(firm_rows)=emp(firm_rows)*(hc_dta(i)./emp_hc(i));
end

%% find the last sample of missing soe
% last rows of firms that start in 1998 as SOE and leave the sample before 2007
miss_soe=find(d_last~=0 & first_year==1998 & first_soe~=0 & last_year<2007);
data_miss_soe=data(miss_soe,:);

%% save data
% the same set of variables is written in both cases; only the file name differs
full_vars={'newid','year','cic2','va','rk','emp','soe','soe_dj','soe_div', ...
    'conv_year','conv_own','d_first','d_last','d_exit','first','last', ...
    'gj','fr','allzbj','dj','output','ls','wl','tprofit','sprofit','sales', ...
    'age','va_zxd','cic2_first','cic2_con','age_sample','d_entry', ...
    'd_entry_new','d_entry_old','d_entry_exit','d_bp','d_entry_surv', ...
    'first_year','last_year','d_inc','d_entry_n','d_entry_o','d_04_07', ...
    'hc','school','gykgqk','bk'};
switch cond_index
    case 0
        save('data_all',full_vars{:});
    case 1
        save('data_positive',full_vars{:});
end

% sectoral va share
% va_share(i,c): share of industry cic2_index(i) in the summed va of the listed
% industries, for year 2003+c (columns are 2004..2007)
cic2_index=[6 7 8 9 10 11 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 39 40 41 42 43 44 45 46]';
share_years=2004:2007;
va_share=zeros(numel(cic2_index),numel(share_years));
for c=1:numel(share_years)
    in_year=(year == share_years(c));
    for i=1:numel(cic2_index)
        va_share(i,c)=sum(va(in_year & cic2 == cic2_index(i)));
    end
    % turn the yearly totals into shares
    va_share(:,c)=va_share(:,c)/sum(va_share(:,c));
end

%% generate the balanced panel
% keep only rows of firms whose last row is nine rows after their first row
in_bp=((last-first)==9);

% identifiers
newid=newid(in_bp,1);
year=year(in_bp,1);
cic2=cic2(in_bp,1);

% production variables
va=va(in_bp,1);
rk=rk(in_bp,1);
emp=emp(in_bp,1);
output=output(in_bp,1);
wl=wl(in_bp,1);
tprofit=tprofit(in_bp,1);
sprofit=sprofit(in_bp,1);
sales=sales(in_bp,1);
va_zxd=va_zxd(in_bp,1);
hc=hc(in_bp,1);
bk=bk(in_bp,1);
age=age(in_bp,1);

% ownership and registration variables
soe=soe(in_bp,1);
soe_dj=soe_dj(in_bp,1);
soe_div=soe_div(in_bp,1);
dj=dj(in_bp,1);
gj=gj(in_bp,1);
fr=fr(in_bp,1);
ls=ls(in_bp,1);

% conversion indicators
conv_own=conv_own(in_bp,1);
conv_year=conv_year(in_bp,1);
conv_ls=conv_ls(in_bp,1);
cic2_con=cic2_con(in_bp,1);

%% lag index
% recomputed on the balanced-panel rows: lag_index(i)=1 when row i has the
% same newid as row i-1, 0 when it starts a new firm
n=size(va,1);
lag_index=zeros(n,1);
lag_index(2:n,1)=newid(2:n,1)==newid(1:n-1,1);

%% find firm's location of first sample and last sample
% first(i) / last(i) = row number (within the balanced panel) of the first /
% last observation of the firm that row i belongs to.
% Built without the temp_f/temp_l scan: with a single firm that scan never
% assigned temp_l and silently reused whatever value was left in the workspace.
is_start=(lag_index==0);                 % rows that open a firm
start_rows=find(is_start);
end_rows=[start_rows(2:end)-1; n];       % a firm ends right before the next one starts
firm_no=cumsum(is_start);                % running firm counter, one value per row
first=reshape(start_rows(firm_no),n,1);
last=reshape(end_rows(firm_no),n,1);

%% save
% balanced-panel variables; the file name depends on cond_index
bp_vars={'newid','year','cic2','va','rk','emp','soe','soe_dj','soe_div', ...
    'conv_year','conv_own','first','last','gj','fr','dj','output','ls', ...
    'conv_ls','wl','tprofit','sprofit','sales','cic2_con','va_zxd','age', ...
    'hc','bk'};
switch cond_index
    case 0
        save('data_all_bp',bp_vars{:});
    case 1
        save('data_positive_bp',bp_vars{:});
end

%% labor income share
% Computed on the balanced-panel rows, for unconverted non-SOE firms
% (conv_own == 3), by industry and for the years 2004..2007.
cic2_index=[6 7 8 9 10 11 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 39 40 41 42 43 44 45 46]';
eta=1/7;
alpha(:,1)=cic2_index;   % column 1: industry code

n_ind=numel(cic2_index);
labor_share=zeros(n_ind,4);
labor_share_zxd=zeros(n_ind,4);
for t=2004:2007
    col=t-2003;
    nso_in_year=(year == t & conv_own == 3);
    for i=1:n_ind
        rows=nso_in_year & cic2 == cic2_index(i);
        total_wl=sum(wl(rows));
        labor_share(i,col)=total_wl/sum(va(rows));          % relative to nominal va
        labor_share_zxd(i,col)=total_wl/sum(va_zxd(rows));  % relative to adjusted va
    end
end
% labor share = (1-alp)*(1-eta)  =>  alp = 1 - labor share/(1-eta)
alp=1-labor_share_zxd/(1-eta);
alp_mean=(mean(alp'))';   % average over the four years, one value per industry

% weighted average alpha: va-share-weighted sum per year, then mean over years
alp_agg=mean(sum(alp.*va_share));

% some results (left without a semicolon on purpose so the values are displayed)
mean(sum(labor_share.*va_share))
mean(sum(labor_share_zxd.*va_share))

% adjustment: rescale so that the aggregate matches the target
alp_agg_target=1-0.5/(1-eta); % 0.5 = aggregate labor share = (1-alp)*(1-eta)
alp_ad=alp_mean*alp_agg_target/alp_agg;
alpha(:,2)=alp_ad;       % column 2: adjusted alpha

% test: aggregate labor share implied by the rescaled alp
labor_share_ad=(1-alp*alp_agg_target/alp_agg)*(1-eta);
labor_share_agg=mean(sum(labor_share_ad.*va_share));

%% heterogeneous markups
% the and e_t are derived from the markup increase and eta, then stored with
% the industry codes and alpha table
markup_in=0.015; % markups increase
the=1-1/(1+markup_in);
e_t=(1-eta)*(1-the);

save('data_parameter','cic2_index','eta','alpha','the','e_t');
