% ReadStateData.m
%
%  Reads death data from the Johns Hopkins spreadsheet
%
%   https://coronavirus.jhu.edu/map.html
%   https://github.com/CSSEGISandData/COVID-19
%   https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv
%
%   Codes:  https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv

% Start from an empty workspace: the script ends with `save CovidStateData`,
% which stores every variable that exists at that point.
clear; close all;
% diarychad is a helper defined outside this file; presumably it opens a diary
% log named after the script (the script ends with `diary off`) -- confirm.
diarychad('ReadStateData');

% SOURCE of the data (all Johns Hopkins)
%   https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv
%   Codes:  https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv
% Both files are read from the current folder. Note that the lookup table is
% opened as an .xls workbook with a sheet named 'States', not as the raw .csv.
fname='time_series_covid19_deaths_US.csv';  % Death data
cname='UID_ISO_FIPS_LookUp_Table.xls';      % State codes
% Calendar date assigned to the first deaths column; later columns are
% assumed to be consecutive days.
FirstDataDate=datetime('2020-01-22');

CorrectNYStatePop=19440469;  % Aggregating counties gives a much lower number -- error in Github files

% State lookup columns from the 'States' sheet, rows 4-54 (51 entries;
% presumably 50 states + DC -- confirm against the workbook).
%   F = UID, G = name, H = code used for variable names, M = population.
StateUID=readmatrix(cname,'Sheet','States','Range','F4:F54');
StateNames=readmatrix(cname,'Sheet','States','Range','G4:G54','OutputType','char');
StateCodes=readmatrix(cname,'Sheet','States','Range','H4:H54','OutputType','char');
StatePop=readmatrix(cname,'Sheet','States','Range','M4:M54');
% Transpose the population column so it lines up with the columns (one per
% state) of the T-by-N state death matrix built later.
StatePop=StatePop';
N=length(StateUID);  % number of states

% County names: column F of the deaths file, read as text.
% NOTE(review): the row range 1-3262 is hard-coded; if the source file gains
% rows, names beyond row 3262 will be missing -- confirm against the file.
% NOTE(review): this read starts at row 1 while `data` below is read with
% readmatrix defaults; CountyNames(k) only matches data(k,:) if both reads
% treat the title row the same way -- verify.
CountyNames=readmatrix(fname,'Range','F1:F3262','OutputType','char');
data=readmatrix(fname); % All strings become NaN, but data is stored in appropriate cols/rows!
uidraw=data(:,1);       % column 1: record UID
fips=data(:,5);         % column 5: county FIPS code
deaths=data(:,13:end);  % columns 13 onward: one column per date
% Drop the last three digits of the UID; the result is what gets matched
% against StateUID when counties are summed into states.
uid=floor(uidraw/1000);
[NumRecs,T]=size(deaths);  % NumRecs = rows (records), T = number of dates
% Column vector of T consecutive calendar days starting at FirstDataDate.
CovidDates=FirstDataDate+caldays(0:(T-1))';
% No trailing semicolon: the label is echoed to the command window / diary.
datadate=upper(['Data through ' datestr(CovidDates(end))])

% State Codes: AL=1, CA=5, NY=33, etc.
% One workspace variable is created per entry of StateCodes, holding that
% state's position in the state arrays, so later code can write StatePop(NY).
% NOTE(review): eval runs text taken from the lookup spreadsheet as MATLAB
% code; a malformed or hostile cell in that column would be executed as-is.
for i=1:N
    eval(sprintf('%s=i;',StateCodes{i}));
end

% Adjusting New York county data for April 15 - April 19 change
%
% The series for FIPS 36061 is rescaled so that history up to April 15 is
% consistent with the April 17 level:
%   1. predict April 17 from April 15 using the old two-day growth ratio,
%   2. scale every value up to and including April 15 by actual/predicted,
%   3. replace April 16 with the midpoint of the adjusted April 15 and April 17.
disp ' '; 
disp 'Adjusting NY for the April 15-19 change in definition of deaths';
apr15=find(CovidDates=='15-Apr-2020');
ny=find(fips==36061);
% Everything below assumes scalar indices; fail early with a clear message
% rather than with a size-mismatch error (or a silent matrix division).
if numel(apr15)~=1 || numel(ny)~=1
    error('ReadStateData:nyAdjustment', ...
        'Expected exactly one Apr 15 column and one FIPS 36061 row (found %d and %d).', ...
        numel(apr15),numel(ny));
end
% Keep the unadjusted values for the report printed below.
nyapr15=deaths(ny,apr15);
nyapr16=deaths(ny,apr15+1);
nyapr17=deaths(ny,apr15+2);
% The next three statements are intentionally left without semicolons so
% their values are echoed to the log.
oldratio17to15=1.09 % from data *not including* the extended deaths
predictedApr17=oldratio17to15*nyapr15  % Update based on these data Apr 15
adjustmentratio = nyapr17/predictedApr17 % Ratio to raise all numbers for NY by
deaths(ny,1:apr15)=adjustmentratio*deaths(ny,1:apr15);
deaths(ny,apr15+1)=.5*(deaths(ny,apr15)+deaths(ny,apr15+2)); % Apr 16 is halfway
fprintf('  Old Apr 15 = %8.0f     New Apr 15 = %8.0f\n',[nyapr15 deaths(ny,apr15)]);
% The second label used to read "New Apr 17" although the value printed is
% the adjusted April 16 figure.
fprintf('  Old Apr 16 = %8.0f     New Apr 16 = %8.0f\n',[nyapr16 deaths(ny,apr15+1)]);
fprintf('  Old Apr 17 = %8.0f     New Apr 17 = %8.0f\n',[nyapr17 deaths(ny,apr15+2)]);


% Fix NY State Pop: overwrite the spreadsheet value with the corrected figure.
% (NY is the state-index variable created from the state codes.)
StatePop(NY)=CorrectNYStatePop;

% Add up counties to get state death totals.
% StateDeaths is T-by-N: one row per date, one column per state.
StateDeaths=zeros(T,N);
for i=1:NumRecs
    s=find(StateUID==uid(i));    % s=state index
    if isempty(s), continue; end % record does not belong to a listed state
    StateDeaths(:,s)=StateDeaths(:,s)+deaths(i,:).';
end

% div is a helper defined outside this file; presumably it divides each
% state's column by that state's population -- confirm.
StateDeathsPerMillion=div(StateDeaths,StatePop)*1e6;


% ==================================================
% Add special places (NYC, NYX, LA, Detroit, etc);
% ==================================================
% One row per county. Columns: region code, county FIPS, county name,
% county population in persons (not thousands: these values are later
% subtracted from the NY state population, which is also in persons).
% A county may appear under more than one region code (NYC and NYR overlap).
% The literal is left without a trailing semicolon so the table is echoed.
regions={ % Code, FIPS, Name, Pop
        'SF'   6001  'Alameda'        1671329
        'SF'   6013  'Contra Costa'   1153526
        'SF'   6041  'Marin'          258826
        'SF'   6081  'San Mateo'      766573
        'SF'   6085  'Santa Clara'    1927852
        'NYC'  36061 'New York City'  1628706 %8336817 %5803210
        'NYC'  36005 'Bronx'          1418207
        'NYC'  36081 'Queens'         2253858
        'NYC'  36047 'Kings'          2559903
        'NYC'  36085 'Richmond'       476143
        'NYR'  36061 'New York City'  1628706 %8336817 %5803210
        'NYR'  36005 'Bronx'          1418207
        'NYR'  36081 'Queens'         2253858
        'NYR'  36047 'Kings'          2559903
        'NYR'  36085 'Richmond'       476143
        'NYR'  36119 'Westchester'    967506
        'NYR'  36059 'Nassau'         1356924
        'NYR'  36087 'Rockland'       325789
        'NYR'  36103 'Suffolk'        1476601
        'LAX'  6037  'Los Angeles'    10039107
        'LAX'  6059  'Orange'         3175692
        'CHI'  17031 'Cook'           5150233
        'DET'  26163 'Wayne, MI'      1749343
        'PHI'  42101 'Philadelphia'   1584064
        'MIA'  12086 'Miami'          2716940
        'BOS'  25017 'Middlesex'      1611699
        'BOS'  25025 'Suffolk'        803907
        'ATL'  13121 'Fulton'         1063937
        'ATL'  13089 'DeKalb'         759297
        'HOU'  48201 'Harris County'  4713325
    }

% Note: NYC population is wrong in the time_series file. The correct number is reported in a different file on the same GitHub site 
% https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv

A=cell2table(regions);
A.Properties.VariableNames={'city','fips','names','pop'}
Nctys=length(A.pop); % # of special regions to be read 

% Pull each listed county's death series out of the county-level matrix.
% CountyDeaths is T-by-Nctys (dates down, table rows across); RawCountyPlace
% records the name found in the data file so it can be checked by eye.
CountyDeaths=zeros(T,Nctys);
RawCountyPlace=cell(1,Nctys);
for i=1:Nctys
    s=find(fips==A.fips(i));
    % A missing or duplicated FIPS would otherwise surface as an opaque
    % size-mismatch error on the assignment below.
    if numel(s)~=1
        error('ReadStateData:regionFips', ...
            'FIPS %d (%s) matched %d rows in %s; expected exactly 1.', ...
            A.fips(i),A.names{i},numel(s),fname);
    end
    CountyDeaths(:,i)=deaths(s,:)';
    RawCountyPlace(i)=CountyNames(s);
end
RawCountyPlace


% Now add up into SF, NYC, LAX etc
% CityCodes and CityNames are parallel column cell arrays; each code selects
% the rows of table A whose 'city' entry equals it.
CityCodes={'SF','NYC','NYR','LAX','CHI','DET','PHI','MIA','BOS','ATL','HOU'}';
CityNames={'SF Bay Area','New York City (only)','New York City (plus)','Los Angeles','Chicago','Detroit','Philadelphia','Miami','Boston+Middlesex','Atlanta','Houston (Harris Co.)'}'

% CityDeaths: dates down, cities across. CityPop: row vector of populations.
CityDeaths=zeros(size(CountyDeaths,1),length(CityCodes));
CityPop=zeros(1,length(CityCodes));
for i=1:length(CityCodes)
    indx=find(strcmp(A.city,CityCodes{i}));
    % A code with no counties would otherwise produce a size-mismatch error
    % here or a zero population (division by zero) later on.
    if isempty(indx)
        error('ReadStateData:cityCode', ...
            'City code %s has no counties in the regions table.',CityCodes{i});
    end
    % Summing along dimension 2 works for one county as well as for many,
    % so no special case is needed for single-county cities.
    CityDeaths(:,i)=sum(CountyDeaths(:,indx),2);
    CityPop(i)=sum(A.pop(indx));
    % Remember the NYR county list; it is used afterwards to build the
    % "New York excluding NYC" aggregate.
    if strcmp(CityCodes{i},'NYR')
        NYC_fips=A.fips(indx);
    end
end


% NY Counties are all 36xxx. NYX is built by summing every 36xxx county
% that is not in the NYR list.
NYfips=36;
% Bronx (36005) and Queens (36081) are appended explicitly; the original
% author notes they are in the file for some odd reason with no deaths reported.
NYC_fips=vertcat(NYC_fips,36005,36081);
indx=find(floor(fips/1000)==NYfips); 
for i=1:length(indx)
    % Skip Westchester, Suffolk, etc. (anything belonging to NYR).
    if any(NYC_fips==fips(indx(i))), continue; end
    % Rows are indexed by i, so skipped counties leave all-zero rows in
    % NYXDeaths and empty cells in NYXnames; the zeros do not affect the sum.
    NYXDeaths(i,:)=deaths(indx(i),:);
    NYXnames(i)=CountyNames(indx(i));
end
NYXDeaths_total=sum(NYXDeaths)';

% cshow is a helper defined outside this file; presumably it prints the two
% series side by side under the given column labels -- confirm.
cshow(datestr(CovidDates),[CityDeaths(:,2) NYXDeaths_total],'%8.0f','NYC NYX');

% NYX population = corrected state population minus the NYR population
% (CityPop(3)). The original author notes the county-based figure looked off.
NYXPop=CorrectNYStatePop-CityPop(3);
disp('We are using 19440469 for NY State instead')
fprintf('which gives a pop for NYX of %12.0f\n',NYXPop);

% Append NYX as one more "city". The last two statements are left without
% semicolons so the updated lists are echoed.
CityDeaths(:,end+1)=NYXDeaths_total;
CityPop(end+1)=NYXPop;
CityCodes=vertcat(CityCodes,{'NYX'})
CityNames=vertcat(CityNames,{'New York excluding NYC'})


% Read London, Madrid, etc
% The file holds one place per row: column A = code, B = name,
% C = population, D onward = the death series. Rows 2-11 (ten places) are read.
% NOTE(review): the B2:B11 / A2:A11 / C2:C11 ranges are hard-coded; adding a
% place to the file requires updating all three.
lname='LondonMadridEtc.csv';
EuropeNames=readmatrix(lname,'Range','B2:B11','OutputType','char');
EuropeCodes=readmatrix(lname,'Range','A2:A11','OutputType','char');
EuropePop=readmatrix(lname,'Range','C2:C11');
EuropePop=EuropePop';   % row vector, to match CityPop
% NOTE: `data` and `deaths` are reused here; the US county matrices read
% earlier are no longer available after these two lines.
data=readmatrix(lname);
% Drop the first row and the first three columns, then transpose so that
% rows are dates and columns are places (the layout used for CityDeaths).
% NOTE(review): dropping row 1 presumably removes the title row -- confirm
% that readmatrix keeps that row for this file.
deaths=data(2:end,4:end)';

% Fix Paris, Ile-de-France: April 14 = 4140 adjustment (April 15 = 5094)
%  -- Use Paris 75 to adjust
% Column 5 of deaths is used as the Paris reference series and column PIF as
% the series to correct. Every value up to and including April 14 in the PIF
% column is multiplied by the ratio of the two series' April 14 -> April 15
% growth factors; values from April 15 onward are left untouched.
PIF=6;
t1=find(ismember(CovidDates,{'2020-04-14'}));
t2=find(ismember(CovidDates,{'2020-04-15'}));
par1=deaths(t1,5);
par2=deaths(t2,5);
disp(' ');
fprintf('Paris Growth factor:   April 15 = %5.0f   April 14 = %5.0f    Factor=%6.3f\n', ...
    par2,par1,par2/par1);
% No semicolon: the factor is echoed to the log.
PIFfactor=par1/par2*deaths(t2,PIF)/deaths(t1,PIF)
fprintf('Paris Ille de France:   April 15 = %5.0f   April 14 = %5.0f    PIFFactor=%6.3f\n', ...
    deaths(t2,PIF),deaths(t1,PIF),PIFfactor);
disp('   Adjusting all earlier Paris Ille de France data by this factor...');
% April 14 and April 15 are adjacent rows, so scaling rows 1..t1 in place
% leaves exactly the rows from t2 onward unchanged.
PIFdeaths=deaths(:,PIF);
PIFdeaths(1:t1)=PIFdeaths(1:t1)*PIFfactor;
deaths(:,PIF)=PIFdeaths;


% Fix Rhode Island
% The extra-places file carries its own 'RI' series. It replaces the state
% series built from county data, and is then removed from the extra-places
% arrays so Rhode Island is not also listed as a city.
RIeuro =find(strcmp(EuropeCodes,'RI'));
RIstate=find(strcmp(StateCodes,'RI'));

StateDeaths(:,RIstate)=deaths(:,RIeuro);
StateDeathsPerMillion(:,RIstate)=div(StateDeaths(:,RIstate),StatePop(RIstate))*1e6;

% Drop the RI entry from all four parallel arrays.
deaths(:,RIeuro)=[];
EuropePop(RIeuro)=[];
EuropeCodes(RIeuro)=[];
EuropeNames(RIeuro)=[];


% Put it all together
% Append the extra places to the city arrays: deaths and populations grow by
% columns, codes and names by rows. The code/name lists are echoed to the log.
CityDeaths=horzcat(CityDeaths,deaths);
CityPop=horzcat(CityPop,EuropePop);
CityCodes=vertcat(CityCodes,EuropeCodes)
CityNames=vertcat(CityNames,EuropeNames)

CityDeathsPerMillion=div(CityDeaths,CityPop)*1e6;

% Store every workspace variable in CovidStateData.mat, then stop logging.
save('CovidStateData');

diary('off');
