function d = dbload(varargin)
% dbload  Create database by loading CSV file.
%
% Syntax
% =======
%
%     D = dbload(FNAME, ...)
%     D = dbload(D,FNAME, ...)
%
% Input arguments
% ================
%
% * `FNAME` [ char | cellstr ] - Input CSV file name or a cell array of CSV
% file names that will be combined.
%
% * `D` [ struct ] - An existing database (struct) to which the CSV
% entries will be added.
%
% Output arguments
% =================
%
% * `D` [ struct ] - Database created from the input CSV file(s).
%
% Options
% ========
%
% * `'case='` [ `'lower'` | `'upper'` | *empty* ] - Change case of variable
% names.
%
% * `'commentRow='` [ char | cellstr | *`{'comment','comments'}`* ] - Label
% at the start of row that will be used to create tseries object comments.
%
% * `'dateFormat='` [ char | *`'YYYYFP'`* ] - Format of dates in first
% column.
%
% * `'delimiter='` [ char | *`','`* ] - Delimiter separating the individual
% values (cells) in the CSV file; if different from a comma, all occurrences
% of the delimiter will be replaced with commas -- note that this will also
% affect text in comments.
%
% * `'freq='` [ `0` | `1` | `2` | `4` | `6` | `12` | `365` | `'daily'` |
% *empty* ] - Advise frequency of dates; if empty, frequency will be
% automatically recognised.
%
% * `'freqLetters='` [ char | *`'YHQBM'`* ] - Letters representing frequency
% of dates in date column.
%
% * `'leadingRow='` [ char | numeric | *empty* ] - String at the beginning
% (i.e. in the first cell) of the row with variable names; or the line
% number at which the row with variable names begins.
%
% * `'nameFunc='` [ function_handle | *empty* ] - Function used to change or
% transform the variable names; if empty, the variable names from the CSV
% are used as they are.
%
% * `'nan='` [ char | *`NaN`* ] - String representing missing observations
% (case insensitive).
%
% * `'skipRows='` [ char | cellstr | numeric | *empty* ] - Skip rows whose
% first cell matches the string or strings (regular expressions);
% alternatively, you can specify a vector of row numbers to be skipped.
%
% * `'userdata='` [ char | *`Inf`* ] - Field name under which the database
% userdata loaded from the CSV file (if they exist) will be stored in the
% output database; if `'userdata='` is `Inf`, the field name will be read
% from the CSV file (and will thus be identical to the one used in the
% originally saved database).
%
% * `'userdataField='` [ char | *`'.'`* ] - A leading character denoting
% userdata fields for individual time series. If empty, no userdata fields
% will be read.
%
% Description
% ============
%
% Use the `'freq='` option whenever there is ambiguity in interpreting
% the date strings, and IRIS is not able to determine the frequency
% correctly (see Example 1).
%
% Structure of CSV database files
% --------------------------------
%
% The minimalist structure of a CSV database file has a leading row with
% variables names, a leading column with dates in the basic IRIS format,
% and individual columns with numeric data:
%
%     +---------+---------+---------+--
%     |         |       Y |       P |
%     +---------+---------+---------+--
%     |  2010Q1 |       1 |      10 |
%     +---------+---------+---------+--
%     |  2010Q2 |       2 |      20 |
%     +---------+---------+---------+--
%     |         |         |         |
%
% You can add a comment row (must be placed before the data part, and start
% with a label 'Comment' in the first cell) that will also be read in and
% assigned as comments to the individual tseries objects created in the
% output database.
%
%     +---------+---------+---------+--
%     |         |       Y |       P |
%     +---------+---------+---------+--
%     | Comment |  Output |  Prices |
%     +---------+---------+---------+--
%     |  2010Q1 |       1 |      10 |
%     +---------+---------+---------+--
%     |  2010Q2 |       2 |      20 |
%     +---------+---------+---------+--
%     |         |         |         |
%
% You can use a different label in the first cell to denote a comment row;
% in that case you need to set the option `'commentRow='` accordingly.
%
% All CSV rows whose names start with a character specified in the option
% `'userdataField='` (a dot by default) will be added to output tseries
% objects as fields of their userdata.
%
%     +---------+---------+---------+--
%     |         |       Y |       P |
%     +---------+---------+---------+--
%     | Comment |  Output |  Prices |
%     +---------+---------+---------+--
%     | .Source |   Stat  |  IMFIFS |
%     +---------+---------+---------+--
%     | .Update | 17Feb11 | 01Feb11 |
%     +---------+---------+---------+--
%     | .Units  | Bil USD |  2010=1 |
%     +---------+---------+---------+--
%     |  2010Q1 |       1 |      10 |
%     +---------+---------+---------+--
%     |  2010Q2 |       2 |      20 |
%     +---------+---------+---------+--
%     |         |         |         |
%
% Example 1
% ==========
%
% Typical example of using the `'freq='` option is a quarterly database with
% dates represented by the corresponding months, such as a sequence
% 2000-01-01, 2000-04-01, 2000-07-01, 2000-10-01, etc. In this case, you
% can use the following options:
%
%     d = dbload('filename.csv','dateFormat','YYYY-MM-01','freq',4);
%

% -IRIS Toolbox.
% -Copyright (c) 2007-2012 Jaromir Benes.

% Split off the optional leading database; when none is supplied, start
% from an empty struct.
if isstruct(varargin{1})
    d = varargin{1};
    varargin(1) = [];
else
    d = struct();
end

% The next input is the file name (or a cell array of file names); whatever
% remains in `varargin` is passed on as options.
fname = varargin{1};
varargin(1) = [];

P = inputParser();
P.addRequired('d',@isstruct);
P.addRequired('fname',@(x) ischar(x) || iscellstr(x));
P.parse(d,fname);

% Loop over all input databases and subcontract `dbload`. Each call adds the
% entries of one file to `d`; return only after ALL files have been loaded.
if iscellstr(fname)
    nfname = length(fname);
    if nfname == 0
        % Nothing to load: return the input database unchanged rather than
        % failing on `fname{1}`.
        return
    elseif nfname > 1
        for i = 1 : nfname
            d = dbload(d,fname{i},varargin{:});
        end
        return
    else
        fname = fname{1};
    end
end

% Resolve the user-supplied options for `dbload`.
opt = passvalopt('data.dbload',varargin{1:end});

% A date format starting with a dollar sign requests daily dates; the dollar
% sign itself is not part of the format.
if strncmp(opt.dateformat,'$',1)
    opt.freq = 'daily';
    opt.dateformat = opt.dateformat(2:end);
end

% Numeric 365 is an alternative way of requesting daily dates.
if isequal(opt.freq,365)
    opt.freq = 'daily';
end

% Fall back to a frequency-dependent date format when none is given.
if isempty(opt.dateformat)
    opt.dateformat = 'YFP';
    if strcmpi(opt.freq,'daily')
        opt.dateformat = 'dd/mm/yyyy';
    end
end

% `changecase` is used only when `case` itself is left empty.
if isempty(opt.case) && ~isempty(opt.changecase)
    opt.case = opt.changecase;
end

% A single skip-row pattern is wrapped into a cell array.
if ischar(opt.skiprows)
    opt.skiprows = {opt.skiprows};
end

% Anchor every non-empty skip-row pattern at both ends so that it is matched
% against the whole first cell; numeric row lists are left untouched.
if ~isnumeric(opt.skiprows) && ~isempty(opt.skiprows)
    for i = 1 : length(opt.skiprows)
        pattern = opt.skiprows{i};
        if ~isempty(pattern)
            if pattern(1) ~= '^'
                pattern = ['^',pattern];
            end
            if pattern(end) ~= '$'
                pattern = [pattern,'$'];
            end
            opt.skiprows{i} = pattern;
        end
    end
end

% A single comment-row label is wrapped into a cell array.
if ischar(opt.commentrow)
    opt.commentrow = {opt.commentrow};
end

%**************************************************************************

% Read the whole file into one character string and pass it through the EOL
% converter; the parsers below split rows on `\n`.
% NOTE(review): presumably `strfun.converteols` normalises all line endings
% to `\n` -- confirm against its implementation.
text = file2char(fname);
text = strfun.converteols(text);

% Replace non-comma delimiter with comma. The delimiter goes through
% `sprintf` first, so escape sequences in the option (e.g. `\t`) are expanded
% before the replacement.
if ~strcmp(opt.delimiter,',')
    text = strrep(text,sprintf(opt.delimiter),',');
end

% Substring test that lower-cases `x` only; `y` must be given in lower case.
mystrfind = @(x,y) ~isempty(strfind(lower(x),y));

% Read headers. The variables below are shared with the nested function
% `doreadheaders`, which fills them in while scanning the header rows:
% `name`, `class`, `comment` hold the cells of the respective header rows;
% `isdate` is set when the first non-header row is hit; `start` is the index
% in `text` at which the current row begins; `count` is the number of rows
% read so far; `namedone` flags that the name row has been found;
% `dbuserdata`, `dbuserdatafieldname`, `isuserdata` describe database-level
% userdata; `ident` is the first cell of the current row; `seriesuserdata`
% collects per-series userdata rows.
name = {};
class = {};
comment = {};
isdate = false;
start = 1;
count = 0;
namedone = false;
dbuserdata = '';
dbuserdatafieldname = '';
isuserdata = false;
ident = '';
seriesuserdata = struct();
doreadheaders();

% Trim the headers: keep only the text from the first unconsumed row on.
if start > 1
    text = text(start:end);
end

% Pad the class and comment lists with empty strings so that there is one
% entry for each name.
class = strtrim(class);
comment = strtrim(comment);
if length(class) < length(name)
    class(length(class)+1:length(name)) = {''};
end
if length(comment) < length(name)
    comment(length(comment)+1:length(name)) = {''};
end

% Read numeric data from CSV string. These four variables are shared with
% (and assigned by) the nested function `doreadnumericdata`; they stay empty
% when there is no text left after the headers.
dates = [];
data = [];
nandate = [];
missing = [];
if ~isempty(text)
    doreadnumericdata();
end

% Map each date onto a row position within the range running from the
% earliest to the latest date; `nper` is the length of that range.
if ~isempty(dates)
    maxdate = max(dates);
    mindate = min(dates);
    nper = 1 + round(maxdate - mindate);
    dateindex = 1 + round(dates - mindate);
else
    nper = 0;
    dateindex = [];
    mindate = NaN;
end

% Apply user function to variables names.
if ~isempty(opt.namefunc)
    for i = 1 : length(name)
        name{i} = opt.namefunc(name{i}); %#ok<AGROW>
    end
end

% Convert variable name case if requested by the user; any other value of
% the option leaves the names unchanged.
switch lower(opt.case)
    case 'lower'
        name = lower(name);
    case 'upper'
        name = upper(name);
end

% Make sure the database entry names are all valid Matlab names. Empty names
% are left empty; columns with empty names are skipped when the database is
% populated.
index = ~cellfun(@isempty,name);
name(index) = genvarname(name(index));

% Populate the database-level userdata field. A char `userdata` option
% overrides the field name read from the CSV file; the option value is also
% used when the file does not supply a field name.
% NOTE(review): `eval` executes a string taken verbatim from the CSV file;
% only load files from trusted sources.
if ~isempty(opt.userdata) && isuserdata
    if ischar(opt.userdata) || isempty(dbuserdatafieldname)
        dbuserdatafieldname = opt.userdata;
    end
    try
        d.(dbuserdatafieldname) = eval(dbuserdata);
    catch E
        utils.error('data', ...
            ['DBLOAD failed when reconstructing user data.\n', ...
            'Matlab says ''%s'''], ...
            E.message);
    end
end

% Populate the output database with tseries and numeric data. `count` is
% reset and reused by `dopopulatedatabase` as the number of data columns
% already assigned to database entries.
template = tseries();
count = 0;
nname = length(name);
dopopulatedatabase();

% Nested functions.

%**************************************************************************
    function doreadheaders()
        % doreadheaders  Scan the header rows at the top of the CSV text.
        %
        % Reads `text` row by row until the first row that is not recognised
        % as a header (it is then taken to be the first data row and `isdate`
        % is set to true), or until the text is exhausted. Results are written
        % to variables shared with the enclosing function: `name`, `class`,
        % `comment`, `seriesuserdata`, `dbuserdata`, `dbuserdatafieldname`,
        % `isuserdata`, `ident`, `namedone`, `count`, `start` and `text`.
        rowcount = 0;
        legacywarning = false;
        while ~isempty(text) && ~isdate
            rowcount = rowcount + 1;
            % Consumed rows have their EOL overwritten with a blank (see
            % `domovetonexteol`), so the first EOL found always terminates the
            % current row, which begins at `start`.
            eol = regexp(text,'\n','start','once');
            if isempty(eol)
                % Last row without a trailing EOL: take only the part from
                % `start` on, not the rows already consumed before it.
                line = text(start:end);
            else
                line = text(start:eol-1);
            end
            count = count + 1;
            if isnumericscalar(opt.leadingrow) && count < opt.leadingrow
                % Rows above a numeric leading row are discarded. The row must
                % be consumed here; otherwise the very same row would be read
                % again on the next pass and end up as the name row.
                domovetonexteol();
                continue
            end
            % Split the row into cells; cells may be plain or double-quoted.
            tokens = regexp(line, ...
                '([^",]*),|([^",]*)$|"(.*?)",|"(.*?)"$','tokens');
            tokens = [tokens{:}];
            if isempty(tokens) || all(cellfun(@isempty,tokens))
                % Entirely empty rows are treated like comment rows.
                ident = '%';
            else
                % The first cell identifies the row; drop any `->` marker and
                % surrounding blanks.
                ident = strrep(tokens{1},'->','');
                ident = strtrim(ident);
            end

            % Rows skipped by number.
            if isnumeric(opt.skiprows) && any(rowcount == opt.skiprows)
                domovetonexteol();
                continue
            end

            % Row with variable names.
            if dochknamerow()
                name = tokens(2:end);
                namedone = true;
                domovetonexteol();
                continue
            end

            action = '';

            % Per-series userdata rows: first cell starts with the userdata
            % field character.
            if strncmp(ident,opt.userdatafield,1)
                fieldname = strtrim(ident(2:end));
                fieldname = genvarname(fieldname);
                try %#ok<TRYNC>
                    seriesuserdata.(fieldname) = tokens(2:end);
                end
                action = 'userdata';
                % Some of the userdata fields can be reused as comments etc.
            end

            % Classify the row by its first cell. Any recognised row sets a
            % non-empty `action`.
            if strncmp(ident,'%',1)
                action = 'do_nothing';
            elseif mystrfind(ident,'userdata')
                % Database-level userdata: the field name is read from the
                % first cell, the userdata expression from the second cell.
                action = 'userdata';
                dbuserdatafieldname = xxgetuserdatafieldname(tokens{1});
                dbuserdata = tokens{2};
                isuserdata = true;
            elseif mystrfind(ident,'class[size]')
                class = tokens(2:end);
                action = 'class';
            elseif mystrfind(ident,'class')
                % A class row without the size part; its content is not read.
                if ~legacywarning
                    utils.warning('data', ...
                        ['This seems to be a legacy CSV file ', ...
                        'created in an older version of IRIS. ', ...
                        'The database may not load correctly.']);
                    legacywarning = true;
                end
                action = 'class';
            elseif any(strcmpi(ident,opt.commentrow))
                comment = tokens(2:end);
                action = 'comment';
            elseif ~isempty(strfind(lower(ident),'units'))
                action = 'do_nothing';
            elseif ~isnumeric(opt.skiprows) ...
                    && any(~cellfun(@isempty,regexp(ident,opt.skiprows)))
                % Rows skipped by matching the first cell against the
                % skip-row patterns.
                action = 'do_nothing';
            end

            if isempty(action) && ~isempty(ident)
                % Unrecognised row with a non-empty first cell: the data part
                % begins here. The row is left in `text` for the data reader.
                isdate = true;
            else
                domovetonexteol();
            end

        end

        function domovetonexteol()
            % Consume the current row: blank out its EOL so that the next
            % search finds the following one, and move `start` past it. With
            % no EOL left, the text is exhausted.
            if ~isempty(eol)
                text(eol) = ' ';
                start = eol + 1;
            else
                text = '';
            end
        end

        function flag = dochknamerow()
            % True if the current row is the row with variable names: either
            % the row whose number equals a numeric `leadingrow`, or the first
            % row whose first cell equals a char `leadingrow`. Only one name
            % row is ever accepted.
            if namedone
                flag = false;
                return
            end
            if isnumeric(opt.leadingrow)
                flag = count == opt.leadingrow;
            else
                flag = any(strcmpi(ident,opt.leadingrow));
            end
        end

    end
% doreadheaders().

%**************************************************************************
    function doreadnumericdata()
        % doreadnumericdata  Parse the data part of the CSV text.
        %
        % Assigns the variables shared with the enclosing function: `data`
        % (numeric matrix returned by `textscan`), `missing` (placeholder
        % value used for empty cells), `dates` (converted dates with NaN
        % entries removed) and `nandate` (flags of rows whose date is NaN).
        % Also lower-cases `text`, strips blanks from it, and normalises
        % `opt.nan`.

        % The date strings are whatever precedes the first comma on each
        % line; they are extracted before `text` is modified below.
        firstcol = strtrim(regexp(text,'^[^,\n]*','match','lineanchors'));
        % Some programs save text cells wrapped in single or double quotes;
        % strip one leading and one trailing quote character.
        firstcol = regexprep(firstcol,'^["'']','');
        firstcol = regexprep(firstcol,'["'']$','');

        % Lower-case the text and remove all blanks, then replace the
        % user-supplied NaN string (which must not contain commas) with 'NaN'.
        text = strrep(lower(text),' ','');
        opt.nan = strtrim(lower(opt.nan));
        % A user-defined NaN string could in theory clash with a date string;
        % this is considered unlikely and is not resolved. Only one NaN string
        % is supported because `strrep` cannot detect word boundaries. Quoted
        % occurrences are handled first in either case.
        if ~strcmp(opt.nan,'nan')
            text = strrep(text,['"',opt.nan,'"'],'NaN');
            text = strrep(text,opt.nan,'NaN');
        else
            text = strrep(text,'"nan"','nan');
        end
        % Empty character cells and date highlights become numeric NaNs.
        text = strrep(text,'""','NaN');
        text = strrep(text,'"***"','NaN');

        % Read the numeric data. Empty cells are filled with the placeholder
        % `missing` and turned into NaN or NaN+NaNi later, depending on
        % whether the rest of the table contains complex numbers.
        missing = pi()*eps();
        parsed = textscan(text,'',-1, ...
            'delimiter',',','whiteSpace',sprintf(' \b\r\t'), ...
            'headerLines',0,'headerColumns',1,'emptyValue',missing, ...
            'commentStyle','matlab','collectOutput',true);
        data = parsed{1};

        % Never keep more date strings than there are data rows.
        nrow = size(data,1);
        if length(firstcol) > nrow
            firstcol = firstcol(1:nrow);
        end

        % Convert the non-empty date strings; rows with empty dates stay NaN.
        hasdate = ~cellfun(@isempty,firstcol);
        dates = nan(1,length(firstcol));
        if any(hasdate)
            datetext = firstcol(hasdate);
            if strcmpi(opt.freq,'daily')
                dates(hasdate) = datenum(datetext,lower(opt.dateformat));
            else
                dates(hasdate) = str2dat(datetext, ...
                    'dateformat',opt.dateformat, ...
                    'freq',opt.freq, ...
                    'freqletters',opt.freqletters);
            end
        end

        % Exclude NaN dates (this includes empty dates), but keep all data
        % rows because of non-tseries data.
        nandate = isnan(dates);
        dates = dates(~nandate);

        % Check for mixed frequencies.
        if ~isempty(dates) && ~strcmpi(opt.freq,'daily')
            freqlist = datfreq(dates);
            if any(freqlist ~= freqlist(1))
                utils.error('data', ...
                    'Dates in CSV database ''%s'' have mixed frequencies.', ...
                    fname);
            end
        end
    end
% doreadnumericdata().

%**************************************************************************
    function dopopulatedatabase()
        % dopopulatedatabase  Create database entries from the parsed columns.
        %
        % Walks through the data columns from left to right; `count` (shared
        % with the enclosing function) is the number of columns already
        % assigned. Each named entry takes as many columns as its class/size
        % specification requires and is stored in the output database `d`
        % either as a tseries object or as a plain numeric array.
        seriesuserdatalist = fieldnames(seriesuserdata);
        nseriesuserdata = length(seriesuserdatalist);
        while count < nname
            thisname = name{count+1};
            if nseriesuserdata > 0
                doseriesuserdata();
            end
            if isempty(thisname)
                % Skip columns with empty names.
                count = count + 1;
                continue
            end
            % By default an entry occupies one column. This is what is used to
            % advance `count` when the entry is neither a tseries nor a
            % numeric array with a size specification; without it, `ncol`
            % would be undefined or left over from the previous entry.
            ncol = 1;
            % Class specification looks like `class[size]`; the size part is
            % optional.
            tokens = regexp(class{count+1},'^(\w+)(\[.*\])?','tokens','once');
            if isempty(tokens)
                thisclass = '';
                tmpsize = [];
            else
                thisclass = lower(tokens{1});
                tmpsize = xxgetsize(tokens{2});
            end
            % Entries with no class specification are tseries.
            if isempty(thisclass)
                thisclass = 'tseries';
            end
            if strcmp(thisclass,'tseries')
                % Tseries data.
                if isempty(tmpsize)
                    tmpsize = [Inf,1];
                end
                % Number of CSV columns taken up by this entry; the first
                % dimension (time) is not part of it.
                ncol = prod(tmpsize(2:end));
                if ~isempty(data)
                    % Use a complex NaN filler if the data are complex.
                    if isreal(data(~nandate,count+(1:ncol)))
                        unit = 1;
                    else
                        unit = 1 + 1i;
                    end
                    % Place the rows with valid dates at their positions
                    % within the overall date range; replace the placeholder
                    % for empty cells with NaNs.
                    thisdata = nan(nper,ncol)*unit;
                    thisdata(dateindex,:) = data(~nandate,count+(1:ncol));
                    thisdata(thisdata == missing) = NaN*unit;
                    thisdata = reshape(thisdata,[nper,tmpsize(2:end)]);
                    thiscomment = reshape(comment(count+(1:ncol)),[1,tmpsize(2:end)]);
                    % d.(thisName) = tseries(dates,thisData,thisComment);
                    d.(thisname) = template;
                    d.(thisname).start = mindate;
                    d.(thisname).data = thisdata;
                    d.(thisname).Comment = thiscomment;
                    d.(thisname) = mytrim(d.(thisname));
                else
                    % Create an empty tseries object with proper 2nd and higher
                    % dimensions. The sizes are passed as one vector so that
                    % entries with more than two dimensions work as well.
                    d.(thisname) = template;
                    d.(thisname).start = NaN;
                    d.(thisname).data = zeros([0,tmpsize(2:end)]);
                    d.(thisname).Comment = cell([1,tmpsize(2:end)]);
                    d.(thisname).Comment(:) = {''};
                end
                if nseriesuserdata > 0
                    d.(thisname) = userdata(d.(thisname),thisuserdata);
                end
            elseif ~isempty(tmpsize)
                % Numeric data: take the first `tmpsize(1)` rows of the
                % columns belonging to this entry.
                ncol = prod(tmpsize(2:end));
                thisdata = reshape(data(1:tmpsize(1),count+(1:ncol)),tmpsize);
                thisdata(thisdata == missing) = NaN;
                % Convert to the right numeric class.
                f = str2func(thisclass);
                d.(thisname) = f(thisdata);
            end
            count = count + ncol;
        end

        function doseriesuserdata()
            % Collect the userdata of the column at position `count+1` into
            % the struct `thisuserdata`; fields with no cell for this column
            % are set to an empty string.
            thisuserdata = struct();
            for ii = 1 : nseriesuserdata
                try
                    thisuserdata.(seriesuserdatalist{ii}) = ...
                        seriesuserdata.(seriesuserdatalist{ii}){count+1};
                catch %#ok<CTCH>
                    thisuserdata.(seriesuserdatalist{ii}) = '';
                end
            end
        end

    end
% dopopulatedatabase().

end

% Subfunctions.

%**************************************************************************
function s = xxgetsize(c)
% xxgetsize  Convert a bracketed size string into a row vector of sizes.
%
% Two notations are accepted:
%
% * new style, `[1-by-1-by-1]`;
% * old style, `[1][1][1]`.

% Drop the outermost brackets, then turn the old style into the new one.
inner = c(2:end-1);
inner = strrep(inner,'][','-by-');

% Read the numbers and return them as a row vector.
s = reshape(sscanf(inner,'%g-by-'),1,[]);

end

%**************************************************************************
function name = xxgetuserdatafieldname(c)
% xxgetuserdatafieldname  Extract the text enclosed in the first pair of
% square brackets in `c`; return an empty string if there is none.

name = '';
found = regexp(c,'\[([^\]]+)\]','once','tokens');
if ~isempty(found)
    name = found{1};
end

end