#!/usr/bin/python3
'''
Import flu data from CDC

GPL copyleft jc@unternet.net

Note the strange week numbering in National_Custom_Data*.csv from
cdc.gov/flu/weekly compared to the more readable dates on the covid-19
data. The Rosetta stone was provided by a recent cdc.gov/flu/weekly
headline, "Key Updates for Week 16, ending April 18, 2020".

So it seems week 40 of a "flu year" starts in autumn of the year before,
counts up to 52, then counts from week 1 in January to week 39 in this
year's autumn.

However, since there are only 364 days in a year of 52 7-day weeks, there
will be a "week 53" every now and then. That occurred in the 2014-15 season,
in this data set. There are at least 3 logical ways to deal with it, that I
can see: ignore it; distribute it evenly over 7 years; or do as I. Ratel
did and show a 53-week year, with week 53 in the week 1 slot and bumping
all the rest over to the right.

See //accordingtohoyt.com/2020/03/27/covid-19-and-us-mortality-by-i-ratel/

Some weekly counts are still incomplete (see the PERCENT COMPLETE column).
With a multi-year graph I can't do what I did with the Covid-19 data chart,
showing the percentage as an additional line on the chart, because I would
need one for each year. So I see two rational choices: ignore it, as Ratel
apparently did, or, I believe better, multiply the current count by the
inverse of the percentage, assuming the data yet to come in will be similar.
'''

import sys, os, csv, re, logging  # pylint: disable=multiple-imports
from collections import defaultdict

logging.basicConfig(level=logging.DEBUG if __debug__ else logging.INFO)

class Null:  # pylint: disable=too-few-public-methods
    '''
    Placeholder object whose text form is the Javascript literal null

    Formatting a Python list that contains an instance therefore yields
    text that Javascript can read back as an array holding null.
    '''
    def __str__(self):
        'Text of the Javascript null literal'
        return 'null'
    # lists format their members with repr(), so it must give the same text
    __repr__ = __str__

# single shared instance used wherever a value is missing
NULL = Null()

# Column names, in order, that the first row of the CDC CSV must contain
HEADERS = [
    'AREA', 'SUB AREA', 'AGE GROUP', 'SEASON', 'WEEK', 'THRESHOLD',
    'BASELINE', 'PERCENT P&I', 'NUM INFLUENZA DEATHS',
    'NUM PNEUMONIA DEATHS', 'TOTAL DEATHS', 'PERCENT COMPLETE',
]
# Columns extracted from each selected row, in this order
WANTED = ['SEASON', 'WEEK', 'TOTAL DEATHS', 'PERCENT COMPLETE']
# Values the first three columns of a row must have for it to be selected
CHOSEN = ['National', '', 'All']  # all rows should match
# Order of week numbers within a flu season: 40-52, 1-39 as explained
WEEKS = [*range(40, 53), *range(1, 40)]

def dataimport(rawdata):
    '''
    Process poorly formatted CSV data from CDC

    rawdata is either a list of CSV lines, or the name of an existing file
    from which the lines are read. The first row must equal HEADERS. Of the
    remaining rows, those whose first three columns equal CHOSEN are kept,
    and the WANTED columns of each are converted with numberclean().

    Returns a list with one list per kept row, holding the converted WANTED
    values in WANTED order, rows in the same order as the input.

    Raises ValueError if the input has no rows at all, if the header row
    differs from HEADERS, or if no row matches CHOSEN.
    '''
    if not isinstance(rawdata, list) and os.path.exists(rawdata):
        # newline='' leaves line endings for the csv module to interpret
        with open(rawdata, newline='') as infile:
            rawdata = infile.readlines()
    rows = list(csv.reader(rawdata))
    if not rows:
        # without this, popping the header row would raise a bare IndexError
        raise ValueError('No header row found in new CDC data')
    headers = rows.pop(0)
    if headers != HEADERS:
        raise ValueError('Mismatching headers in new CDC data {} vs. {}'.format(
            headers, HEADERS))
    selected = [dict(zip(headers, row)) for row in rows if row[:3] == CHOSEN]
    if not selected:
        # there may be no data rows at all, so only peek at one if it exists
        found = rows[0][:3] if rows else None
        raise ValueError('No data matching {}, found {}'.
                         format(CHOSEN, found))
    return [[numberclean(row, item) for item in WANTED] for row in selected]

def numberclean(data, name):
    '''
    Render the CDC numbers into something parseable by Javascript

    data maps column names to the raw strings of one CSV row, and name is
    the column to convert:

    * names starting with PERCENT: returned as float, after dropping any
      leading text that is neither digit nor '.', and the trailing '%'
    * names containing DEATH: returned as int, with ',' separators removed
    * WEEK: returned as int
    * anything else: returned unchanged

    Raises TypeError if a percent value does not have the expected form or
    its numeric part is not a valid number.
    '''
    item = data[name]
    if name.startswith('PERCENT'):
        matched = re.match('^[^0-9.]*([0-9.]+)%$', item)
        try:
            # matched is None when the pattern fails (AttributeError), and
            # text such as '1.2.3' passes the pattern but not float()
            item = float(matched.group(1))
        except (AttributeError, ValueError) as failed:
            raise TypeError(
                'Unexpected percent {}'.format(data[name])) from failed
    elif 'DEATH' in name:
        item = int(item.replace(',', ''))
    elif name == 'WEEK':
        item = int(item)
    return item

def convert(infile, outfile):
    '''
    Convert CSV data to JavaScript

    infile is handed to dataimport(). outfile is either an object with a
    write attribute, which is written to and left open, or a file name,
    which is opened for writing and closed again when done.

    Two Javascript statements are written: `const legend`, the sorted list
    of seasons, and `const fluData`, 53 rows each holding a week label from
    getweek() followed by one death count per season in legend order.

    Raises ValueError if the weeks of a season are not in WEEKS order.
    '''
    cleaned = dataimport(infile)
    weeks = defaultdict(list)
    seasons = defaultdict(list)
    for season, week, deaths, percent in cleaned:
        if percent < 100:
            # incomplete data should probably be corrected somehow;
            # for now scale the count up to the estimated complete total
            deaths *= (100.0 / percent)
        if week != 53:  # this is a known problem, ignore it for now
            weeks[season].append(week)
        # weeks seen so far must be a prefix of the expected season order
        if weeks[season] != WEEKS[:len(weeks[season])]:
            raise ValueError('Week out of order: {}'.format(weeks[season]))
        # the count is stored even for week 53, which takes a slot of its own
        seasons[season].append(deaths)
    legend = sorted(seasons)
    # pad out missing data
    for season in legend:
        seasons[season].extend([NULL] * (53 - len(seasons[season])))
    fludata = [[getweek(index)] + [seasons[season][index] for season in legend]
               for index in range(53)]
    statements = (
        'const legend = {};'.format(legend),
        'const fluData = {};'.format(fludata),
    )
    if hasattr(outfile, 'write'):
        # caller owns this stream (e.g. sys.stdout), so don't close it
        for statement in statements:
            print(statement, file=outfile)
    else:
        # we opened it, so make sure it is flushed and closed
        with open(outfile, 'w') as stream:
            for statement in statements:
                print(statement, file=stream)

def getweek(index):
    '''
    Return week as a string that Google charts will sort into correct order

    index is a position in WEEKS. The week number found there is formatted
    with a leading blank; a position past the end of WEEKS gives ' '.
    '''
    try:
        return '% 2d' % WEEKS[index]  # works on 1-52
    except IndexError:
        # no such slot in WEEKS, e.g. the extra slot of a 53-row table
        return ' '

if __name__ == '__main__':
    # With stdout appended, a lone CSVNAME argument still gives 3 entries
    sys.argv.append(sys.stdout)  # in case output file wasn't specified
    if len(sys.argv) < 3:
        # not even CSVNAME was given
        print('Usage: {} CSVNAME, JSNAME'.format(sys.argv[0]),
              file=sys.stderr)
    else:
        # pylint: disable=unbalanced-tuple-unpacking
        COMMAND, CSVNAME, JSNAME = sys.argv[:3]
        convert(CSVNAME, JSNAME)