# MYI-Tax-Analyzer: preprep.py
#
# File data/preprep.py from the latest check-in

"""
Pre-prepare CSV-formatted files delivered by MOF on 2020-12-03 by reading
them from the MYI-data/Dec03 directory, adding a late variable (1 for late
returns, 0 for on-time returns), merging the late and on-time files for each
form, and writing the merged file to the data directory.

USAGE: execute this script in the MYI-Tax-Analyzer/data directory as follows:
$ python preprep.py
"""

import os
import sys
import pandas as pd
import taf  # Tax Analyzer Framework


# value that the Tahun column of every raw-data row is checked against
DATA_YEAR = 2018

# directory from which the raw MOF CSV files are read
RAW_DATA_PATH = os.path.join(os.pardir, os.pardir, 'MYI-data', 'Dec03')

# form kinds (the same two forms the __main__ block processes)
KINDS = ['B', 'BE']

# raw-data columns dropped from each form's dataframe, keyed by form
DROP_VARS = {
    'B': [
        'A1',
        'A3',
        'A5',
        'A8a_S127_3B (S127_3B)',
        'A8a_S127_3A (S127_3A)',
        'A8a_S127_NA (S127_NA)',
        'J1',
        'J2',
        'J2a',
        'J8',
    ],
    'BE': [
        'A1',
        'A3',
        'A5',
        'A7a_S127_3B',
        'A7a_S127_3A',
        'A7a_S127_NA',
    ],
}

# directory into which the merged CSV files are written
OUT_DATA_PATH = os.curdir


def main(form=None):
    """
    Read, clean, tag, merge, and write the MOF raw data for one form.

    For the given form, read the '{form}18-late.csv' and
    '{form}18-ontime.csv' files from RAW_DATA_PATH, replace blank fields
    with zero, check that every row has Tahun equal to DATA_YEAR, drop
    the columns listed in DROP_VARS[form], add a late variable (1 for the
    late file, 0 for the ontime file), concatenate the two dataframes,
    and write the result to '{form}18.csv' in OUT_DATA_PATH.

    Parameters
    ----------
    form : str
        Form whose files are processed; must be a key of DROP_VARS.

    Returns
    -------
    int
        Always 0; problems are reported by raising an exception.

    Raises
    ------
    ValueError
        If form is not a key of DROP_VARS, or if any row of a raw file
        has a Tahun value different from DATA_YEAR.
    """
    # fail early with a clear message rather than with a missing-file
    # error on a name like 'None18-late.csv' when form is omitted/unknown
    if form not in DROP_VARS:
        raise ValueError(
            f'form must be one of {sorted(DROP_VARS)}, got {form!r}'
        )
    print(f'data/preprep.py executing for form {form} ...')

    # read MOF raw data files into dataframes
    idf = {}
    for kind in ['late', 'ontime']:
        filename = f'{form}18-{kind}.csv'
        filepath = os.path.join(RAW_DATA_PATH, filename)
        idf[kind] = pd.read_csv(filepath,
                                # some files contain Unicode
                                encoding='unicode_escape',
                                # some columns have mixed data types
                                low_memory=False)
        idf[kind].fillna(0, inplace=True)  # replace all blank fields with zero
        # explicit raise (not assert) so the check still runs under python -O
        if not (idf[kind].Tahun == DATA_YEAR).all():
            raise ValueError(
                f'{filepath} contains rows with Tahun != {DATA_YEAR}'
            )
        print(f': {kind:6s} returns= {idf[kind].shape[0]:7d}')
        # drop unneeded variables
        idf[kind].drop(columns=DROP_VARS[form], inplace=True)

    # add late variable to each dataframe in the idf dictionary
    idf['late']['late'] = 1
    idf['ontime']['late'] = 0

    # merge the late and ontime dataframes (late rows first, because the
    # idf dictionary was filled in that order); ignore_index=True discards
    # the dictionary keys and renumbers the rows
    odf = pd.concat(idf, ignore_index=True, copy=False)
    print(f': concat returns= {odf.shape[0]:7d}')
    # write merged dataframe to CSV file
    filepath = os.path.join(OUT_DATA_PATH, f'{form}18.csv')
    taf.df2csv(odf, filepath)

    return 0


if __name__ == '__main__':
    # process each form in turn, stopping with exit status 1 as soon as
    # main reports a nonzero return code
    for form_name in ('B', 'BE'):
        status = main(form=form_name)
        if status != 0:
            sys.exit(1)
    # every form was processed successfully
    sys.exit(0)