"""
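Pre-prepare CSV-formatted files delivered by MOF on 2020-12-03 by reading
them from the MYI-data/Dec03 directory, adding a late indicator variable
(1 for rows from the late files, 0 for rows from the ontime files), merging
the late and ontime files of each form, and writing each merged file to the
data directory.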
USAGE: execute this script in the MYI-Tax-Analyzer/data directory as follows:
$ python preprep.py
"""
import os
import sys
import pandas as pd
import taf # Tax Analyzer Framework
# year that every row's Tahun column must equal in the raw data files
DATA_YEAR = 2018

# directory containing the raw MOF files, relative to the data directory
RAW_DATA_PATH = os.path.join('..', '..', 'MYI-data', 'Dec03')

# form kinds handled by this script; each one is a key of DROP_VARS
KINDS = ['B', 'BE']

# raw-data columns to discard, keyed by form kind
DROP_VARS = {
    'B': [
        'A1',
        'A3',
        'A5',
        'A8a_S127_3B (S127_3B)',
        'A8a_S127_3A (S127_3A)',
        'A8a_S127_NA (S127_NA)',
        'J1',
        'J2',
        'J2a',
        'J8',
    ],
    'BE': [
        'A1',
        'A3',
        'A5',
        'A7a_S127_3B',
        'A7a_S127_3A',
        'A7a_S127_NA',
    ],
}

# directory receiving the merged files (the current working directory)
OUT_DATA_PATH = os.path.join('.')
def main(form=None) -> int:
    """
    High-level logic of the script.

    For the specified form, read the late and ontime raw CSV files from
    RAW_DATA_PATH, replace blank fields with zero, check that every row's
    Tahun value equals DATA_YEAR, drop the columns listed in
    DROP_VARS[form], add a late variable (1 for rows from the late file,
    0 for rows from the ontime file), concatenate the two dataframes, and
    write the result to OUT_DATA_PATH using taf.df2csv.

    Parameters
    ----------
    form : str
        Form kind; must be a key of DROP_VARS.  The default of None is
        not a valid form and is rejected.

    Returns
    -------
    int
        Zero on success.

    Raises
    ------
    ValueError
        If form is not a key of DROP_VARS, or if a raw data file contains
        a row whose Tahun value differs from DATA_YEAR.
    """
    # fail fast with a clear message instead of a missing-file error for
    # a name like 'None18-late.csv'
    if form not in DROP_VARS:
        valid_forms = ', '.join(sorted(DROP_VARS))
        raise ValueError(
            f'form must be one of: {valid_forms}; got {form!r}'
        )
    print(f'data/preprep.py executing for form {form} ...')
    # read MOF raw data files into dataframes
    idf = {}
    for kind in ['late', 'ontime']:
        # NOTE(review): the '18' in the file names presumably corresponds
        # to DATA_YEAR -- confirm before changing DATA_YEAR
        filename = f'{form}18-{kind}.csv'
        filepath = os.path.join(RAW_DATA_PATH, filename)
        idf[kind] = pd.read_csv(filepath,
                                # some files contain Unicode
                                encoding='unicode_escape',
                                # some columns have mixed data types
                                low_memory=False)
        idf[kind].fillna(0, inplace=True)  # replace all blank fields with zero
        # explicit check rather than assert, so that the validation is not
        # stripped when Python runs with the -O option
        if not (idf[kind].Tahun == DATA_YEAR).all():
            raise ValueError(
                f'{filepath} contains rows with Tahun not equal to '
                f'{DATA_YEAR}'
            )
        print(f': {kind:6s} returns= {idf[kind].shape[0]:7d}')
        # drop unneeded variables
        idf[kind].drop(columns=DROP_VARS[form], inplace=True)
    # add late variable to each dataframe in the idf dictionary
    idf['late']['late'] = 1
    idf['ontime']['late'] = 0
    # merge the late and ontime dataframes
    odf = pd.concat(idf, ignore_index=True, copy=False)
    print(f': concat returns= {odf.shape[0]:7d}')
    # write merged dataframe to CSV file
    filepath = os.path.join(OUT_DATA_PATH, f'{form}18.csv')
    taf.df2csv(odf, filepath)
    return 0
if __name__ == '__main__':
    # process each form kind listed in KINDS in order, exiting with
    # status 1 as soon as main() returns a nonzero value
    for form_kind in KINDS:
        if main(form=form_kind) != 0:
            sys.exit(1)
    sys.exit(0)