INI-Tax-Analyzer: prep.py

File data/prep.py from the latest check-in

"""
Prepare CSV-formatted files suitable for use by INI-Tax-Analyzer from
publicly available files in the GitHub TPRU-India/taxcalc repository.

USAGE: execute this script in the INI-Tax-Analyzer/data directory as follows:
(tafe) data% python prep.py
"""

import os
import sys
import numpy as np
import pandas as pd
import taf  # Tax-Analyzer-Framework


DATA_YEAR = 2017

RAW_DATA_FILEPATH = os.path.join('.', 'pitSmallData.csv')
RAW_WGHT_FILEPATH = os.path.join('.', 'pit_weightsSD.csv')
CODE_PATH = os.path.join('..', 'initaxanalyzer')
INPUT_DATA_FILEPATH = os.path.join(CODE_PATH, 'tax.csv')
INPUT_WGHT_FILEPATH = os.path.join(CODE_PATH, 'tax_weights.csv')

DATA_VARIABLES = {
    # raw_name: input_name
    # 'FILING_SEQ_NO': 'FILING_SEQ_NO',  # rely on recid
    'AGE': 'age',
    'AGEGRP': 'agegrp',
    # 'FORM_ID': 'FORM_ID',  # non-numeric variable
    # 'ASSESSMENT_YEAR': 'ASSESSMENT_YEAR',  # rely on year
    'SALARIES': 'salary',
    'INCOME_HP': 'income_hp',
    'PRFT_GAIN_BP_OTHR_SPECLTV_BUS': 'prft_gain_bp_othr_specltv_bus',
    'PRFT_GAIN_BP_SPECLTV_BUS': 'prft_gain_bp_specltv_bus',
    'PRFT_GAIN_BP_SPCFD_BUS': 'prft_gain_bp_spcfd_bus',
    'PRFT_GAIN_BP_INC_115BBF': 'prft_gain_bp_inc_115bbf',
    'TOTAL_PROFTS_GAINS_BP': 'total_profts_gains_bp',
    'ST_CG_AMT_1': 'stcg_amt_1',
    'ST_CG_AMT_2': 'stcg_amt_2',
    'ST_CG_AMT_APPRATE': 'stcg_amt_3',
    # 'TOTAL_SCTG': 'TOTAL_SCTG',  # not in code
    'LT_CG_AMT_1': 'ltcg_amt_1',
    'LT_CG_AMT_2': 'ltcg_amt_2',
    # 'TOTAL_LTCG': 'TOTAL_LTCG',  # not in code
    # 'TOTAL_CAP_GAIN'  # not in code
    # 'INCOME_OS_NOT_RACEHORSE'  # not in code
    # 'INC_CHARBLE_SPL_RATE'  # not in code
    # 'INCOME_OS_RACEHORSE'  # not in code
    'TOTAL_INCOME_OS': 'income_os',
    # 'GTI_BEFORE_LOSSES': 'GTI_BEFORE_LOSSES',  # not in code
    'CYL_SET_OFF': 'cyl_set_off',
    # 'BAL_AFTR_CYL_SET_OFF'  # not in code
    'BFL_SET_OFF_BALANCE': 'bfl_set_off_balance',
    # 'GROSS_TOTAL_INCOME'  # not in code
    # 'INC_CHG_TAX_SPL_RATES'  # not in code
    # 'DEDUCT_SEC_10A_OR_10AA'  # no non-zero observations in raw data
    # 'PARTB_CHAPTER_VIA'  # not in code
    # 'PARTC_CHAPTER_VIA'   # not in code
    'TOTAL_DEDUC_VIA': 'itemized_deductions',
    # 'TOTAL_INCOME'  # not in code
    # 'INC_SPL_111A_112'  # not in code
    'NET_AGRC_INCOME': 'net_agrc_income'
    # 'AGGREGATE_INCOME'  # not in code
    # 'CY_LOSS_CARRYFWD'  # not in code
    # 'TAX_PAYBLE_TI_SEC115JC'  # not in code
    # 'SURCHARGE_ON_DEEMED_TI'  # not in code
    # 'EDUCATION_CESS_SEC115JC'  # not in code
    # 'TOTAL_TAX_PAYBL_SEC115JC'  # not in code
    # 'TAX_NORMAL_RATE'  # not in code
    # 'TAX_SI_RATE'  # not in code
    # 'REBATE_AGRC_INCOME'  # not in code
    # 'TAX_PAYABL_TOT_INCOM'  # not in code
    # 'REBATE_87A'  # not in code
    # 'TAX_PAYABLE_ON_REBATE'  # not in code
    # 'SURCHARGE_25OFSI'  # not in code
    # 'SURCHARGE_ON_ABOVE_CRORE'  # not in code
    # 'TOTAL_SURCHARGE'  # not in code
    # 'EDUCATION_CESS'  # not in code
    # 'GROSS_TAX_LIABILITY'  # not in code
    # 'GROSS_TAX_PAYABLE'  # not in code
    # 'CRDEDIT_US115JD_EY'  # not in code
    # 'TAX_PAYBLE_AFTR_CRDT_SEC115JD'  # not in code
    # 'RELIEF_89'  # not in code
    # 'RELIEF_90'  # not in code
    # 'RELIEF_91'  # not in code
    # 'TOTAL_RELIEF'  # not in code
    # 'NET_TAX_LIABILTY'  # not in code
    # 'INT_PAYABLE_234A'  # not in code
    # 'INT_PAYABLE_234B'  # not in code
    # 'INT_PAYABLE_234C'  # not in code
    # 'TOTAL_INT_PAYABLE'  # not in code
    # 'AGGREGATE_LIABILTY'  # not in code
    # 'ADVANCE_TAX'  # not in code
    # 'TOTAL_TDS'  # not in code
    # 'TOTAL_TCS'  # not in code
    # 'SELF_ASSESMNT_TAX'  # not in code
    # 'TOTAL_TAX_PAID'  # not in code
    # 'TAX_PAYBL'  # not in code
    # 'REFUND_AMOUNT'  # not in code
    # 'position'  # not in code
    # 'decile_group'  # not in code
}


def main():
    """
    High-level logic of the script.
    """
    print('executing data/prep.py script ...')

    # read raw files
    raw_data = pd.DataFrame(pd.read_csv(RAW_DATA_FILEPATH))
    raw_wght = pd.DataFrame(pd.read_csv(RAW_WGHT_FILEPATH))

    # extract variables of interest from raw data
    num = raw_data.shape[0]
    recid = np.array(range(1, num+1), dtype='int')
    startyear = np.ones(num, dtype='int') * DATA_YEAR
    weight = np.ones(num, dtype='int')
    data = pd.DataFrame({
        'recid': recid,
        'year': startyear,
        'weight': weight
    })
    for rvar, ivar in DATA_VARIABLES.items():
        data[ivar] = raw_data[rvar]
    data['salary'] += 30000  # to make it into gross salary

    # write data to INPUT_DATA_FILEPATH as CSV-formatted file
    taf.df2csv(data, INPUT_DATA_FILEPATH, float_format='%.0f')

    # extract weights of interest from raw weights
    wght = raw_wght[['WT2017']].copy()

    # write wght to INPUT_WGHT_FILEPATH as CSV-formatted file
    taf.df2csv(wght, INPUT_WGHT_FILEPATH, float_format='%.13f')

    return 0


if __name__ == '__main__':
    sys.exit(main())