"""
Prepare CSV-formatted files suitable for use by INI-Tax-Analyzer from
publicly available files in the GitHub TPRU-India/taxcalc repository.
USAGE: execute this script in the INI-Tax-Analyzer/data directory as follows:
(tafe) data% python prep.py
"""
import os
import sys
import numpy as np
import pandas as pd
import taf # Tax-Analyzer-Framework
DATA_YEAR = 2017
RAW_DATA_FILEPATH = os.path.join('.', 'pitSmallData.csv')
RAW_WGHT_FILEPATH = os.path.join('.', 'pit_weightsSD.csv')
CODE_PATH = os.path.join('..', 'initaxanalyzer')
INPUT_DATA_FILEPATH = os.path.join(CODE_PATH, 'tax.csv')
INPUT_WGHT_FILEPATH = os.path.join(CODE_PATH, 'tax_weights.csv')
DATA_VARIABLES = {
# raw_name: input_name
# 'FILING_SEQ_NO': 'FILING_SEQ_NO', # rely on recid
'AGE': 'age',
'AGEGRP': 'agegrp',
# 'FORM_ID': 'FORM_ID', # non-numeric variable
# 'ASSESSMENT_YEAR': 'ASSESSMENT_YEAR', # rely on year
'SALARIES': 'salary',
'INCOME_HP': 'income_hp',
'PRFT_GAIN_BP_OTHR_SPECLTV_BUS': 'prft_gain_bp_othr_specltv_bus',
'PRFT_GAIN_BP_SPECLTV_BUS': 'prft_gain_bp_specltv_bus',
'PRFT_GAIN_BP_SPCFD_BUS': 'prft_gain_bp_spcfd_bus',
'PRFT_GAIN_BP_INC_115BBF': 'prft_gain_bp_inc_115bbf',
'TOTAL_PROFTS_GAINS_BP': 'total_profts_gains_bp',
'ST_CG_AMT_1': 'stcg_amt_1',
'ST_CG_AMT_2': 'stcg_amt_2',
'ST_CG_AMT_APPRATE': 'stcg_amt_3',
# 'TOTAL_SCTG': 'TOTAL_SCTG', # not in code
'LT_CG_AMT_1': 'ltcg_amt_1',
'LT_CG_AMT_2': 'ltcg_amt_2',
# 'TOTAL_LTCG': 'TOTAL_LTCG', # not in code
# 'TOTAL_CAP_GAIN' # not in code
# 'INCOME_OS_NOT_RACEHORSE' # not in code
# 'INC_CHARBLE_SPL_RATE' # not in code
# 'INCOME_OS_RACEHORSE' # not in code
'TOTAL_INCOME_OS': 'income_os',
# 'GTI_BEFORE_LOSSES': 'GTI_BEFORE_LOSSES', # not in code
'CYL_SET_OFF': 'cyl_set_off',
# 'BAL_AFTR_CYL_SET_OFF' # not in code
'BFL_SET_OFF_BALANCE': 'bfl_set_off_balance',
# 'GROSS_TOTAL_INCOME' # not in code
# 'INC_CHG_TAX_SPL_RATES' # not in code
# 'DEDUCT_SEC_10A_OR_10AA' # no non-zero observations in raw data
# 'PARTB_CHAPTER_VIA' # not in code
# 'PARTC_CHAPTER_VIA' # not in code
'TOTAL_DEDUC_VIA': 'itemized_deductions',
# 'TOTAL_INCOME' # not in code
# 'INC_SPL_111A_112' # not in code
'NET_AGRC_INCOME': 'net_agrc_income'
# 'AGGREGATE_INCOME' # not in code
# 'CY_LOSS_CARRYFWD' # not in code
# 'TAX_PAYBLE_TI_SEC115JC' # not in code
# 'SURCHARGE_ON_DEEMED_TI' # not in code
# 'EDUCATION_CESS_SEC115JC' # not in code
# 'TOTAL_TAX_PAYBL_SEC115JC' # not in code
# 'TAX_NORMAL_RATE' # not in code
# 'TAX_SI_RATE' # not in code
# 'REBATE_AGRC_INCOME' # not in code
# 'TAX_PAYABL_TOT_INCOM' # not in code
# 'REBATE_87A' # not in code
# 'TAX_PAYABLE_ON_REBATE' # not in code
# 'SURCHARGE_25OFSI' # not in code
# 'SURCHARGE_ON_ABOVE_CRORE' # not in code
# 'TOTAL_SURCHARGE' # not in code
# 'EDUCATION_CESS' # not in code
# 'GROSS_TAX_LIABILITY' # not in code
# 'GROSS_TAX_PAYABLE' # not in code
# 'CRDEDIT_US115JD_EY' # not in code
# 'TAX_PAYBLE_AFTR_CRDT_SEC115JD' # not in code
# 'RELIEF_89' # not in code
# 'RELIEF_90' # not in code
# 'RELIEF_91' # not in code
# 'TOTAL_RELIEF' # not in code
# 'NET_TAX_LIABILTY' # not in code
# 'INT_PAYABLE_234A' # not in code
# 'INT_PAYABLE_234B' # not in code
# 'INT_PAYABLE_234C' # not in code
# 'TOTAL_INT_PAYABLE' # not in code
# 'AGGREGATE_LIABILTY' # not in code
# 'ADVANCE_TAX' # not in code
# 'TOTAL_TDS' # not in code
# 'TOTAL_TCS' # not in code
# 'SELF_ASSESMNT_TAX' # not in code
# 'TOTAL_TAX_PAID' # not in code
# 'TAX_PAYBL' # not in code
# 'REFUND_AMOUNT' # not in code
# 'position' # not in code
# 'decile_group' # not in code
}
def main():
"""
High-level logic of the script.
"""
print('executing data/prep.py script ...')
# read raw files
raw_data = pd.DataFrame(pd.read_csv(RAW_DATA_FILEPATH))
raw_wght = pd.DataFrame(pd.read_csv(RAW_WGHT_FILEPATH))
# extract variables of interest from raw data
num = raw_data.shape[0]
recid = np.array(range(1, num+1), dtype='int')
startyear = np.ones(num, dtype='int') * DATA_YEAR
weight = np.ones(num, dtype='int')
data = pd.DataFrame({
'recid': recid,
'year': startyear,
'weight': weight
})
for rvar, ivar in DATA_VARIABLES.items():
data[ivar] = raw_data[rvar]
data['salary'] += 30000 # to make it into gross salary
# write data to INPUT_DATA_FILEPATH as CSV-formatted file
taf.df2csv(data, INPUT_DATA_FILEPATH, float_format='%.0f')
# extract weights of interest from raw weights
wght = raw_wght[['WT2017']].copy()
# write wght to INPUT_WGHT_FILEPATH as CSV-formatted file
taf.df2csv(wght, INPUT_WGHT_FILEPATH, float_format='%.13f')
return 0
if __name__ == '__main__':
sys.exit(main())