Tax-Analyzer-Framework

test4.py
Login

File taf/dptests/test4.py from the latest check-in


"""
Test IncomeUnderReporting class in a situation where tax data contain
income values while survey data contain income values that are capped
at a known level.  The objective is to use the tax incomes to impute
survey incomes at or above the cap level.

IMPORTANT NOTE: This is exactly the same data-preparation situation as
examined in the test1.py script.  The only difference between test1.py
and this test4.py script is the method of dealing with the situation.
In test1.py the MICE class is used; here the IncomeUnderReporting class
is used to solve the data-preparation problem.
"""

import sys
import time
import numpy as np
import pandas as pd
from taf import IncomeUnderReporting

# optional debugging output switches:
CSV_WRITE = False  # True ==> main() also dumps its data frames to CSV files
TIME_IUR = False  # True ==> main() prints IncomeUnderReporting elapsed time

# dPlN income distribution parameters, used by main() as follows:
#   income = exp(MEAN + EDUC*(educ-15) + SDEV*normal
#                + exponential_a/ALPHA - exponential_b/BETA)
# tax data sample:
SEED_TD = 123456789
SIZE_TD = 10**6  # number of observations in tax data
# survey data sample:
SEED_SD = 987654321
SIZE_SD = 10**5  # number of observations in survey data
# distribution shape:
MEAN = 9.0
EDUC = 0.4
SDEV = 1.0
ALPHA = 2.0
BETA = float('inf')  # infinite BETA makes the exponential_b term zero
# incomes at or above this level are capped in the survey data:
CAP_LEVEL = 200_000.0

# percentiles shown in the DataFrame.describe output:
FRACTILES = [
    0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99,
]

# random number seed passed to the IncomeUnderReporting class:
IUR_SEED = 678912345


def _simulate_incomes(seed, size):
    """
    Return (income, educ) integer arrays, each of length size, drawn
    using a new random number generator initialized with seed.
    """
    rng = np.random.default_rng(seed=seed)
    # the order of the following four draws determines the random
    # stream, so it must not be changed if results are to be reproduced
    norm = rng.normal(size=size)
    expa = rng.exponential(size=size)
    expb = rng.exponential(size=size)
    educ = rng.integers(11, 20+1, size=size)
    income = np.exp(
        MEAN + EDUC * (educ - 15) + norm*SDEV + expa/ALPHA - expb/BETA
    ).astype(int)
    return income, educ


def main():
    """
    High-level logic.

    Simulates a tax sample and a survey sample, caps survey incomes at
    CAP_LEVEL, calls the IncomeUnderReporting.adjust method on the
    capped survey data, and prints descriptive statistics for each
    data set.  When CSV_WRITE is True, each data set is also written
    to its own CSV file in the current directory.

    Returns zero, which is used as the script's exit code.
    """
    # ----------------------------------------------------------------
    # (*) construct high-income tax data
    # ----------------------------------------------------------------
    # construct tax data set containing only incomes at or above CAP_LEVEL
    income, educ = _simulate_incomes(SEED_TD, SIZE_TD)
    high = income >= CAP_LEVEL
    hitax_df = pd.DataFrame({
        'inc': income[high],
        'educ': educ[high]
    })
    if CSV_WRITE:
        # use a file name of its own so that this dump is not overwritten
        # by the test4_tax.csv file written in the next section
        hitax_df.to_csv('test4_hitax.csv', index=False)
    print('tdf:\n', hitax_df.describe(percentiles=FRACTILES))
    # NOTE(review): hitax_df is only described (and optionally dumped);
    # it is not passed to the IncomeUnderReporting.adjust call below,
    # which receives the uncapped survey incomes instead -- confirm
    # that this is the intended test design
    # ----------------------------------------------------------------
    # (*) construct survey data
    # ----------------------------------------------------------------
    # construct survey data set with complete (uncapped) information
    income, educ = _simulate_incomes(SEED_SD, SIZE_SD)
    tdf = pd.DataFrame({
        'inc': income,
        'educ': educ
    })
    df_columns = list(tdf)
    if CSV_WRITE:
        tdf.to_csv('test4_tax.csv', index=False)
    print('TAX df:\n', tdf.describe(percentiles=FRACTILES))
    # ----------------------------------------------------------------
    # (*) construct survey data for input to IncUndRep.adjust method
    # ----------------------------------------------------------------
    # construct capped survey data set
    sdf = tdf.copy()
    capped = sdf.inc >= CAP_LEVEL
    sdf.loc[capped, 'inc'] = CAP_LEVEL
    print('OBS df:\n', sdf.describe(percentiles=FRACTILES))
    if CSV_WRITE:
        sdf.to_csv('test4_obs.csv', index=False)
    # replace survey values with tax values at or above the CAP_LEVEL
    time0 = time.time()
    iur = IncomeUnderReporting(CAP_LEVEL, IUR_SEED)
    # arguments: position of the inc column, the capped survey data as
    # an array, and the uncapped incomes of the rows that were capped
    adj = iur.adjust(df_columns.index('inc'),
                     sdf.to_numpy(),
                     tdf.inc[capped].to_numpy())
    if TIME_IUR:
        print(f'IUR_exec_time(secs)= {(time.time() - time0):.1f}')
    adf = pd.DataFrame(adj, columns=df_columns).astype('int')
    print('ADJ df:\n', adf.describe(percentiles=FRACTILES))
    if CSV_WRITE:
        adf.to_csv('test4_adj.csv', index=False)
    return 0
# end of main function code


# execute main only when this file is run as a script (not when imported)
if __name__ == '__main__':
    EXIT_CODE = main()
    sys.exit(EXIT_CODE)