# Tax-Analyzer-Framework
# File taf/dptests/test1.py (from the latest check-in)


"""
Test MICE class in a situation where tax data contains income values while
survey data contains income values that are capped at a known level.
The objective is to use the tax incomes to impute survey incomes above
the cap level.

IMPORTANT NOTE: This is exactly the same data-preparation situation as
examined in the test4.py script.  The only difference between test4.py
and this test1.py script is the method of dealing with that situation.
In test4.py the IncomeUnderReporting class is used; here the MICE class
is used to solve the data-preparation problem.
"""

import sys
import time
import numpy as np
import pandas as pd
from taf import MICE

# debugging parameters:
# CSV_WRITE: when True, main() writes the test1_*.csv files
# TIME_MICE: when True, main() prints the MICE execution time in seconds
CSV_WRITE = False
TIME_MICE = False

# dPlN income distribution parameters:
# main() generates each income as the integer part of
#   exp(MEAN + EDUC*(educ-15) + SDEV*norm + expa/ALPHA - expb/BETA)
# where norm is a standard normal draw, expa and expb are unit
# exponential draws, and educ is an integer drawn from 11 through 20;
# SEED_TD and SEED_SD seed the tax-data and survey-data random number
# generators, respectively
SEED_TD = 123456789
SIZE_TD = 1_000_000  # number of observations in tax data
SEED_SD = 987654321
SIZE_SD = 100_000  # number of observations in survey data
MEAN = 9.0
EDUC = 0.4
SDEV = 1.0
ALPHA = 2.0
# with BETA infinite, the expb/BETA term is zero, so the lower tail of
# the income distribution is left as lognormal
BETA = np.inf
# only observations with income at or above CAP_LEVEL are kept by main()
CAP_LEVEL = 200e3

# descriptive statistics parameters:
# PCTILES is passed as the percentiles argument of DataFrame.describe
PCTILES = [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]

# MICE class parameters:
# MICE_SEED is passed as the seed argument of the MICE constructor
MICE_SEED = 678912345


def _high_income_sample(seed, size, taxd):
    """
    Return DataFrame of simulated units with income at or above CAP_LEVEL.

    Incomes are generated assuming the full income distribution follows
    a double Pareto log Normal (dPlN) distribution, which is lognormal
    with a fatter upper tail (ALPHA) and a fatter lower tail (BETA).

    Parameters:
      seed: integer seed for the numpy random number generator.
      size: number of units simulated before the CAP_LEVEL filter.
      taxd: integer written to the 'taxd' column of every returned row
            (main() uses 1 for tax data and 0 for survey data).

    Returns a DataFrame with integer columns 'inc', 'educ' and 'taxd'
    containing only those simulated units with income >= CAP_LEVEL.
    """
    rng = np.random.default_rng(seed=seed)
    # NOTE: the order of these four draws fixes the random stream, so
    # it must not be changed if results are to remain reproducible
    norm = rng.normal(size=size)
    expa = rng.exponential(size=size)
    expb = rng.exponential(size=size)
    educ = rng.integers(11, 20+1, size=size)
    income = np.exp(
        MEAN + EDUC * (educ - 15) + norm*SDEV + expa/ALPHA - expb/BETA
    ).astype(int)
    above_cap = income >= CAP_LEVEL
    return pd.DataFrame({
        'inc': income[above_cap],
        'educ': educ[above_cap],
        'taxd': np.full(np.count_nonzero(above_cap), taxd, dtype=int),
    })


def main():
    """
    High-level logic.

    Simulates high-income tax data and high-income survey data, sets
    every survey income to missing, stacks the two data sets, and uses
    the MICE.impute method to fill in the missing survey incomes.
    Descriptive statistics of the actual and imputed incomes are
    printed so they can be compared.  When CSV_WRITE is True, each
    data set is also written to a test1_*.csv file.

    Returns 0, which is used as the process exit status.
    """
    # ----------------------------------------------------------------
    # (*) construct high-income tax data
    # ----------------------------------------------------------------
    # construct tax data set containing only incomes above CAP_LEVEL
    tdf = _high_income_sample(SEED_TD, SIZE_TD, taxd=1)
    if CSV_WRITE:
        tdf.to_csv('test1_tax.csv', index=False)
    print('tdf:\n', tdf.describe(percentiles=PCTILES))
    # ----------------------------------------------------------------
    # (*) construct survey data
    # ----------------------------------------------------------------
    # construct survey data set with complete information
    sdf = _high_income_sample(SEED_SD, SIZE_SD, taxd=0)
    if CSV_WRITE:
        sdf.to_csv('test1_sur.csv', index=False)
    print('sdf:\n', sdf.describe(percentiles=PCTILES))
    print(sdf.corr())
    # ----------------------------------------------------------------
    # (*) construct data for input to MICE.impute method
    # ----------------------------------------------------------------
    # construct imputation input data set
    # (replace the whole integer 'inc' column rather than assigning NaN
    # through .loc, which relies on implicit int-to-float upcasting that
    # pandas has deprecated)
    sdf['inc'] = np.nan
    print('sur_capped:\n', sdf.describe(percentiles=PCTILES))
    inp = pd.concat([tdf, sdf], ignore_index=True)
    if CSV_WRITE:
        inp.to_csv('test1_inp.csv', index=False, na_rep='nan')
    # ----------------------------------------------------------------
    # (*) use MICE.impute method to impute high incomes
    # ----------------------------------------------------------------
    # impute missing values in input data set
    # NOTE(review): presumably [0] lists the column indexes to impute
    # (inc) and [2] lists the column indexes left out of the imputation
    # model (taxd) -- confirm against the MICE class documentation
    time0 = time.perf_counter()
    mice = MICE(inp.shape[0], inp.shape[1], [0], [2], iters=1, seed=MICE_SEED)
    iarray = mice.impute(inp.to_numpy())
    ival_mean, ival_sdev, ival_min, ival_max = mice.get_ival_stats()
    if TIME_MICE:
        print(f'MICE_exec_time(secs)= {(time.perf_counter() - time0):.1f}')
    imputed = pd.DataFrame(iarray)
    imputed.columns = ['inc', 'educ', 'taxd']
    # keep only the survey rows, which are the ones with imputed incomes
    imp = imputed[imputed['taxd'] == 0]
    print('imp:\n', imp.describe(percentiles=PCTILES))
    print(imp.corr())
    if CSV_WRITE:
        imp.to_csv('test1_imp.csv', index=False)
    # show imputed-value statistics (in thousands) for each iteration
    for itr in range(0, mice.iterations+1):
        print((
            f'iter,ival:mean,sdev,min,max(K)= {itr:2d} '
            f'{ival_mean[0, itr]*1e-3:5.0f} '
            f'{ival_sdev[0, itr]*1e-3:5.0f} '
            f'{ival_min[0, itr]*1e-3:5.0f} '
            f'{ival_max[0, itr]*1e-3:5.0f} '
        ))
    return 0
# end of main function code


if __name__ == '__main__':
    # run the test and hand its return value to the shell as exit status
    EXIT_CODE = main()
    sys.exit(EXIT_CODE)