
# Artifact [c71af91098]
#
# Artifact [c71af91098]
#
# Artifact c71af910982df1c3e48f3cf47d0bfaa2b7796bb09b5ae68020cc448141c699fe:

"""
Test IncomeUnderReporting class in a situation where tax data contain
income values while survey data contain income values that are capped
at a known level.  The objective is to use the tax incomes to impute
survey incomes above the cap level.

IMPORTANT NOTE: This is exactly the same data-preparation situation as
examined in the corresponding MICE test script.  The only difference
between that script and this script is the method of dealing with the
situation.  In that script the MICE class is used; here the
IncomeUnderReporting class is used to solve the data-preparation problem.
"""

import sys
import time
import numpy as np
import pandas as pd
from taf import IncomeUnderReporting

# debugging parameters:
TIME_IUR = False  # True prints the elapsed time of the IUR adjustment
CSV_WRITE = False  # True writes each constructed data set to a CSV file

# dPlN income distribution parameters:
SEED_TD = 123456789
SIZE_TD = 1_000_000  # number of observations in tax data
SEED_SD = 987654321
SIZE_SD = 100_000  # number of observations in survey data
MEAN = 9.0  # constant term of log income
EDUC = 0.4  # coefficient on (educ - 15) in log income
SDEV = 1.0  # scale applied to the standard normal draw
ALPHA = 2.0  # divisor of the exponential draw added to log income
BETA = np.inf  # divisor of the exponential draw subtracted from log income;
#                infinity makes the subtracted term zero
CAP_LEVEL = 200e3  # income level at which survey incomes are capped

# descriptive statistics parameters:
FRACTILES = [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]

# IncomeUnderReporting class parameters:
IUR_SEED = 678912345

def _draw_incomes(seed, size):
    """
    Draw one synthetic sample of incomes and education levels.

    A fresh generator seeded with seed draws, in this order, size standard
    normal variates, two sets of size unit exponential variates, and size
    integer education levels in the range 11 through 20 inclusive.  Log
    income is MEAN + EDUC*(educ-15) + SDEV*normal + exp_a/ALPHA - exp_b/BETA.

    Returns the tuple (income, educ) of numpy arrays, each of length size.
    """
    rng = np.random.default_rng(seed=seed)
    # the draw order below determines the generated values; do not reorder
    norm = rng.normal(size=size)
    expa = rng.exponential(size=size)
    expb = rng.exponential(size=size)
    educ = rng.integers(11, 20+1, size=size)
    income = np.exp(
        MEAN + EDUC * (educ - 15) + norm*SDEV + expa/ALPHA - expb/BETA
    )
    return income, educ


def main():
    """
    High-level logic.

    Constructs tax data holding only incomes at or above CAP_LEVEL,
    constructs survey data whose incomes are capped at CAP_LEVEL, passes
    both to IncomeUnderReporting.adjust, and prints descriptive statistics
    for every data set along the way.  When CSV_WRITE is true each data
    set is also written to a CSV file.  Returns 0.
    """
    # ----------------------------------------------------------------
    # (*) construct high-income tax data
    # ----------------------------------------------------------------
    # construct tax data set containing only incomes above CAP_LEVEL
    income, educ = _draw_incomes(SEED_TD, SIZE_TD)
    above_cap = income >= CAP_LEVEL
    tdf = pd.DataFrame({
        'inc': income[above_cap],
        'educ': educ[above_cap],
    })
    if CSV_WRITE:
        tdf.to_csv('test4_tax.csv', index=False)
    print('tdf:\n', tdf.describe(percentiles=FRACTILES))
    # ----------------------------------------------------------------
    # (*) construct survey data
    # ----------------------------------------------------------------
    # construct survey data set with complete information; it is kept in
    # its own frame (and its own CSV file) so that the high-income tax
    # data built above is still available for the adjustment below
    income, educ = _draw_incomes(SEED_SD, SIZE_SD)
    udf = pd.DataFrame({
        'inc': income,
        'educ': educ,
    })
    df_columns = list(udf)
    if CSV_WRITE:
        udf.to_csv('test4_svy.csv', index=False)
    print('SVY df:\n', udf.describe(percentiles=FRACTILES))
    # ----------------------------------------------------------------
    # (*) construct survey data for input to IncUndRep.adjust method
    # ----------------------------------------------------------------
    # construct capped survey data set
    sdf = udf.copy()
    capped = sdf['inc'] >= CAP_LEVEL
    sdf.loc[capped, 'inc'] = CAP_LEVEL
    print('OBS df:\n', sdf.describe(percentiles=FRACTILES))
    if CSV_WRITE:
        sdf.to_csv('test4_obs.csv', index=False)
    # replace survey values with tax values at or above the CAP_LEVEL
    time0 = time.time()
    iur = IncomeUnderReporting(CAP_LEVEL, IUR_SEED)
    # NOTE(review): every argument after the income-column index was
    # missing from this call; passing the capped survey array followed by
    # the tax array is an assumption -- confirm against the signature of
    # taf.IncomeUnderReporting.adjust before relying on this test.
    adj = iur.adjust(df_columns.index('inc'),
                     sdf.to_numpy(), tdf.to_numpy())
    if TIME_IUR:
        print(f'IUR_exec_time(secs)= {(time.time() - time0):.1f}')
    # the statistics below are reported on integer-truncated values
    adf = pd.DataFrame(adj, columns=df_columns).astype('int')
    print('ADJ df:\n', adf.describe(percentiles=FRACTILES))
    if CSV_WRITE:
        adf.to_csv('test4_adj.csv', index=False)
    return 0
# end of main function code

if __name__ == '__main__':