"""
Test the MICE class in a situation where some income variables are
missing not at random (MNAR), but the missing-data pattern is monotone
and aggregate information on the missing income values is known.  That
information, for each income variable with missing values, consists of
the number of missing income values that are zero and the aggregate
total of the missing incomes.

NOTES:

(1) Doing the imputation with no adjustments (that is, assuming MAR)
results in a large over-estimate of aggregate intinc and aggregate
divinc among those with missing values.  This is to be expected given
that missing values are more likely among those with low intinc and
divinc.  Here are the results when making that inaccurate assumption:
  -----------------------------
  shift_ = None
  post_shift_min_ = None
  post_shift_max_ = None
  scale_ = None
  zero_ = None
  -----------------------------
Results:
  missing_percent= 80.0730
  -----------------------------
  all_agg_missing_intinc= 0.276
  imp_agg_missing_intinc= 0.958
  all_zer_missing_intinc= 19644
  imp_zer_missing_intinc= 3060
  -----------------------------
  all_agg_missing_divinc= 0.541
  imp_agg_missing_divinc= 4.071
  all_zer_missing_divinc= 34060
  imp_zer_missing_divinc= 221
  -----------------------------
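
Relative to the known aggregates, the MAR imputations over-state the
missing intinc by about 3.5 times (0.958/0.276) and the missing divinc
by about 7.5 times (4.071/0.541).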

(2) Doing the imputation with shift adjustments only results in the
imputed totals equaling the aggregate totals as shown below, but there
are too many zeros (see the illustrative sketch after the results).
Shift adjustment factors:
  -----------------------------
  shift_ = [-17.2e3, -50.2e3]
  post_shift_min_ = [0, 0]
  post_shift_max_ = None
  scale_ = None
  zero_ = None
  -----------------------------
Results:
  -----------------------------
  all_agg_missing_intinc= 0.276
  imp_agg_missing_intinc= 0.276
  all_zer_missing_intinc= 19644
  imp_zer_missing_intinc= 62775
  -----------------------------
  all_agg_missing_divinc= 0.541
  imp_agg_missing_divinc= 0.541
  all_zer_missing_divinc= 34060
  imp_zer_missing_divinc= 59167
  -----------------------------
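
Illustrative sketch (an assumption about what the shift and
post_shift_min parameters do, not taken from the MICE source): for each
adjusted variable k, the shift adjustment presumably amounts to roughly
  imputed = imputed + shift_[k]                       # shift each draw down
  imputed = np.maximum(imputed, post_shift_min_[k])   # then floor at zero
Shifting pulls the totals down to the known aggregates, but flooring
piles probability mass at exactly zero, which is why there are too many
zeros.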

(3) Doing the imputation with scale adjustments only results in the
imputed totals equaling the aggregate totals as shown below, but there
are too few zeros (see the illustrative sketch after the results).
Scale adjustment factors:
  -----------------------------
  shift_ = None
  post_shift_min_ = None
  post_shift_max_ = None
  scale_ = [0.288, 0.134]
  zero_ = None
  -----------------------------
Results:
  -----------------------------
  all_agg_missing_intinc= 0.276
  imp_agg_missing_intinc= 0.276
  all_zer_missing_intinc= 19644
  imp_zer_missing_intinc= 3060
  -----------------------------
  all_agg_missing_divinc= 0.541
  imp_agg_missing_divinc= 0.542
  all_zer_missing_divinc= 34060
  imp_zer_missing_divinc= 2
  -----------------------------
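
Illustrative sketch (again an assumption, not taken from the MICE
source): the scale adjustment presumably behaves roughly like
  imputed = imputed * scale_[k]   # shrink every imputed draw proportionally
Scaling matches the known aggregates and preserves any zeros already
drawn, but it never creates new ones, which is why there are too few
zeros.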

(4) Doing the imputation with scale and zero adjustments results in
the imputed totals equaling the aggregate totals as shown below, and
the number of zeros is about right (see the illustrative sketch after
the results).
Scale and zero adjustment factors:
  -----------------------------
  shift_ = None
  post_shift_min_ = None
  post_shift_max_ = None
  scale_ = [0.300, 0.1848]
  zero_ = [0.944e3, 7.162e3]
  -----------------------------
Results:
  -----------------------------
  all_agg_missing_intinc= 0.276
  imp_agg_missing_intinc= 0.277
  all_zer_missing_intinc= 19644
  imp_zer_missing_intinc= 19640
  -----------------------------
  all_agg_missing_divinc= 0.541
  imp_agg_missing_divinc= 0.542
  all_zer_missing_divinc= 34060
  imp_zer_missing_divinc= 34456
  -----------------------------
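
Illustrative sketch (an assumption about the zero_below_abs parameter,
not taken from the MICE source): the combined scale-and-zero adjustment
is presumably roughly
  imputed = imputed * scale_[k]
  imputed = np.where(np.abs(imputed) < zero_[k], 0.0, imputed)
so small scaled draws are set to exactly zero; the zero_ thresholds and
the somewhat larger scale_ factors are chosen together so that both the
zero counts and the aggregates roughly match the known targets.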
"""

import sys
import numpy as np
import pandas as pd
from taf import MICE

# debugging parameters:
CSV_WRITE = False

# data set parameters:
DATA_SEED = 234567891
DATA_SIZE = 100_000  # number of observations in full data set

# missing parameters:
MISS_SEED = 987654321
MISS_MNAR = True

# descriptive statistics parameters:
PCTILES = [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]

# MICE class parameters:
MICE_SEED = 123456789

# missing adjustment assumption:
# (only one should be True)
ADJ_NONE = False
ADJ_SHIFT = False
ADJ_SCALE = False
ADJ_SCALE_ZERO = True


def main():
    """
    High-level logic.
    """
    # pylint: disable=too-many-locals,too-many-statements
    # ----------------------------------------------------------------
    # (*) construct multivariate data set with no missing data
    # ----------------------------------------------------------------
    # construct full data set (without any missing values)
    rng = np.random.default_rng(seed=DATA_SEED)
    educ = rng.integers(11, 20+1, size=DATA_SIZE)
    # ... construct empinc with dPlN distribution
    norm = rng.normal(size=DATA_SIZE)
    expa = rng.exponential(size=DATA_SIZE)
    expb = rng.exponential(size=DATA_SIZE)
    logemp = (
        9.0 + (educ - 15) * 0.4 + norm * 1.0 + expa/2.0 - expb/np.inf
    )
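    # note: expb/np.inf is identically zero, so the lower-tail exponential
    # term contributes nothing to logemp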
    logemp = np.clip(logemp, -100, 100)
    logemp_mean = logemp.mean()
    empinc = np.exp(logemp).astype(int)
    empinc = np.where(empinc < 1000, 0, empinc)
    # ... construct intinc with dPlN distribution
    norm = rng.normal(size=DATA_SIZE)
    logint = (
        8.0 + norm * 1.0 + (logemp - logemp_mean) * 0.5
    )
    logint = np.clip(logint, -100, 100)
    logint_mean = logint.mean()
    intinc = np.exp(logint).astype(int)
    intinc = np.where(intinc < 1000, 0, intinc)
    # ... construct divinc with dPlN distribution
    norm = rng.normal(size=DATA_SIZE)
    logdiv = (
        9.0 + norm * 1.0 + (logint - logint_mean) * 0.5
    )
    logdiv = np.clip(logdiv, -100, 100)
    divinc = np.exp(logdiv).astype(int)
    divinc = np.where(divinc < 5000, 0, divinc)
    # put data in dataframe and tabulate
    del rng
    adf = pd.DataFrame({
        'educ': educ,
        'empinc': empinc,
        'intinc': intinc,
        'divinc': divinc
    })
    if CSV_WRITE:
        adf.to_csv('test3_all.csv', index=False)
    print('adf marginals:\n', adf.describe(percentiles=PCTILES))
    print('adf correlations:\n', adf.corr())
    # ----------------------------------------------------------------
    # (*) specify some data values as missing with a monotone pattern
    # ----------------------------------------------------------------
    # construct missing data set
    if MISS_MNAR:
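        # missingness is certain whenever log1p(intinc + divinc) is below
        # 10.368 (combined capital income below about exp(10.368) - 1,
        # roughly 31.8e3) and impossible otherwise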
        logcap = np.log1p(intinc + divinc)
        miss_prob = np.where(logcap < 10.368, 1.0, 0.0)
    else:
        miss_prob = 0.80
    rng = np.random.default_rng(seed=MISS_SEED)
    urn = rng.random(size=DATA_SIZE)
    del rng
    missing = urn < miss_prob
    miss_intinc = adf.intinc[missing].to_numpy()
    allmiss_intinc_agg = miss_intinc.sum() * 1e-9
    allmiss_intinc_zer = miss_intinc.size - np.count_nonzero(miss_intinc)
    miss_divinc = adf.divinc[missing].to_numpy()
    allmiss_divinc_agg = miss_divinc.sum() * 1e-9
    allmiss_divinc_zer = miss_divinc.size - np.count_nonzero(miss_divinc)
    mdf = adf.copy().astype(float)
    del adf
    mdf.loc[missing, ['intinc', 'divinc']] = np.nan
    print(f'missing_percent= {(100 * missing.sum() / DATA_SIZE):.4f}')
    # ----------------------------------------------------------------
    # (*) impute missing values using MICE class
    # ----------------------------------------------------------------
    # impute missing values in missing data set
    colnames = list(mdf.columns)
    idx_order = [colnames.index('intinc'), colnames.index('divinc')]
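    # idx_order presumably gives the column indices of the variables with
    # missing values in the (monotone) order they are imputed:
    # intinc first, then divinc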
    if ADJ_NONE:
        shift_ = None
        post_shift_min_ = None
        post_shift_max_ = None
        scale_ = None
        zero_ = None
    if ADJ_SHIFT:
        shift_ = [-17.2e3, -50.2e3]
        post_shift_min_ = [0, 0]
        post_shift_max_ = None
        scale_ = None
        zero_ = None
    if ADJ_SCALE:
        shift_ = None
        post_shift_min_ = None
        post_shift_max_ = None
        scale_ = [0.288, 0.134]
        zero_ = None
    if ADJ_SCALE_ZERO:
        shift_ = None
        post_shift_min_ = None
        post_shift_max_ = None
        scale_ = [0.300, 0.1848]
        zero_ = [0.944e3, 7.162e3]
    mice = MICE(mdf.shape[0], mdf.shape[1], idx_order, [],
                monotone=True,
                shift=shift_,
                post_shift_min=post_shift_min_,
                post_shift_max=post_shift_max_,
                scale=scale_,
                zero_below_abs=zero_,
                seed=MICE_SEED,
                iters=1)
    iarray = mice.impute(mdf.to_numpy())
    idf = pd.DataFrame(iarray, columns=mdf.columns)
    if CSV_WRITE:
        idf.to_csv('test3_imp.csv', index=False, float_format='%.0f')
    print('idf marginals:\n', idf.describe(percentiles=PCTILES))
    print('idf correlations:\n', idf.corr())
    # ----------------------------------------------------------------
    # (*) compare imputed missing values with actual values
    # ----------------------------------------------------------------
    miss_intinc = idf.intinc[missing].to_numpy()
    impmiss_intinc_agg = miss_intinc.sum() * 1e-9
    impmiss_intinc_zer = miss_intinc.size - np.count_nonzero(miss_intinc)
    miss_divinc = idf.divinc[missing].to_numpy()
    impmiss_divinc_agg = miss_divinc.sum() * 1e-9
    impmiss_divinc_zer = miss_divinc.size - np.count_nonzero(miss_divinc)
    print(f'all_agg_missing_intinc= {allmiss_intinc_agg:.3f}')
    print(f'imp_agg_missing_intinc= {impmiss_intinc_agg:.3f}')
    print(f'all_zer_missing_intinc= {allmiss_intinc_zer}')
    print(f'imp_zer_missing_intinc= {impmiss_intinc_zer}')
    print(f'all_agg_missing_divinc= {allmiss_divinc_agg:.3f}')
    print(f'imp_agg_missing_divinc= {impmiss_divinc_agg:.3f}')
    print(f'all_zer_missing_divinc= {allmiss_divinc_zer}')
    print(f'imp_zer_missing_divinc= {impmiss_divinc_zer}')
    return 0
# end of main function code


if __name__ == '__main__':
    sys.exit(main())