"""
Test MICE class in a situation where some income variables are missing
not at random (MNAR), but that the missing data pattern is monotone and
aggregate information on the missing income values is known. That info,
for each income variable with missing values, is the number of missing
income values that are zero and the aggregate total of missing incomes.
NOTES:
(1) Doing the imputation with no adjustments (that is, assuming MAR),
results in a large over-estimate of aggregate intinc and aggregage
divinc among those with missing values. This is to be expected given
that missing values are more likely among those with low intinc and
divinc. Here are the results when making that inaccurate assumption:
-----------------------------
shift_ = None
post_shift_min_ = None
post_shift_max_ = None
scale_ = None
zero_ = None
-----------------------------
Results:
missing_percent= 80.0730
-----------------------------
all_agg_missing_intinc= 0.276
imp_agg_missing_intinc= 0.958
all_zer_missing_intinc= 19644
imp_zer_missing_intinc= 3060
-----------------------------
all_agg_missing_divinc= 0.541
imp_agg_missing_divinc= 4.071
all_zer_missing_divinc= 34060
imp_zer_missing_divinc= 221
-----------------------------
(2) Doing the imputation with shift adjustments only, results in the
imputed totals equaling the aggregate totals as shown below, but there
are too many zeros. Shift adjustment factors:
-----------------------------
shift_ = [-17.2e3, -50.2e3]
post_shift_min_ = None
post_shift_max_ = None
scale_ = None
zero_ = None
-----------------------------
Results:
-----------------------------
all_agg_missing_intinc= 0.276
imp_agg_missing_intinc= 0.276
all_zer_missing_intinc= 19644
imp_zer_missing_intinc= 62775
-----------------------------
all_agg_missing_divinc= 0.541
imp_agg_missing_divinc= 0.541
all_zer_missing_divinc= 34060
imp_zer_missing_divinc= 59167
-----------------------------
(3) Doing the imputation with scale adjustments only, results in the
imputed totals equaling the aggregate totals as shown below, but there
are too few zeros. Scale adjustment factors:
-----------------------------
shift_ = None
post_shift_min_ = None
post_shift_max_ = None
scale_ = [0.288, 0.134]
zero_ = None
-----------------------------
Results:
-----------------------------
all_agg_missing_intinc= 0.276
imp_agg_missing_intinc= 0.276
all_zer_missing_intinc= 19644
imp_zer_missing_intinc= 3060
-----------------------------
all_agg_missing_divinc= 0.541
imp_agg_missing_divinc= 0.542
all_zer_missing_divinc= 34060
imp_zer_missing_divinc= 2
-----------------------------
(4) Doing the imputation with scale and zero adjustments, results in
the imputed totals equaling the aggregate totals as shown below, and
the number of zeros in about right.
Scale and zero adjustment factors:
-----------------------------
shift_ = None
post_shift_min_ = None
post_shift_max_ = None
scale_ = [0.300, 0.1848]
zero_ = [0.944e3, 7.162e3]
-----------------------------
Results:
-----------------------------
all_agg_missing_intinc= 0.276
imp_agg_missing_intinc= 0.277
all_zer_missing_intinc= 19644
imp_zer_missing_intinc= 19640
-----------------------------
all_agg_missing_divinc= 0.541
imp_agg_missing_divinc= 0.542
all_zer_missing_divinc= 34060
imp_zer_missing_divinc= 34456
-----------------------------
"""
import sys
import numpy as np
import pandas as pd
from taf import MICE
# debugging parameters:
CSV_WRITE = False
# data set parameters:
DATA_SEED = 234567891
DATA_SIZE = 100_000 # number of observations in full data set
# missing parameters:
MISS_SEED = 987654321
MISS_MNAR = True
# descriptive statistics parameters:
PCTILES = [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]
# MICE class parameters:
MICE_SEED = 123456789
# missing adjustment assumption:
# (only one should be True)
ADJ_NONE = False
ADJ_SHIFT = False
ADJ_SCALE = False
ADJ_SCALE_ZERO = True
def main():
"""
High-level logic.
"""
# pylint: disable=too-many-locals,too-many-statements
# ----------------------------------------------------------------
# (*) construct multivariate data set with no missing data
# ----------------------------------------------------------------
# construct full data set (without any missing values)
rng = np.random.default_rng(seed=DATA_SEED)
educ = rng.integers(11, 20+1, size=DATA_SIZE)
# ... construct empinc with dPlN distribution
norm = rng.normal(size=DATA_SIZE)
expa = rng.exponential(size=DATA_SIZE)
expb = rng.exponential(size=DATA_SIZE)
logemp = (
9.0 + (educ - 15) * 0.4 + norm * 1.0 + expa/2.0 - expb/np.inf
)
logemp = np.clip(logemp, -100, 100)
logemp_mean = logemp.mean()
empinc = np.exp(logemp).astype(int)
empinc = np.where(empinc < 1000, 0, empinc)
# ... construct intinc with dPlN distribution
norm = rng.normal(size=DATA_SIZE)
logint = (
8.0 + norm * 1.0 + (logemp - logemp_mean) * 0.5
)
logint = np.clip(logint, -100, 100)
logint_mean = logint.mean()
intinc = np.exp(logint).astype(int)
intinc = np.where(intinc < 1000, 0, intinc)
# ... construct divinc with dPlN distribution
norm = rng.normal(size=DATA_SIZE)
logdiv = (
9.0 + norm * 1.0 + (logint - logint_mean) * 0.5
)
logdiv = np.clip(logdiv, -100, 100)
divinc = np.exp(logdiv).astype(int)
divinc = np.where(divinc < 5000, 0, divinc)
# put data in dataframe and tabulate
del rng
adf = pd.DataFrame({
'educ': educ,
'empinc': empinc,
'intinc': intinc,
'divinc': divinc
})
if CSV_WRITE:
adf.to_csv('test3_all.csv', index=False)
print('adf marginals:\n', adf.describe(percentiles=PCTILES))
print('adf correlations:\n', adf.corr())
# ----------------------------------------------------------------
# (*) specify some data values as missing with a monotone pattern
# ----------------------------------------------------------------
# construct missing data set
if MISS_MNAR:
logcap = np.log1p(intinc + divinc)
miss_prob = np.where(logcap < 10.368, 1.0, 0.0)
else:
miss_prob = 0.80
rng = np.random.default_rng(seed=MISS_SEED)
urn = rng.random(size=DATA_SIZE)
del rng
missing = urn < miss_prob
miss_intinc = adf.intinc[missing].to_numpy()
allmiss_intinc_agg = miss_intinc.sum() * 1e-9
allmiss_intinc_zer = miss_intinc.size - np.count_nonzero(miss_intinc)
miss_divinc = adf.divinc[missing].to_numpy()
allmiss_divinc_agg = miss_divinc.sum() * 1e-9
allmiss_divinc_zer = miss_divinc.size - np.count_nonzero(miss_divinc)
mdf = adf.copy().astype(float)
del adf
mdf.loc[missing, ['intinc', 'divinc']] = np.nan
print(f'missing_percent= {(100 * missing.sum() / DATA_SIZE):.4f}')
# ----------------------------------------------------------------
# (*) impute missing values using MICE class
# ----------------------------------------------------------------
# impute missing values in missing data set
colnames = list(mdf.columns)
idx_order = [colnames.index('intinc'), colnames.index('divinc')]
if ADJ_NONE:
shift_ = None
post_shift_min_ = None
post_shift_max_ = None
scale_ = None
zero_ = None
if ADJ_SHIFT:
shift_ = [-17.2e3, -50.2e3]
post_shift_min_ = [0, 0]
post_shift_max_ = None
scale_ = None
zero_ = None
if ADJ_SCALE:
shift_ = None
post_shift_min_ = None
post_shift_max_ = None
scale_ = [0.288, 0.134]
zero_ = None
if ADJ_SCALE_ZERO:
shift_ = None
post_shift_min_ = None
post_shift_max_ = None
scale_ = [0.300, 0.1848]
zero_ = [0.944e3, 7.162e3]
mice = MICE(mdf.shape[0], mdf.shape[1], idx_order, [],
monotone=True,
shift=shift_,
post_shift_min=post_shift_min_,
post_shift_max=post_shift_max_,
scale=scale_,
zero_below_abs=zero_,
seed=MICE_SEED,
iters=1)
iarray = mice.impute(mdf.to_numpy())
idf = pd.DataFrame(iarray, columns=mdf.columns)
if CSV_WRITE:
idf.to_csv('test3_imp.csv', index=False, float_format='%.0f')
print('idf marginals:\n', idf.describe(percentiles=PCTILES))
print('idf correlations:\n', idf.corr())
# ----------------------------------------------------------------
# (*) compare imputed missing values with actual values
# ----------------------------------------------------------------
miss_intinc = idf.intinc[missing].to_numpy()
impmiss_intinc_agg = miss_intinc.sum() * 1e-9
impmiss_intinc_zer = miss_intinc.size - np.count_nonzero(miss_intinc)
miss_divinc = idf.divinc[missing].to_numpy()
impmiss_divinc_agg = miss_divinc.sum() * 1e-9
impmiss_divinc_zer = miss_divinc.size - np.count_nonzero(miss_divinc)
print(f'all_agg_missing_intinc= {allmiss_intinc_agg:.3f}')
print(f'imp_agg_missing_intinc= {impmiss_intinc_agg:.3f}')
print(f'all_zer_missing_intinc= {allmiss_intinc_zer}')
print(f'imp_zer_missing_intinc= {impmiss_intinc_zer}')
print(f'all_agg_missing_divinc= {allmiss_divinc_agg:.3f}')
print(f'imp_agg_missing_divinc= {impmiss_divinc_agg:.3f}')
print(f'all_zer_missing_divinc= {allmiss_divinc_zer}')
print(f'imp_zer_missing_divinc= {impmiss_divinc_zer}')
return 0
# end of main function code
if __name__ == '__main__':
sys.exit(main())