"""
Test IncomeUnderReporting class in a situation where there are no non-survey
data available to adjust the under reporting of high incomes in survey data.
If we are willing to make three assumptions, we can adjust for the
income under reporting in the survey data. Here are the assumptions:
(1) we are willing to assume an income threshold at or above which
we think income is under reported;
(2) we are willing to assume that incomes between a lower bound (which is
less than the threshold) and the threshold are correctly reported in
the survey data;
(3) we are willing to assume that incomes at or above the lower bound have
a Pareto distribution.
This approach has been used in a 2016 OECD working paper by Nicolas
Ruiz and Nicolas Woloszko.
But the ability of this approach to make plausible adjustments of income
under reporting is severely limited because there is no information to
guide the selection of the adjustment threshold or the lower bound. And
by it nature, this approach cannot deal with high income people being
missing from the survey data.
This script illustrates one of these problems: the sensitivity of the
adjustment results to a deviation from assumption (3). It does that
in the following way. This script employs two different sets of
assumptions about the distribution of true higher incomes: (a) pure
Pareto, and (b) mixed Pareto and logNormal. Under the (a) assumptions,
if we guess correctly about the bound and threshold, the approach produces
a fairly accurate estimate of the alpha Pareto parameter, and hence, of
the required under-reporting adjustment (as can be seen in the
test5xpure.exp results file, which is summarized below). However, under
the (b) assumptions, the inference method systematically produces estimates
of alpha that are too low, and hence, produces income under-reporting
adjustments that are much too large (as can be seen in the test5xmixed.exp
results file, which is summarized below).
Here is a summary of these two sets of results for 100,000 observations
(with mean incomes expressed in thousands):
-------------------------------------------------------------------------------
PURE MIXED
-------------------------------------------------------------------------------
TRUE MEAN 116.9 115.2
OBSERVED MEAN 99.7 91.4 (same under reporting that increases above 200)
INFERRED ALPHA 1.52 0.83 (assuming the same 120 bound and 200 threshold)
ADJUSTED MEAN 116.2 811.7
-------------------------------------------------------------------------------
"""
import sys
import time
import argparse
import numpy as np
import pandas as pd
from taf import IncomeUnderReporting
# debugging parameters:
DESCRIBE_ONLY = False
CSV_WRITE = False
TIME_IUR = False
# dPlN income distribution parameters:
SEED_SD = 987654321
SIZE_SD = 100_000 # number of observations in survey data
THRESHOLD_SD = 200e3
# descriptive statistics parameters:
FRACTILES = [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.80, 0.90, 0.95, 0.99]
# IncomeUnderReporting class parameters:
SEED_IUR = 567891234
THRESHOLD_IUR = 200e3
BOUND_IUR = 120e3
def main(distribution):
"""
High-level logic.
"""
# pylint: disable=too-many-locals,too-many-statements
# ----------------------------------------------------------------
# (*) construct survey data set with no income under-reporting
# ----------------------------------------------------------------
# specify dPlN income distribution parameters
if distribution == 'pure':
# pure Pareto
mean = 11.0
sdev = 0.0
alpha = 1.5
beta = 2.0
else:
# mixed Pareto and logNormal
mean = 10.5
sdev = 1.0
alpha = 1.5
beta = 2.0
# construct survey data set with complete information
rng = np.random.default_rng(seed=SEED_SD)
norm = rng.normal(size=SIZE_SD)
expa = rng.exponential(size=SIZE_SD)
expb = rng.exponential(size=SIZE_SD)
del rng
income = np.exp(mean + norm*sdev + expa/alpha - expb/beta).astype(int)
tdf = pd.DataFrame({
'inc': income,
})
df_columns = list(tdf)
if CSV_WRITE:
tdf.to_csv('test5x_true.csv', index=False)
print('TRUE df:\n', tdf.describe(percentiles=FRACTILES))
# ----------------------------------------------------------------
# (*) specify income under-reporting in survey data
# ----------------------------------------------------------------
# construct survey data set with income under reporting
sdf = tdf.copy()
underreported = sdf.inc >= THRESHOLD_SD
num_underreported = np.count_nonzero(underreported)
ur_inc = np.clip((0.5 * sdf.inc + 0.5 * THRESHOLD_SD).astype(int),
THRESHOLD_SD, np.inf)
sdf.loc[underreported, 'inc'] = ur_inc
print('OBSV df:\n', sdf.describe(percentiles=FRACTILES))
if CSV_WRITE:
sdf.to_csv('test5x_obsv.csv', index=False)
# ----------------------------------------------------------------
# (*) adjust under-reported incomes using synthetic income data
# ----------------------------------------------------------------
# compute mean income in [BOUND_IUR,THRESHOLD_IUR) range
inc = sdf.inc.to_numpy()
in_range = np.logical_and(inc >= BOUND_IUR, inc < THRESHOLD_IUR)
above_range = inc >= THRESHOLD_IUR
print((f'IUR_log_income_range= [{np.log(BOUND_IUR):.3f},'
f'{np.log(THRESHOLD_IUR):.3f})'))
print(f'IUR_income_range= [{BOUND_IUR:.5e},{THRESHOLD_IUR:.5e})')
count_in_range = inc[in_range].size
print((f'percent_in_IUR_income_range= '
f'{(100*count_in_range/SIZE_SD):.2f}'))
count_above_range = inc[above_range].size
print(('percent_above_IUR_income_range= '
f'{(100*count_above_range/SIZE_SD):.2f}'))
if DESCRIBE_ONLY:
return 0
# adjust survey income values at or above THRESHOLD_IUR using infer method
time0 = time.time()
iur = IncomeUnderReporting(THRESHOLD_IUR, SEED_IUR)
adjval = iur.synthetic_adjustment_values_infer(num_underreported,
BOUND_IUR,
inc[in_range],
verbose=True)
assert np.all(adjval >= THRESHOLD_IUR)
adj = iur.adjust(df_columns.index('inc'), sdf.to_numpy(), adjval)
if TIME_IUR:
print(f'IUR_exec_time(secs)= {(time.time() - time0):.1f}')
# check adj values returned from IncomeUnderReporting.adjust method
adf = pd.DataFrame(adj, columns=df_columns).astype('int')
assert adf.shape[0] == SIZE_SD
adf_b_mean = adf.inc[adf.inc < THRESHOLD_SD].mean()
sdf_b_mean = sdf.inc[sdf.inc < THRESHOLD_SD].mean()
if not np.allclose(adf_b_mean, sdf_b_mean):
print(f'adf,sdf_below_mean: {adf_b_mean:.2f} != {sdf_b_mean:.2f}')
print('ADJT df:\n', adf.describe(percentiles=FRACTILES))
if CSV_WRITE:
adf.to_csv('test5x_adjt.csv', index=False)
return 0
# end of main function code
def process_command_line_arguments():
"""
Process command-line arguments returning a dictionary with values for
these variables: DIST
"""
usage_str = 'python test5x.py pure|mixed [--help]'
parser = argparse.ArgumentParser(
prog='',
usage=usage_str,
description=('Execute test5x.py using specified '
'distribution assumption.')
)
parser.add_argument('DIST', type=str,
help=('Assumed type of income distribution: '
'pure Pareto or mixed Pareto and logNormal.'))
args = parser.parse_args()
# check command-line arguments
args_ok = True
if args.DIST not in ('pure', 'mixed'):
sys.stderr.write(
"ERROR: DIST must be either 'pure' or 'mixed'\n"
)
args_ok = False
if args_ok:
return {'DIST': args.DIST}
sys.stderr.write(f'USAGE: {usage_str}\n')
return {}
# end of process_command_line_arguments function
if __name__ == '__main__':
cliarg = process_command_line_arguments()
if cliarg:
sys.exit(main(cliarg['DIST']))
else:
sys.exit(1)