This repository has been archived by the owner on Feb 5, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
input_check.py
305 lines (284 loc) · 15.4 KB
/
input_check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 17 15:30:47 2019
@author: LaurencT
"""
import pandas as pd
import numpy as np
import re
def check_analysis_type(analysis_type):
    """Validates the types of the values in the analysis_type dictionary.

    Inputs:
        analysis_type - a dict which must contain the keys 'run_all',
            'overwrite_estimates' (both expected to be bools) and
            'num_trials' (expected to be an int)
    Raises:
        ValueError - if either flag is not a bool, or num_trials is not an
            int (bools are rejected for num_trials even though bool is an
            int subclass)
    """
    # The run/overwrite switches must be genuine booleans, not truthy ints
    for key in ('run_all', 'overwrite_estimates'):
        if not isinstance(analysis_type[key], bool):
            raise ValueError('all of the run_... parameters in analysis_type should be booleans')
    # num_trials must be an exact int - explicitly exclude bool, which
    # would otherwise pass an isinstance(int) check
    num_trials = analysis_type['num_trials']
    if isinstance(num_trials, bool) or not isinstance(num_trials, int):
        raise ValueError('the number of trials in analysis_type has to be an integer')
def check_indexes(param_user_all):
    """Verifies that every id_code (df index) appears exactly once.

    Inputs:
        param_user_all - a df of user inputted parameters, indexed by id_code
    Raises:
        ValueError - if any index value is duplicated
    """
    id_codes = list(param_user_all.index)
    # A set collapses duplicates, so a size mismatch means repeats exist
    if len(id_codes) != len(set(id_codes)):
        raise ValueError('id_codes should be unique, please ensure there are no duplicate id codes in the LTLI_parameters.csv')
def check_columns(param_user_all):
    """Verifies that every column required by the analysis is present in the
    parameters df.

    Inputs:
        param_user_all - a df of user inputted parameters
    Raises:
        ValueError - listing any required columns that are absent
    """
    necessary_columns = ['id_name', 'intervention_type', 'disease_1', 'disease_2',
                         'disease_3', 'age', 'population_upper',
                         'population_mean', 'population_lower', 'population_SD',
                         'disease_1_prop_mean', 'disease_1_prop_lower',
                         'disease_1_prop_upper', 'disease_1_prop_SD',
                         'disease_2_prop_mean', 'disease_2_prop_lower',
                         'disease_2_prop_upper', 'disease_2_prop_SD',
                         'disease_3_prop_mean', 'disease_3_prop_lower',
                         'disease_3_prop_upper', 'disease_3_prop_SD',
                         'endem_thresh_mean', 'endem_thresh_lower',
                         'endem_thresh_upper', 'endem_thresh_SD',
                         'endem_thresh_metric', 'inflation_factor_upper',
                         'inflation_factor_mean', 'inflation_factor_lower',
                         'inflation_factor_SD', 'intervention_cut_upper',
                         'intervention_cut_mean', 'intervention_cut_lower',
                         'intervention_cut_SD', 'prob_cover_upper',
                         'prob_cover_mean', 'prob_cover_lower',
                         'prob_cover_SD', 'coverage_upper', 'coverage_mean',
                         'coverage_lower', 'coverage_SD', 'coverage_below_threshold',
                         'share_upper', 'share_mean', 'share_lower', 'share_SD',
                         'efficacy_upper', 'efficacy_mean', 'efficacy_lower',
                         'efficacy_SD', 'lives_touched', 'lives_touched_975',
                         'lives_touched_025', 'lives_improved', 'lives_improved_975',
                         'lives_improved_025', 'exception_count', 'exception_comment']
    # Set for O(1) membership tests; iterating a df yields its column names
    present_columns = set(param_user_all)
    necessary_columns_missing = [column for column in necessary_columns
                                 if column not in present_columns]
    if necessary_columns_missing:
        raise ValueError('The following columns are missing from the LTLI_parameters.csv '+str(necessary_columns_missing))
def check_not_negative(param_user_all):
    """Verifies that no numerical distribution parameter is negative.

    The numerical columns are discovered from the column names: every root
    that has a '_mean' column is assumed to also have '_upper', '_lower'
    and '_SD' variants (enforced elsewhere by check_columns).

    Inputs:
        param_user_all - a df of parameters, some columns are numerical
    Raises:
        ValueError - listing every column that contains a negative value
    """
    column_roots = [re.sub('_mean', '', column)
                    for column in param_user_all if re.search('mean', column)]
    numerical_columns = [root + suffix
                         for root in column_roots
                         for suffix in ('_mean', '_upper', '_lower', '_SD')]
    contains_negative = [(param_user_all[column] < 0).any()
                         for column in numerical_columns]
    if any(contains_negative):
        # Boolean-mask the column names so the error names only the offenders
        negative_columns = np.array(numerical_columns)[np.array(contains_negative)]
        raise ValueError('The following columns in LTLI_parameters.csv contain negative values and should not '+str(negative_columns))
def check_iterable_1_not_smaller(iterable_1, iterable_2):
    """Element-wise check that iterable_1 is never smaller than iterable_2.

    Inputs:
        iterable_1 - a sized iterable of arbitrary length n
        iterable_2 - a sized iterable of length n, ordered to correspond
            to iterable_1
    Returns:
        bool - True if every element of iterable_1 is >= its counterpart
    Raises:
        ValueError - if the two iterables differ in length
    """
    # Guard clause: a pairwise comparison is meaningless for unequal lengths
    if len(iterable_1) != len(iterable_2):
        raise ValueError('the iterables must be the same length')
    return all(first >= second for first, second in zip(iterable_1, iterable_2))
def check_upper_lower(param_user_all):
    """Verifies that for every parameter root, upper >= mean >= lower holds
    in every row (upper >= lower then follows by transitivity).

    Inputs:
        param_user_all - a df of user inputted parameters
    Raises:
        ValueError - naming the first pair of columns found inconsistent
    """
    column_roots = [re.sub('_mean', '', column)
                    for column in param_user_all if re.search('mean', column)]
    for root in column_roots:
        upper_col = root + '_upper'
        mean_col = root + '_mean'
        lower_col = root + '_lower'
        # upper must be at least the mean in every row
        if not check_iterable_1_not_smaller(iterable_1=param_user_all[upper_col],
                                            iterable_2=param_user_all[mean_col]):
            raise ValueError(mean_col+' in LTLI_parameters.csv is greater than '+upper_col)
        # the mean must be at least the lower bound in every row
        if not check_iterable_1_not_smaller(iterable_1=param_user_all[mean_col],
                                            iterable_2=param_user_all[lower_col]):
            raise ValueError(lower_col+' in LTLI_parameters.csv is greater than '+mean_col)
        print('Values of '+upper_col+', '+mean_col+', '+lower_col+' are consistent')
def column_checker(column, ther_diag_param, correct_value):
    """Verifies that every entry in one column equals a single expected value.

    Inputs:
        column - a string which is the name of a column in ther_diag_param
        ther_diag_param - a df of parameters filtered to diagnostics and
            therapeutics entries only
        correct_value - a number that is the expected column value
    Raises:
        ValueError - if any entry in the column differs from correct_value
    """
    if any(entry != correct_value for entry in ther_diag_param[column]):
        raise ValueError('Values in '+column+' in LTLI_parameters.csv should all be '+str(correct_value)+' for diagnostics and therapeutics')
def check_diag_ther(param_user_all):
    """Verifies that diagnostics and therapeutics rows carry neutral values
    for population and endemicity parameters, since neither feeds into
    those analyses.

    Inputs:
        param_user_all - a df of all user inputted parameters
    Raises:
        ValueError - (via column_checker) if any checked column deviates
    """
    # Restrict to the rows these neutrality rules apply to
    relevant_types = ['Therapeutic', 'Therapeutic mental health', 'Diagnostic']
    ther_diag_param = param_user_all[param_user_all.intervention_type.isin(relevant_types)]
    # Discover the columns the rules cover; the endem regex deliberately
    # excludes ..._SD and ..._metric (neither ends in a lowercase d-z letter)
    population_columns = [column for column in param_user_all if re.search('population', column)]
    endem_columns = [column for column in param_user_all if re.search('endem.*[d-z]$', column)]
    for column in population_columns:
        # SDs must be 0 (no distribution) and point estimates must be the
        # multiplicative identity 1, so population never affects results
        expected = 0 if re.search("SD", column) else 1
        column_checker(column, ther_diag_param, correct_value=expected)
    for column in endem_columns:
        # No endemicity threshold: low-burden countries may still use these
        # interventions on their limited number of patients
        column_checker(column, ther_diag_param, correct_value=0)
def check_disease_selection(param_user_all, burden_all):
    """Verifies that every disease the user selected exists in the burden
    dataset (or is the placeholder 'Empty').

    Inputs:
        param_user_all - a df of inputted parameters, must contain
            disease_1, disease_2, and disease_3 columns
        burden_all - a df of burden data, disease column must be called cause
    Raises:
        ValueError - naming the first invalid disease encountered
    """
    # 'Empty' is the only valid choice outside the burden dataset
    valid_diseases = set(burden_all['cause'])
    valid_diseases.add('Empty')
    # Walk the three selection columns in order, all rows of each in turn
    for column in ('disease_1', 'disease_2', 'disease_3'):
        for disease in param_user_all[column]:
            if disease not in valid_diseases:
                #~ could write in a fuzzy lookup for potential valid names
                raise ValueError(disease+' in LTLI_parameters.csv is not a valid disease name, please consult handover guidance for valid diseases')
def flatten(x):
    """Recursively flattens arbitrarily nested lists into one flat list;
    a non-list input is wrapped in a single-element list."""
    if not isinstance(x, list):
        return [x]
    flat = []
    for item in x:
        flat.extend(flatten(item))
    return flat
def check_burden(burden_all):
    """Verifies that the burden data contains every column name the later
    code refers to.

    Inputs:
        burden_all - a df of burden data; disease burden columns follow the
            pattern measure_metric_bound, plus categorising columns for
            'country', 'age', 'cause' etc.
    Raises:
        ValueError - listing every expected column that is absent
    """
    # Categorising columns
    expected_columns = ['age', 'super_region', 'region', 'country', 'cause']
    # Burden columns are the cross product measure x metric x bound, built
    # in that nesting order (measure outermost, bound innermost)
    measures = ['dalys', 'deaths', 'incidence', 'prevalence']
    expected_burden_columns = [measure + metric + bound
                               for measure in measures
                               for metric in ('_number', '_rate')
                               for bound in ('_upper', '_mean', '_lower')]
    all_expected_columns = expected_columns + expected_burden_columns
    # Flag each expected column that is absent from the df
    missing_column_bool = [column not in list(burden_all)
                           for column in all_expected_columns]
    if any(missing_column_bool):
        missing_columns = np.array(all_expected_columns)[np.array(missing_column_bool)]
        raise ValueError(str(missing_columns)+' columns are missing from gbd_data_wide_2017.csv')
def check_population(population):
    """Verifies that the population data has the expected age-band columns
    and is indexed by country name.

    Inputs:
        population - a df of population data
    Raises:
        ValueError - if an age-band column is absent or the index does not
            contain country names (France is used as the sentinel)
    """
    expected_names = ['pop_0-0', 'pop_1-4', 'pop_5-14', 'pop_15-69','pop_70-100']
    missing = [name for name in expected_names if name not in list(population)]
    if missing:
        raise ValueError('The population df does not have the expected columns')
    # France is assumed present in any correctly indexed country list
    if 'France' not in population.index:
        raise ValueError('The population df does not have countries as indexes, ensure GBD_population_2016_reshaped has the column location_name')
def check_coverage(coverage):
    """Verifies that the coverage data has the expected per-intervention
    columns and is indexed by country name.

    Inputs:
        coverage - a df of coverage data
    Raises:
        ValueError - listing missing columns, or if the index does not
            contain country names (France is used as the sentinel)
    """
    expected_names = ['vaccine_coverage', 'vaccine_prob_cover',
                      'device_coverage', 'device_prob_cover',
                      'rapid_diagnostic_test_coverage', 'rapid_diagnostic_test_prob_cover',
                      'therapeutic_coverage', 'therapeutic_prob_cover',
                      'therapeutic_mental_health_coverage',
                      'therapeutic_mental_health_prob_cover']
    present = set(coverage)
    expected_names_missing = [name for name in expected_names
                              if name not in present]
    if expected_names_missing:
        raise ValueError('The following columns are missing from intervention_coverage_assumptions.csv '+str(expected_names_missing))
    # France is assumed present in any correctly indexed country list
    if 'France' not in coverage.index:
        raise ValueError('The coverage_df does not have countries as indexes, ensure intervention_coverage_assumptions.csv sheet Penetration Assumptions has a column called country')
def check_inputs(analysis_type, param_user_all, population, coverage, burden_all):
    """Runs the full suite of input validators, raising a ValueError at the
    first invalid input found.

    Inputs:
        analysis_type - a dict
        param_user_all - a df of input parameters
        population - a df of population data
        coverage - a df of coverage data
        burden_all - a df of burden data
    """
    # Validate the analysis switches first, then the parameter file;
    # order is preserved so the same check fires first as before
    check_analysis_type(analysis_type)
    for validator in (check_indexes, check_columns, check_not_negative,
                      check_upper_lower, check_diag_ther):
        validator(param_user_all)
    check_disease_selection(param_user_all, burden_all)
    # Reference datasets must expose the column names / country indexes
    # that the downstream code looks up
    check_burden(burden_all)
    check_population(population)
    check_coverage(coverage)
def check_run_all(analysis_type, param_user_dict):
    """Filters the parameter dict down to a single analysis when not running all.

    If analysis_type['run_all'] is False, prompts the user for an id_code
    (re-prompting until a valid one is entered) and keeps only that entry;
    otherwise the dict is returned unchanged.

    Inputs:
        analysis_type - the dictionary summarising what type of analysis is
            being undertaken, must contain the key 'run_all'
        param_user_dict - a dict of parameters keyed by id_code
    Returns:
        param_user_dict - either unchanged (run_all True) or reduced to the
            single id_code chosen by the user
    """
    if not analysis_type['run_all']:
        # Only compute and show the valid codes when a choice is needed
        id_codes = list(param_user_dict.keys())
        print('\nHere is a list of possible id_codes: \n', id_codes, '\n')
        id_user = input('Please input a relevant id_code from the csv: ')
        while id_user not in id_codes:
            id_user = input("That is not a valid id_code, please try again: ")
        param_user_dict = {id_user: param_user_dict[id_user]}
    return param_user_dict