This repository has been archived by the owner on Feb 5, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrestructure_cov_pop_burden.py
339 lines (325 loc) · 16.6 KB
/
restructure_cov_pop_burden.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 19 14:52:22 2019
@author: LaurencT
"""
import pandas as pd
import numpy as np
import re
from probability_distribution_moments import gamma_moments_burden
from scipy.stats import gamma
from scipy.stats import norm
def get_relevant_burden(param_dict, burden_all):
    """Return a dict of burden data frames with the relevant conditions isolated.

    Inputs:
        param_dict - keys: id_codes, values: dfs of parameters for each of
            the trials / scenarios. Each df must have the columns
            disease_1, disease_2, disease_3 and age; only the first row of
            those columns is read
        burden_all - a df of burden data, must have the columns cause and age
    Returns:
        a dict - keys: id_codes, values: the rows of burden_all whose cause
        is one of the project's three diseases and whose age equals the
        project's age
    """
    disease_columns = ['disease_1', 'disease_2', 'disease_3']
    burden_dict = {}
    for code, param_df in param_dict.items():
        # Read the first row positionally with .iloc[0]. Plain [0] is a
        # label lookup that only works on non-integer indexes through a
        # deprecated positional fallback, and fails on integer indexes
        # that do not contain the label 0.
        diseases = [param_df[column].iloc[0] for column in disease_columns]
        age = param_df['age'].iloc[0]
        # Filter on cause and age in one pass
        in_scope = burden_all['cause'].isin(diseases) & (burden_all['age'] == age)
        burden_dict[code] = burden_all[in_scope]
    return burden_dict
def select_columns_burden(burden_df, index):
    """Reduce the mean / lower / upper burden columns to one column per measure.

    For probabilistic trials (index parses as an integer) every '<root>_mean'
    column is replaced by random draws. For the deterministic scenarios
    'burden_lower' and 'burden_upper' it is replaced by the matching bound.
    Any other index leaves the mean untouched. The upper / lower columns are
    then dropped and '_mean' is stripped from the column names.

    Inputs:
        burden_df - a df of burden data with upper, lower and mean estimates
            for at least one burden measure
        index - a string to indicate which deterministic scenario or which
            trial it is
    Returns:
        a df of burden data with just one column for each measure
    """
    new_burden_df = burden_df.copy()
    # Create column roots e.g. DALY_rate
    column_roots = [re.sub('_mean', '', column)
                    for column in list(new_burden_df)
                    if re.search('mean', column)]
    # Decide once whether this is a numbered trial. Only the int()
    # conversion is guarded, so errors raised while sampling are not
    # mistaken for "this is a deterministic scenario".
    try:
        int(index)
    except ValueError:
        is_trial = False
    else:
        is_trial = True
    # Vary relevant column deterministically or probabilistically based on its root
    for root in column_roots:
        mean_column = root + '_mean'
        if is_trial:
            new_burden_df[mean_column] = _draw_burden_column(
                new_burden_df[mean_column], new_burden_df[root + '_lower'])
        elif index == 'burden_lower':
            new_burden_df[mean_column] = new_burden_df[root + '_lower']
        elif index == 'burden_upper':
            new_burden_df[mean_column] = new_burden_df[root + '_upper']
    # Remove upper and lower columns as the relevant column is now the mean
    relevant_columns = [column for column in list(new_burden_df)
                        if not re.search('upper|lower', column)]
    new_burden_df = new_burden_df[relevant_columns]
    # Strip the '_mean' suffix so each measure has a plain name
    new_column_names_dict = {column: re.sub('_mean', '', column)
                             for column in relevant_columns}
    return new_burden_df.rename(columns=new_column_names_dict)


def _draw_burden_column(mean, lower):
    """Draw one random value per row for a burden measure.

    Each row gets a gamma draw parameterised from its mean and an sd of
    (mean - lower) / 2. All rows are then scaled by one shared normal
    multiplier, and negative results are floored at 0.

    NOTE(review): gamma_moments_burden is defined outside this file; it is
    assumed to return a mapping with 'shape' and 'scale' entries - confirm.
    """
    sd = (mean - lower) / 2
    gamma_vals = pd.DataFrame([gamma_moments_burden(mean_val, sd_val)
                               for mean_val, sd_val in zip(mean, sd)])
    draws = gamma.rvs(a=gamma_vals['shape'],
                      scale=gamma_vals['scale'],
                      size=len(gamma_vals['shape']))
    # Average ratio of the lower bound to the mean. The original divided
    # the mean by itself, which always gave 1 and therefore a multiplier
    # with zero spread.
    prop_lower_mean = (lower / mean).mean()
    spread = (1 - prop_lower_mean) / 4
    # Skip the multiplier when the spread is zero, negative or NaN; this
    # matches a multiplier of exactly 1 and avoids a scipy domain error.
    if spread > 0:
        draws = draws * norm.rvs(1, spread)
    # The normal multiplier can be negative; burden cannot
    return np.where(draws < 0, 0, draws)
def strain_adjust_burden_df(burden_df, param_df, index):
    """Scale disease burden down to the share of patients who could benefit.

    Not every patient with a disease benefits from the intervention (for
    example because of sub-strains), so each burden value is multiplied by
    the proportion configured for its cause.

    Inputs:
        burden_df - a df with a cause column and burden columns; a column
            is treated as burden when its name contains 'number' or 'rate'
        param_df - a df of parameters with columns disease_[1-3] and the
            corresponding proportions in disease_[1-3]_prop
        index - a string to indicate which deterministic scenario or which
            trial it is (a row label of param_df)
    Returns:
        a copy of burden_df with the burden columns adjusted
    """
    # Work on a copy so the caller's frame is left untouched
    adjusted = burden_df.copy()
    burden_columns = [name for name in adjusted.columns
                      if re.search('number|rate', name)]
    disease_slots = ('disease_1', 'disease_2', 'disease_3')
    for name in burden_columns:
        for slot in disease_slots:
            # Rows whose cause is the disease configured in this slot
            affected = adjusted['cause'] == param_df.loc[index, slot]
            scaled = adjusted[name] * param_df.loc[index, slot + "_prop"]
            adjusted[name] = np.where(affected, scaled, adjusted[name])
    return adjusted
def aggregate_burden_df(burden_df):
    """Sum the burden from the various conditions targeted by the intervention.

    Inputs:
        burden_df - a df with columns for country, age, cause, super_region,
            region and numeric disease burden
    Returns:
        a df indexed by country (country is also kept as a column) with one
        row per country / age group, where cause is the comma separated
        list of the original causes and the numeric burden columns are
        summed across those causes
    """
    # Create a copy of burden_df to avoid side effects
    new_burden_df = burden_df.copy()
    # Sort the causes so the combined label is the same on every run;
    # iterating a set directly gives an arbitrary order
    new_cause_name = ', '.join(sorted(set(new_burden_df['cause'])))
    # Aggregate by age and country. numeric_only=True keeps text columns
    # (cause, region, super_region) out of the sum; without it recent
    # pandas versions concatenate the strings instead of dropping them.
    summed_df = (new_burden_df.groupby(['country', 'age'])
                 .sum(numeric_only=True)
                 .reset_index())
    # Put the combined cause label straight after country and age
    summed_df.insert(2, 'cause', new_cause_name)
    # Merge in region / super region columns
    other_columns = new_burden_df[['country', 'super_region', 'region']].drop_duplicates()
    summed_df = summed_df.merge(other_columns, on='country')
    # Index by country so the frame aligns with coverage / population data
    summed_df.index = summed_df['country']
    return summed_df
def adjust_burden_dict(burden_dict, param_dict):
    """Build the burden df to use for every project and every trial.

    Inputs:
        burden_dict - a dict - keys are id_codes for the projects, values
            are dataframes of all relevant burden for those projects
        param_dict - a dict - keys are id_codes for the projects and values
            are dfs of parameters for the different scenarios and trials
    Returns:
        a dict - keys are the id_codes for projects, values are dicts keyed
        by scenario / trial whose values are the burden dfs after column
        selection, strain adjustment and aggregation for that trial
    """
    def _burden_for_trial(burden_df, param_df, index):
        # Pick the burden columns, scale for sub-strains, then sum causes
        selected = select_columns_burden(burden_df, index)
        adjusted = strain_adjust_burden_df(selected, param_df, index)
        return aggregate_burden_df(adjusted)

    adjusted_by_code = {}
    for code, params in param_dict.items():
        # Take private copies of this project's inputs
        burden_df = burden_dict[code].copy()
        param_df = params.copy()
        # One burden df per scenario / trial row of the parameters
        adjusted_by_code[code] = {
            index: _burden_for_trial(burden_df, param_df, index)
            for index in param_df.index.tolist()
        }
    return adjusted_by_code
def create_coverage_population_dict(coverage, population, param_dict):
    """Create a dict of coverage / population data for each intervention.

    Inputs:
        coverage - a df of coverage data containing the column 'country' and
            '<modality>_coverage' and '<modality>_prob_cover' columns for
            each modality
        population - a df of population data; it is joined to the coverage
            data on the index
        param_dict - a dict - keys are id_codes, values are dfs with the
            column 'intervention_type' (only the first row is read)
    Returns:
        dict where keys are id_codes and values are dfs holding 'country',
        'coverage' and 'prob_cover' for the intervention's modality followed
        by the population columns
    Raises:
        ValueError - if an intervention_type is not one of the known types
    """
    # Column prefix used in the coverage data for each intervention type
    modality_prefixes = {
        'Therapeutic': 'therapeutic',
        'Therapeutic mental health': 'therapeutic_mental_health',
        'Vaccine': 'vaccine',
        'Rapid diagnostic test': 'rapid_diagnostic_test',
        'Device': 'device',
    }
    cov_pop_dict = {}
    # Loop through the id_codes to be keys in the dictionary
    for code, param_df in param_dict.items():
        # Read the first row positionally with .iloc[0]; plain [0] is a
        # label lookup that depends on a deprecated positional fallback
        # when the index holds scenario / trial names.
        intervention_type = param_df['intervention_type'].iloc[0]
        if intervention_type not in modality_prefixes:
            # str() so a non-string id_code still gives a ValueError
            raise ValueError('The value of intervention_type for ' + str(code) +
                             ' is not valid')
        prefix = modality_prefixes[intervention_type]
        coverage_column = prefix + '_coverage'
        prob_cover_column = prefix + '_prob_cover'
        # Keep only this modality's columns and drop the modality prefix
        new_coverage = coverage[['country', coverage_column, prob_cover_column]]
        new_coverage = new_coverage.rename(columns={coverage_column: 'coverage',
                                                    prob_cover_column: 'prob_cover'})
        # Merge the coverage and population data side by side on the index
        cov_pop_dict[code] = pd.concat([new_coverage, population],
                                       axis=1, sort=True)
    return cov_pop_dict
def adjust_cov_pop_df(cov_pop_df, index, param_df):
    """Scale population and coverage data by one scenario's assumptions.

    Inputs:
        cov_pop_df - a df with population columns (name contains 'pop') and
            coverage columns (name contains 'coverage' or 'prob_cover')
        index - a string to indicate which deterministic scenario or which
            trial it is (a row label of param_df)
        param_df - a df with parameters for each of the scenarios, must
            contain the columns 'population', 'coverage' and 'prob_cover'
    Returns:
        a copy of cov_pop_df with the population columns scaled and the
        coverage / prob_cover columns scaled and capped at 0.95
    """
    ceiling = 0.95
    # (column name pattern, scenario parameter, cap the result?) - the
    # passes run in this order
    passes = (('pop', 'population', False),
              ('coverage', 'coverage', True),
              ('prob_cover', 'prob_cover', True))
    adjusted = cov_pop_df.copy()
    for pattern, parameter, capped in passes:
        targets = [name for name in adjusted.columns
                   if re.search(pattern, name)]
        for name in targets:
            adjusted[name] = adjusted[name] * param_df.loc[index, parameter]
            if capped:
                # Coverage is never allowed above the ceiling
                adjusted[name] = np.where(adjusted[name] > ceiling,
                                          ceiling, adjusted[name])
    return adjusted
def adjust_cov_pop_for_trials(cov_pop_dict, param_dict):
    """Expand each project's coverage / population df into one df per scenario.

    Inputs:
        cov_pop_dict - a dictionary where the keys are id_codes and the
            values are dfs of coverage and population data
        param_dict - a dictionary where the keys are id_codes and the values
            are dfs of parameters for each of the scenarios
    Returns:
        a dict where the keys are id_codes and the values are dicts whose
        keys are scenarios and whose values are the coverage / population
        dfs adjusted for that scenario
    """
    scenarios_by_code = {}
    for code, cov_pop_df in cov_pop_dict.items():
        param_df = param_dict[code]
        # One adjusted df per scenario / trial row of the parameters
        scenarios_by_code[code] = {
            index: adjust_cov_pop_df(cov_pop_df, index, param_df).copy()
            for index in param_df.index.tolist()
        }
    return scenarios_by_code
def merge_cov_pop_and_burden(burden_dict, cov_pop_dict):
    """Join the burden and coverage / population dfs for every scenario.

    Inputs:
        burden_dict - keys - id_code, values are dictionaries of scenarios
            and burden data dfs
        cov_pop_dict - keys - id_code, values are dictionaries of scenarios
            and coverage and population data dfs
        (both sets of keys, and indexes of the dfs have to be equivalent)
    Returns:
        a dict with the same nesting whose values are the joined dfs
    """
    def _join(burden_df, cov_pop_df):
        # Put the two frames side by side, aligned on their index
        side_by_side = pd.concat([burden_df, cov_pop_df], axis=1)
        # Where a column name occurs more than once keep the first one
        first_occurrence = ~side_by_side.columns.duplicated()
        return side_by_side.loc[:, first_occurrence]

    merged = {}
    for code, burden_scenarios in burden_dict.items():
        cov_pop_scenarios = cov_pop_dict[code]
        merged[code] = {scen: _join(burden_df, cov_pop_scenarios[scen])
                        for scen, burden_df in burden_scenarios.items()}
    return merged