diff --git a/CHANGES b/CHANGES
index 456c53ef..c41977f2 100644
--- a/CHANGES
+++ b/CHANGES
@@ -5,7 +5,6 @@ version 0.16
 Minor bug fix relative to release 0.15. Fixed a transpose operation that would cause normative.py to fail in certain cases
 
 version 0.17:
-Changes:
 - New syntax for functions related to parallelization (python path is automatically generated)
 - Support for slurm clusters has been added
 - Updates to hierarchical Bayesian regression (different priors for intercepts and noise) for improved transfer
@@ -15,12 +14,10 @@ Changes:
 - Updated sphinx documentation
 
 version 0.18
-Changes:
 - addition of cross-validation functionality for HBR
 - minor bug fixes for HBR
 
 version 0.19
-Changes:
 - separate standardization of responses and covariates
 - standardization is no longer on by default
 - new standardization methods
@@ -28,7 +25,6 @@ Changes:
 - minor bugs resolved (CV with HBR)
 
 version 0.20
-Changes:
 - Major code refactoring
 - Requirements updated for python 3.8.3
 - Updates to documentation and Integrations with Read the Docs
@@ -54,7 +50,6 @@ Some minor updates and bug fixes:
 - other minor bug fixes related to cross-validation (computation of error metrics), import problems and calibration statistics
 
 version 0.23
-Changes:
 - SHASH functionality for HBR added
 - Bug fix for normative model predict() function
 
@@ -72,3 +67,9 @@ version 0.26
 - Added support for web portal
 - Provided a wrapper for blr to use transfer() functionality
 - Also streamlined tutorials (PCNtoolkit-demo), so that all tutorials run with this version
+
+version 0.27
+- Configured more sensible default options for HBR (random slope and intercept for mu and random intercept for sigma)
+- Fixed a translation problem between the previous naming convention for HBR models (only Gaussian models) and the current naming (also SHASH models)
+- Minor updates to fix synchronisation problems in PCNportal (related to the HBR updates above)
+- Added configuration files for containerisation with Docker
diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100644
index 00000000..6e22e953
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,39 @@
+# Install software and clean up unnecessary files: the smaller the image, the shorter the download time
+FROM continuumio/miniconda3:latest
+RUN apt-get update
+RUN apt-get install -y libhdf5-dev \
+    pkg-config \
+    gcc \
+    g++ \
+    zip
+
+# Make sure we have the right version of numpy
+RUN conda install numpy=1.21
+
+# Install directly from GitHub (master branch)
+#RUN pip install scikit-learn
+#RUN pip install pcntoolkit==0.26
+
+# This is an alternative method that pulls from the dev branch
+RUN wget https://github.com/amarquand/PCNtoolkit/archive/dev.zip
+RUN unzip dev.zip
+RUN pip install scikit-learn
+RUN cd PCNtoolkit-dev; pip install . ; cd ..
+
+# Add command line links
+RUN ln -s /opt/conda/lib/python3.10/site-packages/pcntoolkit /opt/ptk
+RUN chmod 755 /opt/ptk/normative.py
+RUN chmod 755 /opt/ptk/normative_parallel.py
+RUN chmod 755 /opt/ptk/trendsurf.py
+RUN echo "export PATH=${PATH}:/opt/ptk" >> ~/.bashrc
+
+# clean up
+RUN rm -rf PCNtoolkit-dev dev.zip
+RUN conda clean -a
+RUN pip cache purge
+RUN apt-get clean
+
+# execute entrypoint
+COPY entrypoint.sh ./entrypoint.sh
+RUN chmod +x ./entrypoint.sh
+ENTRYPOINT [ "./entrypoint.sh" ]
\ No newline at end of file
diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh
new file mode 100644
index 00000000..517f2b1e
--- /dev/null
+++ b/docker/entrypoint.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+# take the pcntoolkit function from the first argument specified
+func="$1"
+shift
+
+# run using all remaining arguments
+/opt/ptk/${func} "$@"
\ No newline at end of file
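The image above builds PCNtoolkit from the dev branch and exposes normative.py, normative_parallel.py and trendsurf.py through the entrypoint: its first argument selects the script and the remaining arguments are passed straight through (this works together with the /opt/conda/bin/python shebang fixes later in this diff). A minimal usage sketch, assuming an arbitrary image tag and a hypothetical bind-mounted data directory; the normative.py flags follow its usage header and the file names are placeholders:

    # build the image using docker/ as the build context (tag is arbitrary)
    docker build -t pcntoolkit:0.27 docker/

    # run normative.py inside the container on bind-mounted data
    docker run --rm -v /path/to/data:/data pcntoolkit:0.27 \
        normative.py -c /data/covariates.txt /data/responses.txt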
diff --git a/pcntoolkit/model/NPR.py b/pcntoolkit/model/NPR.py
old mode 100644
new mode 100755
index 07bee34c..6e8c1f53
--- a/pcntoolkit/model/NPR.py
+++ b/pcntoolkit/model/NPR.py
@@ -77,4 +77,3 @@ def np_loss(y_hat, y_hat_84, y, z_all, z_context):
         0.16 * F.binary_cross_entropy(torch.squeeze(y_hat_84[idx2,:]), torch.mean(y[idx2,:],dim=1), reduction="sum")
     KLD = kl_div_gaussians(z_all[0], z_all[1], z_context[0], z_context[1])
     return BCE + KLD + BCE84
-
diff --git a/pcntoolkit/model/hbr.py b/pcntoolkit/model/hbr.py
index afb86899..98c71037 100644
--- a/pcntoolkit/model/hbr.py
+++ b/pcntoolkit/model/hbr.py
@@ -173,8 +173,12 @@ def hbr(X, y, batch_effects, batch_effects_size, configs, trace=None):
     pb = ParamBuilder(model, X, y, batch_effects, trace, configs)
 
     if configs['likelihood'] == 'Normal':
-        mu = pb.make_param("mu").get_samples(pb)
-        sigma = pb.make_param("sigma").get_samples(pb)
+        mu = pb.make_param("mu", mu_slope_mu_params = (0.,10.),
+                           sigma_slope_mu_params = (5.,),
+                           mu_intercept_mu_params=(0.,10.),
+                           sigma_intercept_mu_params = (5.,)).get_samples(pb)
+        sigma = pb.make_param("sigma", mu_sigma_params = (10., 5.),
+                              sigma_sigma_params = (5.,)).get_samples(pb)
         sigma_plus = pm.math.log(1+pm.math.exp(sigma))
         y_like = pm.Normal('y_like',mu=mu, sigma=sigma_plus, observed=y)
@@ -530,7 +534,8 @@ def make_param(self, name, dim = (1,), **kwargs):
             slope_parameterization = self.make_param(f'slope_{name}', dim=[self.feature_num], **kwargs)
             intercept_parameterization = self.make_param(f'intercept_{name}', **kwargs)
             return LinearParameterization(name=name, dim=dim,
-                                          slope_parameterization=slope_parameterization, intercept_parameterization=intercept_parameterization,
+                                          slope_parameterization=slope_parameterization,
+                                          intercept_parameterization=intercept_parameterization,
                                           pb=self, **kwargs)
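For context on the sigma_plus line in the hunk above: the sampled sigma is unconstrained, and the softplus mapping log(1 + exp(sigma)) turns it into a strictly positive noise level while staying nearly linear for large values. A standalone NumPy sketch, illustrative only and not toolkit code:

    import numpy as np

    def softplus(x):
        # log(1 + exp(x)) is > 0 for any real x and approaches x for large x
        return np.log1p(np.exp(x))

    print(softplus(np.array([-5.0, 0.0, 5.0])))  # ~[0.0067, 0.6931, 5.0067]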
diff --git a/pcntoolkit/normative.py b/pcntoolkit/normative.py
index bfd50553..d4a8cbed 100755
--- a/pcntoolkit/normative.py
+++ b/pcntoolkit/normative.py
@@ -1,4 +1,4 @@
-#!/Users/andre/sfw/anaconda3/bin/python
+#!/opt/conda/bin/python
 
 # ------------------------------------------------------------------------------
 #  Usage:
@@ -1032,11 +1032,7 @@ def transfer(covfile, respfile, testcov=None, testresp=None, maskfile=None,
             Yhat[:, i] = yhat.squeeze()
             S2[:, i] = s2.squeeze()
 
-    # Creates a file for every job succesfully completed (for tracking failed jobs).
-    if count_jobsdone==True:
-        done_path = os.path.join(log_path, str(job_id)+".jobsdone")
-        Path(done_path).touch()
-
+
     if testresp is None:
         save_results(respfile, Yhat, S2, maskvol, outputsuffix=outputsuffix)
@@ -1069,7 +1065,17 @@ def transfer(covfile, respfile, testcov=None, testresp=None, maskfile=None,
         save_results(respfile, Yhat, S2, maskvol, Z=Z, results=results,
                      outputsuffix=outputsuffix)
 
+        # Creates a file for every job successfully completed (for tracking failed jobs).
+        if count_jobsdone==True:
+            done_path = os.path.join(log_path, str(job_id)+".jobsdone")
+            Path(done_path).touch()
+
         return (Yhat, S2, Z)
+
+    # Creates a file for every job successfully completed (for tracking failed jobs).
+    if count_jobsdone==True:
+        done_path = os.path.join(log_path, str(job_id)+".jobsdone")
+        Path(done_path).touch()
 
 
 def extend(covfile, respfile, maskfile=None, **kwargs):
diff --git a/pcntoolkit/normative_model/norm_hbr.py b/pcntoolkit/normative_model/norm_hbr.py
index 8ad0b11a..d769163e 100644
--- a/pcntoolkit/normative_model/norm_hbr.py
+++ b/pcntoolkit/normative_model/norm_hbr.py
@@ -34,28 +34,115 @@
 
 class NormHBR(NormBase):
-    """ Classical GPR-based normative modelling approach
+
+    """ HBR multi-batch normative modelling class. By default, this class
+    estimates a linear model with random intercept, random slope, and random
+    homoscedastic noise.
+
+    :param X: [N×P] array of clinical covariates
+    :param y: [N×1] array of neuroimaging measures
+    :param trbefile: the address of the batch effects file for the training set.
+        The batch effect array should be a [N×M] array where M is the number of
+        types of batch effects. For example, when site and gender are modeled
+        as batch effects, M=2. Each column in the batch effect array contains the
+        batch ID (starting from 0) for each sample. If not specified (default=None),
+        then all samples are assumed to be from the same batch (i.e., the batch
+        effect is not modelled).
+    :param tsbefile: similar to trbefile, for the test set.
+    :param model_type: specifies the type of the model from 'linear', 'polynomial',
+        and 'bspline' (default is 'linear').
+    :param likelihood: specifies the type of likelihood among 'Normal', 'SHASHb',
+        'SHASHo', and 'SHASHo2' (default is 'Normal').
+    :param linear_mu: Boolean (default='True') that decides whether the mean (mu)
+        is parametrized on a linear function (thus changes with covariates) or is
+        fixed.
+    :param linear_sigma: Boolean (default='False') that decides whether the variance
+        (sigma) is parametrized on a linear function (heteroscedastic noise) or is
+        fixed for each batch (homoscedastic noise).
+    :param linear_epsilon: Boolean (default='False') that decides the parametrization
+        of epsilon for the SHASH likelihood, which controls its skewness.
+        If True, epsilon is parametrized on a linear function (thus changes with
+        covariates), otherwise it is fixed for each batch.
+    :param linear_delta: Boolean (default='False') that decides the parametrization
+        of delta for the SHASH likelihood, which controls its kurtosis.
+        If True, delta is parametrized on a linear function (thus changes with
+        covariates), otherwise it is fixed for each batch.
+    :param random_intercept_{parameter}: if the parameters mu (default='True'),
+        sigma (default='False'), epsilon (default='False'), and delta (default='False')
+        are parametrized on a linear function, then this Boolean decides
+        whether the intercept can vary across batches.
+    :param random_slope_{parameter}: if the parameters mu (default='True'),
+        sigma (default='False'), epsilon (default='False'), and delta (default='False')
+        are parametrized on a linear function, then this Boolean decides
+        whether the slope can vary across batches.
+    :param centered_intercept_{parameter}: if the parameters mu (default='False'),
+        sigma (default='False'), epsilon (default='False'), and delta (default='False')
+        are parametrized on a linear function, then this Boolean decides
+        whether the parameters of the intercept are estimated in a centered or
+        non-centered manner (default). While centered estimation runs faster,
+        it may cause some problems for the sampler (the funnel of hell).
+    :param centered_slope_{parameter}: if the parameters mu (default='False'),
+        sigma (default='False'), epsilon (default='False'), and delta (default='False')
+        are parametrized on a linear function, then this Boolean decides
+        whether the parameters of the slope are estimated in a centered or
+        non-centered manner (default). While centered estimation runs faster,
+        it may cause some problems for the sampler (the funnel of hell).
+    :param sampler: specifies the type of PyMC sampler (default is 'NUTS').
+    :param n_samples: the number of samples to draw (default is '1000'). Please
+        note that this parameter must be specified in a string format ('1000',
+        not 1000).
+    :param n_tuning: String that specifies the number of iterations used to adjust
+        the sampler's step sizes, scalings or similar (default is '500').
+    :param n_chains: String that specifies the number of chains to sample. The default
+        is '1' for faster estimation, but note that sampling independent chains
+        is important for some convergence checks.
+    :param cores: String that specifies the number of chains to run in parallel
+        (default is '1').
+    :param init: initialization method to use for auto-assigned NUTS samplers. The
+        default is 'jitter+adapt_diag', which starts with an identity mass matrix,
+        then adapts a diagonal based on the variance of the tuning samples
+        while adding a uniform jitter in [-1, 1] to the starting point in each chain.
+    :param target_accept: String of a float in [0, 1] that regulates the step size
+        such that we approximate this acceptance rate. The default is '0.8',
+        but higher values like 0.9 or 0.95 often work better for problematic
+        posteriors.
+    :param order: String that defines the order of the bspline or polynomial model.
+        The default is '3'.
+    :param nknots: String that defines the number of knots for the bspline model.
+        The default is '5'. Higher values increase the model complexity with a
+        negative effect on the speed of estimation.
+    :param nn_hidden_layers_num: String that specifies the number of hidden layers
+        in the neural network model. It can be either '1' or '2'. The default is '2'.
+    :param nn_hidden_neuron_num: String that specifies the number of neurons in
+        the hidden layers. The default is '2'.
+
+    Written by S. de Boer and S.M. Kia
+    """
 
     def __init__(self, **kwargs):
         self.configs = dict()
-        self.configs['transferred'] = False
+        # inputs
         self.configs['trbefile'] = kwargs.pop('trbefile', None)
         self.configs['tsbefile'] = kwargs.pop('tsbefile', None)
+        # Model settings
         self.configs['type'] = kwargs.pop('model_type', 'linear')
-        self.configs['skewed_likelihood'] = kwargs.pop('skewed_likelihood', 'False') == 'True'
-        self.configs['pred_type'] = kwargs.pop('pred_type', 'single')
         self.configs['random_noise'] = kwargs.pop('random_noise', 'True') == 'True'
+        self.configs['likelihood'] = kwargs.pop('likelihood', 'Normal')
+        # sampler settings
         self.configs['n_samples'] = int(kwargs.pop('n_samples', '1000'))
         self.configs['n_tuning'] = int(kwargs.pop('n_tuning', '500'))
         self.configs['n_chains'] = int(kwargs.pop('n_chains', '1'))
-        self.configs['likelihood'] = kwargs.pop('likelihood', 'Normal')
         self.configs['sampler'] = kwargs.pop('sampler', 'NUTS')
         self.configs['target_accept'] = float(kwargs.pop('target_accept', '0.8'))
         self.configs['init'] = kwargs.pop('init', 'jitter+adapt_diag')
         self.configs['cores'] = int(kwargs.pop('cores', '1'))
+        # model transfer setting
         self.configs['freedom'] = int(kwargs.pop('freedom', '1'))
+        self.configs['transferred'] = False
+        # deprecated settings
+        self.configs['skewed_likelihood'] = kwargs.pop('skewed_likelihood', 'False') == 'True'
+        # misc
+        self.configs['pred_type'] = kwargs.pop('pred_type', 'single')
 
         if self.configs['type'] == 'bspline':
             self.configs['order'] = int(kwargs.pop('order', '3'))
@@ -95,16 +182,25 @@ def __init__(self, **kwargs):
         if self.configs['linear_sigma']:
             if 'random_noise' in kwargs.keys():
                 print("The keyword \'random_noise\' is deprecated. It is now automatically replaced with \'random_intercept_sigma\', because sigma is linear")
-                self.configs['random_intercept_sigma'] = kwargs.pop('random_noise','False') == 'True'
+                self.configs['random_intercept_sigma'] = kwargs.pop('random_noise','True') == 'True'
         elif 'random_noise' in kwargs.keys():
             print("The keyword \'random_noise\' is deprecated. It is now automatically replaced with \'random_sigma\', because sigma is fixed")
-            self.configs['random_sigma'] = kwargs.pop('random_noise','False') == 'True'
+            self.configs['random_sigma'] = kwargs.pop('random_noise','True') == 'True'
         if 'random_slope' in kwargs.keys():
             print("The keyword \'random_slope\' is deprecated. It is now automatically replaced with \'random_intercept_mu\'")
-            self.configs['random_intercept_mu'] = kwargs.pop('random_slope','False') == 'True'
+            self.configs['random_slope_mu'] = kwargs.pop('random_slope','True') == 'True'
         ##### End Deprecations
 
+        ## Default parameters
+        self.configs['linear_mu'] = kwargs.pop('linear_mu','True') == 'True'
+        self.configs['random_mu'] = kwargs.pop('random_mu','True') == 'True'
+        self.configs['random_intercept_mu'] = kwargs.pop('random_intercept_mu','True') == 'True'
+        self.configs['random_slope_mu'] = kwargs.pop('random_slope_mu','True') == 'True'
+        self.configs['random_sigma'] = kwargs.pop('random_sigma','True') == 'True'
+        self.configs['centered_sigma'] = kwargs.pop('centered_sigma','True') == 'True'
+        ## End default parameters
+
         self.hbr = HBR(self.configs)
 
     @property
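With these defaults in place, an HBR model no longer needs the long list of keyword arguments the old test script passed. A minimal sketch of the new calling convention, mirroring tests/testHBR.py further below (the norm_init import path is assumed from that script; the data arrays and working_dir are placeholders):

    from pcntoolkit.normative_model.norm_utils import norm_init  # path as in the tests

    # X_train: [N×P] covariates, Y_train: [N×1] responses (see the docstring above)
    nm = norm_init(X_train, Y_train, alg='hbr', model_type='linear')
    nm.estimate(X_train, Y_train, trbefile=working_dir+'trbefile.pkl')
    yhat, ys2 = nm.predict(X_test, tsbefile=working_dir+'tsbefile.pkl')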
diff --git a/pcntoolkit/normative_model/norm_np.py b/pcntoolkit/normative_model/norm_np.py
old mode 100644
new mode 100755
index 7b632522..14bdd291
--- a/pcntoolkit/normative_model/norm_np.py
+++ b/pcntoolkit/normative_model/norm_np.py
@@ -226,4 +226,4 @@ def predict(self, Xs, X=None, Y=None, theta=None):
             y_sigma_84 = y_sigma_84.cpu().numpy() * (self.scaler.data_max_ - self.scaler.data_min_)
         sigma_al = y_hat - y_hat_84
         return y_hat.squeeze(), (y_sigma**2 + sigma_al**2).squeeze() #, z_context[0].cpu().numpy(), z_context[1].cpu().numpy()
-        
\ No newline at end of file
+        
\ No newline at end of file
diff --git a/pcntoolkit/normative_parallel.py b/pcntoolkit/normative_parallel.py
index 8c549b67..ad9d7c35 100755
--- a/pcntoolkit/normative_parallel.py
+++ b/pcntoolkit/normative_parallel.py
@@ -1,4 +1,4 @@
-#!/.../anaconda/bin/python/
+#!/opt/conda/bin/python
 
 # -----------------------------------------------------------------------------
 # Run parallel normative modelling.
@@ -88,8 +88,17 @@ def execute_nm(processing_dir,
     :param testrespfile_path: Full path to a .txt file that contains all test features
     :param log_path: Path for saving log files
     :param binary: If True uses binary format for response file otherwise it is text
-
-    written by (primarily) T Wolfers, (adapted) SM Kia, (adapted) S Rutherford.
+    :param interactive: If False (default), the user should manually
+                        rerun the failed jobs or collect the results.
+                        If 'auto', the job statuses are checked until all
+                        jobs are completed, then the failed jobs are rerun
+                        and the results are automatically collected.
+                        'query' is similar to 'auto', except that it
+                        asks for user verification and is thus immune to
+                        endless loops in the case of bugs in the code.
+
+    written by (primarily) T Wolfers, (adapted) SM Kia
+    The documentation is adapted by S Rutherford.
 
     '''
 
     if normative_path is None:
@@ -899,7 +908,7 @@ def bashwrap_nm(processing_dir,
 
     # add in optional arguments.
     for k in kwargs:
-        job_call = [job_call[0] + ' ' + k + '=' + kwargs[k]]
+        job_call = [job_call[0] + ' ' + k + '=' + str(kwargs[k])]
 
     # writes bash file into processing dir
     with open(processing_dir+job_name, 'w') as bash_file:
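The str() cast in the bashwrap_nm hunk above matters because optional arguments are not guaranteed to be strings; a non-string kwarg (e.g. an int) would previously make the string concatenation raise a TypeError. A self-contained illustration, not toolkit code:

    job_call = ['python normative.py']
    kwargs = {'alg': 'hbr', 'cores': 4}   # note: 4 is an int, not a string
    for k in kwargs:
        job_call = [job_call[0] + ' ' + k + '=' + str(kwargs[k])]
    print(job_call[0])   # python normative.py alg=hbr cores=4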
diff --git a/pcntoolkit/trendsurf.py b/pcntoolkit/trendsurf.py
index e0f0d6e5..d7e03732 100644
--- a/pcntoolkit/trendsurf.py
+++ b/pcntoolkit/trendsurf.py
@@ -1,4 +1,4 @@
-#!/Users/andre/sfw/anaconda3/bin/python
+#!/opt/conda/bin/python
 
 # ------------------------------------------------------------------------------
 #  Usage:
diff --git a/pcntoolkit/util/utils.py b/pcntoolkit/util/utils.py
index 24bb8dcc..e81c8158 100644
--- a/pcntoolkit/util/utils.py
+++ b/pcntoolkit/util/utils.py
@@ -1094,10 +1094,29 @@ def load_freesurfer_measure(measure, data_path, subjects_list):
 
 class scaler:
 
-    def __init__(self, scaler_type='standardize', tail=0.01):
+    def __init__(self, scaler_type='standardize', tail=0.05,
+                 adjust_outliers=True):
+
+        """
+        A class for rescaling data using either standardization or minmax
+        normalization.
+
+        :param scaler_type: String that decides the type of scaler, including
+            1) 'standardize' for standardizing data, 2) 'minmax' for minmax
+            normalization in the range [0,1], and 3) 'robminmax' for robust
+            (to outliers) minmax normalization. The default is 'standardize'.
+        :param tail: A decimal in the range [0,1] that decides the tails of
+            the distribution used for finding the robust min and max in
+            'robminmax' normalization. The default is 0.05.
+        :param adjust_outliers: Boolean that decides whether to adjust outliers
+            in 'robminmax' normalization. If True, outlier values are truncated
+            to 0 or 1. The default is True.
+        """
 
         self.scaler_type = scaler_type
         self.tail = tail
+        self.adjust_outliers = adjust_outliers
 
         if self.scaler_type not in ['standardize', 'minmax', 'robminmax']:
             raise ValueError("Undifined scaler type!")
@@ -1122,7 +1141,7 @@ def fit(self, X):
 
             self.max[i] = np.median(np.sort(X[:,i])[-int(np.round(X.shape[0] * self.tail)):])
 
-    def transform(self, X, adjust_outliers=False):
+    def transform(self, X):
 
         if self.scaler_type == 'standardize':
@@ -1132,7 +1151,7 @@ def transform(self, X, adjust_outliers=False):
 
             X = (X - self.min) / (self.max - self.min)
 
-            if adjust_outliers:
+            if self.adjust_outliers:
                 X[X < 0] = 0
                 X[X > 1] = 1
@@ -1154,7 +1173,7 @@ def inverse_transform(self, X, index=None):
 
             X = X * (self.max[index] - self.min[index]) + self.min[index]
         return X
 
-    def fit_transform(self, X, adjust_outliers=False):
+    def fit_transform(self, X):
 
         if self.scaler_type == 'standardize':
@@ -1179,7 +1198,7 @@ def fit_transform(self, X, adjust_outliers=False):
 
             X = (X - self.min) / (self.max - self.min)
 
-            if adjust_outliers:
+            if self.adjust_outliers:
                 X[X < 0] = 0
                 X[X > 1] = 1
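Because adjust_outliers is now fixed at construction time, callers of transform() and fit_transform() no longer pass it per call. A short usage sketch under the new interface (toy data; the import path follows the file location above and assumes the package is installed):

    import numpy as np
    from pcntoolkit.util.utils import scaler

    X = np.random.randn(100, 3)
    s = scaler(scaler_type='robminmax', tail=0.05, adjust_outliers=True)
    Xs = s.fit_transform(X)            # rescaled to [0,1], outliers truncated
    X_back = s.inverse_transform(Xs)   # back to the (truncated) original scale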
diff --git a/setup.py b/setup.py
index 1b05d04b..74ce24e3 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup, find_packages
 
 setup(name='pcntoolkit',
-      version='0.26',
+      version='0.27',
       description='Predictive Clinical Neuroscience toolkit',
       url='http://github.com/amarquand/PCNtoolkit',
       author='Andre Marquand',
@@ -12,7 +12,7 @@
       'argparse',
       'nibabel>=2.5.1',
       'six',
-      'sklearn',
+      'scikit-learn',
       'bspline',
       'matplotlib',
       'numpy>=1.19.5,<1.23',
@@ -24,4 +24,5 @@
       'theano==1.0.5',
       'arviz==0.11.0'
       ],
+      #python_requires='<3.10',
       zip_safe=False)
diff --git a/tests/testHBR.py b/tests/testHBR.py
index c5a33d57..6b98ed9f 100644
--- a/tests/testHBR.py
+++ b/tests/testHBR.py
@@ -4,6 +4,9 @@
 Created on Mon Jul 29 13:26:35 2019
 
 @author: seykia
+
+This script tests HBR models with default configs on toy data.
+
 """
 
 import os
@@ -22,20 +25,20 @@
 
 working_dir = '/home/preclineu/seykia/temp/tests/'   # Specift a working directory
                                                      # to save data and results.
 
-simulation_method = 'non-linear' # 'linear'
+simulation_method = 'linear' # 'non-linear'
 n_features = 1      # The number of input features of X
 n_grps = 2          # Number of batches in data
 n_samples = 500     # Number of samples in each group (use a list for different
                     # sample numbers across different batches)
 
-model_types = ['linear', 'polynomial', 'bspline', 'nn']  # models to try
+model_types = ['linear', 'polynomial', 'bspline']  # models to try
 
 ############################## Data Simulation ################################
 
 X_train, Y_train, grp_id_train, X_test, Y_test, grp_id_test, coef = \
     simulate_data(simulation_method, n_samples, n_features, n_grps,
-                  working_dir=working_dir, plot=True, noise='hetero_gaussian')
+                  working_dir=working_dir, plot=True)
 
 ################################# Methods Tests ###############################
 
@@ -43,9 +46,7 @@
 for model_type in model_types:
 
-    nm = norm_init(X_train, Y_train, alg='hbr', model_type=model_type,
-                   random_intercept='True', random_slope='True', random_noise='True',
-                   hetero_noise='True', skewed_likelihood='False', order='3')
+    nm = norm_init(X_train, Y_train, alg='hbr', model_type=model_type)
     nm.estimate(X_train, Y_train, trbefile=working_dir+'trbefile.pkl')
     yhat, ys2 = nm.predict(X_test, tsbefile=working_dir+'tsbefile.pkl')
 
@@ -88,9 +89,7 @@
     estimate(covfile, respfile, testcov=testcov, testresp=testresp,
              trbefile=trbefile, tsbefile=tsbefile, alg='hbr',
              outputsuffix='_' + model_type, inscaler='None', outscaler='None',
              model_type=model_type,
-             random_intercept='True', random_slope='True', random_noise='True',
-             hetero_noise= 'True', skewed_likelihood='False', savemodel='True',
-             saveoutput='True')
+             savemodel='True', saveoutput='True')
 
 ###############################################################################