Merge pull request #111 from amarquand/dev
Dev
amarquand authored Mar 6, 2023
2 parents 36b7ce9 + 4c44b02 commit 84e47ec
Showing 13 changed files with 227 additions and 45 deletions.
11 changes: 6 additions & 5 deletions CHANGES
@@ -5,7 +5,6 @@ version 0.16
Minor bug fix relative to release 0.15. Fixed a transpose operation that would cause normative.py to fail in certain cases

version 0.17:
Changes:
- New syntax for functions related to parallelization (python path is automatically generated)
- Support for slurm clusters has been added
- Updates to hierarchical Bayesian regression (different priors for intercepts and noise) for improved transfer
@@ -15,20 +14,17 @@ Changes:
- Updated sphinx documentation

version 0.18
Changes:
- addition of cross-validation functionality for HBR
- minor bug fixes for HBR

version 0.19
Changes:
- separate standardization of responses and covariates
- standardization is no longer on by default
- new standardization methods
- pymc3 version fixed
- minor bugs resolved (CV with HBR)

version 0.20
Changes:
- Major code refactoring
- Requirements updated for python 3.8.3
- Updates to documentation and integration with Read the Docs
@@ -54,7 +50,6 @@ Some minor updates and bug fixes:
- other minor bug fixes related to cross-validation (computation of error metrics), import problems and calibration statistics

version 0.23
Changes:
- SHASH functionality for HBR added
- Bug fix for normative model predict() function

@@ -72,3 +67,9 @@ version 0.26
- Added support for web portal
- Provided a wrapper for blr to use transfer() functionality
- Also streamlined tutorials (PCNtoolkit-demo), so that all tutorials run with this version

version 0.27
- Configured more sensible default options for HBR (random slope and intercept for mu and random intercept for sigma)
- Fixed a translation problem between the previous naming convention for HBR models (only Gaussian models) and the current naming (also SHASH models)
- Minor updates to fix synchronisation problems in PCNportal (related to the HBR updates above)
- Added configuration files for containerisation with Docker
39 changes: 39 additions & 0 deletions docker/Dockerfile
@@ -0,0 +1,39 @@
# Install software and clean up afterwards: the smaller the image, the shorter the download time
FROM continuumio/miniconda3:latest
RUN apt-get update
RUN apt-get install -y libhdf5-dev \
pkg-config \
gcc \
g++ \
zip

# Make sure we have the right version of numpy
RUN conda install numpy=1.21

# Install directly from GitHub (master branch)
#RUN pip install scikit-learn
#RUN pip install pcntoolkit==0.26

# This is an alternative method that pulls from the dev branch
RUN wget https://github.com/amarquand/PCNtoolkit/archive/dev.zip
RUN unzip dev.zip
RUN pip install scikit-learn
RUN cd PCNtoolkit-dev; pip install . ; cd ..

# Add command line links
RUN ln -s /opt/conda/lib/python3.10/site-packages/pcntoolkit /opt/ptk
RUN chmod 755 /opt/ptk/normative.py
RUN chmod 755 /opt/ptk/normative_parallel.py
RUN chmod 755 /opt/ptk/trendsurf.py
RUN echo "export PATH=${PATH}:/opt/ptk" >> ~/.bashrc

# clean up
RUN rm -rf PCNtoolkit-dev dev.zip
RUN conda clean -a
RUN pip cache purge
RUN apt-get clean

# execute entrypoint
COPY entrypoint.sh ./entrypoint.sh
RUN chmod +x ./entrypoint.sh
ENTRYPOINT [ "./entrypoint.sh" ]
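
Since the Dockerfile and entrypoint.sh both live in docker/, the image can be built with docker/ as the build context. A minimal sketch; the image tag is an assumption, not part of this commit:

```python
# Hypothetical build step: the tag "pcntoolkit:dev" is assumed.
# Run from the repository root; docker/ is the build context so that
# the COPY entrypoint.sh instruction above can find the script.
import subprocess

subprocess.run(["docker", "build", "-t", "pcntoolkit:dev", "docker/"],
               check=True)
```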
8 changes: 8 additions & 0 deletions docker/entrypoint.sh
@@ -0,0 +1,8 @@
#!/bin/bash

# take the pcntoolkit function from the first argument specified
func="$1"
shift

# run using all remaining arguments
/opt/ptk/${func} "$@"
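
With this entrypoint, the first argument after the image name selects the pcntoolkit script (it becomes $func) and everything else is forwarded to it via "$@". A hypothetical invocation — the image tag, mount path, and normative.py flags are assumptions; check the usage header of normative.py for the actual options:

```python
# Hypothetical sketch: run normative.py inside the container.
# "normative.py" becomes $func in entrypoint.sh; the remaining
# arguments are passed straight through to that script.
import subprocess

subprocess.run(["docker", "run", "--rm",
                "-v", "/data:/data",      # assumed host data mount
                "pcntoolkit:dev",         # assumed image tag (see build step)
                "normative.py",           # dispatched as $func
                "-c", "/data/cov.txt",    # assumed flag: covariates file
                "-a", "hbr",              # assumed flag: algorithm
                "/data/resp.txt"],        # responses file
               check=True)
```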
1 change: 0 additions & 1 deletion pcntoolkit/model/NPR.py
100644 → 100755
Expand Up @@ -77,4 +77,3 @@ def np_loss(y_hat, y_hat_84, y, z_all, z_context):
0.16 * F.binary_cross_entropy(torch.squeeze(y_hat_84[idx2,:]), torch.mean(y[idx2,:],dim=1), reduction="sum")
KLD = kl_div_gaussians(z_all[0], z_all[1], z_context[0], z_context[1])
return BCE + KLD + BCE84

11 changes: 8 additions & 3 deletions pcntoolkit/model/hbr.py
Expand Up @@ -173,8 +173,12 @@ def hbr(X, y, batch_effects, batch_effects_size, configs, trace=None):
pb = ParamBuilder(model, X, y, batch_effects, trace, configs)

if configs['likelihood'] == 'Normal':
mu = pb.make_param("mu").get_samples(pb)
sigma = pb.make_param("sigma").get_samples(pb)
mu = pb.make_param("mu", mu_slope_mu_params = (0.,10.),
sigma_slope_mu_params = (5.,),
mu_intercept_mu_params=(0.,10.),
sigma_intercept_mu_params = (5.,)).get_samples(pb)
sigma = pb.make_param("sigma", mu_sigma_params = (10., 5.),
sigma_sigma_params = (5.,)).get_samples(pb)
sigma_plus = pm.math.log(1+pm.math.exp(sigma))
y_like = pm.Normal('y_like',mu=mu, sigma=sigma_plus, observed=y)
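
In the hunk above, sigma is sampled on an unconstrained scale and passed through a softplus transform (sigma_plus = pm.math.log(1+pm.math.exp(sigma))) before entering the Normal likelihood, which guarantees a strictly positive noise scale:

$$\sigma_{+} = \log\!\left(1 + e^{\sigma}\right) > 0, \qquad \sigma_{+} \approx \sigma \ \text{for large } \sigma.$$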

@@ -530,7 +534,8 @@ def make_param(self, name, dim = (1,), **kwargs):
slope_parameterization = self.make_param(f'slope_{name}', dim=[self.feature_num], **kwargs)
intercept_parameterization = self.make_param(f'intercept_{name}', **kwargs)
return LinearParameterization(name=name, dim=dim,
slope_parameterization=slope_parameterization, intercept_parameterization=intercept_parameterization,
slope_parameterization=slope_parameterization,
intercept_parameterization=intercept_parameterization,
pb=self,
**kwargs)

18 changes: 12 additions & 6 deletions pcntoolkit/normative.py
@@ -1,4 +1,4 @@
#!/Users/andre/sfw/anaconda3/bin/python
#!/opt/conda/bin/python

# ------------------------------------------------------------------------------
# Usage:
@@ -1032,11 +1032,7 @@ def transfer(covfile, respfile, testcov=None, testresp=None, maskfile=None,
Yhat[:, i] = yhat.squeeze()
S2[:, i] = s2.squeeze()

# Creates a file for every job successfully completed (for tracking failed jobs).
if count_jobsdone==True:
done_path = os.path.join(log_path, str(job_id)+".jobsdone")
Path(done_path).touch()



if testresp is None:
save_results(respfile, Yhat, S2, maskvol, outputsuffix=outputsuffix)
@@ -1069,7 +1065,17 @@ def transfer(covfile, respfile, testcov=None, testresp=None, maskfile=None,
save_results(respfile, Yhat, S2, maskvol, Z=Z, results=results,
outputsuffix=outputsuffix)

# Creates a file for every job successfully completed (for tracking failed jobs).
if count_jobsdone==True:
done_path = os.path.join(log_path, str(job_id)+".jobsdone")
Path(done_path).touch()

return (Yhat, S2, Z)

# Creates a file for every job successfully completed (for tracking failed jobs).
if count_jobsdone==True:
done_path = os.path.join(log_path, str(job_id)+".jobsdone")
Path(done_path).touch()


def extend(covfile, respfile, maskfile=None, **kwargs):
112 changes: 104 additions & 8 deletions pcntoolkit/normative_model/norm_hbr.py
@@ -34,28 +34,115 @@


class NormHBR(NormBase):
""" Classical GPR-based normative modelling approach

""" HBR multi-batch normative modelling class. By default, this function
estimates a linear model with random intercept, random slope, and random
homoscedastic noise.
:param X: [N×P] array of clinical covariates
:param y: [N×1] array of neuroimaging measures
:param trbefile: the path to the batch-effects file for the training set.
The batch-effect array should be an [N×M] array, where M is the number of
batch-effect types. For example, when site and gender are modeled as batch
effects, M=2. Each column in the batch-effect array contains the batch ID
(starting from 0) for each sample. If not specified (default=None), all
samples are assumed to be from the same batch (i.e., the batch effect
is not modelled).
:param tsbefile: Similar to trbefile, for the test set.
:param model_type: Specifies the type of the model from 'linear', 'polynomial',
and 'bspline' (default is 'linear').
:param likelihood: specifies the type of likelihood among 'Normal', 'SHASHb', 'SHASHo',
and 'SHASHo2' (default is 'Normal').
:param linear_mu: Boolean (default='True') to decide whether the mean (mu) is
parametrized as a linear function (thus changing with covariates) or is fixed.
:param linear_sigma: Boolean (default='False') to decide whether the variance (sigma) is
parametrized as a linear function (heteroscedastic noise) or is fixed for
each batch (homoscedastic noise).
:param linear_epsilon: Boolean (default='False') to decide the parametrization
of epsilon for the SHASH likelihood, which controls its skewness.
If True, epsilon is parametrized as a linear function
(thus changing with covariates); otherwise it is fixed for each batch.
:param linear_delta: Boolean (default='False') to decide the parametrization
of delta for the SHASH likelihood, which controls its kurtosis.
If True, delta is parametrized as a linear function
(thus changing with covariates); otherwise it is fixed for each batch.
:param random_intercept_{parameter}: if parameters mu (default='True'),
sigma (default='False'), epsilon (default='False'), and delta (default='False')
are parametrized as a linear function, then this boolean decides
whether the intercept can vary across batches.
:param random_slope_{parameter}: if parameters mu (default='True'),
sigma (default='False'), epsilon (default='False'), and delta (default='False')
are parametrized as a linear function, then this boolean decides
whether the slope can vary across batches.
:param centered_intercept_{parameter}: if parameters mu (default='False'),
sigma (default='False'), epsilon (default='False'), and delta (default='False')
are parametrized as a linear function, then this boolean decides
whether the parameters of the intercept are estimated in a centered or
non-centered manner (default). While centered estimation runs faster,
it may cause problems for the sampler (the funnel of hell).
:param centered_slope_{parameter}: if parameters mu (default='False'),
sigma (default='False'), epsilon (default='False'), and delta (default='False')
are parametrized as a linear function, then this boolean decides
whether the parameters of the slope are estimated in a centered or
non-centered manner (default). While centered estimation runs faster,
it may cause problems for the sampler (the funnel of hell).
:param sampler: specifies the type of PyMC sampler (default is 'NUTS').
:param n_samples: The number of samples to draw (default is '1000'). Please
note that this parameter must be specified in string format ('1000',
not 1000).
:param n_tuning: String that specifies the number of iterations to adjust
the sampler's step sizes, scalings, or similar (default is '500').
:param n_chains: String that specifies the number of chains to sample. Default
is '1' for faster estimation, but note that sampling independent chains
is important for some convergence checks.
:param cores: String that specifies the number of chains to run in parallel
(default is '1').
:param init: Initialization method to use for auto-assigned NUTS samplers. The
default is 'jitter+adapt_diag', which starts with an identity mass matrix
and then adapts a diagonal based on the variance of the tuning samples,
while adding a uniform jitter in [-1, 1] to the starting point of each chain.
:param target_accept: String of a float in [0, 1] that regulates the
step size so as to approximate this acceptance rate. The default is '0.8',
but higher values like 0.9 or 0.95 often work better for problematic posteriors.
:param order: String that defines the order of the bspline or polynomial model.
The default is '3'.
:param nknots: String that defines the number of knots for the bspline model.
The default is '5'. Higher values increase model complexity, with a negative
effect on estimation speed.
:param nn_hidden_layers_num: String that specifies the number of hidden layers
in the neural network model. It can be either '1' or '2'. The default is '2'.
:param nn_hidden_neuron_num: String that specifies the number of neurons in
the hidden layers. The default is '2'.
Written by S. de Boer and S.M. Kia
"""

def __init__(self, **kwargs):

self.configs = dict()
self.configs['transferred'] = False
# inputs
self.configs['trbefile'] = kwargs.pop('trbefile', None)
self.configs['tsbefile'] = kwargs.pop('tsbefile', None)
# Model settings
self.configs['type'] = kwargs.pop('model_type', 'linear')
self.configs['skewed_likelihood'] = kwargs.pop('skewed_likelihood', 'False') == 'True'
self.configs['pred_type'] = kwargs.pop('pred_type', 'single')
self.configs['random_noise'] = kwargs.pop('random_noise', 'True') == 'True'
self.configs['likelihood'] = kwargs.pop('likelihood', 'Normal')
# sampler settings
self.configs['n_samples'] = int(kwargs.pop('n_samples', '1000'))
self.configs['n_tuning'] = int(kwargs.pop('n_tuning', '500'))
self.configs['n_chains'] = int(kwargs.pop('n_chains', '1'))
self.configs['likelihood'] = kwargs.pop('likelihood', 'Normal')
self.configs['sampler'] = kwargs.pop('sampler', 'NUTS')
self.configs['target_accept'] = float(kwargs.pop('target_accept', '0.8'))
self.configs['init'] = kwargs.pop('init', 'jitter+adapt_diag')
self.configs['cores'] = int(kwargs.pop('cores', '1'))
# model transfer setting
self.configs['freedom'] = int(kwargs.pop('freedom', '1'))
self.configs['transferred'] = False
# deprecated settings
self.configs['skewed_likelihood'] = kwargs.pop('skewed_likelihood', 'False') == 'True'
# misc
self.configs['pred_type'] = kwargs.pop('pred_type', 'single')

if self.configs['type'] == 'bspline':
self.configs['order'] = int(kwargs.pop('order', '3'))
@@ -95,16 +182,25 @@ def __init__(self, **kwargs):
if self.configs['linear_sigma']:
if 'random_noise' in kwargs.keys():
print("The keyword \'random_noise\' is deprecated. It is now automatically replaced with \'random_intercept_sigma\', because sigma is linear")
self.configs['random_intercept_sigma'] = kwargs.pop('random_noise','False') == 'True'
self.configs['random_intercept_sigma'] = kwargs.pop('random_noise','True') == 'True'
elif 'random_noise' in kwargs.keys():
print("The keyword \'random_noise\' is deprecated. It is now automatically replaced with \'random_sigma\', because sigma is fixed")
self.configs['random_sigma'] = kwargs.pop('random_noise','False') == 'True'
self.configs['random_sigma'] = kwargs.pop('random_noise','True') == 'True'
if 'random_slope' in kwargs.keys():
print("The keyword \'random_slope\' is deprecated. It is now automatically replaced with \'random_intercept_mu\'")
self.configs['random_intercept_mu'] = kwargs.pop('random_slope','False') == 'True'
self.configs['random_slope_mu'] = kwargs.pop('random_slope','True') == 'True'
##### End Deprecations


## Default parameters
self.configs['linear_mu'] = kwargs.pop('linear_mu','True') == 'True'
self.configs['random_mu'] = kwargs.pop('random_mu','True') == 'True'
self.configs['random_intercept_mu'] = kwargs.pop('random_intercept_mu','True') == 'True'
self.configs['random_slope_mu'] = kwargs.pop('random_slope_mu','True') == 'True'
self.configs['random_sigma'] = kwargs.pop('random_sigma','True') == 'True'
self.configs['centered_sigma'] = kwargs.pop('centered_sigma','True') == 'True'
## End default parameters

self.hbr = HBR(self.configs)

@property
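Taken together, the defaults introduced above mean that a plain HBR fit now estimates a random slope and intercept for mu and a random intercept for sigma, as stated in the 0.27 changelog. A minimal sketch of driving this through the top-level estimate() entry point — all file names are hypothetical placeholders, and note the string-valued sampler settings required by the docstring above:

```python
# Hypothetical sketch: fitting an HBR normative model with the new defaults.
# All file paths are placeholders for illustration only.
from pcntoolkit.normative import estimate

estimate("cov_train.txt",             # [N x P] training covariates
         "resp_train.txt",            # [N x 1] training responses
         testcov="cov_test.txt",
         testresp="resp_test.txt",
         trbefile="batch_train.txt",  # batch IDs per sample, starting at 0
         tsbefile="batch_test.txt",
         alg="hbr",
         likelihood="Normal",
         model_type="linear",
         n_samples="1000",            # strings, not ints (see docstring)
         n_tuning="500",
         n_chains="1")
```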
2 changes: 1 addition & 1 deletion pcntoolkit/normative_model/norm_np.py
100644 → 100755
@@ -226,4 +226,4 @@ def predict(self, Xs, X=None, Y=None, theta=None):
y_sigma_84 = y_sigma_84.cpu().numpy() * (self.scaler.data_max_ - self.scaler.data_min_)
sigma_al = y_hat - y_hat_84
return y_hat.squeeze(), (y_sigma**2 + sigma_al**2).squeeze() #, z_context[0].cpu().numpy(), z_context[1].cpu().numpy()

17 changes: 13 additions & 4 deletions pcntoolkit/normative_parallel.py
@@ -1,4 +1,4 @@
#!/.../anaconda/bin/python/
#!/opt/conda/bin/python

# -----------------------------------------------------------------------------
# Run parallel normative modelling.
@@ -88,8 +88,17 @@ def execute_nm(processing_dir,
:param testrespfile_path: Full path to a .txt file that contains all test features
:param log_path: Path for saving log files
:param binary: If True uses binary format for response file otherwise it is text
written by (primarily) T Wolfers, (adapted) SM Kia, (adapted) S Rutherford.
:param interactive: If False (default), the user should manually
rerun the failed jobs or collect the results.
If 'auto', the job status is checked until all
jobs are completed, then the failed jobs are rerun
and the results are automatically collected.
'query' is similar to 'auto', except that it
asks for user verification, and is thus immune to
endless loops in case of bugs in the code.
written by (primarily) T Wolfers, (adapted) SM Kia
The documentation is adapted by S Rutherford.
'''

if normative_path is None:
@@ -899,7 +908,7 @@ def bashwrap_nm(processing_dir,

# add in optional arguments.
for k in kwargs:
job_call = [job_call[0] + ' ' + k + '=' + kwargs[k]]
job_call = [job_call[0] + ' ' + k + '=' + str(kwargs[k])]

# writes bash file into processing dir
with open(processing_dir+job_name, 'w') as bash_file:
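The one-line change above wraps kwargs[k] in str(), so numeric keyword values no longer break the assembly of the job call. A minimal illustration of the failure mode being fixed (values are hypothetical):

```python
# Before the fix, concatenating a non-string kwarg raised a TypeError.
job_call = ["python normative.py resp.txt"]
kwargs = {"cvfolds": 2}  # an int, as a caller might reasonably pass

for k in kwargs:
    # old: job_call[0] + ' ' + k + '=' + kwargs[k]   -> TypeError for ints
    job_call = [job_call[0] + " " + k + "=" + str(kwargs[k])]

print(job_call[0])  # -> python normative.py resp.txt cvfolds=2
```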
2 changes: 1 addition & 1 deletion pcntoolkit/trendsurf.py
@@ -1,4 +1,4 @@
#!/Users/andre/sfw/anaconda3/bin/python
#!/opt/conda/bin/python

# ------------------------------------------------------------------------------
# Usage: