Skip to content

Commit

Permalink
Merge pull request #222 from amarquand/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
amarquand authored Nov 29, 2024
2 parents 89f4441 + 40d6e83 commit aacd7bf
Show file tree
Hide file tree
Showing 11 changed files with 963 additions and 64 deletions.
7 changes: 7 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,10 @@ version 0.31.0
- Backwards compatibility improved by using pd.read_pickle instead of pickle.load
- SHASH classes have been refactored and improved
- HBR priors improved


version 0.32.0
- Update Dockerfile
- Add scaler.extend, using Welford's algorithm to compute a running mean and std for standardizers.
- Correctly save metadata of transferred models.

48 changes: 14 additions & 34 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,39 +1,19 @@
# Installs software and cleans up unnecessary memory: the smaller, the less downtime
FROM continuumio/miniconda3:latest
RUN apt-get update
RUN apt-get install -y libhdf5-dev \
pkg-config \
gcc \
g++ \
zip
FROM python:3.10-slim

# Make sure we have the right version of numpy
RUN conda install numpy=1.21
# Combine all installation and cleanup steps into a single layer
RUN apt-get update && \
apt-get install -y --no-install-recommends wget unzip tk libhdf5-dev pkg-config gcc g++&& \
wget https://github.com/amarquand/PCNtoolkit/archive/dev.zip && \
unzip dev.zip && \
cd PCNtoolkit-dev && python -m pip install . && cd .. && \
# Cleanup
rm -rf PCNtoolkit-dev dev.zip && \
pip cache purge && \
apt-get remove -y wget unzip && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Install directly from GitHub (master branch)
#RUN pip install scikit-learn
#RUN pip install pcntoolkit==0.26

# This is an alternative method that pulls from the dev branch
RUN wget https://github.com/amarquand/PCNtoolkit/archive/dev.zip
RUN unzip dev.zip
RUN pip install scikit-learn
RUN cd PCNtoolkit-dev; pip install . ; cd ..

# Add command line links
RUN ln -s /opt/conda/lib/python3.10/site-packages/pcntoolkit /opt/ptk
RUN chmod 755 /opt/ptk/normative.py
RUN chmod 755 /opt/ptk/normative_parallel.py
RUN chmod 755 /opt/ptk/trendsurf.py
RUN echo "export PATH=${PATH}:/opt/ptk" >> ~/.bashrc

# clean up
RUN rm -rf PCNtoolkit-dev dev.zip
RUN conda clean -a
RUN pip cache purge
RUN apt-get clean

# execute entrypoint
COPY entrypoint.sh ./entrypoint.sh
RUN chmod +x ./entrypoint.sh
ENTRYPOINT [ "./entrypoint.sh" ]
2 changes: 1 addition & 1 deletion docker/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ func="$1"
shift

# run using all remaining arguments
/opt/ptk/${func} "$@"
${func} "$@"
26 changes: 18 additions & 8 deletions pcntoolkit/normative.py
Original file line number Diff line number Diff line change
Expand Up @@ -1014,11 +1014,11 @@ def transfer(covfile, respfile, testcov=None, testresp=None, maskfile=None,
else:
if os.path.exists(os.path.join(model_path, 'meta_data.md')):
with open(os.path.join(model_path, 'meta_data.md'), 'rb') as file:
meta_data = pickle.load(file)
inscaler = meta_data['inscaler']
outscaler = meta_data['outscaler']
scaler_cov = meta_data['scaler_cov']
scaler_resp = meta_data['scaler_resp']
my_meta_data = pickle.load(file)
inscaler = my_meta_data['inscaler']
outscaler = my_meta_data['outscaler']
scaler_cov = my_meta_data['scaler_cov']
scaler_resp = my_meta_data['scaler_resp']
meta_data = True
else:
print("No meta-data file is found!")
Expand All @@ -1036,13 +1036,15 @@ def transfer(covfile, respfile, testcov=None, testresp=None, maskfile=None,
X = X[:, np.newaxis]

if inscaler in ['standardize', 'minmax', 'robminmax']:
scaler_cov[0].extend(X)
X = scaler_cov[0].transform(X)

feature_num = Y.shape[1]
mY = np.mean(Y, axis=0)
sY = np.std(Y, axis=0)

if outscaler in ['standardize', 'minmax', 'robminmax']:
scaler_resp[0].extend(Y)
Y = scaler_resp[0].transform(Y)

batch_effects_train = fileio.load(trbefile)
Expand Down Expand Up @@ -1092,6 +1094,10 @@ def transfer(covfile, respfile, testcov=None, testresp=None, maskfile=None,
inputsuffix + '.pkl'))

nm = nm.estimate_on_new_sites(X, Y[:, i], batch_effects_train)
if meta_data:
my_meta_data['scaler_cov'] = scaler_cov[0]
my_meta_data['scaler_resp'] = scaler_resp[0]
pickle.dump(my_meta_data, open(os.path.join(output_path, 'meta_data.md'), 'wb'))
if batch_size is not None:
nm.save(os.path.join(output_path, 'NM_0_' +
str(job_id*batch_size+i) + outputsuffix + '.pkl'))
Expand Down Expand Up @@ -1247,11 +1253,15 @@ def extend(covfile, respfile, maskfile=None, **kwargs):
else:
if os.path.exists(os.path.join(model_path, 'meta_data.md')):
with open(os.path.join(model_path, 'meta_data.md'), 'rb') as file:
meta_data = pickle.load(file)
if (meta_data['inscaler'] != 'None' or
meta_data['outscaler'] != 'None'):
my_meta_data = pickle.load(file)
if (my_meta_data['inscaler'] != 'None' or
my_meta_data['outscaler'] != 'None'):
print('Models extention on scaled data is not possible!')
return
meta_data = True
else:
print("No meta-data file is found!")
meta_data = False

if not os.path.isdir(output_path):
os.mkdir(output_path)
Expand Down
30 changes: 20 additions & 10 deletions pcntoolkit/normative_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,20 @@
# S Rutherford, AF Marquand
# -----------------------------------------------------------------------------

from __future__ import print_function
from __future__ import division
from __future__ import division, print_function

import os
import sys
import fileinput
import glob
import shutil
import os
import pickle
import fileinput
import shutil
import sys
import time
import numpy as np
import pandas as pd
from datetime import datetime
from subprocess import run, check_output
from subprocess import check_output, run

import numpy as np
import pandas as pd

try:
import pcntoolkit as ptk
Expand All @@ -45,8 +44,8 @@
ptkpath = os.path.abspath(os.path.dirname(__file__))
if ptkpath not in sys.path:
sys.path.append(ptkpath)
import dataio.fileio as fileio
import configs
import dataio.fileio as fileio
from util.utils import yes_or_no


Expand Down Expand Up @@ -1405,3 +1404,14 @@ def check_jobs(jobs, cluster_spec, start_time=None, delay=60):
print('All jobs are completed!')
break
time.sleep(delay)


def entrypoint(*args):
    # Console-script entry point (registered under [tool.poetry.scripts] as
    # `normative_parallel`); simply delegates to main().
    main(*args)

def main(*args):
    # Run the parallel normative-modelling pipeline with the given arguments.
    main(*args) if False else execute_nm(*args)

if __name__ == "__main__":
    # NOTE(review): this passes argv as a single list argument to main(),
    # while the console script invokes entrypoint() with no arguments --
    # confirm execute_nm accepts both calling conventions.
    main(sys.argv[1:])

10 changes: 6 additions & 4 deletions pcntoolkit/trendsurf.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@
# Written by A. Marquand
# ------------------------------------------------------------------------------

from __future__ import print_function
from __future__ import division
from __future__ import division, print_function

import argparse
import os
import sys
import numpy as np

import nibabel as nib
import argparse
import numpy as np

try: # Run as a package if installed
from pcntoolkit.dataio import fileio
Expand Down Expand Up @@ -300,6 +300,8 @@ def estimate(filename, maskfile, basis, ard=False, outputall=False,
out.append(bs2)
return out

def entrypoint(*args):
    # Console-script entry point (registered under [tool.poetry.scripts] as
    # `trendsurf`); simply delegates to main().
    main(*args)

def main(*args):
np.seterr(invalid='ignore')
Expand Down
112 changes: 106 additions & 6 deletions pcntoolkit/util/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1184,9 +1184,10 @@ def __init__(self, scaler_type='standardize', tail=0.05,
def fit(self, X):

if self.scaler_type == 'standardize':

self.m = np.mean(X, axis=0)
self.s = np.std(X, axis=0)
self.w = Welford()
self.w.consume(X)
self.m = self.w.mean
self.s = self.w.std

elif self.scaler_type == 'minmax':
self.min = np.min(X, axis=0)
Expand All @@ -1200,6 +1201,24 @@ def fit(self, X):
np.sort(X[:, i])[0:int(np.round(X.shape[0] * self.tail))])
self.max[i] = np.median(
np.sort(X[:, i])[-int(np.round(X.shape[0] * self.tail)):])


def extend(self, X):
    """Update already-fitted scaling statistics with an extra batch of data.

    Supports incremental ("transfer") fitting: the standardize variant folds
    the batch into the running Welford accumulator, while the min/max
    variants widen the stored per-feature bounds.  *X* is a 2-D array of
    shape (n_samples, n_features).
    """
    if self.scaler_type == 'standardize':
        # Fold the new batch into the running mean/std accumulator.
        self.w.consume(X)
        self.m = self.w.mean
        self.s = self.w.std

    elif self.scaler_type in ['minmax']:
        # Element-wise widening of the stored bounds.
        self.min = np.minimum(self.min, np.min(X, axis=0))
        self.max = np.maximum(self.max, np.max(X, axis=0))

    elif self.scaler_type in ['robminmax']:
        # Robust bounds: medians of the lower/upper `tail` fraction per column.
        tail_n = int(np.round(X.shape[0] * self.tail))
        for col in range(X.shape[1]):
            ordered = np.sort(X[:, col])
            low = np.median(ordered[0:tail_n])
            high = np.median(ordered[-tail_n:])
            self.min[col] = np.minimum(self.min[col], low)
            self.max[col] = np.maximum(self.max[col], high)

def transform(self, X, index=None):

Expand Down Expand Up @@ -1240,9 +1259,10 @@ def inverse_transform(self, X, index=None):
def fit_transform(self, X):

if self.scaler_type == 'standardize':

self.m = np.mean(X, axis=0)
self.s = np.std(X, axis=0)
self.w = Welford()
self.w.consume(X)
self.m = self.w.mean
self.s = self.w.std
X = (X - self.m) / self.s

elif self.scaler_type == 'minmax':
Expand Down Expand Up @@ -1596,3 +1616,83 @@ def expand(a):
else:
return a
return [expand(x) for x in args]




class Welford(object):
    """Running mean / standard deviation via Welford's online algorithm.

    Based on http://www.johndcook.com/standard_deviation.html and the gist
    https://gist.github.com/alexalemi/2151722#file-welford-py, adapted so
    that observations may be scalars or numpy arrays (a 2-D array is
    accumulated element-wise, one row per observation).

    Properties:
        mean     -- the running mean
        std      -- the running (sample) standard deviation
        meanfull -- (mean, standard error of the mean)

    Usage:
        >>> foo = Welford()
        >>> foo(range(100))
        >>> foo
        <Welford: 49.5 +- 29.0114919759>
        >>> foo([1]*1000)
        >>> foo
        <Welford: 5.40909090909 +- 16.4437417146>
        >>> foo.mean
        5.409090909090906
        >>> foo.std
        16.44374171455467
        >>> foo.meanfull
        (5.409090909090906, 0.4957974674244838)
    """

    def __init__(self, lst=None):
        # k: number of observations seen; M: running mean; S: running sum of
        # squared deviations.  Kept as arrays so scalar and vector input
        # share one code path.
        self.k = np.array([0])
        self.M = np.array([0])
        self.S = np.array([0])

        self.__call__(lst)

    def update(self, x):
        """Fold one observation (scalar or array) into the running state."""
        if self.k == 0 and isinstance(x, np.ndarray):
            # The first array observation fixes the accumulator shape.
            self.M = np.zeros_like(x)
            self.S = np.zeros_like(x)
        if x is None:
            return
        self.k = self.k + 1
        delta = x - self.M
        self.M = self.M + delta * 1.0 / self.k
        # Note: (x - self.M) here uses the *updated* mean, per Welford.
        self.S = self.S + delta * (x - self.M)

    def consume(self, lst):
        """Fold every element of an iterable into the running state."""
        for item in lst:
            self.update(item)

    def __call__(self, x):
        # Iterables are consumed element-wise; anything else is one sample.
        if hasattr(x, "__iter__"):
            self.consume(x)
        else:
            self.update(x)

    @property
    def mean(self) -> np.ndarray:
        return self.M

    @property
    def meanfull(self) -> tuple[np.ndarray, np.ndarray]:
        # The mean together with its standard error.
        return self.mean, self.std / np.sqrt(self.k)

    @property
    def std(self) -> np.ndarray:
        # Sample standard deviation; one observation has zero spread.
        if self.k == 1:
            return np.zeros_like(self.M)
        return np.sqrt(self.S / (self.k - 1))

    def __repr__(self):
        return "<Welford: {} +- {}>".format(self.mean, self.std)
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pcntoolkit"
version = "0.31.0"
version = "0.32.0"
description = "Predictive Clinical Neuroscience Toolkit"
authors = ["Andre Marquand"]
license = "GNU GPLv3"
Expand Down Expand Up @@ -39,3 +39,5 @@ python_version = ">=3.10,<3.13"

[tool.poetry.scripts]
normative = "pcntoolkit.normative:entrypoint"
trendsurf = "pcntoolkit.trendsurf:entrypoint"
normative_parallel = "pcntoolkit.normative_parallel:entrypoint"
24 changes: 24 additions & 0 deletions tests/docker_test/docker_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash
# Smoke-test the pcntoolkit Docker image: download the fcon1000 demo data,
# split it into train/test files, then run a normative `fit` inside the
# container with the data directory bind-mounted.
set -x

script_dir=$(realpath "$(dirname "$(realpath "$0")")")
test_dir=$(realpath "$script_dir/..")
root_dir=$(realpath "$test_dir/..")

export DOCKER_DIR="$root_dir/docker"


export LOCAL_DATA_PATH="$script_dir/temp/pcntoolkit/data"
export DOCKER_DATA_PATH="/mnt/data"
mkdir -p "$LOCAL_DATA_PATH"
# -f: fail on HTTP errors instead of saving an error page as the dataset;
# -L: follow redirects.
curl -fL -o "$LOCAL_DATA_PATH/fcon1000" https://raw.githubusercontent.com/predictive-clinical-neuroscience/PCNtoolkit-demo/refs/heads/main/data/fcon1000.csv

# Run the split from the repository root so the relative script path resolves.
cd "$root_dir"

echo "Splitting the data into train and test covariates, responses and batch effects..."
python tests/cli_test/split_data.py --input_file "$LOCAL_DATA_PATH/fcon1000" --output_dir "$LOCAL_DATA_PATH"

cd "$DOCKER_DIR"

# NOTE(review): image tag v0.31.0_dev predates the 0.32.0 version bump in
# pyproject.toml -- confirm the intended tag before release.
docker run -v "$LOCAL_DATA_PATH:$DOCKER_DATA_PATH" pcntoolkit:v0.31.0_dev normative "$DOCKER_DATA_PATH/Y_tr_fcon1000.pkl" -c "$DOCKER_DATA_PATH/X_tr_fcon1000.pkl" -f fit -a hbr warp=WarpSinArcsinh optimizer=l-bfgs-b warp_reparam=True
Loading

0 comments on commit aacd7bf

Please sign in to comment.