Test BLR with Kfold in Parallel

amarquand · Dec 12, 2024 · 0d8c8e0 · 0d8c8e0
1 parent 78d3722
commit 0d8c8e0
Show file tree

Hide file tree

Showing 9 changed files with 109 additions and 22 deletions.
diff --git a/.gitignore b/.gitignore
@@ -96,4 +96,13 @@ tests/cli_test/*
 
 docs/autoapi/*
 docs/_build/*
-docs
+docs
+tests/cli_test_parallel_kfold/temp/be_te_fcon1000.pkl
+tests/cli_test_parallel_kfold/temp/be_tr_fcon1000.pkl
+tests/cli_test_parallel_kfold/temp/fcon1000
+tests/cli_test_parallel_kfold/temp/X_te_fcon1000.pkl
+tests/cli_test_parallel_kfold/temp/X_tr_fcon1000.pkl
+tests/cli_test_parallel_kfold/temp/X_var_te_fcon1000.pkl
+tests/cli_test_parallel_kfold/temp/X_var_tr_fcon1000.pkl
+tests/cli_test_parallel_kfold/temp/Y_te_fcon1000.pkl
+tests/cli_test_parallel_kfold/temp/Y_tr_fcon1000.pkl
diff --git a/tests/cli_test_parallel_kfold/Yhat_estimate_ft0_batch1.png b/tests/cli_test_parallel_kfold/Yhat_estimate_ft0_batch1.png
diff --git a/tests/cli_test_parallel_kfold/Yhat_estimate_ft0_batch2.png b/tests/cli_test_parallel_kfold/Yhat_estimate_ft0_batch2.png
diff --git a/tests/cli_test_parallel_kfold/Yhat_estimate_ft1_batch1.png b/tests/cli_test_parallel_kfold/Yhat_estimate_ft1_batch1.png
diff --git a/tests/cli_test_parallel_kfold/Yhat_estimate_ft1_batch2.png b/tests/cli_test_parallel_kfold/Yhat_estimate_ft1_batch2.png
diff --git a/tests/cli_test_parallel_kfold/inspect_results.py b/tests/cli_test_parallel_kfold/inspect_results.py
@@ -0,0 +1,79 @@
+"""Inspect the results of the parallel k-fold BLR test.
+
+For every batch directory, load the pickled result files of the ``fit``,
+``predict`` and ``estimate`` runs, save diagnostic plots to the current working
+directory and print the pickled evaluation metrics.
+"""
+import pickle
+import numpy as np
+import matplotlib.pyplot as plt
+
+import glob
+import os
+
+# Directory containing one ``batch_<n>`` sub-directory per batch.
+PROCESSING_DIR = "/project/3022000.05/projects/stijdboe/temp/parallel_processing"
+# Pickled test covariates; column 1 is the x-axis of the yhat plots.
+X_TEST_PATH = "/project/3022000.05/projects/stijdboe/Projects/PCNtoolkit/tests/cli_test_parallel_kfold/temp/X_te_fcon1000.pkl"
+
+
+def load_pickle(path):
+    """Return the object stored in the pickle file at *path*.
+
+    The file is opened in a ``with`` block so the handle is always closed.
+    Only use this on trusted files: unpickling can execute arbitrary code.
+    """
+    with open(path, "rb") as f:
+        return pickle.load(f)
+
+
+for batch in [1, 2]:
+    # Each batch has its own results directory; the plots saved below carry the
+    # same batch number in their file name.
+    results_dir = os.path.join(PROCESSING_DIR, f"batch_{batch}")
+
+    for func in ['fit', 'predict', 'estimate']:
+        print(f"Plotting {func} results...")
+        results = glob.glob(os.path.join(results_dir, f"*{func}.pkl"))
+        for result in results:
+            if "Z" in result:
+                # Plot the sorted values against a sorted standard-normal sample.
+                z = load_pickle(result)
+                n = np.random.randn(z.shape[0], 1)
+                sorted_z = np.sort(z, axis=0)
+                sorted_n = np.sort(n, axis=0)
+                plt.plot(sorted_z, sorted_n, label=f"Z_{func}")
+                plt.savefig(f"Z_{func}.png")
+                plt.close()
+            elif "yhat" in result:
+                x = load_pickle(X_TEST_PATH).to_numpy()
+                # Order the samples by column 1 of x so the curves are drawn left to right.
+                sortindex = np.argsort(x[:, 1])
+                print(x[sortindex, 1])
+                yhat = load_pickle(result).to_numpy()
+                # The matching "ys2" file is looked up by swapping the name in the path.
+                s2 = load_pickle(result.replace("yhat", "ys2")).to_numpy()
+                print(x.shape)
+                print(yhat.shape)
+                print(s2.shape)
+
+                # One figure per column of yhat: the prediction and yhat -/+ s2.
+                for i in range(yhat.shape[1]):
+                    plt.plot(x[sortindex, 1], yhat[sortindex, i], label=f"Yhat_{func}_{i}")
+                    plt.plot(x[sortindex, 1], yhat[sortindex, i] - s2[sortindex, i], label=f"Yhat_{func}_{i} - s2")
+                    plt.plot(x[sortindex, 1], yhat[sortindex, i] + s2[sortindex, i], label=f"Yhat_{func}_{i} + s2")
+                    plt.savefig(f"Yhat_{func}_ft{i}_batch{batch}.png")
+                    plt.close()
+            elif "S2" in result:
+                s2 = load_pickle(result)
+                print(f"{s2=}")
+            elif "EXPV" in result:
+                expv = load_pickle(result)
+                print(f"{expv=}")
+            elif "MSLL" in result:
+                msll = load_pickle(result)
+                print(f"{msll=}")
+            elif "SMSE" in result:
+                smse = load_pickle(result)
+                print(f"{smse=}")
+
diff --git a/tests/cli_test_parallel_kfold/split_data.py b/tests/cli_test_parallel_kfold/split_data.py
@@ -34,6 +34,10 @@ def main():
     # Standardize the covariates and responses
     cov = StandardScaler().fit_transform(cov.to_numpy()[:,np.newaxis])
     resp = StandardScaler().fit_transform(resp.to_numpy())
+
+    xmin = cov.min()
+    xmax = cov.max()
+
 
     # Map the batch effects to integers
     be_ids = np.unique(be, return_inverse=True)[1]
@@ -44,16 +48,24 @@ def main():
     # Create the design matrices
     mean_basis = 'linear'
     var_basis = 'linear'
-    Phi_tr = create_design_matrix(cov[train_idx], basis=mean_basis, intercept=False, site_ids=be_ids[train_idx])
-    Phi_var_tr = create_design_matrix(cov[train_idx], basis=var_basis)
-    Phi_te = create_design_matrix(cov[test_idx], basis=mean_basis, intercept=False, site_ids=be_ids[test_idx])
-    Phi_var_te = create_design_matrix(cov[test_idx], basis=var_basis)
+    # Phi_tr = create_design_matrix(cov[train_idx], basis=mean_basis, intercept=False, site_ids=be_ids[train_idx])
+    # Phi_var_tr = create_design_matrix(cov[train_idx], basis=var_basis)
+    # Phi_te = create_design_matrix(cov[test_idx], basis=mean_basis, intercept=False, site_ids=be_ids[test_idx])
+    # Phi_var_te = create_design_matrix(cov[test_idx], basis=var_basis)
+
+    Phi_tr = create_design_matrix(cov[train_idx],  basis=mean_basis, intercept=True, xmin=xmin, xmax=xmax)
+    # Phi_var_tr = create_design_matrix(cov[train_idx], basis=var_basis, xmin=xmin, xmax=xmax)
+    Phi_var_tr = cov[train_idx]
+    Phi_te = create_design_matrix(cov[test_idx], basis=mean_basis, intercept=True, xmin=xmin, xmax=xmax)
+    # Phi_var_te = create_design_matrix(cov[test_idx], basis=var_basis, xmin=xmin, xmax=xmax)
+    Phi_var_te = cov[test_idx]
+    print(f"{Phi_var_te.shape=}")
 
     # Save everything
     pd.to_pickle(pd.DataFrame(Phi_tr), os.path.join(args.output_dir, f'X_tr_{infile}.pkl'))
-    pd.to_pickle(pd.DataFrame(Phi_var_tr), os.path.join(args.output_dir, f'X_var_tr_{infile}.pkl'))
+    pd.to_pickle(Phi_var_tr, os.path.join(args.output_dir, f'X_var_tr_{infile}.pkl'))
     pd.to_pickle(pd.DataFrame(Phi_te), os.path.join(args.output_dir, f'X_te_{infile}.pkl'))
-    pd.to_pickle(pd.DataFrame(Phi_var_te), os.path.join(args.output_dir, f'X_var_te_{infile}.pkl'))
+    pd.to_pickle(Phi_var_te, os.path.join(args.output_dir, f'X_var_te_{infile}.pkl'))
     pd.to_pickle(pd.DataFrame(resp[train_idx]), os.path.join(args.output_dir, f'Y_tr_{infile}.pkl'))
     pd.to_pickle(pd.DataFrame(resp[test_idx]), os.path.join(args.output_dir, f'Y_te_{infile}.pkl'))
     pd.to_pickle(be[train_idx], os.path.join(args.output_dir, f'be_tr_{infile}.pkl'))

diff --git a/tests/cli_test_parallel_kfold/submit_jobs.py b/tests/cli_test_parallel_kfold/submit_jobs.py
@@ -5,35 +5,45 @@
 
 
 def execute_nm_wrapper(*args):
-    args_dict = {k:v for k,v in [arg.split('=') for arg in args]}
+    args_dict = {k:v for k,v in [arg.split('=') for arg in args[0]]}
 
     func = args_dict.get('func')
     covfile_path = args_dict.get('covfile_path',None)
     respfile_path = args_dict.get('respfile_path',None)
+    varcovfile_path = args_dict.get('varcovfile_path',None)
     testcovfile_path = args_dict.get('testcovfile_path',None)
-    testrespfile_path = args_dict.get('testrespfile_path',None) 
+    testrespfile_path = args_dict.get('testrespfile_path',None)
+    testvarcovfile_path = args_dict.get('testvarcovfile_path',None)
+    if func == "estimate":
+        testrespfile_path = None
 
     execute_nm(
         python_path='/home/preclineu/stijdboe/.conda/envs/pcntk_dev/bin/python',
         normative_path="/home/preclineu/stijdboe/.conda/envs/pcntk_dev/lib/python3.12/site-packages/pcntoolkit/normative.py",
         job_name='test_normative_parallel',
-        processing_dir='/project/3022000.05/projects/stijdboe/temp/parallel_processing',
+        processing_dir='/project/3022000.05/projects/stijdboe/temp/parallel_processing/',
+        log_path='/project/3022000.05/projects/stijdboe/temp/parallel_processing/log/',
+        varcovfile=varcovfile_path,
+        testvarcovfile=testvarcovfile_path,
         func=func,
         covfile_path=covfile_path,
         respfile_path=respfile_path,
         testcovfile_path=testcovfile_path,
         testrespfile_path=testrespfile_path,
         batch_size=2,
-        memory='16G',
-        duration='00:10:00',
+        memory='4G',
+        duration='00:02:00',
         job_id=1,
         cv_folds = 5,
-        alg='hbr',
+        alg='blr',
         warp='WarpSinArcsinh',
         optimizer='l-bfgs-b',
-        warp_reparam=True,
-        inscaler='standardize',
-        binary=False
+        warp_reparam='True',
+        binary='True',
+        cluster_spec='slurm',
+        saveoutput='True',
+        savemodel='True',
+        outputsuffix=f"_{func}"
         )
 
 

diff --git a/tests/cli_test_parallel_kfold/test_cli.sh b/tests/cli_test_parallel_kfold/test_cli.sh
@@ -12,14 +12,14 @@ curl -o $tempdir/$data_name https://raw.githubusercontent.com/predictive-clinica
 echo "Splitting the data into train and test covariates, responses and batch effects..."
 python split_data.py --input_file $tempdir/$data_name --output_dir $tempdir
 
-echo "Fitting the model..."
-python submit_jobs.py func=fit covfile_path=$tempdir/X_tr_$data_name.pkl respfile_path=$tempdir/Y_tr_$data_name.pkl
+# echo "Fitting the model..."
+# python submit_jobs.py func=fit covfile_path=$tempdir/X_tr_$data_name.pkl respfile_path=$tempdir/Y_tr_$data_name.pkl
 
-echo "Predicting the test set..."
-python submit_jobs.py func=predict covfile_path=$tempdir/X_te_$data_name.pkl respfile_path=$tempdir/Y_te_$data_name.pkl
+# echo "Predicting the test set..."
+# python submit_jobs.py func=predict covfile_path=$tempdir/X_te_$data_name.pkl respfile_path=$tempdir/Y_te_$data_name.pkl
 
 echo "Also doing estimate..."
-python submit_jobs.py func=estimate covfile_path=$tempdir/X_tr_$data_name.pkl respfile_path=$tempdir/Y_tr_$data_name.pkl testcovfile_path=$tempdir/X_te_$data_name.pkl testrespfile_path=$tempdir/Y_te_$data_name.pkl
+python submit_jobs.py func=estimate covfile_path=$tempdir/X_tr_$data_name.pkl respfile_path=$tempdir/Y_tr_$data_name.pkl testcovfile_path=$tempdir/X_te_$data_name.pkl testrespfile_path=$tempdir/Y_te_$data_name.pkl testvarcovfile_path=$tempdir/X_var_te_$data_name.pkl varcovfile_path=$tempdir/X_var_tr_$data_name.pkl
 
 echo "Done!"
-rm -R $tempdir
+# rm -R $tempdir