diff --git a/.gitignore b/.gitignore
index ddd329e..7f1258d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -96,4 +96,13 @@
 tests/cli_test/*
 docs/autoapi/*
 docs/_build/*
-docs
\ No newline at end of file
+docs
+tests/cli_test_parallel_kfold/temp/be_te_fcon1000.pkl
+tests/cli_test_parallel_kfold/temp/be_tr_fcon1000.pkl
+tests/cli_test_parallel_kfold/temp/fcon1000
+tests/cli_test_parallel_kfold/temp/X_te_fcon1000.pkl
+tests/cli_test_parallel_kfold/temp/X_tr_fcon1000.pkl
+tests/cli_test_parallel_kfold/temp/X_var_te_fcon1000.pkl
+tests/cli_test_parallel_kfold/temp/X_var_tr_fcon1000.pkl
+tests/cli_test_parallel_kfold/temp/Y_te_fcon1000.pkl
+tests/cli_test_parallel_kfold/temp/Y_tr_fcon1000.pkl
diff --git a/tests/cli_test_parallel_kfold/Yhat_estimate_ft0_batch1.png b/tests/cli_test_parallel_kfold/Yhat_estimate_ft0_batch1.png
new file mode 100644
index 0000000..4441116
Binary files /dev/null and b/tests/cli_test_parallel_kfold/Yhat_estimate_ft0_batch1.png differ
diff --git a/tests/cli_test_parallel_kfold/Yhat_estimate_ft0_batch2.png b/tests/cli_test_parallel_kfold/Yhat_estimate_ft0_batch2.png
new file mode 100644
index 0000000..4441116
Binary files /dev/null and b/tests/cli_test_parallel_kfold/Yhat_estimate_ft0_batch2.png differ
diff --git a/tests/cli_test_parallel_kfold/Yhat_estimate_ft1_batch1.png b/tests/cli_test_parallel_kfold/Yhat_estimate_ft1_batch1.png
new file mode 100644
index 0000000..7defa3e
Binary files /dev/null and b/tests/cli_test_parallel_kfold/Yhat_estimate_ft1_batch1.png differ
diff --git a/tests/cli_test_parallel_kfold/Yhat_estimate_ft1_batch2.png b/tests/cli_test_parallel_kfold/Yhat_estimate_ft1_batch2.png
new file mode 100644
index 0000000..7defa3e
Binary files /dev/null and b/tests/cli_test_parallel_kfold/Yhat_estimate_ft1_batch2.png differ
diff --git a/tests/cli_test_parallel_kfold/inspect_results.py b/tests/cli_test_parallel_kfold/inspect_results.py
new file mode 100644
index 0000000..f8760ff
--- /dev/null
+++ b/tests/cli_test_parallel_kfold/inspect_results.py
@@ -0,0 +1,62 @@
+import pickle
+import numpy as np
+import matplotlib.pyplot as plt
+
+import glob
+import os
+
+
+def load_pickle(path):
+    """Load one pickled object and close the file handle afterwards."""
+    with open(path, "rb") as fh:
+        return pickle.load(fh)
+
+
+for batch in [1,2]:
+    # Point at this iteration's batch directory; with a fixed "batch_1" both iterations plotted the same results.
+    results_dir = f"/project/3022000.05/projects/stijdboe/temp/parallel_processing/batch_{batch}"
+
+
+    for func in ['fit', 'predict', 'estimate']:
+        print(f"Plotting {func} results...")
+        results = glob.glob(os.path.join(results_dir, f"*{func}.pkl"))
+        for result in results:
+            if "Z" in result:
+                z = load_pickle(result)
+                n = np.random.randn(z.shape[0], 1)
+                sorted_z = np.sort(z, axis=0)
+                sorted_n = np.sort(n, axis=0)
+                plt.plot(sorted_z, sorted_n, label=f"Z_{func}")
+                plt.savefig(f"Z_{func}_batch{batch}.png")
+                plt.close()
+            elif "yhat" in result:
+                x_path = "/project/3022000.05/projects/stijdboe/Projects/PCNtoolkit/tests/cli_test_parallel_kfold/temp/X_te_fcon1000.pkl"
+                x = load_pickle(x_path).to_numpy()
+                sortindex = np.argsort(x[:,1])
+                print(x[sortindex, 1])
+                yhat = load_pickle(result).to_numpy()
+                result = result.replace("yhat", "ys2")
+                s2 = load_pickle(result).to_numpy()
+                print(x.shape)
+                print(yhat.shape)
+                print(s2.shape)
+
+                for i in range(yhat.shape[1]):
+                    plt.plot(x[sortindex, 1], yhat[sortindex, i], label=f"Yhat_{func}_{i}")
+                    plt.plot(x[sortindex, 1], yhat[sortindex, i] - s2[sortindex, i], label=f"Yhat_{func}_{i} - s2")
+                    plt.plot(x[sortindex, 1], yhat[sortindex, i] + s2[sortindex, i], label=f"Yhat_{func}_{i} + s2")
+                    plt.savefig(f"Yhat_{func}_ft{i}_batch{batch}.png")
+                    plt.close()
+            elif "S2" in result:
+                s2 = load_pickle(result)
+                print(f"{s2=}")
+            elif "EXPV" in result:
+                expv = load_pickle(result)
+                print(f"{expv=}")
+            elif "MSLL" in result:
+                msll = load_pickle(result)
+                print(f"{msll=}")
+            elif "SMSE" in result:
+                smse = load_pickle(result)
+                print(f"{smse=}")
+
diff --git a/tests/cli_test_parallel_kfold/split_data.py b/tests/cli_test_parallel_kfold/split_data.py
index bb2caa9..5573843 100644
--- a/tests/cli_test_parallel_kfold/split_data.py
+++ b/tests/cli_test_parallel_kfold/split_data.py
@@ -34,6 +34,10 @@ def main():
     # Standardize the covariates and responses
     cov = StandardScaler().fit_transform(cov.to_numpy()[:,np.newaxis])
     resp = StandardScaler().fit_transform(resp.to_numpy())
+
+    xmin = cov.min()
+    xmax = cov.max()
+
     # Map the batch effects to integers
     be_ids = np.unique(be, return_inverse=True)[1]
 
@@ -44,16 +48,24 @@ def main():
     # Create the design matrices
     mean_basis = 'linear'
     var_basis = 'linear'
-    Phi_tr = create_design_matrix(cov[train_idx], basis=mean_basis, intercept=False, site_ids=be_ids[train_idx])
-    Phi_var_tr = create_design_matrix(cov[train_idx], basis=var_basis)
-    Phi_te = create_design_matrix(cov[test_idx], basis=mean_basis, intercept=False, site_ids=be_ids[test_idx])
-    Phi_var_te = create_design_matrix(cov[test_idx], basis=var_basis)
+    # Phi_tr = create_design_matrix(cov[train_idx], basis=mean_basis, intercept=False, site_ids=be_ids[train_idx])
+    # Phi_var_tr = create_design_matrix(cov[train_idx], basis=var_basis)
+    # Phi_te = create_design_matrix(cov[test_idx], basis=mean_basis, intercept=False, site_ids=be_ids[test_idx])
+    # Phi_var_te = create_design_matrix(cov[test_idx], basis=var_basis)
+
+    Phi_tr = create_design_matrix(cov[train_idx], basis=mean_basis, intercept=True, xmin=xmin, xmax=xmax)
+    # Phi_var_tr = create_design_matrix(cov[train_idx], basis=var_basis, xmin=xmin, xmax=xmax)
+    Phi_var_tr = cov[train_idx]
+    Phi_te = create_design_matrix(cov[test_idx], basis=mean_basis, intercept=True, xmin=xmin, xmax=xmax)
+    # Phi_var_te = create_design_matrix(cov[test_idx], basis=var_basis, xmin=xmin, xmax=xmax)
+    Phi_var_te = cov[test_idx]
+    print(f"{Phi_var_te.shape=}")
 
     # Save everything
     pd.to_pickle(pd.DataFrame(Phi_tr), os.path.join(args.output_dir, f'X_tr_{infile}.pkl'))
-    pd.to_pickle(pd.DataFrame(Phi_var_tr), os.path.join(args.output_dir, f'X_var_tr_{infile}.pkl'))
+    pd.to_pickle(Phi_var_tr, os.path.join(args.output_dir, f'X_var_tr_{infile}.pkl'))
     pd.to_pickle(pd.DataFrame(Phi_te), os.path.join(args.output_dir, f'X_te_{infile}.pkl'))
-    pd.to_pickle(pd.DataFrame(Phi_var_te), os.path.join(args.output_dir, f'X_var_te_{infile}.pkl'))
+    pd.to_pickle(Phi_var_te, os.path.join(args.output_dir, f'X_var_te_{infile}.pkl'))
     pd.to_pickle(pd.DataFrame(resp[train_idx]), os.path.join(args.output_dir, f'Y_tr_{infile}.pkl'))
     pd.to_pickle(pd.DataFrame(resp[test_idx]), os.path.join(args.output_dir, f'Y_te_{infile}.pkl'))
     pd.to_pickle(be[train_idx], os.path.join(args.output_dir, f'be_tr_{infile}.pkl'))
diff --git a/tests/cli_test_parallel_kfold/submit_jobs.py b/tests/cli_test_parallel_kfold/submit_jobs.py
index cebae3f..9a6b1f8 100644
--- a/tests/cli_test_parallel_kfold/submit_jobs.py
+++ b/tests/cli_test_parallel_kfold/submit_jobs.py
@@ -5,35 +5,45 @@
 
 
 def execute_nm_wrapper(*args):
-    args_dict = {k:v for k,v in [arg.split('=') for arg in args]}
+    args_dict = {k:v for k,v in [arg.split('=', 1) for arg in args[0]]}  # split once so a value may contain '='
 
     func = args_dict.get('func')
     covfile_path = args_dict.get('covfile_path',None)
     respfile_path = args_dict.get('respfile_path',None)
+    varcovfile_path = args_dict.get('varcovfile_path',None)
     testcovfile_path = args_dict.get('testcovfile_path',None)
-    testrespfile_path = args_dict.get('testrespfile_path',None)
+    testrespfile_path = args_dict.get('testrespfile_path',None)
+    testvarcovfile_path = args_dict.get('testvarcovfile_path',None)
+    if func == "estimate":
+        testrespfile_path = None
 
     execute_nm(
         python_path='/home/preclineu/stijdboe/.conda/envs/pcntk_dev/bin/python',
         normative_path="/home/preclineu/stijdboe/.conda/envs/pcntk_dev/lib/python3.12/site-packages/pcntoolkit/normative.py",
         job_name='test_normative_parallel',
-        processing_dir='/project/3022000.05/projects/stijdboe/temp/parallel_processing',
+        processing_dir='/project/3022000.05/projects/stijdboe/temp/parallel_processing/',
+        log_path='/project/3022000.05/projects/stijdboe/temp/parallel_processing/log/',
+        varcovfile=varcovfile_path,
+        testvarcovfile=testvarcovfile_path,
         func=func,
         covfile_path=covfile_path,
         respfile_path=respfile_path,
         testcovfile_path=testcovfile_path,
         testrespfile_path=testrespfile_path,
         batch_size=2,
-        memory='16G',
-        duration='00:10:00',
+        memory='4G',
+        duration='00:02:00',
         job_id=1,
         cv_folds = 5,
-        alg='hbr',
+        alg='blr',
         warp='WarpSinArcsinh',
         optimizer='l-bfgs-b',
-        warp_reparam=True,
-        inscaler='standardize',
-        binary=False
+        warp_reparam='True',
+        binary='True',
+        cluster_spec='slurm',
+        saveoutput='True',
+        savemodel='True',
+        outputsuffix=f"_{func}"
     )
 
 
diff --git a/tests/cli_test_parallel_kfold/test_cli.sh b/tests/cli_test_parallel_kfold/test_cli.sh
index b7df2d4..bb85ce9 100755
--- a/tests/cli_test_parallel_kfold/test_cli.sh
+++ b/tests/cli_test_parallel_kfold/test_cli.sh
@@ -12,14 +12,14 @@ curl -o $tempdir/$data_name https://raw.githubusercontent.com/predictive-clinica
 echo "Splitting the data into train and test covariates, responses and batch effects..."
 python split_data.py --input_file $tempdir/$data_name --output_dir $tempdir
 
-echo "Fitting the model..."
-python submit_jobs.py func=fit covfile_path=$tempdir/X_tr_$data_name.pkl respfile_path=$tempdir/Y_tr_$data_name.pkl
+# echo "Fitting the model..."
+# python submit_jobs.py func=fit covfile_path=$tempdir/X_tr_$data_name.pkl respfile_path=$tempdir/Y_tr_$data_name.pkl
 
-echo "Predicting the test set..."
-python submit_jobs.py func=predict covfile_path=$tempdir/X_te_$data_name.pkl respfile_path=$tempdir/Y_te_$data_name.pkl
+# echo "Predicting the test set..."
+# python submit_jobs.py func=predict covfile_path=$tempdir/X_te_$data_name.pkl respfile_path=$tempdir/Y_te_$data_name.pkl
 
 echo "Also doing estimate..."
-python submit_jobs.py func=estimate covfile_path=$tempdir/X_tr_$data_name.pkl respfile_path=$tempdir/Y_tr_$data_name.pkl testcovfile_path=$tempdir/X_te_$data_name.pkl testrespfile_path=$tempdir/Y_te_$data_name.pkl
+python submit_jobs.py func=estimate covfile_path=$tempdir/X_tr_$data_name.pkl respfile_path=$tempdir/Y_tr_$data_name.pkl testcovfile_path=$tempdir/X_te_$data_name.pkl testrespfile_path=$tempdir/Y_te_$data_name.pkl testvarcovfile_path=$tempdir/X_var_te_$data_name.pkl varcovfile_path=$tempdir/X_var_tr_$data_name.pkl
 
 echo "Done!"
-rm -R $tempdir
\ No newline at end of file
+# rm -R $tempdir