From 0ee8d409bd22e0d38ee1dc1d7a316f6cab506fac Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 10 Jan 2025 12:54:06 +0000 Subject: [PATCH 1/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/atlas/get_result_web.py | 26 +++++++------- .../sc_similarity_examples/sim_query_atlas.py | 35 ++++++++++--------- examples/atlas/test_get_result_web.py | 2 +- 3 files changed, 33 insertions(+), 30 deletions(-) diff --git a/examples/atlas/get_result_web.py b/examples/atlas/get_result_web.py index a479d9de..d4d2bd4d 100644 --- a/examples/atlas/get_result_web.py +++ b/examples/atlas/get_result_web.py @@ -1,7 +1,7 @@ import argparse -from functools import partial import json import os +from functools import partial from pathlib import Path import numpy as np @@ -120,18 +120,19 @@ def spilt_web(url: str): print("No match found") -def get_metric(run,metric_col): +def get_metric(run, metric_col): """Extract metric value from wandb run. - + Parameters ---------- run : wandb.Run Weights & Biases run object - + Returns ------- float Metric value or negative infinity if metric not found + """ if metric_col not in run.summary: return float('-inf') # Return -inf for missing metrics to handle in comparisons @@ -140,7 +141,7 @@ def get_metric(run,metric_col): def get_best_method(urls, metric_col="test_acc"): """Find the best performing method across multiple wandb sweeps. - + Parameters ---------- urls : list @@ -163,11 +164,11 @@ def get_best_method(urls, metric_col="test_acc"): # Track run statistics run_states = {"all_total_runs": 0, "all_finished_runs": 0} - + for step_name, url in zip(step_names, urls): _, _, sweep_id = spilt_web(url) sweep = wandb.Api(timeout=1000).sweep(f"{entity}/{project}/{sweep_id}") - + # Update run statistics finished_runs = [run for run in sweep.runs if run.state == "finished"] run_states.update({ @@ -182,10 +183,10 @@ def get_best_method(urls, metric_col="test_acc"): best_run = max(sweep.runs, key=partial(get_metric, metric_col=metric_col)) if goal == "maximize" else \ min(sweep.runs, key=partial(get_metric, metric_col=metric_col)) if goal == "minimize" else \ None - + if best_run is None: raise RuntimeError("Optimization goal must be either 'minimize' or 'maximize'") - + if metric_col not in best_run.summary: continue if all_best_run is None: @@ -323,10 +324,10 @@ def get_new_ans(tissue): def write_ans(tissue, new_df, output_file=None): """Process and write results for a specific tissue type to CSV. - + Handles merging of new results with existing data, including conflict detection for metric values. - + Parameters ---------- tissue : str @@ -335,6 +336,7 @@ def write_ans(tissue, new_df, output_file=None): New results to be written output_file : str, optional Output file path. Defaults to 'sweep_results/{tissue}_ans.csv' + """ if output_file is None: output_file = f"sweep_results/{tissue}_ans.csv" @@ -345,7 +347,7 @@ def write_ans(tissue, new_df, output_file=None): # Reset index to ensure Dataset_id is a regular column new_df = new_df.reset_index(drop=True) - + # Process new data by merging rows with same Dataset_id new_df_processed = pd.DataFrame() for dataset_id in new_df['Dataset_id'].unique(): diff --git a/examples/atlas/sc_similarity_examples/sim_query_atlas.py b/examples/atlas/sc_similarity_examples/sim_query_atlas.py index 07f1b9da..e7024a93 100644 --- a/examples/atlas/sc_similarity_examples/sim_query_atlas.py +++ b/examples/atlas/sc_similarity_examples/sim_query_atlas.py @@ -76,18 +76,19 @@ def is_match(config_str): def is_matching_dict(yaml_str, target_dict): """Compare YAML configuration with target dictionary. - + Parameters ---------- yaml_str : str YAML configuration string to parse target_dict : dict Target dictionary to compare against - + Returns ------- bool True if dictionaries match, False otherwise + """ # Parse YAML string yaml_config = yaml.safe_load(yaml_str) @@ -107,18 +108,19 @@ def is_matching_dict(yaml_str, target_dict): def get_ans(query_dataset, method): """Get test accuracy results for a given dataset and method. - + Parameters ---------- query_dataset : str Dataset identifier method : str Method name to analyze - + Returns ------- pandas.DataFrame or None DataFrame containing test accuracy results, None if results don't exist + """ result_path = f"{file_root}/tuning/{method}/{query_dataset}/results/atlas/best_test_acc.csv" if not os.path.exists(result_path): @@ -137,33 +139,33 @@ def get_ans(query_dataset, method): def get_ans_from_cache(query_dataset, method): """Get cached test accuracy results for atlas datasets. - + Parameters ---------- query_dataset : str Query dataset identifier method : str Method name to analyze - + Returns ------- pandas.DataFrame DataFrame containing test accuracy results from cache + """ # Get best method from step2 of atlas datasets # Search accuracy according to best method (all values should exist) - ans = pd.DataFrame(index=[method], - columns=[f"{atlas_dataset}_from_cache" for atlas_dataset in atlas_datasets]) - + ans = pd.DataFrame(index=[method], columns=[f"{atlas_dataset}_from_cache" for atlas_dataset in atlas_datasets]) + sweep_url = re.search(r"step2:([^|]+)", - conf_data[conf_data["dataset_id"] == query_dataset][method].iloc[0]).group(1) + conf_data[conf_data["dataset_id"] == query_dataset][method].iloc[0]).group(1) _, _, sweep_id = spilt_web(sweep_url) sweep = wandb.Api().sweep(f"{entity}/{project}/{sweep_id}") - + for atlas_dataset in atlas_datasets: best_yaml = conf_data[conf_data["dataset_id"] == atlas_dataset][f"{method}_best_yaml"].iloc[0] match_run = None - + # Find matching run configuration for run in sweep.runs: if isinstance(best_yaml, float) and np.isnan(best_yaml): @@ -172,14 +174,13 @@ def get_ans_from_cache(query_dataset, method): if match_run is not None: raise ValueError("Multiple matching runs found when only one expected") match_run = run - + if match_run is None: logger.warning(f"No matching configuration found for {atlas_dataset} with method {method}") else: - ans.loc[method, f"{atlas_dataset}_from_cache"] = ( - match_run.summary["test_acc"] if "test_acc" in match_run.summary else np.nan - ) - + ans.loc[method, f"{atlas_dataset}_from_cache"] = (match_run.summary["test_acc"] + if "test_acc" in match_run.summary else np.nan) + return ans diff --git a/examples/atlas/test_get_result_web.py b/examples/atlas/test_get_result_web.py index 3bd817ae..a0e9a8b4 100644 --- a/examples/atlas/test_get_result_web.py +++ b/examples/atlas/test_get_result_web.py @@ -108,7 +108,7 @@ def test_write_ans(tmp_path): # 测试冲突情况的处理 write_ans(tissue, conflict_df, output_file=output_file) final_df = pd.read_csv(output_file) - + # 验证新值被更新 assert final_df[final_df['Dataset_id'] == 'dataset1']['method1_best_res'].iloc[0] == 0.7 From 3e24fcd5b89c00a0d27cdb1f73e09c6708bae6d7 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 10 Jan 2025 20:55:01 +0800 Subject: [PATCH 2/2] update ans --- .../example_usage_anndata.py | 14 ++++++++++++-- .../sc_similarity_examples/sim_query_atlas.py | 4 ++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/examples/atlas/sc_similarity_examples/example_usage_anndata.py b/examples/atlas/sc_similarity_examples/example_usage_anndata.py index b4b6d16e..f7103668 100644 --- a/examples/atlas/sc_similarity_examples/example_usage_anndata.py +++ b/examples/atlas/sc_similarity_examples/example_usage_anndata.py @@ -34,7 +34,8 @@ file_root = Path(__file__).resolve().parent set_seed(42) tissue = args.tissue -conf_data = pd.read_csv(f"results/{tissue}_result.csv", index_col=0) +# conf_data = pd.read_csv(f"results/{tissue}_result.csv", index_col=0) +conf_data = pd.read_excel("Cell Type Annotation Atlas.xlsx", sheet_name=tissue) target_files = list(conf_data[conf_data["queryed"] == False]["dataset_id"]) source_files = list(conf_data[conf_data["queryed"] == True]["dataset_id"]) @@ -150,4 +151,13 @@ def run_test_case(source_file): ] ans = run_test_case(source_file) merged_df = pd.concat(query_ans + [ans], join='inner') - merged_df.to_excel(writer, sheet_name=source_file[:4], index=True) + try: + # 尝试读取指定的分表 + existing_df = pd.read_excel(file_root / f"{tissue}_similarity.xlsx", sheet_name=source_file[:4]) + # 找出在新数据框中存在但在现有表格中不存在的行 + merged_df = pd.concat([existing_df, merged_df]) + merged_df = merged_df.drop_duplicates(keep='first') + # 使用 ExcelWriter 更新特定分表 + merged_df.to_excel(writer, sheet_name=source_file[:4], index=False) + except ValueError: + merged_df.to_excel(writer, sheet_name=source_file[:4], index=True) diff --git a/examples/atlas/sc_similarity_examples/sim_query_atlas.py b/examples/atlas/sc_similarity_examples/sim_query_atlas.py index fb7483bb..b7b81ca0 100644 --- a/examples/atlas/sc_similarity_examples/sim_query_atlas.py +++ b/examples/atlas/sc_similarity_examples/sim_query_atlas.py @@ -160,8 +160,8 @@ def get_ans_from_cache(query_dataset, method): # "738942eb-ac72-44ff-a64b-8943b5ecd8d9", "a5d95a42-0137-496f-8a60-101e17f263c8", # "71be997d-ff75-41b9-8a9f-1288c865f921" # ] -# conf_data = pd.read_excel("Cell Type Annotation Atlas.xlsx", sheet_name=tissue) -conf_data = pd.read_csv(f"results/{tissue}_result.csv", index_col=0) +conf_data = pd.read_excel("Cell Type Annotation Atlas.xlsx", sheet_name=tissue) +# conf_data = pd.read_csv(f"results/{tissue}_result.csv", index_col=0) atlas_datasets = list(conf_data[conf_data["queryed"] == False]["dataset_id"]) query_datasets = list(conf_data[conf_data["queryed"] == True]["dataset_id"]) if __name__ == "__main__":