-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSnakefile
112 lines (101 loc) · 3.68 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# Environment setup and example invocation:
# mamba create -c conda-forge -c bioconda -n snakemake snakemake
# mamba activate snakemake
# snakemake -c2 --show-failed-logs --use-singularity --use-conda pairwise-sim.tsv.gz

# Base URL that the retrieve_* rules prepend to their output file name when
# downloading with curl.
data_repo = "https://huggingface.co/datasets/imageomics/phenoscape-character-eqs/resolve/main"
# Download phenex-data-merged.ofn.gz from the dataset repository (`data_repo`).
rule retrieve_annotations_file:
    output:
        "phenex-data-merged.ofn.gz"
    container:
        "docker://obolibrary/odkfull:v1.5"
    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # server's error page as the output file (which Snakemake would accept as
    # a successful download). --output writes to the declared output path
    # explicitly rather than relying on the remote file name matching it.
    shell:
        "curl --fail --location --output {output} {data_repo}/{output}"
# Download phenoscape-kb-tbox-classified.ttl.gz from the dataset repository
# (`data_repo`).
rule retrieve_tbox_file:
    output:
        "phenoscape-kb-tbox-classified.ttl.gz"
    container:
        "docker://obolibrary/odkfull:v1.5"
    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # server's error page as the output file (which Snakemake would accept as
    # a successful download). --output writes to the declared output path
    # explicitly rather than relying on the remote file name matching it.
    shell:
        "curl --fail --location --output {output} {data_repo}/{output}"
# Generic conversion rule: produces `{ontology}.ttl` from `{ontology}.ofn.gz`
# by running `robot convert` inside the ROBOT container.
rule convert_ofn_gz_to_ttl:
    input:
        "{ontology}.ofn.gz"
    output:
        "{ontology}.ttl"
    container:
        "docker://obolibrary/robot:v1.9.4"
    shell:
        "robot convert -i {input} -o {output}"
# Run the extract-descriptions SPARQL query with Jena `arq` over the two RDF
# data files and write the cleaned TSV result to extracted-descriptions.tsv.
rule extract_descriptions:
    input:
        annotations="phenex-data-merged.ttl",
        tbox="phenoscape-kb-tbox-classified.ttl.gz",
        query="sparql/extract-descriptions.rq"
    output:
        "extracted-descriptions.tsv"
    container:
        "docker://stain/jena:5.1.0"
    # The command refers to the declared inputs by name so the paths cannot
    # drift apart from the `input:` section. Post-processing of arq's TSV:
    #   s/^"//    drop a double quote at the start of each line
    #   s/\t"/\t/ drop a double quote that directly follows a tab
    #   s/"$//    drop a double quote at the end of each line
    #   s/\\"/"/  turn backslash-escaped quotes into plain quotes
    #   tail -n +2 skips the first line (presumably the TSV header row)
    shell:
        "arq --data {input.annotations} --data {input.tbox} --query {input.query} --results tsv"
        " | sed -E 's/^\"//' | sed -E 's/\\t\"/\\t/g' | sed -E 's/\"$//' | sed -E 's/\\\\\"/\"/g'"
        " | tail -n +2 >{output}"
# Run the descriptions-to-ontology SPARQL query with Jena `arq` over the two
# RDF data files and write the cleaned TSV result to annotations.tsv.
rule extract_annotations:
    input:
        annotations="phenex-data-merged.ttl",
        tbox="phenoscape-kb-tbox-classified.ttl.gz",
        query="sparql/descriptions-to-ontology.rq"
    output:
        "annotations.tsv"
    container:
        "docker://stain/jena:5.1.0"
    # The command refers to the declared inputs by name so the paths cannot
    # drift apart from the `input:` section. Post-processing of arq's TSV:
    #   s/^"//      drop a double quote at the start of each line
    #   s/"\t"/\t/  drop the quotes on both sides of a tab separator
    #   s/"$//      drop a double quote at the end of each line
    #   s/\\"/"/    turn backslash-escaped quotes into plain quotes
    #   tail -n +2  skips the first line (presumably the TSV header row)
    shell:
        "arq --data {input.annotations} --data {input.tbox} --query {input.query} --results tsv"
        " | sed -E 's/^\"//' | sed -E 's/\"\\t\"/\\t/g' | sed -E 's/\"$//' | sed -E 's/\\\\\"/\"/g'"
        " | tail -n +2 >{output}"
# Generic rule: turn `{rdf}.ttl.gz` into a tab-separated `{rdf}.facts` file.
rule convert_ttl_gz_to_souffle_tsv:
    input:
        "{rdf}.ttl.gz"
    output:
        "{rdf}.facts"
    container:
        "docker://stain/jena:5.1.0"
    # `{{ ... }}` is an escaped bash group. Inside it, `|| true` discards
    # riot's exit status so a non-zero exit does not abort the pipeline.
    # The two identical sed calls each replace the first remaining space on a
    # line with a tab (so the first two spaces become tabs); the last sed
    # strips the trailing " ." from each line.
    shell:
        "{{ riot -q --nocheck --output ntriples {input} || true; }}"
        " | sed 's/ /\\t/' | sed 's/ /\\t/' | sed 's/ \\.$//' >{output}"
# Run the Souffle program scripts/subsumptions.dl to produce subsumptions.tsv.
# NOTE(review): the command never mentions the .facts input or the .tsv output;
# presumably the Datalog program reads and writes them by name in the working
# directory -- confirm against scripts/subsumptions.dl.
rule subsumptions_closure:
    input:
        "phenoscape-kb-tbox-classified.facts",
        "scripts/subsumptions.dl"
    output:
        "subsumptions.tsv"
    container:
        "docker://obolibrary/odkfull:v1.5"
    shell:
        "souffle -c scripts/subsumptions.dl"
# Run scripts/sim.sc with scala-cli on annotations.tsv and subsumptions.tsv to
# produce pairwise-sim.tsv.gz. The JVM is started with a 48G maximum heap.
rule compute_pairwise_similarity:
    input:
        annotations="annotations.tsv",
        subsumptions="subsumptions.tsv",
        # Declared as an input (like the scripts of the sibling rules) so that
        # editing the script causes this rule to be re-run.
        script="scripts/sim.sc"
    output:
        "pairwise-sim.tsv.gz"
    container:
        "docker://virtuslab/scala-cli:1.3.0"
    # Arguments after `--` are passed to the script: the two input tables
    # followed by the output path.
    shell:
        "scala-cli run --server=false --java-opt -Xmx48G {input.script}"
        " -- {input.annotations} {input.subsumptions} {output}"
# Run embed_model/create_train_data.py for a given `{percentage}` wildcard,
# producing the TRAINING, ALL_NON_TRAIN and NON_OVERLAP data files.
rule create_train_data:
    input:
        descriptions="extracted-descriptions.tsv",
        similarity="pairwise-sim.tsv.gz",
        script="embed_model/create_train_data.py"
    output:
        training="data_{percentage}p_TRAINING.tsv.gz",
        all_non_train="data_{percentage}p_ALL_NON_TRAIN.tsv.gz",
        non_overlap="data_{percentage}p_NON_OVERLAP.tsv.gz"
    conda:
        "environment.yaml"
    # Every path comes from the named inputs/outputs above, so the command
    # cannot drift apart from the declared files. Argument order passed to the
    # script: descriptions, similarity, percentage, then the three outputs.
    shell:
        "python {input.script} {input.descriptions} {input.similarity} {wildcards.percentage}"
        " {output.training} {output.all_non_train} {output.non_overlap}"
# Run embed_model/train_mpnet_v2.py on the `{percentage}` training file,
# passing the directory `output-{percentage}p` as its second argument.
rule train_model:
    input:
        data="data_{percentage}p_TRAINING.tsv.gz",
        script="embed_model/train_mpnet_v2.py"
    output:
        output_dir=directory("output-{percentage}p")
    conda:
        "train_environment.yaml"
    # `mkdir -p` succeeds even if the directory already exists, and `&&`
    # ensures the training script only starts once the directory is in place.
    shell:
        "mkdir -p {output.output_dir} && python {input.script} {input.data} {output.output_dir}"