Skip to content

Commit

Permalink
crappy command line version of INCATools/table-editor#9
Browse files Browse the repository at this point in the history
  • Loading branch information
cmungall committed Feb 13, 2017
1 parent d5c446a commit 631526f
Show file tree
Hide file tree
Showing 3 changed files with 159 additions and 12 deletions.
79 changes: 79 additions & 0 deletions dptab2grid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/usr/bin/env python3

__author__ = 'cjm'

import argparse
import logging
import re
import yaml
import json
import uuid
import csv
import itertools
import sys
from collections import Counter

def main():
    """Render a six-column CSV table as a Markdown grid on stdout.

    Every input row is unpacked as ``iri, label, x, xl, y, yl``. Distinct
    ``x`` values become the grid rows, distinct ``y`` values become the grid
    columns, and the cell at (x, y) is a link built from that row's
    ``iri`` and ``label``. ``xl`` / ``yl`` supply the link text for the
    row and column headers.

    The table is read from the file given with ``-i/--input``; when the
    option is omitted the table is read from standard input instead.

    Raises:
        ValueError: if a row does not have exactly six fields.
    """
    delimiter=','
    parser = argparse.ArgumentParser(description='DOSDB'
                                                 'fooo',
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-i', '--input', type=str, required=False,
                        help='Input metadata file')

    args = parser.parse_args()

    # --input is optional, so fall back to stdin rather than crashing in
    # open(None). The file is read eagerly and closed straight away.
    if args.input is None:
        rows = list(csv.reader(sys.stdin, delimiter=delimiter))
    else:
        # newline='' is what the csv module asks for so that quoted fields
        # containing line breaks are parsed correctly.
        with open(args.input, newline='') as handle:
            rows = list(csv.reader(handle, delimiter=delimiter))

    nmap = {}    # id -> label, for both row (x) and column (y) headers
    grid = {}    # x -> {y -> (iri, label)}
    colmap = {}  # set of y ids, kept as dict keys

    for row in rows:
        [iri,label,x,xl,y,yl] = row
        nmap[x] = xl
        nmap[y] = yl
        colmap[y] = 1
        if x not in grid:
            grid[x] = {}
        grid[x][y] = (iri,label)

    tdel = " | "
    tstart = "| "
    tend = " |\n"

    # Header row: the first cell is deliberately empty (corner of the grid).
    s = tstart
    for y in colmap.keys():
        s += tdel + hlink(y, nmap[y])
    s += tend

    # Markdown separator row, one "---" per column including the corner.
    s += tstart + "---"
    for y in colmap.keys():
        s += tdel + "---"
    s += tend

    for (x,row) in grid.items():
        s += tstart + hlink(x, nmap[x])
        for y in colmap.keys():
            v = ""
            if y in row:
                v = hlink(*row[y])
            s += tdel + v
        # Close the row with tend only: appending tdel as well produced an
        # extra empty cell that the header and separator rows do not have.
        s += tend
    print(s)

def hlink(id,label):
    """Return a Markdown inline link whose text is ``label``.

    The link target is whatever ``id2url(id)`` returns for ``id``.
    """
    return "[{}]({})".format(label, id2url(id))

def id2url(id):
    """Expand a ``PREFIX:LOCAL`` identifier into an OBO PURL.

    Args:
        id: identifier string, e.g. ``"GO:0008150"``.

    Returns:
        ``http://purl.obolibrary.org/obo/PREFIX_LOCAL`` when ``id`` consists
        of exactly two colon-separated parts; otherwise ``id`` unchanged.
        Strings of the form ``scheme://...`` are also returned unchanged.
    """
    parts = id.split(":")
    if len(parts) != 2:
        return id
    prefix, localid = parts
    if localid.startswith("//"):
        # "http://host/path" also splits into exactly two parts; it is
        # already a URL, so do not mangle it into a bogus PURL.
        return id
    return 'http://purl.obolibrary.org/obo/{}_{}'.format(prefix,localid)


# Run the converter only when this file is executed as a script.
if __name__ == "__main__":
    main()

16 changes: 10 additions & 6 deletions extract-obo-syns.pl
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,16 @@
die $rest;
}
my @xrefs = split(/,\s*/,$xrefstr);
next if $type =~ m@smiles@i;
next if $type =~ m@inchi@i;
next if $type =~ m@formula@i;
next if $syn !~ m@[a-z]@; # skip abbrevs and chem symbols
next if $syn =~ m@_@; # skip weird stuff, e.g. grouped_by_chemistry in CHEBI
next if $syn =~ m@\@\w+$@; # skip lang tags
if ($type =~ m@CURATED@) {
}
else {
next if $type =~ m@smiles@i;
next if $type =~ m@inchi@i;
next if $type =~ m@formula@i;
next if $syn !~ m@[a-z]@; # skip abbrevs and chem symbols
next if $syn =~ m@_@; # skip weird stuff, e.g. grouped_by_chemistry in CHEBI
next if $syn =~ m@\@\w+$@; # skip lang tags
}
$syn =~ s@\"@'@g;
$syn =~ s@\\@@g;
push(@{$smap->{$id}},
Expand Down
76 changes: 70 additions & 6 deletions fill-col1-ids.pl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
my $dry_run = 0;
my $max = 999999999;
my $min = 0;
my $skip_dupes = 0;
while ($ARGV[0] =~ m@^\-@) {
my $opt = shift @ARGV;
if ($opt eq '-n' || $opt eq '--dry-run') {
Expand All @@ -15,11 +16,15 @@
elsif ($opt eq '--min') {
$min = shift @ARGV;
}
elsif ($opt eq '--skip-dupes') {
$skip_dupes = 1;
}
else {
die $opt;
}
}


my @files = @ARGV;
my %pfxh = ();
my %minmap = ();
Expand Down Expand Up @@ -65,36 +70,71 @@
my $n=0;

open(F,">$f.tmp") || die "writing to $f";
my $N_COLS;
foreach (@lines) {
my ($id,$lbl,@rest) = split(/[\t,]/,$_);
my $val = join(",",@rest);
if (m@ @) {
die "DOUBLE SPACE: $_";
}
my ($id,$lbl,@rest) = split_csvline($_);
if ($N_COLS) {
if (scalar(@rest) != $N_COLS) {
die "wrong number of cols: $_\n";
}
}
$N_COLS = scalar(@rest);
my $val = "";
for (my $i=0; $i<@rest; $i+=2) {
$val .= $rest[$i];
}
if (grep {m@ \! @} @rest) {
die "UH OH: $_";
}
if ($done{$val} && $id ne 'iri') {
print STDERR "DUPLICATION: ($id $f), ($done{$val} $done_in{$val}) => $val\n";
if (!$id) {
print STDERR "DUPLICATION: (ID:$id FILE:$f), ($done{$val} $done_in{$val}) => $val in: $_";
#if (!$id) {
if ($skip_dupes) {
$n++;
next;
}
else {
die "DUPE";
}
#}
}
if ($done{$id} && $id ne 'iri') {
print STDERR "DUPLICATED ID: (ID:$id FILE:$f), ($done{$val} $done_in{$val}) => $val in: $_";
if ($skip_dupes) {
$n++;
next;
}
else {
die "DUPE";
}
}
$done{$val} = $id;
$done{$id} = $val;
$done_in{$val} = $f;
if (!$id) {
$n++;
$id = next_id();
print STDERR "NEWID: $id $lbl\n";
$_ = "$id$_";
}
#print STDERR $_;
print F $_;
}
close(F);
if ($n) {
print STDERR "$f ADDED: $total\n";
print STDERR "FILE: $f ADDED: $total\n";
print `mv $f.tmp $f` unless $dry_run;
$total += $n;
}
else {
print STDERR "NO CHANGE: will not write\n";
`rm $f.tmp`;
}
}
print STDERR "ADDED: $total\n";
print STDERR "TOTAL CHANGED: $total\n";
exit 0;


Expand Down Expand Up @@ -133,3 +173,27 @@ sub next_id {
my $next_id = sprintf $FMT, $frag;
return $next_id;
}

# split_csvline($line)
#
# Split one line of a table into its fields and return them as a list.
#
#   * The trailing newline of $line is removed first.
#   * If the line contains a tab it is treated as tab-separated and split
#     on tabs only.
#   * Otherwise it is split on commas; a field that opens with a double
#     quote but does not end with one is glued back together with the
#     following pieces (commas restored) until the closing quote is found.
#     The surrounding quotes are kept in the returned value.
#   * Empty fields, including trailing ones, are preserved so the number of
#     fields does not depend on whether the line ended in a newline.
#
# Dies with "unclosed: ..." if a quoted field is never closed.
sub split_csvline {
    my $line = shift;
    chomp $line;

    # split() yields an empty list for an empty string; report a blank
    # line as a single empty field instead.
    return ('') if $line eq '';

    # Split the argument itself, not the global $_, so the function works
    # for any caller. The -1 limit keeps trailing empty columns.
    if ($line =~ m@\t@) {
        return split(/\t/, $line, -1);
    }

    my @vals = split(/,/, $line, -1);
    my @rvals = ();
    while (@vals) {
        my $v = shift @vals;
        while ($v =~ m@^\"@ && $v !~ m@\"\s*$@) {
            if (@vals) {
                # Put back the comma that split() consumed inside the
                # quoted field.
                $v .= ',' . shift @vals;
            }
            else {
                die "unclosed: '$v'\n";
            }
        }
        push(@rvals, $v);
    }
    return @rvals;
}

0 comments on commit 631526f

Please sign in to comment.