crappy command line version of INCATools/table-editor#9

cmungall · Feb 13, 2017 · 631526f · 631526f
1 parent d5c446a
commit 631526f
Show file tree

Hide file tree

Showing 3 changed files with 159 additions and 12 deletions.
diff --git a/dptab2grid.py b/dptab2grid.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+
+__author__ = 'cjm'
+
+import argparse
+import logging
+import re
+import yaml
+import json
+import uuid
+import csv
+import itertools
+import sys
+from collections import Counter
+
+def main():
+    """Convert a six-column CSV into a Markdown grid and print it.
+
+    Each input row is unpacked as (iri, label, x, xl, y, yl). The x values
+    become table rows and the y values become table columns, both in
+    first-seen order; xl and yl are their display labels. The cell at
+    (x, y) is a Markdown link whose text is ``label`` and whose target is
+    derived from ``iri``. Cells with no matching input row are left empty.
+
+    Blank CSV lines are skipped. A non-blank row that does not have exactly
+    six fields raises ValueError from the unpacking below.
+    """
+
+    delimiter = ','
+    parser = argparse.ArgumentParser(description='DOSDB'
+                                                 'fooo',
+                                     formatter_class=argparse.RawTextHelpFormatter)
+    # The file is opened unconditionally, so the option has to be mandatory;
+    # otherwise a missing -i surfaces as an opaque TypeError from open(None).
+    parser.add_argument('-i', '--input', type=str, required=True,
+                        help='Input metadata file')
+
+    args = parser.parse_args()
+
+    # newline='' is what the csv module asks for; the with-block closes the file.
+    with open(args.input, newline='') as handle:
+        rows = [row for row in csv.reader(handle, delimiter=delimiter) if row]
+
+    nmap = {}    # id -> display label, for both x and y ids
+    grid = {}    # x id -> {y id -> (iri, label)}
+    colmap = {}  # y ids in first-seen order (dict used as an ordered set)
+
+    for row in rows:
+        iri, label, x, xl, y, yl = row
+        nmap[x] = xl
+        nmap[y] = yl
+        colmap[y] = 1
+        grid.setdefault(x, {})[y] = (iri, label)
+
+    columns = list(colmap)
+
+    def table_row(cells):
+        # One Markdown table line: "| c1 | c2 | ... |"
+        return "| " + " | ".join(cells) + " |\n"
+
+    # Header: an empty corner cell followed by one linked heading per column.
+    s = table_row([""] + [hlink(y, nmap[y]) for y in columns])
+    s += table_row(["---"] * (len(columns) + 1))
+
+    # Body: every row has exactly as many cells as the header (no extra
+    # trailing empty cell).
+    for x, cells in grid.items():
+        body = [hlink(*cells[y]) if y in cells else "" for y in columns]
+        s += table_row([hlink(x, nmap[x])] + body)
+    print(s)
+
+def hlink(id,label):
+    """Return a Markdown link "[label](url)" where url is id2url(id)."""
+    return "[{text}]({href})".format(text=label, href=id2url(id))
+
+def id2url(id):
+    """Expand a "PREFIX:LOCAL" identifier into an OBO PURL.
+
+    An id that splits on ":" into exactly two parts becomes
+    http://purl.obolibrary.org/obo/PREFIX_LOCAL; any other string is
+    returned unchanged.
+    """
+    parts = id.split(":")
+    if len(parts) != 2:
+        return id
+    prefix, localid = parts
+    return 'http://purl.obolibrary.org/obo/{}_{}'.format(prefix,localid)
+
+
+# Entry point: call main() only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
+
diff --git a/extract-obo-syns.pl b/extract-obo-syns.pl
@@ -32,12 +32,16 @@
             die $rest;
         }
         my @xrefs = split(/,\s*/,$xrefstr);
-        next if $type =~ m@smiles@i;
-        next if $type =~ m@inchi@i;
-        next if $type =~ m@formula@i;
-        next if $syn !~ m@[a-z]@;   # skip abbrevs and chem symbols
-        next if $syn =~ m@_@;   # skip weird stuff, e.g. grouped_by_chemistry in CHEBI
-        next if $syn =~ m@\@\w+$@;   # skip lang tags
+        if ($type =~ m@CURATED@) {
+        }
+        else {
+            next if $type =~ m@smiles@i;
+            next if $type =~ m@inchi@i;
+            next if $type =~ m@formula@i;
+            next if $syn !~ m@[a-z]@;   # skip abbrevs and chem symbols
+            next if $syn =~ m@_@;   # skip weird stuff, e.g. grouped_by_chemistry in CHEBI
+            next if $syn =~ m@\@\w+$@;   # skip lang tags
+        }
         $syn =~ s@\"@'@g;
         $syn =~ s@\\@@g;
         push(@{$smap->{$id}},

diff --git a/fill-col1-ids.pl b/fill-col1-ids.pl
@@ -4,6 +4,7 @@
 my $dry_run = 0;
 my $max = 999999999;
 my $min = 0;
+my $skip_dupes = 0;
 while ($ARGV[0] =~ m@^\-@) {
     my $opt = shift @ARGV;
     if ($opt eq '-n' || $opt eq '--dry-run') {
@@ -15,11 +16,15 @@
     elsif ($opt eq '--min') {
         $min = shift @ARGV;
     }
+    elsif ($opt eq '--skip-dupes') {
+        $skip_dupes = 1;
+    }
     else {
         die $opt;
     }
 }
 
+
 my @files = @ARGV;
 my %pfxh = ();
 my %minmap = ();
@@ -65,36 +70,71 @@
     my $n=0;
 
     open(F,">$f.tmp") || die "writing to $f";
+    my $N_COLS;
     foreach (@lines) {
-        my ($id,$lbl,@rest) = split(/[\t,]/,$_);
-        my $val = join(",",@rest);
+        if (m@  @) {
+            die "DOUBLE SPACE: $_";
+        }
+        my ($id,$lbl,@rest) = split_csvline($_);
+        if ($N_COLS) {
+            if (scalar(@rest) != $N_COLS) {
+                die "wrong number of cols: $_\n";
+            }
+        }
+        $N_COLS = scalar(@rest);
+        my $val = "";
+        for (my $i=0; $i<@rest; $i+=2) {
+            $val .= $rest[$i];
+        }
+        if (grep {m@ \! @} @rest) {
+            die "UH OH: $_";
+        }
         if ($done{$val} && $id ne 'iri') {
-            print STDERR "DUPLICATION: ($id $f), ($done{$val} $done_in{$val}) => $val\n";
-            if (!$id) {
+            print STDERR "DUPLICATION: (ID:$id FILE:$f), ($done{$val} $done_in{$val}) => $val in: $_";
+            #if (!$id) {
+                if ($skip_dupes) {
+                    $n++;
+                    next;
+                }
+                else {
+                    die "DUPE";
+                }
+            #}
+        }
+        if ($done{$id} && $id ne 'iri') {
+            print STDERR "DUPLICATED ID: (ID:$id FILE:$f), ($done{$val} $done_in{$val}) => $val in: $_";
+            if ($skip_dupes) {
+                $n++;
+                next;
+            }
+            else {
                 die "DUPE";
             }
         }
         $done{$val} = $id;
+        $done{$id} = $val;
         $done_in{$val} = $f;
         if (!$id) {
             $n++;
             $id = next_id();
             print STDERR "NEWID: $id $lbl\n";
             $_ = "$id$_";
         }
+        #print STDERR $_;
         print F $_;
     }
     close(F);
     if ($n) {
-        print STDERR "$f ADDED: $total\n";
+        print STDERR "FILE: $f ADDED: $total\n";
         print `mv $f.tmp $f` unless $dry_run;
         $total += $n;
     }
     else {
+        print STDERR "NO CHANGE: will not write\n";
         `rm $f.tmp`;
     }
 }
-print STDERR "ADDED: $total\n";
+print STDERR "TOTAL CHANGED: $total\n";
 exit 0;
 
 
@@ -133,3 +173,27 @@ sub next_id {
     my $next_id = sprintf $FMT, $frag;
     return $next_id;
 }
+
+# split_csvline($line)
+#
+# Split one line of a table into its fields and return them as a list.
+#
+#  - The trailing newline is removed from the argument (the caller's copy
+#    is not modified).
+#  - If the line contains a tab it is treated as tab-separated.
+#  - Otherwise it is treated as comma-separated; a field that starts with a
+#    double quote is extended across commas until a piece ending in a
+#    closing double quote is found. Quotes are kept in the returned field.
+#  - Trailing empty fields are kept, and an empty line yields one empty
+#    field, so every row of a table reports the same number of columns.
+#
+# Dies with "unclosed: '...'" if a quoted field is never closed.
+sub split_csvline {
+    my ($line) = @_;
+    chomp $line;
+
+    # split() returns an empty list for an empty string; report one empty field.
+    return ('') if $line eq '';
+
+    # Work on the argument, not on the global $_; limit -1 keeps trailing
+    # empty fields that split() would otherwise discard.
+    if ($line =~ m@\t@) {
+        return split(/\t/, $line, -1);
+    }
+
+    my @pieces = split(/,/, $line, -1);
+    my @rvals = ();
+    while (@pieces) {
+        my $v = shift @pieces;
+        # An opening quote needs a *separate* closing quote, so a lone '"'
+        # piece (a quoted field that begins with a comma) counts as open.
+        while ($v =~ m@^\"@ && $v !~ m@^\".*\"\s*$@s) {
+            if (!@pieces) {
+                die "unclosed: '$v'\n";
+            }
+            # Put back the comma that split() consumed inside the quotes.
+            $v .= ',' . shift @pieces;
+        }
+        push(@rvals, $v);
+    }
+    return @rvals;
+}