curl · sluicing · Jan 8, 2024
diff --git a/mkindex.pl b/mkindex.pl
@@ -1,4 +1,5 @@
-#!/usr/bin/perl
+#!/usr/bin/perl -w
+
 # Build an index of words in the file index-words that are found in the text.
 # Words are compared case insensitively except for those starting with a dash
 # (i.e. program option names). "Words" may actually be phrases consisting of
@@ -7,6 +8,10 @@
 
 use feature "fc";
 
+# import shared function
+use lib '.';
+require "urlify.pl";
+
 # Return the case-folded keyword UNLESS it appears to be an option string
 # in which case return it as-is. This makes word lookups case-insensitive
 # but option name lookups case-sensitive.
@@ -31,33 +36,6 @@ sub folded {
 }
 close(F);
 
-sub urlify {
-    my ($fname, $section)=@_;
-
-    # convert letters to lower case
-    $section =~ tr/[A-Z]/[a-z]/;
-
-    # Convert all '<' to '-less-than' 
-    $section =~ s/\</-less-than-/g;
-
-    # Convert all '>' to '-greater-than'
-    $section =~ s/\>/-greater-than-/g;
-
-    # remove rubbish
-    $section =~ s/[*`'":\(\),]+//g;
-
-    # convert anything left that isn't a dash, underscore, number or letter
-    $section =~ s/[^_a-zA-Z0-9-]/-/g;
-
-    # Remove starting chars that aren't a letter or underscore;
-    # those aren't legal for the beginning of a section ID.
-    $section =~ s/^[^_a-zA-Z]+//g;
-
-    # strip trailing dash '-' characters from the section header
-    $section =~ s/-+$//;
-
-    return "$fname#$section";
-}
 
 sub single {
     my ($fname)=@_;
@@ -72,16 +50,16 @@ sub single {
         my $l=$_;
 
         # Track whether we are within a markdown code block that begins/ends with ```
-        if ($_ =~ /^\`\`\`.*/) {
-            if ($in_code_section) {
+        if($_ =~ /^\`\`\`.*/) {
+            if($in_code_section) {
                 $in_code_section = 0;
             } else {
                 $in_code_section = 1;
             }
         }
 
-        if (!$in_code_section) {
-            if ($_ =~ /^(#[\#]*) ([^\{]*)(\{\#(.*)\})/) {
+        if(!$in_code_section) {
+            if($_ =~ /^(#[\#]*) ([^\{]*)(\{\#(.*)\})/) {
                 # This section header has an explicit ID specified, e.g. "#Foo {#foo}"
                 $depth = $1;
                 $section = $2;
@@ -90,17 +68,29 @@ sub single {
                 $section =~ s/\s+$//;
 
                 $url = "$fname#$dest_id";
+
+                # Use only the anchor text when a section heading has an anchor tag
+                if($section =~ m/(.*)\[(.*)\]\(.*\).*/) {
+                    $section = "$1$2";
+                }
                 $l = $section;
             }
-            elsif ($_ =~ /^(#[\#]*) (.*)/) {
+            elsif($_ =~ /^(#[\#]*) (.*)/) {
                 # This section header has no explicit ID specified, e.g. "#Foo"
                 $depth = $1;
                 $section = $2;
 
                 # trim whitespace off end of section
                 $section =~ s/\s+$//;
 
-                $url=urlify($fname, $section);
+                my $url_section  = urlify($section);
+                $url = "$fname#$url_section";
+
+                # Use only the anchor text when a section heading has an anchor tag
+                if($section =~ m/(.*)\[(.*)\]\(.*\).*/) {
+                    $section = "$1$2";
+                }
+
                 $l = $section; # use this too
             }
         }
@@ -121,7 +111,6 @@ sub single {
         foreach my $w (@lwords) {
             if(folded($l) =~ /$w/) {
                 if(!$word{$w}{$fname}) {
-                    #print " $w ($url)\n";
                     $word{$w}{$fname}++;
                     $all{$w} .= ($all{$w}?", ":"")."[$section]($url)";
                 }
@@ -158,9 +147,9 @@ sub byname {
     my $l = substr(sorting($w), 0, 1);
     if(!$letter{$l}) {
         $letter{$l}++;
-        print "## $l\n";
+        # Make sure headings have blank lines before and after
+        print "\n## $l\n\n";
     }
-
     printf " - ".$index{$w}.": ";
     print $all{$w}."\n";
 }
diff --git a/uni.pl b/uni.pl
@@ -1,4 +1,8 @@
-#!/usr/bin/perl
+#!/usr/bin/perl -w
+
+# import shared function
+use lib '.';
+require "urlify.pl";
 
 my $sum = shift @ARGV;
 
@@ -20,37 +24,209 @@ sub dirname {
     return "";
 }
 
+
+# Converts a file path to text usable in a section ID: one leading '/' is dropped and each run of '/' or '.' becomes "__"
+sub urlify_file_path {
+    my ($part) = @_;
+
+    # Strip a leading '/' if present
+    $part =~ s/^\///;
+
+    $part =~ s/[\/\.]+/__/g;
+
+    return $part;
+}
+
+
+# Converts a relative path and current directory to a full path relative
+# to the "ROOT" directory. Only whole leading ".." components climb a level.
+sub make_full_file_path {
+    my ($current_dir, $linked_file_path) = @_;
+
+    # Handle paths with "../" in them; a name such as "..foo" is not a parent reference
+    while($linked_file_path =~ m/^\.\.(?:\/|\z)/) {
+        # Strip leading ".." (dots escaped so only literal dots match)
+        $linked_file_path =~ s/^\.\.//;
+        # Strip leading "/" if present
+        $linked_file_path =~ s/^\///;
+
+        # Strip trailing /
+        $current_dir =~ s/\/$//;
+        if($current_dir =~ m/\//) {
+            # if there is more than one dir remaining, strip the last dir
+            $current_dir =~ s/(.*)\/.*/$1/;
+        } else {
+            # only one dir remaining, use root of ""
+            $current_dir = "";
+        }
+    }
+    # An empty $current_dir gives "/file"; the leading "/" is kept in the result
+    my $final_path="$current_dir/$linked_file_path";
+
+    return $final_path;
+}
+
+
+# Rewrites an anchor destination to point to a section ID in the current file; http/https targets and targets matching no rule stay unchanged.
+sub update_anchor {
+    my ($anchor_text, $anchor_target, $dir, $f) = @_;
+    my $final_target = $anchor_target;
+
+    if($anchor_target =~ m/^(http\:|https\:).*$/ ) {
+        # don't rewrite regular http / https urls
+    } elsif($anchor_target =~ m/(^.*)(\.png|\.jpg)$/ ) {
+        #
+        # The anchor_target points to an image (.png / .jpg), add the image directory ($dir) to the target
+        #
+        $final_target = "$dir$1$2";
+    } elsif($anchor_target =~ m/((.+)\.md)\#(.+)/ ) {
+        #
+        # The anchor_target points to a SPECIFIC section of a DIFFERENT *.md file
+        # Result: "#<urlified file path>-_-_-<section>"
+        my $full_file_path=make_full_file_path($dir, $1);
+        my $file_target=urlify_file_path($full_file_path);
+        my $section_target=$3;
+
+        $final_target = "#$file_target" . "-_-_-" . $section_target;
+    } elsif($anchor_target =~ m/((.+)\.md)$/x) {
+        #
+        # The anchor_target points to NO SECTION in a DIFFERENT *.md file
+        # Result: "#<urlified file path>"
+
+        my $full_file_path=make_full_file_path($dir, $1);
+        my $file_target=urlify_file_path($full_file_path);
+
+        $final_target = "#$file_target";
+    } elsif($anchor_target =~ m/\#(.+)$/x) {
+        #
+        # The anchor_target points to A SECTION in THIS *.md file ($f)
+        # Result: "#<urlified path of $f>-_-_-<section>"
+
+        my $full_file_path = $f;
+        my $file_target=urlify_file_path($full_file_path);
+
+        my $section_target=$1;
+
+        $final_target = "#$file_target" . "-_-_-" . $section_target;
+    }
+
+    return $anchor_text . "($final_target)";
+}
+
+my $errors = 0;
+
 sub include {
     my ($f) = @_;
     my $line;
     open(M, "<$f") || return;
     my $dir = dirname($f);
+
+    my $same_dir_url_part = urlify_file_path($dir);
+    my $same_file_url_part = urlify_file_path($f);
+
+    my $in_code_section = 0;
+    my $next_line_should_be_blank = 0;
+
+    # Print an empty span at the top of each new file, to give "foo.md" links from other files a
+    # place to target with a {#foo} style link in the combined .md file
     print "\n";
+    print "[ ]\{#$same_file_url_part\}\n";
+    print "\n";
+
     while(<M>) {
         $line++;
-        # strip out links to markdown files
-        $_ =~ s/\[([^]]*)\]\(.*\.md(|\#(.*))\)/$1/g;
-        # add path to image links
-        $_ =~ s/^!\[(.*)\]\(([^)]*)\)/![$1]($dir$2)/g;
-
-        if($_ =~ /\]\(.*\.md/) {
-            print STDERR "$f:$line:line-split markdown link\n";
-            print STDERR "$_";
-            $errors++;
-        }
-        if($_ =~ /谭/) {
-            # skip unicode letter pandoc does not like
+        my $complete_line = $_;
+
+        if($next_line_should_be_blank) {
+            if(! (($complete_line eq "") || ($complete_line =~ m/^[\s]*$/))) {
+                print STDERR "WARNING: The line after a '#' header should be blank in $f, but was: $complete_line\n";
+            }
+
+            $next_line_should_be_blank = 0;
         }
-        else {
-            print $_;
+
+        # Track whether we are within a markdown code block that begins/ends with ```
+        if($complete_line =~ /^\`\`\`.*/) {
+            if($in_code_section) {
+                $in_code_section = 0;
+            } else {
+                $in_code_section = 1;
+            }
+        } elsif(!$in_code_section) {
+            # Split line, so we can update multiple URLs on one line
+            my @line_items = split( /( \[ [^]]*\]\([^)]*?\) )/x , $_);
+
+            foreach my $item (@line_items) {
+                # Update all of the anchor targets in the line
+                $item =~ s/ (\[ [^]]*\]) \( ([^)]*?) \) /update_anchor($1 , $2, $dir, $f) /xge;
+            }
+            $complete_line = join('', @line_items);
+
+            #
+            # Check for section H1, H2, etc, definitions (starting with #, ##, etc)
+            #
+            # Add an explicit section ID to those if not present
+            # Include in that section ID a reference to the full path to the current file
+            my $full_file_path = $f;
+            my $file_target=urlify_file_path($full_file_path);
+
+            my $final_section_line = $complete_line;
+
+            if($complete_line =~ /^(#[\#]*) ([^\{]*)(\{\#(.*)\})/) {
+                $next_line_should_be_blank = 1;
+
+                # This section header has an explicit ID specified, e.g. "#Foo {#foo}"
+                my $depth = $1;
+                my $section = $2;
+                my $section_id=$4;
+
+                # trim whitespace off end of section
+                $section =~ s/\s+$//;
+
+                $section_id = "$file_target" . "-_-_-" . $section_id;
+
+                $final_section_line = "$depth $section \{\#$section_id\}\n";
+            }
+            elsif($complete_line =~ /^(#[\#]*) (.*)/) {
+                $next_line_should_be_blank = 1;
+
+                # This section header has no explicit ID specified, e.g. "#Foo"
+                my $depth = $1;
+                my $section = $2;
+
+                # trim whitespace off end of section
+                $section =~ s/\s+$//;
+
+                my $section_id = "$file_target" . "-_-_-" . urlify($section);
+
+                $final_section_line = "$depth $section \{\#$section_id\}\n";
+            }
+
+            $complete_line = $final_section_line;
+
+            if($complete_line =~ /谭/) {
+                # skip unicode letter pandoc does not like
+            }
+            else {
+                print $complete_line;
+            }
+        } else {
+            # in a code section, print lines as-is.
+
+            if($complete_line =~ /谭/) {
+                # skip unicode letter pandoc does not like
+            }
+            else {
+                print $complete_line;
+            }
         }
     }
     close(M);
 }
 
 
-for my $f (@files) {
-    include($f);
+for my $file (@files) {
+    include($file);
 }
 
 exit $errors;