Skip to content

Commit

Permalink
Support links in the pdf,epub version via section url rewrites
Browse files Browse the repository at this point in the history
  • Loading branch information
sluicing committed Jan 8, 2024
1 parent 26498ea commit 0d69b40
Show file tree
Hide file tree
Showing 3 changed files with 266 additions and 54 deletions.
63 changes: 26 additions & 37 deletions mkindex.pl
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/perl
#!/usr/bin/perl -w

# Build an index of words in the file index-words that are found in the text.
# Words are compared case insensitively except for those starting with a dash
# (i.e. program option names). "Words" may actually be phrases consisting of
Expand All @@ -7,6 +8,10 @@

use feature "fc";

# import shared function
use lib '.';
require "urlify.pl";

# Return the case-folded keyword UNLESS it appears to be an option string
# in which case return it as-is. This makes word lookups case-insensitive
# but option name lookups case-sensitive.
Expand All @@ -31,33 +36,6 @@ sub folded {
}
close(F);

# Build a "file#section-id" link target from a file name and a section
# heading.
#
#   $fname   - file name placed before the '#'
#   $section - heading text to turn into a section ID
#
# The heading is lower-cased (ASCII letters only), '<' and '>' are spelled
# out as "-less-than-" and "-greater-than-", a fixed set of punctuation is
# dropped, every other character that is not a letter, digit, underscore or
# dash becomes a dash, and finally leading characters that cannot start a
# section ID and trailing dashes are trimmed.
#
# Returns the string "$fname#<id>".
sub urlify {
    my ($fname, $section)=@_;

    # All edits below act on $section through the loop alias.
    for ($section) {
        # lower-case ASCII letters
        tr/[A-Z]/[a-z]/;

        # spell out angle brackets
        s/\</-less-than-/g;
        s/\>/-greater-than-/g;

        # drop punctuation that should leave no trace in the ID
        s/[*`'":\(\),]+//g;

        # whatever is still not a dash, underscore, digit or letter turns
        # into a dash
        s/[^_a-zA-Z0-9-]/-/g;

        # an ID may only begin with a letter or an underscore
        s/^[^_a-zA-Z]+//g;

        # no dangling dashes at the end
        s/-+$//;
    }

    return "$fname#$section";
}

sub single {
my ($fname)=@_;
Expand All @@ -72,16 +50,16 @@ sub single {
my $l=$_;

# Track whether we are within a markdown code block that begins/ends with ```
if ($_ =~ /^\`\`\`.*/) {
if ($in_code_section) {
if($_ =~ /^\`\`\`.*/) {
if($in_code_section) {
$in_code_section = 0;
} else {
$in_code_section = 1;
}
}

if (!$in_code_section) {
if ($_ =~ /^(#[\#]*) ([^\{]*)(\{\#(.*)\})/) {
if(!$in_code_section) {
if($_ =~ /^(#[\#]*) ([^\{]*)(\{\#(.*)\})/) {
# This section header has an explicit ID specified, e.g. "#Foo {#foo}"
$depth = $1;
$section = $2;
Expand All @@ -90,17 +68,29 @@ sub single {
$section =~ s/\s+$//;

$url = "$fname#$dest_id";

# Use only the anchor text when a section heading has an anchor tag
if($section =~ m/(.*)\[(.*)\]\(.*\).*/) {
$section = "$1$2";
}
$l = $section;
}
elsif ($_ =~ /^(#[\#]*) (.*)/) {
elsif($_ =~ /^(#[\#]*) (.*)/) {
# This section header has no explicit ID specified, e.g. "#Foo"
$depth = $1;
$section = $2;

# trim whitespace off end of section
$section =~ s/\s+$//;

$url=urlify($fname, $section);
my $url_section = urlify($section);
$url = "$fname#$url_section";

# Use only the anchor text when a section heading has an anchor tag
if($section =~ m/(.*)\[(.*)\]\(.*\).*/) {
$section = "$1$2";
}

$l = $section; # use this too
}
}
Expand All @@ -121,7 +111,6 @@ sub single {
foreach my $w (@lwords) {
if(folded($l) =~ /$w/) {
if(!$word{$w}{$fname}) {
#print " $w ($url)\n";
$word{$w}{$fname}++;
$all{$w} .= ($all{$w}?", ":"")."[$section]($url)";
}
Expand Down Expand Up @@ -158,9 +147,9 @@ sub byname {
my $l = substr(sorting($w), 0, 1);
if(!$letter{$l}) {
$letter{$l}++;
print "## $l\n";
# Make sure headings have blank lines before and after
print "\n## $l\n\n";
}

printf " - ".$index{$w}.": ";
print $all{$w}."\n";
}
210 changes: 193 additions & 17 deletions uni.pl
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
#!/usr/bin/perl
#!/usr/bin/perl -w

# import shared function
use lib '.';
require "urlify.pl";

my $sum = shift @ARGV;

Expand All @@ -20,37 +24,209 @@ sub dirname {
return "";
}


# Turn a file path into a token that can be embedded in a section ID.
#
#   $part - file path, with or without a leading '/'
#
# Returns the path with one leading '/' removed and every run of '/' and
# '.' characters replaced by a double underscore, e.g. "/docs/foo.md"
# becomes "docs__foo__md".
sub urlify_file_path {
    my ($part) = @_;

    # Both substitutions act on $part through the loop alias.
    for ($part) {
        # Strip a leading '/' if present
        s/^\///;

        # Collapse each run of path separators / dots into "__"
        s/[\/\.]+/__/g;
    }

    return $part;
}


# Converts a relative path and current directory to a full path relative
# to the "ROOT" directory.
#
#   $current_dir      - directory of the file containing the link; a
#                       trailing '/' is tolerated
#   $linked_file_path - link target, possibly starting with one or more
#                       "../" components
#
# Each leading ".." component of the link removes the last directory from
# $current_dir (stopping at the root, represented as ""). Only a real path
# component counts: ".." must be followed by '/' or be the whole remaining
# string, so a file name that merely begins with two dots (e.g.
# "..notes.md") is left untouched.
#
# Returns "$current_dir/$linked_file_path". The two parts are joined with a
# '/' unconditionally, so the result starts with '/' when the directory is
# the root and contains "//" when $current_dir kept its trailing slash.
sub make_full_file_path {
    my ($current_dir, $linked_file_path) = @_;

    # Handle paths with "../" in them: consume one leading ".." component
    # (and the '/' after it, if any) per iteration.
    while ($linked_file_path =~ s{^\.\.(?:/|$)}{}) {
        # Strip trailing /
        $current_dir =~ s{/$}{};

        if ($current_dir =~ m{/}) {
            # if there is more than one dir remaining, strip the last dir
            $current_dir =~ s{(.*)/(.*)}{$1};
        } else {
            # only one dir remaining, use root of ""
            $current_dir = "";
        }
    }

    return "$current_dir/$linked_file_path";
}


# Rewrites the destination of one markdown link so that it points at a
# section ID inside the combined document.
#
#   $anchor_text   - the link text part, returned unchanged as the prefix
#   $anchor_target - the link destination as written in the source file
#   $dir           - directory prefix of the current file; prepended as-is
#                    to image targets
#   $f             - path of the current file
#
# Returns $anchor_text followed by the (possibly rewritten) destination in
# parentheses. The first matching rule wins:
#   * http: / https: URLs are kept as they are
#   * targets ending in .png or .jpg get $dir prepended
#   * "file.md#section"  ->  "#<file id>-_-_-section"
#   * "file.md"          ->  "#<file id>"
#   * "#section"         ->  "#<current file id>-_-_-section"
#   * anything else is kept as it is
# where a file id is urlify_file_path() applied to the file's path.
sub update_anchor {
    my ($anchor_text, $anchor_target, $dir, $f) = @_;

    # don't rewrite regular http / https urls
    return $anchor_text . "($anchor_target)"
        if $anchor_target =~ m/^(http\:|https\:).*$/;

    # The target is an image: add the image directory to it
    if ($anchor_target =~ m/(^.*)(\.png|\.jpg)$/ ) {
        return $anchor_text . "($dir$1$2)";
    }

    # The target is a SPECIFIC section of a DIFFERENT *.md file
    if ($anchor_target =~ m/((.+)\.md)\#(.+)/ ) {
        # Save the captures before calling helpers that run their own regexes
        my ($linked_file, $section_target) = ($1, $3);
        my $file_target =
            urlify_file_path(make_full_file_path($dir, $linked_file));

        return $anchor_text . "(#" . $file_target . "-_-_-" . $section_target . ")";
    }

    # The target is a DIFFERENT *.md file, with NO SECTION given
    if ($anchor_target =~ m/((.+)\.md)$/x) {
        my $linked_file = $1;
        my $file_target =
            urlify_file_path(make_full_file_path($dir, $linked_file));

        return $anchor_text . "(#" . $file_target . ")";
    }

    # The target is A SECTION in THIS *.md file
    if ($anchor_target =~ m/\#(.+)$/x) {
        my $section_target = $1;
        my $file_target = urlify_file_path($f);

        return $anchor_text . "(#" . $file_target . "-_-_-" . $section_target . ")";
    }

    # Nothing matched: leave the destination untouched
    return $anchor_text . "($anchor_target)";
}

my $errors = 0;

sub include {
my ($f) = @_;
my $line;
open(M, "<$f") || return;
my $dir = dirname($f);

my $same_dir_url_part = urlify_file_path($dir);
my $same_file_url_part = urlify_file_path($f);

my $in_code_section = 0;
my $next_line_should_be_blank = 0;

# Print an empty span at the top of each new file, to give "foo.md" links from other files a
# place to target with a {#foo} style link in the combined .md file
print "\n";
print "[ ]\{#$same_file_url_part\}\n";
print "\n";

while(<M>) {
$line++;
# strip out links to markdown files
$_ =~ s/\[([^]]*)\]\(.*\.md(|\#(.*))\)/$1/g;
# add path to image links
$_ =~ s/^!\[(.*)\]\(([^)]*)\)/![$1]($dir$2)/g;

if($_ =~ /\]\(.*\.md/) {
print STDERR "$f:$line:line-split markdown link\n";
print STDERR "$_";
$errors++;
}
if($_ =~ //) {
# skip unicode letter pandoc does not like
my $complete_line = $_;

if($next_line_should_be_blank) {
if(! (($complete_line eq "") || ($complete_line =~ m/^[\s]*$/))) {
print STDERR "WARNING: The line after a '#' header should be blank in $f, but was: $complete_line\n";
}

$next_line_should_be_blank = 0;
}
else {
print $_;

# Track whether we are within a markdown code block that begins/ends with ```
if($complete_line =~ /^\`\`\`.*/) {
if($in_code_section) {
$in_code_section = 0;
} else {
$in_code_section = 1;
}
} elsif(!$in_code_section) {
# Split line, so we can update multiple URLs on one line
my @line_items = split( /( \[ [^]]*\]\([^)]*?\) )/x , $_);

foreach my $item (@line_items) {
# Update all of the anchor targets in the line
$item =~ s/ (\[ [^]]*\]) \( ([^)]*?) \) /update_anchor($1 , $2, $dir, $f) /xge;
}
$complete_line = join('', @line_items);

#
# Check for section H1, H2, etc, definitions (starting with #, ##, etc)
#
# Add an explicit section ID to those if not present
# Include in that section ID a reference to the full path to the current file
my $full_file_path = $f;
my $file_target=urlify_file_path($full_file_path);

my $final_section_line = $complete_line;

if($complete_line =~ /^(#[\#]*) ([^\{]*)(\{\#(.*)\})/) {
$next_line_should_be_blank = 1;

# This section header has an explicit ID specified, e.g. "#Foo {#foo}"
my $depth = $1;
my $section = $2;
my $section_id=$4;

# trim whitespace off end of section
$section =~ s/\s+$//;

$section_id = "$file_target" . "-_-_-" . $section_id;

$final_section_line = "$depth $section \{\#$section_id\}\n";
}
elsif($complete_line =~ /^(#[\#]*) (.*)/) {
$next_line_should_be_blank = 1;

# This section header has no explicit ID specified, e.g. "#Foo"
my $depth = $1;
my $section = $2;

# trim whitespace off end of section
$section =~ s/\s+$//;

my $section_id = "$file_target" . "-_-_-" . urlify($section);

$final_section_line = "$depth $section \{\#$section_id\}\n";
}

$complete_line = $final_section_line;

if($complete_line =~ //) {
# skip unicode letter pandoc does not like
}
else {
print $complete_line;
}
} else {
# in a code section, print lines as-is.

if($complete_line =~ //) {
# skip unicode letter pandoc does not like
}
else {
print $complete_line;
}
}
}
close(M);
}


for my $f (@files) {
include($f);
for my $file (@files) {
include($file);
}

exit $errors;
Loading

0 comments on commit 0d69b40

Please sign in to comment.