Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 331 links for pdf with section url rewrites #404

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 26 additions & 37 deletions mkindex.pl
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/perl
#!/usr/bin/perl -w

# Build an index of words in the file index-words that are found in the text.
# Words are compared case insensitively except for those starting with a dash
# (i.e. program option names). "Words" may actually be phrases consisting of
Expand All @@ -7,6 +8,10 @@

use feature "fc";

# import shared function
use lib '.';
require "urlify.pl";

# Return the case-folded keyword UNLESS it appears to be an option string
# in which case return it as-is. This makes word lookups case-insensitive
# but option name lookups case-sensitive.
Expand All @@ -31,33 +36,6 @@ sub folded {
}
close(F);

# Build the link target for a section heading.
#
# Parameters:
#   $fname   - file name placed in front of the '#'
#   $section - heading text to turn into a section ID
#
# Returns "$fname#<id>", where <id> is the heading text after the
# normalisation steps below (lowercase; only letters, digits, '_' and '-').
sub urlify {
    my ($fname, $section) = @_;

    # An undefined heading is treated as empty instead of raising
    # "uninitialized value" warnings in every step below.
    $section = '' unless defined $section;

    # Convert ASCII letters to lower case. tr/// takes plain character
    # ranges, not bracketed character classes, so no [] is wanted here.
    $section =~ tr/A-Z/a-z/;

    # Convert every '<' to '-less-than-'
    $section =~ s/\</-less-than-/g;

    # Convert every '>' to '-greater-than-'
    $section =~ s/\>/-greater-than-/g;

    # Remove punctuation that is dropped outright rather than replaced
    $section =~ s/[*`'":\(\),]+//g;

    # Replace anything left that isn't a dash, underscore, digit or letter
    # with a dash (one dash per character)
    $section =~ s/[^_a-zA-Z0-9-]/-/g;

    # Remove leading characters that aren't a letter or underscore;
    # those aren't legal for the beginning of a section ID.
    # The pattern is anchored, so it can match at most once.
    $section =~ s/^[^_a-zA-Z]+//;

    # Strip trailing '-' characters from the section ID
    $section =~ s/-+$//;

    return "$fname#$section";
}

sub single {
my ($fname)=@_;
Expand All @@ -72,16 +50,16 @@ sub single {
my $l=$_;

# Track whether we are within a markdown code block that begins/ends with ```
if ($_ =~ /^\`\`\`.*/) {
if ($in_code_section) {
if($_ =~ /^\`\`\`.*/) {
if($in_code_section) {
$in_code_section = 0;
} else {
$in_code_section = 1;
}
}

if (!$in_code_section) {
if ($_ =~ /^(#[\#]*) ([^\{]*)(\{\#(.*)\})/) {
if(!$in_code_section) {
if($_ =~ /^(#[\#]*) ([^\{]*)(\{\#(.*)\})/) {
# This section header has an explicit ID specified, e.g. "#Foo {#foo}"
$depth = $1;
$section = $2;
Expand All @@ -90,17 +68,29 @@ sub single {
$section =~ s/\s+$//;

$url = "$fname#$dest_id";

# Use only the anchor text when a section heading has an anchor tag
if($section =~ m/(.*)\[(.*)\]\(.*\).*/) {
$section = "$1$2";
}
$l = $section;
}
elsif ($_ =~ /^(#[\#]*) (.*)/) {
elsif($_ =~ /^(#[\#]*) (.*)/) {
# This section header has no explicit ID specified, e.g. "#Foo"
$depth = $1;
$section = $2;

# trim whitespace off end of section
$section =~ s/\s+$//;

$url=urlify($fname, $section);
my $url_section = urlify($section);
$url = "$fname#$url_section";

# Use only the anchor text when a section heading has an anchor tag
if($section =~ m/(.*)\[(.*)\]\(.*\).*/) {
$section = "$1$2";
}

$l = $section; # use this too
}
}
Expand All @@ -121,7 +111,6 @@ sub single {
foreach my $w (@lwords) {
if(folded($l) =~ /$w/) {
if(!$word{$w}{$fname}) {
#print " $w ($url)\n";
$word{$w}{$fname}++;
$all{$w} .= ($all{$w}?", ":"")."[$section]($url)";
}
Expand Down Expand Up @@ -158,9 +147,9 @@ sub byname {
my $l = substr(sorting($w), 0, 1);
if(!$letter{$l}) {
$letter{$l}++;
print "## $l\n";
# Make sure headings have blank lines before and after
print "\n## $l\n\n";
}

printf " - ".$index{$w}.": ";
print $all{$w}."\n";
}
210 changes: 193 additions & 17 deletions uni.pl
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
#!/usr/bin/perl
#!/usr/bin/perl -w

# import shared function
use lib '.';
require "urlify.pl";

my $sum = shift @ARGV;

Expand All @@ -20,37 +24,209 @@ sub dirname {
return "";
}


# Turns a file path into a token that can be embedded in a section ID.
#
# One leading '/' is dropped, then every run of '/' and '.' characters is
# collapsed into a double underscore, e.g. "docs/a.md" gives "docs__a__md".
sub urlify_file_path {
    my ($path_token) = @_;

    for ($path_token) {
        # Drop a single leading '/' if there is one
        s/^\///;

        # Each run of slashes and dots becomes '__'
        s/[\/\.]+/__/g;
    }

    return $path_token;
}


# Converts a relative link path and the current directory to a full path
# relative to the "ROOT" directory.
#
# Parameters:
#   $current_dir      - directory of the file that contains the link; one
#                       trailing '/' is tolerated while "../" is resolved
#   $linked_file_path - link target, optionally starting with "../" segments
#
# Returns "$current_dir/$linked_file_path" after every leading "../" has
# removed the last directory from $current_dir. Once no directory is left,
# the root ("") is used and any further "../" segments are simply dropped.
sub make_full_file_path {
    my ($current_dir, $linked_file_path) = @_;

    # Handle paths with leading "../" segments. Only a genuine ".." path
    # component counts (".." followed by '/' or by the end of the string),
    # so a name that merely starts with two dots, e.g. "..notes.md", is
    # left alone instead of being truncated and resolved one level up.
    while ($linked_file_path =~ m{^\.\.(?:/|$)}) {
        # Strip the leading ".." and the '/' after it, if present
        $linked_file_path =~ s{^\.\./?}{};

        # Strip trailing /
        $current_dir =~ s{/$}{};
        if ($current_dir =~ m{/}) {
            # more than one dir remaining: drop the last one
            $current_dir =~ s{(.*)/(.*)}{$1};
        }
        else {
            # only one dir remaining, use root of ""
            $current_dir = "";
        }
    }

    return "$current_dir/$linked_file_path";
}


# Builds the replacement for one markdown link so that its destination
# points at a section ID inside the current (combined) output.
#
# Parameters:
#   $anchor_text   - text emitted in front of the parenthesised target
#   $anchor_target - the original link destination
#   $dir           - prefix added to image targets; also the base directory
#                    used when resolving links to other *.md files
#   $f             - path of the file being processed, used for links that
#                    name only a section
#
# Returns $anchor_text followed by the (possibly rewritten) target in
# parentheses. Targets matching none of the cases below are kept as-is.
sub update_anchor {
    my ($anchor_text, $anchor_target, $dir, $f) = @_;

    # Separator between the file token and the section name in an ID
    my $id_separator = "-_-_-";
    my $rewritten    = $anchor_target;

    if ($anchor_target =~ m/^(http\:|https\:).*$/) {
        # Regular http / https URLs are never rewritten
    }
    elsif (my ($image_stem, $image_ext) = $anchor_target =~ m/(^.*)(\.png|\.jpg)$/) {
        # Image: put the image directory in front of the target
        $rewritten = $dir . $image_stem . $image_ext;
    }
    elsif (my ($other_md, undef, $other_section) = $anchor_target =~ m/((.+)\.md)\#(.+)/) {
        # A SPECIFIC section of a DIFFERENT *.md file
        my $file_id = urlify_file_path(make_full_file_path($dir, $other_md));
        $rewritten = "#" . $file_id . $id_separator . $other_section;
    }
    elsif (my ($bare_md) = $anchor_target =~ m/((.+)\.md)$/x) {
        # A DIFFERENT *.md file with NO SECTION given
        $rewritten = "#" . urlify_file_path(make_full_file_path($dir, $bare_md));
    }
    elsif (my ($own_section) = $anchor_target =~ m/\#(.+)$/x) {
        # A SECTION in THIS *.md file
        $rewritten = "#" . urlify_file_path($f) . $id_separator . $own_section;
    }

    return $anchor_text . "(" . $rewritten . ")";
}

my $errors = 0;

sub include {
my ($f) = @_;
my $line;
open(M, "<$f") || return;
my $dir = dirname($f);

my $same_dir_url_part = urlify_file_path($dir);
my $same_file_url_part = urlify_file_path($f);

my $in_code_section = 0;
my $next_line_should_be_blank = 0;

# Print an empty span at the top of each new file, to give "foo.md" links from other files a
# place to target with a {#foo} style link in the combined .md file
print "\n";
print "[ ]\{#$same_file_url_part\}\n";
print "\n";

while(<M>) {
$line++;
# strip out links to markdown files
$_ =~ s/\[([^]]*)\]\(.*\.md(|\#(.*))\)/$1/g;
# add path to image links
$_ =~ s/^!\[(.*)\]\(([^)]*)\)/![$1]($dir$2)/g;

if($_ =~ /\]\(.*\.md/) {
print STDERR "$f:$line:line-split markdown link\n";
print STDERR "$_";
$errors++;
}
if($_ =~ /谭/) {
# skip unicode letter pandoc does not like
my $complete_line = $_;

if($next_line_should_be_blank) {
if(! (($complete_line eq "") || ($complete_line =~ m/^[\s]*$/))) {
print STDERR "WARNING: The line after a '#' header should be blank in $f, but was: $complete_line\n";
}

$next_line_should_be_blank = 0;
}
else {
print $_;

# Track whether we are within a markdown code block that begins/ends with ```
if($complete_line =~ /^\`\`\`.*/) {
if($in_code_section) {
$in_code_section = 0;
} else {
$in_code_section = 1;
}
} elsif(!$in_code_section) {
# Split line, so we can update multiple URLs on one line
my @line_items = split( /( \[ [^]]*\]\([^)]*?\) )/x , $_);

foreach my $item (@line_items) {
# Update all of the anchor targets in the line
$item =~ s/ (\[ [^]]*\]) \( ([^)]*?) \) /update_anchor($1 , $2, $dir, $f) /xge;
}
$complete_line = join('', @line_items);

#
# Check for section H1, H2, etc, definitions (starting with #, ##, etc)
#
# Add an explicit section ID to those if not present
# Include in that section ID a reference to the full path to the current file
my $full_file_path = $f;
my $file_target=urlify_file_path($full_file_path);

my $final_section_line = $complete_line;

if($complete_line =~ /^(#[\#]*) ([^\{]*)(\{\#(.*)\})/) {
$next_line_should_be_blank = 1;

# This section header has an explicit ID specified, e.g. "#Foo {#foo}"
my $depth = $1;
my $section = $2;
my $section_id=$4;

# trim whitespace off end of section
$section =~ s/\s+$//;

$section_id = "$file_target" . "-_-_-" . $section_id;

$final_section_line = "$depth $section \{\#$section_id\}\n";
}
elsif($complete_line =~ /^(#[\#]*) (.*)/) {
$next_line_should_be_blank = 1;

# This section header has no explicit ID specified, e.g. "#Foo"
my $depth = $1;
my $section = $2;

# trim whitespace off end of section
$section =~ s/\s+$//;

my $section_id = "$file_target" . "-_-_-" . urlify($section);

$final_section_line = "$depth $section \{\#$section_id\}\n";
}

$complete_line = $final_section_line;

if($complete_line =~ /谭/) {
# skip unicode letter pandoc does not like
}
else {
print $complete_line;
}
} else {
# in a code section, print lines as-is.

if($complete_line =~ /谭/) {
# skip unicode letter pandoc does not like
}
else {
print $complete_line;
}
}
}
close(M);
}


for my $f (@files) {
include($f);
for my $file (@files) {
include($file);
}

exit $errors;
Loading
Loading