# -----------------------------
#  Perl 5.8 or later required
# -----------------------------
require 5.008;

$main_info = $ARGV[0];
$infofile_encoding = $ARGV[1];

binmode STDOUT, $infofile_encoding;

$unit_separator = "";

$item_cnt = 0;
$section_cnt = 0;

# ------------------------------------------------------------------
# PART 1. BUILD INDEX FOR @DEFFN AND @DEFVR ITEMS
# ------------------------------------------------------------------

# (1.1)  Build index tables.

# (1.1a) Scan the *.info-* files for unit separator characters;
#        those mark the start of each texinfo node.
#        Build a hash table which associates the node name with the filename
#        and byte offset (NOT character offset) of the unit separator.
#
#        Do NOT use the indirect table + tag table (generated by makeinfo),
#        because those tables give character offsets; we want byte offsets.
#        It is easier to construct a byte offset table by hand,
#        rather than attempting to fix up the character offsets.
#        (Which are strange anyway.)

open (FH, "<" . $infofile_encoding, $main_info);
read (FH, $stuff, -s FH);

while ($stuff =~ m/^($main_info-\d+): (\d+)/cgsm) {
    $filename = $1;
    push @info_filenames, $filename;

    open FH2, "<" . $infofile_encoding, $filename;
    read FH2, $stuff2, -s FH2;

    while ($stuff2 =~ m/\G.*?(?=\n$unit_separator)/cgsm) {
        $offset = pos $stuff2;

        if ($stuff2 =~ m/^File:.*?Node: (.*?),/csgm) {
            $node_name = $1;
            $last_node_name = $node_name;
        }

        $node_offset{$node_name} = [($filename, int($offset))];
    }

    close $FH2;
}

close FH;

# (1.1b) Read the info index, which gives the node name and number of lines offset
#        for each indexed item. 

# ASSUME THAT THE INFO INDEX IS THE LAST NODE.
# (GETTING THE NODE NAME FROM THE COMMAND LINE IS PROBLEMATIC.)
$index_node_name = $last_node_name;

($index_filename, $index_node_offset) = @{$node_offset{$index_node_name}};

open (FH, "<" . $infofile_encoding, $index_filename);
read (FH, $stuff, -s FH);

if ($stuff =~ m/^File:.*?Node: $index_node_name.*^\* Menu:/icgsm) {
    while ($stuff =~ m/\G.*?^\* (\S+|[^:]+):\s+(.*?)\.\s+\(line\s+(\d+)\)/cgsm) {
        $topic_name = $1;
        $node_name = $2;
        $lines_offset = $3;
        $topic_locator{$topic_name} = [($node_name, $lines_offset)];
    }
}

close FH;

# (1.2)  Translate node name and number of lines offset into file name and byte offset
#        for each indexed item.
#        Also find the length of each item.

foreach $key (sort keys %topic_locator) {
    ($node_name, $lines_offset) = @{$topic_locator{$key}};
    ($filename, $character_offset) = @{$node_offset{$node_name}};
    $byte_offset = seek_lines($filename, $character_offset, $lines_offset);

    open FH, "<" . $infofile_encoding, $filename;
    seek FH, $byte_offset, 0;
    read FH, $stuff, -s FH;
    if ($stuff =~ m/(.*?)(?:\n\n(?= -- )|\n(?=[0-9])|(?=$unit_separator))/cgsm) {
        $text_length = length $1;
    }
    else {
        # Eat everything up til end of file.
        $stuff =~ m/(.*)/cgsm;
        $text_length = length $1;
    }
    close FH;

    $topic_locator{$key} = [($node_name, $filename, $byte_offset, $text_length)];
}

# (1.3)  Generate Lisp code. The functions in info.lisp expect this stuff.

print "(in-package :cl-info)\n";
print "(defun cause-maxima-index-to-load () nil)\n";

#        Pairs of the form (<index topic> . (<filename> <byte offset> <length> <node name>))

print "(defvar *info-deffn-defvr-pairs* '(\n";
print "; CONTENT: (<INDEX TOPIC> . (<FILENAME> <BYTE OFFSET> <LENGTH IN CHARACTERS> <NODE NAME>))\n";

foreach $key (sort keys %topic_locator) {
    $item_cnt++;
    my $sanitized_key = $key;
    $sanitized_key =~ s/"/\\"/g;
    print "(\"$sanitized_key\" . (\"$topic_locator{$key}[1]\" $topic_locator{$key}[2] $topic_locator{$key}[3] \"$topic_locator{$key}[0]\"))\n";
}

print "))\n";

# ------------------------------------------------------------------
# PART 2. BUILD INDEX FOR @NODE ITEMS
# ------------------------------------------------------------------

# (2.1)  Search for 'mmm.nnn' at the start of a line,
#        and take each one of those to be the start of a node.
#
#        We could use the node table ($node_offset here), but we don't.

#        (a) The node table indexes nodes which contain only menus.
#            We don't want those because they have no useful text.
#
#        (b) The offset stated in the node table tells the location
#            of the "File: ..." header. We would have to cut off that stuff.
#
#        (c) Offsets computed by makeinfo are character offsets,
#            so we would have to convert those to byte offsets.
#            (But we have to do that anyway, so I guess there's no
#            advantage either way on that point.)

for $filename (@info_filenames) {

    open (FH, "<" . $infofile_encoding, $filename);
    read (FH, $stuff, -s FH);

    while ($stuff =~ m/\G(.*?)(?=^\d+\.\d+ .*?\n)/cgsm) {

        # Since FH was opened with $infofile_encoding,
        # pos returns a CHARACTER offset.
        $begin_node_offset = pos($stuff);

        if ($stuff =~ m/((^\d+\.\d+) (.*?)\n)/cgsm) {
            $node_title = $3;
            $node_length = length $1;
        }

        # Node text ends at a unit separator character,
        # or at the end of the file.

        if ($stuff =~ m/\G(.*?)($unit_separator)/cgsm) {
            $node_length += length $1;
        }
        else {
            $stuff =~ m/\G(.*)/csgm;
            $node_length += length $1;
        }

        $node_locator{$node_title} = [($filename, $begin_node_offset, $node_length)];
    }

    close FH;
}

# Translate character offsets to byte offsets.

foreach $node_title (sort keys %node_locator) {
    ($filename, $begin_node_offset, $node_length) = @{$node_locator{$node_title}};
    open FH, "<" . $infofile_encoding, $filename;
    read FH, $stuff, $begin_node_offset;
    my $begin_node_offset_bytes = tell FH;
    close FH;

    $node_locator{$node_title} = [($filename, $begin_node_offset_bytes, $node_length)];
}

# (2.2)  Generate Lisp code.
#
#        Pairs of the form (<node name> . (<filename> <byte offset> <length>))

print "(defvar *info-section-pairs* '(\n";
print "; CONTENT: (<NODE NAME> . (<FILENAME> <BYTE OFFSET> <LENGTH IN CHARACTERS>))\n";

foreach $node_title (sort keys %node_locator) {
    $section_cnt++;
    ($filename, $begin_node_offset, $length) = @{$node_locator{$node_title}};
    my $sanitized_title = $node_title;
    $sanitized_title =~ s/"/\\"/g;
    print "(\"$sanitized_title\" . (\"$filename\" $begin_node_offset ", $length, "))\n";
}

print "))\n";

#        Construct hashtables from the lists given above.

print "(load-info-hashtables)\n";

# (2.3)  Do we have any items or sections?
#
#        Warn if no index items or secions found. 

($item_cnt+$section_cnt)>0 || 
    print STDERR "WARNING: Epmpty index. Probably makeinfo is too old. Version 4.7 or 4.8 required.\n";

# ------------------------------------------------------------------
# Helper functions
# ------------------------------------------------------------------

sub seek_lines {
    my ($filename, $character_offset, $lines_offset) = @_;
    open FH, "<" . $infofile_encoding, $filename;
    read FH, $stuff, $character_offset;

    # MAKEINFO BUG: LINE OFFSET IS LINE NUMBER OF LAST LINE IN FUNCTION DEFINITION
    # (BUT WE NEED THE FIRST LINE OF THE FUNCTION DEFINITION)
    # BUG IS PRESENT IN MAKEINFO 4.8; FOLLOWING CAN GO AWAY WHEN BUG IS FIXED
    
    for (1 .. $lines_offset + 1) {
        my $x_maybe = tell FH;
        my $line = <FH>;
        if ($line =~ /^ -- \S/) {
            $x = $x_maybe;
        }
    }

    # END OF MAKEINFO BUG WORKAROUND
    # WHEN WORKAROUND IS NO LONGER NEEDED, ENABLE THE FOLLOWING LINES:

    # <FH> for 1 .. $lines_offset;
    # $x = tell FH;

    close FH;
    return $x;
}


syntax highlighted by Code2HTML, v. 0.9.1