#!/usr/bin/perl
#
# $Id: $
#

=head1 NAME

bsd-man-to-xml.pl - Converts BSD man pages into ECMA XML.

=head1 SYNOPSIS

zcat /path/to/man/page.gz | B<bsd-man-to-xml.pl> > page.xml

=head1 DESCRIPTION

Parses a BSD nroff man page and tries to convert it into an ECMA XML
documentation block.  It generates the <summary/>, <remarks/>, and <returns/>
XML sections.

This program is B<NOT> currently suitable for converting Linux man pages, 
since the Linux man pages use different nroff macros.  The Linux man pages 
are also less semantic and more output-oriented -- for example, BSD will 
use C<.Fn function> to name a function, while Linux will use 
C<.B function> (.B bolds the named item, with no semantic implications for 
what it's bolding).

=head1 NOTES

You I<will> need to edit the generated text.  Not all nroff macros are
supported, and the translator needs to guess about some things.  In
particular, search for C<TODO> (for enumeration values, since this program
cannot know which enumeration all constants belong to) and C<Syscall> (since
we assume all functions are in Syscall, when many are really in Stdlib).

=head1 COPYRIGHT

Copyright (C) 2006 Jonathan Pryor  <jonpryor@vt.edu>

=cut

#
# Copyright (C) 2006 Jonathan Pryor  <jonpryor@vt.edu>
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
# 
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#

use strict;

my $ignore_input = undef;
my @sections = ();
my $cref_see = 1;
my $indent = 0;

my $printer = undef;

my $commands = {
	'\\"' => sub {}, # comment; ignore
	'Dd' => sub {}, # date; ignore
	'Dt' => sub {}, # man page header; ignore
	'Os' => sub {}, # wtf?; ignore
	'Sh' => \&command_section_start,
	'Nm' => sub {}, # function name within NAME section; ignore
	'Nd' => \&command_summary,
	'Xr' => \&command_cref,
	'Dv' => \&command_constant,
	'Fa' => \&command_parameter,
	'Ql' => \&command_quote,
	'Pp' => \&command_paragraph,
	'Fn' => \&command_function,
	'In' => \&command_include,
	'Va' => \&command_variable,
	'Er' => \&command_error_variable,
	'It' => \&command_list_item,
	'Bl' => \&command_list_begin,
	'El' => \&command_list_end,
	'Bf' => sub {}, # begin format (e.g. bold); ignore
	'Ef' => sub {}, # end formatting (e.g. bold); ignore
	'Bq' => \&command_bracket_quote,
	'Er' => sub {command_constant (shift);},
	'Tn' => sub {$_ = shift; my ($tn) = /^"([^"]+)"$/; print_lead ($tn);},
	'Sy' => \&command_list_item_header,
	'Bd' => \&command_indent_begin,
	'Ed' => \&command_indent_end,
	'Sx' => sub {command_quote (shift);},
	'Dq' => sub {print_lead ("<c>", shift, "</c>\n");},
	'Pq' => sub {command_parse (shift);},
	'Rv' => sub {}, # no idea what it does; ignore
	'Em' => sub {print_lead ("<i>", shift, "</i>\n");},
};

my $section_handlers = {
	'NAME' => sub {
		$ignore_input = undef; 
		section_begin ("summary", "para");
	},
	'DESCRIPTION' => sub {
		$ignore_input = undef; 
		section_end ("para", "summary"); 
		section_begin ("remarks", "para");
	},
	'RETURN VALUES' => sub {
		$ignore_input = undef; 
		section_end (); 
		section_end ("remarks"); 
		section_begin ("returns", "para");
		# Some man pages don't print out what the return value is.
		# Provide a stock response.
		$printer = sub {
			$_ = shift;
			$printer = undef;
			if ($_ eq "</para>\n") {
				print_lead ("  On success, zero is returned.\n");
				print_lead ("  On error, -1 is returned and \n");
				print_lead ("  <see cref=\"M:Mono.Unix.Native.Stdlib.GetLastError\" />\n");
				print_lead ("  returns the translated error.\n");
				print_lead ($_);
			}
			else {
				print_lead ($_);
			}
		};
	},
	'ERRORS' => sub {
		$ignore_input = undef; 
		section_end ("para"); 
		section_begin ("block subset=\"none\" type=\"usage\"", "para");
	},
	'SEE ALSO' => sub {
		$ignore_input = undef; 
		while (@sections) {
			section_end ();
		} 
		$cref_see = 0;
	},
};

while (<>) {
	my ($line);
	if (($line) = /^\.(.*)$/) {
		command_parse ($line);
	}
	else {
		print_lead ($_);
	}
}

sub command_parse {
	$_ = shift;
	my ($arg, $rest);
	if (($arg, undef, $rest) = /^([\w\\"]+)(\s(.*))?$/ and exists $commands->{$arg}) {
		return $commands->{$arg}->($rest);
	}
	else {
		print_lead ($_);
	}
	return undef;
}

sub command_section_start {
	$_ = shift;
	my $name;
	my $rest;
	if (($name, $rest) = /^([\w\s]+)(.*)$/ and exists $section_handlers->{$name}) {
		$section_handlers->{$name}->();
		return $rest;
	}
	else {
		$ignore_input = 1;
	}
	return $_;
}

sub command_summary {
	print_lead (shift, "\n");
	return "";
}

sub command_cref {
	$_ = shift;
	my ($cmd, $section, $rest) = /^(\w+)\s(\w+)(.*)/;
	if ($cref_see) {
		if ($section ne "2" and $section ne "3") {
			print_lead ("<c>$cmd</c>($section)$rest\n");
		}
		elsif ($cref_see){
			print_lead ("<see cref=\"M:Mono.Unix.Native.Syscall.$cmd\" />($section)$rest\n");
		}
		return "";
	}
	elsif (!$cref_see and ($section eq "2" or $section eq "3")) {
		print_lead ("<altmember cref=\"M:Mono.Unix.Native.Syscall.$cmd\" />\n");
	}
	return "";
}

sub command_constant {
	$_ = shift;
	my ($pre, $arg, $rest) = /^([^\w]*)(\w+)(.*)$/;
	my $type = "TODO";
	if    ($arg =~ /SIG_/)    {$type = "Stdlib";}
	elsif ($arg =~ /SIG[^_]/) {$type = "Signum";}
	elsif ($arg =~ /O_/)      {$type = "OpenFlags";}
	elsif ($arg =~ /E[^_]+/)  {$type = "Errno";}
	elsif ($arg =~ /._OK/)    {$type = "AccessModes";}
	print_lead ("$pre <see cref=\"F:Mono.Unix.Native.$type.$arg\" /> $rest\n");
	return "";
}

sub command_parameter {
	$_ = shift;
	my ($arg, $rest) = get_possibly_quoted_string ($_);
	print_lead ("<paramref name=\"$arg\" /> $rest\n");
	return "";
}

sub get_possibly_quoted_string {
	$_ = shift;
	my ($a, $b, undef, $rest) = /^("([^"]+)"|\w+)(\s(.*))?$/;
	return (($b || $a), $rest);
}

sub command_paragraph {
	section_end ("para");
	section_begin ("para");
	return "";
}

sub command_bracket_quote {
	$_ = shift; 
	command_parse ($_);
}

sub command_quote {
	$_ = shift;
	# my ($arg, $rest) = /^(\S+)(\s[.,])?$/;
	my ($arg, $rest) = get_possibly_quoted_string ($_);
	$arg =~ s/\\&//g;
	print_lead ("\"<c>$arg</c>\"$rest\n");
	return "";
}

sub command_function {
	$_ = shift;
	my ($fn, $args, $rest) = /^(\w+)\s*(".*")?\s*(.*)$/;
	$args =~ s/"(.*)"/$1/;
	my @args = split /" "/, $args;
	print_lead ("<c>$fn</c>(", join (", ", @args), ")$rest\n");
	return "";
}

sub command_include {
	$_ = shift;
	my ($arg, $rest) = /^([\w\.\/]+)(.*)$/;
	print_lead ("<c>$arg</c>$rest\n");
	return "";
}

sub command_variable {
	$_ = shift;
	my ($arg, $rest) = /^(\w+)(.*)$/;
	if ($arg eq "errno") {
		print_lead ("<see cref=\"M:Mono.Unix.Native.Stdlib.GetLastError\" />$rest\n");
	}
	else {
		print_lead ("<c>$arg</c>$rest\n");
	}
	return "";
}

sub command_error_variable {
	$_ = shift;
	my ($arg, $rest) = /^(\w+)(.*)$/;
	print_lead ("<see cref=\"M:Mono.Unix.Native.Errno.$arg\" />$rest\n");
	return "";
}

my $list_type;

sub command_list_begin {
	$_ = shift;
	section_end ("para");
	my ($type) = /^([-\w]+)$/;
	if ($type eq "-bullet") {
		$list_type = "bullet";
		section_begin ("list type=\"bullet\"");
	}
	else {
		$list_type = "table";
		section_begin ("list type=\"table\"");
	}
}

sub command_list_item_header {
	$_ = shift;

	my $header;
	if (($header) = /^"([^"]+)"/) {
		section_begin ("listheader");
		my @items = split /\t/, $header;
		section_begin ("term");
		print_lead (shift @items, "\n");
		section_end ("term");
		foreach my $t (@items) {
			section_begin ("description");
			print_lead ($t, "\n");
			section_end ("description");
		}
		section_end ("listheader");
	}
}

sub command_list_item {
	$_ = shift;

	if (/^Sy\s/) {
		command_parse ($_);
		return "";
	}
	elsif ($sections [-1] eq "list" and /^Bq Er E\w+$/) {
		# start of an errno list.
		section_begin ("listheader", "term");
		print_lead ("Error\n");
		section_end ("term");
		section_begin ("description");
		print_lead ("Details\n");
		section_end ("description", "listheader");
	}

	if ($sections [-1] eq "para") {
		section_end ("para");
		section_end (); # term or description
		section_end ("item");
	}

	section_begin ("item", "term");
	my @descs = split /\sTa\s/, $_;
	command_parse (shift @descs);
	print "\n";

	if ($list_type eq "bullet") {
		section_begin ("para");
		return "";
	}

	section_end ("term");

	if (@descs == 0) {
		section_begin ("description", "para");
		return "";
	}
	foreach my $desc (@descs) {
		section_end ("para", "description") if $sections [-1] eq "para";
		section_begin ("description", "para");
		my $rest = undef;
		if ($desc =~ /^"/) {
			($desc, $rest) = $desc =~ /^"([^"]+)"(.*)$/;
		}
		command_parse ($desc);
		command_parse ($rest) if $rest;
		print "\n";
	}
	return "";
}

sub command_list_end {
	if ($sections [-1] eq "para") {
		section_end ("para");
		section_end (); # term or description
		section_end ("item");
	}
	section_end ("list");
	section_begin ("para");
}

my $num_indent_entries = undef;

sub command_indent_begin {
	section_end ("para");

	$printer = \&command_indent_line;
}

sub command_indent_line {
	my $line = shift;
	my @entries = split /\t+/, $line;

	unless (defined $num_indent_entries) {
		$num_indent_entries = scalar @entries;
		if ($num_indent_entries == 1) {
			print_indent ();
			print "<block subset=\"none\" type=\"usage\">\n";
		}
		else {
			print_indent (); print ("<list type=\"table\">");
			print_indent (); print ("  <listheader>\n");
			print_indent (); print ("    <term>Value</term>\n");
			for (my $i = 1; $i < $num_indent_entries; ++$i) {
				print_indent (); print ("    <description>Details</description>\n");
			}
			print_indent (); print ("  </listheader>\n");
		}
	}

	if ($num_indent_entries == 1) {
		my $l = $entries [0];
		$l =~ s/\n$//;
		print_indent (); print "  <para>", $l, "</para>\n";
	}
	else {
		print_indent (); print "  <item>\n";
		print_indent (); print "    <term>", (shift @entries), "</term>\n";
		foreach my $e (@entries) {
			my $g = $e;
			$g =~ s/\n$//;
			print_indent (); print "    <description>$g</description>\n";
		}
		print_indent (); print "  </item>\n";
	}
}

sub command_indent_end {
	$printer = undef;
	if ($num_indent_entries == 1) {
		print_lead (); print "</block>\n";
	}
	else {
		print_lead (); print "</list>\n";
	}
	$num_indent_entries = undef;
	section_begin ("para");
}

sub print_indent {
	unless ($ignore_input) {
		print "        ";
		print ("  " x $indent);
	}
}

sub print_lead {
	unless ($ignore_input) {
		if (defined $printer) {
			$printer->(@_);
		}
		else {
			print_indent ();
			foreach my $a (@_) {
				my $b = $a;
				$b =~ s/\\(.)/$1/g;
				print $b;
			}
		}
	}
}

sub section_begin {
	foreach my $section (@_) {
		print_lead ("<$section>\n");
		my ($tag) = $section =~ /^(\w+)/;
		push @sections, $tag;
		++$indent;
	}
}

sub section_end {
	my $end = sub {
		my $section = shift;
		die unless defined $section;
		--$indent;
		print_lead ("</$section>\n");
	};

	if (scalar @_) {
		foreach my $section (@_) {
			my ($expected) = pop @sections;
			my (undef, undef, $line, $method) = caller;
			die "internal error: expected '$expected' but got '$section' " . 
				"(at $method:$line)" if $section ne $expected;
			$end->($section);
		}
	}
	else {
		$end->(pop @sections);
	}
}



syntax highlighted by Code2HTML, v. 0.9.1