package Regex::Ops::Tree;

# Copyright (C) 2002-2006, The Perl Foundation.
# $Id: Tree.pm 21249 2007-09-13 06:33:24Z paultcochrane $

use base 'Exporter';
use strict;
use warnings;
use Carp qw(confess);

@Regex::Ops::Tree::EXPORT = qw(rop);

# Tree operators
#
# This package defines operators that can be used to construct a
# syntax tree for a regular expression.
#

# Core ops generated by parser
# ----------------------------
# match : Match a single codepoint
# charclass : Match a character class
# seq
# alternate : Match R or S at the same point?
# multi_match : Match m..n repetitions of R
# group : Capture a group
# scan : Scan through input until R matches
# atend : At the end of the input?
# advance : Unconditionally advance 1 char
# code : Embedded code, in some language

# Stuff that is used for optimization
# -----------------------------------
# nop : Do nothing (placeholder)
# check : Check to be sure we are n chars away from the end of the string

@Regex::Ops::Tree::_atom::ISA   = qw(Regex::Ops::Tree);
@Regex::Ops::Tree::_onearg::ISA = qw(Regex::Ops::Tree);

@Regex::Ops::Tree::match::ISA       = qw(Regex::Ops::Tree::_atom);
@Regex::Ops::Tree::charclass::ISA   = qw(Regex::Ops::Tree::_atom);
@Regex::Ops::Tree::classpieces::ISA = qw(Regex::Ops::Tree::_atom);
@Regex::Ops::Tree::seq::ISA         = qw(Regex::Ops::Tree);
@Regex::Ops::Tree::alternate::ISA   = qw(Regex::Ops::Tree);
@Regex::Ops::Tree::multi_match::ISA = qw(Regex::Ops::Tree);
@Regex::Ops::Tree::group::ISA       = qw(Regex::Ops::Tree::_onearg);
@Regex::Ops::Tree::call::ISA        = qw(Regex::Ops::Tree::_onearg);
@Regex::Ops::Tree::rule::ISA        = qw(Regex::Ops::Tree::_onearg);
@Regex::Ops::Tree::scan::ISA        = qw(Regex::Ops::Tree::_onearg);
@Regex::Ops::Tree::atend::ISA       = qw(Regex::Ops::Tree);
@Regex::Ops::Tree::advance::ISA     = qw(Regex::Ops::Tree::_atom);

@Regex::Ops::Tree::nop::ISA   = qw(Regex::Ops::Tree);
@Regex::Ops::Tree::check::ISA = qw(Regex::Ops::Tree);

@Regex::Ops::Tree::call::ISA = qw(Regex::Ops::Tree);
@Regex::Ops::Tree::code::ISA = qw(Regex::Ops::Tree);

# Construct a new op
sub op {
    my ( $class, $name, $args, %opts ) = @_;

    $class = ref($class) if ref $class;
    while (1) {
        last if UNIVERSAL::isa( "${class}::$name", 'Regex::Ops::Tree' );
        $class =~ s/::\w+$// or confess "Called op on invalid class $_[0]";
    }
    $class = "${class}::$name";
    my $self = bless {
        name => $name,
        args => $args || [],
        %opts
    }, $class;

    return $self->init();
}

# Default initialization, to be overridden in subclasses.
sub init { return shift(); }

# Exportable convenience function
sub rop {
    return __PACKAGE__->op(@_);
}

use vars qw(%MARKERS);

sub mark {
    my $name = shift || '';
    my $number = ++$MARKERS{$name};
    $number = '' if ( $number == 1 ) && ( $name ne '' );
    return bless( [ 'label', "\@$name$number" ], 'asm_op' );
}

############################################################################
# OPTIMIZATION INFORMATION
#
# All of this is only for computing information that may be useful in
# optimizing generated regular expressions. It is not needed for basic
# compilation.
#
# Probably the best way to understand this stuff is to look at
# where it's used, mostly in TreeOptimize.pm.
############################################################################

sub order_startset {
    my $start = shift;

    # Must maintain invariant that '' comes first, if it exists.
    my @null;
    push( @null, '' ) if exists $start->{''};
    delete $start->{''};
    return ( @null, keys %$start );
}

# Defaults
sub minlen   { confess "unimplemented" }
sub maxlen   { confess "unimplemented" }
sub startset { confess "unimplemented" }
sub hasback  { 0 }
sub dfa_safe { 0 }

# Superclass for ops like scan(R) that contain a single subexpression.
# By default, all calls are propagated to the subexpression.
package Regex::Ops::Tree::_onearg;
sub minlen   { my ($op) = @_; $op->{args}->[0]->minlen() }
sub maxlen   { my ($op) = @_; $op->{args}->[0]->maxlen() }
sub startset { my ($op) = @_; $op->{args}->[0]->startset() }
sub hasback  { my ($op) = @_; $op->{args}->[0]->hasback() }
sub dfa_safe { my ($op) = @_; $op->{args}->[0]->dfa_safe() }

# Superclass for ops that match a single input atom (eg a character or
# character class.)
package Regex::Ops::Tree::_atom;

sub minlen   { 1 }
sub maxlen   { 1 }
sub dfa_safe { 1 }
sub hasback  { 1 }

# Sequences of regexes like RS
package Regex::Ops::Tree::seq;

sub init {
    my ($self) = @_;

    if ( @{ $self->{args} } == 0 ) {
        delete $self->{name};
        delete $self->{args};
        return $self->op( 'nop', [], %$self );
    }
    elsif ( @{ $self->{args} } == 1 && 2 == keys %$self ) {
        return $self->{args}->[0];
    }
    else {
        return $self;
    }
}

sub minlen {
    my $op     = shift;
    my $minlen = 0;
    $minlen += $_->minlen() foreach @{ $op->{args} };
    return $minlen;
}

sub maxlen {
    my $op     = shift;
    my $maxlen = 0;
    foreach my $kid ( @{ $op->{args} } ) {
        my $kidmax = $kid->maxlen();
        return undef if !defined $kidmax;
        $maxlen += $kidmax;
    }
    return $maxlen;
}

sub startset {
    my $op = shift;
    my %start;
    foreach ( @{ $op->{args} } ) {
        my @next = $_->startset();
        @start{@next} = ();

        # Stop unless NULLABLE.
        last unless ( @next && $next[0] eq '' );
    }

    return Regex::Ops::Tree::order_startset( \%start );
}

sub dfa_safe {
    my $op = shift;
    foreach ( @{ $op->{args} } ) {
        return 0 unless $_->dfa_safe();
    }
    return 1;
}

# R|S
package Regex::Ops::Tree::alternate;

sub init {
    my ($self) = @_;

    if ( @{ $self->{args} } == 0 ) {
        delete $self->{name};
        delete $self->{args};
        return $self->op( 'nop', [], %$self );
    }
    elsif ( @{ $self->{args} } == 1 && 2 == keys %$self ) {
        return $self->{args}->[0];
    }
    else {
        return $self;
    }
}

sub minlen {
    my $op = shift;
    my $min;
    foreach ( @{ $op->{args} } ) {
        my $len = $_->minlen();
        $min = $len if ( !defined $min ) || ( $min > $len );
    }
    return $min || 0;
}

sub maxlen {
    my $op  = shift;
    my $max = 0;
    foreach ( @{ $op->{args} } ) {
        my $len = $_->maxlen();
        return undef if !defined $len;
        $max = $len if $max < $len;
    }
    return $max;
}

sub startset {
    my $op = shift;
    my %start;
    foreach ( @{ $op->{args} } ) {
        @start{ $_->startset() } = ();
    }
    return Regex::Ops::Tree::order_startset( \%start );
}

# Returns true if at most one subexpression can ever hold at a given
# point in the input string. (Return value is conservative: this will
# return false if it is not sure.)
sub disjoint {
    my $op = shift;

    # For now, return true iff all subexpressions' startsets contain
    # only integers (no character classes or weirder things), and all
    # of those integers are different.
    my %start;
    foreach my $subop ( @{ $op->{args} } ) {
        my @subop_startset = $subop->startset();
        return 0 if grep { ref($_) || $_ !~ /^\d+$/ } @subop_startset;
        foreach (@subop_startset) {
            return 0 if exists( $start{$_} );
        }
    }

    return 1;
}

sub dfa_safe {
    my $op = shift;

    my $dfa_safe_subexprs = 1;
    foreach ( @{ $op->{args} } ) {
        $dfa_safe_subexprs = 0, last if !$_->dfa_safe();
    }

    return 1 if $dfa_safe_subexprs && $op->disjoint();

    # Insert better tests here

    return 0;
}

# R*, R+, R?, nongreedy variants of those
package Regex::Ops::Tree::multi_match;

sub minlen {
    my $op = shift;
    return 0 if $op->{args}->[0] <= 0;
    return $op->{args}->[0] * $op->{args}->[3]->minlen();
}

sub maxlen {
    my $op = shift;
    my ( $min, $max, $greedy, $R ) = @{ $op->{args} };
    my $sublen = $R->maxlen();
    if ( defined($max) && $max !~ /^-?\d+/ ) {
        return undef;
    }
    elsif ( !defined($max) || $max == -1 ) {
        return undef if !defined($sublen);    # [m..INF]*
        return undef if $sublen > 0;          # [m..sublen]*
        return 0;                             # [0..0]*
    }
    else {
        return undef if !defined($sublen);    # [m..INF] repeated up to N times
        return $max * $sublen;                # [m..sublen] repeated up to N times
    }
}

sub startset {
    my $op  = shift;
    my @sub = $op->{args}->[3]->startset();
    if ( @sub && $sub[0] ne '' ) {
        if ( $op->minlen() == 0 ) {
            return ( '', @sub );
        }
        else {
            return @sub;
        }
    }
    else {
        return @sub;
    }
}

package Regex::Ops::Tree::scan;
sub maxlen { undef }

# $ (not R$, just $)
package Regex::Ops::Tree::atend;

sub minlen   { 0 }
sub maxlen   { 0 }
sub startset { return () }
sub hasback  { 0 }
sub dfa_safe { 1 }

package Regex::Ops::Tree::nop;
sub minlen { 0 }
sub maxlen { 0 }

package Regex::Ops::Tree::check;
sub minlen { $_[0]->{args}->[1]->minlen(); }
sub maxlen { $_[0]->{args}->[1]->maxlen(); }

package Regex::Ops::Tree::rule;
sub minlen   { my ($op) = @_; $op->{args}->[1]->minlen() }
sub maxlen   { my ($op) = @_; $op->{args}->[1]->maxlen() }
sub startset { my ($op) = @_; $op->{args}->[1]->startset() }
sub hasback  { my ($op) = @_; $op->{args}->[1]->hasback() }
sub dfa_safe { my ($op) = @_; $op->{args}->[1]->dfa_safe() }

# Rule calls are totally unpredictable -- for now. I think some static
# analysis might not be too hard.
package Regex::Ops::Tree::call;
sub minlen   { 0 }
sub maxlen   { undef }
sub dfa_safe { 0 }
sub hasback  { 1 }
sub startset { undef }

# Embedded code is truly unpredictable. Although there will probably
# be pragmata for allowing code to specify that it won't muck with
# things.
package Regex::Ops::Tree::code;
sub minlen   { 0 }
sub maxlen   { undef }
sub dfa_safe { 0 }
sub hasback  { 0 }       # FIXME! code should be allowed to have BACK{} blocks
sub startset { undef }

########################################################################
# Rendering - only used for debugging for now
########################################################################

package Regex::Ops::Tree;    # Won't get used much

sub needparen { 0 }

# Nonportable
sub isplain {
    my $ord = shift;
    return 1 if $ord >= ord('a') && $ord <= ord('z');
    return 1 if $ord >= ord('A') && $ord <= ord('Z');
    return 1 if $ord >= ord('0') && $ord <= ord('9');
    return 1 if $ord =~ /^[~!@#%&_'":;>,<]$/;

    #    return 1 if $ord =~ /^[`~!@#$%^&*()\-_+{}\[\]\\|'":;\/?.>,<]$/;
    return 0;
}

sub Regex::Ops::Tree::match::render {
    my $op   = shift;
    my $atom = $op->{args}->[0];
    if ( $atom =~ /^\d+$/ ) {
        return chr($atom) if isplain($atom);
        return sprintf( "\\x%02x", $atom );    # Nonportable
    }
    else {
        die;
    }
}

sub Regex::Ops::Tree::charclass::render {
    die;
}

sub Regex::Ops::Tree::classpieces::render {
    die;
}

sub Regex::Ops::Tree::seq::needparen { 0 }

sub Regex::Ops::Tree::seq::render {
    my $op = shift;
    return join( '', map { $_->render() } @{ $op->{args} } );
}

sub Regex::Ops::Tree::alternate::needparen { 1 }

sub Regex::Ops::Tree::alternate::render {
    my $op  = shift;
    my $str = '';
    foreach my $case ( @{ $op->{args} } ) {
        $str .= "|" unless length($str) == 0;
        my $R = $case->render();
        $str .= $case->needparen() ? "(?:$R)" : $R;
    }

    return $str;
}

sub Regex::Ops::Tree::multi_match::needparen { 1 }

sub Regex::Ops::Tree::multi_match::render {
    my $op = shift;
    my ( $min, $max, $greedy, $R ) = @{ $op->{args} };
    my $base = $R->render();
    $base = "(?:$base)" if $R->needparen();
    if ( $min == 0 && $max == 1 ) {
        $base .= "?";
    }
    elsif ( $min == 0 && $max == -1 ) {
        $base .= "*";
    }
    elsif ( $min == 1 && $max == -1 ) {
        $base .= "+";
    }
    elsif ( $max == -1 ) {
        $base .= "{$min,}";
    }
    else {
        $base .= "{$min,$max}";
    }

    $base .= "?" unless $greedy;
    return $base;
}

sub Regex::Ops::Tree::group::needparen { 0 }

sub Regex::Ops::Tree::group::render {
    my $op  = shift;
    my $R   = $op->{args}->[0];
    my $str = $R->render();

    # Strip off (?:) from the subexpression if possible, so we can
    # render things as (R) instead of ((?:R))
    if ( $str =~ /^\(\?\:(.*)\)$/ ) {
        $str = $1;
    }
    return "($str)";
}

# FIXME: Should render the _absence_ of this op differently!
sub Regex::Ops::Tree::scan::needparen { 0 }
sub Regex::Ops::Tree::scan::render    { $_[0]->{args}->[0]->render() }

sub Regex::Ops::Tree::atend::needparen { 0 }
sub Regex::Ops::Tree::atend::render    { '$' }

sub Regex::Ops::Tree::check::needparen { 0 }
sub Regex::Ops::Tree::check::render    { $_[0]->{args}->[1]->render() }

sub Regex::Ops::Tree::rule::render {
    my $self = shift;
    my ( $name, $tree ) = @{ $self->{args} };
    my $expr = $tree->render;
    if ( $name eq 'default' ) {
        return $expr;
    }
    else {
        return "rule $name { $expr }";
    }
}

########################################################################
# Dumping trees, for debugging only
########################################################################

package Regex::Ops::Tree;

sub dump_tree {
    my ($op) = @_;
    my $ref = $op->reftree();
    return dump_ref($ref);
}

sub dump_ref {
    my ( $ref, $indent ) = @_;
    $indent ||= "";
    print $indent;
    if ( ref $ref ) {
        my ( $name, @children ) = @$ref;
        print $name, "\n";
        dump_ref( $_, $indent . "  " ) foreach (@children);
    }
    else {
        print $ref, "\n";
    }
}

sub annotated {
    my ( $op, $str ) = @_;
    my $min = $op->minlen();
    my $max = $op->maxlen();
    $max = "INF" if !defined $max;
    return "$str [$min..$max]";
}

# Defaults
sub Regex::Ops::Tree::reftree {
    my $op = shift;
    return $op->{name};
}

sub Regex::Ops::Tree::_onearg::reftree {
    my $op = shift;
    return [ annotated( $op, $op->{name} ), $op->{args}->[0]->reftree() ];
}

sub Regex::Ops::Tree::match::reftree {
    my $op = shift;
    return annotated( $op, "match(" . $op->render(@_) . ")" );
}

sub Regex::Ops::Tree::charclass::reftree {
    my $op = shift;
    return annotated( $op, "charclass " . $op->render(@_) );
}

sub Regex::Ops::Tree::classpieces::reftree {
    my $op = shift;
    return annotated( $op, "classpieces " . $op->render(@_) );
}

sub Regex::Ops::Tree::seq::reftree {
    my $op = shift;
    return [ annotated( $op, "seq" ), map { $_->reftree() } @{ $op->{args} } ];
}

sub Regex::Ops::Tree::alternate::reftree {
    my $op = shift;
    my $ref = [ annotated( $op, "alternate" ) ];
    push( @$ref, $_->reftree() ) foreach ( @{ $op->{args} } );
    return $ref;
}

sub Regex::Ops::Tree::multi_match::reftree {
    my $op = shift;

    my ( $min, $max, $greedy, $R ) = @{ $op->{args} };
    my $ref = [ annotated( $op, "multi_match($min .. $max)" ), $R->reftree() ];
    $ref->[0] .= "?" unless $greedy;

    return $ref;
}

sub Regex::Ops::Tree::check::reftree {
    my $op = shift;
    return [ annotated( $op, "check($op->{args}->[0])" ), $op->{args}->[1]->reftree() ];
}

sub Regex::Ops::Tree::rule::reftree {
    my $op = shift;
    my ( $rule, $tree ) = @{ $op->{args} };
    return [ annotated( $op, "rule($rule)" ), $tree->reftree() ];
}

1;

# Local Variables:
#   mode: cperl
#   cperl-indent-level: 4
#   fill-column: 100
# End:
# vim: expandtab shiftwidth=4:


syntax highlighted by Code2HTML, v. 0.9.1