# Optimize the regular expression syntax tree before rendering it out
# as List ops. Most optimizations should go here, since the List form
# is really suitable only for peephole optimization.

# Copyright (C) 2004-2006, The Perl Foundation.
# $Id: TreeOptimize.pm 21249 2007-09-13 06:33:24Z paultcochrane $

package Regex::TreeOptimize;
use Regex::Ops::Tree qw(rop);
use Carp qw(croak);
use strict;
use warnings;

sub new {
    my ( $proto, %opts ) = @_;
    my $self = bless \%opts, ( ref($proto) || $proto );
    $self->init();
    return $self;
}

sub init {
}

sub optimize_tree {
    my ( $self, $t, $ctx ) = @_;
    $t = $self->pass1( $t, $ctx );

    # Disable implicit checks
    $t = $self->maptree( $t, sub { shift->{nocheck} = 1 } );

    # Add necessary checks back in
    $t = $self->add_checks( $t, 0, 0 );

    # Delete the no-implicit-check marker
    $t = $self->maptree( $t, sub { delete shift->{nocheck} } );
    return $t;
}

# Perform a few simple optimizations:
#
# 1. R(ST)    -> RST
# 2. aR|aS    -> a(R|S)
#    R|<null> -> R?
#    <null>|R -> R??
#
sub pass1 {
    my ( $self, $t, $ctx ) = @_;
    my $type = $t->{name};
    if ( $type eq 'seq' ) {

        # R.(S.T) => R.S.T
        # Or parenthetically, seq(R,seq(\alpha)) => seq(R, \alpha)
        #
        my @pre_pieces = map { $self->pass1( $_, $ctx ) } @{ $t->{args} };
        my @pieces;
        foreach (@pre_pieces) {
            if ( ref $_ && $_->{name} eq 'seq' ) {
                push @pieces, @{ $_->{args} };
            }
            else {
                push @pieces, $_;
            }
        }
        return rop( 'seq', \@pieces );
    }
    elsif ( $type eq 'alternate' ) {

        # Pull out common prefixes, and convert alternation with null
        # expressions to optional matches:
        #
        # opt(aR|aS) => a(R|S)
        # opt(R|)    => opt(R)?
        # opt(|R)    => opt(R)??
        #
        my @children = map { $self->pass1( $_, $ctx ) } @{ $t->{args} };
        if ( @children == 1 ) {
            return $children[0];
        }

        for (@children) { croak "nonref alternative?" if !ref $_; }

        # opt(|R) -> opt(R)??
        # opt(|R|S) -> opt(R|S)??
        # opt(|||R|S) -> opt(R|S)??
        # opt(|||) -> ()
        if ( $children[0]->{name} eq 'nop' ) {
            pop(@children) while $children[0]->{name} eq 'nop';
            if ( @children == 0 ) {
                return rop( 'nop', [] );
            }
            elsif ( @children == 1 ) {
                return rop( 'multi_match', [ 0, 1, 0, $children[0] ] );
            }
            else {
                @{ $t->{args} } = @children;
                return rop( 'multi_match', [ 0, 1, 0, $self->pass1( $t, $ctx ) ] );
            }
        }

        # opt(R|) -> opt(R)?
        # opt(R|S|) -> opt(R|S)?
        # opt(R|S||) -> opt(R|S)?
        if ( $children[-1]->{name} eq 'nop' ) {
            pop(@children) while $children[-1]->{name} eq 'nop';
            if ( @children == 1 ) {
                return rop( 'multi_match', [ 0, 1, 1, $children[0] ] );
            }
            else {
                @{ $t->{args} } = @children;
                return rop( 'multi_match', [ 0, 1, 1, $self->pass1( $t, $ctx ) ] );
            }
        }

        # opt(aR|aS) -> a opt(R|S)
        # opt(aR|aS|aT) -> a opt(R|S|T)
        # opt(aR|aS|bT) -> (a opt(R|S) | opt(bT))
        # opt(R|...) -> (opt(R)|opt(...)) (when R is not a sequence)
        my $R = shift(@children);

        # opt(R|...) -> (opt(R)|opt(...)) (R is not a sequence)
        if ( $R->{name} ne 'seq' ) {
            my $subtree = $self->pass1( rop( 'alternate', \@children ), $ctx );
            @{ $t->{args} } = ( $R, $subtree );
            return $t;
        }

        # Find as many things as possible that match a first
        my ( $R0, @Rrest ) = @{ $R->{args} };
        my @shrunken = ();
        while (@children) {
            last unless $children[0]->{name} eq 'seq';
            my ( $S0, @Srest ) = @{ $children[0]->{args} };
            last unless matchsame( $R0, $S0 );
            my $S = shift(@children);
            @{ $S->{args} } = @Srest;
            push @shrunken, $S;
        }

        # None, so use opt(R|...) -> (opt(R)|opt(...))
        if ( @shrunken == 0 ) {
            my $subtree = $self->pass1( rop( 'alternate', \@children ), $ctx );
            @{ $t->{args} } = ( $R, $subtree );
            return $t;
        }

        # Have at least one to combine with, but possibly not all
        @{ $R->{args} } = @Rrest;
        my $subtree = $self->pass1( rop( 'alternate', [ $R, @shrunken ] ), $ctx );
        my $opt = rop( 'seq', [ $R0, $subtree ] );
        if ( @children == 0 ) {

            # Nothing left, so the original alternation disappears
            return $opt;
        }

        # Something left: opt(aR|aS|T|...) -> a opt(R|S) | opt(T|...)
        my $leftovers = $self->pass1( rop( 'alternate', \@children ), $ctx );
        @{ $t->{args} } = ( $opt, $leftovers );
        return $t;
    }
    else {

        # Find all subtrees, and recurse through them.
        foreach my $arg ( @{ $t->{args} } ) {
            if ( UNIVERSAL::isa( $arg, 'Regex::Ops::Tree' ) ) {

                # $arg is a reference variable, remember.
                $arg = $self->pass1( $arg, $ctx );
            }
        }
        return $t;
    }
}

# Note: the following fails to rewrite
#   aR|[a]S
# to
#   a(R|S)
#
sub matchsame {
    my ( $R, $S ) = @_;
    return 0 if !ref $R;
    return 0 if !ref $S;
    return 0 if $R->{name} ne $S->{name};
    my $type = $R->{name};
    if ( $type eq 'match' ) {
        return $R->{args}[0] == $S->{args}[0];
    }
    elsif ( $type eq 'classmatch' ) {
        return $R->{args}[0] eq $S->{args}[0];
    }
    elsif ( $type eq 'start' || $type eq 'end' ) {
        return 0;
    }
    else {
        my @Rargs = @{ $R->{args} };
        my @Sargs = @{ $S->{args} };
        return 0 if ( @Rargs != @Sargs );
        for ( 0 .. $#Rargs ) { return 0 if $Rargs[$_] ne $Sargs[$_] }
        return 1;
    }
}

# sub add_checks
#
# Augments the tree with length checks, so that primitive matching
# operations (eg matching single characters or character ranges) won't
# have to do their own checking.
#
# Without this, /abc/ would compile to
#  check(1)
#  match(a)
#  check(1)
#  match(b)
#  check(1)
#  match(c)
#
# where check(n) is "am I closer than n away from the end of the string?"
#
# After this operation, /abc/ compiles to
#
#  check(3)
#  match(a)
#  match(b)
#  match(c)
#
# and something trickier like /ab|c[dxy]e/ compiles to
#
#          check(2)
#          match(a) or goto try_S
#          match(b) or goto try_S
#          goto next
#   try_S: check(3)
#          match(c) or fail
#          match([dxy]) or fail
#          match(e) or fail
#
# Note that the placement of the checks is *not* always optimal. But
# they should do a pretty decent job of making things fail early if
# you get too close to the end of the string.
#
# Args:
#  t - The Tree op being visited
#  guarantee - The number of atoms we are currently guaranteed to have for
#      the current node (NOT for this node and everything following it)
#  follow_min - The minimum number of atoms in nodes following the current
#
# FIXME: Once I add in the various 'cut' operations, it will no longer
# be correct to fail early (since failure can then be more dramatic!).
#
sub add_checks {
    my ( $self, $t, $guarantee, $follow_min ) = @_;

    if ( $t->{name} eq 'rule' ) {
        $t->{args}->[1] = $self->add_checks( $t->{args}->[1], 0, 0 );
        return $t;
    }

    # Have enough for any possible match of this subtree?
    return $t if ( defined( $t->maxlen() ) && $guarantee >= $t->maxlen() );

    # Need more for even the shortest match of this subtree?
    if ( ( $t->minlen() || 0 ) > $guarantee ) {
        my $newtree = $self->add_checks( $t, $t->minlen(), $follow_min );
        return rop( 'check', [ $t->minlen() + $follow_min, $newtree ] );
    }

    # Shortest possible subtree match is guaranteed.
    my $type = $t->{name};

    if ( $type eq 'seq' && !$t->{add_check_visited} ) {

        # Split the seq up into chunks that look like they're nicely
        # checkable.

        # Break the list of children up into contiguous chunks of
        # things with and without maximum lengths
        my $havemax = 0;
        my @chunks = ( [] );

        foreach my $kid ( @{ $t->{args} } ) {
            if ( defined( $kid->maxlen() ) xor $havemax ) {

                # Change
                push @chunks, [];
                $havemax = 1 - $havemax;
            }
            push @{ $chunks[-1] }, $kid;
        }
        shift(@chunks) if @{ $chunks[0] } == 0;

        # If there were things with different types, create a new seq
        # that has a child seq for each chunk.
        if ( @chunks > 1 ) {
            my @children = map { rop( 'seq', $_ ) } @chunks;
            $t = rop( 'seq', \@children );
        }

        $t->{add_check_visited} = 1;
        return $self->add_checks( $t, $guarantee, $follow_min );

    }
    elsif ( $type eq 'seq' && $t->{add_check_visited} ) {
        my ( $kid_follow_min, $kid_guarantee ) = ( $follow_min, $guarantee );

        foreach my $kid ( @{ $t->{args} } ) {
            $kid_follow_min += $kid->minlen();
        }

        foreach my $kid ( @{ $t->{args} } ) {
            $kid_follow_min -= $kid->minlen();

            $kid = $self->add_checks( $kid, $kid_guarantee, $kid_follow_min );

            if ( defined( $kid->maxlen() ) ) {
                $kid_guarantee -= $kid->maxlen();
            }
            else {
                $kid_guarantee = 0;
            }
        }
    }
    elsif ( $type eq 'multi_match' ) {

        # R<min,max>: R is guaranteed to have 1/max as many available
        # as the whole thing does. If max is unknown, we cannot
        # guarantee anything.
        my $R   = $t->{args}->[3];
        my $max = $t->maxlen();
        if ( !defined($max) || $max < 0 ) { $guarantee = 0; }
        else { $guarantee /= $max; }
        $t->{args}->[3] = $self->add_checks( $R, int($guarantee), $follow_min );
        return $t;
    }
    else {

        # If we're scanning, we'll eat up any possible guarantee.
        $guarantee = 0 if $type eq 'scan';

        # Default for everything else is to pass the same guarantee
        # and follow_min to all children. (Which is reasonable, since
        # everything but seq tries to match its children at the same
        # starting point in the string -- think of alternation, for
        # example.)
        foreach my $arg ( @{ $t->{args} } ) {
            if ( UNIVERSAL::isa( $arg, 'Regex::Ops::Tree' ) ) {

                # $arg is a reference variable, remember.
                $arg = $self->add_checks( $arg, $guarantee, $follow_min );
            }
        }
    }

    return $t;
}

# By default, all atom match ops check to be sure there are enough
# atoms left in the input before looking at the next atom. This
# routine disables all of those checks. So you should call
# add_checks() to put the (hopefully fewer) checks back in.
sub disable_implicit_checks {
    my ( $self, $t, $ctx ) = @_;
    return $self->maptree( $t, sub { shift->{nocheck} = 1 } );
}

sub maptree {
    my ( $self, $t, $sub ) = @_;
    $sub->($t);
    foreach my $arg ( @{ $t->{args} } ) {
        if ( UNIVERSAL::isa( $arg, 'Regex::Ops::Tree' ) ) {
            $self->maptree( $arg, $sub );
        }
    }
    return $t;
}

1;

# Local Variables:
#   mode: cperl
#   cperl-indent-level: 4
#   fill-column: 100
# End:
# vim: expandtab shiftwidth=4:


syntax highlighted by Code2HTML, v. 0.9.1