module: regular-expressions
author: Nick Kramer (nkramer@cs.cmu.edu)
copyright:  Copyright (C) 1994, Carnegie Mellon University.
            All rights reserved.
rcs-header: $Header: /scm/cvs/fundev/Sources/lib/regular-expressions/parse.dylan,v 1.1 2004/03/12 00:08:52 cgay Exp $

//======================================================================
//
// Copyright (c) 1994  Carnegie Mellon University
// All rights reserved.
// 
// Use and copying of this software and preparation of derivative
// works based on this software are permitted, including commercial
// use, provided that the following conditions are observed:
// 
// 1. This copyright notice must be retained in full on any copies
//    and on appropriate parts of any derivative works.
// 2. Documentation (paper or online) accompanying any system that
//    incorporates this software, or any part of it, must acknowledge
//    the contribution of the Gwydion Project at Carnegie Mellon
//    University.
// 
// This software is made available "as is".  Neither the authors nor
// Carnegie Mellon University make any warranty about the software,
// its performance, or its conformity to any specification.
// 
// Bug reports, questions, comments, and suggestions should be sent by
// E-mail to the Internet address "gwydion-bugs@cs.cmu.edu".
//
//======================================================================

// This is a program to parse regular expressions. The grammar I'm using is:
//
//      <regexp> ::= <alternative> | <alternative>|<regexp>
//
//      <alternative> ::= <quantified-atom> | <quantified-atom><alternative>
//
//      <quantified-atom> ::= <atom> | <atom><quantifier>
//
//      <quantifier> ::= * | + | ? | {n} | {n,} | {n, m}
//            (where n and m are decimal integers)
//
//      <atom> ::= (<regexp>) | <extended-character>
//
// See "Programming perl", p. 103-104 for more details.
//
// Because an assertion is a type of <extended-character>, this will
// parse a "quantified assertion", which really isn't a legal regular
// expression component.  Match.dylan could go into an infinite loop
// if given this.

define abstract class <parsed-regexp> (<object>)
end class <parsed-regexp>;

define class <mark> (<parsed-regexp>)
  slot child :: <parsed-regexp>,  required-init-keyword: #"child";
  constant slot group-number :: <integer>, required-init-keyword: #"group";
end class <mark>;

define class <union> (<parsed-regexp>)          //    |
  slot left  :: <parsed-regexp>, required-init-keyword: #"left";
  slot right :: <parsed-regexp>, required-init-keyword: #"right";
end class <union>;

define class <alternative> (<parsed-regexp>)    // concatenation
  slot left :: <parsed-regexp>,  required-init-keyword: #"left";
  slot right :: <parsed-regexp>, required-init-keyword: #"right";
end class <alternative>;

define class <parsed-assertion> (<parsed-regexp>)
  constant slot asserts :: <symbol>, required-init-keyword: #"assertion";
end class <parsed-assertion>;

define class <quantified-atom> (<parsed-regexp>)
  slot atom :: <parsed-regexp>, required-init-keyword: #"atom";
  constant slot min-matches :: <integer>, init-value: 0,
    init-keyword: #"min";
  constant slot max-matches :: false-or(<integer>), init-value: #f, 
    init-keyword: #"max";
end class <quantified-atom>;

define abstract class <parsed-atom> (<parsed-regexp>)
end class <parsed-atom>;

define class <parsed-character> (<parsed-atom>)
  constant slot character :: <character>, required-init-keyword: #"character";
end class <parsed-character>;

define class <parsed-string> (<parsed-atom>)
  constant slot string :: <string>, required-init-keyword: #"string";
end class <parsed-string>;

define class <parsed-set> (<parsed-atom>)
  constant slot char-set :: <character-set>, required-init-keyword: #"set";
end class <parsed-set>;

define class <parsed-backreference> (<parsed-atom>)
  constant slot group-number :: <integer>, required-init-keyword: #"group"; 
end class <parsed-backreference>;

// <parse-info> contains some information about the current regexp
// being parsed.  Using a structure is slightly nicer than having
// global variables..
//
define class <parse-info> (<object>)
  slot backreference-used :: <boolean>, init-value: #f;
     // Whether or not the function includes \1, \2, etc in the regexp.
     // This is different from return-marks, which determines whether the
     // user wants to know about the marks.
  slot has-alternatives :: <boolean>, init-value: #f;
  slot has-quantifiers :: <boolean>, init-value: #f;
  slot current-group-number :: <integer>, init-value: 0;
  constant slot set-type :: <class>, required-init-keyword: #"set-type";
end class <parse-info>;

define class <illegal-regexp> (<error>)
  constant slot regular-expression :: <string>, 
    required-init-keyword: #"regexp";
end class <illegal-regexp>;

define sealed domain make (singleton(<illegal-regexp>));
define sealed domain initialize (<illegal-regexp>);

/* KJP: Doesn't work this way in Functional Developer.
define sealed method report-condition (cond :: <illegal-regexp>, stream) => ();
  condition-format(stream, "Illegal regular expression: \n"
		     "A sub-regexp that matches the empty string has"
		     " been quantified in\n   %s",
		   cond.regular-expression);
end method report-condition;
*/
ignorable(regular-expression);

define method parse (regexp :: <string>, character-set-type :: <class>)
 => (parsed-regexp :: <parsed-regexp>, last-group :: <integer>,
     backrefs? :: <boolean>, alternatives? :: <boolean>, 
     quantifiers? :: <boolean>);
  let parse-info = make(<parse-info>, set-type: character-set-type);
  let parse-string = make(<parse-string>, string: regexp);
  let parse-tree = make(<mark>, group: 0, 
			child: parse-regexp(parse-string, parse-info));
  let optimized-regexp = optimize(parse-tree);
  if (optimized-regexp.pathological?)
    signal(make(<illegal-regexp>, regexp: regexp));
  else
    values(optimized-regexp,
	   parse-info.current-group-number,
	   parse-info.backreference-used,
	   parse-info.has-alternatives,
	   parse-info.has-quantifiers);
  end if;
end method parse;

define method parse-regexp (s :: <parse-string>, info :: <parse-info>)
 => parsed-regexp :: <parsed-regexp>;
  let alternative = parse-alternative(s, info);
  if (lookahead(s) = '|')
    info.has-alternatives := #t;
    make(<union>, left: alternative, right: parse-regexp(consume(s), info));
  else
    alternative;
  end if;
end method parse-regexp;

define method parse-alternative (s :: <parse-string>, info :: <parse-info>)
 => parsed-regexp :: <parsed-regexp>;
  let term = parse-quantified-atom(s, info);
  if (member?(lookahead(s), #(#f, '|', ')')))
    term;
  else
    make(<alternative>, left: term, right: parse-alternative(s, info));
  end if;
end method parse-alternative;

define method parse-quantified-atom (s :: <parse-string>, info :: <parse-info>)
 => parsed-regexp :: <parsed-regexp>;
  let atom = parse-atom(s, info);
  let char = lookahead(s);
  select (char by \=)
    '*' =>
      info.has-quantifiers := #t;
      consume(s);
      make(<quantified-atom>, min: 0, atom: atom);

    '+' =>
      info.has-quantifiers := #t;
      consume(s);
      make(<quantified-atom>, min: 1, atom: atom);

    '?' =>
      info.has-quantifiers := #t;
      consume(s);
      make(<quantified-atom>, min: 0, max: 1, atom: atom);

    '{' =>
      info.has-quantifiers := #t;
      consume(s);
      let first-string = make(<deque>);
      let second-string = make(<deque>);
      let has-comma = #f;
      for (c = lookahead(s) then lookahead(s), until: c = '}')
	consume(s);
	if (c = ',')  
	  has-comma := #t;
	elseif (has-comma)  
	  push-last(second-string, c);
	else 
	  push-last(first-string, c);
	end if;
      end for;
      consume(s);         // Eat closing brace
      make(<quantified-atom>, atom: atom, 
	   min: sequence-to-integer(first-string), // KJP: string-to -> sequence-to
	   max:  if (~has-comma)    
		   sequence-to-integer(first-string)
		 elseif (empty?(second-string))   
		   #f
		 else
		   sequence-to-integer(second-string) 
		 end if);

    otherwise =>
      atom;
  end select;
end method parse-quantified-atom;

// KJP: added, quickie
//
define method sequence-to-integer (seq :: <deque>) => (int :: <integer>)
  string-to-integer(as(<byte-string>, seq));
end method sequence-to-integer;

define method parse-atom (s :: <parse-string>, info :: <parse-info>)
 => parsed-regexp :: <parsed-regexp>;
  let char = lookahead(s);
  select (char)
    '(' =>
      consume(s);   // Consume beginning paren
      info.current-group-number := info.current-group-number + 1;
      let this-group = info.current-group-number;
      let regexp = parse-regexp(s, info);
      if (lookahead(s) ~= ')')
	error("Unbalanced parens in regexp");
      end if;
      consume(s);   // Consume end paren
      make(<mark>, child: regexp, group: this-group);

    ')' =>
      #f;             // Need something to terminate upon seeing a close paren

    #f  =>
      #f;   // Signal error?  (end of stream)

    '*', '|', '+' =>
      #f;

    '\\' =>
      consume(s);        // Consume the backslash
      parse-escaped-character(s, info);

    '[' =>
      consume(s);        // Eat the opening brace
      let set-string = make(<deque>);      // Need something that'll 
                                           // preserve the right ordering
      for (char = lookahead(s) then lookahead(s), until: char == ']')
	consume(s);                    // eat char
	if (char ~== '\\')
	  push-last(set-string, char);
	else
	  let char2 = lookahead(s);
	  consume(s);  // Eat escaped char
	  if (char2 == ']')
	    push-last(set-string, ']');
	  else
	    push-last(set-string, '\\');
	    push-last(set-string, char2);
	  end if;
	end if;
      end for;
      consume(s);     // Eat ending brace
      make(<parsed-set>, set: make(info.set-type, description: set-string));

    '.' =>
      consume(s);
      dot;

    '^' =>
      consume(s);
      make(<parsed-assertion>, assertion: #"beginning-of-string");

    '$' =>
      consume(s);
      make(<parsed-assertion>, assertion: #"end-of-string");
  
      // Insert more special characters here

    otherwise =>
      let char = lookahead(s);
      consume(s);
      make(<parsed-character>, character: char);
  end select;
end method parse-atom;

define constant any-char 
  = make(<case-sensitive-character-set>, description: "^\n");

// The useful definitions of all these is in as(<character-set>)
//
define constant digit-chars
  = make(<case-sensitive-character-set>, description: "\\d");
define constant not-digit-chars
  = make(<case-sensitive-character-set>, description: "^\\d");
define constant word-chars
  = make(<case-sensitive-character-set>, description: "\\w");
define constant not-word-chars
  = make(<case-sensitive-character-set>, description: "^\\w");
define constant whitespace-chars
  = make(<case-sensitive-character-set>, description: "\\s");
define constant not-whitespace-chars
  = make(<case-sensitive-character-set>, description: "^\\s");

define constant dot = make(<parsed-set>, set: any-char);
/* KJP: Not used.
define constant dot-star = make(<quantified-atom>, min: 0, max: #f,
				atom: dot);
*/

// This only handles escaped characters *outside* of a character
// set. Inside of a character set is a whole different story.
//
define method parse-escaped-character 
    (s :: <parse-string>, info :: <parse-info>)
 => parsed-regexp :: <parsed-regexp>;
  let next-char = lookahead(s);
  consume(s);
  select (next-char)
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' =>
      info.backreference-used := #t;
      make(<parsed-backreference>, group: digit-to-integer(next-char));

    'n' =>   make(<parsed-character>, character: '\n');   // Newline
    't' =>   make(<parsed-character>, character: '\t');   // Tab
    'f' =>   make(<parsed-character>, character: '\f');   // Formfeed
    'r' =>   make(<parsed-character>, character: '\r');   // Carriage return

    'b' =>   make(<parsed-assertion>, assertion: #"word-boundary");
    'B' =>   make(<parsed-assertion>, assertion: #"not-word-boundary");
       // Beginning and end of string are not escaped

    'd' =>   make(<parsed-set>, set: digit-chars);
    'D' =>   make(<parsed-set>, set: not-digit-chars);
    'w' =>   make(<parsed-set>, set: word-chars);
    'W' =>   make(<parsed-set>, set: not-word-chars);
    's' =>   make(<parsed-set>, set: whitespace-chars);
    'S' =>   make(<parsed-set>, set: not-whitespace-chars);

    // Insert more escaped characters here

    otherwise =>
      make(<parsed-character>, character: next-char);
  end select;
end method parse-escaped-character;

define method is-anchored? (regexp :: <parsed-regexp>)
 => (result :: <boolean>);
  select (regexp by instance?)
    <mark> => is-anchored?(regexp.child);
    <alternative> => is-anchored?(regexp.left);
    <parsed-assertion> => regexp.asserts == #"beginning-of-string";
    otherwise => #f;
  end select;
end method is-anchored?;

define method initial-substring (regexp :: <parsed-regexp>)
 => (result :: <string>);
  let result = make(<deque>);
  local method init (regexp :: <parsed-regexp>, result :: <deque>)
	  select (regexp by instance?)
	    <alternative> =>
	      init(regexp.left, result) & init(regexp.right, result);
	    <parsed-character> =>
	      push-last(result, regexp.character);
	    <parsed-string> =>
	      for (ch in regexp.string) push-last(result, ch) end for;
	    <mark> =>
	      init(regexp.child, result);
	    <parsed-assertion> =>
	      #t;
	    otherwise =>
	      #f;
	  end select;
	end method init;
  init(regexp, result);
  as(<byte-string>, result);
end method initial-substring;

// Optimize converts a parse tree into an "optimized" parse tree.
// Currently the only optimization is merging adjacent characters into
// a string.
//
define method optimize (regexp :: <parsed-regexp>)
 => (regexp :: <parsed-regexp>);
  select (regexp by instance?)
    <mark> =>
      regexp.child := optimize(regexp.child);
      regexp;
    <alternative> =>
      if (instance?(regexp.left, <parsed-character>))
	let result-str = make(<deque>);
	push-last(result-str, regexp.left.character);
	for (next = regexp.right then next.right,
	     while: (instance?(next, <alternative>)
		       & instance?(next.left, <parsed-character>)))
	  push-last(result-str, next.left.character)
	finally
	  if (instance?(next, <parsed-character>))
	    push-last(result-str, next.character);
	    make(<parsed-string>, string: as(<string>, result-str));
	  elseif (result-str.size = 1)
	    regexp.right := optimize(regexp.right);
	    regexp;
	  else
	    make(<alternative>,
		 left: make(<parsed-string>, string: as(<string>, result-str)),
		 right: optimize(next));
	  end if;
	end for;
      else
	regexp.left := optimize(regexp.left);
	regexp.right := optimize(regexp.right);
	regexp;
      end if;
    <union> =>
      regexp.left := optimize(regexp.left);
      regexp.right := optimize(regexp.right);
      regexp;
    <quantified-atom> =>
      regexp.atom := optimize(regexp.atom);
      regexp;
    otherwise =>
      regexp;
  end select;
end method optimize;

// We have to somehow deal with pathological regular expressions like
// ".**".  Perl simply signals an error in this case.  We *could* in
// fact match these pathological regexps using the formulation below,
// but it doesn't seem worth the trouble.  Frankly, I doubt anyone has
// ever tried to use such a pathological regexp and *not* have done it
// by mistake.  But in case I'm wrong, here's how to fix a
// pathological regexp:
//
// First, realize that pathological regexps stem from infinitely
// quantifying subregexps that could match the empty string.  So what
// we do is find this subregexps, and perform the following
// transformation:
//
//  case (type of regexp)
//    r1r2 => r1'r2|r2'
//    r1|r2 => r1'|r2'
//    r1{0,n} => r1'{1,n}
//    r1{0,} => r1'{1,}
//    atom => atom
//    assertion => can't be done
//
// This transformation turns a might-match-emptystring regexp into a
// regexp that matches the same set of strings minus the empty string.
// If this transformation can't be done, remember that "$*" is
// equivalent to "always true and consumes no input".


define generic matches-empty-string? (regexp :: <parsed-regexp>)
 => answer :: <boolean>;

define method matches-empty-string? (regexp :: <parsed-atom>)
 => answer :: <boolean>;
  #f;
end method matches-empty-string?;

define method matches-empty-string? (regexp :: <parsed-assertion>)
 => answer :: <boolean>;
  #t;
end method matches-empty-string?;

define method matches-empty-string? (regexp :: <mark>)
 => answer :: <boolean>;
  regexp.child.matches-empty-string?;
end method matches-empty-string?;

define method matches-empty-string? (regexp :: <union>)
 => answer :: <boolean>;
  regexp.left.matches-empty-string? | regexp.right.matches-empty-string?;
end method matches-empty-string?;

define method matches-empty-string? (regexp :: <alternative>)
 => answer :: <boolean>;
  regexp.left.matches-empty-string? & regexp.right.matches-empty-string?;
end method matches-empty-string?;

define method matches-empty-string? (regexp :: <quantified-atom>)
 => answer :: <boolean>;
   regexp.min-matches == 0 | regexp.atom.matches-empty-string?;
end method matches-empty-string?;


define generic pathological? (regexp :: <parsed-regexp>)
 => answer :: <boolean>;

define method pathological? (regexp :: <parsed-atom>)
 => answer :: <boolean>;
  #f;
end method pathological?;

define method pathological? (regexp :: <parsed-assertion>)
 => answer :: <boolean>;
  #f;
end method pathological?;

define method pathological? (regexp :: <mark>)
 => answer :: <boolean>;
  regexp.child.pathological?;
end method pathological?;

define method pathological? (regexp :: <union>)
 => answer :: <boolean>;
  regexp.left.pathological? | regexp.right.pathological?;
end method pathological?;

define method pathological? (regexp :: <alternative>)
 => answer :: <boolean>;
  regexp.left.pathological? | regexp.right.pathological?;
end method pathological?;

define method pathological? (regexp :: <quantified-atom>)
 => answer :: <boolean>;
  regexp.max-matches == #f & regexp.atom.matches-empty-string?;
end method pathological?;

// Seals for file parse.dylan

// <mark> -- subclass of <parsed-regexp>
define sealed domain make(singleton(<mark>));
// <union> -- subclass of <parsed-regexp>
define sealed domain make(singleton(<union>));
// <alternative> -- subclass of <parsed-regexp>
define sealed domain make(singleton(<alternative>));
// <parsed-assertion> -- subclass of <parsed-regexp>
define sealed domain make(singleton(<parsed-assertion>));
// <quantified-atom> -- subclass of <parsed-regexp>
define sealed domain make(singleton(<quantified-atom>));
// <parsed-character> -- subclass of <parsed-atom>
define sealed domain make(singleton(<parsed-character>));
// <parsed-string> -- subclass of <parsed-atom>
define sealed domain make(singleton(<parsed-string>));
// <parsed-set> -- subclass of <parsed-atom>
define sealed domain make(singleton(<parsed-set>));
// <parsed-backreference> -- subclass of <parsed-atom>
define sealed domain make(singleton(<parsed-backreference>));
// <parse-info> -- subclass of <object>
define sealed domain make(singleton(<parse-info>));
define sealed domain initialize(<parse-info>);