module: string-hacking author: Nick Kramer (nkramer@cs.cmu.edu) synopsis: Random functionality for working with strings copyright: Copyright (C) 1994, Carnegie Mellon University. All rights reserved. rcs-header: $Header: /scm/cvs/fundev/Sources/lib/string-extensions/string-hacking.dylan,v 1.1 2004/03/12 00:09:20 cgay Exp $ //====================================================================== // // Copyright (c) 1994 Carnegie Mellon University // All rights reserved. // // Use and copying of this software and preparation of derivative // works based on this software are permitted, including commercial // use, provided that the following conditions are observed: // // 1. This copyright notice must be retained in full on any copies // and on appropriate parts of any derivative works. // 2. Documentation (paper or online) accompanying any system that // incorporates this software, or any part of it, must acknowledge // the contribution of the Gwydion Project at Carnegie Mellon // University. // // This software is made available "as is". Neither the authors nor // Carnegie Mellon University make any warranty about the software, // its performance, or its conformity to any specification. // // Bug reports, questions, comments, and suggestions should be sent by // E-mail to the Internet address "gwydion-bugs@cs.cmu.edu". // //====================================================================== // add has no useful guarenteed behavior on strings. // define method add-last (string :: , character :: ) => new-string :: ; concatenate(string, make(, size: 1, fill: character)); end method add-last; // Like character-- in C // define method predecessor (c :: ) => c2 :: ; as(, as(, c) - 1); end method predecessor; // Like character++ in C // define method successor (c :: ) => c2 :: ; as(, as(, c) + 1); end method successor; // ----------------------------------------------------------------- // The following two functions are not exported. define method xor (value1 :: , value2 :: ) => answer :: ; if (value1) ~value2; else value2; end if; end method xor; // This does a "reverse curry". It takes a function of one argument, // and returns a function of two arguments that ignores the second // argument. Useful for making functions for remove!. // /* KJP: Not used. define method make-test (predicate? :: ) => tester :: ; method (value :: , ignored :: ) => answer :: ; predicate?(value); end method; end method make-test; */ // ----------------------------------------------------------------- // KJP: Hack because of unicode glitches. define constant = ; // Character-set: A reasonably efficient way of storing sets of // characters. Store byte characters in a vector of size 256, and // keep the rest as sequences of ranges and single characters. // define sealed abstract class () constant slot byte-characters :: , // KJP: constant init-function: method () make() end; slot char-ranges :: ; // sequence of begin-char/end-char pairs slot single-chars :: ; // Characters that aren't part of a range slot negated-set? :: ; end class ; // Uses == as a comparison // define class () end class ; // Uses case-insensitive-equal as a comparison // define class () end class ; define sealed inline method key-test (set :: ) => id :: ; \==; end method key-test; define sealed inline method key-test (set :: ) => case-insensitive-equal :: ; case-insensitive-equal; end method key-test; define sealed inline method type-for-copy (set :: ) => cls :: ; ; end method type-for-copy; // Fills the byte-vector with #t's corresponding to byte characters in // the ranges and single-chars. Also converts ranges and single-chars // to vectors and strings, respectively. negated: is handled by // an init-keyword. // define sealed method initialize (set :: , #next next-method, #key description = "", #all-keys) => false :: singleton(#f); next-method(); let (ranges, chars, negated) = parse-description(description); set.negated-set? := negated; if (negated) // Add all byte characters to the vector, and we will delete the ones we // don't want. for (i from 0 below 256) // Was as(,...), which isn't supported set.byte-characters[as(, i)] := #t; end for; end if; handle-single-chars!(set, chars); let shmoonicode-ranges = #(); for (range in ranges) let first = head(range); let last = tail(range); if (byte-character?(first) & byte-character?(last)) for (c = first then successor(c), until: c > last) add-to-byte-vector!(set, c); end for; else shmoonicode-ranges := add!(shmoonicode-ranges, range); end if; end for; set.char-ranges := as(, shmoonicode-ranges); #f; end method initialize; // Not exported. Turns the appropriate character or characters in the // byte-vector to #t. // define method add-to-byte-vector! (set :: , char :: ) => false :: singleton(#f); set.byte-characters[char] := ~set.negated-set?; #f; end method add-to-byte-vector!; define method add-to-byte-vector! (set :: , char :: ) => false :: singleton(#f); set.byte-characters[as-lowercase(char)] := ~set.negated-set?; set.byte-characters[as-uppercase(char)] := ~set.negated-set?; #f; end method add-to-byte-vector!; define variable no-default = pair(#f, #f); // Call member? to do real work. // define method element (set :: , char :: , #key default = no-default) => char-or-f :: false-or(); if (member?(char, set)) char; elseif (default == no-default) error("Element %= not found", char); else default; end if; end method element; // test: is accepted but ignored. // define method member? (char :: , set :: , #key test :: = key-test(set)) => answer :: ; if (test == key-test(set)) in-byte-vector?(set, char); else block (return) for (elem :: in set) if (test(char, elem)) return(#t) end if; end for; end block; end if; end method member?; // char is not a byte-character // define method member? (c :: , set :: , #key test :: = key-test(set)) => answer :: ; if (test == key-test(set)) xor(in-single-chars?(set, c) | in-ranges?(c, set), set.negated-set?); else block (return) for (elem :: in set) if (test(c, elem)) return(#t) end if; end for; end block; end if; end method member?; define method handle-single-chars! (set :: , char-coll :: ) => same-set :: ; let not-byte-chars = make(, size: 0); for (c in char-coll) if (instance?(c, )) add-to-byte-vector!(set, c); else not-byte-chars := add!(not-byte-chars, c); end if; end for; set.single-chars := not-byte-chars; set; end method handle-single-chars!; // Convert a character set string (without [ and ]) into a character set. // define method as (type == , coll :: ) => set :: ; error("Need to specify whether you want a " " or a "); end method as; define method as (type == , coll :: ) => set :: ; let set = make(); handle-single-chars!(set, coll); set; end method as; define method as (type == , coll :: ) => set :: ; let set = make(); handle-single-chars!(set, coll); set; end method as; // Not exported. // Type is either or // define method parse-description (string :: ); let s = make(, string: string); let negated = (lookahead(s) == '^'); if (negated) consume(s) end; let char-list = #(); let range-list = #(); until (lookahead(s) = #f) // until end of string let char = lookahead(s); consume(s); if (lookahead(s) = '-') consume(s); let second-char = lookahead(s); consume(s); range-list := add!(range-list, pair(char, second-char)); elseif (char = '\\') let escaped-char = lookahead(s); consume(s); select (escaped-char by \==) 'n' => char-list := add!(char-list, '\n'); // newline 't' => char-list := add!(char-list, '\t'); // tab 'f' => char-list := add!(char-list, '\f'); // formfeed 'r' => char-list := add!(char-list, '\r'); // carriage return 'b' => char-list := add!(char-list, '\b'); // backspace 'd' => range-list := add!(range-list, pair('0', '9')); // digit-char 'w' => // word-char range-list := concatenate(range-list, list(pair('a', 'z'), pair('A', 'Z'), pair('0', '9'))); char-list := add!(char-list, '_'); 's' => char-list := concatenate(char-list, " \t\n\r\f"); // whitespace otherwise => char-list := add!(char-list, escaped-char); end select; else char-list := add!(char-list, char); end if; end until; values(range-list, char-list, negated); end method parse-description; // Not highly useful for a non-mutable class, but why bother erasing // perfectly good code.. // define method shallow-copy (set :: ) => new-set :: ; let new-set = make(object-class(set)); // Wish I had keyed-by let coll = set.byte-characters; let (state, limit, next, done?, cur-key, cur-elt) = forward-iteration-protocol(coll); for (st = state then next(coll, st), until: done?(coll, st, limit)) let elt = cur-elt(coll, st); let key = cur-key(coll, st); new-set.byte-characters[key] := elt; end for; new-set.char-ranges := shallow-copy(set.char-ranges); new-set.single-chars := shallow-copy(set.single-chars); new-set.negated-set? := set.negated-set?; new-set; end method shallow-copy; // The following in-? functions are not exported, and ignore the // negated? bit. // define method in-byte-vector? (set :: , c :: ) => answer :: ; set.byte-characters[c]; end method in-byte-vector?; define method in-ranges? (c :: , set :: ) => answer :: ; block (return) for (range in set.char-ranges) if (c >= head(range) & c <= tail(range)) return(#t); end if; end for; #f; end block; end method in-ranges?; define method in-ranges? (c :: , set :: ) => answer :: ; block (return) for (range in set.char-ranges) if (as-lowercase(c) >= head(range) & as-lowercase(c) <= tail(range)) return(#t); elseif (as-uppercase(c) >= head(range) & as-uppercase(c) <= tail(range)) return(#t); end if; end for; #f; end block; end method in-ranges?; define method in-single-chars? (set :: , c :: ) => answer :: ; member?(c, set.single-chars, test: \==); end method in-single-chars?; define method in-single-chars? (set :: , c :: ) => answer :: ; member?(c, set.single-chars, test: case-insensitive-equal); end method in-single-chars?; define constant $max-character = as(, 65535); define class () slot phase :: one-of(#"byte", #"range", #"single", #"done") = #"byte"; slot index :: = 0; slot char :: = ' '; end class ; define method new-phase (phase == #"range", set :: , state :: ) => (); if (set.char-ranges.empty?) new-phase(#"single", set, state); else state.phase := #"range"; state.index := 0; state.char := set.char-ranges.first.head; end if; end method new-phase; define method new-phase (phase == #"single", set :: , state :: ) => (); if (set.single-chars.empty?) state.phase := #"done"; else state.phase := #"single"; state.index := 0; state.char := set.single-chars.first; end if; end method new-phase; define method forward-iteration-protocol (set :: ) => (initial-state :: , limit :: , next-state :: , finished-state? :: , current-key :: , current-element :: , current-element-setter :: , copy-state :: ); if (set.negated-set?) slow-char-set-iterator(set); else let state = make(); block (return) for (ch in set.byte-characters, i from 0) if (ch) state.index := i; state.char := as(, i); return(); end if; end for; new-phase(#"range", set, state); end block; values(state, #f, // limit // next method (set :: , state :: ) => next-state :: ; select (state.phase) #"byte" => block (return) let chars = set.byte-characters; for (i from state.index + 1 below chars.size) let ch = as(, i); if (chars[ch]) state.index := i; state.char := ch; return(); end if; end for; new-phase(#"range", set, state); end block; #"range" => let new-char = state.char.successor; case (new-char <= set.char-ranges[state.index].tail) => state.char := new-char; (state.index + 1 < set.char-ranges.size) => state.index := state.index + 1; state.char := set.char-ranges[state.index].head; otherwise => new-phase(#"single", set, state); end case; #"single" => let new-index = state.index + 1; if (new-index < set.single-chars.size) state.index := new-index; state.char := set.single-chars[new-index]; else state.phase := #"done"; end if; otherwise => error("Attempt to advance a finished character-set iterator"); end select; state; end method, // finished? method (set :: , state :: , limit == #f) // KJP: -> == #f => answer :: ; state.phase == #"done"; end method, // key method (set :: , state :: ) => state :: ; state.char; end method, // element method (set :: , state :: ) => state :: ; state.char; end method, // element-setter method (value, set :: , state :: ) => state :: ; error("Character sets are immutable"); end method, // copy-state method (set :: , state :: ) => state :: ; make(, phase: state.phase, index: state.index, char: state.char); end method); end if; end method forward-iteration-protocol; define method forward-iteration-protocol (set :: ) => (initial-state :: , limit :: , next-state :: , finished-state? :: , current-key :: , current-element :: , current-element-setter :: , copy-state :: ); slow-char-set-iterator(set); end method forward-iteration-protocol; // This makes a conservative guess about the last character in the set define method guess-max-char (set :: ) let max-char :: = as(, 255); for (elem in set.single-chars) if (as-uppercase(elem) > max-char) max-char := as-uppercase(elem); end if; if (as-lowercase(elem) > max-char) max-char := as-lowercase(elem); end if; end for; for (range in set.char-ranges) let elem :: = range.tail; if (as-uppercase(elem) > max-char) max-char := as-uppercase(elem); end if; if (as-lowercase(elem) > max-char) max-char := as-lowercase(elem); end if; end for; max-char; end method guess-max-char; define method guess-max-char (set :: ) let max-char :: = as(, 255); for (elem in set.single-chars) if (elem > max-char) max-char := elem; end if; end for; for (range in set.char-ranges) let elem :: = range.tail; if (elem > max-char) max-char := elem; end if; end for; max-char; end method guess-max-char; // Plows through all possible characters, using member? to see if it's // a valid key. // define method slow-char-set-iterator (set :: ) => (initial-state :: , limit :: , next-state :: , finished-state? :: , current-key :: , current-element :: , current-element-setter :: , copy-state :: ); let max-char :: = if (set.negated-set?) $max-character; else guess-max-char(set); end if; local method find-next (set :: , ch :: , limit :: ) => (result :: false-or()); block (return) for (c = ch then successor(c), until: member?(c, set)) if (c == limit) return(#f) end if; finally c; end for; end block; end method find-next; values(find-next(set, as(, 0), max-char), #f, // next method (set :: , state :: ) => next-state :: false-or(); state ~== max-char & find-next(set, state.successor, max-char); end method, // finished? method (set :: , state :: false-or(), limit == #f) // KJP: -> #f => answer :: ; ~state; end method, // key method (set :: , state :: ) => state :: ; state; end method, // element method (set :: , state :: ) => state :: ; state; end method, // element-setter method (value, set :: , state :: ) => state :: ; error("Character sets are immutable"); end method, // copy-state method (set :: , state :: ) => state :: ; state; end method); end method slow-char-set-iterator; // ----------------------------------------------------------------- // has nothing to do with a hashtable // (). It's really just a vector that uses byte-characters instead // of integers as indices. // define class () constant slot jump-vector :: , // KJP: constant init-function: method () make(, size: 256, fill: #f) end; end class ; // This function doesn't believe in the concept of defaults. // The parameter is there only to make the compiler happy. // define method element (jt :: , key :: , #key default: default = #f) => elt :: ; jt.jump-vector [as(, key)]; end method element; define method element-setter (value, jt :: , key :: ) => value :: ; jt.jump-vector [as(, key)] := value; end method element-setter; define method forward-iteration-protocol (jt :: ) => (initial-state :: , limit :: , next-state :: , finished-state? :: , current-key :: , current-element :: , current-element-setter :: , copy-state :: ); values(0, 256, // init and limit method (coll, state) state + 1 end, // next-state method (coll, state, limit) state >= limit end, // finished-state? method (coll, state) as(, state) end, // current-key method (coll, state) jt.jump-vector[state] end, // current-elt method (value, coll, state) jt.jump-vector[state] := value end, // Current-elt-setter method (coll, state) state end); // copy-state end method forward-iteration-protocol; // Seals for file string-hacking.dylan // -- subclass of define sealed domain make(singleton()); // -- subclass of define sealed domain make(singleton()); // -- subclass of define sealed domain make(singleton()); define sealed domain initialize();