Module: java-parser Author: Gail Zacharias Copyright: Original Code is Copyright (c) 1995-2004 Functional Objects, Inc. All rights reserved. License: Functional Objects Library Public License Version 1.0 Dual-license: GNU Lesser General Public License Warranty: Distributed WITHOUT WARRANTY OF ANY KIND define constant $max-lexer-code :: = 255; define class () constant slot tokenizer-result :: false-or(), required-init-keyword: result:; constant slot tokenizer-transitions :: false-or(), required-init-keyword: transitions:; end class ; define method add-transition (table :: , on :: , new-state :: ) debug-assert(~table[on], "input %= transitions to both %= and %=", as(, on), table[on], new-state); table[on] := new-state; end; define inline method add-transition (table :: , on :: , new-state :: ) add-transition(table, as(, on), new-state); end method add-transition; define method add-transition (table :: , on :: , new-state :: ) let last = #f; let range = #f; for (char :: in on) if (range) if (last) for (i :: from as(, last) + 1 to as(, char)) add-transition(table, i, new-state); end for; last := #f; else add-transition(table, '-', new-state); add-transition(table, char, new-state); last := char; end if; range := #f; elseif (char == '-') range := #t; else add-transition(table, char, new-state); last := char; end if; end for; end method add-transition; define method make-tokenizer-state (name :: , result :: false-or(), #rest transitions) let table = size(transitions) > 0 & make(, size: $max-lexer-code + 1, fill: #f); for (transition :: in transitions) add-transition(table, head(transition), tail(transition)); end for; pair(name, make(, result: result, transitions: table)); end make-tokenizer-state; define inline-only function compile-state-machine (#rest states) => (state :: ) // make a hash table mapping state names to states. let state-table = make(); for (state-pair :: in states) let name = state-pair.head; debug-assert(~element(state-table, name, default: #f), "State %= multiply defined.", name); state-table[name] := state-pair.tail; end for; // Now that we have a table mapping state names to states, change the // entries in the transition tables to refer to the new state // object themselves instead of just to the new state name. for (state-pair :: in states) let state :: = state-pair.tail; let table = state.tokenizer-transitions; if (table) for (i from 0 to $max-lexer-code) let new-state = table[i]; if (new-state) table[i] := state-table[new-state]; end if; end for; end if; end for; element(state-table, #"start"); end compile-state-machine; define macro state-machine-definer { define state-machine ?variable-name:name ?states end } => { define constant ?variable-name :: = compile-state-machine(?states) } states: { } => { } { state ?state-name:token ?action:expression, ?actions ; ... } => { make-tokenizer-state(?state-name, ?action, ?actions), ... } actions: { } => { } // TODO: since we never use an actual expression for chars, // we could do #(?chars, ?next-state), save load-time consing... // Do this after verify that \<> escapes work. { ?next-state:token ?chars:expression, ... } => { pair(?chars, ?next-state), ... } end macro; define function hex-shift (value :: false-or(), h :: ) => (new-value :: false-or()) when (value) let n = if (48 <= h & h <= 57) h - 48 elseif (65 <= h & h <= 70) h - 55 elseif (97 <= h & h <= 102) h - 87 else #f end; when (n) value * 16 + n end; end; end; // TODO: THIS NEEDS TO HANDLE UNICODE ESCAPES define function extract-string (contents :: , start-pos :: , end-pos :: ) let bytes = end-pos - start-pos; let string :: = make(, size: bytes); copy-bytes(contents, start-pos, string, 0, bytes); string end; define method get-token-from-contents (initial-state :: , contents :: , initial-position :: , context) => (token-or-eoi, new-position :: ) let length :: = contents.size; let even? = #t; local method readch (posn :: ) => (ch :: , posn :: ) let ch = contents[posn]; if (ch ~== as(, '\\')) even? := #t; values(ch, posn + 1) elseif (even? & posn + 1 < length & contents[posn + 1] == as(, 'u')) iterate unicode (pos :: = posn + 2) if (pos < length & contents[pos] == as(, 'u')) unicode(pos + 1); else let val = pos + 3 < length & hex-shift(hex-shift(hex-shift(hex-shift(0,contents[pos]), contents[pos + 1]), contents[pos + 2]), contents[pos + 3]); if (val) values(val, pos + 3) else parse-error(context, "Invalid escape: %s", extract-string(contents, posn, posn + 3)); even? := #f; values(ch, posn + 1); end; end; end iterate; else even? := ~even?; values(ch, posn + 1) end; end method; let result-function :: false-or() = #f; let result-start :: = initial-position; let result-end :: false-or() = #f; iterate loop (state :: = initial-state, posn :: = initial-position) when (state == initial-state) result-function := #f; result-start := posn; result-end := #f; end; when (state.tokenizer-result) // It is an accepting state, so record the result and where it ended. result-function := state.tokenizer-result; result-end := posn; end; // Try advancing the state machine once more if possible. let table = state.tokenizer-transitions; let (new-state, next-pos) = when (posn < length) let (ch, next-pos) = readch(posn); values(table & ch <= $max-lexer-code & table[ch], next-pos) end; if (new-state) loop(new-state, next-pos); elseif (result-function) let string = extract-string(contents, result-start, result-end); values(result-function(context, string), result-end) elseif (result-start == length) values(#f, length) // EOI else let end-pos = next-pos | length; parse-error(context, "Invalid token: %s", extract-string(contents, result-start, result-end)); get-token-from-contents(initial-state, contents, end-pos, context); end; end iterate; end method get-token-from-contents; define method parse-error (context :: , #rest format-args) apply(error, format-args) end;