#!/usr/bin/ruby -w # vim:set ts=2 sw=2 expandtab: # xmlformat.rb - XML document reformatter # Copyright (c) 2004, 2005, Kitebird, LLC. All rights reserved. # Some portions are based on the REX shallow XML parser, which # is Copyright (c) 1998, Robert D. Cameron. These include the # regular expression parsing variables and the shallow_parse() # method. # This software is licensed as described in the file LICENSE, # which you should have received as part of this distribution. # Differences from Perl version: # - Pattern for classifying token as text node is different. # (cannot use !~ op for case) # - It's important to use \A and \z|\Z rather than ^ and $ in pattern # matches on tokens, because ^ and $ might match after/before a # newline for a token that spans multiple lines! require "getoptlong" PROG_NAME = "xmlformat" PROG_VERSION = "1.04" PROG_LANG = "Ruby" # ---------------------------------------------------------------------- # XMLFormat module # Contains: # - Methods for parsing XML document # - Methods for reading configuration file and operating on configuration # information. module XMLFormat # ---------------------------------------------------------------------- # Module methods # warn - print message to stderr # die - print message to stderr and exit def warn(*args) $stderr.print args end def die(*args) $stderr.print args exit(1) end # ---------------------------------------------------------------------- # Module variables - these do not vary per class invocation # Regular expressions for parsing document components. Based on REX. # Compared to Perl version, these variable names use more Ruby-like # lettercase. (Ruby likes to interpret variables that begin with # uppercase as constants.) 
# spe = shallow parsing expression # se = scanning expression # ce = completion expression # rsb = right square brackets # qm = question mark @@text_se = "[^<]+" @@until_hyphen = "[^-]*-" @@until_2_hyphens = "#{@@until_hyphen}(?:[^-]#{@@until_hyphen})*-" @@comment_ce = "#{@@until_2_hyphens}>?" @@until_rsbs = "[^\\]]*\\](?:[^\\]]+\\])*\\]+" @@cdata_ce = "#{@@until_rsbs}(?:[^\\]>]#{@@until_rsbs})*>" @@s = "[ \\n\\t\\r]+" @@name_strt = "[A-Za-z_:]|[^\\x00-\\x7F]" @@name_char = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]" @@name = "(?:#{@@name_strt})(?:#{@@name_char})*" @@quote_se = "\"[^\"]*\"|'[^']*'" @@dt_ident_se = "#{@@s}#{@@name}(?:#{@@s}(?:#{@@name}|#{@@quote_se}))*" @@markup_decl_ce = "(?:[^\\]\"'><]+|#{@@quote_se})*>" @@s1 = "[\\n\\r\\t ]" @@until_qms = "[^?]*\\?+" @@pi_tail = "\\?>|#{@@s1}#{@@until_qms}(?:[^>?]#{@@until_qms})*>" @@dt_item_se = "<(?:!(?:--#{@@until_2_hyphens}>|[^-]#{@@markup_decl_ce})|\\?#{@@name}(?:#{@@pi_tail}))|%#{@@name};|#{@@s}" @@doctype_ce = "#{@@dt_ident_se}(?:#{@@s})?(?:\\[(?:#{@@dt_item_se})*\\](?:#{@@s})?)?>?" @@decl_ce = "--(?:#{@@comment_ce})?|\\[CDATA\\[(?:#{@@cdata_ce})?|DOCTYPE(?:#{@@doctype_ce})?" @@pi_ce = "#{@@name}(?:#{@@pi_tail})?" @@end_tag_ce = "#{@@name}(?:#{@@s})?>?" @@att_val_se = "\"[^<\"]*\"|'[^<']*'" @@elem_tag_se = "#{@@name}(?:#{@@s}#{@@name}(?:#{@@s})?=(?:#{@@s})?(?:#{@@att_val_se}))*(?:#{@@s})?/?>?" 
@@markup_spe = "<(?:!(?:#{@@decl_ce})?|\\?(?:#{@@pi_ce})?|/(?:#{@@end_tag_ce})?|(?:#{@@elem_tag_se})?)" @@xml_spe = Regexp.new("#{@@text_se}|#{@@markup_spe}") # ---------------------------------------------------------------------- # Allowable formatting options and their possible values: # - The keys of this hash are the allowable option names # - The value for each key is list of allowable option values # - If the value is nil, the option value must be numeric # If any new formatting option is added to this program, it # must be specified here, *and* a default value for it should # be listed in the *DOCUMENT and *DEFAULT pseudo-element # option hashes. @@opt_list = { "format" => [ "block", "inline", "verbatim" ], "normalize" => [ "yes", "no" ], "subindent" => nil, "wrap-length" => nil, "entry-break" => nil, "exit-break" => nil, "element-break" => nil } class XMLFormatter # Object creation: set up the default formatting configuration # and variables for maintaining input and output document. def initialize # Formatting options for each element. @elt_opts = { } # The formatting options for the *DOCUMENT and *DEFAULT pseudo-elements can # be overridden in the configuration file, but the options must also be # built in to make sure they exist if not specified in the configuration # file. Each of the structures must have a value for every option. # Options for top-level document children. # - Do not change entry-break: 0 ensures no extra newlines before # first element of output. # - Do not change exit-break: 1 ensures a newline after final element # of output document. # - It's probably best not to change any of the others, except perhaps # if you want to increase the element-break. @elt_opts["*DOCUMENT"] = { "format" => "block", "normalize" => "no", "subindent" => 0, "wrap-length" => 0, "entry-break" => 0, # do not change "exit-break" => 1, # do not change "element-break" => 1 } # Default options. 
These are used for any elements in the document # that are not specified explicitly in the configuration file. @elt_opts["*DEFAULT"] = { "format" => "block", "normalize" => "no", "subindent" => 1, "wrap-length" => 0, "entry-break" => 1, "exit-break" => 1, "element-break" => 1 } # Run the *DOCUMENT and *DEFAULT options through the option-checker # to verify that the built-in values are legal. err_count = 0 @elt_opts.keys.each do |elt_name| # ... for each element @elt_opts[elt_name].each do |opt_name, opt_val| # ... for each option opt_val, err_msg = check_option(opt_name, opt_val) if err_msg.nil? @elt_opts[elt_name][opt_name] = opt_val else warn "LOGIC ERROR: #{elt_name} default option is invalid\n" warn "#{err_msg}\n" err_count += 1 end end end # Make sure that the every option is represented in the # *DOCUMENT and *DEFAULT structures. @@opt_list.keys.each do |opt_name| @elt_opts.keys.each do |elt_name| if !@elt_opts[elt_name].has_key?(opt_name) warn "LOGIC ERROR: #{elt_name} has no default '#{opt_name}' option\n" err_count += 1 end end end if err_count > 0 raise "Cannot continue; internal default formatting options must be fixed" end end # Initialize the variables that are used per-document def init_doc_vars # Elements that are used in the document but not named explicitly # in the configuration file. @unconf_elts = { } # List of tokens for current document. @tokens = [ ] # List of line numbers for each token @line_num = [ ] # Document node tree (constructed from the token list) @tree = [ ] # Variables for formatting operations: # @out_doc = resulting output document (constructed from document tree) # @pending = array of pending tokens being held until flushed @out_doc = "" @pending = [ ] # Inline elements within block elements are processed using the # text normalization (and possible line-wrapping) values of their # enclosing block. 
Blocks and inlines may be nested, so we maintain # a stack that allows the normalize/wrap-length values of the current # block to be determined. @block_name_stack = [ ] # for debugging @block_opts_stack = [ ] # A similar stack for maintaining each block's current break type. @block_break_type_stack = [ ] end # Accessors for token list and resulting output document def tokens return @tokens end def out_doc return @out_doc end # Methods for adding strings to output document or # to the pending output array def add_to_doc(str) @out_doc << str end def add_to_pending(str) @pending << str end # Block stack maintenance methods # Push options onto or pop options off from the stack. When doing # this, also push or pop an element onto the break-level stack. def begin_block(name, opts) @block_name_stack << name @block_opts_stack << opts @block_break_type_stack << "entry-break" end def end_block @block_name_stack.pop @block_opts_stack.pop @block_break_type_stack.pop end # Return the current block's normalization status or wrap length def block_normalize return @block_opts_stack.last["normalize"] == "yes" end def block_wrap_length return @block_opts_stack.last["wrap-length"] end # Set the current block's break type, or return the number of newlines # for the block's break type def set_block_break_type(type) @block_break_type_stack[@block_break_type_stack.size-1] = type end def block_break_value return @block_opts_stack.last[@block_break_type_stack.last] end # Read configuration information. For each element, construct a hash # containing a hash key and value for each option name and value. # After reading the file, fill in missing option values for # incomplete option structures using the *DEFAULT options. def read_config(conf_file) elt_names = nil in_continuation = false saved_line = "" File.open(conf_file) do |fh| fh.each_line do |line| line.chomp! 
next if line =~ /^\s*($|#)/ # skip blank lines, comments if in_continuation line = saved_line + " " + line saved_line = "" in_continuation = false end if line !~ /^\s/ # Line doesn't begin with whitespace, so it lists element names. # Names are separated by whitespace or commas, possibly followed # by a continuation character or comment. if line =~ /\\$/ in_continuation = true saved_line = line.sub(/\\$/, "") # remove continuation character next end line.sub!(/\s*#.*$/, "") # remove any trailing comment elt_names = line.split(/[\s,]+/) # make sure each name has an entry in the elt_opts structure elt_names.each do |elt_name| @elt_opts[elt_name] = { } unless @elt_opts.has_key?(elt_name) end else # Line begins with whitespace, so it contains an option # to apply to the current element list, possibly followed by # a comment. First check that there is a current list. # Then parse the option name/value. if elt_names.nil? raise "#{conf_file}:#{$.}: Option setting found before any " + "elements were named.\n" end line.sub!(/\s*#.*$/, "") line =~ /^\s*(\S+)(?:\s+|\s*=\s*)(\S+)$/ opt_name, opt_val = $1, $2 raise "#{conf_file}:#{$.}: Malformed line: #{$_}" if opt_val.nil? # Check option. If illegal, die with message. Otherwise, # add option to each element in current element list opt_val, err_msg = check_option(opt_name, opt_val) raise "#{conf_file}:#{$.}: #{err_msg}\n" unless err_msg.nil? elt_names.each do |elt_name| @elt_opts[elt_name][opt_name] = opt_val end end end end # For any element that has missing option values, fill in the values # using the options for the *DEFAULT pseudo-element. This speeds up # element option lookups later. It also makes it unnecessary to test # each option to see if it's defined: All element option structures # will have every option defined. 
def_opts = @elt_opts["*DEFAULT"] @elt_opts.keys.each do |elt_name| next if elt_name == "*DEFAULT" def_opts.keys.each do |opt_name| next if @elt_opts[elt_name].has_key?(opt_name) # already set @elt_opts[elt_name][opt_name] = def_opts[opt_name] end end end # Check option name to make sure it's legal. Check the value to make sure # that it's legal for the name. Return a two-element array: # (value, nil) if the option name and value are legal. # (nil, message) if an error was found; message contains error message. # For legal values, the returned value should be assigned to the option, # because it may get type-converted here. def check_option(opt_name, opt_val) # - Check option name to make sure it's a legal option # - Then check the value. If there is a list of values # the value must be one of them. Otherwise, the value # must be an integer. if !@@opt_list.has_key?(opt_name) return [ nil, "Unknown option name: #{opt_name}" ] end allowable_val = @@opt_list[opt_name] if !allowable_val.nil? if !allowable_val.find { |val| val == opt_val } return [ nil, "Unknown '#{opt_name}' value: #{opt_val}" ] end elsif !opt_val.is_a?(Integer) if opt_val =~ /^\d+$/ opt_val = opt_val.to_i else return [ nil, "'#{opt_name}' value (#{opt_val}) should be an integer" ] end end return [ opt_val, nil ] end private :check_option # Return hash of option values for a given element. If no options are found: # - Add the element name to the list of unconfigured options. # - Assign the default options to the element. (This way the test for the # option fails only once.) def get_opts(elt_name) opts = @elt_opts[elt_name] if opts.nil? @unconf_elts[elt_name] = 1 opts = @elt_opts[elt_name] = @elt_opts["*DEFAULT"] end return opts end private :get_opts # Display contents of configuration options to be used to process document. # For each element named in the elt_opts structure, display its format # type, and those options that apply to the type. 
def display_config
  # For each format type, the extra options that are meaningful for it.
  # Only "block" elements have any.
  applicable = {
    "block" => [
      "entry-break", "element-break", "exit-break",
      "subindent", "normalize", "wrap-length"
    ],
    "inline" => [ ],
    "verbatim" => [ ]
  }

  # One stanza per element, in sorted name order: the name, its format
  # type, the options that apply to that type, then a blank line.
  @elt_opts.keys.sort.each do |name|
    settings = @elt_opts[name]
    kind = settings["format"]

    puts name
    puts " format = #{kind}"
    applicable[kind].each { |opt| puts " #{opt} = #{settings[opt]}" }
    puts
  end
end

# Display the list of elements that are used in the document but not
# configured in the configuration file.
# Afterwards, remove those elements' entries from the option table so that
# they are not treated as configured for the next document, if there is one.

def display_unconfigured_elements
  names = @unconf_elts.keys

  if names.empty?
    puts "The document contains no unconfigured elements."
  else
    puts "The following document elements were assigned no formatting options:"
    wrapped = line_wrap(names.sort.join(" "), 0, 0, 65)
    puts wrapped.join("\n")
  end

  names.each { |name| @elt_opts.delete(name) }
end

# ----------------------------------------------------------------------

# Main document processing routine.
# - Argument is a string representing an input document
# - Return value is the reformatted document, or nil. A nil return
#   signifies either that an error occurred, or that some option was
#   given that suppresses document output. In either case, don't write
#   any output for the document. Any error messages will already have
#   been printed when this returns.
def process_doc(doc, verbose, check_parser, canonize_only, show_unconf_elts)
  init_doc_vars

  # Lexical pass: split the document text into a flat list of tokens.
  warn "Parsing document...\n" if verbose
  shallow_parse(doc)

  if check_parser
    warn "Checking parser...\n" if verbose
    # Joining the tokens back together must reproduce the input exactly.
    verdict =
      if doc == tokens.join("")
        "Parser is okay"
      else
        "PARSER ERROR: document token concatenation differs from document"
      end
    puts verdict
    return nil
  end

  # Record the input line on which each token starts.
  assign_line_numbers

  # Stop if the parser produced any error tokens.
  warn "Checking document for errors...\n" if verbose
  if report_errors > 0
    warn "Cannot continue processing document.\n"
    return nil
  end

  # Build the node tree from the token list.
  warn "Convert document tokens to tree...\n" if verbose
  if tokens_to_tree > 0
    warn "Cannot continue processing document.\n"
    return nil
  end

  # Disabled integrity check: stringify the tree and compare it with the
  # original document; a mismatch would indicate that the to-tree or the
  # stringify step is broken.
  #str = tree_stringify
  #print str
  #warn "ERROR: mismatch between document and resulting string\n" if doc != str

  # Strip extraneous whitespace from the tree.
  warn "Canonizing document tree...\n" if verbose
  tree_canonize

  if canonize_only
    puts tree_stringify
    return nil
  end

  # Canonizing looks up the formatting options of every element in the
  # document, which builds the list of elements that have no explicit
  # configuration. Show that list and stop if the user asked for it.
  if show_unconf_elts
    display_unconfigured_elements
    return nil
  end

  # Produce the formatted XML as a single string.
  warn "Formatting document tree...\n" if verbose
  tree_format

  # A non-empty result should already end with a newline when the *DOCUMENT
  # options have exit-break = 1. If it does not, supply one, but say so
  # rather than patching it silently.
  result = out_doc
  unless result.empty? || result =~ /\n\z/
    warn "LOGIC ERROR: trailing newline had to be added\n"
    result << "\n"
  end

  result
end

# ----------------------------------------------------------------------

# Parse XML document into array of tokens and store array

def shallow_parse(xml_document)
  @tokens = xml_document.scan(@@xml_spe)
end

# ----------------------------------------------------------------------

# Extract a tag name from a tag and return it. This uses a subset
# of the document-parsing pattern elements.

# Raises an exception if the name cannot be found, because this is supposed
# to be called only with a legal tag.

def extract_tag_name(tag)
  name_match = /\A<\/?(#{@@name})/.match(tag)
  raise "Cannot find tag name in tag: #{tag}" if name_match.nil?
  name_match[1]
end
private :extract_tag_name

# ----------------------------------------------------------------------

# Assign an input line number to each token. The number indicates
# the line number on which the token begins.

def assign_line_numbers
  @line_num = [ ]
  next_line = 1
  @tokens.each do |tok|
    @line_num.push(next_line)
    # Each newline inside this token pushes the next token one line down.
    next_line += tok.count("\n")
  end
end
private :assign_line_numbers

# ----------------------------------------------------------------------

# Check token list for errors and report any that are found. Error
# tokens are those that begin with "<" but do not end with ">".
# Returns the error count.
# Does not modify the original token list.
def report_errors
  err_count = 0

  @tokens.each_with_index do |token, i|
    # An error token is markup that was never closed: it begins with "<"
    # but its last character is not ">". Anchor with \A and \z, not ^ and $,
    # because a token may span several lines.
    if token =~ /\A</ && token !~ />\z/
      warn "Malformed token at line #{@line_num[i]}, token #{i+1}: #{token}\n"
      err_count += 1
    end
  end

  warn "Number of errors found: #{err_count}\n" if err_count > 0
  return err_count
end

# ----------------------------------------------------------------------

# Helper routine to print tag stack for tokens_to_tree
# label is printed as a heading; stack is the list of tags, which are
# printed one per line and numbered from 1 in array order.

def print_tag_stack(label, stack)
  if stack.empty?
    warn " #{label}: none\n"
  else
    warn " #{label}:\n"
    stack.each_with_index do |tag, i|
      warn " #{i+1}: #{tag}\n"
    end
  end
end

# NOTE(review): tokens_to_tree is cut off at the end of this view, so the rest of this source line is carried over verbatim on the next line.
# Convert the list of XML document tokens to a tree representation. # The implementation uses a loop and a stack rather than recursion. # Does not modify the original token list. # Returns an error count. def tokens_to_tree tag_stack = [ ] # stack for element tags children_stack = [ ] # stack for lists of children children = [ ] # current list of children err_count = 0 # Note: the text token pattern test assumes that all text tokens # are non-empty. This should be true, because REX doesn't create # empty tokens. @tokens.each_index do |i| token = @tokens[i] line_num = @line_num[i] tok_err = "Error near line #{line_num}, token #{i+1} (#{token})" case token when /\A[^<]/ # text children << text_node(token) when /\A