require 'htree/doc' require 'htree/elem' require 'htree/loc' require 'htree/extract_text' require 'uri' module HTree module Traverse def doc?() Doc::Trav === self end def elem?() Elem::Trav === self end def text?() Text::Trav === self end def xmldecl?() XMLDecl::Trav === self end def doctype?() DocType::Trav === self end def procins?() ProcIns::Trav === self end def comment?() Comment::Trav === self end def bogusetag?() BogusETag::Trav === self end def get_subnode(*indexes) n = self indexes.each {|index| n = n.get_subnode_internal(index) } n end end module Container::Trav # +each_child+ iterates over each child. def each_child(&block) # :yields: child_node children.each(&block) nil end # +each_child_with_index+ iterates over each child. def each_child_with_index(&block) # :yields: child_node, index children.each_with_index(&block) nil end # +find_element+ searches an element which universal name is specified by # the arguments. # It returns nil if not found. def find_element(*names) traverse_element(*names) {|e| return e } nil end # +traverse_element+ traverses elements in the tree. # It yields elements in depth first order. # # If _names_ are empty, it yields all elements. # If non-empty _names_ are given, it should be list of universal names. # # A nested element is yielded in depth first order as follows. # # t = HTree('') # t.traverse_element("a", "c") {|e| p e} # # => # {elem {elem {emptyelem } } {emptyelem } } # {emptyelem } # {emptyelem } # # Universal names are specified as follows. # # t = HTree(<<'End') # # # # # End # t.traverse_element("{http://www.w3.org/1999/xhtml}meta") {|e| p e} # # => # {emptyelem <{http://www.w3.org/1999/xhtml}meta name="robots" content="index,nofollow">} # {emptyelem <{http://www.w3.org/1999/xhtml}meta name="author" content="Who am I?">} # def traverse_element(*names, &block) # :yields: element if names.empty? traverse_all_element(&block) else name_set = {} names.each {|n| name_set[n] = true } traverse_some_element(name_set, &block) end nil end def each_hyperlink_attribute traverse_element( '{http://www.w3.org/1999/xhtml}a', '{http://www.w3.org/1999/xhtml}area', '{http://www.w3.org/1999/xhtml}link', '{http://www.w3.org/1999/xhtml}img', '{http://www.w3.org/1999/xhtml}object', '{http://www.w3.org/1999/xhtml}q', '{http://www.w3.org/1999/xhtml}blockquote', '{http://www.w3.org/1999/xhtml}ins', '{http://www.w3.org/1999/xhtml}del', '{http://www.w3.org/1999/xhtml}form', '{http://www.w3.org/1999/xhtml}input', '{http://www.w3.org/1999/xhtml}head', '{http://www.w3.org/1999/xhtml}base', '{http://www.w3.org/1999/xhtml}script') {|elem| case elem.name when %r{\{http://www.w3.org/1999/xhtml\}(?:base|a|area|link)\z}i attrs = ['href'] when %r{\{http://www.w3.org/1999/xhtml\}(?:img)\z}i attrs = ['src', 'longdesc', 'usemap'] when %r{\{http://www.w3.org/1999/xhtml\}(?:object)\z}i attrs = ['classid', 'codebase', 'data', 'usemap'] when %r{\{http://www.w3.org/1999/xhtml\}(?:q|blockquote|ins|del)\z}i attrs = ['cite'] when %r{\{http://www.w3.org/1999/xhtml\}(?:form)\z}i attrs = ['action'] when %r{\{http://www.w3.org/1999/xhtml\}(?:input)\z}i attrs = ['src', 'usemap'] when %r{\{http://www.w3.org/1999/xhtml\}(?:head)\z}i attrs = ['profile'] when %r{\{http://www.w3.org/1999/xhtml\}(?:script)\z}i attrs = ['src', 'for'] end attrs.each {|attr| if hyperlink = elem.get_attribute(attr) yield elem, attr, hyperlink end } } end private :each_hyperlink_attribute # +each_hyperlink_uri+ traverses hyperlinks such as HTML href attribute # of A element. # # It yields HTree::Text (or HTree::Loc) and URI for each hyperlink. # # The URI objects are created with a base URI which is given by # HTML BASE element or the argument ((|base_uri|)). # +each_hyperlink_uri+ doesn't yields href of the BASE element. def each_hyperlink_uri(base_uri=nil) # :yields: hyperlink, uri base_uri = URI.parse(base_uri) if String === base_uri links = [] each_hyperlink_attribute {|elem, attr, hyperlink| if %r{\{http://www.w3.org/1999/xhtml\}(?:base)\z}i =~ elem.name base_uri = URI.parse(hyperlink.to_s) else links << hyperlink end } if base_uri links.each {|hyperlink| yield hyperlink, base_uri + hyperlink.to_s } else links.each {|hyperlink| yield hyperlink, URI.parse(hyperlink.to_s) } end end # +each_hyperlink+ traverses hyperlinks such as HTML href attribute # of A element. # # It yields HTree::Text or HTree::Loc. # # Note that +each_hyperlink+ yields HTML href attribute of BASE element. def each_hyperlink # :yields: text links = [] each_hyperlink_attribute {|elem, attr, hyperlink| yield hyperlink } end # +each_uri+ traverses hyperlinks such as HTML href attribute # of A element. # # It yields URI for each hyperlink. # # The URI objects are created with a base URI which is given by # HTML BASE element or the argument ((|base_uri|)). def each_uri(base_uri=nil) # :yields: URI each_hyperlink_uri(base_uri) {|hyperlink, uri| yield uri } end end # :stopdoc: module Doc::Trav def traverse_all_element(&block) children.each {|c| c.traverse_all_element(&block) } end end module Elem::Trav def traverse_all_element(&block) yield self children.each {|c| c.traverse_all_element(&block) } end end module Leaf::Trav def traverse_all_element end end module Doc::Trav def traverse_some_element(name_set, &block) children.each {|c| c.traverse_some_element(name_set, &block) } end end module Elem::Trav def traverse_some_element(name_set, &block) yield self if name_set.include? self.name children.each {|c| c.traverse_some_element(name_set, &block) } end end module Leaf::Trav def traverse_some_element(name_set) end end # :startdoc: module Traverse # +traverse_text+ traverses texts in the tree def traverse_text(&block) # :yields: text traverse_text_internal(&block) nil end end # :stopdoc: module Container::Trav def traverse_text_internal(&block) each_child {|c| c.traverse_text_internal(&block) } end end module Leaf::Trav def traverse_text_internal end end module Text::Trav def traverse_text_internal yield self end end # :startdoc: module Container::Trav # +filter+ rebuilds the tree without some components. # # node.filter {|descendant_node| predicate } -> node # loc.filter {|descendant_loc| predicate } -> node # # +filter+ yields each node except top node. # If given block returns false, corresponding node is dropped. # If given block returns true, corresponding node is retained and # inner nodes are examined. # # +filter+ returns an node. # It doesn't return location object even if self is location object. # def filter(&block) subst = {} each_child_with_index {|descendant, i| if yield descendant if descendant.elem? subst[i] = descendant.filter(&block) else subst[i] = descendant end else subst[i] = nil end } to_node.subst_subnode(subst) end end module Doc::Trav # +title+ searches title and return it as a text. # It returns nil if not found. # # +title+ searchs following information. # # - ... in HTML # - ... in RSS # - ... in Atom def title e = find_element('title', '{http://www.w3.org/1999/xhtml}title', '{http://purl.org/rss/1.0/}title', '{http://my.netscape.com/rdf/simple/0.9/}title', '{http://www.w3.org/2005/Atom}title', '{http://purl.org/atom/ns#}title') e && e.extract_text end # +author+ searches author and return it as a text. # It returns nil if not found. # # +author+ searchs following information. # # - in HTML # - in HTML # - author-name in RSS # - author-name in RSS # - author-name in Atom def author traverse_element('meta', '{http://www.w3.org/1999/xhtml}meta') {|e| begin next unless e.fetch_attr('name').downcase == 'author' author = e.fetch_attribute('content').strip return author if !author.empty? rescue IndexError end } traverse_element('link', '{http://www.w3.org/1999/xhtml}link') {|e| begin next unless e.fetch_attr('rev').downcase == 'made' author = e.fetch_attribute('title').strip return author if !author.empty? rescue IndexError end } if channel = find_element('{http://purl.org/rss/1.0/}channel') channel.traverse_element('{http://purl.org/dc/elements/1.1/}creator') {|e| begin author = e.extract_text.strip return author if !author.empty? rescue IndexError end } channel.traverse_element('{http://purl.org/dc/elements/1.1/}publisher') {|e| begin author = e.extract_text.strip return author if !author.empty? rescue IndexError end } end ['http://www.w3.org/2005/Atom', 'http://purl.org/atom/ns#'].each {|xmlns| each_child {|top| next unless top.elem? if top.name == "{#{xmlns}}feed" if feed_author = find_element("{#{xmlns}}author") feed_author.traverse_element("{#{xmlns}}name") {|e| begin author = e.extract_text.strip return author if !author.empty? rescue IndexError end } end end } } nil end end module Doc::Trav # +root+ searches root element. # If there is no element on top level, it raise HTree::Error. # If there is two or more elements on top level, it raise HTree::Error. def root es = [] children.each {|c| es << c if c.elem? } raise HTree::Error, "no element" if es.empty? raise HTree::Error, "multiple top elements" if 1 < es.length es[0] end # +has_xmldecl?+ returns true if there is an XML declaration on top level. def has_xmldecl? children.each {|c| return true if c.xmldecl? } false end end module Elem::Trav # +name+ returns the universal name of the element as a string. # # p HTree('').root.name # # => # "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF" # def name() element_name.universal_name end # +qualified_name+ returns the qualified name of the element as a string. # # p HTree('').root.qualified_name # # => # "rdf:RDF" def qualified_name() element_name.qualified_name end # +attributes+ returns attributes as a hash. # The hash keys are HTree::Name objects. # The hash values are HTree::Text or HTree::Location objects. # # p HTree('').root.attributes # # => # {href=>{text "uu"}, name=>{text "xx"}} # # p HTree('').make_loc.root.attributes # # => # {href=>#, name=>#} # def attributes result = {} each_attribute {|name, text| result[name] = text } result end def each_attr each_attribute {|name, text| uname = name.universal_name str = text.to_s yield uname, str } end # call-seq: # elem.fetch_attribute(name) -> text or raise IndexError # elem.fetch_attribute(name, default) -> text or default # elem.fetch_attribute(name) {|uname| default } -> text or default # # +fetch_attribute+ returns an attribute value as a text. # # elem may be an instance of HTree::Elem or a location points to it. def fetch_attribute(uname, *rest, &block) if 1 < rest.length raise ArgumentError, "wrong number of arguments (#{1+rest.length} for 2)" end if !rest.empty? && block_given? raise ArgumentError, "block supersedes default value argument" end uname = uname.universal_name if uname.respond_to? :universal_name return update_attribute_hash.fetch(uname) { if block_given? return yield(uname) elsif !rest.empty? return rest[0] else raise IndexError, "attribute not found: #{uname.inspect}" end } end # call-seq: # elem.fetch_attr(name) -> string or raise IndexError # elem.fetch_attr(name, default) -> string or default # elem.fetch_attr(name) {|uname| default } -> string or default # # +fetch_attr+ returns an attribute value as a string. # # elem may be an instance of HTree::Elem or a location points to it. def fetch_attr(uname, *rest, &block) if 1 < rest.length raise ArgumentError, "wrong number of arguments (#{1+rest.length} for 2)" end if !rest.empty? && block_given? raise ArgumentError, "block supersedes default value argument" end uname = uname.universal_name if uname.respond_to? :universal_name return update_attribute_hash.fetch(uname) { if block_given? return yield(uname) elsif !rest.empty? return rest[0] else raise IndexError, "attribute not found: #{uname.inspect}" end }.to_s end def get_attribute(uname) uname = uname.universal_name if uname.respond_to? :universal_name update_attribute_hash[uname] end def get_attr(uname) if text = update_attribute_hash[uname] text.to_s else nil end end end end