'''
abstractParser.py

Copyright 2006 Andres Riancho

This file is part of w3af, w3af.sourceforge.net .

w3af is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation version 2 of the License.

w3af is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with w3af; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

'''

import core.controllers.outputManager as om
from core.controllers.w3afException import w3afException
from sgmllib import SGMLParser
import core.data.dc.form as form
import core.data.parsers.urlParser as urlParser
import string

class abstractParser(SGMLParser):
	'''
	This class is an abstract document parser.
	
	@author: Andres Riancho ( andres.riancho@gmail.com )
	'''
	def __init__(self, document, baseUrl, useTidy=True, verbose=0):
		'''
		Set up the parser state and start processing the document.
		
		@parameter document: The document to parse; it is handed to _preParse().
		@parameter baseUrl: The URL used as base when joining the references found in the document.
		@parameter useTidy: Stored as self._useTidy.
		@parameter verbose: Passed through to SGMLParser.__init__().
		'''
		SGMLParser.__init__(self, verbose)
		
		# Placeholders; both are overwritten with the real values below
		self._baseUrl = ''
		self._baseDomain = ''
		
		# Tag names whose attributes _findReferences() inspects for URLs
		self._tagsContainingURLs = ('go', 'a','img', 'link', 'script', 'iframe', 'object',
				'embed', 'area', 'frame', 'applet', 'input', 'base',
				'div', 'layer', 'ilayer', 'bgsound', 'form')
		# Attribute names whose value _findReferences() treats as a URL
		self._urlAttrs = ('href', 'src', 'data', 'action' )
		
		# URLs collected by _findReferences(), already joined with the base URL
		self._urlsInDocument = []
		# Form related state; presumably maintained by _findForms(), which is
		# not defined in this part of the file
		self._forms = []
		self._insideForm = False
		self._insideSelect = False
		# Comment texts collected by handle_comment()
		self._commentsInDocument = []
		# Lowercased "content" values of meta refresh tags, collected by _findMetaRedir()
		self._metaRedirs = []
		
		# "setBaseUrl"
		self._baseUrl = baseUrl
		self._baseDomain = urlParser.getDomain(baseUrl)
		
		# Only stored here; none of the methods in this part of the file read it
		self._useTidy = useTidy
		
		# Now we are ready to work
		# NOTE(review): _preParse() is not defined in this part of the file;
		# presumably it is provided by the subclasses - confirm.
		self._preParse( document )
	
	def unknown_starttag(self, tag, attrs):
		'''
		Called for each start tag; the tag is handed to the reference, form and
		meta redirection finders.
		
		attrs is a list of (attr, value) tuples
		e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")]

		Note that improperly embedded non-HTML code (like client-side Javascript)
		may be parsed incorrectly by the ancestor, causing runtime script errors.
		All non-HTML code must be enclosed in HTML comment tags (<!-- code -->)
		to ensure that it will pass through this parser unaltered (in handle_comment).
		'''
		self._findReferences(tag, attrs)
		self._findForms(tag, attrs)
		self._findMetaRedir(tag, attrs)
	
	def _findMetaRedir( self, tag, attrs):
		'''
		Find meta tag redirections, like this one:
		
		'''
		if tag.lower() == 'meta':
			hasHTTP_EQUIV = False
			hasContent = False
			content = ''
			for attr in attrs:
				if attr[0].lower() == 'http-equiv' and attr[1].lower() == 'refresh':
					hasHTTP_EQUIV = True
				if attr[0].lower() == 'content':
					hasContent = True
					content = attr[1].lower()
			if hasContent and hasHTTP_EQUIV:
				self._metaRedirs.append( content )
	
	def _findReferences(self, tag, attrs):
		'''
		This method finds references inside an document document.
		'''
		if tag.lower() in self._tagsContainingURLs:
			for attr in attrs:
				if attr[0].lower() in self._urlAttrs:
					if len(  attr[1] ):
							if attr[1][0] != '#':
								self._urlsInDocument.append ( urlParser.urlJoin( self._baseUrl ,attr[1] ) )
								break
	
	def _parse(self, s):
		'''
		This method parses the document.
		
		@parameter s: The document to parse.
		'''
		try:
			self.feed(s)
			self.close()
		except Exception, e:
			raise w3afException('Exception found while parsing document. Exception: ' + str(e) )
		else:
			# Saves A LOT of memory
			# without this a run will use 4,799,936
			# with this, a run will use 113,696
			del self.rawdata
		
	def getAccounts( self , documentString ):
		'''
		@return: A list with all mail users that are present in the documentString.
		'''
		accounts = []
		
		if documentString.find('@') != -1:
			# This two for loops were taken from Sergio Alvarez fingergoogle.py
			for i in ('=','"', '\'','
', '[', ']', '<', '>', ':', ';', '&', '(', ')', '{', '}'): documentString = string.replace(documentString, i, ' ') documentString = string.split(documentString, '\n') for line in documentString: if line.count('@'+self._baseDomain): split = string.split(line, ' ') for i in split: if i.count('@'+self._baseDomain): if i[0] == '@': continue if string.split(i, '@')[1] != self._baseDomain: continue i = i[:-(len(self._baseDomain)+1)] if len(i) > 1: if i[-1] == '@': i = i[:-1] # If account aint in account list , then add if accounts.count(i) == 0: accounts.append(i) return accounts def handle_comment( self , text ): ''' This method is called by parse when a comment is found. ''' self._commentsInDocument.append( text )