'''
abstractParser.py

Copyright 2006 Andres Riancho

This file is part of w3af, w3af.sourceforge.net .

w3af is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation version 2 of the License.

w3af is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with w3af; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

'''

import core.controllers.outputManager as om
from core.controllers.w3afException import w3afException

from sgmllib import SGMLParser
import core.data.dc.form as form
import core.data.parsers.urlParser as urlParser
import string

class abstractParser(SGMLParser):
	'''
	This class is an abstract document parser.
	
	@author: Andres Riancho ( andres.riancho@gmail.com )
	'''
	def __init__(self, document, baseUrl, useTidy=True, verbose=0):
		'''
		Initialize the parser state and parse the document right away.
		
		@parameter document: The document to parse, it is handed to self._preParse().
		@parameter baseUrl: The URL of the document; the references that are found are joined with it.
		@parameter useTidy: Stored in self._useTidy, this method does not use it.
		@parameter verbose: Passed as is to SGMLParser.__init__().
		'''
		SGMLParser.__init__(self, verbose)
		
		# Where to look for URLs: the tags that may carry one, and the attributes that hold it
		self._tagsContainingURLs =  ('go', 'a','img', 'link', 'script', 'iframe', 'object',
				'embed', 'area', 'frame', 'applet', 'input', 'base',
				'div', 'layer', 'ilayer', 'bgsound', 'form')
		self._urlAttrs = ('href', 'src', 'data', 'action' )
		
		# Containers for what is found in the document
		self._urlsInDocument = []
		self._commentsInDocument = []
		self._metaRedirs = []
		self._forms = []
		
		# Parsing state flags
		self._insideForm = False
		self._insideSelect = False
		
		self._useTidy = useTidy
		
		# "setBaseUrl"
		self._baseUrl = baseUrl
		self._baseDomain = urlParser.getDomain(baseUrl)
		
		# Everything is set up, now we are ready to work
		self._preParse( document )

	def unknown_starttag(self, tag, attrs):
		'''
		Called by SGMLParser for each start tag that has no dedicated handler method.
		attrs is a list of (attr, value) tuples
		e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")]

		The tag is handed, in this order, to the reference, form and meta
		redirection finders.

		Note that improperly embedded non-HTML code (like client-side Javascript)
		may be parsed incorrectly by the ancestor, causing runtime script errors.
		All non-HTML code must be enclosed in HTML comment tags (<!-- code -->)
		to ensure that it will pass through this parser unaltered (in handle_comment).

		@parameter tag: The name of the tag that was opened.
		@parameter attrs: The attributes of the tag, a list of (attr, value) tuples.
		'''
		# Stores the URLs that the tag points to in self._urlsInDocument
		self._findReferences(tag, attrs)
		# NOTE(review): _findForms is not defined in the visible part of this class,
		# presumably a subclass provides it -- confirm.
		self._findForms(tag, attrs)
		# Stores the content of <meta http-equiv="refresh"> tags in self._metaRedirs
		self._findMetaRedir(tag, attrs)
	
	def _findMetaRedir( self, tag, attrs):
		'''
		Find meta tag redirections, like this one:
		<META HTTP-EQUIV="refresh" content="4;URL=http://www.f00.us/">
		'''
		if tag.lower() == 'meta':
			hasHTTP_EQUIV = False
			hasContent = False
			content = ''
			for attr in attrs:
				if attr[0].lower() == 'http-equiv' and attr[1].lower() == 'refresh':
					hasHTTP_EQUIV = True
				if attr[0].lower() == 'content':
					hasContent = True
					content = attr[1].lower()
			if hasContent and hasHTTP_EQUIV:
				self._metaRedirs.append( content )
	
	def _findReferences(self, tag, attrs):
		'''
		This method finds references inside an document document.
		'''
		if tag.lower() in self._tagsContainingURLs:
			for attr in attrs:
				if attr[0].lower() in self._urlAttrs:
					if len(  attr[1] ):
							if attr[1][0] != '#':
								self._urlsInDocument.append ( urlParser.urlJoin( self._baseUrl ,attr[1] ) )
								break
	
	def _parse(self, s):
		'''
		This method parses the document.
		
		@parameter s: The document to parse.
		'''
		try:
			self.feed(s)
			self.close()
		except Exception, e:
			raise w3afException('Exception found while parsing document. Exception: ' + str(e) )
		else:
			# Saves A LOT of memory
			# without this a run will use 4,799,936
			# with this, a run will use 113,696
			del self.rawdata
		
	def getAccounts( self , documentString ):
		'''
		@return: A list with all mail users that are present in the documentString.
		'''
		accounts = []
		
		if documentString.find('@') != -1:
			# This two for loops were taken from Sergio Alvarez fingergoogle.py
			for i in ('=','"', '\'','<br>', '[', ']', '<', '>', ':', ';', '&', '(', ')', '{', '}'):
				documentString = string.replace(documentString, i, ' ')
			documentString = string.split(documentString, '\n')
			for line in documentString:
				if line.count('@'+self._baseDomain):
					split = string.split(line, ' ')
					for i in split:
						if i.count('@'+self._baseDomain):
							if i[0] == '@':
								continue
							if string.split(i, '@')[1] != self._baseDomain:
								continue
							i = i[:-(len(self._baseDomain)+1)]
							if len(i) > 1:
								if i[-1] == '@':
									i = i[:-1]
							# If account aint in account list , then add
							if accounts.count(i) == 0:
								accounts.append(i)
		
		return accounts
	
	def handle_comment( self , text ):
		'''
		This method is called by the parser when a comment is found; the comment
		is stored in self._commentsInDocument.
		
		@parameter text: The comment that was handed over by the parser.
		'''
		self._commentsInDocument.append( text )