'''
htmlParser.py

Copyright 2006 Andres Riancho

This file is part of w3af, w3af.sourceforge.net .

w3af is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation version 2 of the License.

w3af is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with w3af; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

'''

import core.controllers.outputManager as om
from core.controllers.w3afException import w3afException
import core.data.kb.config as cf
# Pick a utidy implementation: the copy bundled in extlib is preferred, the
# system wide "tidy" module is the fallback.
# "except Exception" replaces the bare "except" so that KeyboardInterrupt and
# SystemExit are no longer swallowed while importing. It is deliberately not
# narrowed to ImportError.
# NOTE(review): presumably the import can also fail with other errors when the
# native tidy library is missing - confirm before narrowing further.
try:
	import extlib.utidylib as tidy
	om.out.debug('htmlParser is using the bundled utidy library')
except Exception:
	try:
		import tidy
		om.out.debug('htmlParser is using the systems utidy library')
	except Exception:
		raise w3afException('You have to install utidy lib.')

from core.data.parsers.abstractParser import abstractParser
import core.data.parsers.urlParser as urlParser

import core.data.dc.form as form

class htmlParser(abstractParser):
	'''
	This class parses HTML's.
	
	@author: Andres Riancho ( andres.riancho@gmail.com )
	'''
	
	
	def __init__(self, document, baseUrl, useTidy=True, verbose=0):
		'''
		Sets the parser configuration and then delegates to abstractParser.__init__.
		
		@parameter document: Forwarded untouched to abstractParser.__init__.
		@parameter baseUrl: Forwarded untouched to abstractParser.__init__.
		@parameter useTidy: Forwarded untouched to abstractParser.__init__.
		@parameter verbose: Forwarded untouched to abstractParser.__init__.
		'''
		# Keyword arguments handed to tidy.parseString() by _preParse()
		self._tidyOptions = {
			'tidy_mark': 0,
			'quote_ampersand': 0,
			'quote_marks': 0,
			'quote_nbsp': 0,
			'force_output': 1,
		}
		
		# NOTE(review): these two tuples are not read in this class; presumably the
		# base class uses them to locate URLs inside tags - confirm.
		self._tagsContainingURLs = (
			'a', 'img', 'link', 'script', 'iframe', 'object',
			'embed', 'area', 'frame', 'applet', 'input', 'base',
			'div', 'layer', 'ilayer', 'bgsound', 'form',
		)
		self._urlAttrs = ( 'href', 'src', 'data', 'action' )
		
		# Only <input> tags with one of these types are added to a form by _findForms()
		self._goodInputType = [
			'password', 'text', 'checkbox', 'radio', 'hidden', 'file', 'submit',
		]
		
		# The attributes above are set before the base class constructor runs
		abstractParser.__init__(self, document, baseUrl, useTidy, verbose)
		
	def _preParse( self, HTMLDocument ):
		assert self._baseUrl != '', 'The base URL must be setted.'
		# If tidy is on, we should "fix" the html with it
		if self._useTidy:
			HTMLDocument = str ( tidy.parseString(HTMLDocument, **self._tidyOptions) )
		
		# Now we are ready to work
		self._parse ( HTMLDocument )
	
	def getForms( self, HTML ):
		'''
		@parameter HTML: The HTML to parse.
		@return: Returns list of forms.
		'''		
		return self._forms
		
	def getReferences( self, HTML ):
		'''
		Searches for references on a page. w3af searches references in every html tag, including:
			- a
			- forms
			- images
			- frames
			- etc.
		
		@parameter HTML: The HTML to parse.
		@return: Returns list of links.
		'''
		return set( self._urlsInDocument )

		
	def getComments( self, HTML ):
		'''
		@parameter HTML: The HTML to parse.
		@return: Returns list of comment strings.
		'''
		return set( self._commentsInDocument )
	
	def getMetaRedir( self, HTML ):
		'''
		@parameter HTML: The HTML to parse.
		@return: Returns list of meta redirections.
		'''
		return set( self._metaRedirs )
			
	def unknown_endtag(self, tag):         
		'''
		called for each end tag, e.g. for </pre>, tag will be "pre"
		'''
		if tag.lower() == 'form' :
			self._insideForm = False
		
		if tag.lower() == 'select' :
			self._insideSelect = False
		
	def _findForms(self, tag, attrs):
		'''
		This method finds forms inside an HTML document.
		'''
		
		'''
		<FORM action="http://somesite.com/prog/adduser" method="post">
		<P>
				<LABEL for="firstname">First name: </LABEL>
				<INPUT type="text" id="firstname"><BR>
				<LABEL for="lastname">Last name: </LABEL>
				<INPUT type="text" id="lastname"><BR>
				<LABEL for="email">email: </LABEL>
				<INPUT type="text" id="email"><BR>
				<INPUT type="radio" name="sex" value="Male"> Male<BR>
				<INPUT type="radio" name="sex" value="Female"> Female<BR>
				<INPUT type="submit" value="Send"> <INPUT type="reset">
		</P>
		</FORM>
		'''
		
		if tag.lower() == 'form' :
			#Find the method
			method = 'GET'
			foundMethod = False
			for attr in attrs:
				if attr[0].lower() == 'method':
					method = attr[1].upper()
					foundMethod = True
			
			if not foundMethod:
				om.out.debug('htmlParser found a form without a method. Using GET as the default.')
			
			#Find the action
			foundAction = False
			for attr in attrs:
				if attr[0].lower() == 'action':
					action = urlParser.urlJoin( self._baseUrl ,attr[1] )
					foundAction = True
					
			if not foundAction:
				om.out.debug('htmlParser found a form without an action. Javascript is being used.')
				# <form name="frmRegistrar" onsubmit="valida();">
			else:
				self._insideForm = True
				f = form.form()
				f.setMethod( method )			
				f.setAction( action )
				self._forms.append( f )
		
		if self._insideForm:
			# I am inside a form, I should parse input tags
			if tag.lower() == 'input':
				for attr in attrs:
					if attr[0].lower() == 'type':
						break
				if attr[1].lower() in self._goodInputType:
					# We are working with the last form
					f = self._forms[ len(self._forms) -1 ]
					f.addInput( attrs )
					
					if attr[1].lower() == 'file':
						f.hasFileInput = True
						f.addFileInput( attrs )
					
			elif tag.lower() == 'select':
				self._insideSelect = True
				try:
					self._selectTagName = [ v[1] for v in attrs if v[0].lower() in ['name','id'] ][0]
				except:
					om.out.debug('htmlParser found a select tag without a name attr !')
			
			if self._insideSelect:
				if tag.lower() == 'option':
					# We are working with the last form in the list
					f = self._forms[ len(self._forms) -1 ]
					attrs.append( ('name',self._selectTagName) ) 
					f.addInput( attrs )