'''
urlParser.py

Copyright 2006 Andres Riancho

This file is part of w3af, w3af.sourceforge.net .

w3af is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation version 2 of the License.

w3af is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with w3af; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

'''

from core.controllers.w3afException import w3afException
from core.data.dc.queryString import queryString
import core.controllers.outputManager as om
import urlparse as _uparse
import urllib
import cgi
import socket
import re

'''
This module parses URLs.

@author: Andres Riancho ( andres.riancho@gmail.com )
'''

def hasQueryString( url):
	'''
	Tell whether a URI carries a query string.
	
	@parameter url: The URI to inspect.
	@return: True when the query component of the parsed URI is not empty, False otherwise.
	'''
	# urlparse yields (scheme, netloc, path, params, query, fragment)
	query = _uparse.urlparse( url )[4]
	return query != ''

def getQueryString( url ):
	'''
	Parse the query string of a URL into a queryString object.
	
	@parameter url: The url with the query string to parse.
	@return: A queryString object that maps every parameter name to its FIRST value
	(later values of a repeated parameter are dropped). The object is empty when the
	URL has no query string or when the query string does not pass strict parsing.
	Example :
		- input url : http://localhost/foo.asp?xx=yy&bb=dd
		- output : { xx:yy , bb:dd }
	'''
	result = queryString()
	
	# urlparse yields (scheme, netloc, path, params, query, fragment)
	qs = _uparse.urlparse( url )[4]
	if qs == '':
		return result
	
	try:
		parsedQs = cgi.parse_qs( qs, keep_blank_values=True, strict_parsing=True )
	except ValueError:
		# strict_parsing reports malformed fields with ValueError. This is a best
		# effort parser, so a query string that can not be parsed yields an empty
		# result; any other exception is a real error and is allowed to propagate.
		return result
	
	for name, values in parsedQs.items():
		result[ name ] = values[0]
	
	return result

def uri2url( url):
	'''
	Reduce a URI to its scheme, network location and path.
	
	@parameter url: The URI to reduce; parameters, query string and fragment are discarded.
	@return: A string with the URL without the query string. Example :
		- input url : http://localhost/foo.asp?xx=yy&bb=dd#fragment
		- output url string : http://localhost/foo.asp
	'''
	protocol, netloc, resource = _uparse.urlparse( url )[:3]
	return ''.join( [ protocol, '://', netloc, resource ] )

def removeFragment(  url ):
	'''
	Rebuild a URL from its scheme, network location, path and query string,
	leaving the fragment out.
	
	@parameter url: The url with fragments
	@return: A string with the URL without the fragment. Example :
		- input url : http://localhost/foo.asp?xx=yy&bb=dd#fragment
		- output url string : http://localhost/foo.asp?xx=yy&bb=dd
	'''
	protocol, netloc, resource, params, query, fragment = _uparse.urlparse( url )
	withoutFragment = protocol + '://' + netloc + resource
	if query != '':
		# The '?' separator is only added when there is something to separate
		withoutFragment += '?' + query
	return withoutFragment
	
def baseUrl(  url ):
	'''
	Build the URL of the document root of the site a URL belongs to.
	
	@parameter url: The url with the query string.
	@return: A string with the scheme and the network location followed by a single '/';
	the path and everything after it are discarded.
	Example :
		- input url : http://localhost/dir1/foo.asp?xx=yy&bb=dd
		- output url string : http://localhost/
	'''
	parsed = _uparse.urlparse( url )
	# parsed[0] is the scheme, parsed[1] is the network location
	return parsed[0] + '://' + parsed[1] + '/'
	
def urlJoin( baseurl , relative ):
	'''
	Construct a full (``absolute'') URL by combining a ``base URL'' with a ``relative URL''.
	
	Every relative URL that does not begin with '//' is handed to the standard
	library urljoin(). Example:
	urljoin('http://www.cwi.nl/%7Eguido/Python.html', 'FAQ.html')
	yields the string
	'http://www.cwi.nl/%7Eguido/FAQ.html'
	
	A relative URL that begins with '//' is a special case: the first slash is
	dropped and the rest is appended to the directory of the base URL.
	
	@param baseurl: The base url to join
	@param relative: The relative url to add to the base url
	@return: The joined URL, as a string.
	'''
	if not relative.startswith('//'):
		return _uparse.urljoin( baseurl, relative )
	
	# This special case had to be generated cause of some pykto tests
	protocol, netloc, basePath = _uparse.urlparse( baseurl )[:3]
	cut = basePath.rfind( '/' )
	if cut != 0:
		# Keep only what comes before the last slash.
		# NOTE(review): when the only slash is the leading one (e.g. '/foo.html')
		# the path is kept whole, file name included - confirm this is intended.
		basePath = basePath[: cut]
	
	return protocol + '://' + netloc + basePath + relative[1:]
	
def getDomain( url):
	'''
	@parameter url: The url to parse.
	@return: The network location component of the url, exactly as urlparse reports it.
	'''
	# Index 1 of the urlparse result is the network location
	return _uparse.urlparse( url )[1]

def getProtocol( url ):
	'''
	@parameter url: The url to parse.
	@return: The scheme (protocol) component of the url, '' when the url has none.
	'''
	# Index 0 of the urlparse result is the scheme
	return _uparse.urlparse( url )[0]

# TODO: this list should be updated from time to time, automatically.
# taken from http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains
_TOP_LEVEL_DOMAINS = frozenset( (
	"ac ad ae aero af ag ai al am an ao aq ar arpa as at au aw "
	"az ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv "
	"bw by bz ca cc cd cf cg ch ci ck cl cm cn co com coop cr "
	"cu cv cx cy cz de dj dk dm do dz ec edu ee eg er es et "
	"fi fj fk fm fo fr ga gb gd ge gf gg gh gi gl gm gn gov "
	"gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il im "
	"in info int io iq ir is it je jm jo jp ke kg kh ki km kn "
	"kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md "
	"mg mh mil mk ml mm mn mo mp mq mr ms mt mu museum mv mw mx "
	"my mz na name nc ne net nf ng ni nl no np nr nu nz om org "
	"pa pe pf pg ph pk pl pm pn pr pro ps pt pw py qa re ro "
	"ru rw sa sb sc sd se sg sh si sj sk sl sm sn so sr st "
	"su sv sy sz tc td tf tg th tj tk tm tn to tp tr tt tv "
	"tw tz ua ug uk um us uy uz va vc ve vg vi vn vu wf ws "
	"ye yt yu za zm zw"
).split() )

# Generic URI splitter, see http://www.faqs.org/rfcs/rfc2396.html
# Every component is optional, so this expression matches any string.
_URI_COMPONENTS_RE = re.compile( r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?" )
#                                   12            3  4          5       6  7        8 9

def getRootDomain( domain ):
	'''
	Get the root domain name. Examples:
	
	input: www.ciudad.com.ar
	output: ciudad.com.ar
	
	input: i.love.myself.ru
	output: myself.ru
	
	The labels of the host name are walked from right to left; the walk stops at
	(and includes) the first label that is not a known top level domain. Labels are
	compared without regard to case and a trailing ":port" is ignored, so
	"WWW.Example.COM:8080" yields "Example.COM".
	
	Code taken from: http://getoutfoxed.com/node/41
	
	@parameter domain: A domain name, or a complete URL. When it has no '://' an
	'http://' prefix is added before it is parsed.
	@return: The root domain, as a string. When every label is a known top level
	domain the whole host name is returned.
	'''
	if '://' not in domain:
		# sometimes i make mistakes...
		domain = 'http://' + domain
	
	# Group 4 is the authority; it is None when the URI has no '//authority' part
	authority = _URI_COMPONENTS_RE.match( domain ).group(4) or ''
	
	# Drop a trailing ":port" (or a bare ":"), otherwise the port becomes part of
	# the last label and that label is never recognized as a top level domain.
	colon = authority.rfind(':')
	if colon != -1:
		port = authority[ colon + 1 :]
		if port == '' or port.isdigit():
			authority = authority[: colon ]
	
	# e.g. images.google.com --> google.com
	#      www.popo.com.au --> popo.com.au
	labels = authority.split('.')
	first = len( labels ) - 1
	while first > 0 and labels[ first ].lower() in _TOP_LEVEL_DOMAINS:
		first -= 1
	
	return '.'.join( labels[ first :] )
		
def getDomainPath( url):
	'''
	@parameter url: The url to parse.
	@return: The scheme, the network location and the path up to (and including) its
	last '/'; the file name, query string and fragment are left out.
	'''
	protocol, netloc, resource = _uparse.urlparse( url )[:3]
	# rfind() gives -1 when the path has no '/', which turns into an empty slice
	directoryEnd = resource.rfind('/') + 1
	return protocol + '://' + netloc + resource[:directoryEnd]

def getFileName( url ):
	'''
	@parameter url: The url to parse.
	@return: The part of the path that follows its last '/', '' when the path ends with '/'.
	'''
	resource = _uparse.urlparse( url )[2]
	# The last piece of the split is the whole path when there is no '/' in it
	return resource.split('/')[-1]

def getExtension( url ):
	'''
	@parameter url: The url to parse.
	@return: The text that follows the last '.' of the file name, '' when the file
	name has no '.' at all.
	'''
	fname = getFileName( url )
	lastDot = fname.rfind('.')
	if lastDot == -1:
		# No dot, no extension
		return ''
	return fname[ lastDot + 1 :]
	
def allButScheme( url):
	'''
	@parameter url: The url to parse.
	@return: The network location followed by the path up to (and including) its last
	'/'; the scheme, file name, query string and fragment are left out.
	'''
	parsed = _uparse.urlparse( url )
	netloc, resource = parsed[1], parsed[2]
	directoryEnd = resource.rfind('/') + 1
	return netloc + resource[:directoryEnd]
	
def getPath( url):
	'''
	@parameter url: The url to parse.
	@return: The path component of the url, without query string or fragment.
	'''
	# Index 2 of the urlparse result is the path
	return _uparse.urlparse( url )[2]
	
def getPathQs( url):
	'''
	@parameter url: The url to parse.
	@return: The path of the url, followed by '?' and the query string when the url
	has a query string.
	'''
	parsed = _uparse.urlparse( url )
	resource, query = parsed[2], parsed[4]
	if query == '':
		return resource
	return resource + '?' + query
	
def urlDecode( url ):
	'''
	UrlDecode the url: every '+' becomes a space and every %xx escape is replaced
	by the character it stands for.
	
	@parameter url: The string to decode.
	@return: The decoded string, or None when url is not a str.
	'''
	if not isinstance( url, str ):
		return None
	# unquote_plus() does the '+' to ' ' replacement before unquoting
	return urllib.unquote_plus( url )

def getDirectories( url ):
	'''
	Get a list of all directories and subdirectories, starting with the document
	root and ending with the directory that holds the resource.
	Example:
		- url = 'http://www.o.com/a/b/c/'
		- return: ['http://www.o.com/','http://www.o.com/a/','http://www.o.com/a/b/','http://www.o.com/a/b/c/']
	
	@parameter url: The url to parse.
	@return: A list of URL strings, every one of them ending with '/'. A url with no
	path at all (http://www.o.com) yields only the document root.
	'''
	scheme, domain, path, x1, qs, x3 = _uparse.urlparse( url )
	root = scheme + '://' + domain + '/'
	
	# Keep the directory part of the path only, without the leading slash that is
	# already part of root. Working on the parsed path (instead of removing root
	# from the full URL) keeps this correct when the path is empty or when the
	# text of root shows up again inside the path.
	directories = path[:path.rfind('/')+1]
	if directories.startswith('/'):
		directories = directories[1:]
	
	res = []
	splittedDirs = directories.split('/')
	for depth in range( len(splittedDirs) ):
		directory = root + '/'.join( splittedDirs[:depth] )
		if not directory.endswith('/'):
			directory += '/'
		res.append( directory )
	
	return res