######################################################################
# regular expressions used by Zwiki
#
# Some people, when confronted with a problem, think 'I know, I'll use
# regular expressions.' Now they have two problems. --Jamie Zawinski
#
# Be brave. Read on.
#
# don't bother trying to keep to 80 char lines, here

import re
import string

import Defaults
from Utils import BLATHER, DEBUG, formattedTraceback

# URLs/URIs (better regexps in urllib/urlparse ?)
# urlchars is the run of characters accepted after the scheme's colon.
# url allows one leading quote or equals sign, then captures the whole
# scheme:rest as group 1 and the scheme alone as group 2.
urlchars = r'[A-Za-z0-9/:;@_%~#=&\.\-\?\+\$,]+'
url = r'["=]?((about|gopher|http|https|ftp|mailto|file):%s)' % urlchars

# valid characters for zwiki page ids
# Ids of zwiki page objects are built from these characters only. Each one
# has to be legal in a zope id and in a url, and _ is left out because it
# is used for quoting. Cf http://zwiki.org/HowZwikiTitleAndIdWorks .
# Single-byte non-ascii letters could be enabled here, provided bad_id in
# zope's OFS/ObjectManager.py is hacked as well, but the resulting urls
# would be illegal.
zwikiidcharsexpr = re.compile(r'[a-zA-Z0-9.-]')

# used in generating page ids: whitespace followed by a lower-case letter,
# with the letter captured as group 1
# XXX NB this is affected by locale - may not be what we want
spaceandlowerexpr = re.compile(r'\s+([%s])' % string.lowercase)

# free-form wiki links
# Three kinds of delimiters can enclose a free-form wiki page name. In each
# pattern the enclosed text is group 1.

# [...] single brackets (only); a second opening bracket is rejected
singlebracketedexpr = r'\[(?:(?!\[))([^\n\]]+)\]'

# [[...]] wikipedia-style double brackets
doublebracketedexpr = r'\[\[([^\n\]]+)\]\]'

# ((...)) wicked-style double parentheses, for international users whose
# keyboards make brackets hard to type
doubleparenthesisexpr = r'\(\(([^\n\]]+)\)\)'

# single or double brackets in one pattern, to simplify later regexps a little
bracketedexpr = r'\[\[?([^\n\]]+)\]\]?'

# bare wikinames
#
# "bare wikinames" here means page names which will be automatically
# wiki-linked without needing to be enclosed in brackets.
# Zwiki's wikinames are standard c2.com-style CamelCase plus the following
# additions:
#
# - words of a single letter are allowed (APage, PageA)
#
# - trailing digits are allowed (PageVersion22, but not Page22Version)
#   (XXX for simplicity and to reduce unexpected wikilinking, eg of big
#   random texts. Change this ?)
#
# - non-ascii letters are allowed. We aim, as far as possible, to work as
#   international users would expect, out of the box and regardless of
#   python version, locale setting, platform etc. Better ideas are
#   welcome. One of two possible setups is chosen at startup:
#
#   1. if a system locale is configured, the locale's letters are allowed
#
#      We include these in our regexps below. We need them utf8-encoded
#      since zwiki text is always stored utf8-encoded. So we convert them
#      from the system's default encoding to unicode and re-encode as
#      utf8. It's hard to see how to do this robustly on all systems and
#      it has been the cause of many zwiki startup problems, so we must be
#      careful not to let any error stop product initialisation (#769,
#      #1158). Other notes: don't rely on python 2.3's
#      getpreferredencoding, it gives the wrong answer; work around a
#      python bug with some locales (#392).
#
#   2. if no system locale is configured or there was an error during the
#      above, a default set of non-ascii letters is allowed
#
#      On systems where we can't detect the locale's characters, we jump
#      through some hoops to support a number of non-ascii letters common
#      in latin and other languages so things are more likely to "just
#      work" for more users.

# we'll set up the following strings to use when building the regexps:
# U:   'A|B|C|...'
# L:   'a|b|c|...'
# Ubr: '[ABC...]'
# Lbr: '[abc...]'
# where A, b etc. are the utf8-encoded upper & lower-case letters.
# Build U, L, Ubr and Lbr (utf8-encoded upper/lower-case letters, as
# alternations and as bracketed character classes) plus relocaleflag and
# wordboundary. The try branch uses the system locale's letters; the except
# branch falls back to a hard-coded letter set.
try:
    import locale
    lang, encoding = locale.getlocale()
    # decode the locale's letters from the locale encoding to unicode, then
    # re-encode each letter as utf8
    # NOTE(review): when no locale is set, encoding is None; presumably the
    # unicode() calls then raise and we land in the except branch - confirm
    U = '|'.join([c.encode('utf8') for c in unicode(string.uppercase, encoding)])
    L = '|'.join([c.encode('utf8') for c in unicode(string.lowercase, encoding)])
    Ubr = '[%s]' % ''.join([c.encode('utf8') for c in unicode(string.uppercase, encoding)])
    Lbr = '[%s]' % ''.join([c.encode('utf8') for c in unicode(string.lowercase, encoding)])
    # inline locale flag; presumably prepended to patterns built outside
    # this view - verify against the users of relocaleflag
    relocaleflag = r'(?L)'
    wordboundary = r'\b'
except:
    # no locale is set, or there was a problem detecting it or a
    # problem decoding its letters.
    # XXX must be a less ugly way to do this:
    # if it's just that there's no locale, don't log a warning
    try:
        lang, encoding = locale.getlocale()
    except:
        # any value other than (None, None), so that the warning is logged
        lang, encoding = -1,-1
    if (lang, encoding) == (None, None):
        pass
    else:
        BLATHER('the system locale gave a problem in Regexps.py, so WikiNames will not be locale-aware')
        DEBUG(formattedTraceback())
    # define a useful default set of non-ascii letters, mainly european letters
    # from http://zwiki.org/InternationalCharacterExamples
    # XXX more have been added to that page (latvian, polish).. how far
    # should we go with this ? Could we make it always recognise all
    # letters and forget locale awareness ? Are regexps getting slow ?
    # XXX needs more work, see failing links at
    # http://zwiki.org/InternationalCharactersInPageNames
    # the extra letters are written as utf8 byte escapes and decoded as
    # utf-8 below, so each multi-byte letter is handled as one character
    uppercase = string.uppercase + '\xc3\x80\xc3\x81\xc3\x82\xc3\x83\xc3\x84\xc3\x85\xc3\x86\xc3\x88\xc3\x89\xc3\x8a\xc3\x8b\xc3\x8c\xc3\x8d\xc3\x8e\xc3\x8f\xc3\x92\xc3\x93\xc3\x94\xc3\x95\xc3\x96\xc3\x98\xc3\x99\xc3\x9a\xc3\x9b\xc3\x9c\xc3\x9d\xc3\x87\xc3\x90\xc3\x91\xc3\x9e'
    lowercase = string.lowercase + '\xc3\xa0\xc3\xa1\xc3\xa2\xc3\xa3\xc3\xa4\xc3\xa5\xc3\xa6\xc3\xa8\xc3\xa9\xc3\xaa\xc3\xab\xc3\xac\xc3\xad\xc3\xae\xc3\xaf\xc3\xb2\xc3\xb3\xc3\xb4\xc3\xb5\xc3\xb6\xc3\xb8\xc3\xb9\xc3\xba\xc3\xbb\xc3\xbc\xc3\xbd\xc3\xbf\xc2\xb5\xc3\x9f\xc3\xa7\xc3\xb0\xc3\xb1\xc3\xbe'
    U = '|'.join([c.encode('utf8') for c in unicode(uppercase,'utf-8')])
    L = '|'.join([c.encode('utf8') for c in unicode(lowercase,'utf-8')])
    Ubr = '[%s]' % ''.join([c.encode('utf8') for c in unicode(uppercase,'utf-8')])
    Lbr = '[%s]' % ''.join([c.encode('utf8') for c in unicode(lowercase,'utf-8')])
    # no locale flag in the fallback setup
    relocaleflag = ''
    # NOTE(review): this statement looks truncated by extraction. The
    # wordboundary literal seems to be fused with the tail of a later
    # assignment, and localwikilink1 is not defined anywhere in the visible
    # source. Restore from the pristine file before relying on it.
    wordboundary = '(?%s):(?P%s))' % (localwikilink1,urlchars)

# NOTE(review): interwikilink and wikilink are not defined in the visible
# source; their definitions appear to have been lost together with the text
# noted above - confirm
# matches either an interwiki link or a wiki link, captured as group 1
anywikilinkexpr = re.compile(r'(%s|%s)' % (interwikilink,wikilink))
# NOTE(review): the next two patterns look like they have lost their markup
# text in extraction; as shown, the second has an unmatched ')' - confirm
# against the pristine file
markedwikilinkexpr = re.compile(r'(.*?)')
untitledwikilinkexpr = re.compile(r'[^/"]*)" title="">.*?')
# the words of a wikiname: a run of upper-case letters each not followed by
# a lower-case letter, or one upper-case letter followed by one or more
# lower-case letters, or a run of digits
wikinamewords = r'((%s(?!%s))+|%s%s+|[0-9]+)'%(Ubr,Lbr,Ubr,Lbr)
# a "RemoteWikiURL" line (multiline, case-insensitive)
# NOTE(review): the '(?P' group here has no name as shown; presumably the
# name was stripped in extraction - confirm
remotewikiurl = r'(?mi)RemoteWikiURL[:\s]+(?P[^\s]*)\s*$'
# a line beginning with '!'; group 1 is the rest of the line
protected_line = r'(?m)^!(.*)$'

# stx footnotes
# handled by us so as to co-exist with our bracketed links
# real stx allows refchars = r'[0-9_%s-]' % (string.letters)
# matches '.. [' at the start of a line, then the bracketed text (group 1)
footnoteexpr = r'(?sm)^\.\. \[([^\n\]]+)\]'

# for stripping javascript
# XXX needs work, eg should not match
# NOTE(review): in the extracted text a '#' precedes this assignment; it
# looks like the marker of a comment line whose example text was stripped,
# so the assignment is treated as live code here - confirm against the
# pristine file
javascriptexpr = r'(?iL)<(([^>\w]*script|iframe)[^>]*)>' # \1 will be displayed

# for stripping HTML header/footer
# XXX these are expensive, may hit max recursion limit on bsd
# NOTE(review): both patterns look truncated by extraction (the tag text is
# missing) - confirm
htmlheaderexpr = r'(?si)^(\s*'
htmlfooterexpr = r'(?si)\s*\s*$'
# better ? safe ?
# dot-matches-newline, case-insensitive pattern spanning the whole text;
# group 1 is the greedy middle part
# NOTE(review): the ']*?>' fragments look like the remains of tag patterns
# whose opening text was stripped in extraction - confirm against the
# pristine file
htmlbodyexpr = r'(?si)^.*?]*?>(.*)]*?>.*?$'

# one more badass regexp:
# sgml tags, including ones containing dtml & python expressions and multiline
# Notes:
# - r'(?s)<((".*?")|[^">]+)*>' takes exponential time
# - r'(?s)<((".*?")|[^">]+(?![^">]))*>' avoids backtracking (see perlre)
# - to avoid matching casual angle bracket use, treat dtml separately
# - recognising that stuff like