#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2006-2007 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#

"""Parent class for LISA standards (TMX, TBX, XLIFF)"""

from translate.storage import base
from translate.misc.multistring import multistring
from translate.misc import ourdom

# Note that it is important that the local "ourdom" class is used. The builtin
# pretty printing of XML is not correct in minidom. Therefore take care to
# only instantiate classes from ourdom, and to call those functions as needed.
#
# For example: ourdom.Element, ourdom.Document, ourdom.parseString

import re


def getText(nodelist):
    """Join the text of all text nodes in nodelist, including their children.

    nodelist may be a single node or a list of nodes. Element nodes are
    descended into recursively; node types other than text and element are
    ignored. Returns a multistring.
    """
    if not isinstance(nodelist, list):
        nodelist = [nodelist]
    rc = []
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            rc.append(node.data)
        elif node.nodeType == node.ELEMENT_NODE:
            rc.append(getText(node.childNodes))
    return multistring(''.join(rc))


def _findAllMatches(text, re_obj):
    """Return an iterator over all non-overlapping re_obj matches in text."""
    # finditer also copes with zero-width matches, which a manual
    # "search from the previous end" loop would spin on forever.
    return re_obj.finditer(text)


# Patterns recognised as placeholders. Every "$" that is meant literally is
# escaped; an unescaped "$" is an end-of-string anchor and such a pattern
# could never match.
placeholders = [
    r'(%[diouxXeEfFgGcrs])',
    r'(\\+.?)',
    r'(%[0-9]\$lx)',
    r'(%[0-9]\$[a-z])',
    r'(<.+?>)',
]
re_placeholders = [re.compile(ph) for ph in placeholders]


def _getPhMatches(text):
    """Return the regexp match objects for all placeholders in text.

    The matches are ordered by position and never overlap: when matches of
    different patterns cover the same characters, the one that starts first
    is kept, and of several starting at the same position the longest one.
    """
    matches = []
    for re_ph in re_placeholders:
        matches.extend(_findAllMatches(text, re_ph))
    # sort them so they come sequentially, longest first at equal start
    matches.sort(key=lambda m: (m.start(), -m.end()))
    sequential = []
    covered = 0
    for m in matches:
        # A match starting inside an already accepted one would make the
        # consumer step backwards in the text and emit characters twice.
        if m.start() >= covered:
            sequential.append(m)
            covered = m.end()
    return sequential


class LISAunit(base.TranslationUnit):
    """A single unit in the file.

    Provisional work is done to make several languages possible.
    """

    #The name of the root element of this unit type:(termEntry, tu, trans-unit)
    rootNode = ""
    #The name of the per language element of this unit type:(termEntry, tu, trans-unit)
    languageNode = ""
    #The name of the innermost element of this unit type:(term, seg)
    textNode = ""

    def __init__(self, source, document=None, empty=False):
        """Constructs a unit containing the given source string.

        document is the DOM document used to create nodes; a new
        ourdom.Document is created when none is given. With empty=True
        neither the root element is created nor the base class initialised
        (createfromxmlElement attaches an existing element afterwards).
        """
        if document:
            self.document = document
        else:
            self.document = ourdom.Document()
        if empty:
            return
        self.xmlelement = self.document.createElement(self.rootNode)
        #add descrip, note, etc.

        super(LISAunit, self).__init__(source)

    def __eq__(self, other):
        """Compares two units by the text of their language nodes, in order."""
        if not isinstance(other, LISAunit):
            # Let Python try the reflected comparison instead of failing
            # with an AttributeError on foreign objects.
            return NotImplemented
        languageNodes = self.getlanguageNodes()
        otherlanguageNodes = other.getlanguageNodes()
        if len(languageNodes) != len(otherlanguageNodes):
            return False
        for mynode, othernode in zip(languageNodes, otherlanguageNodes):
            #TODO: maybe we want to take children and notes into account
            if self.getNodeText(mynode) != other.getNodeText(othernode):
                return False
        return True

    def setsource(self, source, sourcelang='en'):
        """Sets the source string, replacing the first language node if any."""
        languageNodes = self.getlanguageNodes()
        sourcelanguageNode = self.createlanguageNode(sourcelang, source, "source")
        if len(languageNodes) > 0:
            self.xmlelement.replaceChild(sourcelanguageNode, languageNodes[0])
        else:
            self.xmlelement.appendChild(sourcelanguageNode)

    def getsource(self):
        """Returns the text of the first language node (None if absent)."""
        return self.getNodeText(self.getlanguageNode(lang=None, index=0))
    source = property(getsource, setsource)

    def settarget(self, text, lang='xx', append=False):
        """Sets the "target" string (second language), or alternatively
        appends to the list.

        With text=None an existing second language node is removed (unless
        append is set). Nothing happens when text equals the current target.
        """
        #XXX: we really need the language - can't really be optional
        #Firstly deal with reinitialising to None or setting to identical string
        if self.gettarget() == text:
            return
        languageNodes = self.getlanguageNodes()
        assert len(languageNodes) > 0
        if text is not None:
            languageNode = self.createlanguageNode(lang, text, "target")
            if append or len(languageNodes) == 1:
                self.xmlelement.appendChild(languageNode)
            else:
                self.xmlelement.insertBefore(languageNode, languageNodes[1])
        # The previous second node is dropped both when it was just replaced
        # and when the target is being reset to None.
        if not append and len(languageNodes) > 1:
            self.xmlelement.removeChild(languageNodes[1])

    def gettarget(self, lang=None):
        """Retrieves the "target" text (second entry), or the entry in the
        specified language, if it exists."""
        if lang:
            node = self.getlanguageNode(lang=lang)
        else:
            node = self.getlanguageNode(lang=None, index=1)
        return self.getNodeText(node)
    target = property(gettarget, settarget)

    def createlanguageNode(self, lang, text, purpose=None):
        """Returns a xml Element setup with given parameters to represent a
        single language entry. Has to be overridden."""
        return None

    def createPHnodes(self, parent, text):
        """Create the text node in parent containing all the ph tags.

        Text between placeholders is appended to parent as text nodes; each
        placeholder becomes a <ph> element with a 1-based "id" attribute.
        """
        # On Python 2 "bytes" is the same type as "str"; checking for bytes
        # keeps already-decoded text untouched on every Python version.
        if isinstance(text, bytes):
            text = text.decode("utf-8")
        start = 0
        for i, m in enumerate(_getPhMatches(text)):
            #pretext
            pretext = text[start:m.start()]
            if pretext:
                parent.appendChild(self.document.createTextNode(pretext))
            #ph node
            phnode = ourdom.Element("ph")
            phnode.setAttribute("id", str(i + 1))
            phnode.appendChild(self.document.createTextNode(m.group()))
            parent.appendChild(phnode)
            start = m.end()
        #post text
        posttext = text[start:]
        if posttext:
            parent.appendChild(self.document.createTextNode(posttext))

    def getlanguageNodes(self):
        """Returns a list of all nodes that contain per language information."""
        return self.xmlelement.getElementsByTagName(self.languageNode)

    def getlanguageNode(self, lang=None, index=None):
        """Retrieves a languageNode either by language or by index.

        Returns None when no node matches. Raises KeyError when neither
        lang nor index is given.
        """
        if lang is None and index is None:
            raise KeyError("No criterea for languageNode given")
        languageNodes = self.getlanguageNodes()
        if lang:
            for node in languageNodes:
                if node.getAttribute("xml:lang") == lang:
                    return node
            return None
        #have to use index
        if index is None or index >= len(languageNodes):
            return None
        return languageNodes[index]

    def getNodeText(self, languageNode):
        """Retrieves the term from the given languageNode.

        Returns None for a missing node, or when textNode is set but the
        node contains no such element.
        """
        if languageNode is None:
            return None
        if self.textNode:
            terms = languageNode.getElementsByTagName(self.textNode)
            if len(terms) == 0:
                return None
            return getText([terms[0]])
        else:
            return getText([languageNode])

    def __str__(self):
        """Returns the unit's XML, encoded as UTF-8."""
        return self.xmlelement.toxml().encode('utf-8')

    @classmethod
    def createfromxmlElement(cls, element, document):
        """Wraps an existing xml element of document in a new unit."""
        term = cls(None, document=document, empty=True)
        term.xmlelement = element
        return term


class LISAfile(base.TranslationStore):
    """A class representing a file store for one of the LISA file formats."""
    UnitClass = LISAunit
    #The root node of the XML document:
    rootNode = ""
    #The root node of the content section:
    bodyNode = ""
    #The XML skeleton to use for empty construction:
    XMLskeleton = ""

    def __init__(self, inputfile=None, sourcelanguage='en', targetlanguage=None, unitclass=None):
        """Creates a store from inputfile, or from XMLskeleton if none is given.

        The header is only initialised (addheader) for stores built from the
        skeleton.
        """
        super(LISAfile, self).__init__(unitclass=unitclass)
        self.setsourcelanguage(sourcelanguage)
        self.settargetlanguage(targetlanguage)
        if inputfile is not None:
            self.parse(inputfile)
            assert self.document.documentElement.tagName == self.rootNode
        else:
            self.parse(self.XMLskeleton)
            self.addheader()

    def addheader(self):
        """Method to be overridden to initialise headers, etc."""
        pass

    def initbody(self):
        """Initialises self.body so it never needs to be retrieved from the
        DOM again."""
        self.body = self.document.getElementsByTagName(self.bodyNode)[0]

    def setsourcelanguage(self, sourcelanguage):
        """Sets the source language for this store"""
        self.sourcelanguage = sourcelanguage

    def settargetlanguage(self, targetlanguage):
        """Sets the target language for this store"""
        self.targetlanguage = targetlanguage

    def addsourceunit(self, source):
        """Adds and returns a new unit with the given string as first entry."""
        #TODO: maybe this should rather be called addsourcestring or something?
        newunit = self.UnitClass(source, self.document)
        self.addunit(newunit)
        return newunit

    def addunit(self, unit):
        """Appends the unit's element to the body and the unit to self.units."""
        self.body.appendChild(unit.xmlelement)
        self.units.append(unit)

    def __str__(self):
        """Converts to a string containing the file's XML"""
        return self.document.toprettyxml(indent="\t", encoding="utf-8")

    def parse(self, xml):
        """Populates this object from the given xml string.

        xml may also be an object with a read() method; it is then rewound
        with seek(0) and read completely. One unit is appended to self.units
        for every UnitClass.rootNode element found in the document.
        """
        if not hasattr(self, 'filename'):
            self.filename = getattr(xml, 'name', '')
        if hasattr(xml, "read"):
            xml.seek(0)
            xml = xml.read()
        self.document = ourdom.parseString(xml)
        self.encoding = self.document.encoding
        assert self.document.documentElement.tagName == self.rootNode
        self.initbody()
        termEntries = self.document.getElementsByTagName(self.UnitClass.rootNode)
        if termEntries is None:
            return
        for entry in termEntries:
            term = self.UnitClass.createfromxmlElement(entry, self.document)
            self.units.append(term)

    @classmethod
    def parsestring(cls, storestring):
        """Parses the string to return the correct file object"""
        newstore = cls()
        if storestring:
            newstore.parse(storestring)
        return newstore

    def __del__(self):
        """clean up the document if required"""
        if hasattr(self, "document"):
            self.document.unlink()