#!/usr/bin/env python
# -*- coding: utf-8 -*-
# 
# Copyright 2002-2006 Zuza Software Foundation
# 
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# 
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""classes that hold units of .po files (pounit) or entire files (pofile)
gettext-style .po (or .pot) files are used in translations for KDE et al (see kbabel)"""

from __future__ import generators
from translate.misc.multistring import multistring
from translate.misc import quote
from translate.misc import textwrap
from translate.storage import base
from translate.storage import poheader
import re
import codecs

# general functions for quoting / unquoting po strings

# Maps each escape sequence, as written inside a PO string, to the character it represents.
po_unescape_map = {"\\r": "\r", "\\t": "\t", '\\"': '"', '\\n': '\n', '\\\\': '\\'}
# The inverse table: each special character mapped to the escape sequence used to write it.
po_escape_map = dict([(value, key) for (key, value) in po_unescape_map.items()])

def escapeforpo(line):
  """Escapes a line for po format.

  Every character that is a key of po_escape_map is replaced by its escape
  sequence; all other characters are copied unchanged. Callers are expected
  to pass a single line, i.e. text without embedded newlines.

  @param line: unescaped text
  @return: the escaped text
  """
  # All keys of po_escape_map are single characters, so one pass over the
  # line is enough and no positions need to be collected and sorted.
  escaped_chars = [po_escape_map.get(char, char) for char in line]
  # Starting from line[:0] keeps the result the same string type as the input,
  # even when the input is empty.
  return line[:0] + "".join(escaped_chars)

def unescapehandler(escape):
  """Translates one po escape sequence into the character it stands for.

  Sequences that are not listed in po_unescape_map are returned unchanged.
  """
  try:
    return po_unescape_map[escape]
  except KeyError:
    return escape

def wrapline(line):
    """Wrap text for po files.

    The text is wrapped at 76 columns with its whitespace left untouched. A
    space that ends up at the start of a continuation line is moved to the
    end of the line before it.
    """
    pieces = textwrap.wrap(line, 76, replace_whitespace=False, expand_tabs=False, drop_whitespace=False)

    # Lines should not start with a space: hand it over to the previous line.
    for position in range(1, len(pieces)):
        current = pieces[position]
        if current.startswith(' '):
            pieces[position] = current[1:]
            pieces[position - 1] += ' '
    return pieces

def quoteforpo(text):
  """Quotes the given text for a PO file.

  @param text: unescaped text, or None
  @return: a list of quoted and escaped lines (empty for None or empty text)
  """
  if text is None:
    return []
  quoted = []
  segments = text.split("\n")
  if len(segments) > 1 or len(segments[0]) > 71:
    # Multi-line or long text starts with a blank leader line, except when
    # the text is just one line terminated by a newline.
    single_terminated_line = len(segments) == 2 and not segments[1]
    if not single_terminated_line:
      quoted.append('""')
    for segment in segments[:-1]:
      wrapped = wrapline(segment)
      if not wrapped:
        # an empty line in the text: only the newline itself remains
        quoted.append('"\\n"')
        continue
      for piece in wrapped[:-1]:
        quoted.append('"' + escapeforpo(piece) + '"')
      if wrapped[-1]:
        quoted.append('"' + escapeforpo(wrapped[-1]) + '\\n"')
  final_segment = segments[-1]
  if final_segment:
    # whatever follows the last newline carries no trailing \n
    for piece in wrapline(final_segment):
      quoted.append('"' + escapeforpo(piece) + '"')
  return quoted

def extractpoline(line):
  """Strips the quotes from a po file line and unescapes its content.

  @param line: a quoted line from a po file (msgid or msgstr)
  @return: the first element of what quote.extractwithoutquotes returns
  """
  results = quote.extractwithoutquotes(line, '"', '"', '\\', includeescapes=unescapehandler)
  return results[0]

def unquotefrompo(postr, joinwithlinebreak=False):
  """Unquotes a list of po lines and joins them into a single string.

  @param postr: list of quoted lines as stored in a unit
  @param joinwithlinebreak: if True the lines are joined with a newline and
  a blank leader line ('""') is skipped; otherwise they are simply concatenated
  """
  separator = ""
  if joinwithlinebreak:
    separator = "\n"
    if postr and postr[0] == '""':
      postr = postr[1:]
  pieces = []
  for quoted_line in postr:
    pieces.append(extractpoline(quoted_line))
  return separator.join(pieces)

def encodingToUse(encoding):
  """Returns the encoding to use, substituting utf-8 when none is specified.

  None and the placeholder value "CHARSET" both yield 'utf-8'; any other
  value is returned unchanged. Note that the value is not checked against
  the codecs known to the python runtime.

  @param encoding: an encoding name, "CHARSET" or None
  @return: the encoding name to use
  """
  if encoding is None or encoding == "CHARSET":
    return 'utf-8'
  return encoding

"""
From the GNU gettext manual:
     WHITE-SPACE
     #  TRANSLATOR-COMMENTS
     #. AUTOMATIC-COMMENTS
     #| PREVIOUS MSGID                 (Gettext 0.16 - check if this is the correct position - not yet implemented)
     #: REFERENCE...
     #, FLAG...
     msgctxt CONTEXT                   (Gettext 0.15)
     msgid UNTRANSLATED-STRING
     msgstr TRANSLATED-STRING
"""

class pounit(base.TranslationUnit):
  # othercomments = []      #   # this is another comment
  # automaticcomments = []  #   #. comment extracted from the source code
  # sourcecomments = []     #   #: sourcefile.xxx:35
  # typecomments = []       #   #, fuzzy
  # visiblecomments = []    #   #_ note to translator  (this is nonsense)
  # msgidcomments = []      #   _: within msgid
  # msgctxt
  # msgid = []
  # msgstr = []

  def __init__(self, source=None, encoding="UTF-8"):
    """Creates a unit with empty message parts, optionally setting the source.

    @param source: unescaped source text passed on to setsource (if not blank)
    @param encoding: encoding name, normalised through encodingToUse
    """
    self.obsolete = False
    self.encoding = encodingToUse(encoding)
    self.initallcomments(blankall=True)
    # Every message part exists twice: once live and once for the obsolete form.
    for part in ("msgctxt", "msgid", "msgid_pluralcomments", "msgid_plural", "msgstr"):
      setattr(self, part, [])
      setattr(self, "obsolete" + part, [])
    if source:
      self.setsource(source)
    super(pounit, self).__init__(source)

  def initallcomments(self, blankall=False):
    """Initialises allcomments"""
    if blankall:
      self.othercomments = []
      self.automaticcomments = []
      self.sourcecomments = []
      self.typecomments = []
      self.visiblecomments = []
      self.msgidcomments = []
      self.obsoletemsgidcomments = []
    self.allcomments = [self.othercomments, 
                        self.automaticcomments, 
                        self.sourcecomments, 
                        self.typecomments, 
                        self.visiblecomments, 
                        self.msgidcomments,
                        self.obsoletemsgidcomments]

  def getsource(self):
    """Returns the unescaped msgid (with the plural form appended, if any)"""
    singular = unquotefrompo(self.msgid)
    result = multistring(singular, self.encoding)
    if not self.hasplural():
      return result
    plural = unquotefrompo(self.msgid_plural)
    if isinstance(plural, str):
      plural = plural.decode(self.encoding)
    result.strings.append(plural)
    return result

  def setsource(self, source):
    """Sets the msgid to the given (unescaped) value.

    A list (or a multistring, via its strings) sets msgid from the first
    element and, when present, msgid_plural from the second.

    @param source: an unescaped source string.
    """
    if isinstance(source, str):
      source = source.decode(self.encoding)
    if isinstance(source, multistring):
      source = source.strings
    if not isinstance(source, list):
      self.msgid = quoteforpo(source)
      return
    self.msgid = quoteforpo(source[0])
    if len(source) > 1:
      self.msgid_plural = quoteforpo(source[1])
  source = property(getsource, setsource)

  def gettarget(self):
    """Returns the unescaped msgstr

    When msgstr is a dict (plural forms keyed by plural index) the strings
    are returned in ascending index order.
    """
    if isinstance(self.msgstr, dict):
      # Do not rely on the ordering of the dict: the plural forms have to
      # come out in the order of their index.
      pluralids = list(self.msgstr.keys())
      pluralids.sort()
      forms = [unquotefrompo(self.msgstr[pluralid]) for pluralid in pluralids]
    else:
      forms = unquotefrompo(self.msgstr)
    return multistring(forms, self.encoding)

  def settarget(self, target):
    """Sets the msgstr to the given (unescaped) value

    Nothing is changed when target equals the current target. For a unit
    with a plural, a list or multistring of targets is stored as a dict of
    quoted lines keyed by plural index.

    @param target: a string, multistring, list or dict of unescaped strings
    @raise ValueError: if the unit has no plural but target is a list or
    dict that does not hold exactly one element
    """
    if isinstance(target, str):
      target = target.decode(self.encoding)
    if target == self.target:
      return
    if self.hasplural():
      if isinstance(target, multistring):
        target = target.strings
      elif isinstance(target, basestring):
        target = [target]
    elif isinstance(target, (dict, list)):
      if len(target) == 1:
        target = target[0]
      else:
        raise ValueError("po msgid element has no plural but msgstr has %d elements (%s)" % (len(target), target))
    if isinstance(target, list):
      self.msgstr = dict([(i, quoteforpo(targetstring)) for i, targetstring in enumerate(target)])
    elif isinstance(target, dict):
      self.msgstr = dict([(i, quoteforpo(targetstring)) for i, targetstring in target.items()])
    else:
      self.msgstr = quoteforpo(target)
  target = property(gettarget, settarget)

  def getnotes(self, origin=None):
    """Return comments based on origin value (programmer, developer, source code and translator)"""
    if origin == None:
      comments = "".join([comment[2:] for comment in self.othercomments])
      comments += "".join([comment[3:] for comment in self.automaticcomments])
    elif origin == "translator":
      comments = "".join ([comment[2:] for comment in self.othercomments])
    elif origin in ["programmer", "developer", "source code"]:
      comments = "".join([comment[3:] for comment in self.automaticcomments])
    else:
      raise ValueError("Comment type not valid")
    # Let's drop the last newline
    return comments[:-1]

  def addnote(self, text, origin=None):
    """This is modeled on the XLIFF method. See xliff.py::xliffunit.addnote"""
    # We don't want to put in an empty '#' without a real comment:
    if not text:
      return
    commentlist = self.othercomments
    linestart = "# "
    if origin in ["programmer", "developer", "source code"]:
      commentlist = self.automaticcomments
      linestart = "#. "
    text = text.split("\n")
    commentlist += [linestart + line + "\n" for line in text]
    
  def removenotes(self):
    """Remove all the translator's notes (other comments)"""
    self.othercomments = []

  def adderror(self, errorname, errortext):
    """Adds an error message to this unit."""
    text = u'(pofilter) %s: %s' % (errorname, errortext)
    # Don't add the same error twice:
    if text not in self.getnotes(origin='translator'):
        self.addnote(text, origin="translator")

  def geterrors(self):
    """Get all error messages."""
    notes = self.getnotes(origin="translator").split('\n')
    errordict = {}
    for note in notes:
      if '(pofilter) ' in note:
        error = note.replace('(pofilter) ', '')
        errorname, errortext = error.split(': ')
        errordict[errorname] = errortext
    return errordict

  def copy(self):
    """Returns a new unit of the same class holding copies of all the lines.

    Every list of lines is copied, including the lists inside a plural
    msgstr dict, so that changing the copy does not affect this unit.
    """
    def copylines(lines):
      """copies a list of lines, or a dict of such lists (plural msgstr)"""
      if isinstance(lines, dict):
        return dict([(key, value[:]) for key, value in lines.items()])
      return lines[:]

    newpo = self.__class__()
    # keep the encoding so that the copy decodes its strings like the original
    newpo.encoding = getattr(self, "encoding", newpo.encoding)
    newpo.obsolete = self.obsolete
    newpo.othercomments = self.othercomments[:]
    newpo.automaticcomments = self.automaticcomments[:]
    newpo.sourcecomments = self.sourcecomments[:]
    newpo.typecomments = self.typecomments[:]
    newpo.visiblecomments = self.visiblecomments[:]
    newpo.msgidcomments = self.msgidcomments[:]
    newpo.obsoletemsgidcomments = self.obsoletemsgidcomments[:]
    # allcomments has to refer to the lists that were just assigned
    newpo.initallcomments()
    newpo.msgctxt = self.msgctxt[:]
    newpo.msgid = self.msgid[:]
    newpo.msgid_pluralcomments = self.msgid_pluralcomments[:]
    newpo.msgid_plural = self.msgid_plural[:]
    newpo.msgstr = copylines(self.msgstr)

    newpo.obsoletemsgctxt = self.obsoletemsgctxt[:]
    newpo.obsoletemsgid = self.obsoletemsgid[:]
    newpo.obsoletemsgid_pluralcomments = self.obsoletemsgid_pluralcomments[:]
    newpo.obsoletemsgid_plural = self.obsoletemsgid_plural[:]
    newpo.obsoletemsgstr = copylines(self.obsoletemsgstr)
    return newpo

  def msgidlen(self):
    """Returns the length of the stripped msgid plus, if present, the stripped msgid_plural"""
    length = len(unquotefrompo(self.msgid).strip())
    if self.hasplural():
      length += len(unquotefrompo(self.msgid_plural).strip())
    return length

  def msgstrlen(self):
    """Returns the length of the stripped msgstr.

    For a plural msgstr (a dict) the stripped forms are joined with newlines
    and the length of the stripped result is returned.
    """
    if not isinstance(self.msgstr, dict):
      return len(unquotefrompo(self.msgstr).strip())
    strippedforms = [unquotefrompo(lines).strip() for lines in self.msgstr.values()]
    return len("\n".join(strippedforms).strip())

  def merge(self, otherpo, overwrite=False, comments=True, authoritative=False):
    """Merges the otherpo (with the same msgid) into this one.

    Overwrite non-blank self.msgstr only if overwrite is True
    merge comments only if comments is True

    @param otherpo: the unit to merge from; anything that is not a pounit is
    handed to the base class merge
    @param overwrite: replace this unit's msgstr even if it is not blank
    @param comments: whether to merge the comment lists as well
    @param authoritative: if True the automatic, msgid and source comments of
    otherpo are not merged in
    """

    def mergelists(list1, list2, split=False):
      """Adds to list1 (in place) the items of list2 that it does not have yet.

      With split=True every item is split on whitespace, the first word is
      taken as the comment prefix and the remaining words are merged one by one.
      """
      #decode where necessary
      if unicode in [type(item) for item in list2] + [type(item) for item in list1]:
        for position, item in enumerate(list1):
          if isinstance(item, str):
            list1[position] = item.decode("utf-8")
        for position, item in enumerate(list2):
          if isinstance(item, str):
            list2[position] = item.decode("utf-8")
            
      #Determine the newline style of list1
      lineend = ""
      if list1 and list1[0]:
        # every candidate is tested, so the last one that matches wins
        for candidate in ["\n", "\r", "\n\r"]:
          if list1[0].endswith(candidate):
            lineend = candidate
        if not lineend:
          lineend = ""
      else:
        # nothing to take the style from: default to a plain newline
        lineend = "\n"
      
      #Split if directed to do so:    
      if split:
        splitlist1 = []
        splitlist2 = []
        prefix = "#"
        for item in list1:
          splitlist1.extend(item.split()[1:])
          prefix = item.split()[0]
        for item in list2:
          splitlist2.extend(item.split()[1:])
          prefix = item.split()[0]
        # each new word becomes a comment line of its own
        list1.extend(["%s %s%s" % (prefix,item,lineend) for item in splitlist2 if not item in splitlist1])
      else:
        #Normal merge, but conform to list1 newline style
        for item in list2:
          if lineend:
            item = item.rstrip() + lineend
          if item not in list1:
            list1.append(item)
    if not isinstance(otherpo, pounit):
      super(pounit, self).merge(otherpo, overwrite, comments)
      return
    if comments:
      mergelists(self.othercomments, otherpo.othercomments)
      mergelists(self.typecomments, otherpo.typecomments)
      mergelists(self.visiblecomments, otherpo.visiblecomments)
      if not authoritative:
        # When we are authoritative we don't bring across
        # otherpo.automaticcomments, as we consider ourself to be the
        # authority (same applies to otherpo.msgidcomments and the source
        # comments). Here we are not, so they are merged as well.
        mergelists(self.automaticcomments, otherpo.automaticcomments)
        mergelists(self.msgidcomments, otherpo.msgidcomments)
        mergelists(self.sourcecomments, otherpo.sourcecomments, split=True)
    if self.isblankmsgstr() or overwrite:
      # Remove kde-style comments from the translation (if any).
      # NOTE(review): extract_msgidcomments(text) returns the first line of
      # text with '_: ' removed, so this condition looks true for any target
      # with a non-empty first line, not only for targets holding a kde
      # comment - confirm. Note also that otherpo is modified here.
      if self.extract_msgidcomments(otherpo.target):
        otherpo.target = otherpo.target.replace('_: ' + otherpo.extract_msgidcomments()+ '\n', '')
      self.target = otherpo.target
      if self.source != otherpo.source:
        self.markfuzzy()
      else:
        # same source: take over the fuzzy state of otherpo
        self.markfuzzy(otherpo.isfuzzy())
    elif otherpo.isblankmsgstr():
      # our translation is kept; it only becomes fuzzy if the source differs
      if self.source != otherpo.source:
        self.markfuzzy()
    else:
      # both are translated and ours is kept: conflicting targets mean fuzzy
      if self.target != otherpo.target:
        self.markfuzzy()

  def isheader(self):
    #return (self.msgidlen() == 0) and (self.msgstrlen() > 0) and (len(self.msgidcomments) == 0)
    #rewritten here for performance:
    return ((self.msgid == [] or self.msgid == ['""']) and 
            not (self.msgstr == [] or self.msgstr == ['""']) 
            and self.msgidcomments == [])

  def isblank(self):
    if self.isheader() or len(self.msgidcomments):
      return False
    if (self.msgidlen() == 0) and (self.msgstrlen() == 0):
      return True
    return False
    # TODO: remove:
    # Before, the equivalent of the following was the final return statement:
    # return len(self.source.strip()) == 0

  def isblankmsgstr(self):
    """Tells whether the msgstr is blank, i.e. whether msgstrlen() is zero"""
    return not self.msgstrlen()

  def hastypecomment(self, typecomment):
    """check whether the given type comment is present"""
    # check for word boundaries properly by using a regular expression...
    return sum(map(lambda tcline: len(re.findall("\\b%s\\b" % typecomment, tcline)), self.typecomments)) != 0

  def hasmarkedcomment(self, commentmarker):
    """check whether the given comment marker is present as # (commentmarker) ..."""
    commentmarker = "(%s)" % commentmarker
    for comment in self.othercomments:
      if comment.replace("#", "", 1).strip().startswith(commentmarker):
        return True
    return False

  def settypecomment(self, typecomment, present=True):
    """alters whether a given typecomment is present"""
    if self.hastypecomment(typecomment) != present:
      if present:
        self.typecomments.append("#, %s\n" % typecomment)
      else:
        # this should handle word boundaries properly ...
        typecomments = map(lambda tcline: re.sub("\\b%s\\b[ \t,]*" % typecomment, "", tcline), self.typecomments)
        self.typecomments = filter(lambda tcline: tcline.strip() != "#,", typecomments)

  def istranslated(self):
    """Tells whether the unit is translated (as judged by the base class) and not obsolete"""
    translated = super(pounit, self).istranslated()
    return translated and not self.isobsolete()

  def isfuzzy(self):
    return self.hastypecomment("fuzzy")

  def markfuzzy(self, present=True):
    self.settypecomment("fuzzy", present)

  def isreview(self):
    return self.hastypecomment("review") or self.hasmarkedcomment("review") or self.hasmarkedcomment("pofilter")

  def markreviewneeded(self, needsreview=True, explanation=None):
    """Marks the unit to indicate whether it needs review. Adds an optional explanation as a note."""
    if needsreview:
      reviewnote = "(review)"
      if explanation:
        reviewnote += " " + explanation
      self.addnote(reviewnote, origin="translator")
    else:
      # Strip (review) notes.
      notestring = self.getnotes(origin="translator")
      notes = notestring.split('\n')
      newnotes = []
      for note in notes:
        if not '(review)' in note:
          newnotes.append(note)
      newnotes = '\n'.join(newnotes)
      self.removenotes()
      self.addnote(newnotes, origin="translator")
      

  def isnotblank(self):
    return not self.isblank()

  def isobsolete(self):
    return self.obsolete

  def makeobsolete(self):
    """Makes this unit obsolete"""
    self.obsolete = True
    if self.msgctxt:
      self.obsoletemsgctxt = self.msgctxt
    if self.msgid:
      self.obsoletemsgid = self.msgid
      self.msgid = []
    if self.msgidcomments:
      self.obsoletemsgidcomments = self.msgidcomments
      self.msgidcomments = []
    if self.msgid_plural:
      self.obsoletemsgid_plural = self.msgid_plural
      self.msgid_plural = []
    if self.msgstr:
      self.obsoletemsgstr = self.msgstr
      self.msgstr = []
    self.sourcecomments = []
    self.automaticcomments = []

  def resurrect(self):
    """Makes an obsolete unit normal"""
    self.obsolete = False
    if self.obsoletemsgctxt:
      self.msgid = self.obsoletemsgctxt
      self.obsoletemsgctxt = []
    if self.obsoletemsgid:
      self.msgid = self.obsoletemsgid
      self.obsoletemsgid = []
    if self.obsoletemsgidcomments:
      self.msgidcomments = self.obsoletemsgidcomments
      self.obsoletemsgidcomments = []
    if self.obsoletemsgid_plural:
      self.msgid_plural = self.obsoletemsgid_plural
      self.obsoletemsgid_plural = []
    if self.obsoletemsgstr:
      self.msgstr = self.obsoletemsgstr
      self.obsoletemgstr = []

  def hasplural(self):
    """returns whether this pounit contains plural strings..."""
    return len(self.msgid_plural) > 0

  def parse(self, src):
    """Parses the lines of one unit from src into this unit.

    Lines are consumed until src is exhausted or until a comment line (other
    than an obsolete "#~" line) is met after msgstr has started, as that
    line belongs to the next unit.

    @param src: the text to parse
    @return: the number of lines looked at, including the line that ended
    the unit (if any)
    """
    # Which part of the message the current line belongs to:
    inmsgctxt = 0
    inmsgid = 0
    inmsgid_comment = 0
    inmsgid_plural = 0
    inmsgstr = 0
    # the index of the msgstr[N] being read, None for a plain msgstr
    msgstr_pluralid = None
    linesprocessed = 0
    for line in src.split("\n"):
      line = line + "\n"
      linesprocessed += 1
      # NOTE(review): a newline was just appended, so this condition looks
      # like it can never be true - confirm.
      if len(line) == 0:
        continue
      elif line[0] == '#':
        if inmsgstr and not line[1] == '~':
          # if we're already in the message string, this is from the next element
          break
        # the character after the '#' tells the kind of comment
        if line[1] == '.':
          self.automaticcomments.append(line)
        elif line[1] == ':':
          self.sourcecomments.append(line)
        elif line[1] == ',':
          self.typecomments.append(line)
        elif line[1] == '_':
          self.visiblecomments.append(line)
        elif line[1] == '~':
          # an obsolete line: drop the first three characters (the "#~"
          # marker and the character after it) and handle the rest below
          # like a normal line
          line = line[3:]
          self.obsolete = True
        else:
          self.othercomments.append(line)
      # NOTE(review): comment lines are not skipped here, so they also reach
      # quote.extractstr below - confirm that this is intended.
      # msgid_plural has to be tested before msgid, which is a prefix of it
      if line.startswith('msgid_plural'):
        inmsgctxt = 0
        inmsgid = 0
        inmsgid_plural = 1
        inmsgstr = 0
        inmsgid_comment = 0
      elif line.startswith('msgctxt'):
        inmsgctxt = 1
        inmsgid = 0
        inmsgid_plural = 0
        inmsgstr = 0
        inmsgid_comment = 0
      elif line.startswith('msgid'):
        inmsgctxt = 0
        inmsgid = 1
        inmsgid_plural = 0
        inmsgstr = 0
        inmsgid_comment = 0
      elif line.startswith('msgstr'):
        inmsgctxt = 0
        inmsgid = 0
        inmsgid_plural = 0
        inmsgstr = 1
        if line.startswith('msgstr['):
          # the plural index is the number between the square brackets
          msgstr_pluralid = int(line[len('msgstr['):line.find(']')].strip())
        else:
          msgstr_pluralid = None
      # presumably the quoted part of the line, or None if there is none -
      # see quote.extractstr
      extracted = quote.extractstr(line)
      if not extracted is None:
        if inmsgctxt:
          self.msgctxt.append(extracted)
        elif inmsgid:
          # TODO: improve kde comment detection
          if extracted.find("_:") != -1:
            inmsgid_comment = 1
          if inmsgid_comment:
            self.msgidcomments.append(extracted)
          else:
            self.msgid.append(extracted)
          # an escaped newline ends the kde comment
          if inmsgid_comment and extracted.find("\\n") != -1:
            inmsgid_comment = 0
        elif inmsgid_plural:
          if extracted.find("_:") != -1:
            inmsgid_comment = 1
          if inmsgid_comment:
            self.msgid_pluralcomments.append(extracted)
          else:
            self.msgid_plural.append(extracted)
          if inmsgid_comment and extracted.find("\\n") != -1:
            inmsgid_comment = 0
        elif inmsgstr:
          if msgstr_pluralid is None:
            self.msgstr.append(extracted)
          else:
            # plural forms are kept in a dict keyed by the plural index; a
            # list collected so far becomes entry 0
            if type(self.msgstr) == list:
              self.msgstr = {0: self.msgstr}
            if msgstr_pluralid not in self.msgstr:
              self.msgstr[msgstr_pluralid] = []
            self.msgstr[msgstr_pluralid].append(extracted)
    if self.obsolete:
      # move what was read to the obsolete attributes
      self.makeobsolete()
    # If this unit is the header, we have to get the encoding to ensure that no
    # methods are called that need the encoding before we obtained it.
    if self.isheader():
      charset = re.search("charset=([^\\s]+)", unquotefrompo(self.msgstr))
      if charset:
        self.encoding = encodingToUse(charset.group(1))
    return linesprocessed

  def getmsgpartstr(self, partname, partlines, partcomments=""):
    """Builds the text of one message part (msgctxt, msgid, msgstr, ...).

    @param partname: the keyword that starts the part, e.g. "msgid"
    @param partlines: the quoted lines of the part, or a dict of such lists
    keyed by plural index
    @param partcomments: quoted (kde style) comment lines that are written
    ahead of partlines
    @return: the part as a string in which every line ends with a newline
    """
    if isinstance(partlines, dict):
      # plural forms: one "partname[N]" part for each key, in sorted order
      partkeys = partlines.keys()
      partkeys.sort()
      return "".join([self.getmsgpartstr("%s[%d]" % (partname, partkey), partlines[partkey], partcomments) for partkey in partkeys])
    partstr = partname + " "
    # index of the first of partlines that still has to be written at the end
    partstartline = 0
    if len(partlines) > 0 and len(partcomments) == 0:
      # no comments: the first line goes right behind the keyword
      partstr += partlines[0]
      partstartline = 1
    elif len(partcomments) > 0:
      if len(partlines) > 0 and len(unquotefrompo(partlines[:1])) == 0:
        # if there is a blank leader line, it must come before the comment
        partstr += partlines[0] + '\n'
        # but if the whole string is blank, leave it in
        if len(partlines) > 1:
          partstartline += 1
      else:
        # All partcomments should start on a newline
        partstr += '""\n'
      # combine comments into one if more than one
      if len(partcomments) > 1:
        combinedcomment = []
        for comment in partcomments:
          comment = unquotefrompo([comment])
          if comment.startswith("_:"):
            comment = comment[len("_:"):]
          if comment.endswith("\\n"):
            comment = comment[:-len("\\n")]
          #Before we used to strip. Necessary in some cases?
          combinedcomment.append(comment)
        partcomments = quoteforpo("_:%s" % "".join(combinedcomment))
      # comments first, no blank leader line needed
      partstr += "\n".join(partcomments)
      # the newline is added again below, for all the cases alike
      partstr = quote.rstripeol(partstr)
    else:
      # neither lines nor comments: an empty string
      partstr += '""'
    partstr += '\n'
    # add the rest
    for partline in partlines[partstartline:]:
      partstr += partline + '\n'
    return partstr

  def encodeifneccessary(self, output):
    """Encodes unicode output with the unit's encoding; anything else is returned unchanged.

    UTF-8 is assumed when the unit has no encoding attribute.
    """
    if not isinstance(output, unicode):
      return output
    unitencoding = getattr(self, "encoding", "UTF-8")
    return output.encode(encodingToUse(unitencoding))

  def __str__(self):
    """Returns the output of getoutput(), encoded if it is unicode"""
    return self.encodeifneccessary(self.getoutput())

  def getoutput(self):
    """Returns this po element as a string.

    An obsolete unit is written from its obsolete parts, each line marked
    with "#~". A unit with a blank msgid that is not a header and has neither
    msgid comments nor source comments only yields its translator comments.
    """
    lines = list(self.othercomments)
    if self.isobsolete():
      lines += self.typecomments
      parts = []
      if self.obsoletemsgctxt:
        parts.append(self.getmsgpartstr("#~ msgctxt", self.obsoletemsgctxt))
      parts.append(self.getmsgpartstr("#~ msgid", self.obsoletemsgid, self.obsoletemsgidcomments))
      if self.obsoletemsgid_plural or self.obsoletemsgid_pluralcomments:
        parts.append(self.getmsgpartstr("#~ msgid_plural", self.obsoletemsgid_plural, self.obsoletemsgid_pluralcomments))
      parts.append(self.getmsgpartstr("#~ msgstr", self.obsoletemsgstr))
      # The continuation lines of a multiline msgid or msgstr need the
      # obsolete marker as well.
      lines += [part.replace('\n"', '\n#~ "') for part in parts]
      return "".join([self.encodeifneccessary(line) for line in lines])
    # if there's no msgid don't do msgid and string, unless we're the header
    # this will also discard any comments other than plain othercomments...
    blankmsgid = (len(self.msgid) == 0) or ((len(self.msgid) == 1) and (self.msgid[0] == '""'))
    if blankmsgid and not (self.isheader() or self.msgidcomments or self.sourcecomments):
      return "".join(lines)
    for commentlist in (self.automaticcomments, self.sourcecomments, self.typecomments, self.visiblecomments):
      lines.extend(commentlist)
    if self.msgctxt:
      lines.append(self.getmsgpartstr("msgctxt", self.msgctxt))
    lines.append(self.getmsgpartstr("msgid", self.msgid, self.msgidcomments))
    if self.msgid_plural or self.msgid_pluralcomments:
      lines.append(self.getmsgpartstr("msgid_plural", self.msgid_plural, self.msgid_pluralcomments))
    lines.append(self.getmsgpartstr("msgstr", self.msgstr))
    return "".join([self.encodeifneccessary(line) for line in lines])

  def getlocations(self):
    """Collects the locations listed in the sourcecomments of the PO unit

    rtype: List
    return: the whitespace separated locations of every source comment, with
    the leading '#: ' removed

    """
    locations = []
    for sourcecomment in self.sourcecomments:
      withouteol = quote.rstripeol(sourcecomment)
      # skip the three characters of the '#: ' marker
      locations.extend(withouteol[3:].split())
    return locations

  def addlocation(self, location):
    """Add a location to sourcecomments in the PO unit

    @param location: Text location e.g. 'file.c:23' does not include #:
    @type location: String

    """
    self.sourcecomments.append("#: %s\n" % location)

  def extract_msgidcomments(self, text=None):
    """Extract KDE style msgid comments from the unit.
    
    @rtype: String
    @return: Returns the extracted msgidcomments found in this unit's msgid.
    
    """

    if not text:
        text = unquotefrompo(self.msgidcomments)
    return text.split('\n')[0].replace('_: ', '', 1)

  def getcontext(self):
    """Returns the unquoted msgctxt followed by the extracted msgid comments"""
    context = unquotefrompo(self.msgctxt)
    return context + self.extract_msgidcomments()

class pofile(base.TranslationStore, poheader.poheader):
  """this represents a .po file containing various units"""
  UnitClass = pounit
  def __init__(self, inputfile=None, encoding=None, unitclass=pounit):
    """construct a pofile, optionally reading in from inputfile.
    encoding can be specified but otherwise will be read from the PO header"""
    self.UnitClass = unitclass
    base.TranslationStore.__init__(self, unitclass=unitclass)
    self.units = []
    self.filename = ''
    self.encoding = encodingToUse(encoding)
    if inputfile is not None:
      self.parse(inputfile)

  def makeheader(self, **kwargs):
    """create a header for the given filename. arguments are specially handled, kwargs added as key: value
    pot_creation_date can be None (current date) or a value (datetime or string)
    po_revision_date can be None (form), False (=pot_creation_date), True (=now), or a value (datetime or string)"""

    headerpo = self.UnitClass(encoding=self.encoding)
    headerpo.markfuzzy()
    headerpo.msgid = ['""']
    headeritems = self.makeheaderdict(**kwargs)
    headerpo.msgstr = ['""']
    for (key, value) in headeritems.items():
        headerpo.msgstr.append(quote.quotestr("%s: %s\\n" % (key, value)))
    return headerpo

  def changeencoding(self, newencoding):
    """changes the encoding on the file"""
    self.encoding = encodingToUse(newencoding)
    if not self.units:
      return
    header = self.header()
    if not header or header.isblank():
      return
    charsetline = None
    headerstr = unquotefrompo(header.msgstr, True)
    for line in headerstr.split("\\n"):
      if not ":" in line: continue
      key, value = line.strip().split(":", 1)
      if key.strip() != "Content-Type": continue
      charsetline = line
    if charsetline is None:
      headerstr += "Content-Type: text/plain; charset=%s" % self.encoding
    else:
      charset = re.search("charset=([^ ]*)", charsetline)
      if charset is None:
        newcharsetline = charsetline
        if not newcharsetline.strip().endswith(";"):
          newcharsetline += ";"
        newcharsetline += " charset=%s" % self.encoding
      else:
        charset = charset.group(1)
        newcharsetline = charsetline.replace("charset=%s" % charset, "charset=%s" % self.encoding, 1)
      headerstr = headerstr.replace(charsetline, newcharsetline, 1)
    header.msgstr = quoteforpo(headerstr)

  def parsestring(cls, storestring):
    """Parses the po file contents in the storestring and returns a new pofile object (classmethod, constructor)"""
    parsedfile = pofile()
    parsedfile.parse(storestring)
    return parsedfile
  parsestring = classmethod(parsestring)

  def parse(self, input):
    """parses the given file or file source string"""
    if hasattr(input, 'name'):
      self.filename = input.name
    elif not getattr(self, 'filename', ''):
      self.filename = ''
    if hasattr(input, "read"):
      posrc = input.read()
      input.close()
      input = posrc
    # TODO: change this to a proper parser that doesn't do line-by-line madness
    lines = input.split("\n")
    start = 0
    end = 0
    # make only the first one the header
    linesprocessed = 0
    while end <= len(lines):
      if (end == len(lines)) or (not lines[end].strip()):   # end of lines or blank line
        newpe = self.UnitClass(encoding=self.encoding)
        linesprocessed = newpe.parse("\n".join(lines[start:end]))
        start += linesprocessed
        # TODO: find a better way of working out if we actually read anything
        if linesprocessed >= 1 and newpe.getoutput():
          self.units.append(newpe)
          if newpe.isheader():
            if "Content-Type" in self.parseheader():
              self.encoding = newpe.encoding
            # now that we know the encoding, decode the whole file
            if self.encoding is not None and self.encoding.lower() != 'charset':
              lines = self.decode(lines)
          if self.encoding is None: #still have not found an encoding, let's assume UTF-8
            #TODO: This might be dead code
            self.encoding = 'utf-8'
            lines = self.decode(lines)
            self.units = []
            start = 0
            end = 0
      end = end+1

  def removeblanks(self):
    """remove any units which say they are blank"""
    self.units = filter(self.UnitClass.isnotblank, self.units)

  def removeduplicates(self, duplicatestyle="merge"):
    """make sure each msgid is unique ; merge comments etc from duplicates into original"""
    msgiddict = {}
    uniqueelements = []
    # we sometimes need to keep track of what has been marked
    # TODO: this is using a list as the pos aren't hashable, but this is slow...
    markedpos = []
    def addcomment(thepo):
      thepo.msgidcomments.append('"_: %s\\n"' % " ".join(thepo.getlocations()))
      markedpos.append(thepo)
    for thepo in self.units:
      if duplicatestyle.startswith("msgid_comment"):
        msgid = unquotefrompo(thepo.msgidcomments) + unquotefrompo(thepo.msgid)
      else:
        msgid = unquotefrompo(thepo.msgid)
      if thepo.isheader():
        # header msgids shouldn't be merged...
        uniqueelements.append(thepo)
      elif duplicatestyle == "msgid_comment_all":
        addcomment(thepo)
        uniqueelements.append(thepo)
      elif msgid in msgiddict:
        if duplicatestyle == "merge":
          if msgid:
            msgiddict[msgid].merge(thepo)
          else:
            addcomment(thepo)
            uniqueelements.append(thepo)
        elif duplicatestyle == "keep":
          uniqueelements.append(thepo)
        elif duplicatestyle == "msgid_comment":
          origpo = msgiddict[msgid]
          if origpo not in markedpos:
            addcomment(origpo)
          addcomment(thepo)
          uniqueelements.append(thepo)
        elif duplicatestyle == "msgctxt":
          origpo = msgiddict[msgid]
          if origpo not in markedpos:
            origpo.msgctxt.append('"%s"' % " ".join(origpo.getlocations()))
            markedpos.append(thepo)
          thepo.msgctxt.append('"%s"' % " ".join(thepo.getlocations()))
          uniqueelements.append(thepo)
      else:
        if not msgid and duplicatestyle != "keep":
          addcomment(thepo)
        msgiddict[msgid] = thepo
        uniqueelements.append(thepo)
    self.units = uniqueelements

  def __str__(self):
    """convert to a string. double check that unicode is handled somehow here"""
    output = self.getoutput()
    if isinstance(output, unicode):
      return output.encode(getattr(self, "encoding", "UTF-8"))
    return output

  def getoutput(self):
    """convert the units back to lines"""
    lines = []
    for pe in self.units:
      pesrc = str(pe) + "\n"
      lines.append(pesrc)
    lines = "".join(self.encode(lines)).rstrip()
    #After the last pounit we will have \n\n and we only want to end in \n:
    if lines: lines += "\n"
    return lines

  def encode(self, lines):
    """encode any unicode strings in lines in self.encoding"""
    newlines = []
    encoding = self.encoding
    if encoding is None or encoding.lower() == "charset":
      encoding = 'UTF-8'
    for line in lines:
      if isinstance(line, unicode):
        line = line.encode(encoding)
      newlines.append(line)
    return newlines

  def decode(self, lines):
    """decode any non-unicode strings in lines with self.encoding"""
    newlines = []
    for line in lines:
      if isinstance(line, str) and self.encoding is not None and self.encoding.lower() != "charset":
        try:
          line = line.decode(self.encoding)
        except UnicodeError, e:
          raise UnicodeError("Error decoding line with encoding %r: %s. Line is %r" % (self.encoding, e, line))
      newlines.append(line)
    return newlines

  def todict(self):
    """returns a dictionary of units based on msgid"""
    # NOTE: these units are quoted strings
    # TODO: make them unquoted strings, if useful...
    return dict([(" ".join(poel.msgid), poel) for poel in self.units])

  def unit_iter(self):
    for unit in self.units:
      if not (unit.isheader() or unit.isobsolete()):
        yield unit

if __name__ == '__main__':
  # Run as a script: parse a PO file from standard input and write the
  # store's string form back to standard output.
  import sys
  store = pofile(sys.stdin)
  serialised = str(store)
  sys.stdout.write(serialised)