// ports//mail/spamprobe/work/spamprobe-1.4d/src/parser/MimeDecoder.cc

///###////////////////////////////////////////////////////////////////////////
//
// Burton Computer Corporation
// http://www.burton-computer.com
// http://www.cooldevtools.com
// $Id: MimeDecoder.cc 272 2007-01-06 19:37:27Z brian $
//
// Copyright (C) 2007 Burton Computer Corporation
// ALL RIGHTS RESERVED
//
// This program is open source software; you can redistribute it
// and/or modify it under the terms of the Q Public License (QPL)
// version 1.0. Use of this software in whole or in part, including
// linking it (modified or unmodified) into other programs is
// subject to the terms of the QPL.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// Q Public License for more details.
//
// You should have received a copy of the Q Public License
// along with this program; see the file LICENSE.txt.  If not, visit
// the Burton Computer Corporation or CoolDevTools web site
// QPL pages at:
//
//    http://www.burton-computer.com/qpl.html
//    http://www.cooldevtools.com/qpl.html
//

#include "AbstractCharReader.h"
#include "MultiLineString.h"
#include "MultiLineSubString.h"
#include "MessageHeader.h"
#include "MessageHeaderList.h"
#include "RegularExpression.h"
#include "MimeDecoder.h"

// Converts a single hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F') to its
// numeric value in the range 0..15.  Any other character maps to 0.
static unsigned char char_value(unsigned char ch)
{
  unsigned char digit = 0;

  if ('0' <= ch && ch <= '9') {
    digit = static_cast<unsigned char>(ch - '0');
  } else if ('a' <= ch && ch <= 'f') {
    digit = static_cast<unsigned char>(10 + ch - 'a');
  } else if ('A' <= ch && ch <= 'F') {
    digit = static_cast<unsigned char>(10 + ch - 'A');
  }

  return digit;
}

// Lookup table indexed by byte value (0..255) giving the 6-bit value of the
// corresponding base64 alphabet character:
//   'A'-'Z' -> 0..25, 'a'-'z' -> 26..51, '0'-'9' -> 52..61, '+' -> 62, '/' -> 63.
// Every other byte, including the padding character '=', is marked -1.
static const int BASE64_CHARS[256] = {
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
    52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
    -1, 0, 1, 2,  3, 4, 5, 6,  7, 8, 9,10, 11,12,13,14,
    15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
    -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
    41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
};
// Number of entries in BASE64_CHARS (256); used as the exclusive upper bound
// when validating a table index.
static const int MAX_CHAR_INDEX = sizeof(BASE64_CHARS) / sizeof(BASE64_CHARS[0]);

// Advances the reader to the next character that matters to the base64
// decoder, skipping every character that is neither '=' nor part of the
// base64 alphabet.
//
// On success returns true with:
//   ch    - the character found ('=' for padding)
//   value - its 6-bit value from BASE64_CHARS (0 for '=')
// and the reader positioned just past that character.
//
// Returns false, with ch set to ' ' and value set to 0, when the reader is
// exhausted before such a character is found.
inline bool next_char64(AbstractCharReader *reader,
                        char &ch,
                        unsigned char &value)
{
    for (; !reader->atEnd(); reader->forward()) {
        ch = reader->currentChar();

        // Padding is reported to the caller with a zero value.
        if (ch == '=') {
            value = 0;
            reader->forward();
            return true;
        }

        // Mask to 0..255 so a negative char cannot index before the table.
        const unsigned int table_pos = (unsigned)(ch & 0xff);
        if (table_pos >= MAX_CHAR_INDEX) {
          assert(!"should not be possible but just to be safe");
          break;
        }

        const int sextet = BASE64_CHARS[table_pos];
        if (sextet >= 0) {
            value = sextet;
            reader->forward();
            return true;
        }
        // Not a base64 character: fall through and let the loop skip it.
    }

    ch = ' ';
    value = 0;
    return false;
}

// Decodes encoded words of the form "=?charset?method?data?=" found in a
// header value.
//
// The header lines are joined into a single string and scanned left to
// right.  Each encoded word whose method is "q" (quoted-printable, with
// '_' converted to a space) or "b" (base64) is replaced in place by its
// decoded text.  Encoded words using any other method are left untouched.
// The charset part is matched but not otherwise used.
//
// Returns a new multi-line string built from the decoded text when at
// least one replacement was made; otherwise returns header_value itself.
CRef<AbstractMultiLineString> MimeDecoder::decodeHeaderString(const CRef<AbstractMultiLineString> &header_value)
{
  static const string token_expr("[^] \t()<>@,;:\"/[?.=]+");
  static const string encoded_word_expr = string("=\\?(") + token_expr + ")\\?(" + token_expr + ")\\?([^\\?]+)\\?=";

  string text;
  header_value->join(text);

  if (is_debug) {
    cerr << "decodeHeader: before: " << text << endl;
  }

  bool changed = false;
  RegularExpression::MatchData whole_match;
  string method, encoded_word;
  RegularExpression encoded_word_regex(encoded_word_expr, 4, false, true);
  string::size_type offset = 0;
  while ((offset < text.length()) && encoded_word_regex.match(text.c_str() + offset)) {
    // The regex was run against text.c_str() + offset, so the positions in
    // whole_match are relative to search_start, not to the start of text.
    const string::size_type search_start = offset;
    encoded_word_regex.getMatch(0, whole_match);
    encoded_word_regex.getMatch(2, method);
    encoded_word_regex.getMatch(3, encoded_word);
    method = trim(to_lower(method));

    // Fresh per iteration so the result never depends on whether join()
    // appends to or overwrites its argument.
    string decoded_text;
    bool valid_method = true;
    if (method == "q") {
      Ref<MultiLineString> word(new MultiLineString());
      word->addLine(encoded_word);
      CRef<AbstractMultiLineString> decoded_lines(unquoteString(word, true));
      decoded_lines->join(decoded_text);
    } else if (method == "b") {
      Ref<MultiLineString> word(new MultiLineString());
      word->addLine(encoded_word);
      CRef<AbstractMultiLineString> decoded_lines(decodeString(word));
      decoded_lines->join(decoded_text);
    } else {
      valid_method = false;
    }

    if (valid_method) {
      const string::size_type match_start = search_start + whole_match.start_pos;
      text.replace(match_start, whole_match.end_pos - whole_match.start_pos, decoded_text);
      // Resume scanning immediately after the decoded text.  Advancing by
      // the decoded length alone (without start_pos) would restart the
      // search inside the text just inserted and could decode it twice.
      offset = match_start + decoded_text.length();
      changed = true;
    } else {
      // Unknown method: keep the encoded word as-is and skip past it.
      offset = search_start + whole_match.end_pos;
    }
  }

  if (is_debug) {
    if (changed) {
      cerr << "decodeHeader: after: " << text << endl;
    } else {
      cerr << "decodeHeader: no change" << endl;
    }
  }

  if (changed) {
    return MultiLineString::fromText(text);
  } else {
    return header_value;
  }
}


// Returns true when the first line of the given Content-Transfer-Encoding
// value begins (case-insensitively) with "quoted-printable".  A null
// reference or a value with no lines yields false.
bool MimeDecoder::isQuotedPrintable(const CRef<AbstractMultiLineString> &contentTransferEncoding)
{
    const bool has_first_line = !contentTransferEncoding.isNull() && contentTransferEncoding->lineCount() >= 1;
    return has_first_line && starts_with(to_lower(contentTransferEncoding->line(0)), "quoted-printable");
}

// Returns true when the first line of the given Content-Transfer-Encoding
// value begins (case-insensitively) with "base64".  A null reference or a
// value with no lines yields false.
bool MimeDecoder::isBase64(const CRef<AbstractMultiLineString> &contentTransferEncoding)
{
    const bool has_first_line = !contentTransferEncoding.isNull() && contentTransferEncoding->lineCount() >= 1;
    return has_first_line && starts_with(to_lower(contentTransferEncoding->line(0)), "base64");
}

// Decodes a message body according to its "content-transfer-encoding"
// header.  Only bodies whose headers report a text type are touched:
// quoted-printable bodies go through unquoteString() and base64 bodies
// through decodeString().  In every other case the body is returned as-is.
CRef<AbstractMultiLineString> MimeDecoder::decodeText(const MessageHeaderList *headers,
                                                             const CRef<AbstractMultiLineString> &body)
{
    if (headers->isTextType()) {
        CRef<AbstractMultiLineString> encoding(headers->header("content-transfer-encoding"));

        if (isQuotedPrintable(encoding)) {
            if (is_debug) {
                cerr << "UNQUOTING BODY" << endl;
            }
            return unquoteString(body);
        }

        if (isBase64(encoding)) {
            if (is_debug) {
                cerr << "DECODING BODY" << endl;
            }
            return decodeString(body);
        }
    }

    return body;
}

// Decodes quoted-printable text.
//
// For every input line:
//   - '=' as the last character is a soft line break: the line is joined
//     with the following one instead of ending an output line;
//   - '=' followed by two hex digits is replaced by the byte they encode
//     (passed through safe_char());
//   - any other '=' is dropped;
//   - '_' becomes a space when convert_underscores is true;
//   - everything else is copied unchanged.
//
// Whatever is left in the working buffer after the last line is always
// added as a final output line.  If no '=' (and, when enabled, no '_') was
// seen at all, the original messageBody is returned instead of the copy.
CRef<AbstractMultiLineString> MimeDecoder::unquoteString(const CRef<AbstractMultiLineString> &messageBody,
                                                                bool convert_underscores)
{
    Ref<MultiLineString> decoded(new MultiLineString());
    bool modified = false;

    string pending;
    pending.reserve(10240);

    for (int line_no = 0; line_no < messageBody->lineCount(); ++line_no) {
        const string &src = messageBody->line(line_no);
        const string::size_type len = src.length();
        bool soft_break = false;

        string::size_type pos = 0;
        while (pos < len) {
            const char current = src[pos];

            if (current != '=') {
                if (convert_underscores && current == '_') {
                    pending += ' ';
                    modified = true;
                } else {
                    pending += current;
                }
                ++pos;
                continue;
            }

            // Any '=' counts as a change, even one that is simply dropped.
            modified = true;
            if (pos + 1 == len) {
                // soft line break - the next line continues this one
                soft_break = true;
            } else if ((pos + 2 < len) && is_xdigit(src[pos + 1]) && is_xdigit(src[pos + 2])) {
                const char octet = (char)((char_value(src[pos + 1]) << 4) | char_value(src[pos + 2]));
                pending += safe_char(octet);
                pos += 2;   // consume the two hex digits as well
            }
            ++pos;
        }

        if (!soft_break) {
            decoded->addLine(pending);
            pending.erase();
        }
    }
    decoded->addLine(pending);

    if (modified) {
        return decoded;
    } else {
        return messageBody;
    }
}

// Base64-decodes an entire multi-line body and returns the decoded text
// as a new multi-line string built with MultiLineString::fromText().
CRef<AbstractMultiLineString> MimeDecoder::decodeString(const CRef<AbstractMultiLineString> &messageBody)
{
    Ref<AbstractCharReader> body_reader(messageBody->createCharReader());

    string decoded;
    decodeString(body_reader.ptr(), decoded);

    if (is_debug) {
        cerr << "DECODED TO " << decoded << endl;
    }

    return MultiLineString::fromText(decoded);
}

// Base64-decodes the characters supplied by reader and appends the
// resulting bytes (each passed through safe_char()) to decoded_string.
//
// Input is consumed four significant characters at a time via
// next_char64(); decoding stops as soon as a full group of four cannot be
// read.  A '=' in the third position yields one byte, a '=' in only the
// fourth position yields two bytes, otherwise three bytes are produced.
void MimeDecoder::decodeString(AbstractCharReader *reader,
                               string &decoded_string)
{
    char sym[4];
    unsigned char bits[4];

    reader->forward();
    for (;;) {
        // Read one group; stop at the first character that cannot be read.
        bool have_group = true;
        for (int k = 0; have_group && k < 4; ++k) {
            have_group = next_char64(reader, sym[k], bits[k]);
        }
        if (!have_group) {
            break;
        }

        const bool has_second_octet = (sym[2] != '=');
        const bool has_third_octet = has_second_octet && (sym[3] != '=');

        decoded_string += safe_char((bits[0] << 2) | (bits[1] >> 4));             // 6 + 2
        if (has_second_octet) {
            decoded_string += safe_char(((bits[1] & 0x0f) << 4) | (bits[2] >> 2));  // 4 + 4
        }
        if (has_third_octet) {
            decoded_string += safe_char(((bits[2] & 0x03) << 6) | bits[3]);         // 2 + 6
        }
    }
}

void MimeDecoder::decodeData(const CRef<AbstractMultiLineString> &messageBody,
                             Buffer<unsigned char> &decoded_data)
{
    Ref<AbstractCharReader> char_reader(messageBody->createCharReader());
    decodeData(char_reader.ptr(), decoded_data);
}

// Base64-decodes the characters supplied by reader and appends the
// resulting raw bytes to decoded_data.
//
// Input is consumed four significant characters at a time via
// next_char64(); decoding stops as soon as a full group of four cannot be
// read.  A '=' in the third position yields one byte, a '=' in only the
// fourth position yields two bytes, otherwise three bytes are produced.
void MimeDecoder::decodeData(AbstractCharReader *reader,
                             Buffer<unsigned char> &decoded_data)
{
    char sym[4];
    unsigned char bits[4];

    reader->forward();
    for (;;) {
        // Read one group; stop at the first character that cannot be read.
        bool have_group = true;
        for (int k = 0; have_group && k < 4; ++k) {
            have_group = next_char64(reader, sym[k], bits[k]);
        }
        if (!have_group) {
            break;
        }

        const bool has_second_octet = (sym[2] != '=');
        const bool has_third_octet = has_second_octet && (sym[3] != '=');

        decoded_data.append((bits[0] << 2) | (bits[1] >> 4));             // 6 + 2
        if (has_second_octet) {
            decoded_data.append(((bits[1] & 0x0f) << 4) | (bits[2] >> 2));  // 4 + 4
        }
        if (has_third_octet) {
            decoded_data.append(((bits[2] & 0x03) << 6) | bits[3]);         // 2 + 6
        }
    }
}
// syntax highlighted by Code2HTML, v. 0.9.1