///###////////////////////////////////////////////////////////////////////////
//
// Burton Computer Corporation
// http://www.burton-computer.com
// http://www.cooldevtools.com
// $Id: FrequencyDBImpl_hash.cc 272 2007-01-06 19:37:27Z brian $
//
// Copyright (C) 2007 Burton Computer Corporation
// ALL RIGHTS RESERVED
//
// This program is open source software; you can redistribute it
// and/or modify it under the terms of the Q Public License (QPL)
// version 1.0. Use of this software in whole or in part, including
// linking it (modified or unmodified) into other programs is
// subject to the terms of the QPL.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// Q Public License for more details.
//
// You should have received a copy of the Q Public License
// along with this program; see the file LICENSE.txt. If not, visit
// the Burton Computer Corporation or CoolDevTools web site
// QPL pages at:
//
// http://www.burton-computer.com/qpl.html
// http://www.cooldevtools.com/qpl.html
//
#ifdef USE_MMAP
#include <stdexcept>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/mman.h>
#include "hash.h"
#include "CleanupManager.h"
#include "DatabaseConfig.h"
#include "LockFD.h"
#include "WordData.h"
#include "FrequencyDBImpl_hash.h"
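
// File name suffixes: SEARCH_SUFFIX names the live hash file, TEMP_SUFFIX the
// rebuilt copy produced during cleanup, and RENAME_SUFFIX the backup used
// while the two are swapped.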
const char *FrequencyDBImpl_hash::SEARCH_SUFFIX("hash");
static const string TEMP_SUFFIX("rehash");
static const string RENAME_SUFFIX("bak");
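
// Layout constants for the hash data file: slots 0..INDEX_OFFSET-1 are
// reserved for header records (the file-type magic and the message counts);
// regular terms are stored from slot INDEX_OFFSET onward. HASH32_FILE_KEY is
// the magic value identifying a 32-bit hash database, and DEBUG_HASH enables
// key tracing in computeKeyForWord.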
enum {
  INDEX_OFFSET = 5,
  FILE_TYPE_INDEX = 0,
  COUNT_INDEX = 1,
  DEBUG_HASH = 0,
  HASH32_FILE_KEY = 0x0bcc0001,
};
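
// Hash a term into a 32-bit key using Bob Jenkins' lookup hash.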
HashDataFile::ulong_t hash_string(const string &str)
{
  return jenkins_hash((ub1*)str.c_str(), (ub4)str.length(), 0);
}
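
// Factory used to create a hash-file backed FrequencyDBImpl sized according
// to the configured target database size.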
FrequencyDBImpl *FrequencyDBImpl_hash::factory(const DatabaseConfig *config)
{
  return new FrequencyDBImpl_hash(config->targetSizeMB());
}
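
// _size is the target database size in megabytes; it is converted to bytes
// for the underlying HashDataFile. When USE_HASH_AUTO_CLEAN is defined a
// default CleanupManager is installed so stale terms are purged automatically.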
FrequencyDBImpl_hash::FrequencyDBImpl_hash(int _size)
  : m_cursor(0),
    m_dataFile(INDEX_OFFSET, _size * 1024 * 1024)
{
#ifdef USE_HASH_AUTO_CLEAN
  Ptr<CleanupManager> cleaner(new CleanupManager);
  cleaner->addLimit(2, 14);
  cleaner->addLimit(100, 30);
  cleaner->addLimit(10000, 180);
  cleaner->addLimit(100000, 360);
  m_dataFile.setAutoClean(cleaner.release());
#endif
}

FrequencyDBImpl_hash::~FrequencyDBImpl_hash()
{
  close();
}
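
// Opens (or creates) the hash file named by arg_filename plus SEARCH_SUFFIX.
// New files get their header records initialized; existing files are rejected
// if the file-type magic does not match HASH32_FILE_KEY.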
bool FrequencyDBImpl_hash::open(const string &arg_filename,
                                bool read_only,
                                int create_mode)
{
  close();

  if (is_debug) {
    cerr << "OPEN DATABASE " << arg_filename << endl;
  }

  File db_file(arg_filename);
  db_file.setSuffix(SEARCH_SUFFIX);
  if (!m_dataFile.open(db_file.getPath(), read_only, create_mode)) {
    return false;
  }

  if (m_dataFile.isNewFile()) {
    initializeHeaderRecords();
  } else if (!validateHeaderRecords()) {
    m_dataFile.close();
    cerr << "ERROR: Unexpected header record" << endl;
    return false;
  }

  if (is_debug) {
    cerr << "DATABASE OPENED " << db_file.getPath() << endl;
  }
  return true;
}
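
// Stamps a newly created file with the HASH32_FILE_KEY magic in the
// file-type header slot.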
void FrequencyDBImpl_hash::initializeHeaderRecords()
{
  HashDataFile::ulong_t key = HASH32_FILE_KEY;
  WordData word_data;
  word_data.reset(HASH32_FILE_KEY, 0, 0);
  m_dataFile.writeRecord(FILE_TYPE_INDEX, key, word_data);
}
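
// Returns true if the file-type header slot contains the expected magic key.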
bool FrequencyDBImpl_hash::validateHeaderRecords()
{
  HashDataFile::ulong_t key = 0;
  WordData word_data;
  m_dataFile.readRecord(FILE_TYPE_INDEX, key, word_data);
  return key == HASH32_FILE_KEY;
}

void FrequencyDBImpl_hash::close()
{
  m_cursor = 0;
  m_dataFile.close();
}
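
// Nothing to do here: the underlying (memory-mapped) HashDataFile updates its
// backing file in place, so there is no separate buffer to flush.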
void FrequencyDBImpl_hash::flush()
{
}
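
// Stores the counts for a term. The special COUNT_WORD record goes into its
// reserved header slot; every other term is hashed to a 32-bit key and written
// into the table, throwing if the file has no room left for it.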
void FrequencyDBImpl_hash::writeWord(const string &word,
                                     const WordData &counts)
{
  assert(m_dataFile.isOpen());

  if (word == FrequencyDB::COUNT_WORD) {
    m_dataFile.writeRecord(COUNT_INDEX, 0, counts);
  } else {
    HashDataFile::ulong_t key = computeKeyForWord(word);
    if (!m_dataFile.write(key, counts)) {
      throw runtime_error(string("no room in hash file for term ") + word);
    }
  }

  if (is_debug) {
    cerr << "wrote word " << word
         << " good " << counts.goodCount()
         << " spam " << counts.spamCount()
         << " total " << counts.totalCount()
         << endl;
  }
}
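
// Looks up the counts for a term, returning true if it was found. The
// COUNT_WORD record is read from its header slot and is always reported found.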
bool FrequencyDBImpl_hash::readWord(const string &word,
                                    WordData &counts)
{
  assert(m_dataFile.isOpen());

  bool answer = false;
  if (word == FrequencyDB::COUNT_WORD) {
    HashDataFile::ulong_t ignored;
    m_dataFile.readRecord(COUNT_INDEX, ignored, counts);
    answer = true;
  } else {
    HashDataFile::ulong_t key = computeKeyForWord(word);
    answer = m_dataFile.read(key, counts);
  }

  if (is_debug) {
    cerr << "read word " << word
         << " found? " << (answer ? "yes" : "no")
         << " good " << counts.goodCount()
         << " spam " << counts.spamCount()
         << " total " << counts.totalCount()
         << endl;
  }
  return answer;
}
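
// The hash file stores only 32-bit keys, not the original terms, so iteration
// reconstructs a synthetic name of the form "I0xXXXXXXXX" from each key.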
string FrequencyDBImpl_hash::getWordForIndex(int index,
                                             HashDataFile::ulong_t key)
{
  if (index == COUNT_INDEX) {
    return FrequencyDB::COUNT_WORD;
  } else {
    char buffer[128];
    sprintf(buffer, "I0x%08x", (unsigned int)key);
    return buffer;
  }
}
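
// Maps a term to its 32-bit key: the COUNT_WORD header record needs no key,
// synthetic "I0x..." names produced by getWordForIndex are parsed back
// directly, and everything else is hashed.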
HashDataFile::ulong_t FrequencyDBImpl_hash::computeKeyForWord(const string &word)
{
  HashDataFile::ulong_t key = 0;
  if (word == FrequencyDB::COUNT_WORD) {
    // key not used for count
  } else if (starts_with(word, "I0x")) {
    unsigned int scanned = 0;
    sscanf(word.c_str() + 3, "%x", &scanned);
    key = scanned;
  } else {
    key = hash_string(word);
  }

  if (DEBUG_HASH && is_debug) {
    cerr << " KEY 0x" << hex << key
         << " WORD " << word
         << endl;
  }
  return key;
}
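
// Begins a scan of the database. The first "word" returned is always the
// COUNT_WORD record; the cursor is then positioned at the first regular slot.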
bool FrequencyDBImpl_hash::firstWord(string &word,
                                     WordData &counts)
{
  assert(m_dataFile.isOpen());

  HashDataFile::ulong_t key = 0;
  word = FrequencyDB::COUNT_WORD;
  m_dataFile.readRecord(COUNT_INDEX, key, counts);
  m_cursor = INDEX_OFFSET;
  return true;
}
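
// Advances the cursor to the next occupied slot, skipping empty records, and
// returns false once the end of the file is reached.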
bool FrequencyDBImpl_hash::nextWord(string &word,
                                    WordData &counts)
{
  assert(m_dataFile.isOpen());

  HashDataFile::ulong_t key = 0;
  for (; m_cursor < m_dataFile.indexLimit(); ++m_cursor) {
    m_dataFile.readRecord(m_cursor, key, counts);
    if (counts.totalCount() > 0) {
      word = getWordForIndex(m_cursor, key);
      ++m_cursor;
      return true;
    }
  }
  return false;
}

string FrequencyDBImpl_hash::getDatabaseType() const
{
  return "Hashed-array";
}
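
// Removes stale terms by rewriting the database: live records are copied into
// a new "rehash" file, which is then swapped into place of the original.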
void FrequencyDBImpl_hash::sweepOutOldTerms(const CleanupManager &cleanman)
{
  assert(m_dataFile.isOpen());
  assert(!m_dataFile.isReadOnly());

  File live_file(m_dataFile.filename());

  File rehash_file(live_file);
  rehash_file.setSuffix(TEMP_SUFFIX);

  File temp_file(live_file);
  temp_file.setSuffix(RENAME_SUFFIX);

  copyToNewDataFile(cleanman, rehash_file.getPath());
  swapDataFilesAndReopen(live_file.getPath(), rehash_file.getPath(), temp_file.getPath());
  assert(m_dataFile.isOpen());
}
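
// Copies every record worth keeping into a fresh hash file at
// rehash_filename: empty slots and terms the CleanupManager marks for
// deletion are skipped.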
void FrequencyDBImpl_hash::copyToNewDataFile(const CleanupManager &cleanman,
                                             const string &rehash_filename)
{
  assert(m_dataFile.isOpen());

  File rehash_file(rehash_filename);
  if (rehash_file.isFile()) {
    if (is_debug) {
      cerr << "sweepOutJunk: removing existing rehash file " << rehash_filename << endl;
    }
    rehash_file.remove();
  }

  HashDataFile temp_file(INDEX_OFFSET, m_dataFile.size());
  temp_file.open(rehash_filename, false, m_dataFile.createMode());
  m_dataFile.copyHeadersToFile(temp_file);

  HashDataFile::ulong_t key;
  WordData counts;

  // now copy all non-empty keys with acceptable junk_count and max_age
  for (HashDataFile::ulong_t i = INDEX_OFFSET; i < m_dataFile.indexLimit(); ++i) {
    m_dataFile.readRecord(i, key, counts);
    if (key != 0) {
      if (counts.totalCount() == 0 || cleanman.shouldDelete(counts)) {
        if (is_debug) {
          cerr << "sweepOutJunk: removing term " << getWordForIndex(i, key)
               << " with total count " << counts.totalCount()
               << " and age " << counts.age()
               << endl;
        }
      } else {
        temp_file.write(key, counts);
      }
    }
  }

  temp_file.close();
}
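
// Swaps the rebuilt file into place: the live file is renamed to a backup,
// the rehash file takes its name, and the database is reopened. If the second
// rename fails the backup is restored before the exception is rethrown; on
// success the backup is deleted.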
void FrequencyDBImpl_hash::swapDataFilesAndReopen(const string &live_filename,
                                                  const string &rehash_filename,
                                                  const string &temp_filename)
{
  assert(m_dataFile.isOpen());

  bool read_only = m_dataFile.isReadOnly();
  int create_mode = m_dataFile.createMode();
  m_dataFile.close();

  if (is_debug) {
    cerr << "renaming " << live_filename << " to " << temp_filename << endl;
  }
  File renamer;
  renamer.setPath(live_filename);
  renamer.rename(temp_filename);

  try {
    if (is_debug) {
      cerr << "renaming " << rehash_filename << " to " << live_filename << endl;
    }
    renamer.setPath(rehash_filename);
    renamer.rename(live_filename);
  } catch (...) {
    if (is_debug) {
      cerr << "recovery: renaming " << temp_filename << " to " << live_filename << endl;
    }
    renamer.setPath(temp_filename);
    renamer.rename(live_filename);
    throw;
  }

  if (is_debug) {
    cerr << "opening new hash file" << endl;
  }
  if (!m_dataFile.open(live_filename, read_only, create_mode)) {
    throw runtime_error("unable to open new file");
  }

  if (is_debug) {
    cerr << "deleting original hash file " << temp_filename << endl;
  }
  renamer.setPath(temp_filename);
  renamer.remove();

  assert(m_dataFile.isOpen());
}
#endif // USE_MMAP