///###////////////////////////////////////////////////////////////////////////
//
// Burton Computer Corporation
// http://www.burton-computer.com
// http://www.cooldevtools.com
// $Id: HashDataFile.cc 272 2007-01-06 19:37:27Z brian $
//
// Copyright (C) 2007 Burton Computer Corporation
// ALL RIGHTS RESERVED
//
// This program is open source software; you can redistribute it
// and/or modify it under the terms of the Q Public License (QPL)
// version 1.0. Use of this software in whole or in part, including
// linking it (modified or unmodified) into other programs is
// subject to the terms of the QPL.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// Q Public License for more details.
//
// You should have received a copy of the Q Public License
// along with this program; see the file LICENSE.txt.  If not, visit
// the Burton Computer Corporation or CoolDevTools web site
// QPL pages at:
//
//    http://www.burton-computer.com/qpl.html
//    http://www.cooldevtools.com/qpl.html
//

#ifdef USE_MMAP

#include <stdexcept>
#include <iostream>
#include <string>
#include <algorithm>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/mman.h>
#include "File.h"
#include "WordData.h"
#include "HashDataFile.h"

enum {
  SIZE_MULTIPLE = 1024,
  DEBUG_HASH = 1,
};

// table of primes indexed roughly by megabyte from 1 to 100 megs
static const int primes_by_meg[] = {
  // assumes 12 bytes per entry
     87359,     174761,     262139,     349519,     436889,   // 1 - 5 megs
    524287,     611657,     699037,     786431,     873787,   // 6 - 10 megs
    961189,    1048573,    1135951,    1223329,    1310719,   // 11 - 15 megs
   1398091,    1485479,    1572853,    1660231,    1747619,   // 16 - 20 megs
   1835003,    1922387,    2009759,    2097143,    2184509,   // 21 - 25 megs
   2271901,    2359267,    2446673,    2534057,    2621431,   // 26 - 30 megs
   2708821,    2796181,    2883577,    2970959,    3058343,   // 31 - 35 megs
   3145721,    3233093,    3320477,    3407857,    3495253,   // 36 - 40 megs
   3582629,    3670013,    3757393,    3844777,    3932153,   // 41 - 45 megs
   4019527,    4106897,    4194301,    4281679,    4369061,   // 46 - 50 megs
   4456433,    4543823,    4631203,    4718579,    4805959,   // 51 - 55 megs
   4893341,    4980727,    5068111,    5155489,    5242877,   // 56 - 60 megs
   5330251,    5417597,    5505023,    5592373,    5679769,   // 61 - 65 megs
   5767129,    5854543,    5941921,    6029299,    6116687,   // 66 - 70 megs
   6204071,    6291449,    6378787,    6466193,    6553577,   // 71 - 75 megs
   6640979,    6728347,    6815741,    6903121,    6990493,   // 76 - 80 megs
   7077883,    7165267,    7252649,    7340009,    7427353,   // 81 - 85 megs
   7514791,    7602151,    7689557,    7776931,    7864301,   // 86 - 90 megs
   7951693,    8039081,    8126453,    8213819,    8301217,   // 91 - 95 megs
   8388593,    8475947,    8563351,    8650727,    8738129,   // 96 - 100 megs
};

#define PRIMES_ARRAY_SIZE (sizeof(primes_by_meg) / sizeof(primes_by_meg[0]))

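// Returns the largest prime from the table above that does not exceed
// max_elements, falling back to the smallest prime when max_elements is
// smaller than every table entry.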
static int compute_divisor_for_max_elements(int max_elements)
{
  for (int i = PRIMES_ARRAY_SIZE - 1; i >= 0; --i) {
    int divisor = primes_by_meg[i];
    if (divisor <= max_elements) {
      return divisor;
    }
  }
  return primes_by_meg[0];
}

HashDataFile::HashDataFile(int num_headers,
                           HashDataFile::ulong_t size_in_bytes)
  : m_base(0)
{
  initialize(num_headers, size_in_bytes);
}

HashDataFile::~HashDataFile()
{
  close();
}

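// Computes the hash table geometry for a file of at most size_in_bytes:
// the divisor is a prime bucket count taken from the table above, the
// index limit adds room for the header records, and m_size is the
// resulting file size in bytes.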
void HashDataFile::initialize(int num_headers,
                              HashDataFile::ulong_t size_in_bytes)
{
  m_numHeaders = num_headers;
  int max_elements = (size_in_bytes / WordArray::ENTRY_SIZE) - m_numHeaders;
  m_divisor = compute_divisor_for_max_elements(max_elements);
  m_indexLimit = m_divisor + m_numHeaders;
  m_size = WordArray::ENTRY_SIZE * m_indexLimit;
  if (is_debug) {
    cerr << "HASH_DIVISOR " << m_divisor
         << " (0x" << hex << m_divisor << ")"
         << " INDEX_LIMIT " << dec << m_indexLimit
         << " SIZE " << m_size
         << endl;
  }
}

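// Zero-fills a newly created file so that every slot starts out empty,
// writing SIZE_MULTIPLE entries at a time.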
void HashDataFile::populateEmptyFile(int fd)
{
  if (is_debug) {
    cerr << "INITIALIZING NEW HASHFILE" << endl;
  }

  unsigned char zeros[SIZE_MULTIPLE * WordArray::ENTRY_SIZE];
  WordArray::fillNullBuffer(zeros, SIZE_MULTIPLE);

  for (int i = 0; i < m_indexLimit; i += SIZE_MULTIPLE) {
    ::write(fd, zeros,
            min(m_indexLimit - i, (HashDataFile::ulong_t)SIZE_MULTIPLE) * WordArray::ENTRY_SIZE);
  }
}

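// Opens the hash file, creating it if it does not exist and read_only is
// false.  An existing file must have a size that is a multiple of the
// entry size and must match the geometry computed by initialize(); a new
// file is zero-filled before being mapped into memory.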
bool HashDataFile::open(const string &filename,
                        bool read_only,
                        int create_mode)
{
  close();

  if (is_debug) {
    cerr << "OPEN HASHFILE " << filename << endl;
  }

  m_isReadOnly = read_only;
  m_createMode = create_mode;

  File data_file(filename);
  bool exists = data_file.isFile();

  m_isNewFile = !exists;

  if (read_only && !exists) {
    cerr << "error: hash file "
         << filename
         << " does not exist and cannot be created in read-only mode"
         << endl;
    return false;
  }

  if (exists) {
    unsigned long file_size = data_file.getSize();
    if ((file_size % WordArray::ENTRY_SIZE) != 0) {
      cerr << "error: hash file "
           << filename
           << " size not a multiple of "
           << WordArray::ENTRY_SIZE
           << " bytes"
           << endl;
      return false;
    }
    initialize(m_numHeaders, file_size);
    if (m_size != file_size) {
      cerr << "error: hash file "
           << filename
           << " size invalid "
           << file_size
           << " != "
           << m_size
           << endl;
      return false;
    }
  }

  if (is_debug) {
    cerr << "HASHFILE " << filename << " SIZE " << m_size << endl;
  }

  int flags = (read_only) ? O_RDONLY : O_RDWR;
  if (!exists) {
    flags |= O_CREAT;
  }

  int fd = ::open(filename.c_str(), flags, create_mode);
  if (fd < 0) {
    cerr << "error: unable to open database " << filename << ": " << strerror(errno) << endl;
    return false;
  }

  if (!exists) {
    populateEmptyFile(fd);
  }

  if (!mmapFile(fd, read_only)) {
    cerr << "error: unable to mmap file " << filename << ": " << strerror(errno) << endl;
    close();
    return false;
  }

  m_filename = filename;

  return true;
}

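// Maps the open file into memory, read-only or read-write as requested,
// and points the word array at the mapping.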
bool HashDataFile::mmapFile(int fd,
                            bool read_only)
{

  if (is_debug) {
    cerr << "MMAPPING HASHFILE" << endl;
  }

  int flags = (read_only) ? PROT_READ : (PROT_READ | PROT_WRITE);
  m_base = (char *)mmap(0, m_size, flags, MAP_SHARED, fd, 0);
  if (m_base == (char *)MAP_FAILED) {
    m_base = 0;
    return false;
  }

  m_array.reset(m_base, m_indexLimit);
  return true;
}

void HashDataFile::close()
{
  if (m_base) {
    ::munmap(m_base, m_size);
    m_array.reset(0, 0);
    m_base = 0;
    m_filename.erase();
  }
}

bool HashDataFile::isOpen()
{
  return m_base != 0;
}

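// Writes a record directly into the given slot, bypassing hashing.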
void HashDataFile::writeRecord(int record_number,
                               HashDataFile::ulong_t key,
                               const WordData &word_data)
{
  assert(m_base);
  assert(record_number >= 0);
  assert(record_number < m_indexLimit);

  m_array.writeWord(record_number, key, word_data);
}

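// Reads the record stored in the given slot, bypassing hashing.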
void HashDataFile::readRecord(int record_number,
                              HashDataFile::ulong_t &key,
                              WordData &word_data)
{
  assert(m_base);
  assert(record_number >= 0);
  assert(record_number < m_indexLimit);

  m_array.readWord(record_number, key, word_data);
}

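// Stores word_data under key using open addressing with linear probing:
// starting at the slot computed from the key, probe until a slot holding
// the same key or an empty slot is found.  When an auto cleaner is
// installed, slots whose data it marks as deletable may be reclaimed
// along the way.  Returns false only if every slot has been probed
// without finding a home for the record.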
bool HashDataFile::write(ulong_t key,
                         const WordData &word_data)
{
  assert(m_base);

  int index = computeIndexForKey(key);
  for (int i = 0; i < m_divisor; ++i) {
    ulong_t old_key = m_array.readKey(index);
    if (old_key == key || old_key == 0) {
      m_array.writeWord(index, key, word_data);
      return true;
    }

    if (m_autoCleaner.isNotNull()) {
      WordData old_data;
      m_array.readWord(index, old_key, old_data);
      if (m_autoCleaner->shouldDelete(old_data)) {
        m_array.writeWord(index, key, word_data);
        if (is_debug) {
          cerr << "HashDataFile: overwriting old word for new one: "
               << " index " << index
               << " new_key " << key
               << " old_key " << old_key
               << " total " << old_data.totalCount()
               << " good " << old_data.goodCount()
               << " spam " << old_data.spamCount()
               << " age " << old_data.age()
               << endl;
        }
        return true;
      }
    }

    // skip this slot since it contains a collision
    index = m_numHeaders + ((index - m_numHeaders + 1) % m_divisor);
  }

  return false;
}

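// Looks up key using the same linear probing sequence as write(): a slot
// holding the key is a hit, an empty slot means the key is not in the file.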
bool HashDataFile::read(ulong_t key,
                        WordData &word_data)
{
  assert(m_base);

  ulong_t old_key = 0;
  int index = computeIndexForKey(key);
  for (int i = 0; i < m_divisor; ++i) {
    m_array.readWord(index, old_key, word_data);
    if (old_key == key) {
      // slot matches our key so return its value
      return true;
    }

    if (old_key == 0) {
      // slot is empty - this key is not in file
      return false;
    }

    // skip this slot since it contains a collision
    index = m_numHeaders + ((index - m_numHeaders + 1) % m_divisor);
  }

  return false;
}

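// Copies the header records slot-for-slot into another data file.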
void HashDataFile::copyHeadersToFile(HashDataFile &file)
{
  ulong_t key = 0;
  WordData word_data;
  for (int i = 0; i < m_numHeaders; ++i) {
    readRecord(i, key, word_data);
    file.writeRecord(i, key, word_data);
  }
}

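// Re-hashes every non-empty data record into another data file, which may
// have a different size and divisor.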
void HashDataFile::copyContentsToFile(HashDataFile &file)
{
  ulong_t key = 0;
  WordData word_data;
  for (int i = m_numHeaders; i < m_indexLimit; ++i) {
    readRecord(i, key, word_data);
    if (key != 0) {
      file.write(key, word_data);
    }
  }
}

#endif // USE_MMAP
