///###////////////////////////////////////////////////////////////////////////
//
// Burton Computer Corporation
// http://www.burton-computer.com
// http://www.cooldevtools.com
// $Id: FrequencyDB.cc 272 2007-01-06 19:37:27Z brian $
//
// Copyright (C) 2007 Burton Computer Corporation
// ALL RIGHTS RESERVED
//
// This program is open source software; you can redistribute it
// and/or modify it under the terms of the Q Public License (QPL)
// version 1.0. Use of this software in whole or in part, including
// linking it (modified or unmodified) into other programs is
// subject to the terms of the QPL.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// Q Public License for more details.
//
// You should have received a copy of the Q Public License
// along with this program; see the file LICENSE.txt. If not, visit
// the Burton Computer Corporation or CoolDevTools web site
// QPL pages at:
//
// http://www.burton-computer.com/qpl.html
// http://www.cooldevtools.com/qpl.html
//
#include <unistd.h>
#include <fcntl.h>
#include "DatabaseConfig.h"
#include "File.h"
#include "InterruptedException.h"
#include "WordData.h"
#include "DatabaseConfig.h"
#include "FrequencyDBImpl.h"
#include "FrequencyDBImpl_null.h"
#include "FrequencyDB.h"
/// defined here to avoid having an otherwise empty WordData.cc file - yeah I know
// NOTE(review): presumably the date stamp refreshed by WordData::setTodayDate(),
// which open() calls -- confirm against WordData.h.
unsigned long WordData::s_today = 0;
// Prefix prepended to a message digest to build the key under which that
// message's own counts are stored (see getMessageCount/addMessage/removeMessage).
static const string DIGEST_PREFIX("__MD5__");
// Key of the record that holds the good/spam message totals
// (see getMessageCounts/getTotalMessageCount).
const string FrequencyDB::COUNT_WORD("__COUNT__");
// Octal permission bits (rw for everyone vs. rw for the owner only).
// NOTE(review): presumably passed as the creation mode by the concrete
// database implementations -- not used in this file; confirm there.
const int FrequencyDBImpl::SHARED_DB_MODE = 0666;
const int FrequencyDBImpl::PRIVATE_DB_MODE = 0600;
/// Scope guard used by the FrequencyDB operations in this file: it marks
/// the database busy for as long as the guard lives and, on scope exit,
/// clears the busy flag and lets the database raise any interrupt that was
/// requested in the meantime.
class InterruptTest
{
public:
    /// Flags db as busy.  The pointer is stored (not owned) and is
    /// dereferenced again by the destructor, so db must outlive the guard.
    explicit InterruptTest(FrequencyDB *db)
        : m_db(db)
    {
        m_db->setBusy(true);
    }

    /// Clears the busy flag, then calls throwOnInterrupt(), which throws
    /// InterruptedException when an interrupt is pending.
    ///
    /// Destructors are implicitly noexcept from C++11 on; without the
    /// explicit noexcept(false) the exception would end in
    /// std::terminate() instead of reaching the caller.  The specifier is
    /// conditional so that pre-C++11 compilers still accept this file.
    ///
    /// NOTE(review): if this runs while another exception is already
    /// unwinding the stack and an interrupt is pending, the second throw
    /// still terminates the program.
    ~InterruptTest()
#if __cplusplus >= 201103L
        noexcept(false)
#endif
    {
        m_db->setBusy(false);
        m_db->throwOnInterrupt();
    }

private:
    // Not copyable: a copy would run the destructor logic a second time.
    // Declared but intentionally left undefined.
    InterruptTest(const InterruptTest &);
    InterruptTest &operator=(const InterruptTest &);

    FrequencyDB *m_db;
};
/// Builds a database front end bound to the given configuration.
///
/// No real database is opened here: the object starts idle, with no
/// interrupt pending, and backed by a FrequencyDBImpl_null placeholder
/// until open() installs an implementation obtained from config.
FrequencyDB::FrequencyDB(const DatabaseConfig *config)
    : m_config(config),
      m_isInterrupted(false),
      m_isBusy(false),
      m_db(new FrequencyDBImpl_null)
{
}
/// Tears the object down by running close(), which replaces the current
/// implementation with a FrequencyDBImpl_null placeholder.
FrequencyDB::~FrequencyDB()
{
    this->close();
}
/// Asks the database to abort the operation that is currently in progress.
///
/// @return true when there is nothing to schedule (an interrupt is already
///         pending, or no operation is in progress); false when an
///         interrupt has just been scheduled, in which case a notice is
///         written to cerr.
bool FrequencyDB::requestInterrupt()
{
    const bool nothing_to_schedule = m_isInterrupted || !m_isBusy;
    if (!nothing_to_schedule) {
        m_isInterrupted = true;
        cerr << "INTERRUPT SCHEDULED: TRY AGAIN AFTER SEVERAL SECONDS IF SHUTDOWN FAILS" << endl;
    }
    return nothing_to_schedule;
}
/// Delivers a pending interrupt, if any.
///
/// The pending flag is cleared before throwing, so a single request
/// produces a single exception.
///
/// @throws InterruptedException when an interrupt had been scheduled.
void FrequencyDB::throwOnInterrupt()
{
    if (!m_isInterrupted) {
        return;
    }

    m_isInterrupted = false;
    throw InterruptedException();
}
/// (Re)opens the database described by the configuration.
///
/// Whatever was open before is closed first and WordData's notion of
/// "today" is refreshed.  If the configuration yields no implementation,
/// the FrequencyDBImpl_null placeholder is put back.
///
/// @param read_only forwarded to DatabaseConfig::createDatabaseImpl().
/// @return true when an implementation was installed, false otherwise.
bool FrequencyDB::open(bool read_only)
{
    close();
    WordData::setTodayDate();

    m_db.set(m_config->createDatabaseImpl(read_only));

    const bool opened = !m_db.isNull();
    if (!opened) {
        m_db.set(new FrequencyDBImpl_null);
    }
    return opened;
}
/// Drops the current implementation and installs a FrequencyDBImpl_null
/// placeholder in its place.
void FrequencyDB::close()
{
    // Discard any pending interrupt first: the guard's destructor would
    // otherwise throw on the way out of this function.
    m_isInterrupted = false;

    InterruptTest guard(this);
    m_db.set(new FrequencyDBImpl_null);
}
/// Forwards a flush request to the active implementation while the
/// database is marked busy.
void FrequencyDB::flush()
{
    assert(m_db.get());

    InterruptTest guard(this);
    m_db->flush();
}
/// Forwards beginTransaction() to the active implementation while the
/// database is marked busy.
void FrequencyDB::beginTransaction()
{
    assert(m_db.get());

    InterruptTest guard(this);
    m_db->beginTransaction();
}
/// Forwards endTransaction() to the active implementation while the
/// database is marked busy.
///
/// @param commit passed through unchanged to the implementation.
void FrequencyDB::endTransaction(bool commit)
{
    assert(m_db.get());

    InterruptTest guard(this);
    m_db->endTransaction(commit);
}
void FrequencyDB::setWordCounts(const string &word,
int good_count,
int spam_count)
{
assert(m_db.get());
assert(good_count >= 0 && spam_count >= 0);
WordData counts(good_count, spam_count);
counts.adjustDate();
InterruptTest int_test(this);
m_db->writeWord(word, counts);
}
void FrequencyDB::touchWord(const string &word)
{
assert(m_db.get());
InterruptTest int_test(this);
WordData counts;
if (m_db->readWord(word, counts)) {
counts.adjustDate();
m_db->writeWord(word, counts);
}
}
void FrequencyDB::addWord(const string &word,
int good_count,
int spam_count)
{
assert(m_db.get());
InterruptTest int_test(this);
WordData counts;
m_db->readWord(word, counts);
counts.adjustGoodCount(good_count);
counts.adjustSpamCount(spam_count);
counts.adjustDate();
m_db->writeWord(word, counts);
}
void FrequencyDB::addWord(const string &word,
int good_count,
int spam_count,
unsigned long flags)
{
assert(m_db.get());
InterruptTest int_test(this);
WordData counts;
bool exists = m_db->readWord(word, counts);
counts.adjustGoodCount(good_count);
counts.adjustSpamCount(spam_count);
if (exists) {
counts.adjustDate();
} else {
counts.setFlags(flags);
}
m_db->writeWord(word, counts);
}
/// Reads the record stored for a word.
///
/// @param word   key to look up.
/// @param counts receives the record; filled in by the implementation.
/// @return whatever the implementation's readWord() reports.
bool FrequencyDB::readWord(const string &word,
                           WordData &counts)
{
    assert(m_db.get());

    InterruptTest guard(this);
    const bool was_read = m_db->readWord(word, counts);
    return was_read;
}
/// Subtracts the given counts from a word record by delegating to the
/// three-argument addWord() with both counts negated.
///
/// @param word       key to update.
/// @param good_count amount to subtract from the good count.
/// @param spam_count amount to subtract from the spam count.
void FrequencyDB::removeWord(const string &word,
                             int good_count,
                             int spam_count)
{
    assert(m_db.get());

    InterruptTest guard(this);

    const int good_delta = -good_count;
    const int spam_delta = -spam_count;
    addWord(word, good_delta, spam_delta);
}
void FrequencyDB::getWordCounts(const string &word,
int &good_count,
int &spam_count)
{
WordData counts;
readWord(word, counts);
good_count = counts.goodCount();
spam_count = counts.spamCount();
}
/// Fetches the message totals, which are kept as the counts of the
/// special COUNT_WORD record.
///
/// @param good_message_count receives the good count of COUNT_WORD.
/// @param spam_message_count receives the spam count of COUNT_WORD.
void FrequencyDB::getMessageCounts(int &good_message_count,
                                   int &spam_message_count)
{
    getWordCounts(COUNT_WORD,
                  good_message_count,
                  spam_message_count);
}
/// @return the sum of the good and spam counts stored under COUNT_WORD.
int FrequencyDB::getTotalMessageCount()
{
    InterruptTest guard(this);

    int good_total = 0;
    int spam_total = 0;
    getWordCounts(COUNT_WORD, good_total, spam_total);

    return good_total + spam_total;
}
int FrequencyDB::getMessageCount(const Message &msg,
bool &is_spam)
{
assert(m_db.get());
InterruptTest int_test(this);
WordData counts;
if (!m_db->readWord(DIGEST_PREFIX + msg.getDigest(), counts)) {
is_spam = false;
return 0;
}
assert(counts.goodCount() >= 0 && counts.spamCount() >= 0);
assert(!(counts.goodCount() > 0 && counts.spamCount() > 0));
is_spam = (counts.spamCount() > 0);
return is_spam ? counts.spamCount() : counts.goodCount();
}
/// Tells whether a message has been recorded at least once.
///
/// @param msg     message to look up.
/// @param is_spam filled in by getMessageCount().
/// @return true when getMessageCount() reports a positive count.
bool FrequencyDB::containsMessage(const Message &msg,
                                  bool &is_spam)
{
    const int recorded = getMessageCount(msg, is_spam);
    return recorded > 0;
}
/// Applies a delta to exactly one side of a word record via the
/// three-argument addWord().
///
/// @param word    key to update.
/// @param delta   amount to add (may be negative).
/// @param is_spam true to adjust the spam count, false to adjust the good
///                count; the other side receives a delta of 0.
void FrequencyDB::adjustWordCounts(const string &word,
                                   int delta,
                                   bool is_spam)
{
    assert(m_db.get());

    const int good_delta = is_spam ? 0 : delta;
    const int spam_delta = is_spam ? delta : 0;
    addWord(word, good_delta, spam_delta);
}
void FrequencyDB::touchMessage(const Message &msg)
{
assert(m_db.get());
assert(msg.getDigest().length() > 0);
for (int i = 0; i < msg.getTokenCount(); ++i) {
const Token *tok = msg.getToken(i);
touchWord(tok->getWord());
}
if (is_debug) {
string msg_id;
cerr << "Updated terms from message " << msg.getID(msg_id)
<< "/" << msg.getDigest()
<< " in database"
<< endl;
}
}
void FrequencyDB::addMessage(const Message &msg,
bool new_is_spam,
bool force_update)
{
assert(m_db.get());
assert(msg.getDigest().length() > 0);
bool already_is_spam = false;
bool already_exists = containsMessage(msg, already_is_spam);
assert(!already_exists || (!new_is_spam == !already_is_spam));
if (already_exists) {
assert(!already_is_spam == !new_is_spam);
if (!force_update) {
// message already counted, do nothing
return;
}
// force_update causes us to pretend it doesn't already exist so
// we wind up adding its terms again even though they are already counted
}
for (int i = 0; i < msg.getTokenCount(); ++i) {
const Token *tok = msg.getToken(i);
adjustWordCounts(tok->getWord(), tok->getCount(), new_is_spam);
}
adjustWordCounts(DIGEST_PREFIX + msg.getDigest(), 1, new_is_spam);
if (!already_exists) {
adjustWordCounts(COUNT_WORD, 1, new_is_spam);
}
if (is_debug) {
string msg_id;
cerr << "Updated message " << msg.getID(msg_id)
<< "/" << msg.getDigest()
<< " in database as "
<< (new_is_spam ? "spam." : "good.")
<< endl;
}
}
void FrequencyDB::removeMessage(const Message &msg)
{
assert(m_db.get());
assert(msg.getDigest().length() > 0);
bool is_spam = false;
int message_count = getMessageCount(msg, is_spam);
if (message_count == 0) {
// not in database
return;
}
for (int i = 0; i < msg.getTokenCount(); ++i) {
const Token *tok = msg.getToken(i);
adjustWordCounts(tok->getWord(), -message_count * tok->getCount(), is_spam);
}
adjustWordCounts(DIGEST_PREFIX + msg.getDigest(), -message_count, is_spam);
adjustWordCounts(COUNT_WORD, -1, is_spam);
assert(getMessageCount(msg, is_spam) == 0);
if (is_debug) {
string msg_id;
cerr << "Removed message " << msg.getID(msg_id)
<< "/" << msg.getDigest()
<< " from database." << endl;
}
}
/// Forwards firstWord() to the active implementation while the database
/// is marked busy.
///
/// @param word   filled in by the implementation.
/// @param counts filled in by the implementation.
/// @return whatever the implementation's firstWord() reports.
bool FrequencyDB::firstWord(string &word,
                            WordData &counts)
{
    assert(m_db.get());

    InterruptTest guard(this);
    const bool has_word = m_db->firstWord(word, counts);
    return has_word;
}
/// Forwards nextWord() to the active implementation while the database
/// is marked busy.
///
/// @param word   filled in by the implementation.
/// @param counts filled in by the implementation.
/// @return whatever the implementation's nextWord() reports.
bool FrequencyDB::nextWord(string &word,
                           WordData &counts)
{
    assert(m_db.get());

    InterruptTest guard(this);
    const bool has_word = m_db->nextWord(word, counts);
    return has_word;
}
/// @return the type name reported by the active implementation, or
///         "unknown" when no implementation object is present.
string FrequencyDB::getDatabaseType()
{
    InterruptTest guard(this);

    if (m_db.get()) {
        return m_db->getDatabaseType();
    }
    return "unknown";
}
/// Forwards sweepOutOldTerms() to the active implementation while the
/// database is marked busy.
///
/// @param cleanman passed through unchanged to the implementation.
void FrequencyDB::sweepOutOldTerms(const CleanupManager &cleanman)
{
    assert(m_db.get());

    InterruptTest guard(this);
    m_db->sweepOutOldTerms(cleanman);
}
// syntax highlighted by Code2HTML, v. 0.9.1