ports//net/tcpxtract/work/tcpxtract-1.0.1/search.c

/* $Id$ */
/* Copyright (C) 2005 Nicholas Harbour
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/

/* This file is part of
   Tcpxtract, a sniffer that extracts files based on headers
   by Nick Harbour
*/

#include <assert.h>
#include <string.h>
#include <stdio.h>
#include "util.h"
#include "search.h"
#include "conf.h"

static srch_node_t *new_srch_node(srch_nodetype_t);
static srch_node_t *add_simple(srch_node_t *, uint8_t, int, int, char *, unsigned long, spectype_t);
static srch_node_t *add_wildcard(srch_node_t *, int, int, char *, unsigned long, spectype_t);
static void update_search(srch_node_t *, srchptr_list_t **, srch_results_t **, uint8_t, int);
static void add_result(srch_results_t **, fileid_t *, spectype_t, int);

static size_t currlen;

srch_node_t *srch_machine;

void compile_srch(srch_node_t **srch_tree, int id, char *ext, unsigned long maxlen, char *spec, spectype_t type)
{
    int i = 0, speclen;
    srch_node_t *node_ptr;
    
    assert(srch_tree != NULL);
    assert(spec != NULL);
 
    speclen = strlen(spec);
    
    if (*srch_tree == NULL)
        *srch_tree = new_srch_node(TABLE);

    if (speclen == 0)
        return;

    currlen = 0;
    
    node_ptr = *srch_tree;

    while (i < speclen) {
        if (spec[i] == '\\') {
            if (i + 1 >= speclen)
                error("Dangling \'\\\' in file type specifier");
            switch (spec[++i]) {
            case '\\':
                node_ptr = add_simple(node_ptr, '\\', speclen - i, id, ext, maxlen, type);
                break;
            case 'x':
                if (i + 2 >= speclen)
                    error("Invalid hexadecimal code in file type specifier");
                else {
                    char c;
                    int ch;
                    char code[3] = {'\0'};
                    code[0] = spec[++i];
                    code[1] = spec[++i];
                    sscanf(code, "%02x", &ch);
                    c = (char) ch;
                    node_ptr = add_simple(node_ptr, c, speclen - i, id, ext, maxlen, type);
                }
                break;
            case 'n':
                node_ptr = add_simple(node_ptr, '\n', speclen - i, id, ext, maxlen, type);
                break;
            case 't':
                node_ptr = add_simple(node_ptr, '\t', speclen - i, id, ext, maxlen, type);
                break;
            case 'r':
                node_ptr = add_simple(node_ptr, '\r', speclen - i, id, ext, maxlen, type);
                break;
            case '0':
                node_ptr = add_simple(node_ptr, '\0', speclen - i, id, ext, maxlen, type);
                break;
            case '?':
                node_ptr = add_wildcard(node_ptr, speclen - i, id, ext, maxlen, type);
                break;
            default:
                error("Invalid escape character in file format specifier");
                break;
            }
        } else
            node_ptr = add_simple(node_ptr, spec[i], speclen - i, id, ext, maxlen, type);

        i++;
    }

    /* this assumes node_ptr is pointing to a COMPLETE node */
    node_ptr->data.fileid.len = currlen;
}

static srch_node_t *new_srch_node(srch_nodetype_t nodetype)
{
    srch_node_t *retval = ecalloc(sizeof (srch_node_t), 1);

    retval->nodetype = nodetype;
    return retval;
}

static srch_node_t *add_simple(srch_node_t *node, uint8_t c, int remaining, int id, char *ext, unsigned long maxlen, spectype_t type)
{
    srch_node_t *newnode;
    srch_node_t *retval;
    
    assert(node != NULL);

    currlen++;
    
    if (remaining == 1) {       /* if remaining is 1 then we need to point to a COMPLETE node */
        newnode = new_srch_node(COMPLETE);
        newnode->spectype = type;
        newnode->data.fileid.id = id;
        newnode->data.fileid.ext = ext;
        newnode->data.fileid.maxlen = maxlen;
        node->data.table[c] = newnode;
        retval = newnode;
    } else if (node->data.table[c] == NULL) {
        newnode = new_srch_node(TABLE);
        node->data.table[c] = newnode;
        retval = newnode;
    } else
        retval = node->data.table[c];

    return retval;
}

static srch_node_t *add_wildcard(srch_node_t *node, int remaining, int id, char *ext, unsigned long maxlen, spectype_t type)
{
    srch_node_t *newnode;
    int i;
    
    assert(node != NULL);

    currlen++;
    
    if (remaining == 1) {       /* if remaining is 1 then we need to point to a COMPLETE node */
        newnode = new_srch_node(COMPLETE);
        newnode->spectype = type;
        newnode->data.fileid.id = id;
        newnode->data.fileid.ext = ext;
        newnode->data.fileid.maxlen = maxlen;
        for (i = 0; i < 256; i++)
            if (node->data.table[i] == NULL)    /* a specific char trumps a wildcard */
                node->data.table[i] = newnode;  /* shhh, that indicates a slight "feature" */
        return newnode;
    } else {
        newnode = new_srch_node(TABLE);
        for (i = 0; i < 256; i++)
            if (node->data.table[i] == NULL)
                node->data.table[i] = newnode;
        return newnode;
    }
}

/* the overall search interface.  You call this bad boy and give it a
 * pointer to your data buffer (i.e. a packet) */
srch_results_t *search(srch_node_t *tree, srchptr_list_t **srchptr_list, uint8_t *buf, size_t len)
{
    srch_results_t *retval = NULL;
    int i;
    
    assert(tree != NULL);
    assert(srchptr_list != NULL);
    assert(buf != NULL);

    for (i = 0; i < len; i++)
        update_search(tree, srchptr_list, &retval, buf[i], i); 
        
    return retval;
}

static void add_srchptr(srchptr_list_t **srchptr_list, srch_node_t *node)
{
    srchptr_list_t *ptr, *ptr2;
    
    assert(srchptr_list != NULL);
    assert(node != NULL);

    ptr = ecalloc(1, sizeof (srchptr_list_t));
    ptr->next = *srchptr_list;
    if (ptr->next != NULL)
        ptr->next->prev = ptr;
    ptr->node = node;
    *srchptr_list = ptr;
    for (ptr2 = ptr->next; ptr2 != NULL && ptr2 != ptr; ptr2 = ptr2->next)
        ;
}

static void remv_srchptr(srchptr_list_t **srchptr_list, srchptr_list_t *sptr)
{
    assert(srchptr_list != NULL);
    assert(sptr != NULL);

    if (sptr->prev != NULL)
        sptr->prev->next = sptr->next;
    if (sptr->next != NULL)
        sptr->next->prev = sptr->prev;
    if (*srchptr_list == sptr)
        *srchptr_list = sptr->next;
    free(sptr);
}

/* I sincerely apologize for this function.  This is called once for every byte of
 * data so I don't want to waste cycles with layers and layers of function calls.
 * The end result is a long, complex and unmaintainable function that is quick */
/* The inner demon of the search mechanism.  this updates all state machine pointers
 * with the current character and fixes the search results list appropriately */
/* FIXME: perhaps make this inline for speed */
static void update_search(srch_node_t *tree, srchptr_list_t **srchptr_list, srch_results_t **results, uint8_t c, int offset)
{
    if (*srchptr_list != NULL) {        /* start by updating existing threads */
        srchptr_list_t *ptr;
        srchptr_list_t *nxt;
        for (ptr = *srchptr_list; ptr != NULL; ptr = nxt) {
            nxt = ptr->next;
            if (ptr->node->data.table[c] != NULL) {
                srch_node_t *node = ptr->node->data.table[c];
                switch (node->nodetype) {
                case TABLE:
                    ptr->node = node;
                    break;
                case COMPLETE:
                    add_result(results, &node->data.fileid, node->spectype, offset);
                    remv_srchptr(srchptr_list, ptr);
                    break;
                default:
                    error("Barf! Unknown node type");
                    break;
                }
            } else {
                remv_srchptr(srchptr_list, ptr);
            }
        }
    }

    /* now see if we want to start a new thread (i.e. a new potential match) */
    if (tree->data.table[c] != NULL) {
        srch_node_t *node = tree->data.table[c];

        switch (node->nodetype) {
        case TABLE:            /* this should be 99.99% of them */
            add_srchptr(srchptr_list, node);
            break;
        case COMPLETE:       /* In the unlikely event of a one byte header */
            /* note to all ideots: if you carve for a one byte header,
             * you deserve the enormous flood of files that will spew forth.
             */
            add_result(results, &node->data.fileid, node->spectype, offset);
            break;
        default:
            error("Barf! Unknown node type");
            break;     
        }
    }
}

static void update_search2(srch_node_t *tree, srchptr_list_t **srchptr_list, srch_results_t **results, uint8_t c, int offset)
{
    if (*srchptr_list != NULL) {        /* start by updating existing threads */
        srchptr_list_t *ptr;
        srchptr_list_t *nxt = NULL, *prv = NULL;
                
        for (ptr = *srchptr_list; ptr != NULL; prv = ptr, nxt = ptr->next, ptr = nxt) {
            if (ptr->node->data.table[c] != NULL) {
                srch_node_t *node = ptr->node->data.table[c];
                switch (node->nodetype) {
                case TABLE:
                    ptr->node = node;
                    break;
                case COMPLETE:
                    add_result(results, &node->data.fileid, node->spectype, offset);
                    
                    /* remove thread from list */
                    if (prv != NULL)
                        prv->next = nxt;
                    else
                        *srchptr_list = nxt;
                    if (nxt != NULL)
                        nxt->prev = prv;
                    free(ptr);
                    break;
                default:
                    error("Barf! Unknown node type");
                    break;
                }
            } else { /*remove thread from list */
                if (prv != NULL)
                    prv->next = nxt;
                else
                    *srchptr_list = nxt;
                if (nxt != NULL)
                    nxt->prev = prv;
                free(ptr);
            }
        }
    }

    /* now see if we want to start a new thread (i.e. a new potential match) */
    if (tree->data.table[c] != NULL) {
        srch_node_t *node = tree->data.table[c];
        srchptr_list_t *ptr;

        switch (node->nodetype) {
        case TABLE:            /* this should be 99.99% of them */
            if (*srchptr_list == NULL) {
                *srchptr_list = ecalloc(1, sizeof **srchptr_list);
                (*srchptr_list)->next = NULL;
                (*srchptr_list)->prev = NULL;
                ptr = *srchptr_list;
            } else {
                for (ptr = *srchptr_list; ptr->next != NULL; ptr = ptr->next)
                    ;
                ptr->next = emalloc(sizeof *ptr->next);
                ptr->next->prev = ptr;
                ptr = ptr->next;
                ptr->next = NULL;
            }
            ptr->node = node;
            break;
        case COMPLETE:       /* In the unlikely event of a one byte header */
            /* note to all ideots: if you carve for a one byte header,
             * you deserve the enormous flood of files that will spew forth.
             */
            add_result(results, &node->data.fileid, node->spectype, offset);
            break;
        default:
            error("Barf! Unknown node type");
            break;     
        }
    }
}

/* Add a result to a results list, allocating as needed */
static void add_result(srch_results_t **results, fileid_t *fileid, spectype_t spectype, int offset)
{
    srch_results_t **ptr, *prev = NULL;

    assert(results != NULL);

    /* find the last element in the list, for setting prev */
    for (ptr = results; *ptr != NULL && (*ptr)->next != NULL; ptr = &(*ptr)->next)
        ;
    if (*ptr != NULL) {
        prev = *ptr;
        ptr = &(*ptr)->next;
    }
        
    *ptr = emalloc(sizeof **ptr);
    (*ptr)->next = NULL;
    (*ptr)->prev = NULL;
    (*ptr)->fileid = fileid;
    (*ptr)->spectype = spectype;
    (*ptr)->offset.start = offset - (fileid->len - 1);
    (*ptr)->offset.end = offset;
}

void free_results_list(srch_results_t **results)
{
    srch_results_t *rptr, *nxt;

    assert(results != NULL);

    for (rptr = *results; rptr != NULL; rptr = nxt) {
        nxt = rptr->next;
        free(rptr);
    }
    *results = NULL;
}
syntax highlighted by Code2HTML, v. 0.9.1