/*
File name: lin.c
Created by: Ljubomir Buturovic
Created: 08/04/2004
Purpose: functions common to all linear classifers.
*/
/*
Copyright 2004 Ljubomir J. Buturovic
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation files
(the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of the Software,
and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <float.h>
#include "lau.h"
#include "lmat.h"
#include "lin.h"
#include "lau.h"
#include "lerr.h"
static char rcsid[] = "$Id: lin.c,v 1.8 2005/02/23 05:48:00 ljubomir Exp $";
/*
Heuristic method to detect the type of a classifier stored in
'fname'. The file types are MODEL_SINGLE and MODEL_MULTI.
A MODEL_SINGLE file contains a single classifier and looks like
this:
0.0836344 0.253315 -0.219745 -0.0778078 0.00201759
0.236757 -0.330904 0.121814 -0.452632 0.0100153
-0.173435 0.166277 0.0747342 0.484082 -0.00424457
A MODEL_MULTI file contains a committee of classifiers and looks
like this:
10 1 1 -23.683 24.7274 -40.1631
10 1 1 13.7798 18.8408 -22.4262
10 1 1 28.6913 22.7929 -51.1134
10 2 1 -28.2378 33.2956 -50.2484
10 2 1 17.2896 27.7555 -30.9856
10 2 1 36.7149 32.92 -67.6445
10 3 1 -20.7337 28.2182 -41.2056
10 3 1 14.6608 24.2857 -26.3482
10 3 1 30.7893 30.0967 -59.885
10 4 1 -26.5747 32.7901 -50.5292
10 4 1 14.4997 26.1508 -28.0794
10 4 1 32.2912 30.2114 -61.0424
...
The first column is total number of models in the file. The second
column is model index, the third is model weight, followed by the
model coefficients.
The algorithm for heuristic detection of the model type goes through
the file and reads the first two columns. If neither has a dot
character (meaning that they are most likely integers), and the
first column is constant, and the second column has a range of 1
through the value in the first column, the file is considered to be
MODEL_MULTI. Otherwise it is MODEL_SINGLE.
This is not 100% foolproof, but _very_ unlikely to fail.
The function returns the model type and sets the number of models in
'nmd' (obviously, if the type is MODEL_SINGLE, 'nmd' is set to 1).
In case of file or malloc() error, return -1 and set errno.
*/
int detect_model(char *fname, int *nmd)
{
int type;
int ntk;
int llen;
int status;
int first_line;
int nfirst;
int nlast;
int done;
int nmodels;
char *line;
char **tokens;
FILE *fptr;
type = -1;
nlast = 0;
nmodels = 0;
status = file_info(fname, &llen, &ntk, '\0');
if ((status != -1) && (ntk >= 2))
{
status = 0;
/*
MULTI_MODEL requires at least four columns.
*/
if (ntk < 4)
type = MODEL_SINGLE;
else
{
fptr = fopen(fname, "r");
if (fptr)
{
line = malloc(llen+2);
if (line)
{
done = 0;
first_line = 1;
while (fgets(line, llen+2, fptr) && !status && !done)
{
tokens = str_tokenize(line, WHITESPACE);
if (tokens)
{
/*
If either of the first two columns has a
. character, it is a single-classifier
model. We are done.
*/
if (strchr(tokens[0], '.') || strchr(tokens[1], '.'))
{
type = MODEL_SINGLE;
done = 1;
}
else if (first_line == 1)
{
first_line = 0;
nmodels = atoi(tokens[0]);
nfirst = atoi(tokens[1]);
if (nfirst != 1)
type = MODEL_SINGLE;
}
else if (atoi(tokens[0]) != nmodels)
{
type = MODEL_SINGLE;
done = 1;
}
else
nlast = atoi(tokens[1]);
str_free(tokens);
}
else
status = -1;
}
if (!status && (type == -1))
{
if (nlast != nmodels)
type = MODEL_SINGLE;
else
type = MODEL_MULTI;
}
}
else
{
status = -1;
type = -1;
}
if (!status)
{
status = fclose(fptr);
if (status == -1)
type = 1;
}
}
}
}
if (nmd)
{
if (type == MODEL_MULTI)
*nmd = nmodels;
else
*nmd = 1;
}
return type;
}
/*
Return predicted classification of 'vector' by linear classifer
stored in 'model'. The model is a 'c' (number of classes) by 'ad'
matrix. It is assumed that the length of 'vector' is ad-1, and that
the last column of 'model' contains the bias term.
Return -1 in case of bad arguments. Otherwise, the result is in the
[0..c-1] interval.
*/
int lin_predict(float **model, int c, int ad, float *vector)
{
int i;
float dot;
float maxdot;
int predicted_class = -1;
maxdot = -FLT_MAX;
if (model && (c > 0) && (ad > 0) && vector)
for (i = 0; i < c; i++)
{
dot = fvec_dot(model[i], vector, ad-1, (int *) 0);
dot += model[i][ad-1];
if (dot > maxdot)
{
predicted_class = i;
maxdot = dot;
}
}
return predicted_class;
}
/*
Calculate 'model' output for 'vector'. The model is a 'c' (number of
classes) by 'ad' matrix. It is assumed that vector is ad-1 long, and
that the last column of 'model' contains the bias term. The output
is a 'c' long vector. The function allocates space for the output
vector, and it is the caller's responsibility to free() it.
Return NULL in case of memory error or bad arguments. In case of
memory error, set errno.
*/
float *lin_output(float **model, int c, int ad, float *vector)
{
int i;
float *output;
output = (float *) 0;
if (model && (c > 0) && (ad > 0) && vector)
{
output = malloc(c*sizeof(float));
if (output)
for (i = 0; i < c; i++)
{
output[i] = fvec_dot(model[i], vector, ad-1, (int *) 0);
output[i] += model[i][ad-1];
}
}
return output;
}
/*
Load linear classifiers from 'fname'.
The function automatically recognizes two types of model files:
single model (MODEL_SINGLE) or multiple model (MODEL_MULTI). Single
model file contains one linear classifier, while multiple model
contains multiple classifiers.
Single model file has following format: c rows by d+1 columns (d
features plus one bias term), where c is the number of classes and d
the number of features. Multiple model file has number of models in
the first column, followed by the model ID and model weight. The
rest of the information is the same as in single model file.
Note that in both cases, 'rows' and 'columns' contains information
for one model.
The function sets the model file type in 'type'. It returns an array
'models', where models[i] is the i-th model. Each model is a 'rows'
by 'columns' floating point matrix. For multiple model file, the
number of models is returnes in 'nmodels', and model weights are
returned in 'weights'.
In case of failure, return NULL and set 'errc'. Possible errors are
memory allocation and file access errors.
*/
float ***lin_load_models(char *fname, int *type, int *nmodels, float **weights,
int *rows, int *columns, int *errc)
{
int status;
int i;
int ntk;
int lc;
int len;
int llen;
int ift;
int nmd;
int lcntr;
int first_line;
int c;
int ad;
int model_cnt;
char *line;
char *str;
char ftr[20];
float *wts;
float ***models = (float ***) 0;
float **model;
FILE *fptr;
status = 0;
c = 0;
wts = (float *) 0;
model = (float **) 0;
lc = file_info(fname, &llen, &ntk, '\0');
if ((lc != -1) && (ntk > 0))
{
*type = detect_model(fname, &model_cnt);
if (*type != -1)
{
fptr = fopen(fname, "r");
if (fptr)
{
line = malloc(llen+2);
if (line)
{
nmd = 0;
lcntr = 0;
first_line = 1;
while (fgets(line, llen+2, fptr) && !status)
{
str = line;
str += strspn(str, WHITESPACE);
ift = 0;
while (!status && (str && *str) && (ift < ntk))
{
len = strcspn(str, WHITESPACE);
memcpy(ftr, str, len);
ftr[len] = '\0';
if ((first_line == 1) && (ift == 0))
{
first_line = 0;
if (*type == MODEL_MULTI)
ad = ntk-3;
else
ad = ntk;
c = lc/model_cnt;
if (rows)
*rows = c;
if (columns)
*columns = ad;
models = malloc(model_cnt*sizeof(float **));
if (models)
{
for (i = 0; (i < model_cnt) && !status; i++)
{
models[i] = fmx_alloc(c, ad);
if (!models[i])
{
status = -1;
}
}
model = models[0];
wts = malloc(model_cnt*sizeof(float));
if (model_cnt == 1)
wts[0] = 1.0;
}
else
status = -1;
}
if (!status)
{
if ((*type == MODEL_MULTI) && (ift == 0))
{
/*
Skip the model ID.
*/
str += len;
str += strspn(str, WHITESPACE);
str += strcspn(str, WHITESPACE);
/*
Model weight.
*/
str += strspn(str, WHITESPACE);
len = strcspn(str, WHITESPACE);
memcpy(ftr, str, len);
ftr[len] = '\0';
wts[nmd] = atof(ftr);
str += len;
str += strspn(str, WHITESPACE);
len = strcspn(str, WHITESPACE);
memcpy(ftr, str, len);
ftr[len] = '\0';
}
model[lcntr][ift++] = atof(ftr);
str += len;
str += strspn(str, WHITESPACE);
}
}
if (!status && (lcntr == c-1) && (nmd < model_cnt-1))
{
/*
Next model.
*/
nmd++;
model = models[nmd];
lcntr = 0;
}
else
lcntr++;
}
}
else
{
status = -1;
*errc = errno;
}
if (!status)
{
status = fclose(fptr);
if (!status)
*weights = wts;
else
{
for (i = 0; i < model_cnt; i++)
mx_free((void **) models[i], c);
models = vx_free(models);
*errc = errno;
}
}
}
else
*errc = errno;
}
}
else
*errc = errno;
if (nmodels)
*nmodels = model_cnt;
return models;
}
/*
Write linear classifier stored in 'model' in 'fptr'. The first
column is number of models 'nmodels', followed by the model index
'mdx' and model weight 'weight'.
*/
void lin_write(FILE *fptr, int nmodels, float **model, int rows, int columns,
int mdx, float weight)
{
int i;
int j;
for (i = 0; i < rows; i++)
{
fprintf(fptr, "%d\t%d\t%g\t", nmodels, mdx, weight);
for (j = 0; j < columns; j++)
{
if (j == columns-1)
fprintf(fptr, "%g\n", model[i][j]);
else
fprintf(fptr, "%g\t", model[i][j]);
}
}
}
syntax highlighted by Code2HTML, v. 0.9.1