/*
  File name: lin.c
  Created by: Ljubomir Buturovic
  Created: 08/04/2004
  Purpose: functions common to all linear classifers.
*/

/*
  Copyright 2004 Ljubomir J. Buturovic

  Permission is hereby granted, free of charge, to any person
  obtaining a copy of this software and associated documentation files
  (the "Software"), to deal in the Software without restriction,
  including without limitation the rights to use, copy, modify, merge,
  publish, distribute, sublicense, and/or sell copies of the Software,
  and to permit persons to whom the Software is furnished to do so,
  subject to the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  SOFTWARE.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <float.h>
#include "lau.h"
#include "lmat.h"
#include "lin.h"
#include "lau.h"
#include "lerr.h"

static char rcsid[] = "$Id: lin.c,v 1.8 2005/02/23 05:48:00 ljubomir Exp $";

/*
  Heuristic method to detect the type of a classifier stored in
  'fname'. The file types are MODEL_SINGLE and MODEL_MULTI.

  A MODEL_SINGLE file contains a single classifier and looks like
  this:

  0.0836344       0.253315        -0.219745       -0.0778078      0.00201759
  0.236757        -0.330904       0.121814        -0.452632       0.0100153
  -0.173435       0.166277        0.0747342       0.484082        -0.00424457

  A MODEL_MULTI file contains a committee of classifiers and looks
  like this:

  10      1       1       -23.683 24.7274 -40.1631
  10      1       1       13.7798 18.8408 -22.4262
  10      1       1       28.6913 22.7929 -51.1134
  10      2       1       -28.2378        33.2956 -50.2484
  10      2       1       17.2896 27.7555 -30.9856
  10      2       1       36.7149 32.92   -67.6445
  10      3       1       -20.7337        28.2182 -41.2056
  10      3       1       14.6608 24.2857 -26.3482
  10      3       1       30.7893 30.0967 -59.885
  10      4       1       -26.5747        32.7901 -50.5292
  10      4       1       14.4997 26.1508 -28.0794
  10      4       1       32.2912 30.2114 -61.0424
  ...
  
  The first column is total number of models in the file. The second
  column is model index, the third is model weight, followed by the
  model coefficients.

  The algorithm for heuristic detection of the model type goes through
  the file and reads the first two columns. If neither has a dot
  character (meaning that they are most likely integers), and the
  first column is constant, and the second column has a range of 1
  through the value in the first column, the file is considered to be
  MODEL_MULTI. Otherwise it is MODEL_SINGLE.

  This is not 100% foolproof, but _very_ unlikely to fail.

  The function returns the model type and sets the number of models in
  'nmd' (obviously, if the type is MODEL_SINGLE, 'nmd' is set to 1).
  In case of file or malloc() error, return -1 and set errno.
*/
int detect_model(char *fname, int *nmd)
{
  int  type;
  int  ntk;
  int  llen;
  int  status;
  int  first_line;
  int  nfirst;
  int  nlast;
  int  done;
  int  nmodels;
  char *line;
  char **tokens;
  FILE *fptr;

  type = -1;
  nlast = 0;
  nmodels = 0;
  status = file_info(fname, &llen, &ntk, '\0');
  if ((status != -1) && (ntk >= 2))
    {
      status = 0;
      /*
	MULTI_MODEL requires at least four columns.
      */
      if (ntk < 4)
	type = MODEL_SINGLE;
      else
	{
	  fptr = fopen(fname, "r");
	  if (fptr)
	    {
	      line = malloc(llen+2);
	      if (line)
		{
		  done = 0;
		  first_line = 1;
		  while (fgets(line, llen+2, fptr) && !status && !done)
		    {
		      tokens = str_tokenize(line, WHITESPACE);
		      if (tokens)
			{
			  /*
			    If either of the first two columns has a
			    . character, it is a single-classifier
			    model. We are done.
			  */
			  if (strchr(tokens[0], '.') || strchr(tokens[1], '.'))
			    {
			      type = MODEL_SINGLE;
			      done = 1;
			    }
			  else if (first_line == 1)
			    {
			      first_line = 0;
			      nmodels = atoi(tokens[0]);
			      nfirst = atoi(tokens[1]);
			      if (nfirst != 1)
				type = MODEL_SINGLE;
			    }
			  else if (atoi(tokens[0]) != nmodels)
			    {
			      type = MODEL_SINGLE;
			      done = 1;
			    }
			  else
			    nlast = atoi(tokens[1]);
			  str_free(tokens);
			}
		      else
			status = -1;
		    }
		  if (!status && (type == -1))
		    {
		      if (nlast != nmodels)
			type = MODEL_SINGLE;
		      else
			type = MODEL_MULTI;
		    }
		}
	      else
		{
		  status = -1;
		  type = -1;
		}
	      if (!status)
		{
		  status = fclose(fptr);
		  if (status == -1)
		    type = 1;
		}
	    }
	}
    }
  if (nmd)
    {
      if (type == MODEL_MULTI)
	*nmd = nmodels;
      else
	*nmd = 1;
    }
  return type;
}

/*
  Return predicted classification of 'vector' by linear classifer
  stored in 'model'. The model is a 'c' (number of classes) by 'ad'
  matrix. It is assumed that the length of 'vector' is ad-1, and that
  the last column of 'model' contains the bias term.

  Return -1 in case of bad arguments. Otherwise, the result is in the
  [0..c-1] interval.
*/
int lin_predict(float **model, int c, int ad, float *vector)
{
  int   i;
  float dot;
  float maxdot;
  int   predicted_class = -1;

  maxdot = -FLT_MAX;
  if (model && (c > 0) && (ad > 0) && vector)
    for (i = 0; i < c; i++)
      {
	dot = fvec_dot(model[i], vector, ad-1, (int *) 0);
	dot += model[i][ad-1];
	if (dot > maxdot)
	  {
	    predicted_class = i;
	    maxdot = dot;
	  }
      }
  return predicted_class;
}

/*
  Calculate 'model' output for 'vector'. The model is a 'c' (number of
  classes) by 'ad' matrix. It is assumed that vector is ad-1 long, and
  that the last column of 'model' contains the bias term. The output
  is a 'c' long vector. The function allocates space for the output
  vector, and it is the caller's responsibility to free() it.

  Return NULL in case of memory error or bad arguments. In case of
  memory error, set errno.
*/
float *lin_output(float **model, int c, int ad, float *vector)
{
  int   i;
  float *output;

  output = (float *) 0;
  if (model && (c > 0) && (ad > 0) && vector)
    {
      output = malloc(c*sizeof(float));
      if (output)
	for (i = 0; i < c; i++)
	  {
	    output[i] = fvec_dot(model[i], vector, ad-1, (int *) 0);
	    output[i] += model[i][ad-1];
	  }
    }
  return output;
}

/*
  Load linear classifiers from 'fname'. 

  The function automatically recognizes two types of model files:
  single model (MODEL_SINGLE) or multiple model (MODEL_MULTI). Single
  model file contains one linear classifier, while multiple model
  contains multiple classifiers.
  
  Single model file has following format: c rows by d+1 columns (d
  features plus one bias term), where c is the number of classes and d
  the number of features. Multiple model file has number of models in
  the first column, followed by the model ID and model weight. The
  rest of the information is the same as in single model file.

  Note that in both cases, 'rows' and 'columns' contains information
  for one model.

  The function sets the model file type in 'type'. It returns an array
  'models', where models[i] is the i-th model. Each model is a 'rows'
  by 'columns' floating point matrix. For multiple model file, the
  number of models is returnes in 'nmodels', and model weights are
  returned in 'weights'.

  In case of failure, return NULL and set 'errc'. Possible errors are
  memory allocation and file access errors.
*/
float ***lin_load_models(char *fname, int *type, int *nmodels, float **weights, 
			 int *rows, int *columns, int *errc)
{
  int    status;
  int    i;
  int    ntk;
  int    lc;
  int    len;
  int    llen;
  int    ift;
  int    nmd;
  int    lcntr;
  int    first_line;
  int    c;
  int    ad;
  int    model_cnt;
  char   *line;
  char   *str;
  char   ftr[20];
  float  *wts;
  float  ***models = (float ***) 0;
  float  **model;
  FILE   *fptr;

  status = 0;
  c = 0;
  wts = (float *) 0;
  model = (float **) 0;
  lc = file_info(fname, &llen, &ntk, '\0');
  if ((lc != -1) && (ntk > 0))
    {
      *type = detect_model(fname, &model_cnt);
      if (*type != -1)
	{
	  fptr = fopen(fname, "r");
	  if (fptr)
	    {
	      line = malloc(llen+2);
	      if (line)
		{
		  nmd = 0;
		  lcntr = 0;
		  first_line = 1;
		  while (fgets(line, llen+2, fptr) && !status)
		    {
		      str = line;
		      str += strspn(str, WHITESPACE);
		      ift = 0;
		      while (!status && (str && *str) && (ift < ntk))
			{
			  len = strcspn(str, WHITESPACE);
			  memcpy(ftr, str, len);
			  ftr[len] = '\0';
			  if ((first_line == 1) && (ift == 0))
			    {
			      first_line = 0;
			      if (*type == MODEL_MULTI)
				ad = ntk-3;
			      else
				ad = ntk;
			      c = lc/model_cnt;
			      if (rows)
				*rows = c;
			      if (columns)
				*columns = ad;
			      models = malloc(model_cnt*sizeof(float **));
			      if (models)
				{
				  for (i = 0; (i < model_cnt) && !status; i++)
				    {
				      models[i] = fmx_alloc(c, ad);
				      if (!models[i])
					{
					  status = -1;
					}
				    }
				  model = models[0];
				  wts = malloc(model_cnt*sizeof(float));
				  if (model_cnt == 1)
				    wts[0] = 1.0;
				}
			      else
				status = -1;
			    }
			  if (!status)
			    {
			      if ((*type == MODEL_MULTI) && (ift == 0))
				{
				  /*
				    Skip the model ID.
				  */
				  str += len;
				  str += strspn(str, WHITESPACE);
				  str += strcspn(str, WHITESPACE);
				  /*
				    Model weight.
				  */
				  str += strspn(str, WHITESPACE);
				  len = strcspn(str, WHITESPACE);
				  memcpy(ftr, str, len);
				  ftr[len] = '\0';
				  wts[nmd] = atof(ftr);
				  str += len;
				  str += strspn(str, WHITESPACE);
				  len = strcspn(str, WHITESPACE);
				  memcpy(ftr, str, len);
				  ftr[len] = '\0';
				}
			      model[lcntr][ift++] = atof(ftr);
			      str += len;
			      str += strspn(str, WHITESPACE);
			    }
			}
		      if (!status && (lcntr == c-1) && (nmd < model_cnt-1))
			{
			  /*
			    Next model.
			  */
			  nmd++;
			  model = models[nmd];
			  lcntr = 0;
			}
		      else
			lcntr++;
		    }
		}
	      else
		{
		  status = -1;
		  *errc = errno;
		}
	      if (!status)
		{
		  status = fclose(fptr);
		  if (!status)
		    *weights = wts;
		  else
		    {
		      for (i = 0; i < model_cnt; i++)
			mx_free((void **) models[i], c);
		      models = vx_free(models);
		      *errc = errno;
		    }
		}
	    }
	  else
	    *errc = errno;
	}
    }
  else
    *errc = errno;
  if (nmodels)
    *nmodels = model_cnt;
  return models;
}

/*
  Write linear classifier stored in 'model' in 'fptr'. The first
  column is number of models 'nmodels', followed by the model index
  'mdx' and model weight 'weight'.
*/
void lin_write(FILE *fptr, int nmodels, float **model, int rows, int columns, 
	       int mdx, float weight)
{
  int  i;
  int  j;

  for (i = 0; i < rows; i++)
    {
      fprintf(fptr, "%d\t%d\t%g\t", nmodels, mdx, weight);
      for (j = 0; j < columns; j++)
	{
	  if (j == columns-1)
	    fprintf(fptr, "%g\n", model[i][j]);
	  else
	    fprintf(fptr, "%g\t", model[i][j]);
	}
    }
}





syntax highlighted by Code2HTML, v. 0.9.1