/*
  File name: load_dset.c
  Created by: Ljubomir Buturovic
  Created: 09/06/2002
  Purpose: data loading functions.
*/

/*
  Copyright 2004 Ljubomir J. Buturovic

  Permission is hereby granted, free of charge, to any person
  obtaining a copy of this software and associated documentation files
  (the "Software"), to deal in the Software without restriction,
  including without limitation the rights to use, copy, modify, merge,
  publish, distribute, sublicense, and/or sell copies of the Software,
  and to permit persons to whom the Software is furnished to do so,
  subject to the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  SOFTWARE.
*/

static char rcsid[] = "$Id: load_dset.c,v 1.68 2006/04/21 14:30:19 ljubomir Exp $";

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "lerr.h"
#include "lau.h"
#include "dataset.h"
#include "pau.h"
#include "pcp.h"

/*
  Trailing "# ..." tags of the status file. save_sts() writes every
  value followed by one of these tags; load_sts() recognizes the header
  lines of each section by searching the line for the tag with strstr().

  Note that PSTS_TESTD and PSTS_TD expand to the same text; load_sts()
  tells them apart only by its parser state. The *ND and *NAME tags are
  written by save_sts() but never searched for by load_sts(), which
  counts those lines instead.
*/
#define PSTS_TESTNV               "# testing data set cardinality"
#define PSTS_TESTD                "# number of features"
#define PSTS_TESTC                "# number of data files"
#define PSTS_TESTFMT              "# test set file format"
#define PSTS_TESTND               "# file cardinality"
#define PSTS_TESTNAME             "# file name"
#define PSTS_TNV                  "# training data set cardinality"
#define PSTS_TD                   "# number of features"
#define PSTS_TC                   "# number of classes"
#define PSTS_TFMT                 "# training set file format"
#define PSTS_TND                  "# class cardinality"
#define PSTS_TNAME                "# training set class name"

/*
  Parser states of load_sts(), in the order the sections appear in the
  status file written by save_sts().
*/
#define PSTS_STATE_TESTDS         0 /* test data set until test file cardinalities */
#define PSTS_STATE_TESTVECTORS    1 /* test data set file cardinalities */
#define PSTS_STATE_TESTCLASS      2 /* test data set file names */
#define PSTS_STATE_TDS            3 /* training data set until training class cardinalities */
#define PSTS_STATE_TVECTORS       4 /* training data set class cardinalities */
#define PSTS_STATE_TCLASS         5 /* training data set class names */
#define PSTS_STATE_DONE           6

/*
  Intermediate data loading function.

  Asks the user which data set to load (0: training set 'tds', 1: test
  set 'teds'), reads it with pcp_input() and, if a data set was
  obtained, records the new state in the status file PCP_STS.

  errc   error code reported to the caller; set by pcp_input(), to
         ENOMEM if the prompt buffer cannot be allocated, or to errno
         if the status file cannot be written.
  xname  name of the offending file for file-related errors; set to a
         newly allocated copy of PCP_STS if save_sts() fails.

  If the status file cannot be written, both data sets are released.
*/
void p_load(int *errc, char **xname)
{
  int  type;
  int  status;
  int  min_range = 0;
  int  max_range = 1;
  char *msg;
  struct dataset *dset = (struct dataset *) 0;

  /* Allocate the prompt before touching the screen, so that an
     allocation failure leaves the terminal state untouched. */
  msg = malloc((PCP_QLEN+1)*sizeof(char));
  if (msg == (char *) 0)
    {
      *errc = ENOMEM;
      return;
    }
  clear_screen();
  cursor_on();
  /* Bounded formatting: the prompt can never overrun the buffer. */
  snprintf(msg, PCP_QLEN+1, PCP_UMSG_LOAD, min_range, max_range, min_range);
  type = input_integer(stdin, stdout, msg, PCP_QLEN, &min_range, &min_range,
                       &max_range);
  free(msg);

  /*
    NOTE(review): the previous contents of 'tds'/'teds' are overwritten
    without being released here - confirm whether pcp_input() takes care
    of the old data set.
  */
  if (type == 0)
    {
      tds = pcp_input(stdin, stdout, errc, xname);
      dset = tds;
    }
  else if (type == 1)
    {
      teds = pcp_input(stdin, stdout, errc, xname);
      dset = teds;
    }
  cursor_off();

  /* Only update the status file when a data set was actually loaded. */
  if (dset)
    {
      status = save_sts(PCP_STS, teds, tds);
      if (status == -1)
        {
          /*
            Reset data sets here due to save_sts() error. errno is
            captured first, before the cleanup calls can change it.
          */
          *errc = errno;
          teds = dataset_free(teds);
          tds = dataset_free(tds);
          *xname = strdup(PCP_STS);
        }
    }
}

/*
  Save status of 'test_dataset', 'training_dataset' in 'fname'.

  For every data set that is present, the file receives the
  cardinality, number of features, number of files/classes and file
  format, followed by one line per file/class with its cardinality and
  one line per file/class with its name. A missing data set is recorded
  as a single cardinality line with value 0. Every value is followed by
  the matching PSTS_* tag.

  In case of success, return 0. In case of failure (the file cannot be
  opened, a write fails, or the file cannot be closed), return -1 and
  set errno.
*/
int save_sts(char *fname, struct dataset *test_dataset, struct dataset *training_dataset)
{
  int  i;
  int  saved_errno;
  FILE *fptr;

  fptr = fopen(fname, "w");
  if (fptr == (FILE *) 0)
    return -1;

  /* Start from a clean errno so a write failure can be reported below. */
  errno = 0;

  if (test_dataset)
    {
      fprintf(fptr, "%-35d     %s\n", test_dataset->nv, PSTS_TESTNV);
      fprintf(fptr, "%-35d     %s\n", test_dataset->d, PSTS_TESTD);
      fprintf(fptr, "%-35d     %s\n", test_dataset->c, PSTS_TESTC);
      fprintf(fptr, "%-35d     %s\n", test_dataset->format, PSTS_TESTFMT);
      for (i = 0; i < test_dataset->c; i++)
        fprintf(fptr, "%-35d     %s, file %d\n", test_dataset->nd[i], PSTS_TESTND, i+1);
      for (i = 0; i < test_dataset->c; i++)
        fprintf(fptr, "%-35s     %s, file %d\n", test_dataset->fnames[i], PSTS_TESTNAME, i+1);
    }
  else
    fprintf(fptr, "%-35d     %s\n", 0, PSTS_TESTNV);

  if (training_dataset)
    {
      fprintf(fptr, "%-35d     %s\n", training_dataset->nv, PSTS_TNV);
      fprintf(fptr, "%-35d     %s\n", training_dataset->d, PSTS_TD);
      fprintf(fptr, "%-35d     %s\n", training_dataset->c, PSTS_TC);
      fprintf(fptr, "%-35d     %s\n", training_dataset->format, PSTS_TFMT);
      for (i = 0; i < training_dataset->c; i++)
        fprintf(fptr, "%-35d     %s, class %d\n", training_dataset->nd[i], PSTS_TND, i+1);
      for (i = 0; i < training_dataset->c; i++)
        fprintf(fptr, "%-35s     %s, class %d\n", training_dataset->fnames[i], PSTS_TNAME, i+1);
    }
  else
    /* Same column layout as every other line of the file. */
    fprintf(fptr, "%-35d     %s\n", 0, PSTS_TNV);

  /*
    A failed fprintf() leaves the stream error indicator set. Report it
    rather than returning success for an incomplete status file, and
    keep errno intact across fclose().
  */
  if (ferror(fptr))
    {
      saved_errno = errno ? errno : EIO;
      fclose(fptr);
      errno = saved_errno;
      return -1;
    }

  /* fclose() flushes buffered output; normalize its EOF result to -1. */
  if (fclose(fptr) != 0)
    return -1;
  return 0;
}

/*
  Parse the first whitespace-delimited token of 'line' as an integer
  (atoi() semantics) and store it in 'value'.

  Return 1 if a token was found, 0 otherwise ('value' is left alone).

  NOTE(review): assumes str_tokenize() returns either a null pointer or
  an array whose entry after the last token is a null pointer - confirm.
*/
static int sts_leading_int(char *line, int *value)
{
  int  found = 0;
  char **tokens;

  tokens = str_tokenize(line, WHITESPACE);
  if (tokens)
    {
      if (tokens[0])
        {
          *value = atoi(tokens[0]);
          found = 1;
        }
      str_free(tokens);
    }
  return found;
}

/*
  Return a newly allocated copy of the first whitespace-delimited token
  of 'line', or a null pointer if there is no token or the copy cannot
  be allocated. The caller owns the returned string.
*/
static char *sts_leading_token(char *line)
{
  char *copy = (char *) 0;
  char **tokens;

  tokens = str_tokenize(line, WHITESPACE);
  if (tokens)
    {
      if (tokens[0])
        copy = strdup(tokens[0]);
      str_free(tokens);
    }
  return copy;
}

/*
  Read the status file 'fname' (as written by save_sts()).

  Test data set section:     'tenv' cardinality, 'ted' number of
  features, 'tec' number of data files, 'test_fmt' file format,
  'test_nd' per-file cardinalities, 'tenames' per-file names.

  Training data set section: 'tnv', 'td', 'tc', 'training_fmt',
  'training_nd', 'tnames' - same meaning, per class.

  All scalar outputs are set to 0 and all array outputs to null
  pointers before parsing, so the caller never sees indeterminate
  values. Arrays handed back through the output pointers are allocated
  here and owned by the caller, also when -1 is returned.

  In case of success, return 0. In case of failure, return -1 and set
  errno; a truncated or malformed status file is reported as EINVAL,
  allocation failure as ENOMEM.
*/
static int load_sts(char *fname, int *tenv, int *ted, int *tec, char ***tenames,
		    int **test_nd, int *tnv, int *td, int *tc, char ***tnames, 
		    int **training_nd, int *test_fmt, int *training_fmt)
{
  int  state = PSTS_STATE_TESTDS;
  int  err = 0;       /* 0, or the errno value to report */
  int  maxlen;
  int  fcntr = 0;     /* entries read so far in the current list */
  int  total;
  int  in_test;
  int  i;
  int  *nvc = (int *) 0;        /* cardinality list being filled */
  char *line;
  char **names = (char **) 0;   /* name list being filled */
  FILE *fptr;

  *tenv = *ted = *tec = *test_fmt = 0;
  *tnv = *td = *tc = *training_fmt = 0;
  *tenames = *tnames = (char **) 0;
  *test_nd = *training_nd = (int *) 0;

  if (file_info(fname, &maxlen, (int *) 0, '\0') == -1)
    return -1;
  fptr = fopen(fname, "r");
  if (fptr == (FILE *) 0)
    return -1;
  line = malloc(maxlen+2);
  if (line == (char *) 0)
    {
      fclose(fptr);
      errno = ENOMEM;
      return -1;
    }

  /* Stop at the first error or as soon as both sections are complete. */
  while (!err && (state != PSTS_STATE_DONE) && fgets(line, maxlen+2, fptr))
    {
      switch (state)
        {
          /* test data set info */

        case PSTS_STATE_TESTDS:
          if (strstr(line, PSTS_TESTNV))
            {
              if (!sts_leading_int(line, tenv))
                err = EINVAL;
              else if (*tenv == 0)
                /* no test data set recorded: go straight to training */
                state = PSTS_STATE_TDS;
            }
          else if (strstr(line, PSTS_TESTD))
            {
              if (!sts_leading_int(line, ted))
                err = EINVAL;
            }
          else if (strstr(line, PSTS_TESTC))
            {
              if (!sts_leading_int(line, tec))
                err = EINVAL;
            }
          else if (strstr(line, PSTS_TESTFMT))
            {
              if (!sts_leading_int(line, test_fmt))
                err = EINVAL;
              else if (*tec <= 0)
                {
                  /* no per-file lines can follow; only acceptable for
                     an empty data set */
                  if (*tenv == 0)
                    state = PSTS_STATE_TDS;
                  else
                    err = EINVAL;
                }
              else if ((nvc = calloc((size_t) *tec, sizeof *nvc)) == (int *) 0)
                err = ENOMEM;
              else
                {
                  state = PSTS_STATE_TESTVECTORS;
                  fcntr = 0;
                }
            }
          break;

          /* training data set info */

        case PSTS_STATE_TDS:
          if (strstr(line, PSTS_TNV))
            {
              if (!sts_leading_int(line, tnv))
                err = EINVAL;
            }
          else if (strstr(line, PSTS_TD))
            {
              if (!sts_leading_int(line, td))
                err = EINVAL;
            }
          else if (strstr(line, PSTS_TC))
            {
              if (!sts_leading_int(line, tc))
                err = EINVAL;
            }
          else if (strstr(line, PSTS_TFMT))
            {
              if (!sts_leading_int(line, training_fmt))
                err = EINVAL;
              else if (*tc <= 0)
                {
                  if (*tnv == 0)
                    state = PSTS_STATE_DONE;
                  else
                    err = EINVAL;
                }
              else if ((nvc = calloc((size_t) *tc, sizeof *nvc)) == (int *) 0)
                err = ENOMEM;
              else
                {
                  state = PSTS_STATE_TVECTORS;
                  fcntr = 0;
                }
            }
          break;

          /* per-file / per-class cardinalities, one per line */

        case PSTS_STATE_TESTVECTORS:
        case PSTS_STATE_TVECTORS:
          in_test = (state == PSTS_STATE_TESTVECTORS);
          total = in_test ? *tec : *tc;
          if (!sts_leading_int(line, &nvc[fcntr]))
            {
              err = EINVAL;
              break;
            }
          if (++fcntr == total)
            {
              /* zero-filled, so a partially read list can be released */
              names = calloc((size_t) total, sizeof *names);
              if (names == (char **) 0)
                {
                  err = ENOMEM;
                  break;
                }
              /* the cardinality list is complete: hand it to the caller */
              *(in_test ? test_nd : training_nd) = nvc;
              nvc = (int *) 0;
              fcntr = 0;
              state = in_test ? PSTS_STATE_TESTCLASS : PSTS_STATE_TCLASS;
            }
          break;

          /* per-file / per-class names, one per line */

        case PSTS_STATE_TESTCLASS:
        case PSTS_STATE_TCLASS:
          in_test = (state == PSTS_STATE_TESTCLASS);
          total = in_test ? *tec : *tc;
          errno = 0;
          names[fcntr] = sts_leading_token(line);
          if (names[fcntr] == (char *) 0)
            {
              err = (errno == ENOMEM) ? ENOMEM : EINVAL;
              break;
            }
          if (++fcntr == total)
            {
              *(in_test ? tenames : tnames) = names;
              names = (char **) 0;
              state = in_test ? PSTS_STATE_TDS : PSTS_STATE_DONE;
            }
          break;

        default:
          break;
        }
    }
  fclose(fptr);
  free(line);

  /*
    End of file is legitimate only after both sections, or while still
    in a header whose data set is recorded as empty (cardinality 0).
    Anything else means the file was cut short.
  */
  if (!err
      && (state != PSTS_STATE_DONE)
      && !((state == PSTS_STATE_TDS) && (*tnv == 0))
      && !((state == PSTS_STATE_TESTDS) && (*tenv == 0)))
    err = EINVAL;

  if (err)
    {
      /* Release the lists not yet handed over to the caller. */
      free(nvc);
      if (names)
        {
          for (i = 0; i < fcntr; i++)
            free(names[i]);
          free(names);
        }
      errno = err;
      return -1;
    }
  return 0;
}

/*
  Startup data load. Reads the status file, then loads data from input
  files into the global C data structures 'teds' and 'tds'. 
  
  In case of success, set 'errc' to 0. Otherwise, set errc to error
  code. If 'errc' is file-related error code, 'xname' is name of
  offending file (newly allocated).

  A missing status file (ENOENT) is not an error: nothing is loaded and
  'errc' stays 0. On any error other than an inconsistent input file,
  the status file is removed.
*/
void init_load(int *errc, char **xname)
{
  int  status;
  int  load_errno;
  /* Everything load_sts() is expected to fill in starts out as 0/null,
     so the cleanup at the end is well defined even if it fails early. */
  int  test_fmt = 0;
  int  training_fmt = 0;
  int  nf = 0;    /* test set: number of features */
  int  tv = 0;    /* test set: cardinality */
  int  nfs = 0;   /* test set: number of data files */
  int  tnf = 0;   /* training set: number of features */
  int  ttv = 0;   /* training set: cardinality */
  int  nc = 0;    /* training set: number of classes */
  int  *test_nd = (int *) 0;
  int  *training_nd = (int *) 0;
  char *sts_name;
  char *fnm = (char *) 0;            /* file name reported by load_dataset() */
  const char *culprit = (char *) 0;  /* file name to report in 'xname' */
  char **tenames = (char **) 0;
  char **tnames = (char **) 0;

  *errc = 0;
  sts_name = strdup(PCP_STS);
  if (sts_name == (char *) 0)
    {
      *errc = ENOMEM;
      return;
    }
  status = load_sts(sts_name, &tv, &nf, &nfs, &tenames, &test_nd, &ttv, &tnf, &nc,
                    &tnames, &training_nd, &test_fmt, &training_fmt);
  /* Keep the reason for a failure before anything else can change errno. */
  load_errno = errno;
  free(sts_name);

  if (!status)
    {
      if (tv > 0)
        teds = load_dataset(nf, nfs, test_nd, tenames, test_fmt, errc, &fnm);
      if ((*errc == 0) && (ttv > 0))
        tds = load_dataset(tnf, nc, training_nd, tnames, training_fmt, errc, &fnm);
      /*
        NOTE(review): presumably load_dataset() stores the offending
        file name in 'fnm' on failure; its ownership is not visible
        here - confirm whether it must be freed after being copied.
      */
      culprit = fnm;
    }
  /*
    We don't want to complain if status file is missing - that's OK.
   */
  else if (load_errno != ENOENT)
    {
      *errc = load_errno;
      culprit = PCP_STS;
    }

  if (*errc)
    {
      *xname = culprit ? strdup(culprit) : (char *) 0;
      if (*errc == LERR_FILE_FORMAT)
        /*
          We know that if load_dataset() reports LERR_FILE_FORMAT,
          either number of features, or number of vectors in the input
          file is inconsistent with the values found in
          PCP_STS. Therefore we can report a more specific message.

          In this particular case, we don't want to remove the PCP_STS
          file, so that user can diagnose the error.
        */
        *errc = PERR_INCONSISTENT_FILE;
      else
        /*
          Other type of file loading error - remove the status file.
        */
        unlink(PCP_STS);
    }
  strlen_free(tenames, nfs);
  strlen_free(tnames, nc);
  free(training_nd);
  free(test_nd);
}

/*
  Remove datasets (used in case of error).

  Releases the global test ('teds') and training ('tds') data sets,
  storing the value returned by dataset_free() back into each global,
  and deletes the status file PCP_STS. The result of unlink() is not
  checked.
*/
void remove_datasets(void)
{
  teds = dataset_free(teds);
  tds = dataset_free(tds);
  unlink(PCP_STS);
}

/*
  Copy data set. 

  Replaces one of the global data sets with a clone of the other and
  records the new state in the status file PCP_STS. If only the
  training set 'tds' is loaded it is copied to the test set 'teds'; if
  only 'teds' is loaded it is copied to 'tds'; if both are loaded the
  user is asked for the direction (0: training to test, 1: test to
  training). If neither is loaded the function does nothing and leaves
  'errc' and 'xname' untouched.

  errc   set to 0 on success, otherwise to ENOMEM (prompt buffer) or to
         errno after a failed dataset_clone() or save_sts().
  xname  set to a newly allocated copy of PCP_STS if save_sts() fails.

  If cloning or saving fails, both data sets and the status file are
  removed via remove_datasets().
*/
void p_copy(int *errc, char **xname)
{
  int  type;
  int  min_range = 0;
  int  max_range = 1;
  char *msg;

  if (!teds && !tds)
    return;

  *errc = 0;
  if (tds && !teds)
    type = 0;
  else if (!tds && teds)
    type = 1;
  else
    {
      /* Allocate the prompt before touching the screen, so that an
         allocation failure leaves the terminal state untouched. */
      msg = malloc((PCP_QLEN+1)*sizeof(char));
      if (msg == (char *) 0)
        {
          *errc = ENOMEM;
          return;
        }
      clear_screen();
      cursor_on();
      /* Bounded formatting: the prompt can never overrun the buffer. */
      snprintf(msg, PCP_QLEN+1,
               " Copy training set to test (%d) or vice-versa (%d) [%d]:", 
               min_range, max_range, min_range);
      type = input_integer(stdin, stdout, msg, PCP_QLEN, &min_range, &min_range, 
                           &max_range);
      free(msg);
      /* NOTE(review): unlike p_load(), the cursor is not switched off
         again here - confirm whether the caller does that. */
    }

  if (type == 0)
    {
      /* training -> test */
      dataset_free(teds);
      teds = dataset_clone(tds);
      if (teds == (struct dataset *) 0)
        {
          *errc = errno;
          remove_datasets();
        }
    }
  else
    {
      /* test -> training */
      dataset_free(tds);
      tds = dataset_clone(teds);
      if (tds == (struct dataset *) 0)
        {
          *errc = errno;
          remove_datasets();
        }
    }

  if ((*errc == 0) && (save_sts(PCP_STS, teds, tds) == -1))
    {
      *errc = errno;
      *xname = strdup(PCP_STS);
      remove_datasets();
    }
}

/*
  Transform data set(s) using transformation 'ttype'. The available
  values for ttype are P_NORMALIZE and P_MAP. Optionally copy the
  transformed data set(s) into the current data set.

  The data sets to transform are chosen by input_replace(), which
  yields 'dset1' and 'dset2'; either may be null.

  P_NORMALIZE: mean and standard deviation are computed either from
  both data sets combined (mode 0) or from one data set only, and then
  applied to every data set that is present. The user is asked for the
  mode only when both data sets are present; with a single data set its
  own statistics are used.

  P_MAP: the user is asked for the file holding the linear map, which
  is then applied to every data set that is present.

  The transformed data sets are written with dataset_write(); if
  input_replace() returned P_REPLACE, the status file is updated too.

  errc   set to 0 on success, otherwise to an error code
         (PERR_INCONSISTENT_MAP, or errno of the failing step).
  xname  on failure, a newly allocated copy of the name of the file
         involved: PCP_STS if the status file could not be saved,
         otherwise the current transformation/output file name if one
         is known.
*/
void p_transform(int ttype, int *errc, char **xname)
{
  int   status = 0;
  int   mode;
  int   lxd = -1;
  int   nx = 0;
  int   md = 0;   /* 0: use training and test datasets to compute the scaling/recentering */
                  /* 1: use training dataset to compute the scaling/recentering */
  int   nsamples;
  int   combined;
  int   xname_set = 0;   /* 'xname' already names the offending file */
  char  *iffix = (char *) 0;
  char  *fnm = (char *) 0;
  float *xmean;
  float *std;
  float **mapx;
  float **srx;
  struct dataset *dset1 = (struct dataset *) 0;
  struct dataset *dset2 = (struct dataset *) 0;
  struct dataset *ref;

  if (ttype == P_NORMALIZE)
    iffix = "nrm";
  else if (ttype == P_MAP)
    iffix = "map";
  clear_screen();
  cursor_on();
  *errc = 0;
  if (ttype == P_MAP)
    {
      /* NOTE(review): ownership of the string returned by
         input_filename() is not visible here - confirm who frees it. */
      fnm = input_filename(PMSG_LIN_INPUT_FNAME, PCP_LIN, stdout);
      lxd = file_info(fnm, (int *) 0, &nx, '\0');
      if (lxd < 0)
        status = -1;
    }
  if (!status)
    {
      mode = input_replace(&dset1, &dset2, iffix);
      if (ttype == P_NORMALIZE)
        {
          /* The mode question only makes sense with two data sets. */
          if (dset1 && dset2)
            md = input_transform_mode();
          /* Statistics come from dset2 when present, else from dset1. */
          ref = dset2 ? dset2 : dset1;
          if (ref)
            {
              combined = (dset1 && dset2 && (md == 0));
              if (combined)
                {
                  srx = combine_x(dset2->x, dset2->nv, dset1->x, dset1->nv);
                  nsamples = dset2->nv+dset1->nv;
                }
              else
                {
                  srx = ref->x;
                  nsamples = ref->nv;
                }
              if (srx == (float **) 0)
                status = -1;
              else
                {
                  xmean = fmx_mean(srx, nsamples, ref->d);
                  std = fmx_std(srx, nsamples, ref->d);
                  /* only the combined matrix is a temporary of ours */
                  if (combined)
                    vx_free(srx);
                  if (xmean && std)
                    {
                      if (dset2)
                        fmx_prenorm(dset2->x, dset2->nv, dset2->d, xmean, std);
                      if (dset1)
                        fmx_prenorm(dset1->x, dset1->nv, dset1->d, xmean, std);
                    }
                  else
                    status = -1;
                  /* NOTE(review): 'xmean' and 'std' are not released
                     here; the matching deallocator is not visible in
                     this file - confirm and free them. */
                }
            }
        }
      else if (ttype == P_MAP)
        {
          /*
            lxd is dimension of transformed space, nx should be equal to the
            dimensionality of TDS/TEDS. Verify that nx is consistent with
            TDS/TEDS.

            NOTE(review): the test below only rejects nx smaller than the
            data set dimensionality - confirm whether a larger nx is
            really acceptable.
          */
          if ((dset1 && (nx < dset1->d)) || (dset2 && (nx < dset2->d)))
            {
              status = -1;
              *errc = PERR_INCONSISTENT_MAP;
            }
          else
            {
              mapx = fmx_load(fnm, (int *) 0, (int *) 0, '\0');
              if (mapx)
                {
                  status = dataset_mapx(dset1, lxd, mapx);
                  /* a failure on the second data set must not be lost */
                  if (!status && dset2)
                    status = dataset_mapx(dset2, lxd, mapx);
                  /* NOTE(review): 'mapx' is not released here; the
                     matching deallocator is not visible - confirm. */
                }
              else
                status = -1;
            }
        }
      if (!status)
        {
          status = dataset_write(dset1, &fnm);
          if (!status)
            status = dataset_write(dset2, &fnm);
          if ((mode == P_REPLACE) && !status)
            {
              status = save_sts(PCP_STS, teds, tds);
              if (status == -1)
                {
                  *errc = errno;
                  *xname = strdup(PCP_STS);
                  xname_set = 1;
                  remove_datasets();
                }
            }
        }
    }
  if (status == -1)
    {
      if (*errc == 0)
        *errc = errno;
      /* Keep the status file name if that is what failed; never copy
         a file name that was not set. */
      if (!xname_set)
        *xname = fnm ? strdup(fnm) : (char *) 0;
    }
}


/* syntax highlighted by Code2HTML, v. 0.9.1 */