/*
File name: load_dset.c
Created by: Ljubomir Buturovic
Created: 09/06/2002
Purpose: data loading functions.
*/
/*
Copyright 2004 Ljubomir J. Buturovic
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation files
(the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of the Software,
and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
static char rcsid[] = "$Id: load_dset.c,v 1.68 2006/04/21 14:30:19 ljubomir Exp $";
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "lerr.h"
#include "lau.h"
#include "dataset.h"
#include "pau.h"
#include "pcp.h"
/*
  Marker texts of the status file, test data set section.  save_sts()
  writes each value followed by its marker; load_sts() recognizes the
  header lines by searching for the marker with strstr().
  PSTS_TESTND and PSTS_TESTNAME are only written; load_sts() reads those
  lines by position (parser state), not by marker.
*/
#define PSTS_TESTNV "# testing data set cardinality"
#define PSTS_TESTD "# number of features"
#define PSTS_TESTC "# number of data files"
#define PSTS_TESTFMT "# test set file format"
#define PSTS_TESTND "# file cardinality"
#define PSTS_TESTNAME "# file name"
/*
  Marker texts of the status file, training data set section.
  Note that PSTS_TD is the same text as PSTS_TESTD; load_sts() tells the
  two apart by its parser state.
*/
#define PSTS_TNV "# training data set cardinality"
#define PSTS_TD "# number of features"
#define PSTS_TC "# number of classes"
#define PSTS_TFMT "# training set file format"
#define PSTS_TND "# class cardinality"
#define PSTS_TNAME "# training set class name"
/*
  Parser states of load_sts(), in the order the sections appear in the
  status file.
*/
#define PSTS_STATE_TESTDS 0 /* test data set until test file cardinalities */
#define PSTS_STATE_TESTVECTORS 1 /* test data set file cardinalities */
#define PSTS_STATE_TESTCLASS 2 /* test data set file names */
#define PSTS_STATE_TDS 3 /* training data set until training class cardinalities */
#define PSTS_STATE_TVECTORS 4 /* training data set class cardinalities */
#define PSTS_STATE_TCLASS 5 /* training data set class names */
#define PSTS_STATE_DONE 6
/*
  Intermediate data loading function.

  Asks the user which data set to load (0 or 1, prompt text
  PCP_UMSG_LOAD), reads it with pcp_input() into the global 'tds'
  (answer 0) or 'teds' (answer 1), and, if a data set was obtained,
  rewrites the status file PCP_STS with save_sts().

  errc:  receives ENOMEM if the prompt buffer cannot be allocated, or
         errno if save_sts() fails; it is also handed to pcp_input().
         This function does not reset it to 0 itself.
  xname: handed to pcp_input(); if save_sts() fails it receives a
         strdup()'ed copy of PCP_STS.

  If save_sts() fails, both global data sets are released.
*/
void p_load(int *errc, char **xname)
{
  int type;
  int status;
  int min_range = 0;
  int max_range = 1;
  char *msg;
  struct dataset *dset = NULL;

  clear_screen();
  cursor_on();

  msg = malloc((PCP_QLEN+1)*sizeof(char));
  if (msg == NULL)
    {
      /* Nothing can be prompted for; restore the cursor and give up. */
      cursor_off();
      *errc = ENOMEM;
      return;
    }
  /* Bounded formatting: the prompt can never overrun the buffer. */
  snprintf(msg, PCP_QLEN+1, PCP_UMSG_LOAD, min_range, max_range, min_range);
  type = input_integer(stdin, stdout, msg, PCP_QLEN, &min_range, &min_range,
                       &max_range);
  free(msg);

  /*
    NOTE(review): the previous value of tds/teds is overwritten without
    being released here; confirm whether pcp_input() takes care of it.
  */
  if (type == 0)
    {
      tds = pcp_input(stdin, stdout, errc, xname);
      dset = tds;
    }
  else if (type == 1)
    {
      teds = pcp_input(stdin, stdout, errc, xname);
      dset = teds;
    }
  cursor_off();

  if (dset)
    {
      status = save_sts(PCP_STS, teds, tds);
      if (status == -1)
        {
          /*
            Reset data sets here due to save_sts() error.  errno is
            captured first, before the cleanup calls can disturb it.
          */
          *errc = errno;
          teds = dataset_free(teds);
          tds = dataset_free(tds);
          *xname = strdup(PCP_STS);
        }
    }
}
/*
  Write one data set section of the status file to 'fptr'.

  For a null 'dset' only a zero cardinality line is written.  Otherwise
  the cardinality, number of features, number of files/classes and the
  file format are written, followed by one cardinality line and one
  name line per file/class.  'unit' is the word printed before the
  1-based index on the per-entry lines ("file" or "class"); the '*_tag'
  arguments are the marker texts appended to each value.
*/
static void sts_write_section(FILE *fptr, struct dataset *dset,
                              const char *nv_tag, const char *d_tag,
                              const char *c_tag, const char *fmt_tag,
                              const char *nd_tag, const char *name_tag,
                              const char *unit)
{
  int i;

  if (!dset)
    {
      fprintf(fptr, "%-35d %s\n", 0, nv_tag);
      return;
    }
  fprintf(fptr, "%-35d %s\n", dset->nv, nv_tag);
  fprintf(fptr, "%-35d %s\n", dset->d, d_tag);
  fprintf(fptr, "%-35d %s\n", dset->c, c_tag);
  fprintf(fptr, "%-35d %s\n", dset->format, fmt_tag);
  for (i = 0; i < dset->c; i++)
    fprintf(fptr, "%-35d %s, %s %d\n", dset->nd[i], nd_tag, unit, i+1);
  for (i = 0; i < dset->c; i++)
    fprintf(fptr, "%-35s %s, %s %d\n", dset->fnames[i], name_tag, unit, i+1);
}

/*
  Save status of 'test_dataset', 'training_dataset' in 'fname'.
  Either data set may be null, in which case a zero cardinality is
  recorded for it.

  In case of success, return 0. In case of failure (the file cannot be
  opened, a write fails, or the file cannot be closed), return -1 and
  set errno.
*/
int save_sts(char *fname, struct dataset *test_dataset, struct dataset *training_dataset)
{
  int failure = 0;
  FILE *fptr;

  fptr = fopen(fname, "w");
  if (fptr == NULL)
    return -1;

  /* Start from a clean errno so a value seen later belongs to a write. */
  errno = 0;
  sts_write_section(fptr, test_dataset, PSTS_TESTNV, PSTS_TESTD, PSTS_TESTC,
                    PSTS_TESTFMT, PSTS_TESTND, PSTS_TESTNAME, "file");
  sts_write_section(fptr, training_dataset, PSTS_TNV, PSTS_TD, PSTS_TC,
                    PSTS_TFMT, PSTS_TND, PSTS_TNAME, "class");

  /*
    A write that failed part-way sets the stream error indicator even
    if the final flush in fclose() succeeds, so check it explicitly.
  */
  if (ferror(fptr))
    failure = errno ? errno : EIO;
  if (fclose(fptr) != 0 && failure == 0)
    failure = errno ? errno : EIO;

  if (failure)
    {
      errno = failure;
      return -1;
    }
  return 0;
}
/*
  Parse the first whitespace-delimited token of 'line' with atoi() and
  store the result in '*value'.
  Return 0 on success; if the line has no token, return -1 and set
  errno to EINVAL.
  NOTE(review): assumes str_tokenize() returns either a null pointer or
  a null-terminated vector of tokens - confirm against its declaration.
*/
static int sts_read_int(char *line, int *value)
{
  char **tokens;
  int status = -1;

  tokens = str_tokenize(line, WHITESPACE);
  if (tokens)
    {
      if (tokens[0])
        {
          *value = atoi(tokens[0]);
          status = 0;
        }
      str_free(tokens);
    }
  if (status)
    errno = EINVAL;
  return status;
}

/*
  Return a strdup()'ed copy of the first whitespace-delimited token of
  'line'.  Return a null pointer with errno set to EINVAL if the line
  has no token, or to ENOMEM if the copy cannot be allocated.
*/
static char *sts_read_name(char *line)
{
  char **tokens;
  char *name = NULL;
  int found = 0;

  tokens = str_tokenize(line, WHITESPACE);
  if (tokens)
    {
      if (tokens[0])
        {
          found = 1;
          name = strdup(tokens[0]);
        }
      str_free(tokens);
    }
  if (!found)
    errno = EINVAL;
  else if (!name)
    errno = ENOMEM;
  return name;
}

/*
  Allocate an array of 'n' ints.  A non-positive 'n' is rejected with
  errno EINVAL, because the parser would otherwise write past a
  zero-sized array; allocation failure yields errno ENOMEM.
*/
static int *sts_alloc_counts(int n)
{
  int *counts;

  if (n <= 0)
    {
      errno = EINVAL;
      return NULL;
    }
  counts = malloc((size_t) n * sizeof *counts);
  if (!counts)
    errno = ENOMEM;
  return counts;
}

/*
  Read the status file 'fname' (the format written by save_sts()).

  Test section outputs:     tenv (cardinality), ted (number of
  features), tec (number of data files), test_fmt (file format),
  test_nd (array of 'tec' per-file cardinalities), tenames (array of
  'tec' file names).
  Training section outputs: tnv, td, tc, training_fmt, training_nd and
  tnames, with classes in place of files.

  All integer outputs are set to 0 on entry.  The array outputs are
  assigned only once the corresponding list has been read completely;
  otherwise they are left untouched.  Arrays (and the name strings) are
  heap-allocated and become the property of the caller.

  Return 0 on success; a file that simply ends early is still reported
  as success, as before.  Return -1 with errno set if the file cannot
  be inspected or opened, memory runs out (ENOMEM), or the contents are
  malformed (EINVAL: a list line without a token, or a non-positive
  number of files/classes for a list).  Arrays already handed to the
  caller before a failure remain assigned.
*/
static int load_sts(char *fname, int *tenv, int *ted, int *tec, char ***tenames,
                    int **test_nd, int *tnv, int *td, int *tc, char ***tnames,
                    int **training_nd, int *test_fmt, int *training_fmt)
{
  int state = PSTS_STATE_TESTDS;
  int failed = 0;
  int saved_errno;
  int maxlen;
  int fcntr = 0;          /* entries read so far in the current list */
  int *nvc = NULL;        /* cardinality list under construction */
  char *line;
  char **names = NULL;    /* name list under construction */
  FILE *fptr;

  /* Give the caller defined values even for an empty or partial file. */
  *tenv = *ted = *tec = *test_fmt = 0;
  *tnv = *td = *tc = *training_fmt = 0;

  if (file_info(fname, &maxlen, (int *) 0, '\0') == -1)
    return -1;
  fptr = fopen(fname, "r");
  if (fptr == NULL)
    return -1;
  line = malloc(maxlen+2);
  if (line == NULL)
    {
      fclose(fptr);
      errno = ENOMEM;
      return -1;
    }

  while (!failed && state != PSTS_STATE_DONE && fgets(line, maxlen+2, fptr))
    {
      switch (state)
        {
        /* test data set info */
        case PSTS_STATE_TESTDS:
          if (strstr(line, PSTS_TESTNV))
            {
              failed = sts_read_int(line, tenv);
              /* No test data set recorded: training section follows. */
              if (!failed && *tenv == 0)
                state = PSTS_STATE_TDS;
            }
          else if (strstr(line, PSTS_TESTD))
            {
              failed = sts_read_int(line, ted);
            }
          else if (strstr(line, PSTS_TESTC))
            {
              failed = sts_read_int(line, tec);
            }
          else if (strstr(line, PSTS_TESTFMT))
            {
              /* The format line is the last header line of the section. */
              failed = sts_read_int(line, test_fmt);
              if (!failed)
                {
                  nvc = sts_alloc_counts(*tec);
                  if (nvc)
                    {
                      fcntr = 0;
                      state = PSTS_STATE_TESTVECTORS;
                    }
                  else
                    failed = -1;
                }
            }
          break;

        case PSTS_STATE_TESTVECTORS:
          failed = sts_read_int(line, &nvc[fcntr]);
          if (!failed && ++fcntr == *tec)
            {
              /* List complete: ownership passes to the caller. */
              *test_nd = nvc;
              nvc = NULL;
              names = calloc((size_t) *tec, sizeof *names);
              if (names)
                {
                  fcntr = 0;
                  state = PSTS_STATE_TESTCLASS;
                }
              else
                {
                  errno = ENOMEM;
                  failed = -1;
                }
            }
          break;

        case PSTS_STATE_TESTCLASS:
          names[fcntr] = sts_read_name(line);
          if (!names[fcntr])
            failed = -1;
          else if (++fcntr == *tec)
            {
              *tenames = names;
              names = NULL;
              fcntr = 0;
              state = PSTS_STATE_TDS;
            }
          break;

        /* training data set info */
        case PSTS_STATE_TDS:
          if (strstr(line, PSTS_TNV))
            {
              failed = sts_read_int(line, tnv);
            }
          else if (strstr(line, PSTS_TD))
            {
              failed = sts_read_int(line, td);
            }
          else if (strstr(line, PSTS_TC))
            {
              failed = sts_read_int(line, tc);
            }
          else if (strstr(line, PSTS_TFMT))
            {
              failed = sts_read_int(line, training_fmt);
              if (!failed)
                {
                  nvc = sts_alloc_counts(*tc);
                  if (nvc)
                    {
                      fcntr = 0;
                      state = PSTS_STATE_TVECTORS;
                    }
                  else
                    failed = -1;
                }
            }
          break;

        case PSTS_STATE_TVECTORS:
          failed = sts_read_int(line, &nvc[fcntr]);
          if (!failed && ++fcntr == *tc)
            {
              *training_nd = nvc;
              nvc = NULL;
              names = calloc((size_t) *tc, sizeof *names);
              if (names)
                {
                  fcntr = 0;
                  state = PSTS_STATE_TCLASS;
                }
              else
                {
                  errno = ENOMEM;
                  failed = -1;
                }
            }
          break;

        case PSTS_STATE_TCLASS:
          names[fcntr] = sts_read_name(line);
          if (!names[fcntr])
            failed = -1;
          else if (++fcntr == *tc)
            {
              *tnames = names;
              names = NULL;
              state = PSTS_STATE_DONE;
            }
          break;

        default:
          break;
        }
    }

  /* Keep the error code of the failing step across the cleanup calls. */
  saved_errno = errno;
  fclose(fptr);
  free(line);
  /*
    Lists that were never completed were not handed to the caller and
    must be released here.  While 'names' is owned locally, 'fcntr' is
    the number of entries filled in so far.
  */
  free(nvc);
  if (names)
    {
      while (fcntr > 0)
        free(names[--fcntr]);
      free(names);
    }
  if (failed)
    {
      errno = saved_errno;
      return -1;
    }
  return 0;
}
/*
  Startup data load. Reads the status file PCP_STS, then loads data from
  the input files it names into the global C data structures 'teds' and
  'tds'.  A data set is loaded only if the status file records a
  positive cardinality for it, and the training set is attempted only
  if the test set produced no error.

  In case of success, set 'errc' to 0. Otherwise, set errc to error
  code. If 'errc' is file-related error code, 'xname' is name of
  offending file (a strdup()'ed string).

  A missing status file (ENOENT) is not an error.  If load_dataset()
  reports LERR_FILE_FORMAT, the code is translated to
  PERR_INCONSISTENT_FILE and the status file is kept; on any other
  error the status file is removed.
*/
void init_load(int *errc, char **xname)
{
  int status;
  int load_errno;
  /*
    Everything load_sts() is expected to fill in starts out defined, so
    that a failed or partial read never leaves indeterminate values in
    the tests and cleanup calls below.
  */
  int test_fmt = 0;
  int training_fmt = 0;
  int nf = 0;    /* test set: number of features */
  int tv = 0;    /* test set: cardinality */
  int nfs = 0;   /* test set: number of data files */
  int tnf = 0;   /* training set: number of features */
  int ttv = 0;   /* training set: cardinality */
  int nc = 0;    /* training set: number of classes */
  int *test_nd = NULL;
  int *training_nd = NULL;
  char *sts_name;
  char *fnm = NULL;      /* file name reported by load_dataset(), if any */
  char **tenames = NULL;
  char **tnames = NULL;

  *errc = 0;
  sts_name = strdup(PCP_STS);
  if (sts_name == NULL)
    {
      *errc = ENOMEM;
      return;
    }

  status = load_sts(sts_name, &tv, &nf, &nfs, &tenames, &test_nd, &ttv, &tnf, &nc,
                    &tnames, &training_nd, &test_fmt, &training_fmt);
  load_errno = errno;   /* capture before any other call can change it */

  if (status == 0)
    {
      if (tv > 0)
        teds = load_dataset(nf, nfs, test_nd, tenames, test_fmt, errc, &fnm);
      if (*errc == 0 && ttv > 0)
        tds = load_dataset(tnf, nc, training_nd, tnames, training_fmt, errc, &fnm);
    }
  /*
    We don't want to complain if status file is missing - that's OK.
  */
  else if (load_errno != ENOENT)
    *errc = load_errno;

  if (*errc)
    {
      /*
        Report the file named by load_dataset() when there is one;
        otherwise the status file itself is the offending file.
        NOTE(review): ownership of the string load_dataset() stores in
        'fnm' is not visible here, so it is copied but not released.
      */
      *xname = strdup(fnm ? fnm : sts_name);
      if (*errc == LERR_FILE_FORMAT)
        {
          /*
            We know that if load_dataset() reports LERR_FILE_FORMAT,
            either number of features, or number of vectors in the input
            file is inconsistent with the values found in
            PCP_STS. Therefore we can report a more specific message.
            In this particular case, we don't want to remove the PCP_STS
            file, so that user can diagnose the error.
          */
          *errc = PERR_INCONSISTENT_FILE;
        }
      else
        {
          /*
            Other type of file loading error - remove the status file.
          */
          unlink(PCP_STS);
        }
    }

  free(sts_name);
  strlen_free(tenames, nfs);
  strlen_free(tnames, nc);
  free(training_nd);
  free(test_nd);
}
/*
  Error-path cleanup: drop both global data sets and delete the status
  file PCP_STS.  Each global is reassigned to whatever dataset_free()
  returns for it.  The result of unlink() is deliberately not examined.
*/
void remove_datasets(void)
{
  teds = dataset_free(teds);
  tds = dataset_free(tds);
  (void) unlink(PCP_STS);
}
/*
  Copy data set.

  Does nothing when neither global data set is loaded.  If only the
  training set 'tds' is loaded it is cloned onto the test set 'teds';
  if only 'teds' is loaded it is cloned onto 'tds'; if both are loaded
  the user is asked for the direction (0: training to test, 1: test to
  training).  On success the status file PCP_STS is rewritten.

  errc:  set to 0 once a data set is known to exist, then to ENOMEM if
         the prompt buffer cannot be allocated, or to errno if cloning
         or save_sts() fails.
  xname: receives a strdup()'ed copy of PCP_STS if save_sts() fails.

  Whenever cloning or saving fails, both data sets and the status file
  are removed with remove_datasets().
*/
void p_copy(int *errc, char **xname)
{
  int type;
  int status;
  int min_range;
  int max_range;
  char *msg;

  if (!teds && !tds)
    return;

  *errc = 0;
  if (tds && !teds)
    type = 0;
  else if (!tds && teds)
    type = 1;
  else
    {
      clear_screen();
      cursor_on();
      msg = malloc((PCP_QLEN+1)*sizeof(char));
      if (msg == NULL)
        {
          cursor_off();
          *errc = ENOMEM;
          return;
        }
      min_range = 0;
      max_range = 1;
      /* Bounded formatting: the prompt can never overrun the buffer. */
      snprintf(msg, PCP_QLEN+1,
               " Copy training set to test (%d) or vice-versa (%d) [%d]:",
               min_range, max_range, min_range);
      type = input_integer(stdin, stdout, msg, PCP_QLEN, &min_range, &min_range,
                           &max_range);
      free(msg);
      /* Pair cursor_on() with cursor_off(), as p_load() does. */
      cursor_off();
    }

  if (type == 0)
    {
      /* training -> test */
      dataset_free(teds);
      teds = dataset_clone(tds);
      if (teds == NULL)
        {
          *errc = errno;
          remove_datasets();
        }
    }
  else
    {
      /* test -> training */
      dataset_free(tds);
      tds = dataset_clone(teds);
      if (tds == NULL)
        {
          *errc = errno;
          remove_datasets();
        }
    }

  if (*errc == 0)
    {
      status = save_sts(PCP_STS, teds, tds);
      if (status == -1)
        {
          /* errno is captured before the cleanup calls can disturb it. */
          *errc = errno;
          *xname = strdup(PCP_STS);
          remove_datasets();
        }
    }
}
/*
Transform data set(s) using transformation 'ttype'. The available
values for ttype are P_NORMALIZE and P_MAP. Optionally copy the
transformed data set(s) into the current data set.
*/
void p_transform(int ttype, int *errc, char **xname)
{
int status;
int mode;
int lxd;
int nx;
int md;
int nsamples;
char *msg;
char *iffix;
char *fnm;
float *xmean;
float *std;
float **mapx;
float **srx;
struct dataset *dset1 = (struct dataset *) 0;
struct dataset *dset2 = (struct dataset *) 0;
md = 0; /* 0: use training and test datasets to compute the scaling/recenterin */
/* 1: use training dataset to compute the scaling/recentering */
lxd = -1;
iffix = (char *) 0;
status = 0;
if (ttype == P_NORMALIZE)
iffix = "nrm";
else if (ttype == P_MAP)
iffix = "map";
clear_screen();
cursor_on();
*errc = 0;
msg = malloc((PCP_QLEN+1)*sizeof(char));
if (ttype == P_MAP)
{
fnm = input_filename(PMSG_LIN_INPUT_FNAME, PCP_LIN, stdout);
lxd = file_info(fnm, (int *) 0, &nx, '\0');
if (lxd < 0)
status = -1;
}
if (!status)
{
mode = input_replace(&dset1, &dset2, iffix);
if (ttype == P_NORMALIZE)
{
if (dset1)
md = input_transform_mode();
if (md == 0)
{
srx = combine_x(dset2->x, dset2->nv, dset1->x, dset1->nv);
nsamples = dset2->nv+dset1->nv;
}
else
{
srx = dset2->x;
nsamples = dset2->nv;
}
xmean = fmx_mean(srx, nsamples, dset2->d);
std = fmx_std(srx, nsamples, dset2->d);
if (md == 0)
vx_free(srx);
fmx_prenorm(dset2->x, dset2->nv, dset2->d, xmean, std);
if (dset1)
fmx_prenorm(dset1->x, dset1->nv, dset1->d, xmean, std);
}
else if (ttype == P_MAP)
{
/*
lxd is dimension of transformed space, nx should be equal to the
dimensionality of TDS/TEDS. Verify that nx is consistent with
TDS/TEDS.
*/
if ((dset1 && (nx < dset1->d)) || (dset2 && (nx < dset2->d)))
{
status = -1;
*errc = PERR_INCONSISTENT_MAP;
}
else
{
mapx = fmx_load(fnm, (int *) 0, (int *) 0, '\0');
if (mapx)
{
status = dataset_mapx(dset1, lxd, mapx);
if (!status)
dataset_mapx(dset2, lxd, mapx);
}
else
status = -1;
}
}
if (!status)
{
status = dataset_write(dset1, &fnm);
if (!status)
status = dataset_write(dset2, &fnm);
if ((mode == P_REPLACE) && !status)
{
status = save_sts(PCP_STS, teds, tds);
if (status == -1)
{
*errc = errno;
*xname = strdup(PCP_STS);
remove_datasets();
}
}
}
}
if (status == -1)
{
if (*errc == 0)
*errc = errno;
*xname = strdup(fnm);
}
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */