/* File name: load_dset.c Created by: Ljubomir Buturovic Created: 09/06/2002 Purpose: data loading functions. */ /* Copyright 2004 Ljubomir J. Buturovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ static char rcsid[] = "$Id: load_dset.c,v 1.68 2006/04/21 14:30:19 ljubomir Exp $"; #include #include #include #include #include "lerr.h" #include "lau.h" #include "dataset.h" #include "pau.h" #include "pcp.h" #define PSTS_TESTNV "# testing data set cardinality" #define PSTS_TESTD "# number of features" #define PSTS_TESTC "# number of data files" #define PSTS_TESTFMT "# test set file format" #define PSTS_TESTND "# file cardinality" #define PSTS_TESTNAME "# file name" #define PSTS_TNV "# training data set cardinality" #define PSTS_TD "# number of features" #define PSTS_TC "# number of classes" #define PSTS_TFMT "# training set file format" #define PSTS_TND "# class cardinality" #define PSTS_TNAME "# training set class name" #define PSTS_STATE_TESTDS 0 /* test data set until test file cardinalities */ #define PSTS_STATE_TESTVECTORS 1 /* test data set file cardinalities */ #define PSTS_STATE_TESTCLASS 2 /* test data set file names */ #define PSTS_STATE_TDS 3 /* training data set until training class cardinalities */ #define PSTS_STATE_TVECTORS 4 /* training data set class cardinalities */ #define PSTS_STATE_TCLASS 5 /* training data set class names */ #define PSTS_STATE_DONE 6 /* Intermediate data loading function. */ void p_load(int *errc, char **xname) { int type; int status; int min_range; int max_range; char *msg; struct dataset *dset; dset = (struct dataset *) 0; clear_screen(); cursor_on(); msg = malloc((PCP_QLEN+1)*sizeof(char)); min_range = 0; max_range = 1; sprintf(msg, PCP_UMSG_LOAD, min_range, max_range, min_range); type = input_integer(stdin, stdout, msg, PCP_QLEN, &min_range, &min_range, &max_range); free(msg); if (type == 0) { tds = pcp_input(stdin, stdout, errc, xname); dset = tds; } else if (type == 1) { teds = pcp_input(stdin, stdout, errc, xname); dset = teds; } cursor_off(); if (dset) { status = save_sts(PCP_STS, teds, tds); if (status == -1) { /* Reset data sets here due to save_sts() error. */ *errc = errno; teds = dataset_free(teds); tds = dataset_free(tds); *xname = strdup(PCP_STS); } } } /* Save status of 'test_dataset', 'training_dataset' in 'fname'. In case of success, return 0. In case of failure, return -1 and set errno. */ int save_sts(char *fname, struct dataset *test_dataset, struct dataset *training_dataset) { int i; int status = 0; int nx; FILE *fptr; fptr = fopen(fname, "w"); if (fptr) { if (test_dataset) { fprintf(fptr, "%-35d %s\n", test_dataset->nv, PSTS_TESTNV); fprintf(fptr, "%-35d %s\n", test_dataset->d, PSTS_TESTD); fprintf(fptr, "%-35d %s\n", test_dataset->c, PSTS_TESTC); fprintf(fptr, "%-35d %s\n", test_dataset->format, PSTS_TESTFMT); for (i = 0; i < test_dataset->c; i++) fprintf(fptr, "%-35d %s, file %d\n", test_dataset->nd[i], PSTS_TESTND, i+1); for (i = 0; i < test_dataset->c; i++) fprintf(fptr, "%-35s %s, file %d\n", test_dataset->fnames[i], PSTS_TESTNAME, i+1); } else { nx = 0; fprintf(fptr, "%-35d %s\n", nx, PSTS_TESTNV); } if (training_dataset) { fprintf(fptr, "%-35d %s\n", training_dataset->nv, PSTS_TNV); fprintf(fptr, "%-35d %s\n", training_dataset->d, PSTS_TD); fprintf(fptr, "%-35d %s\n", training_dataset->c, PSTS_TC); fprintf(fptr, "%-35d %s\n", training_dataset->format, PSTS_TFMT); for (i = 0; i < training_dataset->c; i++) fprintf(fptr, "%-35d %s, class %d\n", training_dataset->nd[i], PSTS_TND, i+1); for (i = 0; i < training_dataset->c; i++) fprintf(fptr, "%-35s %s, class %d\n", training_dataset->fnames[i], PSTS_TNAME, i+1); } else { nx = 0; fprintf(fptr, "%-35d%s\n", nx, PSTS_TNV); } status = fclose(fptr); } else status = -1; return status; } static int load_sts(char *fname, int *tenv, int *ted, int *tec, char ***tenames, int **test_nd, int *tnv, int *td, int *tc, char ***tnames, int **training_nd, int *test_fmt, int *training_fmt) { int status = 0; int state = PSTS_STATE_TESTDS; int done; int maxlen; int fcntr; int *nvc; char *line; char **tokens; char **names; FILE *fptr; fcntr = -1; nvc = (int *) 0; names = (char **) 0; status = file_info(fname, &maxlen, (int *) 0, '\0'); if (status != -1) { status = 0; fptr = fopen(fname, "r"); if (fptr != (FILE *) 0) { line = malloc(maxlen+2); done = 0; while (fgets(line, maxlen+2, fptr) && (done == 0)) { /* test data set info */ if (state == PSTS_STATE_TESTDS) { if (strstr(line, PSTS_TESTNV)) { tokens = str_tokenize(line, WHITESPACE); *tenv = atoi(tokens[0]); str_free(tokens); if (*tenv == 0) state = PSTS_STATE_TDS; } else if (strstr(line, PSTS_TESTD)) { tokens = str_tokenize(line, WHITESPACE); *ted = atoi(tokens[0]); str_free(tokens); } else if (strstr(line, PSTS_TESTC)) { tokens = str_tokenize(line, WHITESPACE); *tec = atoi(tokens[0]); str_free(tokens); } else if (strstr(line, PSTS_TESTFMT)) { tokens = str_tokenize(line, WHITESPACE); *test_fmt = atoi(tokens[0]); str_free(tokens); state = PSTS_STATE_TESTVECTORS; fcntr = 0; nvc = malloc(*tec*sizeof(int)); } } else if (state == PSTS_STATE_TESTVECTORS) { tokens = str_tokenize(line, WHITESPACE); nvc[fcntr] = atoi(tokens[0]); str_free(tokens); fcntr++; if (fcntr == *tec) { state = PSTS_STATE_TESTCLASS; fcntr = 0; *test_nd = nvc; names = malloc(*tec*sizeof(char *)); } } else if (state == PSTS_STATE_TESTCLASS) { tokens = str_tokenize(line, WHITESPACE); names[fcntr] = strdup(tokens[0]); str_free(tokens); fcntr++; if (fcntr == *tec) { state = PSTS_STATE_TDS; *tenames = names; } } /* training data set info */ else if (state == PSTS_STATE_TDS) { if (strstr(line, PSTS_TNV)) { tokens = str_tokenize(line, WHITESPACE); *tnv = atoi(tokens[0]); str_free(tokens); } else if (strstr(line, PSTS_TD)) { tokens = str_tokenize(line, WHITESPACE); *td = atoi(tokens[0]); str_free(tokens); } else if (strstr(line, PSTS_TC)) { tokens = str_tokenize(line, WHITESPACE); *tc = atoi(tokens[0]); str_free(tokens); } else if (strstr(line, PSTS_TFMT)) { tokens = str_tokenize(line, WHITESPACE); *training_fmt = atoi(tokens[0]); str_free(tokens); state = PSTS_STATE_TVECTORS; fcntr = 0; nvc = malloc(*tc*sizeof(int)); } } else if (state == PSTS_STATE_TVECTORS) { tokens = str_tokenize(line, WHITESPACE); nvc[fcntr] = atoi(tokens[0]); str_free(tokens); fcntr++; if (fcntr == *tc) { *training_nd = nvc; state = PSTS_STATE_TCLASS; fcntr = 0; names = malloc(*tc*sizeof(char *)); } } else if (state == PSTS_STATE_TCLASS) { tokens = str_tokenize(line, WHITESPACE); names[fcntr++] = strdup(tokens[0]); str_free(tokens); if (fcntr == *tc) { state = PSTS_STATE_DONE; *tnames = names; } } } fclose(fptr); free(line); } else status = -1; } return status; } /* Startup data load. Reads the status file, then loads data from input files into the global C data structures 'teds' and 'tds'. In case of success, set 'errc' to 0. Otherwise, set errc to error code. If 'errc' is file-related error code, 'xname' is name of offending file. */ void init_load(int *errc, char **xname) { int status; int test_fmt; int training_fmt; int nf; int tv; int nfs; int tnf; int ttv; int nc; int *test_nd; int *training_nd; char *fnm; char **tenames; char **tnames; *errc = 0; tenames = (char **) 0; tnames = (char **) 0; test_nd = (int *) 0; training_nd = (int *) 0; fnm = strdup(PCP_STS); status = load_sts(fnm, &tv, &nf, &nfs, &tenames, &test_nd, &ttv, &tnf, &nc, &tnames, &training_nd, &test_fmt, &training_fmt); if (!status) { free(fnm); if (tv > 0) teds = load_dataset(nf, nfs, test_nd, tenames, test_fmt, errc, &fnm); if (*errc == 0) { if (ttv > 0) tds = load_dataset(tnf, nc, training_nd, tnames, training_fmt, errc, &fnm); } } /* We don't want to complain if status file is missing - that's OK. */ else { if (errno != ENOENT) *errc = errno; } if (*errc) { *xname = strdup(fnm); if (*errc == LERR_FILE_FORMAT) /* We know that if load_dataset() reports LERR_FILE_FORMAT, either number of features, or number of vectors in the input file is inconsistent with the values found in PCP_STS. Therefore we can report a more specific message. In this particular case, we don't want to remove the PCP_STS file, so that user can diagnose the error. */ *errc = PERR_INCONSISTENT_FILE; else /* Other type of file loading error - remove the status file. */ unlink(PCP_STS); } strlen_free(tenames, nfs); strlen_free(tnames, nc); free(training_nd); free(test_nd); } /* Remove datasets (used in case of error). */ void remove_datasets(void) { teds = dataset_free(teds); tds = dataset_free(tds); unlink(PCP_STS); } /* Copy data set. */ void p_copy(int *errc, char **xname) { int type; int status; int min_range; int max_range; char *msg; if (teds || tds) { *errc = 0; if (tds && !teds) type = 0; else if (!tds && teds) type = 1; else { clear_screen(); cursor_on(); msg = malloc((PCP_QLEN+1)*sizeof(char)); min_range = 0; max_range = 1; sprintf(msg, " Copy training set to test (%d) or vice-versa (%d) [%d]:", min_range, max_range, min_range); type = input_integer(stdin, stdout, msg, PCP_QLEN, &min_range, &min_range, &max_range); free(msg); } if (type == 0) { dataset_free(teds); teds = dataset_clone(tds); if (teds == (struct dataset *) 0) { *errc = errno; remove_datasets(); } } else { dataset_free(tds); tds = dataset_clone(teds); if (tds == (struct dataset *) 0) { *errc = errno; remove_datasets(); } } if (*errc == 0) { status = save_sts(PCP_STS, teds, tds); if (status == -1) { *errc = errno; *xname = strdup(PCP_STS); remove_datasets(); } } } } /* Transform data set(s) using transformation 'ttype'. The available values for ttype are P_NORMALIZE and P_MAP. Optionally copy the transformed data set(s) into the current data set. */ void p_transform(int ttype, int *errc, char **xname) { int status; int mode; int lxd; int nx; int md; int nsamples; char *msg; char *iffix; char *fnm; float *xmean; float *std; float **mapx; float **srx; struct dataset *dset1 = (struct dataset *) 0; struct dataset *dset2 = (struct dataset *) 0; md = 0; /* 0: use training and test datasets to compute the scaling/recenterin */ /* 1: use training dataset to compute the scaling/recentering */ lxd = -1; iffix = (char *) 0; status = 0; if (ttype == P_NORMALIZE) iffix = "nrm"; else if (ttype == P_MAP) iffix = "map"; clear_screen(); cursor_on(); *errc = 0; msg = malloc((PCP_QLEN+1)*sizeof(char)); if (ttype == P_MAP) { fnm = input_filename(PMSG_LIN_INPUT_FNAME, PCP_LIN, stdout); lxd = file_info(fnm, (int *) 0, &nx, '\0'); if (lxd < 0) status = -1; } if (!status) { mode = input_replace(&dset1, &dset2, iffix); if (ttype == P_NORMALIZE) { if (dset1) md = input_transform_mode(); if (md == 0) { srx = combine_x(dset2->x, dset2->nv, dset1->x, dset1->nv); nsamples = dset2->nv+dset1->nv; } else { srx = dset2->x; nsamples = dset2->nv; } xmean = fmx_mean(srx, nsamples, dset2->d); std = fmx_std(srx, nsamples, dset2->d); if (md == 0) vx_free(srx); fmx_prenorm(dset2->x, dset2->nv, dset2->d, xmean, std); if (dset1) fmx_prenorm(dset1->x, dset1->nv, dset1->d, xmean, std); } else if (ttype == P_MAP) { /* lxd is dimension of transformed space, nx should be equal to the dimensionality of TDS/TEDS. Verify that nx is consistent with TDS/TEDS. */ if ((dset1 && (nx < dset1->d)) || (dset2 && (nx < dset2->d))) { status = -1; *errc = PERR_INCONSISTENT_MAP; } else { mapx = fmx_load(fnm, (int *) 0, (int *) 0, '\0'); if (mapx) { status = dataset_mapx(dset1, lxd, mapx); if (!status) dataset_mapx(dset2, lxd, mapx); } else status = -1; } } if (!status) { status = dataset_write(dset1, &fnm); if (!status) status = dataset_write(dset2, &fnm); if ((mode == P_REPLACE) && !status) { status = save_sts(PCP_STS, teds, tds); if (status == -1) { *errc = errno; *xname = strdup(PCP_STS); remove_datasets(); } } } } if (status == -1) { if (*errc == 0) *errc = errno; *xname = strdup(fnm); } }