/*
File name: pau.h
Created by: Ljubomir Buturovic
Created: 02/18/2002
Purpose: declarations and macros for assorted utilities in PCP.
*/
/*
Copyright 2004 Ljubomir J. Buturovic
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation files
(the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of the Software,
and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#ifndef PAU_H
#define PAU_H
#include "pcp.h"
#include "svm.h"
#include "hash.h"
/*
ASCII code of the escape (ESC) character.
*/
#define ESCAPE_CHAR 27
/*
Sign, decimal point, exponent marker and decimal digits.
NOTE(review): presumably the set of characters accepted in numeric
input; inferred from the macro name and contents - confirm against the
input parsing code.
*/
#define DIGITS "+-.eE0123456789"
/*
Max. length of numeric input in PCP.
*/
#define PCP_MAX_NUM 30
/*
Initial number of models in .svm file. The value can be overridden by
defining INIT_NO_MODELS before this header is included.
*/
#ifndef INIT_NO_MODELS
#define INIT_NO_MODELS 20
#endif
/*
One entry of the PCP status table: an internal code, the exit code it
is translated to, and the message corresponding to the status.
*/
struct pcp_stat {
    int   internal_code; /* PCP-internal status/error code */
    int   exit_code;     /* internal code translated to an exit status */
    char *msg;           /* message corresponding to the status */
};
/*
Set the fields of 'pcp_status' from 'internal_code', 'exit_code' and
'msg'.
NOTE(review): inferred from the name and parameters; whether 'msg' is
copied or only referenced is not visible from this header.
*/
void pcp_stat_set(struct pcp_stat *pcp_status,
                  int internal_code,
                  int exit_code,
                  char *msg);
/*
Typed accessors for a hash table of struct pcp_stat entries keyed by
an integer code.
NOTE(review): presumably hashPutStat() stores 'value' under 'key' and
hashGetStat() looks 'key' up; return values on failure / missing key
are not documented here - confirm against the implementation.
*/
struct pcp_stat *hashPutStat(hash_t *hashtable,
                             int key,
                             struct pcp_stat *value);
struct pcp_stat *hashGetStat(hash_t *hashtable,
                             int key);
/*
Build the hash table of PCP status and error codes. Every element of
the table is a struct pcp_stat holding the internal code, the exit
code, and the msg corresponding to the status.
The exit code is the internal code translated into a legal exit
status value, i.e. one in the range [1..255]. The translation is
needed because UNIX programs return 'status & 0377' - the lower byte
of the status code passed to exit() - to the parent process; see
exit(3) for more details.
As a consequence only 127 codes are available for PCP status (values
in [0..128] seem to be taken, at least in Linux); there is no
provision for more. This function has to be updated every time a new
error code is introduced.
*/
hash_t *
create_status_table(void);
/*
Log the matrix 'mx' ('rows' by 'columns') to 'fdbg'.
*/
void log_mx(float **mx,
            int rows,
            int columns,
            FILE *fdbg);
/*
Log the columns of 'mx' to 'fdbg'.
*/
void log_mxt(float **mx,
             int rows,
             int columns,
             FILE *fdbg);
/*
Compute and display the covariance matrix and the matrix of linear
correlation coefficients of the data in 'xmx'.
NOTE(review): 'd', 'nv', 'errc' and 'dbg' are not described in the
original text; presumably data dimension, number of vectors,
error-code output and debug flag - confirm against the implementation.
*/
void covar(float **xmx,
           int d,
           int nv,
           int *errc,
           int dbg);
/*
Read the input data: number of features, number of classes, number of
vectors per class, and the input files.
On success the populated 'dataset' structure is returned. On error
the function returns (struct dataset *) 0 and stores the error code
in 'errc'. The error codes are PERR_BAD_INPUT_FILE, LERR_FILE_FORMAT,
or <errno.h> error codes.
For file-related errors the name of the file which caused the error
is reported as well. NOTE(review): the original text calls this
argument 'fname'; the only candidate in the prototype is 'xname'.
*/
struct dataset *pcp_input(FILE *indev,
                          FILE *outdev,
                          int *errc,
                          char **xname);
/*
Ask the question 'msg' on 'outdev' and read an integer answer from
'indev'. The question is repeated until the answer lies in
['min_range', 'max_range']. 'length' is the total length of the
question line.
When 'dflt' is not NULL, an empty input line makes the function
return that default value.
EOF is returned in case of error. The errors are bad arguments
('msg', 'indev' or 'outdev' NULL) or a malloc() error; in the latter
case 'errno' is set.
*/
int input_integer(FILE *indev,
                  FILE *outdev,
                  char *msg,
                  int length,
                  int *dflt,
                  int *min_range,
                  int *max_range);
/*
Ask the question 'msg' on 'outdev' and read an integer answer from
'indev'. The question is repeated until the answer is one of the
values in the vector 'choices', which has 'len' elements.
When 'dflt' is not NULL, an empty input line makes the function
return that default value.
EOF is returned in case of error. The errors are bad arguments
('msg', 'indev' or 'outdev' NULL) or a malloc() error; in the latter
case 'errno' is set.
NOTE(review): earlier documentation also mentioned a 'length'
argument (total length of question line); this prototype has no such
parameter.
*/
int input_choice(FILE *indev,
                 FILE *outdev,
                 char *msg,
                 int *dflt,
                 int *choices,
                 int len);
/*
Floating-point counterpart of input_integer(). The value entered must
lie in [min_range, max_range].
*/
float input_float(FILE *indev,
                  FILE *outdev,
                  char *msg,
                  int length,
                  float *dflt,
                  float *min_range,
                  float *max_range);
/*
Read a filename from stdin. 'dflt' is the default filename. The
prompt is either 'txt[dflt]:' or 'txt'; in both cases no more than
PCP_QLEN of the prompt is printed.
*/
char *input_filename(char *txt,
                     char *dflt,
                     FILE *outdev);
/*
Write 'msg' to 'outdev' centered within a field of 'length'
characters. A 'msg' longer than 'length' is truncated.
*/
void center_line(FILE *outdev,
                 char *msg,
                 int length);
/*
Print a fixed-length message built from a variable number of
arguments. 'nargs' gives the number of arguments that follow it;
those arguments are the same as for printf() and friends.
*/
void vprint_line(int nargs,
                 ...);
/*
Inverse-video variant of vprint_line(). The first argument after
`nargs' is an integer: when it equals one the message is displayed in
inverse video, otherwise in normal video.
*/
void viprint_line(int nargs,
                  ...);
/*
Write 'msg' to 'outdev' and pad the rest of the line, up to 'length',
with space characters.
*/
void print_line(FILE *outdev,
                char *msg,
                int length);
/*
Like print_line(), except that it does not set video and does not
print a newline.
*/
void print_ln(FILE *outdev,
              char *msg,
              int length);
/*
Screen control helpers.
NOTE(review): clear_screen(), cursor_on(), cursor_off(),
inverse_video() and reset_video() are undocumented in this header;
their effect is presumably what the names say - confirm against the
implementation.
*/
void clear_screen(void);

/*
Move the cursor to (x, y), where x is the column and y is the row.
The upper left corner is (0, 0).
*/
void cpos(int x,
          int y);

void cursor_on(void);
void cursor_off(void);

void inverse_video(void);
void reset_video(void);
/*
NOTE(review): input_seed() and input_nmodels() are undocumented in
this header; presumably they prompt on 'outdev' and read a random
seed / a number of models from 'indev' - confirm.
*/
unsigned int input_seed(FILE *indev,
                        FILE *outdev);
int input_nmodels(FILE *indev,
                  FILE *outdev);
/*
Enter the dimension of the transformed space for linear dimension
reduction. The arguments have obvious meanings.
*/
int input_d(FILE *indev,
            FILE *outdev,
            int max_value,
            int default_value);
/*
Enter the number of features to select for feature subset selection.
*/
int input_nfeat(FILE *indev,
                FILE *outdev,
                int max_value,
                int default_value);
/*
NOTE(review): undocumented in this header; 'max_nxval' is presumably
the upper bound of the value read - confirm.
*/
int input_nxval(FILE *indev,
                FILE *outdev,
                int max_nxval);
/*
Read the MLP parameters from stdin.
NOTE(review): the individual arguments and the return value are not
described in this header; pointer arguments are presumably outputs -
confirm against the implementation.
*/
int input_mlp(FILE *outdev,
              int inputs,
              int outputs,
              int nvec,
              int *mlp_continue,
              int *nlayers,
              int **npl,
              int *itmax,
              float *range,
              int *opt_method,
              float *eta,
              float *mu,
              int *nxval,
              int max_nxval,
              int *nmlp,
              unsigned int *seed,
              char **fname);
/*
Obtain the ensemble variant (bagging or boosting) of the original
method.
*/
int
input_ensemble_method(int method);
/*
Read the k-NN distance code.
*/
int
input_knn_dist(FILE *outdev);
/*
Get transformation (SVD, scaling) mode.
0: use training and test sets to compute the transformation
1: use only the training set
The function takes no arguments. It is declared with an explicit
(void) parameter list: before C23 an empty list '()' declares a
function with unspecified parameters, so calls with stray arguments
would not be diagnosed by the compiler.
*/
int input_transform_mode(void);
/*
Read the k-NN parameters from stdin.
NOTE(review): the individual arguments and the return value are not
described in this header - confirm against the implementation.
*/
int input_knn(FILE *outdev,
              int *nxval,
              int max_nxval,
              int *nmlp,
              int *k,
              int maxk,
              unsigned int *seed,
              char **fname,
              int *dist);
/*
NOTE(review): undocumented in this header; presumably reads the SVM
type in the same way input_kernel_type() reads the kernel type -
confirm.
*/
int input_svm_type(char *msg,
                   FILE *outdev);
/*
Read the kernel type from stdin.
*/
int input_kernel_type(char *msg,
                      FILE *outdev);
/*
Read the kernel type from a restricted selection: only the types
supported by the model selection functions pcp_svm_grid() and
pcp_svm_simplex() are accepted.
*/
int input_kernel(char *msg,
                 FILE *outdev);
/*
NOTE(review): undocumented in this header; presumably reads per-class
costs for 'c' classes into 'parameters' - confirm.
*/
void input_class_costs(struct svm_parameter *parameters,
                       int c);
/*
Read the SVM parameters from stdin and check their consistency with
'problem'.
When `full_par' is not 0, the C/nu, gamma, degree and coef0
parameters are read, depending on the SVM type. When `full_par' is 0
and `ktype' is PCP_SVM_K_NONE, only the kernel type is read. In the
remaining case these parameter values are not needed (presumably
because the caller is a function which will optimize them).
When the ensemble-method pointer is not NULL, the ensemble variant of
PALG_SVM (i.e., PALG_BAG_SVM or PALG_ADABOOST_SVM) is read as well.
NOTE(review): the original text names that pointer `method'; the
matching parameter in the prototype is 'ensemble_method'.
*/
int input_svm(FILE *outdev,
              int c,
              int nvec,
              struct svm_problem *problem,
              struct svm_parameter *parameters,
              char **fname,
              int *nxval,
              int max_nxval,
              int full_par,
              int ktype,
              int *ensemble_method);
/*
Translate an internal kernel type into the corresponding LIBSVM
kernel type. Internal kernel types are coded starting from 1, since 0
is reserved for the default kernel type.
*/
int
convert_kernel_type(int type);
/*
NOTE(review): undocumented in this header; purpose not visible from
the declaration alone.
*/
int
input_nexp(char *msg);
/*
Read the feature selection criterion, which is used to evaluate
feature subsets. Note that PCP_FSEL_GOLUB is only defined for two
classes.
*/
int input_fscrit(FILE *outdev,
                 int dr_method,
                 int c);
/*
Read the dimension reduction parameters: the method, the dimension of
the transformed space and, for feature selection methods, the feature
subset evaluation criterion.
The chosen method is the return value; the dimension of the
transformed space is stored in `idr' and the feature subset
evaluation criterion in `fscrit'.
Which methods are offered depends on the number of vectors 'nv', the
number of classes 'c', and the data dimension 'd':
                     available dimension reduction methods
c == 2, nv <= d      none, Golub, SVD, EMAP
c == 2, nv > d       none, Golub, PCA, FLD, EMAP
c > 2, nv <= d       none, SVD, EMAP
c > 2, nv > d        none, PCA, FLD, EMAP
'none' is the default option.
*/
int input_dr(FILE *outdev,
             int nv,
             int d,
             int c,
             int *idr,
             int *fscrit);
/*
Read the feature selection method.
*/
int input_fsel(FILE *outdev);
/*
Write the prediction for vector `idx' of `dset' to 'fptr' in PCP_RCL
format.
`tset' is the training dataset; it is needed only to look up the
actual class name for the input vector. This is brain-dead and a
consequence of the current one-class-per-file input format. Once the
input is converted to actual class labels the argument will become
obsolete.
*/
void write_rcl(FILE *fptr,
               int idx,
               struct dataset *dset,
               struct dataset *tset);
/*
Display the classification results for 'test_set'. The predictions
were obtained using 'algorithm'.
The predictions are evaluated and displayed as follows.
If test_set->c > 1, the test_set class labels are assumed to be the
correct classifications of the test samples, and error rates are
computed correspondingly. The test_set file names serve as class
labels.
If test_set->c == 1, the actual class labels of the test_set are
taken to be unknown, so error rates cannot be calculated; the
function then just displays the (integer) class IDs.
When the actual class labels can be established by the above logic:
'verbose' equal to 1 displays the class prediction for all vectors,
'verbose' equal to 0 displays only the misclassified vectors.
When the actual class labels are unknown (i.e., test_set->c == 1),
'verbose' is ignored and the prediction is displayed for all vectors.
*/
void predict_disp(struct dataset *test_set,
                  int verbose,
                  int algorithm);
/*
Extract the 'subset_id'-th 'training_set' and 'test_set' from 'x'.
'test_set' consists of the vectors of 'x' listed in 'sx' (see xss()
for the structure of 'sx'); 'training_set' receives the remaining
vectors of 'x'.
*/
int extract_sets(float **x,
                 int subset_id,
                 int c,
                 int *nd,
                 int d,
                 int **sx,
                 int **lx,
                 float ***training_set,
                 float ***test_set,
                 FILE *fdbg);
/*
NOTE(review): undocumented in this header; presumably logs 'msg'
together with the training and test sets to 'fdbg' - confirm.
*/
void log_tt(FILE *fdbg,
            char *msg,
            int c,
            int d,
            int *training_nd,
            int *test_nd,
            float **training_set,
            float **test_set);
/*
Log the cross-validation subset 'idx'.
*/
void log_ites(FILE *fdbg,
              int idx,
              int c,
              int nv,
              int **lxc,
              int **sxc);
/*
Produce a pseudo-random number in the interval [0, 1].
*/
float
float_rand(void);
/*
Compute and display the eigenvalues and the condition number of the
covariance matrices of the training data set.
*/
void
p_eigen(int *errc);
/*
Compute the inverse covariance matrices and determinants. 'errc' is
set to 0 on success and to an error code otherwise. The possible
errors are <errno.h> codes for the malloc()/calloc() functions, and
LERR_SINGCOV (singular covariance matrix).
*/
void
p_sigma(int *errc);
/*
NOTE(review): undocumented in this header; purpose not visible from
the declaration alone.
*/
void
xlearn_disp(int *errc,
            char **xname,
            char *default_name);
/*
Print a message in the designated area. The function takes a
variable number of arguments; 'nargs' is the number of arguments that
follow it. Those arguments are an int (the message code) and, when
nargs is larger than 1, a (char *).
For a non-zero message code the second argument is a name (usually of
a file causing the error). For a zero message code the second
argument is printed as is.
TBD: change the name. It is misleading, since not all messages are
necessarily error messages.
*/
void errmsg(int nargs,
            ...);
/*
Create a temporary file in the current directory and return its name.
The file gets the permissions -rw-r--r--.
On error (char *) 0 is returned and errno is set. The possible errors
are those of malloc(), mkstemp() and fchmod().
*/
char *
tempfile(void);
/*
Set 'dset1' and 'dset2' to deep or shallow copies of TEDS and TDS,
respectively, depending on user input. With deep copies the current
data sets get replaced by the result of the subsequent operations; if
the copies are shallow, new data sets are created and saved to disk.
'iffix' is used to define the file names of the saved data sets.
The return value is the mode: 1 means replace the data sets, 0 means
do not replace.
*/
int input_replace(struct dataset **dset1,
                  struct dataset **dset2,
                  char *iffix);
/*
Utility function for feature extraction and selection. It saves
'dset1' and 'dset2', and additionally updates the status file when
'mode' is P_REPLACE.
On error -1 is returned and 'errc' is set. For a file error the file
name is stored in 'xname'.
*/
int save_datasets(struct dataset *dset1,
                  struct dataset *dset2,
                  int mode,
                  int *errc,
                  char **xname);
/*
Save the clusters described by 'ncl' (number of clusters), 'ccard'
(cluster cardinalities) and 'clist' (list of vectors in the clusters)
using the file 'format'.
When 'format' is the Named Vector format (DATASET_FF_VECTOR) while
TDS is in the Raw format (DATASET_FF_RAW), vector IDs are stored in
place of the (non-existent) vector names.
On error 'errc' is set and the file name is stored in 'xname' (file
errors are the only possible errors).
*/
void save_clusters(int ncl,
                   int *ccard,
                   int *clist,
                   int format,
                   int *errc,
                   char **xname);
/*
Read the dataset file format from 'indev'.
*/
int input_format(FILE *indev,
                 FILE *outdev);
/*
Convenience memory allocator for simplex() optimization. It
initializes the criterion vector `fval' and the simplex `smx'.
NOTE(review): the meaning of 'n' and of the return value is not
documented here - confirm against the implementation.
*/
int smplx_alloc(float **fval,
                float ***smx,
                int n);
/*
Accept the input parameters and hand them to the ensemble (bagging or
boosting) learning function that corresponds to them.
On error 'errc' is set; for a file access error 'xname' is set as
well.
*/
void pcp_ensemble(int method,
                  int *errc,
                  char **xname,
                  int dbg);
/*
Combine the matrices `x1' (`n1' rows) and `x2' (`n2' rows). The
result is an array of pointers which point to the rows of x1 followed
by the rows of x2.
If malloc() fails, (float **) 0 is returned and errno is set.
*/
float **combine_x(float **x1,
                  int n1,
                  float **x2,
                  int n2);
#endif
/* syntax highlighted by Code2HTML, v. 0.9.1 */