/************************************************************ * HMMER - Biological sequence analysis with profile-HMMs * Copyright (C) 1992-1998 Washington University School of Medicine * * This source code is distributed under the terms of the * GNU General Public License. See the files COPYING and * GNULICENSE for details. * ************************************************************/ /* structs.h * * Data structures used in HMMER. * Also, a few miscellaneous macros and global variable declarations. * * RCS $Id: structs.h,v 1.1.1.1 2001/06/18 13:59:51 birney Exp $ */ #ifndef STRUCTSH_INCLUDED #define STRUCTSH_INCLUDED #include "squid.h" #include "config.h" /* Miscellaneous math macros used in the package */ #define sreLOG2(x) ((x) > 0 ? log(x) * 1.44269504 : -9999.) #define sreEXP2(x) (exp((x) * 0.69314718 )) #define SQR(x) ((x) * (x)) /* an idiom for determining a symbol's position in the array * by pointer arithmetic. * does no error checking, so caller must already be damned sure x is * valid in the alphabet! */ #define SYMIDX(x) (strchr(Alphabet, (x)) - Alphabet) /* Worry() is a conditional warning macro, used for debugging; * it inserts file name and line number information and * calls VerboseWorry(). See debug.c. * * PANIC is called for failures of Std C/POSIX functions, * instead of my own functions. It calls perror() and exits * abnormally. */ #define Worry(x,s) VerboseWorry(x, __FILE__, __LINE__, s) #define PANIC (Panic(__FILE__, __LINE__)) /* The symbol alphabet. * Must deal with IUPAC degeneracies. Nondegenerate symbols * come first in Alphabet[], followed by degenerate symbols. * Nucleic alphabet also must deal with other common symbols * like U (in RNA) and X (often misused for N). * Example: * Nucleic: "ACGTUNRYMKSWHBVDX" size=4 iupac=17 * Amino: "ACDEFGHIKLMNPQRSTVWYBZX" size=20 iupac=23 * * Parts of the code assume that the last symbol is a * symbol for an unknown residue, i.e. 'X'. * * MAXCODE and MAXABET constants are defined in config.h */ extern char Alphabet[MAXCODE]; /* "ACDEFGHIKLMNPQRSTVWYBZX" for example */ extern int Alphabet_type; /* hmmNUCLEIC or hmmAMINO */ extern int Alphabet_size; /* uniq alphabet size: 4 or 20 */ extern int Alphabet_iupac; /* total size of alphabet + IUPAC degen. */ extern char Degenerate[MAXCODE][MAXABET]; extern int DegenCount[MAXCODE]; #define hmmNOTSETYET 0 #define hmmNUCLEIC 2 /* compatibility with squid's kDNA */ #define hmmAMINO 3 /* compatibility with squid's kAmino */ /********************************************************************** * * Plan7 * Implementation of the new Plan7 HMM architecture. * Fully probabilistic even for hmmsw, hmmls, and hmmfs; * No insert->delete or delete->insert transitions; * Improved structure layout. * * The strategy is to infiltrate plan7 code into HMMER in * an evolutionary rather than revolutionary manner. * **********************************************************************/ /* Plan 7 construction strategies. */ enum p7_construction { P7_MAP_CONSTRUCTION, /* maximum a posteriori architecture */ P7_HAND_CONSTRUCTION, /* hand specified architecture */ P7_FAST_CONSTRUCTION /* fast ad hoc architecture */ }; /* Plan 7 parameter optimization strategies */ enum p7_param { P7_MAP_PARAM, /* standard maximum a posteriori */ P7_MD_PARAM, /* maximum discrimination */ P7_MRE_PARAM, /* maximum relative entropy */ P7_WMAP_PARAM /* ad hoc weighted MAP */ }; /* Structure: plan7_s * * Declaration of a Plan 7 profile-HMM. */ struct plan7_s { /* Annotation on the model. A name is mandatory. * Other fields are optional; whether they are present is * flagged in the stateflags bit array. * * desc is only valid if PLAN7_DESC is set. * rf is only valid if PLAN7_RF is set. * cs is only valid if PLAN7_CS is set. */ char *name; /* name of the model +*/ char *desc; /* brief description of model +*/ char *rf; /* reference line from alignment 0..M +*/ char *cs; /* consensus structure line 0..M +*/ char *comlog; /* command line(s) that built model +*/ int nseq; /* number of training sequences +*/ char *ctime; /* creation date +*/ /* The main model in probability form: data-dependent probabilities. * This is the core Krogh/Haussler model. * Transition probabilities are usually accessed as a * two-D array: hmm->t[k][TMM], for instance. They are allocated * such that they can also be stepped through in 1D by pointer * manipulations, for efficiency in DP algorithms. */ int M; /* length of the model (# nodes) +*/ float **t; /* transition prob's. t[1..M-1][0..6] +*/ float **mat; /* match emissions. mat[1..M][0..19] +*/ float **ins; /* insert emissions. ins[1..M-1][0..19] +*/ float tbd1; /* B->D1 prob (data dependent) +*/ /* The unique states of Plan 7 in probability form. * These are the algorithm-dependent, data-independent probabilities. */ float xt[4][2]; /* N,E,C,J extra states: 2 transitions +*/ float *begin; /* 1..M B->M state transitions +*/ float *end; /* 1..M M->E state transitions (!= a dist!) +*/ /* The null model probabilities. */ float null[MAXABET]; /* "random sequence" emission prob's +*/ float p1; /* null model loop probability +*/ /* The model in log-odds score form. * These are created from the probabilities by LogoddsifyHMM(). * By definition, null[] emission scores are all zero. * Note that emission distributions are over 26 upper-case letters, * not just the unambiguous protein or DNA alphabet: we * precalculate the scores for all IUPAC degenerate symbols we * may see. Non-IUPAC symbols simply have a -INFTY score. * Note the reversed indexing on msc and isc -- for efficiency reasons. * * Only valid if PLAN7_HASBITS is set. */ int **tsc; /* transition scores [1.M-1][0.6] -*/ int **msc; /* match emission scores [0.MAXCODE-1][1.M] -*/ int **isc; /* ins emission scores [0.MAXCODE-1][1.M-1] -*/ int xsc[4][2]; /* N,E,C,J transitions -*/ int *bsc; /* begin transitions [1.M] -*/ int *esc; /* end transitions [1.M] -*/ /* DNA translation scoring parameters * For aligning protein Plan7 models to DNA sequence. * Lookup value for a codon is calculated by pos1 * 16 + pos2 * 4 + pos3, * where 'pos1' is the digitized value of the first nucleotide position; * if any of the positions are ambiguous codes, lookup value 64 is used * (which will generally have a score of zero) * * Only valid if PLAN7_HASDNA is set. */ int **dnam; /* triplet match scores [0.64][1.M] -*/ int **dnai; /* triplet insert scores [0.64][1.M] -*/ int dna2; /* -1 frameshift, doublet emission, M or I -*/ int dna4; /* +1 frameshift, doublet emission, M or I -*/ /* P-value and E-value statistical parameters * Only valid if PLAN7_STATS is set. */ float mu; /* EVD mu +*/ float lambda; /* EVD lambda +*/ float wonka; /* EVD fit display fudge factor +*/ int flags; /* bit flags indicating state of HMM +*/ }; /* Flags for plan7->flags. * Note: Some models have scores but no probabilities (for instance, * after reading from an HMM save file). Other models have * probabilities but no scores (for instance, during training * or building). Since it costs time to convert either way, * I use PLAN7_HASBITS and PLAN7_HASPROB flags to defer conversion * until absolutely necessary. This means I have to be careful * about keeping these flags set properly when I fiddle a model. */ #define PLAN7_HASBITS (1<<0) /* raised if model has log-odds scores */ #define PLAN7_DESC (1<<1) /* raised if description exists */ #define PLAN7_RF (1<<2) /* raised if #RF annotation available */ #define PLAN7_CS (1<<3) /* raised if #CS annotation available */ #define PLAN7_XRAY (1<<4) /* raised if structural data available */ #define PLAN7_HASPROB (1<<5) /* raised if model has probabilities */ #define PLAN7_HASDNA (1<<6) /* raised if protein HMM->DNA seq params set*/ #define PLAN7_STATS (1<<7) /* raised if EVD parameters are available */ /* Indices for special state types, I: used for dynamic programming xmx[][] * mnemonic: eXtra Matrix for B state = XMB */ #define XMB 0 #define XME 1 #define XMC 2 #define XMJ 3 #define XMN 4 /* Indices for special state types, II: used for hmm->xt[] indexing * mnemonic: eXtra Transition for N state = XTN */ #define XTN 0 #define XTE 1 #define XTC 2 #define XTJ 3 /* Indices for Plan7 main model state transitions. * Used for indexing hmm->t[k][] * mnemonic: Transition from Match to Match = TMM */ #define TMM 0 #define TMI 1 #define TMD 2 #define TIM 3 #define TII 4 #define TDM 5 #define TDD 6 /* Indices for extra state transitions * Used for indexing hmm->xt[][]. */ #define MOVE 0 /* trNB, trEC, trCT, trJB */ #define LOOP 1 /* trNN, trEJ, trCC, trJJ */ /* Declaration of Plan7 dynamic programming matrix structure. */ struct dpmatrix_s { int **xmx; /* special scores [0.1..N][BECJN] */ int **mmx; /* match scores [0.1..N][0.1..M] */ int **imx; /* insert scores [0.1..N][0.1..M-1.M] */ int **dmx; /* delete scores [0.1..N][0.1..M-1.M] */ }; /* Structure: HMMFILE * * Purpose: An open HMM file or HMM library. See hmmio.c. */ struct hmmfile_s { FILE *f; /* pointer to file opened for reading */ int (*parser)(struct hmmfile_s *, struct plan7_s **); /* parsing function*/ int is_binary; /* TRUE if format is a binary one */ int byteswap; /* TRUE if binary and byteswapped */ }; typedef struct hmmfile_s HMMFILE; /* Plan 7 model state types * used in traceback structure */ enum p7stype { STM, STD, STI, STS, STN, STB, STE, STC, STT, STJ, STBOGUS }; /* Structure: p7trace_s * * Traceback structure for alignments of model to sequence. * Each array in a trace_s is 0..tlen-1. * Element 0 is always to STATE_S. Element tlen-1 is always to STATE_T. */ struct p7trace_s { int tlen; /* length of traceback */ enum p7stype *statetype; /* state type used for alignment */ int *nodeidx; /* index of aligned node, 1..M (if M,D,I) */ int *pos; /* position in dsq, 1..L or -1 */ }; /* Structure: p7prior_s * * Dirichlet priors on HMM parameters. */ struct p7prior_s { int strategy; /* PRI_DCHLET, etc. */ int tnum; /* number of transition Dirichlet mixtures */ float tq[MAXDCHLET]; /* probabilities of tnum components */ float t[MAXDCHLET][7]; /* transition terms per mix component */ int mnum; /* number of mat emission Dirichlet mixtures */ float mq[MAXDCHLET]; /* probabilities of mnum components */ float m[MAXDCHLET][MAXABET]; /* match emission terms per mix component */ int inum; /* number of insert emission Dirichlet mixes */ float iq[MAXDCHLET]; /* probabilities of inum components */ float i[MAXDCHLET][MAXABET]; /* insert emission terms */ }; #define PRI_DCHLET 0 /* simple or mixture Dirichlets */ #define PRI_PAM 1 /* PAM prior hack */ /********************************************************************** * Other structures, not having to do with HMMs. **********************************************************************/ /* Structure: histogram_s * * Keep a score histogram. * * The main implementation issue here is that the range of * scores is unknown, and will go negative. histogram is * a 0..max-min array that represents the range min..max. * A given score is indexed in histogram array as score-min. * The AddToHistogram() function deals with dynamically * resizing the histogram array when necessary. */ struct histogram_s { int *histogram; /* counts of hits */ int min; /* elem 0 of histogram == min */ int max; /* last elem of histogram == max */ int highscore; /* highest active elem has this score */ int lowscore; /* lowest active elem has this score */ int lumpsize; /* when resizing, overalloc by this */ int total; /* total # of hits counted */ float *expect; /* expected counts of hits */ int fit_type; /* flag indicating distribution type */ float param[3]; /* parameters used for fits */ float chisq; /* chi-squared val for goodness of fit*/ float chip; /* P value for chisquared */ }; #define HISTFIT_NONE 0 /* no fit done yet */ #define HISTFIT_EVD 1 /* fit type = extreme value dist */ #define HISTFIT_GAUSSIAN 2 /* fit type = Gaussian */ #define EVD_MU 0 /* EVD fit parameter mu */ #define EVD_LAMBDA 1 /* EVD fit parameter lambda */ #define EVD_WONKA 2 /* EVD fit fudge factor */ #define GAUSS_MEAN 0 /* Gaussian parameter mean */ #define GAUSS_SD 1 /* Gaussian parameter std. dev. */ /* Structure: fancyali_s * * Alignment of a hit to an HMM, for printing. */ struct fancyali_s { char *rfline; /* reference coord info */ char *csline; /* consensus structure info */ char *model; /* aligned query consensus sequence */ char *mline; /* "identities", conservation +'s, etc. */ char *aseq; /* aligned target sequence */ int len; /* length of strings */ char *query; /* name of query HMM */ char *target; /* name of target sequence */ int sqfrom; /* start position on sequence (1..L) */ int sqto; /* end position on sequence (1..L) */ }; /* Structure: hit_s * * Info about a high-scoring database hit. * We keep this info in memory, so we can output a * sorted list of high hits at the end. * * sqfrom and sqto are the coordinates that will be shown * in the results, not coords in arrays... therefore, reverse * complements have sqfrom > sqto */ struct hit_s { double sortkey; /* number to sort by; big is better */ float score; /* score of the hit */ double pvalue; /* P-value of the hit */ float mothersc; /* score of whole sequence */ double motherp; /* P-value of whole sequence */ char *name; /* name of the database seq */ char *desc; /* description of database seq */ int sqfrom; /* start position in seq (1..N) */ int sqto; /* end position in seq (1..N) */ int sqlen; /* length of sequence (N) */ int hmmfrom; /* start position in HMM (1..M) */ int hmmto; /* end position in HMM (1..M) */ int hmmlen; /* length of HMM (M) */ int domidx; /* index of this domain */ int ndom; /* total # of domains in this seq */ struct fancyali_s *ali; /* ptr to optional alignment info */ }; /* Structure: tophit_s * * Array of high scoring hits, suitable for efficient sorting * when we prepare to output results. "hit" list is NULL and * unavailable until after we do a sort. */ struct tophit_s { struct hit_s **hit; /* array of ptrs to top scoring hits */ struct hit_s *unsrt; /* unsorted array */ int alloc; /* current allocation size */ int num; /* number of hits in list now */ int lump; /* allocation lumpsize */ }; /********************************************************** * Plan 9: obsolete HMMER1.x code. We still need these structures * for reading old HMM files (e.g. reverse compatibility) **********************************************************/ /* We define a "basic" state, which covers the basic match, insert, and * delete states from the Haussler paper. Numbers are stored as * pre-calculated negative logs. */ struct basic_state { float t[3]; /* state transitions to +1 M, +0 I, +1 D */ float p[MAXABET]; /* symbol emission probabilities */ }; /* A complete hidden Markov model */ struct plan9_s { int M; /* length of the model */ struct basic_state *ins; /* insert states 0..M+1 */ struct basic_state *mat; /* match 0..M+1; 0 = BEGIN, M+1 = END */ struct basic_state *del; /* delete 0..M+1 */ float null[MAXABET]; /* the *suggested* null model */ /* Optional annotation on the HMM, taken from alignment */ char *name; /* a name for the HMM */ char *ref; /* reference coords and annotation */ char *cs; /* consensus structure annotation */ float *xray; /* Structural annotation: xray[0..M+1][NINPUTS], indexed manually */ int flags; /* flags for what optional info is in HMM */ }; /* Flags for optional info in an HMM structure */ #define HMM_REF (1<<0) #define HMM_CS (1<<1) #define HMM_XRAY (1<<2) #define MATCH 0 #define INSERT 1 #define DELETE 2 #define BEGIN MATCH #define END MATCH #endif /* STRUCTSH_INCLUDED */