#ifndef _DEF_PROFILE_H_ #define _DEF_PROFILE_H_ #include #include #include"seq_util.h" #include"extreme_ml.h" /* PROFILE_ALIGNMENT is used to describe the alignment of a sequence to a profile. Alignments are arrays of PROFILE_ALIGNMENT objects */ typedef struct { int pos1, pos2, type; /* pos1, pos2 = coords in sequence and profile. type is of BACKTRACK, giving path direction */ } PROFILE_ALIGNMENT; typedef struct { int pos, score; } WORD_HIT; /* PREPROCESS is a struct containing linked lists of all word types of length 3 scoring more than threshold in a profile or sequence */ typedef struct { int *mapping; WORD_HIT **hitlist; int *hits; int total; int threshold; int diag_threshold; int max_dist; int window; struct profile *psi; } PREPROCESS; #define A1_SIZE 21 #define A2_SIZE 441 #define A3_SIZE 9261 typedef struct { short *score; short *best; int *last; int min, max; int best_diag; } DIAGONAL_SCORE; /* PROFILE contains a psi-blast type profile */ typedef struct profile { char *name; /* The name of the profile (NOT the filename !) */ int length, width; /* length = length of the underlying sequence. width = # of amino-acid types */ char *seq; /* the underlying protein sequence (containing "length" residues) */ char *residue; /* char[i] array which says which amino-acid is associated with column i in the profile */ int **profile; /* profile[j][i] gives the score for matching amino-acid i with row j in the profile */ int max_score; /* the max-scoring entry */ double K, L, H, s; /* default statistical parameters for camparisons with proteins of standard composition */ PREPROCESS *pre; } PROFILE; /* BACKTRACK defines the types of move possible when backtracking through the path matrix */ typedef enum { STOP, HORIZ, VERT, DIAG, FRAME1, FRAME2 } BACKTRACK; PROFILE *NextProfile( FILE *fp ); PROFILE *ReadProfile( FILE *fp ); void FreeProfile( PROFILE *pro ); PROFILE *ShuffleProfile( PROFILE *pro, int *seed ); /* DNA - profile alignment functions */ PROFILE_ALIGNMENT *DnaBacktrack( char **path, int I, int J, int *length ); void PrintProfileAlignment( FILE *fp, int count, PROFILE *pro, SEQUENCE *interlaced, PROFILE_ALIGNMENT *align, int len, int score, char strand, double evalue, double percent, double K, double L, double H, double alpha, int show_align ); PROFILE_ALIGNMENT *AlignIseqToProfile( SEQUENCE *interlaced, PROFILE *pro, int pro_start, int pro_extend, int frameshift, int banded, int dstart, int dstop, int *score, int *length, double *percent ); int FastScoreIseqToProfile( SEQUENCE *interlaced, PROFILE *pro, char *proname, int pro_start, int pro_extend, int frameshift, int strand, PREPROCESS *pre, double ungappedFilter, int dbsize, double ethresh, double default_K, double default_L, int align, FILE *outfp ); void PrintProfile( FILE *fp, PROFILE *pro ); double PsiLambda( PROFILE *pro, SEQUENCE *seq ); double PercentIdentity( PROFILE_ALIGNMENT *align, int length, SEQUENCE *seq, PROFILE *pro ); /* protein - profile alignment functions */ PROFILE_ALIGNMENT *ProteinBacktrack( unsigned int **path, int I, int J, int *length ); PROFILE_ALIGNMENT *AlignProteinToProfile( SEQUENCE *protein, PROFILE *pro, int start, int extend, int *score, int *length, double *percent ); int ScoreProteinToProfile( SEQUENCE *protein, PROFILE *pro, char *proname, int pro_start, int pro_extend, int banded, int band_start, int band_stop, int align, int dbsize, double ethresh, double default_K, double default_L, FILE *outfp ); int FastScoreProteinToProfile( SEQUENCE *protein, PROFILE *pro, char *proname, int pro_start, int pro_extend, PREPROCESS *pre, int align, int dbsize, double ethresh, double default_K, double default_L, FILE *outfp ); void PrintProteinProfileAlignment( FILE *fp, int count, PROFILE *pro, SEQUENCE *protein, PROFILE_ALIGNMENT *align, int len, int score, double evalue, double percent, double K, double L, double H, double alpha, int show_align ); int *ResidueMapping(void); void FreePreProcess( PREPROCESS *pre ); DIAGONAL_SCORE *DiagonalScore( SEQUENCE *seq, PREPROCESS *pre ); DIAGONAL_SCORE *InterlacedDiagonalScore( SEQUENCE *seq, PREPROCESS *pre); void FreeDiagonalScore( DIAGONAL_SCORE *ds ); int FindFastPvalues( SCORE *hit, int hits, double *K, double *L, int *positives, double *length ); PROFILE *ReadBinaryProfile( FILE *fp ); int WriteBinaryProfile( PROFILE *pro, FILE *fp ); int WritePreProcessedProfile( PREPROCESS *pre, FILE *fp ); PREPROCESS *ReadPreProcesedProfile( FILE *fp ); int DeleteWords( PREPROCESS *pre, int position_frequency ); int EvaluateProteinMatchSignificance( SEQUENCE *protein, PROFILE *pro, int A, int B, int score, int dbsize, double ethresh, double default_K, double default_L, int show_align, FILE *out_fp ); int EvaluateDNAMatchSignificance( SEQUENCE *interlaced, PROFILE *pro, int strand, int A, int B, int F, int banded, int dstart, int dstop, int score, int dbsize, double ethresh, double default_K, double default_L, int show_align, FILE *out_fp ); int KarlinAltschulProfileStatistics( PROFILE *pro, double *freq, double *lambda, double *Kminus, double *Kplus, double *H, double *r, double *s ); double *GetPsiH( PROFILE *pro, double *freq, int *hmin, int *hmax, double *mean ); int SetDefaultParameters( PROFILE *pro ); PROFILE *SeqToProfile( SEQUENCE *seq, int **matrix ); PROFILE *CheapSeqToProfile( SEQUENCE *seq, int **matrix ); void FreeCheapProfile( PROFILE *cheap ); PREPROCESS *PreProcessProfile( PROFILE *pro, int threshold, int diag_threshold, int max_dist, int window, int position_frequency ); void IterativelyIncreaseGapPenalty( SEQUENCE *seq, PROFILE *pro, int *A, int *B, double lambda0, double K0, double H, double s, double *alpha, int *sw_score ); #endif