/* ARIADNE V.1.3 Copyright Richard Mott 2000 Wellcome Trust Centre For Human Genetics Univeristy of Oxford Roosevelt Drive Oxford OX3 7AD UK The software package ARIADNE is distributed in the hope that it will be useful, but in order that the University as a charitable foundation protects its assets for the benefit of its educational and research purposes, the University makes clear that no condition is made or to be implied, nor is any warranty given or to be implied, as to the accuracy of ARIADNE, or that it will be suitable for any particular purpose or for use under any specific conditions, or that the content or use of ARIADNE will not constitute or result in infringement of third-party rights. Furthermore, the University disclaims all responsibility for the use which is made of ARIADNE. */ #include #include #include #include #include #include"seq_util.h" #include"profile.h" #include"gapstat.h" int SWScoreProteinToProfile( SEQUENCE *protein, PROFILE *pro, int pro_start, int pro_extend ) { /* *****************************************************************************/ int **S, *SS; int **H, *HH; int **V, *VV; int *M; int plen = pro->length; int len = protein->len; int len2 = len+2; int i, j; int score = -1; /*PROFILE_ALIGNMENT *align;*/ char *seq = protein->s; /* int fast_diagonal = 0;*/ int j_start, j_stop; /* make sure all penalties have the correct sign (negative) */ pro_start = pro_start > 0 ? -pro_start : pro_start; pro_extend = pro_extend > 0 ? -pro_extend : pro_extend; pro_start += pro_extend; /* allocate the memory */ S = (int**)calloc( plen+2, sizeof(int*))+2; H = (int**)calloc( plen+2, sizeof(int*))+2; V = (int**)calloc( plen+2, sizeof(int*))+2; for(i=-2;i<0;i++) { S[i] = (int*)calloc( len+1, sizeof(int))+1; H[i] = (int*)calloc( len+1, sizeof(int))+1; V[i] = (int*)calloc( len+1, sizeof(int))+1; } UpCase(protein->s); /* printf( "seq %s len %d pro %s len %d\n", protein->s, protein->len, pro->seq, pro->length ); */ /* main DP loop */ for(j=0;jprofile[j]; for(i=0;i h2 ? h1 : h2; if ( h < 0 ) h = 0; HH[i] = h; /* vertical (insertion in profile) */ v1 = VV[i-1] + pro_extend; v2 = SS[i-1] + pro_start; v = v1 > v2 ? v1 : v2; if ( v < 0 ) v = 0; VV[i] = v; if ( v > h ) { s = v; } else { s = h; } /* diagonal (match) */ m = M[(int)seq[i]]; d = S[j-1][i-1] + m; if ( d > s ) { s = d; } /* update the max score */ if ( s > 0 ) { SS[i] = s; if ( s > score ) { score = s; } } else SS[i] = 0; } } /* free the arrays */ for(i=-2;i<0;i++) { free(S[i]-1); free(H[i]-1); free(V[i]-1); } free(S-2); free(H-2); free(V-2); return score; } /* ***************************************************************************** */ PROFILE_ALIGNMENT *AlignProteinToProfile( SEQUENCE *protein, PROFILE *pro, int gap_start, int gap_extend, int *score, int *length, double *percent ) { /* ***************************************************************************** Align a protein sequence to a PSI-BLAST profile, using a modified Smith-Waterman local alignment, with an affine gap penalty . protein is a SEQUENCE* containing the protein seq pro is a PROFILE* containing the psi-blast profile gap_start, gap_extend are gap penalties for inserting a gap in the profile or sequence *score is the score of the optimal local alignment *length is the length of the alignment Return value is a pointer to an array of PROFILE_ALIGNMENT objects, describing the optimal local alignment, which can be printed using print_PROFILE_ALIgnment() ***************************************************************************** */ int **S; int **H; int **V; unsigned int **path; int plen = pro->length+1; int len = protein->len; int len2 = len+1; int i, j; int max=-1, I=0, J=0; PROFILE_ALIGNMENT *align; char *seq = protein->s; /* make sure all penalties have the correct sign (negative) */ gap_start = gap_start > 0 ? -gap_start : gap_start; gap_extend = gap_extend > 0 ? -gap_extend : gap_extend; gap_start += gap_extend; /* allocate the memory */ S = (int**)calloc( len2, sizeof(int*))+1; H = (int**)calloc( len2, sizeof(int*))+1; V = (int**)calloc( len2, sizeof(int*))+1; for(i=-1;ilength;j++) { int h, h1, h2, v, v1, v2, d, /*f1, f2,*/ m, p, s; /* horizontal (insertion in protein) */ h1 = H[i-1][j] + gap_extend; h2 = S[i-1][j] + gap_start; if ( h1 > h2 ) { if ( h1 < 0 ) h = 0; else { h = h1; path[i-1][j] |= HORIZ << 8; } } else { if ( h2 < 0 ) h = 0; else { h = h2; path[i-1][j] |= DIAG << 8; } } H[i][j] = h; /* vertical (insertion in profile) */ v1 = V[i][j-1] + gap_extend; v2 = S[i][j-1] + gap_start; if ( v1 > v2 ) { if ( v1 < 0 ) v = 0; else { v = v1; path[i][j-1] |= VERT << 16; } } else { if ( v2 < 0 ) v = 0; else { v = v2; path[i][j-1] |= DIAG << 16; } } V[i][j] = v; if ( v > h ) { s = v; p = VERT; } else { s = h; p = HORIZ; } /* diagonal (match) */ m = pro->profile[j][(int)seq[i]]; d = S[i-1][j-1] + m; if ( d >= s ) { s = d; p = DIAG; } /* update the path (note defaults are 0 because calloc was used to initialise */ if ( s > 0 ) { S[i][j] = s; path[i][j] |= p; if ( s > max ) { max = s; I = i; J = j; } } /* printf (" %2d", S[i][j]); */ } /* printf("\n"); */ } /* printf("score = %d %d %d\n", max, I, J ); */ /* backtrack */ align = ProteinBacktrack( path, I, J, length ); /* percentage identity */ *percent = PercentIdentity( align, *length, protein, pro ); /* free the arrays */ for(i=-1;i>8; } else if ( p == VERT ) { j--; p = (path[i][j] & Vmask)>>16; } /* fprintf(stderr, "%d %d %d %d %u %u\n", len, i, j, p, path[i][j], path[i][j] & Vmask ); */ } align = (PROFILE_ALIGNMENT*)calloc(len+10,sizeof(PROFILE_ALIGNMENT)); *length = len; i = I; j = J; p = path[i][j] & Dmask; while( p != STOP ) { align[--len].pos1 = j; align[len].pos2 = i; align[len].type = p; if ( p == DIAG ) { /* match */ j--; i--; p = path[i][j] & Dmask; } else if ( p == HORIZ ) { i--; p = (path[i][j] & Hmask)>>8; } else if ( p == VERT ) { j--; p = (path[i][j] & Vmask)>>16; } } return align; } /* ***************************************************************************** */ void PrintProteinProfileAlignment( FILE *fp, int count, PROFILE *pro, SEQUENCE *protein, PROFILE_ALIGNMENT *align, int len, int score, double evalue, double percent, double K, double L, double H, double alpha, int show_align ) { /* ***************************************************************************** Print a pretty-formatted alignment of the protein sequence with the psi-blast profile. Typical output looks like this: >1ayl00.atm 532 vs gnl|UG|Hs#S268568 1218 + score 45 eval 1.777480e+01 333 RVSYPIYHIDNIVKPVSKAGHATKVIFLTADAFGVLPPVSRLTADQTQYHFLSGFTAKLA 393 1ayl00.atm |: | | :| :| :::: : : | :: : |: 504 RAVYYKQRISXFSSP-AKEETPKNILIYAXTSNNXRFXVYEYIPADVSNIFIYNHIILFL 563 gnl|UG|Hs#S268568 394 PTPTFSACFGAAFLSLHPTQYAEVLVKRMQAAGAQAYLVNTGWNGTGKRI SIKDTRAII 453 1ayl00.atm | ||: |||: :| : | | | || | :: 564 ADPRRGLLFGG----LHPHLXKIQLXRQYRSGRVXNSHXGRGXRSAEKYI.KISLGRNLV 619 gnl|UG|Hs#S268568 Identical mataches are indicated by '|', and positive-scoring matches by ':' ***************************************************************************** */ int start1, start2; int stop1, stop2; /* int sum=0; */ /* double lambda = PsiLambda( pro, protein ); */ start1 = align[0].pos1+1; start2 = align[0].pos2+1; stop1 = align[len-1].pos1+1; stop2 = align[len-1].pos2+1; fprintf(fp, "> %d %s len %d from %d to %d vs %s len %d from %d to %d score %d eval %e identity %.2f%% K %e L %e H %e alpha %e\n", count, pro->name, pro->length, start1, stop1, protein->name, protein->len, start2, stop2, score, evalue, percent, K, L, H, alpha ); if ( show_align ) { int n, pos; int buflen = 3*len+10; char *seq = protein->s; int off1, off2; char *buf1 = (char*)calloc( buflen, sizeof(char)); char *buf2 = (char*)calloc( buflen, sizeof(char)); char *buf3 = (char*)calloc( buflen, sizeof(char)); fprintf(fp, "\n"); pos = 0; for(n=0;nseq[align[n].pos1]; buf2[pos] = seq[(int) align[n].pos2]; if ( toupper(seq[align[n].pos2] == 'X' ) ) buf3[pos] = ' '; if ( toupper(pro->seq[align[n].pos1]) == toupper(seq[align[n].pos2] ) ) buf3[pos] = '|'; else if ( pro->profile[align[n].pos1][(int)seq[align[n].pos2]] > 0 ) buf3[pos] = ':'; else buf3[pos] = ' '; /* printf (" %c %c %d %d\n", pro->seq[align[n].pos1],seq[align[n].pos2], pro->profile[align[n].pos1][seq[align[n].pos2]], sum += pro->profile[align[n].pos1][seq[align[n].pos2]] ); */ } else if ( align[n].type == HORIZ ) { buf1[pos] = '-'; buf2[pos] = seq[align[n].pos2]; buf3[pos] = ' '; } else if ( align[n].type == VERT ) { buf1[pos] = pro->seq[align[n].pos1]; buf2[pos] = '-'; buf3[pos] = ' '; } pos++; } off1 = align[0].pos1+1; off2 = align[0].pos2+1; for(n=0;n pos ? pos : n+60 ; int end1 = off1; int end2 = off2; int i; for(i=n,end1=off1-1,end2=off2-1;iname ); fprintf ( fp, "%5s %-.60s\n", "", &buf3[n]); /*, m ); rtm 7.xii.99 */ fprintf ( fp, "%5d %-.60s %5d %s\n\n", off2, &buf2[n], end2, protein->name ); off1 = end1+1; off2 = end2+1; } free( buf1 ); free( buf2 ); free( buf3 ); } } /*>double PercentIdentity( PROFILE_ALIGNMENT *align, int length, SEQUENCE *seq, PROFILE *pro ) { ----------------------------------- Input: PROFILE_ALIGNMENT *align pointer to array of PROFILE_ALIGNMENT objects int length size of align[] SEQUENCE *seq pointer to SEQUENCE object containing the protein sequence PROFILE *pro pointer to PROFILE object containing the profile Returns: double percent the percent identity of the alignment, defined to be 100 * #identities / #aligned residues Preconditions The inputs must all be defined. *align should be be created by a call to AlignIseqToProfile() or AlignProteinToProfile() ***************************************************************************** */ double PercentIdentity( PROFILE_ALIGNMENT *align, int length, SEQUENCE *seq, PROFILE *pro ) { /* ***************************************************************************** */ int matches = 0; int diags = 0; int i; for(i=0;is[align[i].pos2]); diags++; if ( p != 'X' && p == toupper(pro->seq[align[i].pos1]) ) /* recall pos2 indexes the sequence, pos1 the profile */ matches++; } } return 100.0*matches/(diags+0.0001); } void IterativelyIncreaseGapPenalty( SEQUENCE *seq, PROFILE *pro, int *A, int *B, double lambda0, double K0, double H, double s, double *alpha, int *sw_score ) { double a = *alpha; int A1 = *A; int B1 = *B; while( 1 ) { A1++; a = 2*s*exp(-lambda0*(A1+B1))/(1-exp(-lambda0*B1) ); if ( a < 0.25 ) break; B1++; a = 2*s*exp(-lambda0*(A1+B1))/(1-exp(-lambda0*B1) ); if ( a < 0.25 ) break; } /* printf( "Increasing gap penalty for %s vs %s A: %d B: %d\n", seq->name, pro->name, A1, B1 ); */ *sw_score = SWScoreProteinToProfile( seq, pro, A1, B1 ); *alpha = a; *A = A1; *B = B1; } /* ***************************************************************************** EOB ***************************************************************************** */