/* ARIADNE V.1.3 Copyright Richard Mott 2000 Wellcome Trust Centre For Human Genetics Univeristy of Oxford Roosevelt Drive Oxford OX3 7AD UK The software package ARIADNE is distributed in the hope that it will be useful, but in order that the University as a charitable foundation protects its assets for the benefit of its educational and research purposes, the University makes clear that no condition is made or to be implied, nor is any warranty given or to be implied, as to the accuracy of ARIADNE, or that it will be suitable for any particular purpose or for use under any specific conditions, or that the content or use of ARIADNE will not constitute or result in infringement of third-party rights. Furthermore, the University disclaims all responsibility for the use which is made of ARIADNE. */ #include #include #include #include #include #include #include"seq_util.h" #include"profile.h" #include"gapstat.h" /* ***************************************************************************** */ PROFILE *ReadProfile( FILE *fp ) { /* ***************************************************************************** Read a psi-blast profile into a PROFILE struct. A reasonable ammount of error trapping is done, but it is not guarranteed to find everything **************************************************************************** */ PROFILE *psi=NULL; char buf[256]; char name[256]; int len; /*, width;*/ /*char *tok;*/ int i, j; int l, u; /*, p;*/ if ( fp ) { if (!(fgets( buf, 256, fp ))) return NULL; /* fprintf(stderr,"ReadProfile: %s",buf); */ if ( sscanf(buf, ">%s", name ) == 1 ){ psi = (PROFILE*)calloc(1,sizeof(PROFILE)); psi->name = strdup(name); fgets( buf, 256, fp ); if ( (sscanf(buf,"%d %d", &psi->length, &psi->width ) == 2) && psi->length > 0 && psi->width > 0 ) { psi->seq = (char*)calloc(psi->length+1,sizeof(char)); psi->residue = (char*)calloc(psi->width+1,sizeof(char)); psi->profile = (int**)calloc(psi->length,sizeof(int*)); for(i=0;ilength;i++) psi->profile[i] = (int*)calloc(256,sizeof(int)); fgets( buf, 256, fp ); len = strlen(buf); for(i=0,j=0;iwidth;i++) { if ( isalpha((int) buf[i]) ) psi->residue[j++] = buf[i]; } psi->max_score = -1; for(i=0;ilength;i++) { char *ptr = buf; long value; fgets( buf, 256, fp ); while( isspace((int) *ptr) ) ptr++; value = strtol(ptr,&ptr,10); if ( value != i+1 ) { fprintf(stderr, "ERROR parsing profile %s, position %d %d : %s\n", name, i, (int)value, buf ); exit(1); } while( isspace((int) *ptr) ) ptr++; psi->seq[i] = *(ptr++); for(j=0;jwidth;j++) { value = strtol( ptr, &ptr, 10); l = tolower(psi->residue[j]); u = toupper(psi->residue[j]); psi->profile[i][l] = psi->profile[i][u] = (int)value; if ( value > psi->max_score ) value = psi->max_score; } } return psi; } } } return NULL; } /* ***************************************************************************** */ void FreeProfile( PROFILE *psi ) { /* ***************************************************************************** Frees all the memory associated with the psi-blast profile ***************************************************************************** */ int i; for(i=0;ilength;i++) free( psi->profile[i] ); free( psi->profile ); /* rtm 7.xii.99 */ free( psi->name ); free( psi->seq ); free( psi->residue ); if ( psi->pre ) FreePreProcess( psi->pre ); free( psi ); } /* ***************************************************************************** */ double PsiLambda( PROFILE *psi, SEQUENCE *protein ) { /* ***************************************************************************** */ /* compute lambda for a given profile, assuming amino acid freqs as defined in protein */ double *histogram; int min=0; int max=0; int n, m, s; double lambda=0, total=0.0; double *comp = ResidueFrequencies( protein ); for(n=0;nlength;n++) { for(m='A';m<='Z';m++) { s = psi->profile[n][m]; if ( s < min ) min = s; if ( s > max ) max = s; } } histogram = (double*)calloc(max-min+1,sizeof(double))-min; for(n=0;nlength;n++) { for(m='A';m<='Z';m++) { s = psi->profile[n][m]; histogram[s] += comp[m]; } } total = 1.0e-10; for(s=min;s<=max;s++) total += histogram[s]; for(s=min;s<=max;s++) histogram[s] /= total; total = 0.0; /* for(s=min;s<=max;s++) printf( "%5d %.5f %.5f\n", s, histogram[s], total+=histogram[s]); */ lambda = solve_for_lambda( histogram, min, max ); free(histogram+min); free(comp); return lambda; } int WritePreProcessedProfile( PREPROCESS *pre, FILE *fp ) { if ( pre && fp ) { char temp[] = "PREPROCESS "; int i, j, k; short *buffer; /*, inde;*/ int buflen = 0; for(i=0;ihits[i] ) { buflen += 2 + 2*pre->hits[i]; } } buffer = (short*)calloc( buflen, sizeof(short)); fwrite( temp, sizeof(char), strlen(temp), fp ); fwrite( &pre->total, sizeof(int), 1, fp ); fwrite( &pre->threshold, sizeof(int), 1, fp ); fwrite( &pre->diag_threshold, sizeof(int), 1, fp ); fwrite( &pre->max_dist, sizeof(int), 1, fp ); fwrite( &pre->window, sizeof(int), 1, fp ); fwrite( &buflen, sizeof(int), 1, fp ); j = 0; for(i=0;ihits[i] ) { buffer[j++] = (short)i; buffer[j++] = (short)pre->hits[i]; for(k=0;khits[i];k++) { buffer[j++] = (short)pre->hitlist[i][k].pos; buffer[j++] = (short)pre->hitlist[i][k].score; } } } fwrite( buffer, sizeof(short), j, fp ); free(buffer); fprintf(stderr, "written binary %s\n", pre->psi->name ); } return 0; } PREPROCESS *ReadPreProcessedProfile( FILE *fp ) { char buf[256]; int i, j, k; int len = strlen("PREPROCESS "); short *buffer; int buflen; int header[6]; fread( &buf, sizeof(char), len, fp ); buf[len] = 0; if ( strcmp( buf, "PREPROCESS " ) == 0 ) { PREPROCESS *pre = calloc(1,sizeof(PREPROCESS)); fread( header, sizeof(int), 6, fp ); pre->total = header[0]; pre->threshold = header[1]; pre->diag_threshold = header[2]; pre->max_dist = header[3]; pre->window = header[4]; buflen = header[5]; pre->mapping = ResidueMapping(); pre->hits = (int*)calloc( A3_SIZE, sizeof(int)); pre->hitlist = (WORD_HIT**)calloc( A3_SIZE, sizeof(WORD_HIT*)); buffer = (short*)calloc( buflen, sizeof(short)); fread( buffer, sizeof(short), buflen, fp ); j = 0; while( j < buflen ) { i = (int)buffer[j++]; pre->hits[i] = (int)buffer[j++]; pre->hitlist[i] = (WORD_HIT*)malloc( pre->hits[i]*sizeof(WORD_HIT)); for(k=0;khits[i];k++) { pre->hitlist[i][k].pos = (int)buffer[j++]; pre->hitlist[i][k].score = (int)buffer[j++]; } } free(buffer); return pre; } else { fprintf( stderr, "READ ERROR in ReadPreProcess()\n" ); exit(1); } } int WriteBinaryProfile( PROFILE *psi, FILE *fp ) { if ( psi && fp ) { char buf[256]; int len = psi->length*('Z'-'A'+1); short *buffer = (short*)calloc( len, sizeof(short) ); int i, j, k; sprintf( buf, "PSI_PROFILE " ); fwrite( buf, sizeof(char), strlen(buf), fp ); fwrite( psi->name, sizeof(char), strlen(psi->name)+1, fp ); fwrite( &psi->length, sizeof(int), 1, fp ); fwrite( &psi->width, sizeof(int), 1, fp ); fwrite( &psi->K, sizeof(double), 1, fp ); fwrite( &psi->L, sizeof(double), 1, fp ); fwrite( &psi->H, sizeof(double), 1, fp ); fwrite( &psi->s, sizeof(double), 1, fp ); fwrite( psi->seq, sizeof(char), psi->length, fp ); fwrite( psi->residue, sizeof(char), psi->width, fp ); k = 0; for(i=0;ilength;i++) { for(j='A';j<='Z';j++) { buffer[k++] = (short)psi->profile[i][j]; } } fwrite( buffer, sizeof(short), len, fp ); free(buffer); if ( psi->pre ) WritePreProcessedProfile( psi->pre, fp ); return 1; } return 0; } PROFILE *NextProfile( FILE *fp ) { while ( fp && ! feof(fp) ) { int c = fgetc( fp ); if ( c == '>' ) { ungetc(c,fp); return ReadProfile( fp ); } else if ( c == 'P' ) { ungetc(c,fp); return ReadBinaryProfile( fp ); } } return NULL; } PROFILE *ReadBinaryProfile( FILE *fp ) { if ( fp ) { char buf[256]; int i, j, k, c; int len = strlen("PSI_PROFILE "); long int here = ftell( fp ); int AZ = ('Z'-'A'+1); fread( buf, sizeof(char), len, fp ); buf[len] = 0; if ( strcmp( buf, "PSI_PROFILE " ) == 0 ) { PROFILE *psi = (PROFILE*)calloc(1, sizeof(PROFILE)); int buflen; short *buffer; psi->name = (char*)calloc(256,sizeof(char)); i = 0; while( (c = fgetc( fp )) > 0 ) { psi->name[i++] = c; } fread( &psi->length, sizeof(int), 1, fp ); fread( &psi->width, sizeof(int), 1, fp ); fread(&psi->K, sizeof(double), 1, fp ); fread(&psi->L, sizeof(double), 1, fp ); fread(&psi->H, sizeof(double), 1, fp ); fread(&psi->s, sizeof(double), 1, fp ); psi->seq = (char*)calloc(psi->length+1,sizeof(char)); fread( psi->seq, sizeof(char), psi->length, fp ); psi->residue = (char*)calloc(psi->width+1,sizeof(char)); fread( psi->residue, sizeof(char), psi->width, fp ); psi->profile = (int**)calloc(psi->length,sizeof(int*)); buflen = psi->length*AZ; buffer = (short*)calloc( buflen, sizeof(short) ); fread( buffer, sizeof(short), buflen, fp ); for(i=0;ilength;i++) { psi->profile[i] = (int*)calloc(256,sizeof(int)); for(j=0,k='A';k<='Z';k++,j++) { psi->profile[i][k] = buffer[i*AZ+j]; } } free(buffer); if ( (psi->pre = ReadPreProcessedProfile( fp )) ) psi->pre->psi = psi; return psi; } /* else fseek( fp, here, SEEK_SET ); */ } return NULL; } PREPROCESS *GetPreProcessedProfile( char *name, char *binary_dir ) { char path[256]; FILE *fp; sprintf( path, "%s/%s.binary", binary_dir, name ); if ( (fp = fopen( path, "r" )) ) { return ReadPreProcessedProfile( fp ); } else { return NULL; } } int PutPreProcessedProfile( PREPROCESS *pre, char *name, char *binary_dir ) { char path[256]; FILE *fp; sprintf( path, "%s/%s.binary", binary_dir, name ); if ( (fp = fopen( path, "r" )) ) { fclose(fp); return -1; } if ( (fp = fopen( path, "w" )) ) { return WritePreProcessedProfile( pre, fp ); fclose(fp); } return 0; } PROFILE *SeqToProfile( SEQUENCE *seq, int **matrix ) { /* make a pseudo-profile from a sequence and a matrix. New memory is allocated, so it is safe to free the profile separately */ PROFILE *psi = (PROFILE*)calloc(1,sizeof(PROFILE)); int n, m; psi->length = seq->len; psi->seq = strdup(seq->s); psi->name = strdup(seq->name); psi->profile = (int**)calloc(psi->length,sizeof(int*)); for(n=0;nlength;n++) { psi->profile[n] = (int*)calloc(256,sizeof(int)); for(m='A';m<='Z';m++) psi->profile[n][m] = matrix[seq->s[n]][m]; } psi->residue = strdup(""); psi->width = 0; return psi; } PROFILE *CheapSeqToProfile( SEQUENCE *seq, int **matrix ) { /* make a pseudo-profile from a sequence and a matrix. New memory is NOT allocated, so it is NOT safe to free the profile separately */ PROFILE *psi = (PROFILE*)calloc(1,sizeof(PROFILE)); int n, m; psi->length = seq->len; psi->seq = seq->s; psi->name = seq->name; psi->profile = (int**)calloc(psi->length,sizeof(int*)); for(n=0;nlength;n++) { psi->profile[n] = matrix[seq->s[n]]; } psi->residue = NULL; psi->width = 0; return psi; } void FreeCheapProfile( PROFILE *cheap ) { free(cheap->profile); free(cheap); } /*************************************************************************/ void FreePreProcess( PREPROCESS *pre ) { /*************************************************************************/ if ( pre ) { int i; for(i=0;ihitlist[i] ) free(pre->hitlist[i]); } free( pre->hitlist ); free( pre->hits ); free( pre ); } } /*************************************************************************/ int *ResidueMapping(void) { /*************************************************************************/ static int *mapping = NULL; if ( NULL == mapping ) { int n, m; char residues[22] = "XACDEFGHIKLMNPQRSTVWY"; int X = (int)(strchr(residues,'X')-residues); mapping = (int*)calloc(256,sizeof(int)); for(m=0;m<256;m++) { mapping[m] = X; /* everything is equivalent to X, to start with */ } m = strlen(residues); for(n=0;n