/* ARIADNE V.1.3 Copyright Richard Mott 2000 Wellcome Trust Centre For Human Genetics Univeristy of Oxford Roosevelt Drive Oxford OX3 7AD UK The software package ARIADNE is distributed in the hope that it will be useful, but in order that the University as a charitable foundation protects its assets for the benefit of its educational and research purposes, the University makes clear that no condition is made or to be implied, nor is any warranty given or to be implied, as to the accuracy of ARIADNE, or that it will be suitable for any particular purpose or for use under any specific conditions, or that the content or use of ARIADNE will not constitute or result in infringement of third-party rights. Furthermore, the University disclaims all responsibility for the use which is made of ARIADNE. */ #include #include #include #include #include #include"cl.h" #include"seq_util.h" static DATABASE *databases[100]; static int database_count; #include #include FILE *OpenFileInEnvPath(); FILE *OpenFileInSeqPath(); void AddDatabase ( DATABASE *db ); int hash_name( char *string ); extern int read_line( FILE *file, char *string ); extern int skip_comments( FILE *file, char *string ); extern int fcmp( const void *A, const void *B); FILE * OpenFileInSeqPathArg( char *format, char *ext, char *mode, int argc, char **argv, char *fullname ) { char buf[256]; if ( GetArg( format, buf, argc, argv ) ) return OpenFileInSeqPath( buf, ext, mode, fullname ); else return NULL; } FILE * OpenFileInSeqPath( char *BaseName, char *ext, char *mode, char *fullname ) { char buf[256]; strcpy(buf,BaseName); Extension(buf,ext); return OpenFileInEnvPath( buf, mode, SEQPATH, fullname ); } /* char *seq_data_home(name) char *name; { char buf[256], *s; strcpy(buf,name); s = buf; while(*s) { *s = toupper(*s); s++; } if ( ( s = (char*)getenv(buf) ) || (s = (char*)getenv("SEQ_DATA_HOME") ) ) return s; else return (char*)strdup("./"); } */ DATABASE * OpenDatabase( char *name ) { DATABASE *db; if ( (db = WhichDatabase( name )) ) return db; if ( (db = OpenEmblDatabase( name )) ) return db; else if ( (db = OpenNbrfDatabase( name )) ) return db ; else if ( (db = OpenFastaDatabase( name )) ) return db; else { fprintf(stderr, "ERROR could not open database %s\n", name ); exit(1); } } DATABASE * OpenEmblDatabase( char *name ) { char buf[256]; DATABASE *db; db = (DATABASE*)calloc(1,sizeof(DATABASE)); db->database = (char*)strdup(name); db->datafile = OpenFileInSeqPath(name, "dat", "r",buf); db->indexfile = OpenFileInSeqPath(name, "index","r",buf); db->type = EMBL; if ( ! db->datafile || ! db->indexfile ) { free( db ); return NULL; } MakeEmblIndex( db ); fprintf(stderr,"\n! embl format database %s opened: %d sequences\n", name, db->sequences ); AddDatabase(db); return db; } char * AccNos( SEQUENCE *seq ) { if ( seq->database->type == EMBL ) return EmblSeqComment( seq, "AC" ); else return NbrfSeqComment( seq, "ACCESSION" ); } char * SeqComment( SEQUENCE *seq, char *key ) { if ( seq->database->type == EMBL ) return EmblSeqComment( seq, key ); else return NbrfSeqComment( seq, key ); } char * EmblSeqComment( SEQUENCE *seq, char *key ) { char line[256]; unsigned long text_offset; unsigned long offset = GetOffset( seq->name, seq->database, &text_offset ); unsigned long end_pos; static char buf[MAX_BUF]; *buf = 0; fseek(seq->database->datafile,offset,0); end_pos = FindNext( seq->database->datafile, "//", line ); fseek(seq->database->datafile,offset,0); while (1) { FindNext( seq->database->datafile, key, line ); if ( end_pos < ftell(seq->database->datafile ) ) break; else { if ( strlen(buf) + strlen(line) + 2 < MAX_BUF ) strcat( buf, CleanLine(line)+strlen(key) ); else break; } } return buf; } char * NbrfSeqComment( SEQUENCE *seq, char *key) { char line[256]; unsigned long text_offset=0; /*unsigned long offset = GetOffset(seq->name, seq->database, &text_offset );*/ unsigned long end_pos; static char buf[MAX_BUF]; *buf = 0; if ( seq->database->textfile ) { fseek(seq->database->textfile,text_offset+4,0); end_pos = FindNext( seq->database->textfile, ">>>>", line ); fseek(seq->database->textfile,text_offset,0); while (1) { FindNext( seq->database->textfile, key, line ); if ( end_pos < ftell(seq->database->textfile ) ) break; else { if ( strlen(buf) + strlen(line) + 2 < MAX_BUF ) strcat( buf, CleanLine(line)+strlen(key) ); else break; } } } return buf; } DATABASE * OpenFastaDatabase( char *name ) { char buf[256]; DATABASE *db; db = (DATABASE*)calloc(1,sizeof(DATABASE)); db->database = (char*)strdup(name); db->datafile = OpenFileInSeqPath(name, "fasta", "r",buf); db->indexfile = OpenFileInSeqPath(name, "index","r",buf); db->type = FASTA; if ( ! db->datafile || ! db->indexfile ) { free( db ); return NULL; } MakeFastaIndex( db ); fprintf(stderr,"\n! fasta format database %s opened: %d sequences\n", name, db->sequences ); AddDatabase(db); return db; } DATABASE * OpenNbrfDatabase( char *name ) { char buf[256]; DATABASE *db; db = (DATABASE*)calloc(1,sizeof(DATABASE)); db->database = (char*)strdup(name); db->datafile = OpenFileInSeqPath(name, "seq", "r",buf); db->textfile = OpenFileInSeqPath(name, "ref", "r",buf); db->indexfile = OpenFileInSeqPath(name, "index", "r",buf); db->type = NBRF; if ( ! db->datafile || ! db->textfile || ! db->indexfile ) { free( db ); return NULL; } MakeNbrfIndex( db ); fprintf(stderr,"\n! nbrf format database %s opened: %d sequences\n", name, db->sequences ); AddDatabase(db); return db; } DATABASE * WhichDatabase( database_name ) char *database_name; { int n; for(n=0;ndatabase ) ) return databases[n]; return NULL; } void AddDatabase ( DATABASE *db ) { if ( database_count < 100 ) databases[database_count++] = db; else { fprintf(stderr, "\nERROR: too many databases! \n"); exit (1); } } void MakeEmblIndex( DATABASE *db ) { int n; unsigned long offset; HASH_LIST *item; char name[256]; db->index = (HASH_LIST**)calloc(PRIME_NUM,sizeof(HASH_LIST*)); while( fscanf( db->indexfile, "%s %lu\n", name, &offset ) == 2 ) { n = hash_name(name); item = (HASH_LIST*)calloc(1,sizeof(HASH_LIST)); item->name = (char*)strdup(name); item->offset = offset; db->sequences++; if ( ! db->index[n] ) db->index[n] = item; else { item->next = db->index[n]; db->index[n] = item; } } rewind(db->indexfile); } void MakeFastaIndex( DATABASE *db ) { int n; unsigned long offset; HASH_LIST *item; char name[256]; db->index = (HASH_LIST**)calloc(PRIME_NUM,sizeof(HASH_LIST*)); while( fscanf( db->indexfile, "%s %lu\n", name, &offset ) == 2 ) { n = hash_name(name); item = (HASH_LIST*)calloc(1,sizeof(HASH_LIST)); item->name = (char*)strdup(name); item->offset = offset; db->sequences++; if ( ! db->index[n] ) db->index[n] = item; else { item->next = db->index[n]; db->index[n] = item; } } rewind(db->indexfile); } void MakeNbrfIndex( DATABASE *db ) { int n; unsigned long offset1, offset2; HASH_LIST *item; char name[256]; db->index = (HASH_LIST**)calloc(PRIME_NUM,sizeof(HASH_LIST*)); while( fscanf( db->indexfile, "%s %lu %lu\n", name, &offset1, &offset2 ) == 3 ) { n = hash_name(name); item = (HASH_LIST*)calloc(1,sizeof(HASH_LIST)); item->name = (char*)strdup(name); item->offset = offset1; item->text_offset = offset2; db->sequences++; if ( ! db->index[n] ) db->index[n] = item; else { item->next = db->index[n]; db->index[n] = item; } } rewind(db->indexfile); } SEQUENCE *Read_Seq( char *name ) { char *db, *sname; DATABASE *d; sname = (char*)strchr( name, ':' )+1; *(sname-1) = 0; db = name; d = OpenDatabase( db ); return ReadSequence( sname, d ); } SEQUENCE *ReadSequence( char *name, DATABASE *database ) { unsigned long text_offset; unsigned long offset = GetOffset( name, database, &text_offset ); if ( database->type == EMBL ) return Read_embl_Sq( name, database, offset ); else if ( database->type == NBRF ) return Read_nbrf_Sq( name, database, offset, text_offset ); else if ( database->type == FASTA ) return Read_fasta_Sq( name, database, offset); else return NULL; } unsigned long GetOffset(char *name, DATABASE *database, unsigned long *text_offset ) { int n; HASH_LIST *h; n = hash_name(name); h = database->index[n]; *text_offset = 0; while( h ) { if ( ! strcmp( name, h->name ) ) { *text_offset = h->text_offset; return h->offset; } else h = h->next; } return 0; } SEQUENCE *Read_embl_Sq( char *name, DATABASE *database, unsigned long offset ) { char sname[256], line[512]; int c; SEQUENCE *seq = (SEQUENCE*)calloc(1, sizeof(SEQUENCE)); int len, n; fseek(database->datafile, offset, 0 ); FindNext( database->datafile, "ID", line ); *sname = 0; sscanf( line, "ID %s", sname ); if ( strcmp( sname, name ) ) { fprintf(stderr,"\n! ERROR on Read_embl_Sq: pointer offset for %s wrong! (gets %s instead)\n", name, sname ); exit(1); } seq->name = (char*)strdup(name); seq->database = database; FindNext( database->datafile, "DE", line ); seq->desc = (char*)strdup(&line[5]); FindNext( database->datafile, "SQ", line ); sscanf( line, "SQ Sequence %d", &len ); seq->len = len; seq->s = (char*)calloc(len+1,sizeof(char)); n = 0; while ( (c = getc(database->datafile)) != '/' && c != EOF ) { if ( isalpha(c) && ! isspace(c) && n < len ) seq->s[n++] = tolower(c); } return seq; } SEQUENCE ** GetFastaSeqs( FILE *fp, int *nseqs ) { int n = 100; SEQUENCE ** array = (SEQUENCE**)calloc(n,sizeof(SEQUENCE*)); SEQUENCE *seq; *nseqs=0; while( (seq = GetFastaSeq( fp )) ) { if ( *nseqs >= n ) { n *=2; array = (SEQUENCE**)realloc( array, n*sizeof(SEQUENCE*)); } array[(*nseqs)++] = seq; fprintf(stderr, "%s %d\n", seq->name, seq->len); } return array; } int WriteFastaSq( SEQUENCE *seq, FILE *fp ) { int n; fprintf(fp, ">%s %s\n", seq->name, seq->desc ? seq->desc : "" ); for(n=0;nlen;n+=60) { fprintf(fp, "%-60.60s\n", &seq->s[n] ); } return seq->len; } SEQUENCE *SkipFastaSeq( FILE *fp, int skip ) { int c = 0; SEQUENCE *seq; /*int start = 1;*/ long here = ftell(fp); int previous = '\n'; /* printf ("here = %u\n", here ); */ if ( skip > 0 ) { do { c = fgetc(fp); if ( (previous == '\n') && (c == '>') ) { skip--; /* printf( "skip %d at %u\n", skip, ftell(fp) );*/ } previous = c; } while( (c != EOF) && (skip > 0) ); } here = ftell(fp); seq = GetFastaSeq( fp ); /* printf( "skipped to %s at %u (%u)\n", seq->name, ftell(fp), here ); */ return seq; } SEQUENCE * GetFastaSeq( FILE *fp ) /* read a fasta sequence in from a file. Can be used for multiple reads */ { int c=0; SEQUENCE *seq=NULL; char name[256], desc[256]; /*char *s;*/ int i, len; int buflen; char *buffer; char previous = '\n'; if ( fp ) { while( (c=fgetc(fp)) != '>' && previous != '\n' && c != EOF ) previous = c; if ( c == EOF ) return NULL; else { buflen = FASTA_BUFLEN; seq = (SEQUENCE*)calloc(1, sizeof(SEQUENCE)); buffer = (char*)calloc(buflen,sizeof(char)); i = 0; while( ! isspace(c=fgetc(fp)) ) if ( i < 254 ) name[i++] = c; name[i] = 0; seq->name = strdup(name); while( c != '\n' && c != EOF && isspace(c=getc(fp)) ); i = 0; while( c != '\n' && c != EOF ) { if ( i < 254 ) desc[i++] = c; c = getc(fp); } desc[i] = 0; seq->desc = strdup(desc); len=0; while( (c=getc(fp)) != '>' && c != EOF ) { if ( isalpha(c)||(c=='-')||(c=='*') ) { if ( len >= buflen ) { buflen *= 2; buffer = (char*)realloc(buffer, buflen*sizeof(char)); if ( buffer == NULL ) { fprintf( stderr, "REALLOC FAILED getting %d bytes in GetFastaSeq\n", buflen); exit(1); } } buffer[len++] = c; } } buffer[len] = 0; seq->len = len; seq->s = buffer; ungetc(c,fp); } return seq; } return NULL; } SEQUENCE * Read_fasta_Sq( name, database, offset ) char *name; DATABASE *database; unsigned long offset; { char sname[256], line[512]; SEQUENCE *seq = (SEQUENCE*)calloc(1, sizeof(SEQUENCE)); int len, n, c; unsigned long current; fseek(database->datafile, offset, 0 ); read_line(database->datafile,line); *sname = 0; sscanf( line, ">%s", sname ); if ( strcmp( sname, name ) ) { fprintf(stderr,"\n! ERROR on Read_fasta_Sq: pointer offset for %s wrong! (gets %s instead)\n", name, sname ); exit(1); } seq->name = (char*)strdup(name); seq->database = database; seq->desc = (char*)strdup(""); current = ftell(database->datafile); len=0; while((c=getc(database->datafile)) != '>' && c != EOF ) len += ( ! isspace(c) && c != '\n' ); seq->len = len; seq->s = (char*)calloc(len+1,sizeof(char)); n = 0; fseek(database->datafile, current, 0 ); while( (c=getc(database->datafile)) != '>' && c != EOF ) if ( ! isspace(c) && c != '\n' && n < len ) seq->s[n++] = tolower(c); seq->len = n; return seq; } SEQUENCE * Read_nbrf_Sq( name, database, offset, text_offset ) char *name; DATABASE *database; unsigned long offset; { char sname[256], line[512]; SEQUENCE *seq = (SEQUENCE*)calloc(1, sizeof(SEQUENCE)); unsigned long current; int len, c; fseek(database->datafile, offset, 0 ); *sname = 0; read_line(database->datafile,line); sscanf( line, ">>>>%s", sname ); if ( strcmp( sname, name ) ) { fprintf(stderr,"\n! ERROR on Read_nbrf_Sq: pointer offset for %s wrong! (gets %s instead)\n", name, sname ); exit(1); } seq->name = (char*)strdup(name); seq->database = database; read_line(database->datafile,line); seq->desc = (char*)strdup(line); current = ftell(database->datafile); len = 0; while ( (c = getc(database->datafile)) != '>' && c != EOF ) { if ( isalpha(c) && ! isspace(c) ) len++; } seq->len = len; seq->s = (char*)calloc(len+1,sizeof(char)); fseek( database->datafile, current, 0 ); len = 0; while ( (c = getc(database->datafile)) != '>' && c != EOF ) { if ( isalpha(c) && ! isspace(c) ) seq->s[len++] = tolower(c); } return seq; } unsigned long FindNext( fp, text, line ) FILE *fp; char *line, *text; { while( read_line(fp,line) != EOF ) { CleanLine(line); if ( !strncmp( line, text, strlen(text) ) ) return ftell(fp)-strlen(line); } return 0; } int hash_name( string ) /* computes a integer hash value between 0 and PRIME_NUM-1 for the character string (case-insensitive)*/ char *string; { int n=0; while( *string ) { n = (64*n + tolower(*string) ) % PRIME_NUM; string++; } return n; } void compile_embl_index( name ) char *name; { char buf[256]; FILE *datafile, *indexfile; char sname[256]; int c; unsigned long n, offset; int state=-1; datafile = OpenFileInSeqPath( name, "dat", "r", buf); indexfile = OpenFileInSeqPath( name, "index", "w", buf); fprintf( stderr, "compiling embl format index for database %s ...\n", name ); state = -1; offset = 0; n = 0; while ( (c = fgetc( datafile ) ) != EOF ) { if ( c == '\n' ) { state = 1; } else if ( (state == 1 || state == -1 ) && c == 'I' ) { state = 2; } else if ( state == 2 && c == 'D' ) { offset = ftell(datafile)-2; /* while( isspace(c=fgetc(datafile)) );*/ fscanf( datafile, "%s", sname ); n++; fprintf(indexfile, "%-10s %lu\n", sname, (unsigned long) offset ); printf( "%-10s %lu\n", sname, (unsigned long) offset ); state = 0; } else state = 0; } fclose(indexfile); fclose(datafile); fprintf(stderr, "... done. %lu sequences indexed\n", n ); } void compile_fasta_index( char *name ) { char buf[256]; FILE *datafile, *indexfile; char sname[256]; int c; unsigned long n, offset, state=-1; datafile = OpenFileInSeqPath( name, "fasta", "r", buf); indexfile = OpenFileInSeqPath( name, "index", "w", buf); fprintf( stderr, "compiling fasta format index for database %s ...\n", name ); state = -1; offset = 0; n = 0; while ( (c = fgetc( datafile ) ) != EOF ) { if ( c == '\n' ) { state = 1; } else if ( (state == 1 || state == -1 ) && c == '>' ) { state = 2; } else state = 0; if ( state == 2 ) { offset = ftell(datafile)-1; /* while( isspace(c=fgetc(datafile)) );*/ fscanf( datafile, "%s", sname ); n++; fprintf(indexfile, "%-10s %lu\n", sname, offset ); printf( "%-10s %lu\n", sname, offset ); state = 0; } } fclose(indexfile); fclose(datafile); fprintf(stderr, "... done. %lu sequences indexed\n", n ); } void compile_nbrf_index( char *name ) { char buf[256]; FILE *datafile, *textfile, *indexfile; char name1[256], name2[256]; unsigned long c, d; unsigned long n; datafile = OpenFileInSeqPath( name, "seq", "r",buf); textfile = OpenFileInSeqPath( name, "ref", "r",buf); indexfile = OpenFileInSeqPath( name, "index", "w",buf); fprintf( stderr, "compiling nbrf format index for database %s ...\n", name ); n = 0; while (1) { c = SeekTo( datafile, ">>>>"); d = SeekTo( textfile, ">>>>"); if ( c == EOF || d == EOF ) break; fscanf( datafile, ">>>>%s", name1 ); fscanf( textfile, ">>>>%s", name2 ); if ( strcmp( name1, name2 ) ) { fprintf( stderr, "\n! ERROR when indexing: different names %s %s\n", name1, name2 ); exit(1); } fprintf( indexfile, "%-10s %10lu %10lu\n", name1, c, d ); n++; } fclose(indexfile); fclose(datafile); fclose(textfile); fprintf(stderr, "... done. %lu sequences indexed\n", n ); } unsigned long SeekTo( FILE *fp, char *text ) { unsigned long p; char *s; int c; while(1) { c = getc(fp); if ( c == EOF ) return EOF; else if ( c == *text ) { p = ftell( fp ); s = text; while( *s ) { if ( c != *s ) break; else if ( c == EOF ) return EOF; else { s++; c = getc(fp); } } if (! *s ) { p--; fseek(fp,p,0); return p; } else fseek(fp,p+1,0); } } return EOF; } void FreeSeq( SEQUENCE *seq ) { if ( seq ) { if ( seq->name /*&& *(seq->name)*/) free( seq->name ); if ( seq->s /*&& *(seq->s)*/ ) free( seq->s ); if ( seq->desc /*&& *(seq->desc)*/ ) free( seq->desc ); free( seq ); } } FILE * WhichFileOfSequences( char *filename ) { static char *filenames[100]; static FILE *fp[100]; static int files; int n; if ( *filename != '@' ) return NULL; else filename++; for(n=0;nindexfile, line ) != EOF ) { sscanf( line, "%s", name ); if ( !strncmp( name, wild, len ) ) return ReadSequence( name, db ); } return NULL; } DATABASE * IsSequenceSpec( char *spec, char *wild ) { static char *old_spec=NULL; static char *old_wild; static DATABASE *old_db; char buf[256], *sname, *db, *s; if ( ! old_spec || strcmp( spec, old_spec ) ) { old_spec = (char*)strdup(spec); strcpy(buf,spec); sname = (char*)strchr( buf, ':' ); if ( ! sname ) { sname = (char*)strdup(""); } else { sname++; *(sname-1) = 0; } if ( (s = (char*)strchr( sname, '*')) ) *s = 0; db = buf; old_wild = (char*)strdup(sname); old_db = OpenDatabase(db); } strcpy(wild,old_wild); return old_db; } void rewind_spec( char *spec ) { char buf[256]; DATABASE *db = IsSequenceSpec( spec, buf ); FILE *fp; if ( db ) { rewind(db->indexfile); rewind(db->datafile); } else if ( (fp = WhichFileOfSequences( spec )) ) rewind(fp); } SEQUENCE *NextSeq( char *spec_or_file ) { char wild[256]; DATABASE *db = IsSequenceSpec(spec_or_file,wild); FILE *fp; if ( db ) return NextSeq_from_database_spec(spec_or_file); else { if ( *spec_or_file == '@' && (fp = WhichFileOfSequences( spec_or_file ) ) ) return nextSeqFrom_list_file(fp); } return (SEQUENCE *) 0; } /*main(argc, argv) int argc; char **argv; { DATABASE *db; SEQUENCE *seq; int n=2; char spec[256]; while ( seq = NextSeq_from_database_spec( argv[1] ) ) { printf("seq: %s len: %d\n%s\n%s\n", seq->name, seq->len, seq->desc, seq->s ); FreeSeq(seq); } }*/ static char *complement_table=NULL; char complement_base( char c ) { if ( !complement_table) { int x; complement_table = (char*)calloc(256,sizeof(char))+127; for(x=-127;x<128;x++) complement_table[x] = 'n'; complement_table['a'] = 't'; complement_table['A'] = 'T'; complement_table['c'] = 'g'; complement_table['C'] = 'G'; complement_table['g'] = 'c'; complement_table['G'] = 'C'; complement_table['t'] = 'a'; complement_table['T'] = 'A'; complement_table['u'] = 'a'; complement_table['U'] = 'A'; complement_table['['] = ']'; complement_table[']'] = '['; complement_table['*'] = '*'; complement_table['-'] = '-'; } return complement_table[(int)c]; } void free_ct () { if (!complement_table) return; free( complement_table-127 ); complement_table = (char *) NULL; } SEQUENCE * ReverseSeq( SEQUENCE *dna ) { char *s, *t, u; s = dna->s; t = &dna->s[dna->len-1]; complement_base('a'); /* to initialise */ while ( t >= s ) { u = complement_table[(int) *s]; *s = complement_table[(int) *t]; *t = u; s++; t--; } return dna; } SEQUENCE * reverse_protein_seq( SEQUENCE *protein ) { char *s, *t, u; s = protein->s; t = &protein->s[protein->len-1]; while ( t >= s ) { u = *s; *s = *t; *t = u; s++; t--; } return protein; } char *ComplementSeq( seq ) char *seq; { char *s = seq; char c, *t; complement_base('a'); /* to initialise */ while (*s) { *s = complement_table[(int) *s]; s++; } t = seq; s--; while ( t < s ) { c = *s; *s = *t; *t = c; t++; s--; } return seq; } SEQUENCE * SubSeq( SEQUENCE *seq, int start, int stop ) /* Creates a SEQUENCE containing the SubSequence start to stop inclusive If start > stop then returns the reverse complement If start and stop are out of range they are truncated */ { SEQUENCE *subs = SeqDup(seq); char *s; int i; int reverse; if ( start > stop ) { reverse = 1; i = start; start = stop; stop = i; } else reverse = 0; if ( start < 0 ) start = 0; if ( start > seq->len-1 ) return NULL; if ( stop > seq->len-1 ) stop = seq->len-1; subs->len = stop-start+1; s = (char*)calloc(subs->len+1,sizeof(char)); for(i=0;ilen;i++) s[i] = seq->s[start+i]; if ( reverse ) ComplementSeq(s); free(subs->s); subs->s = s; return subs; } char * DownDase(char *s) { char *t = s; while (*s) { *s = tolower(*s); s++; } return t; } char * UpCase(char *s) { char *t = s; while (*s) { *s = toupper(*s); s++; } return t; } char * CleanLine(char *s) { char *t = s; while(*s) { if ( ! isprint((int) *s) ) *s = ' '; s++; } return t; } /* convert a sequence into a regular expression. Allocates memory for the new string */ char * IubToRegexp( char *iubstring ) { int len=strlen(iubstring); char *s = iubstring; char *t, *r; while(*s) if ( (t = IubRegexp(*s++)) ) len += strlen(t); r = (char*)calloc(len+3,sizeof(char)); return Iub2Regexp( iubstring, r, len+1 ); } /* convert a sequence with iub codes into a regular expression returns either regexp or NULL if the length of the regexp is greater that maxlen, which should be set to the max permitted size */ char * Iub2Regexp( char *iubstring, char *regexp, int maxlen ) { char /* *r = regexp,*/ *s, c; int len=0; maxlen--; while(*iubstring && len < maxlen) { if ( (s = IubRegexp(c=*iubstring++)) ) { strcat( regexp, s ); len += strlen(s); } else { regexp[len++] = c; } } if ( len <= maxlen ) { regexp[len]= 0; return regexp; } else return NULL; } /* return a pointer to the regexp corresponding to an iub code, or NULL otherwise - note that only ambiguity codes return a non-null result*/ char * IubRegexp( char c ) { static char *iub[256]; static int initialised; if ( ! initialised ) { iub['r']= "[ag]"; iub['y']= "[ctu]"; iub['m']= "[ac]"; iub['k']= "[gtu]"; iub['s']= "[cg]"; iub['w']= "[atu]"; iub['h']= "[actu]"; iub['b']= "[cgtu]"; iub['v']= "[acg]"; iub['d']= "[agtu]"; iub['n']= "[acgtu]"; initialised = 1; } return iub[tolower(c)]; } #define MAXSEQLEN 100000 SEQUENCE * GetSeqFrom_file( char *filename ) /* read a sequence from file */ { FILE *fp; SEQUENCE *seq; char name[256]; char desc[256]; char buf[MAXSEQLEN+1]; char *s, *t; int len; if ( (fp=OpenFile( filename, "r" )) ) { seq = (SEQUENCE*)calloc(1, sizeof(SEQUENCE)); read_line( fp, desc ); sscanf( desc, "%s", name ); seq->name = (char*)strdup(name); read_line( fp, desc ); seq->desc = (char*)strdup(desc); len=fread(buf, sizeof(char), MAXSEQLEN, fp ); buf[len] = 0; s = buf; t = buf; len = 0; while ( *s ) { if ( isalpha((int) *s) ) { *t++ = tolower(*s); len++; } s++; } *t = 0; seq->len = len; seq->s = (char*)strdup(buf); fclose(fp); /* fprintf(stderr,"read sequence %s len %d from file %s\n", name, len, filename ); */ return seq; } return NULL; } SEQUENCE * SeqDup( SEQUENCE *seq ) { SEQUENCE *dup; if ( seq ) { dup = (SEQUENCE*)calloc(1, sizeof(SEQUENCE)); *dup = *seq; if ( seq->name ) dup->name = (char*)strdup(seq->name); else dup->name = NULL; if ( seq->desc ) dup->desc = (char*)strdup(seq->desc); else dup->desc = NULL; if ( seq->s ) dup->s = (char*)strdup(seq->s); else dup->s = NULL; return dup; } return NULL; } SEQUENCE * IntoSequence( char *name, char *desc, char *s ) { SEQUENCE *seq = (SEQUENCE*)calloc(1,sizeof(SEQUENCE)); seq->name = name; seq->desc = desc; seq->s = s; seq->len = strlen(s); return seq; } int KvCmp( const void *A, const void *B ) { KEYVALUE *a = (KEYVALUE*)A; KEYVALUE *b = (KEYVALUE*)B; return fcmp( &(a->key), &(b->key) ); } double *ReadFreqTable( FILE *fp ) { char line[256]; int c; double f, t=0.0; double *freq = (double*)calloc(256, sizeof(double)); while(skip_comments(fp, line) != EOF ) { if ( sscanf(line, "%c %lf\n", (char *)&c, &f ) == 2 && isascii(c) && f >= 0.0 ) freq[tolower(c)] = f; } t = 1.0e-10; for(c=0;c<256;c++) t += freq[c]; for(c=0;c<256;c++) freq[c] /= t; for(c='a';c<='z';c++) freq[toupper(c)] = freq[c]; freq['.'] = 1.0; fclose(fp); return freq; } double * ResidueFrequencies( SEQUENCE *seq ) { int n, c; double *freq = (double*)calloc(256, sizeof(double)); double t=0.0; if ( seq->len > 0 ) { for(n=0;nlen;n++) { freq[toupper(seq->s[n])]++; } for(c='A';c<='Z';c++) t += freq[c]; if ( t > 0 ) for(c='A';c<='Z';c++) freq[c] /= t; } return freq; } double * PseudoResidueFrequencies( SEQUENCE *seq, int pseudo_len ) { int n, c; double *freq = (double*)calloc(256, sizeof(double)); double t=0.0; if ( seq->len > 0 ) { freq['A'] = 35155; freq['R'] = 23105; freq['N'] = 20212; freq['D'] = 24161; freq['C'] = 8669; freq['Q'] = 19208; freq['E'] = 28354; freq['G'] = 33229; freq['H'] = 9906; freq['I'] = 23161; freq['L'] = 40625; freq['K'] = 25872; freq['M'] = 10101; freq['F'] = 17367; freq['P'] = 23435; freq['S'] = 32070; freq['T'] = 26311; freq['W'] = 5990; freq['Y'] = 14488; freq['V'] = 29012; for(c='A';c<='Z';c++) t += freq[c]; for(c='A';c<='Z';c++) freq[c] = pseudo_len*freq[c]/t; for(n=0;nlen;n++) { freq[toupper(seq->s[n])]++; } t = 0; for(c='A';c<='Z';c++) t += freq[c]; if ( t > 0 ) for(c='A';c<='Z';c++) freq[c] /= t; } return freq; } double * DayhoffResidueFrequencies (void) { /* these are are amino-acid frequencies used by blast, from Dayhoff */ double *freq = (double*)calloc(256, sizeof(double)); int c; double t=0.0; freq[ 'A']= 87.13 ; freq[ 'C']= 33.47 ; freq[ 'D']= 46.87 ; freq[ 'E']= 49.53 ; freq[ 'F']= 39.77 ; freq[ 'G']= 88.61 ; freq[ 'H']= 33.62 ; freq[ 'I']= 36.89 ; freq[ 'K']= 80.48 ; freq[ 'L']= 85.36 ; freq[ 'M']= 14.75 ; freq[ 'N']= 40.43 ; freq[ 'P']= 50.68 ; freq[ 'Q']= 38.26 ; freq[ 'R']= 40.90 ; freq[ 'S']= 69.58 ; freq[ 'T']= 58.54 ; freq[ 'V']= 64.72 ; freq[ 'W']= 10.49 ; freq[ 'Y']= 29.92; freq['*'] = 0.00+0; freq['X'] = 0.00+0; freq['x'] = 0.00+0; freq['.'] = 0.0; t = 0.0; for(c='A';c<='Z';c++) t += freq[c]; for(c='A';c<='Z';c++){ freq[toupper(c)] = freq[c] /= t; } return freq; } double * AltschulResidueFrequencies (void) { /* these are are amino-acid frequencies used by blast, from Robinson & Robinson */ double *freq = (double*)calloc(256, sizeof(double)); int c; double t=0.0; freq['A'] = 81.00 ; freq['C'] = 15.00 ; freq['D'] = 54.00 ; freq['E'] = 61.00 ; freq['F'] = 40.00 ; freq['G'] = 68.00 ; freq['H'] = 22.00 ; freq['I'] = 57.00 ; freq['K'] = 56.00 ; freq['L'] = 93.00 ; freq['M'] = 25.00 ; freq['N'] = 45.00 ; freq['P'] = 49.00 ; freq['Q'] = 39.00 ; freq['R'] = 57.00 ; freq['S'] = 68.00 ; freq['T'] = 58.00 ; freq['V'] = 67.00 ; freq['W'] = 13.00 ; freq['Y'] = 32.00; freq['*'] = 0.00+0; freq['X'] = 0.00+0; freq['x'] = 0.00+0; freq['.'] = 0.0; t = 0.0; for(c='A';c<='Z';c++) t += freq[c]; for(c='A';c<='Z';c++){ freq[toupper(c)] = freq[c] /= t; } return freq; } double * RobinsonResidueFrequencies (void) { /* these are are amino-acid frequencies used by blast, from Robinson & Robinson */ double *freq = (double*)calloc(256, sizeof(double)); int c; double t=0.0; freq['A'] = 35155; freq['R'] = 23105; freq['N'] = 20212; freq['D'] = 24161; freq['C'] = 8669; freq['Q'] = 19208; freq['E'] = 28354; freq['G'] = 33229; freq['H'] = 9906; freq['I'] = 23161; freq['L'] = 40625; freq['K'] = 25872; freq['M'] = 10101; freq['F'] = 17367; freq['P'] = 23435; freq['S'] = 32070; freq['T'] = 26311; freq['W'] = 5990; freq['Y'] = 14488; freq['V'] = 29012; /* old freqs freq[ 'A' ] = 78.00 ; freq[ 'C' ] = 19.00 ; freq[ 'D' ] = 54.00 ; freq[ 'E' ] = 63.00 ; freq[ 'F' ] = 39.00 ; freq[ 'G' ] = 74.00 ; freq[ 'H' ] = 22.00 ; freq[ 'I' ] = 52.00 ; freq[ 'K' ] = 57.00 ; freq[ 'L' ] = 90.00 ; freq[ 'M' ] = 22.00 ; freq[ 'N' ] = 45.00 ; freq[ 'P' ] = 52.00 ; freq[ 'Q' ] = 43.00 ; freq[ 'R' ] = 51.00 ; freq[ 'S' ] = 71.00 ; freq[ 'T' ] = 59.00 ; freq[ 'V' ] = 64.00 ; freq[ 'W' ] = 13.00 ; freq[ 'Y' ] = 32.00; */ freq['*'] = 0.00+0; freq['X'] = 0.00+0; freq['x'] = 0.00+0; freq['.'] = 0.0; t = 0.0; for(c='A';c<='Z';c++) t += freq[c]; for(c='A';c<='Z';c++){ freq[toupper(c)] = freq[c] /= t; } return freq; } double * GemResidueFrequencies (void) { double *freq = (double*)calloc(256, sizeof(double)); int c; double t=0.0; freq['A'] = 7.04+0; freq['C'] = 2.21+0; freq['D'] = 4.88+0; freq['E'] = 6.95+0; freq['F'] = 3.74+0; freq['G'] = 6.75+0; freq['H'] = 2.53+0; freq['I'] = 4.74+0; freq['*'] = 0.00+0; freq['X'] = 0.00+0; freq['x'] = 0.00+0; freq['K'] = 5.65+0; freq['L'] = 9.49+0; freq['M'] = 2.25+0; freq['N'] = 3.70+0; freq['P'] = 5.91+0; freq['Q'] = 4.44+0; freq['R'] = 5.42+0; freq['S'] = 8.17+0; freq['T'] = 5.63+0; freq['V'] = 6.39+0; freq['W'] = 1.31+0; freq['Y'] = 2.81+0; t = 0.0; for(c='A';c<='Z';c++) t += freq[c]; for(c='A';c<='Z';c++){ freq[toupper(c)] = freq[c] /= t; } return freq; } double *ModelResidueFrequencies( char *model ) { if ( !strcmp(model,"gem") ) return GemResidueFrequencies(); else if ( !strcmp(model,"robinson") ) return RobinsonResidueFrequencies(); else if ( !strcmp(model,"dayhoff")) return DayhoffResidueFrequencies(); else if ( !strcmp(model,"altschul")) return AltschulResidueFrequencies(); else { fprintf(stderr, "ERROR: protein model %s unknown\n", model ); return NULL; } } SEQUENCE_INDEX * createSequenceIndex( SEQUENCE *seq ) { SEQUENCE_INDEX *seqin = (SEQUENCE_INDEX*)calloc(1,sizeof(SEQUENCE_INDEX)); int c, n, next[256]; seqin->offset = (int*)calloc(seq->len+1,sizeof(int)); for(c=0;c<256;c++) seqin->start[c] = next[c] = -1; for(n=0;nlen;n++) { c = toupper(seq->s[n]); if ( seqin->start[c] == -1 ) { seqin->start[c] = next[c] = n; seqin->offset[n] = -1; } else { seqin->offset[next[c]] = n; next[c] = n; seqin->offset[n] = -1; } } return seqin; } void FreeSequenceIndex( SEQUENCE_INDEX *seqin ) { free(seqin->offset); free(seqin); } int **CopyMatrix( int **old_matrix ) { int **matrix = (int**)calloc(256,sizeof(int*)); int c, d; for(c=0;c<256;c++) matrix[c] = (int*)calloc(256,sizeof(int)); for(c=0;c<256;c++) for(d=0;d<256;d++) matrix[c][d] = old_matrix[c][d]; return matrix; } int ** ReadBlastMatrix( char *name ) { char fullname[256], *tok; FILE *fp; char line[256], *s; int **matrix=NULL; int row[256], rows; int c, d, n, num; fp = OpenFileInSearchPath( name, "r", getenv("BLASTMAT"), fullname ); if ( ! fp ) { char temp_name[256]; sprintf( temp_name, "aa/%s", name ); fp = OpenFileInSearchPath( temp_name, "r", getenv("BLASTMAT"), fullname ); } if ( fp ) { if ( skip_comments( fp, line ) != EOF ) { /* printf("%s\n", line ); */ matrix = (int**)calloc(256,sizeof(int*)); for(c=0;c<256;c++) matrix[c] = (int*)calloc(256,sizeof(int)); s = line; rows = 0; while(*s) { if ( isalpha((int)*s) || *s == '*' ) row[rows++] = *s; s++; } num = 0; while( skip_comments( fp, line ) != EOF ) { s = line; c = *s; if ( isalpha(c) || c == '*' ) s++; else c = row[num]; tok = strtok(s," "); for(d=0;d 0.0 && freq0[i] > 0.0 ? log( freq1[i]/freq0[i] ) : 0.0; logratio2[i] = freq2[i] > 0.0 && freq0[i] > 0.0 ? log( freq2[i]/freq0[i] ) : 0.0; printf("logratio %c %e %e %e %e %e\n", i, freq0[i], freq1[i], freq2[i], logratio1[i], logratio2[i] ); } lambda /= scale; for(i='A';i<='Z';i++) for(j='A';j<='Z';j++) if ( freq0[i] > 0.0 && freq0[j] > 0.0 ) new[i][j] = (int)(scale*matrix[i][j] - (logratio1[i]+logratio2[j])/lambda); else new[i][j] = 0; for(i='A';i<='Z';i++) { printf("%c ", i ); for(j='A';j<='Z';j++) printf(" %2d", new[i][j] ); printf("\n"); } return new; } SEQUENCE *NewSeq( int len, char *name ) { SEQUENCE *seq = (SEQUENCE*)calloc(1,sizeof(SEQUENCE)); seq->len = len; seq->s = (char*)calloc(len+1,sizeof(char)); seq->name = name ? (char*)strdup(name) : NULL; return seq; }