/* Last edited: Apr 11 09:23 1997 (birney) */ %{ #include "wisebase.h" #include "codon.h" #ifdef LINUX #include "posix.h" #endif #define SEQUENCEBLOCK 8000 enum SequenceType { SEQUENCE_UNKNOWN = 64, SEQUENCE_PROTEIN, SEQUENCE_DNA, SEQUENCE_CDNA, SEQUENCE_GENOMIC, SEQUENCE_EST, SEQUENCE_RNA }; #define is_dna_SequenceType(type) (type == SEQUENCE_DNA || type == SEQUENCE_CDNA || type == SEQUENCE_GENOMIC || type == SEQUENCE_EST ? TRUE : FALSE) #define is_rna_SequenceType(type) (type == SEQUENCE_RNA ? TRUE : FALSE) #define is_protein_SequenceType(type) (type == SEQUENCE_PROTEIN ? TRUE : FALSE ) #define is_dna_Sequence(seq) (is_dna_SequenceType(seq->type)) #define is_rna_Sequence(seq) (is_rna_SequenceType(seq->type)) #define is_protein_Sequence(seq) (is_protein_SequenceType(seq->type)) #define SequenceSetLISTLENGTH 64 %} struct Sequence char * name !def="NULL" // name of the sequence char * seq !def="NULL" // actual sequence int len; !def="0" // length of the sequence int maxlen; !def="0" // internal counter, indicating how much space in seq there is int offset; !def="1"; // start (in bio-coords) of the sequence. Not called start due to weird legacy. int end; !def="(-1)"; // end (in bio-coords == C coords) of the sequence int type; !def="SEQUENCE_UNKNOWN"; // guess of protein/dna type %info This object is the basic sequence object, trying to hold little more than the name and sequence of the DNA/protein. The len/maxlen is the actual length of the sequence (strlen(obj->seq)) and amount of memory allocated in obj->seq mainly for parsing purposes. You are strongly encouraged to used the typed counterparts of Sequence, namely, Protein, cDNA and Genomic. By doing this you are much, much less likely to mess up algorithms which expect specific sequence types. %% struct SequenceSet Sequence ** set !list %info A list of sequences. Not a database (you should use the db stuff for that!). But useful anyway %% api object Sequence func uppercase_Sequence func force_to_dna_Sequence func is_reversed_Sequence func translate_Sequence func reverse_complement_Sequence func magic_trunc_Sequence func trunc_Sequence func read_fasta_file_Sequence func read_Sequence_EMBL_seq func read_fasta_Sequence func show_Sequence_residue_list func write_fasta_Sequence func make_len_type_Sequence des free_Sequence endobject object SequenceSet des free_SequenceSet endobject func Sequence_type_to_string func new_Sequence_from_strings endapi %{ #include "wisebase.h" #include "sequence.h" %func Makes a new sequence from strings given. Separate memory will be allocated for them and them copied into it. They can be NULL, in which case o a dummy name SequenceName will be assigned o No sequence placed and length of zero. Though this is dangerous later on. The sequence type is calculated automatically using /best_guess_type. If you want a DNA sequence but are unsure of the content of, for example, IUPAC codes, please use /force_to_dna_Sequence before using the sequence. Most of the rest of dynamite relies on a five letter A,T,G,C,N alphabet, but this function will allow any sequence type to be stored, so please check if you want to save yourself alot of grief. In perl and other interfaces, this is a much safer constructor than the raw "new" type %arg name r name of sequence, memory is allocated for it. seq r char * of sequence, memory is allocated for it. %% Sequence * new_Sequence_from_strings(char * name,char * seq) { Sequence * out; out = Sequence_alloc(); if( name == NULL) name = "SequenceName"; out->name = stringalloc(name); if( seq == NULL ) { out->seq = NULL; out->len = 0; return out; } out->seq = stringalloc(seq); out->len = strlen(out->seq); out->offset = 1; out->end = out->len; out->type = best_guess_type(out); return out; } %func Returns true if name looks like [A-Za-z]+[0-9]+ This should be an accession number %arg name r name to be tested %% boolean looks_like_accession(char * name) { char * run; for(run=name;*run && isalpha((int)*run);run++) ; if( *run == '\0') return FALSE; for(;*run && isalnum((int)*run) && !isalpha((int)*run);run++) ; if( *run == '\0') return TRUE; return FALSE; } %func makes seq->len and seq->end match the seq->seq length number. It also checks the type of the sequence with /best_guess_type %simple validate %arg seq rw Sequence object %% void make_len_type_Sequence(Sequence * seq) { seq->len = strlen(seq->seq); seq->end = seq->len + seq->offset -1; seq->type = best_guess_type(seq); } %func Guesses DNA or protein, by adding all the A,T,G,C up and if len < 300 && > 95% or len > 300 && > 75% then considers it to be DNA. NB - Ns now counted. %arg seq r Sequence to be guessed return o SEQUENCE_DNA or SEQUENCE_PROTEIN %% int best_guess_type(Sequence * seq) { register int i; int ch[26]; int no; int unplaced = 0; int total; for(i=0;i<26;i++) ch[i] = 0; for(i=0;ilen;i++) { /*fprintf(stderr,"character is %c and total is %d\n",seq->seq[i],unplaced);*/ if( (no=(int)( toupper(seq->seq[i])-'A')) > 26 || no < 0 ) unplaced++; else ch[no]++; } total = (ch['A' - 'A']+ch['T' - 'A'] + ch['G' - 'A'] + ch['C' - 'A'] + ch['N' - 'A']); if( seq->len < 300 ) { if( (double)total/(double)seq->len > 0.95 ) return SEQUENCE_DNA; else return SEQUENCE_PROTEIN; } else { if( (double)total/(double)seq->len > 0.75 ) return SEQUENCE_DNA; else return SEQUENCE_PROTEIN; } } %func Converts sequence type (SEQUENCE_*) to a string %arg type type eg SEQUENCE_PROTEIN %% char * Sequence_type_to_string(int type) { switch (type ) { case SEQUENCE_UNKNOWN : return "Unknown type"; case SEQUENCE_PROTEIN : return "Protein"; case SEQUENCE_DNA : return "Dna"; case SEQUENCE_CDNA : return "cDNA"; case SEQUENCE_GENOMIC : return "Genomic"; case SEQUENCE_EST : return "Est"; case SEQUENCE_RNA : return "RNA"; default : return "Undefined!"; } } %func makes all the sequence uppercase %simple uppercase %arg seq rw Sequence to be uppercased %% void uppercase_Sequence(Sequence * seq) { int i; for(i=0;ilen;i++) seq->seq[i] = toupper((int)seq->seq[i]); } %func This a) sees how many non ATGCN characters there are in Seq b) If the level is below fraction a) flips non ATGC chars to N b) writes number of conversions to number_of_conver c) returns TRUE c) else returns FALSE fraction of 0.0 means completely intolerant of errors fraction of 1.0 means completely tolerant of errors %simple force_to_dna %arg seq rw sequence object read and converted fraction r number 0..1 for tolerance of conversion number_of_conver wN number of conversions actually made return r TRUE for conversion to DNA, FALSE if not %% boolean force_to_dna_Sequence(Sequence * seq,double fraction,int * number_of_conver) { int count =0; int i; if( seq == NULL ) { warn("Attempting to force a sequence with no Sequence object!\n"); return FALSE; } if( seq->len <= 0 ) { warn("Trying to make a sequence with a length of %d. Bad news!",seq->len); return FALSE; } for(i=0;ilen;i++) { /* if it is lower case, uppercase it! */ seq->seq[i] = (char)toupper((int)seq->seq[i]); if( !is_valid_base_char(seq->seq[i]) ) { count++; } } if( ((double)count/(double)seq->len) < fraction ) { seq->type = SEQUENCE_DNA; if( count != 0 ) { for(i=0;ilen;i++) { if( !is_valid_base_char(seq->seq[i]) ) { seq->seq[i] = 'N'; } } } if( number_of_conver != NULL ) { *number_of_conver = count; } return TRUE; } else { if( number_of_conver != NULL ) { *number_of_conver = count; } return FALSE; } } %func Currently the sequence object stores reversed sequences as start > end. This tests that and returns true if it is %simple is_reversed %arg seq r sequence to test %% boolean is_reversed_Sequence(Sequence * seq) { if( seq->offset > seq->end) return TRUE; return FALSE; } %func This translates a DNA sequence to a protein. It assummes that it starts at first residue (use trunc_Sequence to chop a sequence up). %simple translate %arg dna r DNA sequence to be translated ct r Codon table to do codon->aa mapping return o new protein sequence %% Sequence * translate_Sequence(Sequence * dna,CodonTable * ct) { Sequence * out; int i; int j; int len; char * seq; char * name; char buffer[512]; if( is_dna_Sequence(dna) == FALSE) { warn("Trying to make a translation from a non DNA sequence... type is [%s]",Sequence_type_to_string(dna->type)); return NULL; } len = dna->len/3 + 1; seq = ckcalloc(len,sizeof(char)); sprintf(buffer,"%s.tr",dna->name == NULL ? "NoNameDNASeq" : dna->name); name = stringalloc(buffer); out = Sequence_from_dynamic_memory(name,seq); for(i=0,j=0;ilen-3;i+=3,j++) { out->seq[j] = aminoacid_from_seq(ct,dna->seq+i); } out->seq[j] = '\0'; out->type = SEQUENCE_PROTEIN; out->len = strlen(out->seq); return out; } %func This both complements and reverses a sequence, - a common wish! The start/end are correct with respect to the start/end of the sequence (ie start = end, end = start). %simple revcomp %arg seq r Sequence to that is used to reverse (makes a new Sequence) return o new Sequence which is reversed %% Sequence * reverse_complement_Sequence(Sequence * seq) { Sequence * out; int i; int j; if( is_dna_Sequence(seq) != TRUE ) { warn("Cannot reverse complement non-DNA sequence... type is %s",Sequence_type_to_string(seq->type)); return NULL; } out = Sequence_from_static_memory(seq->name,seq->seq); for(j=0,i=seq->len-1;i >= 0;i--,j++) { out->seq[j] = char_complement_base(seq->seq[i]); /*fprintf(stderr,"In position %d placed %c from %c\n",j,out->seq[j],seq->seq[i]);*/ } out->len = strlen(seq->seq); out->offset = seq->end; out->end = seq->offset; out->type = seq->type; return out; } %func Clever function for dna sequences. When start < end, truncates normally when start > end, truncates end,start and then reverse complements. ie. If you have a coordinate system where reverse sequences are labelled in reverse start/end way, then this routine produces the correct sequence. %simple magic_trunc %arg seq r sequence that is the source to be truncated start r start point end r end point return o new Sequence which is truncated/reversed %% Sequence * magic_trunc_Sequence(Sequence * seq,int start,int end) { Sequence * temp; Sequence * out; if( is_dna_Sequence(seq) == FALSE) { warn("Cannot magic truncate on a non DNA sequence... type is %s",Sequence_type_to_string(seq->type)); return NULL; } if( start < 0 || end < 0 ) { warn("Attempting a magic truncation on indices which are less than zero [%d:%d]. Clearly impossible",start,end); return NULL; } if( start < end ) { return trunc_Sequence(seq,start,end); } else { temp = trunc_Sequence(seq,end,start); if( temp == NULL ) { warn("Unable to truncate sequence"); return NULL; } out = reverse_complement_Sequence(temp); free_Sequence(temp); return out; } } %func truncates a sequence. It produces a new memory structure which is filled from sequence start to end. Please notice Truncation is in C coordinates. That is the first residue is 0 and end is the number of the residue after the cut-point. In otherwords to 2 - 3 would be a single residue truncation. So - if you want to work in more usual, 'inclusive' molecular biology numbers, which start at 1, then you need to say trunc_Sequence(seq,start-1,end); (NB, should be (end - 1 + 1) = end for the last coordinate). Truncation occurs against the *absolute* coordinate system of the Sequence, not the offset/end pair inside. So, this is a very bad error ** wrong code, and also leaks memory ** tru = trunc_Sequence(trunc_Sequence(seq,50,80),55,75); This the most portable way of doing this temp = trunc_Sequence(seq,50,80); tru = trunc_Sequence(temp,55-temp->offset,75-temp->offset); free_Sequence(temp); %simple trunc %arg seq r object holding the sequence to be truncated start r start point of truncation end r end point of truncation return o newly allocated sequence structure %% Sequence * trunc_Sequence(Sequence * seq,int start,int end) { char * name; char * seqb; Sequence * out; if( start < 0 || end < 0 ) { warn("Attempting a truncation on indices which are less than zero [%d:%d]. Clearly impossible",start,end); return NULL; } if( end <= start ) { warn("Trying to truncate Sequence from %d - %d",start,end); return NULL; } if( end > seq->len ) { warn("Trying to truncate Sequecne %s from %d - %d when length is %d", seq->name,start,end,seq->len); return NULL; } name = stringalloc(seq->name); seqb = (char *) ckcalloc(end-start+1,sizeof(char)); memcpy(seqb,seq->seq+start,(end-start)); seqb[end-start] = '\0'; out = Sequence_from_dynamic_memory(name,seqb); out->len = strlen(out->seq); out->type = seq->type; out->offset = seq->offset+start; out->end = seq->offset + end-1; return out; } %func A function for you to easily specify the sequence name and the database separately. Just concatonates the two strings with : betwqeen them. Therefore you should use "swisprot-id" for example as your datastring. calls /read_SRS_Sequence %arg datastring r string representing the database (swissprot-id) srsstring r string for the name (eg, ROA1_HUMAN) %% Sequence * read_SRS_db_Sequence(char * datastring,char * srsstring) { char buffer[256]; sprintf(buffer,"%s:%s",datastring,srsstring); return read_SRS_Sequence(buffer); } %func reads SRS specified sequence. calls popoen with getz -f using srs4 syntax. Will only read the first sequence if there is more than one in the SRS spec, and does not warn you about additional sequences %arg srsstring r srs spec'd string swissprot-id:ROA1_HUMAN %% Sequence * read_SRS_Sequence(char * srsstring) { FILE * pipe; char buffer[MAXLINE]; Sequence * out; sprintf(buffer,"getz -d '[%s]' ",srsstring); pipe = popen(buffer,"r"); if ( pipe == NULL ) { warn("Could not open %s as an SRS database string - probably no getz",srsstring); return NULL; } out = read_fasta_Sequence(pipe); pclose(pipe); return out; } %func reads efetch specificed sequence. calls popen to efetch. A hack around accession numbers so that if the thing looks like WP:acc number, calls it with -a... otherwise assummes you have both database and name in the efetchstring %arg efetchstring r efetch valid string %% Sequence * read_efetch_Sequence(char * efetchstring) { FILE * pf; Sequence * out; char buffer[MAXLINE]; if( strstartcmp(efetchstring,"WP:") != 0 && looks_like_accession(efetchstring+3) == TRUE) { sprintf(buffer,"efetch -f -a %s",efetchstring); } else { sprintf(buffer,"efetch -f %s",efetchstring); } pf = popen(buffer,"r"); if( pf == NULL ) { warn("Could not open efetch pipe with [%s]",efetchstring); return NULL; } out = read_fasta_Sequence(pf); pclose(pf); return out; } %func Just a call a) open filename b) read sequence with /read_fasta_Sequence c) close file. %arg filename r filename to open %% Sequence * read_fasta_file_Sequence(char * filename) { Sequence * out; FILE * ifp; ifp = openfile(filename,"r"); if( ifp == NULL ) { warn("Cannot open %s for read_fasta_file",filename); return NULL; } out = read_fasta_Sequence(ifp); fclose(ifp); return out; } %func reads the sequence part of an EMBL file. This function can either take a file which starts %arg ifp r input file to read from buffer rw buffer containing the first line. maxlen r length of buffer %% Sequence * read_Sequence_EMBL_seq(char * buffer,int maxlen,FILE * ifp) { Sequence * out; char seqbuffer[SEQUENCEBLOCK]; int i = 0; signed char c; if( !isalpha((int)*buffer) ) { warn("I don't like this - got a buffer of [%s] in reading an EMBL sequence section",buffer); } do { if( strstartcmp(buffer,"SQ") == 0 ) { break; } } while ( fgets(buffer,maxlen,ifp) != NULL); out = empty_Sequence_from_dynamic_memory(stringalloc("EMBLseq")); while( (c=fgetc(ifp)) != EOF ) { if( c == '/' && (c=fgetc(ifp)) == '/') break; /*** ugly perhaps. what about single / lines? ***/ if( isalpha(c) ) seqbuffer[i++] = c; if( i > SEQUENCEBLOCK-2) { seqbuffer[i] = '\0'; if( add_string_to_Sequence(out,seqbuffer) == FALSE ) { warn("Could not read full sequence of %s - returning\n",out->name); return out; } i = 0; } } /* ok have to now put away final buffer read! */ seqbuffer[i] = '\0'; add_string_to_Sequence(out,seqbuffer); /** add back > if need be **/ if( feof(ifp) || c != '/' ) { warn("In parsing an EMBL file got an poor ending of a sequence region"); } else { while( (c=fgetc(ifp)) != '\n' && c != EOF ) ; } make_len_type_Sequence(out); return out; } %func reads the fasta file: format is >name sequence allocates a structure and puts in the sequence. Calls /make_len_type_Sequence to check type and length. It leaves the '>' on the next fasta sequence for multiple sequence reading %arg ifp r input file to read from return o new Sequence structure %% Sequence * read_fasta_Sequence(FILE * ifp) { Sequence * out; char seqbuffer[SEQUENCEBLOCK]; int i = 0; signed char c = EOF; if( feof(ifp) ) { /* fail silently */ return NULL; } while( (c=fgetc(ifp)) != EOF && isspace(c)) ; if( feof(ifp) ) return NULL; if( c != '>' ) { warn("First letter read is not '>' - assumming it is not a fasta stream"); return NULL; } if( c == EOF || feof(ifp) ) return NULL; /*** ok = got to > ****/ /*** read in name ****/ while( !isspace(c=fgetc(ifp)) && c != EOF) seqbuffer[i++]=c; if( c == EOF) return NULL; seqbuffer[i]='\0'; /*** name now in sequence buffer - make sequence ***/ out = empty_Sequence_from_dynamic_memory(stringalloc(seqbuffer)); if( out == NULL ) return NULL; /*** ok, suck in the rest of this line if necessary (ie something else on the 1st line) ***/ while( c != EOF && c != '\n' ) c=fgetc(ifp); /*** now read in sequence ***/ for(i=0; !feof(ifp) && (c=fgetc(ifp)) != '>' && c != EOF;) { if( isalpha(c) ) seqbuffer[i++] = c; if( i > SEQUENCEBLOCK-2) { seqbuffer[i] = '\0'; if( add_string_to_Sequence(out,seqbuffer) == FALSE ) { warn("Could not read full sequence of %s - returning\n",out->name); return out; } i = 0; } } /* ok have to now put away final buffer read! */ seqbuffer[i] = '\0'; add_string_to_Sequence(out,seqbuffer); /** add back > if need be **/ if( c == '>' ) ungetc(c,ifp); make_len_type_Sequence(out); return out; } %func shows a region of a sequence as 124 A 125 T etc from start to end. The numbers are in C coordinates (ie, 0 is the first letter). useful for debugging %simple show_debug %arg seq r Sequence to show start r start of list end r end of list %% void show_Sequence_residue_list(Sequence * seq,int start,int end,FILE * ofp) { int i; for(i=start;iseq[i]); } } %func Dodgy function. This is meant to add the more sequence to seq (into ->seq). Not sure how stable this is. In theory it reallocates memory on the basis of ->maxlen. %type internal %arg seq Sequence to add sequence to more pointer to sequence to add return TRUE if successful, FALSE if not %% boolean add_string_to_Sequence(Sequence * seq,char * more) { register int len; register int blocklen; void * temp; len = strlen(more)+1; if( len < seq->maxlen - seq->len ) { /*** ok can add to this block! ****/ strcat(seq->seq,more); seq->len = strlen(seq->seq); return TRUE; } /*** nope - need to realloc ****/ len -= (seq->maxlen - seq->len); /* amount that needs to be realloc'd */ blocklen = 1 + (int)(len / SEQUENCEBLOCK); /* number of blocks */ blocklen *= SEQUENCEBLOCK; /* make that into bytes */ blocklen += seq->maxlen; /* final size of string */ temp = ckrealloc ( seq->seq,blocklen); if( temp == NULL ) { warn("Sequence block error for sequence %s on blocklen %d\n",CKS(seq->name),blocklen); return FALSE; } seq->seq = (char *) temp; /* realloc moves the memory for us as well */ seq->maxlen = blocklen; /*** copy in string ****/ strcat(seq->seq,more); seq->len = strlen(seq->seq); return TRUE; } %func Only allocates sequence structure and name %% Sequence * empty_Sequence_from_dynamic_memory(char * name) { Sequence * out; out = Sequence_alloc(); if( out == NULL ) return NULL; if( name == NULL ) { warn("Attempting to make an empty sequence with no name: assigning dummy name"); name = stringalloc("DummyName"); } out->name = name; out->seq = (char *) ckcalloc (SEQUENCEBLOCK,sizeof(char)); out->maxlen = SEQUENCEBLOCK; out->len = 0; return out; } %func allocates sequence structure with enough length in char for len sequence. %arg len r length of blank sequene space %% Sequence * Sequence_alloc_len(int len) { Sequence * out; out = Sequence_alloc(); if( out == NULL) return NULL; out->seq = (char *) ckcalloc (len,sizeof(char)); out->maxlen = out->len = len; return out; } %func Allocates the sequence structure and memory for name and seq, copies them in. %type internal %% Sequence * Sequence_from_static_memory (char * name,char * seq) { return Sequence_from_dynamic_memory(stringalloc(name),stringalloc(seq)); } %func Allocates the sequence structure and simple attaches name and seq to the correct places %type internal %arg name name of sequence seq a char * to correct sequence %% Sequence * Sequence_from_dynamic_memory(char * name,char * seq) { Sequence * out; if( seq == NULL) { warn("Cannot make a sequence with no sequence!"); return NULL; } if( name == NULL ) { warn("You are attempting to make a sequence with no name - assigning dummy name"); name = stringalloc ("DummyName"); } out = Sequence_alloc(); if( out == NULL) return out; out->name = name; out->seq = seq; out->maxlen = out->len = strlen(seq); return out; } %func writes a fasta file of the form >name Sequence %simple write_fasta %arg seq r sequence to be written ofp file to write to %% void write_fasta_Sequence(Sequence * seq,FILE * ofp) { assert(seq); fprintf(ofp,">%s\n",seq->name == NULL ? "NoName_Null_string" : seq->name ); show_line(seq->seq,60,ofp); } %}