%{ #include "sequence.h" #define SeqAlignLISTLENGTH 64 %} struct SeqAlign char * name Sequence ** seq !list %info The most stupid and bad representation of a sequence alignment, being a list of sequences with padding characters (-) in them Not very useful for anything but reformatting and column frequencies (just). Dont use this for complicated alignment manipulations. Use the more hard core aln stuff For reading in this data structure, you can use the wise2xhmmer2x bridge to Sean Eddy's reading code. This copes inherently with automatically detecting sequence alignment. Might bridge to his output stuff sometime as well... %% struct ColumnCount double count[27] %info counts from a column of an alignment %% %{ #include "seqalign.h" %func Gives you a column count You are supposed to have enough sense to cache this on the caller if you need it again %% ColumnCount * ColumnCount_from_SeqAlign(SeqAlign * sa,int col) { ColumnCount * out; double count[27] = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 , 0.0}; int i; if( sa == NULL || sa->seq[0]->len < col ) { warn("Cannot make column count at %d - out of range/seqalign null",col); return NULL; } for(i=0;ilen;i++) { if( sa->seq[i]->len < col ) { warn("For seqalign sequence %d (%s), although first seq ok, this doesn't extend to %d. Error.",i,sa->seq[i]->name,col); return NULL; } if ( isalpha(sa->seq[i]->seq[col]) ) { count[toupper(sa->seq[i]->seq[col]) - 'A']++; } else { count[26]++; } } out = ColumnCount_alloc(); for(i=0;i<27;i++) out->count[i] = count[i]; return out; } %func Writes selex format %% boolean write_selex_SeqAlign(const SeqAlign * sa,int name_len,int block_len,FILE * ofp) { int col; int i; int j; int k; for(col=0;colseq[0]->len;) { for(j=0;jlen;j++) { for(k=0;kseq[j]->name[k] != '\0';k++) { fputc(sa->seq[j]->name[k],ofp); } if( sa->seq[j]->name[k] == '\0' && k >= name_len ) { warn("In printing selex alignment, name %s is truncated at %d",sa->seq[j]->name,k); } /* put in the rest of k */ for(;kseq[j]->len ; i++) { fputc(sa->seq[j]->seq[i],ofp); } fputc('\n',ofp); } /* end over all sequences for this block */ col += block_len; fprintf(ofp,"\n\n"); } return TRUE; } %func Reads in selex (Stockholm) alignment At the moment ignores all #= stuff on the sequence Read HMMER documentation for a definition of the format %% SeqAlign * read_selex_SeqAlign(FILE * ifp) { char buffer[4096]; SeqAlign * out; Sequence * temp; int i; boolean isfirst; boolean isfseq; boolean isend; char * name; char * seq; char * strip; int pos; out = SeqAlign_alloc_std(); while( fgets(buffer,4096,ifp) != NULL ) { fprintf(stderr,"reading %s in selex alignment\n",buffer); if( buffer[0] == '#' ) continue; /* skips over # */ if( buffer[0] == '/' && buffer[1] == '/' ) break; if( !isspace(buffer[0]) ) { /** in a block **/ i = 0; /* this is the first sequence */ if( out->len == 0 ) { isfirst = TRUE; isfseq = TRUE; } else { isfirst = FALSE; isfseq = FALSE; } for(;;) { /* strip trailing white space */ for(strip = buffer + strlen(buffer) - 1; strip > buffer && isspace(*strip);strip--) ; strip++; *strip = '\0'; for(name = seq = buffer;!isspace(*seq) && *seq; seq++) ; /** if '\0' - worry **/ if( !*seq ) { warn("For name %s in selex alignment, no sequence!",name); return out; } *seq = '\0'; if( isfirst == TRUE ) { temp = empty_Sequence_from_dynamic_memory(stringalloc(name)); add_SeqAlign(out,temp); } else { if ( i >= out->len ) { warn("For sequence %s, got over previous blocks size",name); return NULL; } temp = out->seq[i++]; } /* if is fseq, then find the next character */ if( isfseq == TRUE ) { for(seq++;*seq && isspace(*seq);seq++) ; if( *seq == '\0' ) { warn("In first sequence %s, got to end of line with no sequence!",name); return NULL; } isfseq = FALSE; pos = seq - buffer; } /* else pos is as normal */ /* map all non alnum chars to '-' */ for(seq = buffer+pos;*seq;seq++) { if( !isalnum(*seq) ) { *seq = '-'; } } /* add to sequence */ add_string_to_Sequence(temp,buffer+pos); /* get next line */ while( (seq = fgets(buffer,4096,ifp)) != NULL ) { if( buffer[0] != '#' ) break; if( buffer[0] == '/' && buffer[1] == '/' ) break; } if( seq == NULL ) { isend = TRUE; break; } /* if it is blank, break out of block */ if( !isalnum(buffer[0]) ) { break; } if( buffer[0] == '/' && buffer[1] == '/' ) break; } if( isend == TRUE ) break; } /* end of was a non space line */ /* else - ignore it !*/ } return out; }