%{ #include "dyna.h" %} %{ #include "geneutil.h" %func Makes a new GenomicRegion with the genes predicted from this AlnBlock Really a wrapper around the add_Genes_to_GenomicRegion_GeneWise and other functionality %arg gen genomic sequence to use pseudo If true, predicts frameshifted genes as pseudogenes alb genewise alignment to predict genes from return a newly allocated structure %% GenomicRegion * new_GenomicRegion_from_GeneWise(Genomic * gen,boolean pseudo,AlnBlock * alb) { GenomicRegion * out; out = new_GenomicRegion(gen); add_Genes_to_GenomicRegion_GeneWise(out,gen->baseseq->offset,gen->baseseq->end,alb,NULL,pseudo,NULL); return out; } %func Potential an Alnblock may have more than one gene due to looping models This adds all the genes to gr %arg gr genomic region to add genes to org_start start point of the dna to which the alb was made from org_end end point of the dna to which the alb was made from alb logical label alignment pseudo If true, frameshifted genes are predicted as pseudo genes root the second argument to make_name make_name fN a pointer to a function to actually make the name of the gene %% int add_Genes_to_GenomicRegion_GeneWise(GenomicRegion * gr,int org_start,int org_end,AlnBlock * alb,char * root,boolean pseudo,char * (*make_name)(Wise2_Genomic *,char *,int ,Wise2_Gene * )) { int count = 0; Gene * gene; AlnColumn * alc; alc = alb->start; while( (gene = Gene_from_AlnColumn_GeneWise(alc,org_start,org_end,pseudo,&alc)) != NULL ) { if( make_name != NULL ) { gene->name = (*make_name)(gr->genomic,root,gr->len,gene); } if( root != NULL ) { gene->seqname = stringalloc(root); } add_Gene_to_GenomicRegion(gr,gene); count++; if( alc == NULL ) break; } return count; } %func A wrap for making a gene structure from an AlnBlock derived from one of the genewise methods %arg alc Alignment column in an AlnBlock produced by genewise org_start offset that the genewise alignment was to the coordinate system org_end emd that the genewise alignment was to the coordinate system end w pointer to a AlnColumn * to say when it has finished with this gene %% Gene * Gene_from_AlnColumn_GeneWise(AlnColumn * alc,int org_start,int org_end,boolean predict_pseudo_for_frameshift,AlnColumn ** end) { Gene * out; Transcript * tr; Translation * ts; Exon * ex; AlnColumn * prev; SupportingFeature * sf; int score = 0; int dosf = 0; int phase = 0; int frame_break = 0; while(alc != NULL && is_random_AlnColumn_genewise(alc) == TRUE ) alc = alc->next; while (alc != NULL && strcmp(alc->alu[1]->text_label,"CODON") != 0 ) alc = alc->next; if( alc == NULL) return NULL; out = Gene_alloc_len(1); tr = Transcript_alloc_std(); add_Gene(out,tr); out->start = alc->alu[1]->start +1; score += alc->alu[0]->score[0]; for(;;) { /* fprintf(stderr,"Top loop - alc at %s %s\n",alc->alu[1]->text_label,alc->alu[0]->text_label); */ /* fprintf(stderr,"2 Score is %.2f %s %d %.2f\n",Score2Bits(score),alc->alu[1]->text_label,alc->alu[0]->score[0],Score2Bits(alc->alu[0]->score[0]));*/ ex = Exon_alloc_std(); add_ex_Transcript(tr,ex); /* * this is always the start of an exon */ dosf = 0; if( strcmp(alc->alu[1]->text_label,"CODON") == 0 ) { ex->start = alc->alu[1]->start+1 - out->start; /* coordinated in alignment coords */ dosf = 1; phase = 0; } else if ( strcmp(alc->alu[1]->text_label,"3SS_PHASE_0") == 0 ) { ex->start = alc->alu[1]->start +4 - out->start; dosf = 1; phase = 0; } else if ( strcmp(alc->alu[1]->text_label,"3SS_PHASE_1") == 0 ) { ex->start = alc->alu[1]->start +4 - out->start; phase = 1; } else if ( strcmp(alc->alu[1]->text_label,"3SS_PHASE_2") == 0 ) { ex->start = alc->alu[1]->start +4 - out->start; phase = 2; } else { ex->start = alc->alu[1]->start +1 - out->start; /* coordinated in alignment coords */ phase = -1; } ex->phase = phase; /* * Exons can start in INSERTs (yuk). In which case we don't * make a supporting feature */ if( dosf == 1 && strstartcmp(alc->alu[0]->text_label,"INSERT") != 0 ) { sf = SupportingFeature_alloc(); /* we fill in start and end from the exon at the moment. Not pretty */ sf->hstart = alc->alu[0]->start+1; sf->hstrand = 1; /* currently only got proteins. Thank the lord! */ sf->start = ex->start; } else { sf = NULL; /* make sure we don't generate a sf here */ } for(prev=alc,alc=alc->next;alc != NULL; ) { /* fprintf(stderr,"Exon loop - alc at %s %s %d\n",alc->alu[1]->text_label,alc->alu[0]->text_label,sf); */ score += alc->alu[0]->score[0]; /* fprintf(stderr,"1 Score is %.2f %s\n",Score2Bits(score),alc->alu[1]->text_label); */ if( is_frameshift_AlnColumn_genewise(alc) == TRUE && predict_pseudo_for_frameshift == TRUE ) { score += alc->alu[0]->score[0]; fprintf(stderr,"Score is %.2f\n",Score2Bits(score)); out->ispseudo = TRUE; alc = alc->next; continue; } if( is_random_AlnColumn_genewise(alc) == TRUE) break; if( strcmp(alc->alu[1]->text_label,"CODON") != 0 ) { if( strstartcmp(alc->alu[0]->text_label,"DELETE") == 0 ) { /* must add sf and start a new one */ /*fprintf(stderr,"Looking at alc at %s %s %d %d %d\n",alc->alu[1]->text_label,alc->alu[0]->text_label,prev,out,sf);*/ if( sf != NULL ) { sf->end = prev->alu[1]->end+1 - out->start; sf->hend = prev->alu[0]->end+1; add_Exon(ex,sf); sf = NULL; } /* * go the end of this delete run, which are residues in the query with no * target info */ while( alc->next != NULL && strstartcmp(alc->next->alu[0]->text_label,"DELETE") == 0 ) { alc = alc->next; } if( alc->next != NULL && strcmp(alc->next->alu[1]->text_label,"CODON") == 0 ) { /* the next position is the start of the new alignment */ sf = SupportingFeature_alloc(); sf->hstart = alc->next->alu[0]->start+1; sf->start = alc->next->alu[1]->start+1 - out->start; } else { sf = NULL; } } else { break; } } else { /* it is a codon match, but it could be an insert */ if( strstartcmp(alc->alu[0]->text_label,"INSERT") == 0) { /* break at this point, add this supporting feature */ if( sf != NULL ) { sf->end = prev->alu[1]->end+1 - out->start; sf->hend = prev->alu[0]->end+1; add_Exon(ex,sf); sf = NULL; } frame_break = 0; /* go to the end of this insert run, watching for frameshifts */ while( alc->next != NULL && strstartcmp(alc->next->alu[0]->text_label,"INSERT") == 0 ) { if( is_frameshift_AlnColumn_genewise(alc->next) == TRUE || is_random_AlnColumn_genewise(alc->next) == TRUE) { if( is_frameshift_AlnColumn_genewise(alc->next) == TRUE && predict_pseudo_for_frameshift == TRUE ) { out->ispseudo = TRUE; alc = alc->next; continue; } else { alc = alc->next; frame_break = 1; break; } } alc = alc->next; } if( frame_break == 1 ) { break; /* out of this gene */ } if( alc->next != NULL && strcmp(alc->next->alu[1]->text_label,"CODON") == 0 ) { /* the next position is the start of the new alignment */ sf = SupportingFeature_alloc(); /* do not understand why not having a +1 here is correct. Hmph */ sf->hstart = alc->next->alu[0]->start+1; sf->start = alc->next->alu[1]->start+1 - out->start; } else { sf = NULL; } } else { /* could be the start of a run from INSERT into match */ if( sf == NULL ) { sf = SupportingFeature_alloc(); /* we fill in start and end from the exon at the moment. Not pretty */ sf->hstart = alc->alu[0]->start+1; sf->hstrand = 1; /* currently only got proteins. Thank the lord! */ sf->start = alc->alu[1]->start+1 - out->start; } } } /* end of else it is a codon match */ prev = alc; alc = alc->next; } /* * The exon has ended. But why? */ if( sf != NULL ) { sf->hend = prev->alu[0]->end+1; add_Exon(ex,sf); } if( alc == NULL ) { out->end = prev->alu[1]->end +1; ex->end = out->end - out->start; if( sf != NULL ) { sf->end = ex->end; } break; } if( is_random_AlnColumn_genewise(alc) == TRUE) { out->end = prev->alu[1]->end +1; ex->end = out->end - out->start; if( sf != NULL ) { sf->end = ex->end; } break; } if( strcmp(alc->alu[1]->text_label,"5SS_PHASE_0") == 0 ) { out->end = alc->alu[1]->start+1; phase = 0; } else if ( strcmp(alc->alu[1]->text_label,"5SS_PHASE_1") == 0 ) { out->end = alc->alu[1]->start+2; phase = 1; } else if ( strcmp(alc->alu[1]->text_label,"5SS_PHASE_2") == 0 ) { out->end = alc->alu[1]->start+3; phase = 2; } else { phase = 0; out->end = prev->alu[1]->end +1; ex->end = out->end - out->start; if( sf != NULL ) { sf->end = ex->end; } break; } /** set end of exon to the correct size from here **/ ex->end = out->end - out->start; /** sf is to the codon, not to the exon */ if( sf != NULL ) { if( phase == 0 ) { sf->end = ex->end; } else if ( phase == 1 ) { sf->end = ex->end-1; } else { sf->end = ex->end-2; } } while( alc != NULL && strstartcmp(alc->alu[1]->text_label,"3SS") != 0 ) { alc = alc->next; score += alc->alu[0]->score[0]; } if( alc == NULL ) { warn("Got to the end of an alignment inside an intron. Oooops!"); break; } }/* back to for(;;) */ if( end != NULL ) *end = alc; if( org_start < org_end) { out->start += org_start-1; out->end += org_start-1; } else { /* fprintf(stderr,"was %d to %d\n",out->start,out->end); */ out->start = org_start-1 - out->start; out->end = org_start-1 - out->end; /* fprintf(stderr,"now %d to %d (%d)\n",out->start,out->end,org_start); */ } if( out->ispseudo == FALSE ) { ts = Translation_alloc(); ts->start = 0; ts->end = length_Transcript(tr); ts->parent = tr; add_Transcript(tr,ts); } tr->parent = out; out->bits = Score2Bits(score); return out; } %func This function is to say what is a frameshift label %type internal %% boolean is_frameshift_AlnColumn_genewise(const AlnColumn * alc) { if( strcmp(alc->alu[1]->text_label,"SEQUENCE_INSERTION") == 0 ) { return TRUE; } if( strcmp(alc->alu[1]->text_label,"SEQUENCE_DELETION") == 0 ) { return TRUE; } return FALSE; } %func This function is to say where this should be skipped in alignment/gene prediction problems %type internal %% boolean is_random_AlnColumn_genewise(const AlnColumn * alc) { char * la; la = alc->alu[1]->text_label; if( strcmp(la,"RANDOM_SEQUENCE") == 0 ) return TRUE; if( strcmp(la,"END") == 0 ) return TRUE; la = alc->alu[0]->text_label; if( strstr(la,"_RND_") != NULL ) return TRUE; return FALSE; } %func This function is to say where things are introns %type internal %% boolean is_intron_AlnColumn_genewise(const AlnColumn * alc) { char * la; la = alc->alu[1]->text_label; if( strcmp(la,"CENTRAL_INTRON") == 0 ) return TRUE; if( strcmp(la,"PYRIMIDINE_TRACT") == 0 ) return TRUE; if( strcmp(la,"SPACER") == 0 ) return TRUE; return FALSE; } %}