/* Last edited: Apr 24 10:20 1997 (birney) */ %{ #include "dyna.h" #include "genewise6.h" #include "geneloop6.h" #include "genewise4.h" #include "gwlite.h" #include "geneutil.h" #include "genewise21.h" #include "geneloop21.h" /** #include "genelink21.h" **/ #include "geneparameter.h" #include "genefrequency.h" #include "seqhit.h" #include "estwise3.h" #include "estloop3.h" #define PotentialGeneListLISTLENGTH 64 #define PotentialGeneLISTLENGTH 64 #define PotentialTranscriptLISTLENGTH 64 enum GWWRAP_ALG_TYPE { GWWRAP_2193 = 12, GWWRAP_2193L, GWWRAP_2193I, GWWRAP_623, GWWRAP_623L, GWWRAP_421, GWWRAP_6LITE, GWWRAP_333, GWWRAP_333L }; %} struct PotentialExon int tstart int tend int qstart int qend %info Data structure to represent exon information to be passed into dpenvelopes etc %% struct PotentialTranscript PotentialExon ** pex !list struct PotentialGene int guess_start !def="-1" int guess_end !def="-1" PotentialTranscript ** pet !list boolean is_global !def="FALSE" char * name Protein * homolog; // one of these two will be used ThreeStateModel * tsm; AlnBlock * alb; // if we want to save the alignments. double bitscore; int slop_query !def="5" int slop_target !def="25" int query_length %info This data structure hopefully stores the necessary information for finding a gene via a gene wise type algorithm %% struct PotentialGeneList PotentialGene ** pg !list %info a list of potential genes. Made either - externally or from a MSP crunch datastructure %% api object PotentialGeneList func PotentialGeneList_from_DnaSequenceHitList func read_PotentialGeneList_pgfasta_file func read_PotentialGeneList_pgfasta des free_PotentialGeneList endobject object PotentialGene des free_PotentialGene endobject object PotentialTranscript des free_PotentialTranscript endobject object PotentialExon des free_PotentialExon endobject func resolve_PotentialGenes_on_GenomicRegion func new_GenomicRegion_from_GeneWise func add_Genes_to_GenomicRegion_GeneWise func Hscore_from_TSM_genewise #func Gene_from_AlnColumn_GeneWise func GeneParameter21_wrap func AlnBlock_from_protein_genewise_wrap func AlnBlock_from_TSM_genewise_wrap func gwrap_alg_type_from_string endapi %{ #include "gwrap.h" %func Mainly for debugging. Shows a potential gene list %type internal %% void show_PotentialGeneList(PotentialGeneList *pgl,FILE * ofp) { int i; for(i=0;ilen;i++) fprintf(ofp,"%s from %d %d\n",pgl->pg[i]->name,pgl->pg[i]->guess_start,pgl->pg[i]->guess_end); } %func reads in potential gene format having open the file %% PotentialGene * read_PotentialGene_file(char * filename) { PotentialGene * out = NULL; FILE * ifp; char buffer[MAXLINE]; ifp = openfile(filename,"r"); if( ifp == NULL ) { warn("Could not open %s for reading a potential gene",filename); return NULL; } while( fgets(buffer,MAXLINE,ifp) != NULL ) { if( strstartcmp(buffer,"pgene") == 0 ) { out = read_PotentialGene(buffer,ifp); break; } } fclose(ifp); return out; } %func Reads in a potential gene format %% PotentialGene * read_PotentialGene(char *line,FILE * ifp) { PotentialGene * out; char buffer[MAXLINE]; PotentialTranscript * pet; out = PotentialGene_alloc_std(); while( fgets(buffer,MAXLINE,ifp) != NULL ) { if( strstartcmp(buffer,"end") == 0 ) break; if( strstartcmp(buffer,"ptrans") == 0 ) { pet = read_PotentialTranscript(buffer,ifp); if( pet == NULL ) { warn("Could not read a potential transcript. Continuing"); } else { add_PotentialGene(out,pet); } } else { warn("Did not understand line [%s] in PotentialGene",buffer); } } return out; } %func read in format -> pgene ptrans pexon start end qstart qend endptrans endpgene %% PotentialTranscript * read_PotentialTranscript(char * line,FILE * ifp) { int i,tempi; char buffer[MAXLINE]; char temp[MAXLINE]; PotentialTranscript * out; char * runner; PotentialExon * pex; out = PotentialTranscript_alloc_std(); while( fgets(buffer,MAXLINE,ifp) != NULL ) { if( strstartcmp(buffer,"end") == 0 ) break; if( strstartcmp(buffer,"pexon") == 0 ) { strcpy(temp,buffer); pex = PotentialExon_alloc(); runner = strtok(buffer,spacestr); for(i=0;i<4;i++) { runner = strtok(NULL,spacestr); if( runner == NULL || is_integer_string(runner,&tempi) == FALSE ) { warn("Could not parse [%s] as a Potential Exon line",temp); free_PotentialExon(pex); break; } switch (i) { case 0 : pex->tstart = tempi; break; case 1 : pex->tend = tempi; break; case 2 : pex->qstart = tempi; break; case 3 : pex->qend = tempi; break; default : warn("Bollocks. V.bad bug"); break; } } if( i == 4 ) { add_PotentialTranscript(out,pex); } else { free_PotentialExon(pex); } /* back to while */ } else { warn("Got an unparsable line %s in PotentialTranscript",buffer); } } return out; } %func Makes a DPEnv structure from a potential gene with Exons in it If there are no potential transcripts, returns NULL, which is what you probably want %% DPEnvelope * DPEnvelope_from_PotentialGene(PotentialGene * pg) { int i; DPEnvelope * out; if( pg->len == 0 ) { return NULL; } out = DPEnvelope_alloc_std(); for(i=0;ilen;i++) { add_PotentialTranscript_to_DPEnvelope(out,pg->pet[i],pg); } return out; } %func Adds the potential exons in the Potential transcript to the dpenvelope. %% boolean add_PotentialTranscript_to_DPEnvelope(DPEnvelope * dpenv,PotentialTranscript * pet,PotentialGene * pg) { int i; DPUnit * unit; boolean is_rev = FALSE; if( pg->guess_start > pg->guess_end ) { is_rev = TRUE; } if( is_rev ) invsort_PotentialExons_by_start(pet); else sort_PotentialExons_by_start(pet); /* do links to start points */ /* by definition we start at the start point of the Potential gene... */ unit = DPUnit_alloc(); unit->starti = 0; /* always */ unit->startj = 0; unit->height = pet->pex[0]->qstart + 1 + pg->slop_query; if( is_rev ) unit->length = pg->guess_start - pet->pex[0]->tstart +1 + pg->slop_target; else unit->length = pet->pex[0]->tstart - pg->guess_start +1 + pg->slop_target; add_DPEnvelope(dpenv,unit); for(i=0;ilen;i++) { unit = DPUnit_alloc(); unit->starti = pet->pex[i]->qstart - pg->slop_query; unit->height = pet->pex[i]->qend - pet->pex[i]->qstart + 1 + 2*pg->slop_query; if( is_rev ) unit->startj = pg->guess_start - pet->pex[i]->tstart - pg->slop_target; else unit->startj = pet->pex[i]->tstart - pg->guess_start - pg->slop_target; unit->length = abs(pet->pex[i]->tend - pet->pex[i]->tstart) + 1 + 2*pg->slop_target; add_DPEnvelope(dpenv,unit); if( i == pet->len-1 ) continue; /* connecting unit */ unit = DPUnit_alloc(); unit->starti = pet->pex[i]->qend - pg->slop_query; unit->height = pet->pex[i+1]->qstart - pet->pex[i]->qend +1 + 2*pg->slop_query; if( is_rev ) unit->startj = pg->guess_start - pet->pex[i]->tend - pg->slop_target; else unit->startj = pet->pex[i]->tend - pg->guess_start - pg->slop_target; unit->length = abs(pet->pex[i+1]->tstart - pet->pex[i]->tend) +1 + 2*pg->slop_target; add_DPEnvelope(dpenv,unit); } /* end point */ i--; unit = DPUnit_alloc(); unit->starti = pet->pex[i]->qend - pg->slop_query; unit->height = pg->query_length - pet->pex[i]->qend +1 + pg->slop_query; if( is_rev ) unit->startj = pg->guess_start - pet->pex[i]->tend - pg->slop_target; else unit->startj = pet->pex[i]->tend - pg->guess_start - pg->slop_target; unit->length = abs(pg->guess_end - pet->pex[i]->tend) +1 + pg->slop_target; if( is_rev ) unit->length = pg->guess_start - pet->pex[0]->tstart +1 + 2*pg->slop_target; else unit->length = pet->pex[0]->tstart - pg->guess_start +1 + 2*pg->slop_target; add_DPEnvelope(dpenv,unit); return TRUE;; } %func Sorts by start point %type internal %% void sort_PotentialExons_by_start(PotentialTranscript * pet) { sort_PotentialTranscript(pet,compare_PotentialExons_start); } %func compares by start point %type internal %% int compare_PotentialExons_start(PotentialExon * one,PotentialExon * two) { return one->tstart - two->tstart; } %func Sorts by start point backwards %type internal %% void invsort_PotentialExons_by_start(PotentialTranscript * pet) { sort_PotentialTranscript(pet,invcompare_PotentialExons_start); } %func compares by start point but reversed order %type internal %% int invcompare_PotentialExons_start(PotentialExon * one,PotentialExon * two) { return two->tstart - one->tstart; } %func This makes a PotentialGeneList from a DnaSequenceHitList, which is a module which, for example, abstracts the MSP crunch output. The three parameters are: window - what window size to consider a potential gene in wing_length - length of wing sequences to add onto the start/end points of a hit min_score - minimum score to trigger a potential gene. The potential genes are selected as follows: foreach window Take the best scoring segment. if( > min_score) if( there_is_a_segment which start/end + wing_length overlaps + the same name) extend that segment else make a new potential gene %% PotentialGeneList * PotentialGeneList_from_DnaSequenceHitList(DnaSequenceHitList * dsl,int window,int wing_length,double min_score) { int pos; int win_start; double best_score; SegmentHit * best_segment; PotentialGeneList * out; PotentialGene * pg; int i; int back_st_point; out = PotentialGeneList_alloc_std(); /** assumme dsl is sorted **/ pos = 0; win_start = 0; for(;pos < dsl->forward->len;) { best_score = -10000000; best_segment = NULL; for(;pos < dsl->forward->len && dsl->forward->seghit[pos]->qstart < win_start + window;pos++) { if( dsl->forward->seghit[pos]->score > best_score ) { best_score = dsl->forward->seghit[pos]->score; best_segment = dsl->forward->seghit[pos]; } } if( best_segment == NULL || best_score < min_score ) { win_start += window; continue; /* no hits here! */ } /* check to see if we already have used this name */ for(i=0;ilen;i++) { if( strcmp(best_segment->name,out->pg[i]->name) == 0 ) { if( best_segment->qstart < out->pg[i]->guess_end ) { out->pg[i]->guess_end = best_segment->qend + wing_length; break; } } } if( i != out->len ) { /** means we have placed this into another hit **/ win_start += window; continue; } /** ok - we have a new potential gene **/ pg = PotentialGene_alloc(); pg->name = stringalloc(best_segment->name); pg->guess_start = best_segment->qstart - wing_length; pg->guess_end = best_segment->qend + wing_length; add_PotentialGeneList(out,pg); win_start += window; } /** backward strands **/ win_start = 0; back_st_point = out->len; for(pos=0;pos < dsl->backward->len;) { best_score = -10000; best_segment = NULL; for(;pos < dsl->backward->len && dsl->backward->seghit[pos]->qend < win_start + window;pos++) { if( dsl->backward->seghit[pos]->score > best_score ) { best_score = dsl->backward->seghit[pos]->score; best_segment = dsl->backward->seghit[pos]; } } if( best_segment == NULL || best_score < min_score ) { win_start += window; continue; /* no hits here! */ } /* check to see if we already have used this name Only need to check backwards ones*/ for(i=back_st_point;ilen;i++) { if( strcmp(best_segment->name,out->pg[i]->name) == 0 ) { if( best_segment->qstart < out->pg[i]->guess_start ) { out->pg[i]->guess_start = best_segment->qstart + wing_length; break; } } } if( i != out->len ) { /** means we have placed this into another hit **/ win_start += window; continue; } /** ok - we have a new potential gene **/ pg = PotentialGene_alloc(); pg->name = stringalloc(best_segment->name); /** it is reversed, hence the flip from end to start **/ pg->guess_end = best_segment->qend - wing_length; pg->guess_start = best_segment->qstart + wing_length; add_PotentialGeneList(out,pg); win_start += window; } return out; } %func reads a file with lines like >ROA1_HUMAN:1234:3445 As potential gene with a guess start point at 1234 and end at 3445 %% PotentialGeneList * read_PotentialGeneList_pgfasta_file(char * filename) { FILE * ifp; PotentialGeneList * out; ifp = openfile(filename,"r"); if( ifp == NULL ) { warn("Could not open %s for reading Potential Gene List",filename); return NULL; } out = read_PotentialGeneList_pgfasta(ifp); fclose(ifp); return out; } %func reads a file with lines like >ROA1_HUMAN:1234:3445 As potential gene with a guess start point at 1234 and end at 3445 %% PotentialGeneList * read_PotentialGeneList_pgfasta(FILE * ifp) { Sequence * seq; PotentialGeneList * pgl; PotentialGene * pg; char * runner,*run2; pgl = PotentialGeneList_alloc_std(); while( (seq = read_fasta_Sequence(ifp)) != NULL ) { pg = PotentialGene_alloc(); pg->guess_start = -1; pg->guess_end = -1; add_PotentialGeneList(pgl,pg); pg->homolog = Protein_from_Sequence(seq); if ( (runner=strchr(seq->name,':')) != NULL ) { *runner = '\0'; ++runner; if( (run2 = strchr(runner,':')) != NULL ) { *run2 = '\0'; ++run2; is_integer_string(runner,&pg->guess_start); is_integer_string(run2,&pg->guess_end); } } } return pgl; } %func Takes the potential gene list, (made for example from MSP crunch file through DnaSequenceHitList object), and calculates genes on it. This is the core functionality to postwise. There are two basic modes to this routine that probably should be split up. In the first mode, the Potential Gene List has actual proteins/or HMMs (tsm - threestatemodels) in memory in the structure. This then loops through the list calling genewise functions (going through the wrap functions in this module). The other mode has no proteins in memory but a way of fetching proteins from a pipe using a string passed in. The string looks like a sprintf string where the name of the protein is the only thing to be substituted. For example efetch -f %s or getz -d '[swissprot-id:%s]' would be sensible examples. Remember that the command should produce things on stdout, and that you should (obviously) make sure that the indexer uses the same name as the name in, for example, the msp crunch file. If should_free is true then it frees the protein sequence and any alignment after the calculation. Otherwise it stores both of these in the potentialgene structure. For the business end of the algorithm, this function uses the /AlnBlock_from_protein_genewise_wrap and /AlnBlock_from_TSM_genewise_wrap functions in this module. %arg gr genomic region to make genes on pgl potential gene list of homologs and start/end points alg_protein algorithm to use with protein comparisons alg_hmm algorithm to use with HMM comparisons prot_thr Threshold under which genes are not predicted comp Comparison matrix for protein sequences gap Gap cost for protein sequences ext Extension cost for protein sequences gpara Gene parameters rmd Random Model to compare against inter Random Model for intergenic regions fetch_from_pipe For potential genes with just a name, a pipe to get sequences from should_free ? Free the protein sequences after resolving it make_name fN a pointer to a function which makes the name of the gene %% int resolve_PotentialGenes_on_GenomicRegion(GenomicRegion * gr,PotentialGeneList * pgl,int alg_protein,int alg_hmm,double prot_thr,double hmm_thr,CompMat * comp,int gap,int ext,GeneParameter21 * gpara,RandomModelDNA * rmd,RandomModelDNA * inter,char * fetch_from_pipe,boolean should_free,char *(*make_name)(Wise2_Genomic * gen,char *,int,Wise2_Gene *),double bit_cut_off,DPRunImpl * dpri) { int i; Genomic * gentemp; RandomModel * rm; AlnBlock * alb; int count = 0; char buffer[128]; FILE * pipe; if( gr == NULL || pgl == NULL ) { warn("Look - you've passed in some NULL genomicregion or pgl's to resolve_PotentialGenes. You can't expect me to do something with them??"); return -1; } if( gr->genomic == NULL ) { warn("Your genomic region has no DNA, so I can't put any genes onto it!"); return -1; } rm = default_RandomModel(); for(i=0;ilen;i++) { auto PotentialGene * pg; pg = pgl->pg[i]; if( pg->tsm == NULL && pg->homolog == NULL && fetch_from_pipe == NULL) { warn("You have neither a HMM nor a homolog for a potential gene, or a fetchable name. Yikes!"); continue; } if( pg->tsm == NULL && pg->homolog == NULL ) { /* lets fetch from pipe */ sprintf(buffer,fetch_from_pipe,pg->name); pipe = popen(buffer,"r"); pg->homolog = read_fasta_Protein(pipe); if( pg->homolog == NULL ) { warn("Could not read protein [%s] with pipe [%s]",pg->name,fetch_from_pipe); pclose(pipe); continue; } pclose(pipe); } /* * ok - use guess_start and end to pick out region to feed genewise. * if end < start then it truncates (clever eh!) */ /* fprintf(stderr,"Here we go %d and %d\n",pg->guess_start,pg->guess_end); */ if( pg->guess_start == -1 || pg->guess_start < 0) pg->guess_start = 0; if( pg->guess_end < 0) pg->guess_end = gr->genomic->baseseq->len-1; if( pg->guess_end == -1 || pg->guess_end > gr->genomic->baseseq->len-1) pg->guess_end = gr->genomic->baseseq->len-1; if( pg->guess_start > gr->genomic->baseseq->len-1) pg->guess_start = gr->genomic->baseseq->len-1; /*fprintf(stderr,"And now we go %d and %d\n",pg->guess_start,pg->guess_end); */ gentemp = truncate_Genomic(gr->genomic,pg->guess_start,pg->guess_end); if( gentemp == NULL ) { warn("Cannot make genomic truncation!"); continue; } /* gentemp->baseseq->name = stringalloc("TempGene"); write_fasta_Sequence(gentemp->baseseq,stdout); */ /* * decide whether this is protein sequence or HMM */ if( pg->tsm != NULL && pg->homolog != NULL ) { warn("Nope - you have a potential gene with both a threestatemodel and homolog thing. Taking the HMM!"); } if( pg->tsm != NULL ) { log_full_error(INFO,0,"Using HMM [%s] [%d/%d] in region %d %d",pg->tsm->name,i+1,pgl->len,pg->guess_start,pg->guess_end); alb = AlnBlock_from_TSM_genewise_wrap(pg->tsm,gentemp,gpara,rmd,inter,TRUE,alg_hmm,1.0,1,pg,dpri,NULL); if( alb == NULL ) { warn("In attempting to map a region of %s to %s, got no alignment. This seems like a bad error!",gr->genomic->baseseq->name,pg->tsm->name); continue; } /*** could be multiple genes ***/ count += add_Genes_to_GenomicRegion_GeneWise(gr,pg->guess_start,pg->guess_end,alb,pg->tsm->name,FALSE,make_name); } else { log_full_error(INFO,0,"Using protein [%s] [%d/%d] in region %d %d",pg->homolog->baseseq->name,i+1,pgl->len,pg->guess_start,pg->guess_end); alb = AlnBlock_from_protein_genewise_wrap(pg->homolog,gentemp,comp,gap,ext,gpara,rmd,inter,alg_protein,pg->is_global,TRUE,rm,1.0,pg,dpri,NULL); if( alb == NULL ) { warn("In attempting to map a region of %s to %s, got no alignment. This seems like a bad error!",gr->genomic->baseseq->name,pg->homolog->baseseq->name); continue; } /*** check the score against the bit_cut_off ***/ pg->bitscore = Score2Bits(alb->score); /* wont work well with 21:93 */ if( pg->bitscore >= bit_cut_off) { /*** could be multiple genes ***/ count += add_Genes_to_GenomicRegion_GeneWise(gr,gentemp->baseseq->offset,gentemp->baseseq->end,alb,pg->homolog->baseseq->name,FALSE,make_name); } } free_Genomic(gentemp); if( should_free == TRUE ) { pg->homolog = free_Protein(pg->homolog); } if( should_free == TRUE) alb = free_AlnBlock(alb); else pg->alb = alb; } free_RandomModel(rm); return count; } %func A general wrap over the production of parameters for the GeneWise programs. The geneparameter21 holds all the parameters, and can be approximated for the 6:23 and 4:21 algorithms This function is the best way to make a GeneParameter21 object as all the different options for how to make it or modify its contents are laid out as arguments to this function %arg gf r Gene Frequency data structure, holding counts for splice sites etc subs_error substitution error on the dna sequence indel_error rough estimate of the insertion/deletion per base error rate rmd the random model of the DNA that is used use_modelled_codon if TRUE, model codon frequency use_modelled_splice if TRUE, make splice models from gf parameters ct codon table which is used for codon->aa mapping return A newly allocated structure %% GeneParameter21 * GeneParameter21_wrap(GeneFrequency21 * gf,double subs_error,double indel_error,RandomModelDNA * rmd,boolean use_modelled_codon,boolean use_modelled_splice,boolean tie_intron_prob,CodonTable * ct,Probability rnd_loop,Probability cds_loop,Probability rnd_to_model,Probability link_loop,Probability link_to_model) { GeneParameter21 * out; int i; out = GeneParameter21_from_GeneFrequency21(gf,ct,rmd,rnd_loop,cds_loop,rnd_to_model,link_loop,link_to_model); if( use_modelled_codon == FALSE ) { out->cm = free_CodonMapper(out->cm); out->cm = flat_CodonMapper(ct); } if( use_modelled_splice == FALSE ) { out->cses = free_ComplexSequenceEvalSet(out->cses); out->cses = default_genomic_ComplexSequenceEvalSet(); out->modelled_splice = FALSE; } if( tie_intron_prob == TRUE ) { for(i=0;i<5;i++) out->gp->central[i] = rmd->base[i]; } /*** errors ***/ sprinkle_errors_over_CodonMapper(out->cm,subs_error); add_flat_error_probabilities_GeneParser21(out->gp,indel_error); GeneParser21_fold_in_RandomModelDNA(out->gp,rmd); fold_in_RandomModelDNA_into_RandomCodon(out->rc,rmd); return out; } %func A function which aligns a Protein sequecne to a Genomic sequence under the Comparison matrix comp and the gene paras in gpara. This is the best function for accessing GeneWise functionality for a protein to dna comparison, allowing for introns. To make the protein object, you will first read in a generic sequence object using something like read_fasta_Sequence and then convert it to a protein object using new_Protein_from_Sequence To make the genomic object, you will first read in a generic sequence object using something like read_fasta_Sequence and then convert it to a genomic object using new_Genomic_from_Sequence To make a CompMat object you will use read_Blast_file_CompMat from the compmat module. It is likely, if the Wise2 enviroment has been set up correctly that read_Blast_file_CompMat("blosum62.bla") will be fine. You should at the moment only use halfbit matrices (blosum62 is one such matrix) To make the necessary random modules use the default construtors in the randommodel module To make the gene parameter object use the GeneParameter21_wrap function found in this module. It will need GeneFrequencies read in using the read_GeneFrequency21_file function in the genefrequency module. Again if Wise2 has been set up correctly, read_GeneFrequency21_file("human.gf") should work To again a valid algorithm type use gwrap_alg_type_from_string found in this module. gwrap_alg_type_from_string("623") would be a good choice This function basically makes a threestatemodel (standard HMM) from the protein and the comparison matrix with the *scary* assumption that the comparison matrix is in half bit form. It then calls /AlnBlock_from_TSM_genewise_wrap to do the nasty stuff. %arg protein protein sequence used in the comparison dna genomic DNA sequence used comp protein comparison matrix *in half bits* gap gap penalty (negative) ext extension penalty (negative) gpara Gene parameters. rmd models to be compared to intergenic model of random dna between genes alg algorithm type is_global has now become flag for local/global/end-biased switch pg r Potential gene - could be NULL - if rough exon positions are known pal wN Raw alginment to be saved if non-NULL %% AlnBlock * AlnBlock_from_protein_genewise_wrap(Protein * protein,Genomic * dna,CompMat * comp,int gap,int ext,GeneParameter21 * gpara,RandomModelDNA * rmd,RandomModelDNA * intergenic,int alg,int is_global,boolean use_syn,RandomModel * rm,Probability allN,PotentialGene * pg,DPRunImpl * dpri,PackAln ** pal) { ThreeStateModel * tsm; RandomModel * rm2; AlnBlock * out; if( protein == NULL || dna == NULL || comp == NULL || gpara == NULL || rmd == NULL ){ warn("trappable error in PackAln from protein sequence, passed some NULL objects, Complain!"); return NULL; } rm2 = default_RandomModel(); if( is_global == 1) tsm = global_ThreeStateModel_from_half_bit_Sequence(protein,comp,rm2,gap,ext); else { tsm = ThreeStateModel_from_half_bit_Sequence(protein,comp,rm2,gap,ext); if( is_global > 2 ) { set_startend_policy_ThreeStateModel(tsm,is_global,30,halfbit2Probability(-15)); /* set end bias */ } } out = AlnBlock_from_TSM_genewise_wrap(tsm,dna,gpara,rmd,intergenic,use_syn,alg,allN,1,pg,dpri,pal); free_ThreeStateModel(tsm); free_RandomModel(rm2); return out; } %func A function which aligns a protein HMM (as found in my threestatemodel structure) to a genomic DNA sequence. At the moment you are unlikely to be reading in the HMM structure yourself, so this is not something you will be doing. The core algorithms for each method are found in genewise21/geneloop21 etc files. %arg tsm protein TSM to be used in the comparison gen genomic DNA sequence used gpara Gene parameters. rmd models to be compared to intergenic model of random dna between genes alg algorithm type use_syn use a synchronous null model pg r Potential gene - could be NULL - if rough exon positions are known palpoi wN Raw alginment to be saved if non-NULL %% AlnBlock * AlnBlock_from_TSM_genewise_wrap(ThreeStateModel * tsm,Genomic * gen,GeneParameter21 * gpara,RandomModelDNA * rmd,RandomModelDNA * intergenic,boolean use_syn,int alg,Probability allN,boolean flat_insert,PotentialGene * pg,DPRunImpl * dpri,PackAln ** palpoi) { AlnBlock * out = NULL; PackAln * pal = NULL; ComplexSequence * cs = NULL; GeneWise * gw = NULL; GeneWiseScore * gws = NULL; RandomCodonScore * rcs = NULL ; GeneParser21Score * gps = NULL; GeneParser4Score * gp4s = NULL; RandomModelDNAScore * ids = NULL; DPEnvelope * dpenv; Sequence * dna; cDNAParserScore * cps = NULL; /* for estwise type algorithms */ GwLite * gwl = NULL; GwLiteScore * gwls = NULL; ComplexSequenceEval * tempcse; ComplexSequenceEvalSet * cses; dna = gen->baseseq; assert(tsm); assert(gen); assert(gpara); assert(rmd); assert(gpara->rc); /*show_Genomic(gen,stderr);*/ /*show_GeneParser21(gpara->gp,stderr); */ if( tsm == NULL || dna == NULL || gpara == NULL || rmd == NULL){ warn("trappable error in PackAln from TSM sequence, passed some NULL objects, Complain!"); return NULL; } /*** prepare cses ***/ if( prepare_ComplexSequenceEvalSet(gpara->cses) == FALSE ) { warn("Unable to prepare complexsequenceevalset in TMS2DNA wrap"); goto exit; } if( alg == GWWRAP_333 || alg == GWWRAP_333L ) { cses = default_cDNA_ComplexSequenceEvalSet(); cs = new_ComplexSequence(gen->baseseq,cses); free_ComplexSequenceEvalSet(cses); } else if ( alg == GWWRAP_6LITE ) { /* yup. This is scary. */ tempcse = gpara->cses->cse[1]; gpara->cses->cse[1] = codon64_number_ComplexSequenceEval(); cs = new_ComplexSequence(gen->baseseq,gpara->cses); free_ComplexSequenceEval(gpara->cses->cse[1]); gpara->cses->cse[1] = tempcse; } else { if( (cs=evaluate_ComplexSequence_Genomic(gen,gpara->cses,0,Probability2Score(0.01))) == FALSE ) { warn("Unable to make ComplexSequence in TMS2DNA wrap"); goto exit; } } /*show_ComplexSequence(cs,stderr);*/ if( (gw=GeneWise_from_ThreeStateModel(tsm,gpara->gp,gpara->cm,allN,gpara->gwcm)) == NULL) { warn("Unable to make GeneWise model"); goto exit; } if( gpara->modelled_splice == FALSE) { flatten_balance_scores_GeneWise(gw); } if( pg == NULL || (pg->guess_start == -1) ) { dpenv = NULL ; } else { info("Using DPEnvelope over matrix"); pg->guess_start = dna->offset; pg->guess_end = dna->end; pg->query_length = tsm->len; dpenv = DPEnvelope_from_PotentialGene(pg); /* show_DPEnvelope(dpenv,stderr); */ } /* show_GeneWiseSegment(gw->seg[0],stderr); */ if( use_syn == TRUE ) { if( tsm->rm == NULL ) { warn("Ugh - a threestatemodel without a random model. Not in this code matey"); goto exit; } GeneWise_fold_in_synchronised_RandomModel(gw,tsm->rm,gpara->cm,gpara->ct,0.5); flatten_RandomCodon(gpara->rc); } else { GeneWise_fold_in_RandomModelDNA(gw,rmd); } if( alg == GWWRAP_6LITE ) { gwl = GwLite_from_GeneWise(gw); gwls = GwLiteScore_from_GwLite(gwl); } if( flat_insert == TRUE ) { check_flat_insert(gw,1,0,gpara->cm->ct); } if( (gws = GeneWiseScore_from_GeneWise(gw)) == NULL) { warn("Unable to make GeneWiseScore model"); goto exit; } if( (gps = GeneParser21Score_from_GeneParser21(gpara->gp)) == NULL) { warn("Unable to make GeneParserScore model"); goto exit; } if( (rcs = RandomCodonScore_from_RandomCodon(gpara->rc)) == NULL) { warn("Unable to make RandomCodonScore model"); goto exit; } ids = folded_RandomModelDNAScore_from_2RMD(intergenic,rmd); gp4s = GeneParser4Score_from_GeneParser21Score(gps); if( alg == GWWRAP_333 || alg == GWWRAP_333L ) { cps = cDNAParserScore_from_GeneParser21Score(gps); } switch(alg) { case GWWRAP_2193 : pal = PackAln_bestmemory_GeneWise21(gws,cs,gps,rcs,ids,dpenv,dpri); out = convert_PackAln_to_AlnBlock_GeneWise21(pal); break; case GWWRAP_2193I : warn("Algorithm currently disabled! Sorry!"); break; /** pal = PackAln_dc_build_GeneLinker21(gws,cs,gps,rcs,ids); if(pal == NULL) goto exit; out = convert_PackAln_to_AlnBlock_GeneLinker21(pal,NULL); break; **/ case GWWRAP_2193L : pal = PackAln_bestmemory_GeneLoop21(gws,cs,gps,rcs,ids,dpenv,dpri); if(pal == NULL) goto exit; out = convert_PackAln_to_AlnBlock_GeneLoop21(pal); break; case GWWRAP_623L : pal = PackAln_bestmemory_GeneLoop6(gws,cs,gp4s,dpenv,dpri); if(pal == NULL) goto exit; out = convert_PackAln_to_AlnBlock_GeneLoop6(pal); break; case GWWRAP_623 : pal = PackAln_bestmemory_GeneWise6(gws,cs,gp4s,dpenv,dpri); if(pal == NULL) goto exit; out = convert_PackAln_to_AlnBlock_GeneWise6(pal); break; case GWWRAP_333 : cps = cDNAParserScore_from_GeneParser21Score(gps); pal = PackAln_bestmemory_EstWise3(gws,cs,cps,dpenv,dpri); if(pal == NULL) goto exit; out = convert_PackAln_to_AlnBlock_EstWise3(pal); break; case GWWRAP_333L : cps = cDNAParserScore_from_GeneParser21Score(gps); pal = PackAln_bestmemory_EstLoop3(gws,cs,cps,dpenv,dpri); if(pal == NULL) goto exit; out = convert_PackAln_to_AlnBlock_EstLoop3(pal); break; case GWWRAP_421 : pal = PackAln_bestmemory_GeneWise4(gws,cs,gp4s,dpenv,dpri); if(pal == NULL) goto exit; out = convert_PackAln_to_AlnBlock_GeneWise4(pal); break; case GWWRAP_6LITE : pal = PackAln_bestmemory_GeneLiteModel(gwls,cs,gp4s,dpenv,dpri); if(pal == NULL) goto exit; out = convert_PackAln_to_AlnBlock_GeneLiteModel(pal); GwLite_AlnBlock_surgery(out); break; default : warn("A major problem. No valid algorithm type passed in"); goto exit; } map_phase0_codons_AlnBlock_GeneWise(out,gws,cs); if( palpoi != NULL ) { *palpoi = pal; pal = NULL; } goto exit; exit : if(pal != NULL) pal = free_PackAln(pal); if( cps != NULL ) cps = free_cDNAParserScore(cps); if( ids != NULL ) ids = free_RandomModelDNAScore(ids); if(cs != NULL ) cs = free_ComplexSequence(cs); if(gw != NULL ) gw = free_GeneWise(gw); if(gws != NULL ) gws = free_GeneWiseScore(gws); if(gps != NULL ) free_GeneParser21Score(gps); if(rcs != NULL ) rcs = free_RandomCodonScore(rcs); if(gp4s != NULL) gp4s = free_GeneParser4Score(gp4s); return out; } %func Runs a database search of the genewise algorithm. This makes a high score object which you can then use to retrieve enteries as well as print out the top score (!) %arg tdb r a database of profileHMMs gdb r a database of genomic sequence gpara r geneparameters rmd r random model to be compared with in non syn mode intergenic r random model of intergenic DNA (usually the same as rmd) use_syn use synchronous random model alg algorithm type bits_cutoff cutoff in bits of the scores to store report_level stagger rate of reporting progress on stderr -1 means never die_on_error if true, exits on error (not used at the moment) return a new Hscore object of the entire db search %% Hscore * Hscore_from_TSM_genewise(ThreeStateDB * tdb,GenomicDB * gdb,GeneParameter21 * gpara,RandomModelDNA * rmd,RandomModelDNA * intergenic,boolean use_syn,int alg,double bits_cutoff,Probability allN,int report_level,boolean die_on_error,boolean flat_insert,DBSearchImpl * dbsi) { Hscore * out = NULL; GeneWiseDB * gwdb = NULL; cDNADB * cdb = NULL; cDNAParserScore * cps= NULL; GeneParser21Score * gps = NULL; GeneParser4Score * gp4s = NULL; RandomCodonScore * rcs = NULL; RandomModelDNAScore * ids = NULL; cDNA * temp; Search_Return_Type ret; ComplexSequenceEval * tempcse; ret = SEARCH_ERROR; gwdb = new_GeneWiseDB(tdb,gpara,rmd,use_syn,allN); gwdb->flat_insert = flat_insert; if( gwdb == NULL ) { warn("Could not build a new GeneWiseDB from the objects provided. Exiting without completing the search"); goto exit; } if( (gps = GeneParser21Score_from_GeneParser21(gpara->gp)) == NULL) { warn("Unable to make GeneParserScore model"); goto exit; } if( (rcs = RandomCodonScore_from_RandomCodon(gpara->rc)) == NULL) { warn("Unable to make RandomCodonScore model"); goto exit; } ids = folded_RandomModelDNAScore_from_2RMD(intergenic,rmd); gp4s = GeneParser4Score_from_GeneParser21Score(gps); if( alg == GWWRAP_333 || alg == GWWRAP_333L ) { /* could be a single dna sequence */ if( gdb->is_single_seq == TRUE ) { temp = cDNA_from_Sequence(hard_link_Sequence(gdb->forw->seq)); cdb = new_cDNADB_from_single_seq(temp); free_cDNA(temp); /* hard linked by database */ } else { cdb = new_cDNADB(gdb->sdb); } } /*** allocate Hscore structure ***/ out = std_bits_Hscore(bits_cutoff,report_level); switch(alg) { case GWWRAP_2193 : ret = Wise2_search_GeneWise21(dbsi,out,gwdb,gdb,gps,rcs,ids); break; case GWWRAP_2193I : warn("Algorithm currently disabled! Sorry!"); break; case GWWRAP_2193L : warn("Unable to do db looping searches (sort of - pointless - )!"); break; case GWWRAP_623L : warn("Unable to do db looping searches (sort of - pointless - )!"); break; case GWWRAP_623 : ret = Wise2_search_GeneWise6(dbsi,out,gwdb,gdb,gp4s); break; case GWWRAP_6LITE : ret = Wise2_search_GeneLiteModel(dbsi,out,gwdb,gdb,gp4s); break; case GWWRAP_333 : cps = cDNAParserScore_from_GeneParser21Score(gps); ret = search_EstWise3(dbsi,out,gwdb,cdb,cps); break; case GWWRAP_333L : warn("Unable to do db looping searches (sort of - pointless - )!"); break; case GWWRAP_421 : ret = Wise2_search_GeneWise4(dbsi,out,gwdb,gdb,gp4s); break; default : warn("A major problem. No valid algorithm type passed in"); goto exit; } goto exit; exit : /* for 6LITE leaking a tiny amount of memory. Oh well... */ if( ids != NULL ) ids = free_RandomModelDNAScore(ids); if( cps != NULL ) free_cDNAParserScore(cps); if( cdb != NULL ) free_cDNADB(cdb); if(gps != NULL ) free_GeneParser21Score(gps); if(rcs != NULL ) rcs = free_RandomCodonScore(rcs); if(gp4s != NULL) gp4s = free_GeneParser4Score(gp4s); if( gwdb != NULL ) { free_GeneWiseDB(gwdb); } if( die_on_error == TRUE && ret == SEARCH_ERROR) { if( out != NULL ) { free_Hscore(out); } return NULL; } return out; } %func Makes a cdna parser from a genewise parser. Basically copies the indel penalties across. %type internal %% cDNAParserScore * cDNAParserScore_from_GeneParser21Score(GeneParser21Score * gps) { cDNAParserScore * out; out = cDNAParserScore_alloc(); out->trans[PCD_INSERT_2_BASE] = gps->transition[GP21_INSERT_2_BASE]; out->trans[PCD_INSERT_1_BASE] = gps->transition[GP21_INSERT_1_BASE]; out->trans[PCD_DELETE_2_BASE] = gps->transition[GP21_DELETE_2_BASE]; out->trans[PCD_DELETE_1_BASE] = gps->transition[GP21_DELETE_1_BASE]; return out; } %func Gives you the integer interpretation from the string, which is one of 2193 2193L, 623, 623L, 421, 2193LINK This integer can then be passed into routines like AlnBlock_from_protein_genewise_wrap %% int gwrap_alg_type_from_string(char * str) { int t; t = get_number_from_slashed_string(str,"2193/2193L/623/623L/421/2193LINK/333/333L/6LITE"); switch (t) { case 0 : return GWWRAP_2193; case 1 : return GWWRAP_2193L; case 2 : return GWWRAP_623; case 3 : return GWWRAP_623L; case 4 : return GWWRAP_421; case 5 : return GWWRAP_2193I; case 6 : return GWWRAP_333; case 7 : return GWWRAP_333L; case 8 : return GWWRAP_6LITE; default : warn("Cannot convert string %s into a valid genewise algorithm type\n",str); return -1; } } %}