%{ #include "wisebase.h" #include "seqalign.h" #include "pwmdna.h" #include "randommodel.h" #include "complexsequence.h" #include "complexevalset.h" /* for the standard evals */ #define DEFAULT_SPLICE_OFFSET_SCORE 4.5 %} struct SpliceSiteScore pwmDNAScore * score int offset Score min_collar Score max_collar Score score_offset struct GeneStats SeqAlign * splice5 int splice5_offset SeqAlign * splice3 int splice3_offset RandomModelDNA * intron; !def="NULL"// actually counts double average_intron RandomModelDNA * polyp; !def="NULL"// actually counts double average_polyp RandomModelDNA * rnd; !def="NULL" double codon[64] %info This structure is to hold the new generation of gene statistics for the genewise algorithms %% struct GeneModelParam double splice5_pseudo double splice3_pseudo double intron_emission_pseudo double polyp_emission_pseudo Bits min_collar Bits max_collar Bits score_offset char * gene_stats_file %info A small helper object containing the ways of converting the actual counts/alignments to the model Here really for convience so we can keep all the code associated with the creation of the GeneModel together %% struct GeneModel pwmDNA * splice5 int splice5_offset pwmDNA * splice3 int splice3_offset RandomModelDNA * intron double intron_stay_prob RandomModelDNA * polyp double polyp_stay_prob RandomModelDNA * rnd SpliceSiteScore * splice5score SpliceSiteScore * splice3score double codon[64] %info This structure is to hold the new generation of models for the genewise algorithm %% %{ #include "genestats.h" %func Shows help %% void show_help_GeneModelParam(FILE * ofp) { fprintf(ofp,"New gene model statistics\n"); fprintf(ofp," -splice_max_collar [5.0] maximum Bits value for a splice site \n"); fprintf(ofp," -splice_min_collar [-5.0] minimum Bits value for a splice site \n"); fprintf(ofp," -splice_score_offset [%.1f] score offset for splice sites\n",DEFAULT_SPLICE_OFFSET_SCORE); fprintf(ofp," -genestats [gene.stat]\n"); } %func Makes a GeneModelParam from argv %% GeneModelParam * new_GeneModelParam_from_argv(int * argc,char ** argv) { GeneModelParam * out; char * temp; out = std_GeneModelParam(); if( (temp=strip_out_assigned_argument(argc,argv,"splice_min_collar")) != NULL ) { if( is_double_string(temp,&out->min_collar) == FALSE ) { warn("%s is not a floating point number. Can't be a splice_min_collar",temp); free_GeneModelParam(out); return NULL; } } if( (temp=strip_out_assigned_argument(argc,argv,"splice_max_collar")) != NULL ) { if( is_double_string(temp,&out->max_collar) == FALSE ) { warn("%s is not a floating point number. Can't be a splice_max_collar",temp); free_GeneModelParam(out); return NULL; } } if( (temp=strip_out_assigned_argument(argc,argv,"splice_score_offset")) != NULL ) { if( is_double_string(temp,&out->score_offset) == FALSE ) { warn("%s is not a floating point number. Can't be a splice_score_offset",temp); free_GeneModelParam(out); return NULL; } } if( (temp=strip_out_assigned_argument(argc,argv,"genestats")) != NULL ) { if( out->gene_stats_file != NULL ) { ckfree(out->gene_stats_file); } out->gene_stats_file = stringalloc(temp); } return out; } %func Makes a standard GeneModelParam %% GeneModelParam * std_GeneModelParam(void) { GeneModelParam * out; out = GeneModelParam_alloc(); out->splice5_pseudo = 1.0; out->splice3_pseudo = 1.0; out->intron_emission_pseudo = 1.0; out->polyp_emission_pseudo = 1.0; out->min_collar = -5.0; out->max_collar = +5.0; out->score_offset = DEFAULT_SPLICE_OFFSET_SCORE; out->gene_stats_file = stringalloc("gene.stat"); return out; } %func Combines GeneStats_from_GeneModelParam and GeneModel_from_GeneStats %% GeneModel * GeneModel_from_GeneModelParam(GeneModelParam * p) { GeneStats * st; GeneModel * out; assert(p); st = GeneStats_from_GeneModelParam(p); out = GeneModel_from_GeneStats(st,p); free_GeneStats(st); return out; } %func Makes a GeneStats from GeneModelParam - basically just opening the file %% GeneStats * GeneStats_from_GeneModelParam(GeneModelParam * p) { GeneStats * gs; FILE * ifp; assert(p); assert(p->gene_stats_file); ifp = openfile(p->gene_stats_file,"r"); if( ifp == NULL ) { warn("Unable to open %s as gene stats file",p->gene_stats_file); return NULL; } gs = read_GeneStats(ifp); return gs; } %func Makes a model from the stats file %% GeneModel * GeneModel_from_GeneStats(GeneStats * gs,GeneModelParam * p) { GeneModel * out; int i; double total; out = GeneModel_alloc(); assert(gs); assert(gs->splice5); assert(gs->splice3); assert(gs->intron); assert(gs->rnd); for(i=0;i<64;i++) { out->codon[i] = gs->codon[i]; } out->splice5 = pwmDNA_from_SeqAlign(gs->splice5,p->splice5_pseudo); fold_randommodel_pwmDNA(out->splice5,gs->rnd); out->splice5score = SpliceSiteScore_alloc(); out->splice5score->score = pwmDNAScore_from_pwmDNA(out->splice5); out->splice5score->offset = gs->splice5_offset; out->splice5score->min_collar = Probability2Score(Bits2Probability(p->min_collar)); out->splice5score->max_collar = Probability2Score(Bits2Probability(p->max_collar)); out->splice5score->score_offset = Probability2Score(Bits2Probability(p->score_offset)); out->splice3 = pwmDNA_from_SeqAlign(gs->splice3,p->splice3_pseudo); fold_randommodel_pwmDNA(out->splice3,gs->rnd); out->splice3score = SpliceSiteScore_alloc(); out->splice3score->score = pwmDNAScore_from_pwmDNA(out->splice3); out->splice3score->offset = gs->splice3_offset; out->splice3score->min_collar = Probability2Score(Bits2Probability(p->min_collar)); out->splice3score->max_collar = Probability2Score(Bits2Probability(p->max_collar)); out->splice3score->score_offset = Probability2Score(Bits2Probability(p->score_offset)); out->intron = RandomModelDNA_alloc(); for(total = 0.0,i=0;i<4;i++) total += gs->intron->base[i] + p->intron_emission_pseudo; for(i=0;i<4;i++) out->intron->base[i] = (gs->intron->base[i] + p->intron_emission_pseudo)/total; out->intron->base[4] = 1.0; if( gs->polyp != NULL ) { out->polyp = RandomModelDNA_alloc(); for(total = 0.0,i=0;i<4;i++) total += gs->polyp->base[i] + p->polyp_emission_pseudo; for(i=0;i<4;i++) out->polyp->base[i] = (gs->polyp->base[i] + p->polyp_emission_pseudo)/total; } out->rnd = hard_link_RandomModelDNA(gs->rnd); return out; } %func shows a genemodel %% void show_GeneModel(GeneModel * gm,FILE * ofp) { fprintf(ofp,"Splice5\n"); show_pwmDNA_col(gm->splice5,ofp); fprintf(ofp,"Splice3\n"); show_pwmDNA_col(gm->splice3,ofp); } %func Makes an entire ComplexSequenceEvalSet for genomic work %% ComplexSequenceEvalSet * new_ComplexSequenceEvalSet_from_GeneModel(GeneModel * gm) { ComplexSequenceEvalSet * out; assert(gm); assert(gm->splice5score); assert(gm->splice3score); out = ComplexSequenceEvalSet_alloc_len(11); add_ComplexSequenceEvalSet(out,base_number_ComplexSequenceEval()); add_ComplexSequenceEvalSet(out,codon_number_ComplexSequenceEval()); add_ComplexSequenceEvalSet(out,ComplexSequenceEval_from_pwmDNAScore_splice(gm->splice5score)); add_ComplexSequenceEvalSet(out,ComplexSequenceEval_from_pwmDNAScore_splice(gm->splice3score)); add_ComplexSequenceEvalSet(out,flat_zero()); add_ComplexSequenceEvalSet(out,flat_zero()); out->type = SEQUENCE_GENOMIC; prepare_ComplexSequenceEvalSet(out); return out; } %func Makes a ComplexSequenceEval for a splice site pwmdna %type internal %% ComplexSequenceEval * ComplexSequenceEval_from_pwmDNAScore_splice(SpliceSiteScore * score) { ComplexSequenceEval * out; out = ComplexSequenceEval_alloc(); /* shouldn't really add ones, but this is ok anyway. Yukky hack due to not understanding a bug in the window determination */ /** *STILL don't know precisely what is going on down here! ***/ /* out->left_window = ssm->offset + ssm->pre_splice_site +1; */ out->left_window = 10; /* out->right_window = ssm->offset + ssm->post_splice_site +1; */ out->right_window = 10; out->left_lookback =10; out->outside_score= NEGI; out->data_type = 245; /* any old key */ out->data = (void *) score; out->type = SEQUENCE_GENOMIC; out->eval_func = pwmDNA_splice_ComplexSequence_eval_func; out->score_type = CseScoreType_Bits; return out; } %func This function is used as a pointer to function in the eval func You should never be using this function yourself! %type internal %% int pwmDNA_splice_ComplexSequence_eval_func(int type,void * data,char * seq) { SpliceSiteScore * sc; pwmDNAScore * pds; int score; sc = (SpliceSiteScore* ) data; pds = sc->score; /* offset is written in biological coordinates. Need to get c style coordiates */ /* no idea what is happening here, but it works ;) */ score = score_pwmDNAScore_string(pds,seq-sc->offset+1); if( score < sc->min_collar ) { score = sc->min_collar; } if( score > sc->max_collar ) { score = sc->max_collar; } score -= sc->score_offset; /* fprintf(stderr,"Scoring %d at some position\n",score);*/ return score; } %func Reads a GeneStats file %% GeneStats * read_GeneStats(FILE * ifp) { char buffer[MAXLINE]; GeneStats * out; SeqAlign * temp; char **base; char **brk; out = GeneStats_alloc(); while( fgets(buffer,MAXLINE,ifp) != NULL ) { fprintf(stderr,"Reading %s",buffer); if( buffer[0] == '#' ) continue; if( buffer[0] == '%' && buffer[1] == '%' ) break; if( strstartcmp(buffer,"splice5") == 0 ) { base = brk = breakstring(buffer,spacestr); if( *brk == NULL || *(brk+1) == NULL || is_integer_string(*(brk+1),&out->splice5_offset) == 0) { warn("Cannot read splice5 offset - must be splice5 "); return NULL; } ckfree(base); temp = read_selex_SeqAlign(ifp); if( temp == NULL ) { warn("Could not read in selex alignment for splice5"); continue; } out->splice5 = temp; continue; } if( strstartcmp(buffer,"splice3") == 0 ) { base = brk = breakstring(buffer,spacestr); if( *brk == NULL || *(brk+1) == NULL || is_integer_string(*(brk+1),&out->splice3_offset) == 0) { warn("Cannot read splice3 offset - must be splice3 "); return NULL; } ckfree(base); temp = read_selex_SeqAlign(ifp); if( temp == NULL ) { warn("Could not read in selex alignment for splice5"); continue; } out->splice3 = temp; continue; } if( strstartcmp(buffer,"intron_emission") == 0 ) { if( fgets(buffer,MAXLINE,ifp) == NULL ) { warn("Could not read in intron emission line"); break; } out->intron = get_genestat_emission(buffer); if( fgets(buffer,MAXLINE,ifp) != NULL ) { continue; } else { break; } } if( strstartcmp(buffer,"polyp_emission") == 0 ) { if( fgets(buffer,MAXLINE,ifp) == NULL ) { warn("Could not read in polyp emission line"); break; } out->polyp = get_genestat_emission(buffer); if( fgets(buffer,MAXLINE,ifp) != NULL ) { continue; } else { break; } } if( strstartcmp(buffer,"rnd_emission") == 0 ) { if( fgets(buffer,MAXLINE,ifp) == NULL ) { warn("Could not read in rnd emission line"); break; } out->rnd = get_genestat_emission(buffer); if( fgets(buffer,MAXLINE,ifp) != NULL ) { continue; } else { break; } } if( strstartcmp(buffer,"rndcodon") == 0 ) { if( read_codon_GeneStats(out->codon,buffer,ifp) == FALSE ) { warn("Problem in reading codon line!"); } continue; } if( isalpha(buffer[0]) ) { warn("Could not read line %s in genestats reading\n",buffer); } } fprintf(stderr,"About to leave function\n"); return out; } %func reads in the emission stuff in a genestats line %type internal %% RandomModelDNA * get_genestat_emission(char * buffer) { RandomModelDNA * out; int i; char ** base; char ** brk; double d; out = RandomModelDNA_alloc(); base = brk = breakstring(buffer,spacestr); for(i=0;*brk != NULL && i < 4;i++, brk++){ if( is_double_string(*brk,out->base+i) == FALSE) { warn("For genestat word %s, not a double in emission!",*brk); return FALSE; } } ckfree(base); if( i < 4 ) { warn("Did not read in 5 numbers for emission scores in genestats"); } return out; } %func testing function %% void dump_GeneStats(GeneStats * st,FILE * ofp) { int i; assert(st); assert(ofp); fprintf(ofp,"#\n# Dumping gene stats, wise2.2 style\n#\n"); fprintf(ofp,"splice5\n"); write_selex_SeqAlign(st->splice5,10,70,ofp); fprintf(ofp,"//\nsplice3\n"); write_selex_SeqAlign(st->splice3,10,70,ofp); fprintf(ofp,"//\n"); fprintf(ofp,"intron_emission\n"); for(i=0;i<4;i++) { fprintf(ofp,"%f ",st->intron->base[i]); } fprintf(ofp,"\n"); fprintf(ofp,"//\n"); if( st->polyp != NULL ) { fprintf(ofp,"polyp_emission\n"); for(i=0;i<4;i++) { fprintf(ofp,"%f ",st->polyp->base[i]); } } fprintf(ofp,"\n"); fprintf(ofp,"//\n"); } %func assummes codon_array is 64 positions long line should have begin consensus on it and be of MAXLINE length as it will be used as the buffer. This does **not** check that you have filled up all 64 positions. %% boolean read_codon_GeneStats(double * codon_array,char* line,FILE * ifp) { boolean ret = TRUE; char * codon; char * number; if( strwhitestartcmp(line,"rndcodon",spacestr) != 0 ) { warn("In reading codon line, got no 'rndcoodon' tag [%s]",line); return FALSE; } while( fgets(line,MAXLINE,ifp) != NULL ) { if( line[0] == '#' ) continue; if( strwhitestartcmp(line,"//",spacestr) == 0 ) break; codon = strtok(line,spacestr); number = strtok(NULL,spacestr); if( codon == NULL ) { warn("Found an uncommented line in codon consensus with no leading codon word"); continue; } if( number == NULL ) { warn("For codon %s, no number found",codon); ret = FALSE; continue; } if( strchr(codon,'N') != NULL ) continue; if( is_non_ambiguous_codon_seq(codon) == FALSE ) { warn("Codon %s is not really a codon... problem!"); ret = FALSE; continue; } codon_array[base4_codon_from_seq(codon)]= atof(number); } return ret; } %}