/* has to be before the others due to nasty namespace clashes */ #define WISE2_CROSS_HMMER2 #include "wise2xhmmer2.h" #include "dyna.h" #include "version.h" #include "gwrap.h" char * program_name = "genewisedb"; /* * program specific includes */ #include "gwrap.h" #include "genedisplay.h" #include "matchsum.h" /* * program specific variables */ enum PG_SEARCH_MODE { PG_SEARCH_S2DB, PG_SEARCH_DB2S, PG_SEARCH_DB2DB }; int search_mode; GeneWiseDB * gwdb = NULL; char * dna_seq_file = NULL; GenomicDB * gdb = NULL; SequenceDB * sdb = NULL; Genomic * gen = NULL; boolean use_single_dna = FALSE; char * protein_file = NULL; Protein * pro = NULL; char * hmm_file = NULL; ThreeStateModel * tsm = NULL; char * hmm_name = NULL; ThreeStateDB * tsmdb = NULL; GeneWiseScore * gws = NULL; Hscore * hs = NULL; /** different protein possibilities **/ boolean use_single_pro = FALSE; boolean use_db_pro = FALSE; boolean use_tsm = FALSE; boolean use_pfam1 = FALSE; boolean use_pfam2 = FALSE; char * qstart_str = NULL; int qstart = -1; char * qend_str = NULL; int qend = -1; char * gene_file = NULL; GeneFrequency21 * gf = NULL; char * matrix_file = "blosum62.bla"; CompMat * mat = NULL; char * gap_str = "12"; int gap = 12; char * ext_str = "2"; int ext = 2; char * length_of_N_str = "10"; int length_of_N = 10; char * prob_in_rep = "0.001"; double rep_prob = 0.001 ; char * codon_file = NULL; CodonTable * ct = NULL; char * output_file = "-"; FILE * ofp = NULL; char * report_str = NULL; int report_stagger = -1; RandomModelDNA * rmd = NULL; char * subs_string = "0.00001"; double subs_error = 0.00001; char * indel_string = "0.00001"; double indel_error = 0.00001; char * cfreq_string = "flat"; boolean model_codon = FALSE; char * splice_string = "model"; boolean model_splice = TRUE; boolean flat_insert = TRUE; char * startend_string = "default"; int startend = TSM_default; char * allN_string = "0.9"; Probability allN = 0.9; char * null_string = "syn"; boolean use_syn = FALSE; char * intron_string = "tied"; boolean use_tied_model = FALSE; DBSearchImpl * dbsi = NULL; int alg = GWWRAP_623; char * alg_str = NULL; DPRunImpl * dpri = NULL; int aln_alg = GWWRAP_623; char * aln_alg_str = NULL; int aln_number = 50; char * aln_number_str = "50"; double search_cutoff = 20.00; char * search_cutoff_str = "20.00"; double evalue_search_cutoff = -1.0; char * evalue_search_str = NULL; boolean show_histogram = TRUE; boolean show_PackAln = FALSE; boolean show_AlnBlock = FALSE; boolean show_ace = FALSE; boolean show_gff = FALSE; boolean show_trans = FALSE; boolean show_pep = FALSE; boolean show_cdna = FALSE; boolean show_pretty = FALSE; boolean show_pretty_gene = FALSE; boolean show_gene_plain = FALSE; boolean show_match_sum = FALSE; boolean show_para = FALSE; boolean show_overlap = FALSE; boolean pseudo = FALSE; char * main_block_str = "50"; int main_block = 50; char * divide_str = "//"; boolean complete = FALSE; /* complete analysis or not */ boolean complete_trans = FALSE; boolean complete_gene = FALSE; boolean complete_cdna = FALSE; boolean complete_ace = FALSE; boolean show_diana = FALSE; boolean show_acehalf = FALSE; boolean show_embl = FALSE; Probability rnd_loop = 0.99; Probability cds_loop = 0.97; Probability rnd_to_model = (1 - 0.99) / 3; Probability link_loop = 0.98; Probability link_to_model = (1- 0.98) / 3; AlnBlock * alb; PackAln * pal; GenomicRegion * gr; GenomicRegion * embl; MatchSummarySet * mss; RandomModel * rm; GeneParameter21 * gpara; GeneParser21Score * gps; GeneParser4Score * gp4s; RandomCodonScore * rcs; RandomModelDNAScore * ids; boolean show_output(void) { int i,j,k; Genomic * gent; ThreeStateModel * temptsm; AlnBlock * alb; PackAln * pal; GenomicRegion * gr; GenomicRegion * cgr; MatchSummarySet * mss; Protein * ps; Protein * trans; cDNA * cdna; double bits; double aln_cutoff; boolean fitted_res = FALSE; AlnColumn * alt; /* sort by bit score first */ sort_Hscore_by_score(hs); aln_cutoff = search_cutoff; /* for the moment */ if( search_mode == PG_SEARCH_S2DB ) { if( hs->his == NULL || hs->his->total < 1000 ) { info("Cannot fit histogram to a db smaller than 1,000"); fprintf(ofp,"[Warning: Can't fit histogram to a db smaller than 1,000]\n\n"); fitted_res = FALSE; show_histogram = FALSE; } else { fit_Hscore_to_EVD(hs,20); fitted_res = TRUE; } } /* if you think this is scary, you are absolutely right! */ if( alg == GWWRAP_6LITE && aln_alg != GWWRAP_6LITE) { /* memory leak here. Apologies */ gpara->cses->cse[1] = codon_number_ComplexSequenceEval(); } /* dofus catcher */ if( aln_alg != alg ) { fprintf(ofp,"\n#\n#WARNING!\n#\n# Your alignment algorithm is different from your search algorithm.\n# This is probably quite sensible but will lead to differing scores.\n# Use the search score as an indicator of the significance of the match\n# Read the docs for more information\n#\n"); } fprintf(ofp,"\n\n#High Score list\n"); fprintf(ofp,"#Protein ID DNA Str ID Bits Evalue\n"); fprintf(ofp,"--------------------------------------------------------------------------\n"); for(i=0;ilen;i++) { bits = Score2Bits(hs->ds[i]->score); if( bits < search_cutoff ) { break; } if( fitted_res == TRUE && evalue_search_str != NULL ) { if( hs->ds[i]->evalue > evalue_search_cutoff ) break; } if( fitted_res == TRUE) fprintf(ofp,"Protein %-20sDNA [%c] %-24s %.2f %.2g\n",hs->ds[i]->query->name,hs->ds[i]->target->is_reversed == TRUE ? '-' : '+',hs->ds[i]->target->name,bits,hs->ds[i]->evalue); else fprintf(ofp,"Protein %-20sDNA [%c] %-24s %.2f\n",hs->ds[i]->query->name,hs->ds[i]->target->is_reversed == TRUE ? '-' : '+',hs->ds[i]->target->name,bits); /* fprintf(ofp,"Sequence %-24s %c %.2f\n",hs->ds[i]->target->name,hs->ds[i]->target->is_reversed == TRUE ? '-' : '+',bits); */ } if( search_mode == PG_SEARCH_S2DB && show_histogram == TRUE ) { fprintf(ofp,"\n\n#Histogram\n"); fprintf(ofp,"-----------------------------------------------------------------------\n"); PrintASCIIHistogram(hs->his,ofp); } fprintf(ofp,"\n\n#Alignments\n"); fprintf(ofp,"-----------------------------------------------------------------------\n"); if( complete == TRUE ) { cgr = new_GenomicRegion(gen); } for(i=0;ilen;i++) { bits = Score2Bits(hs->ds[i]->score); if( bits < search_cutoff ) { break; } if( i >= aln_number ) { break; } if( fitted_res == TRUE && evalue_search_str != NULL ) { if( hs->ds[i]->evalue > evalue_search_cutoff ) break; } fprintf(ofp,"\n>Results for %s vs %s (%s) [%d]\n",hs->ds[i]->query->name,hs->ds[i]->target->name,hs->ds[i]->target->is_reversed == TRUE ? "reverse" : "forward",i ); gent = get_Genomic_from_GenomicDB(gdb,hs->ds[i]->target); if( gent == NULL ) { warn("Unable to retrieve %s for alignment. Skipping...",hs->ds[i]->target->name); continue; } temptsm = indexed_ThreeStateModel_ThreeStateDB(tsmdb,hs->ds[i]->query); alb = AlnBlock_from_TSM_genewise_wrap(temptsm,gent,gpara,rmd,rmd,use_syn,aln_alg,allN,flat_insert,NULL,dpri,&pal); gr = new_GenomicRegion(gent); add_Genes_to_GenomicRegion_GeneWise(gr,gent->baseseq->offset,gent->baseseq->end,alb,NULL,pseudo,NULL); /* put in their seqname */ for(j=0;jlen;j++) { if( temptsm->accession != NULL ) gr->gene[j]->seqname = stringalloc(temptsm->accession); else gr->gene[j]->seqname = stringalloc(temptsm->name); } if( complete == TRUE ) { /* copy genes over into cgr region */ for(j=0;jlen;j++) { if( gr->gene[j]->bits > aln_cutoff ) add_Gene_to_GenomicRegion(cgr,hard_link_Gene(gr->gene[j])); } } if( use_single_pro == FALSE || pro == NULL) mss = MatchSummarySet_from_AlnBlock_genewise(alb,temptsm->name,1,gent->baseseq); else mss = MatchSummarySet_from_AlnBlock_genewise(alb,pro->baseseq->name,pro->baseseq->offset,gent->baseseq); if( show_pretty == TRUE ) { fprintf(ofp,"\n%s output\nScore %4.2f bits over entire alignment.\nThis will be different from per-alignment scores. See manual for details\nFor computer parsable output, try %s -help or read the manual\n",program_name,Score2Bits(pal->score),program_name); if( alg == GWWRAP_2193L || alg == GWWRAP_2193) { fprintf(ofp,"Entrie alignment score contains unseen 'random' score segments\nYou should only use the per-alignments score printed below\nfor the bits score of the alignment\n\n"); } if( use_syn == FALSE ) { fprintf(ofp,"Scores as bits over a flat simple random model\n\n"); } else { fprintf(ofp,"Scores as bits over a synchronous coding model\n\n"); } ps = pseudo_Protein_from_ThreeStateModel(temptsm); protgene_ascii_display(alb,ps->baseseq->seq,ps->baseseq->name,ps->baseseq->offset,gent,ct,15,main_block,TRUE,ofp); free_Protein(ps); fprintf(ofp,"%s\n",divide_str); } if( show_match_sum == TRUE ) { show_MatchSummary_genewise_header(ofp); show_MatchSummarySet_genewise(mss,ofp); fprintf(ofp,"%s\n",divide_str); } if( show_pretty_gene == TRUE ) { show_pretty_GenomicRegion(gr,0,ofp); fprintf(ofp,"%s\n",divide_str); } if( show_trans == TRUE ) { for(k=0;klen;k++) { if( gr->gene[k]->ispseudo == TRUE ) { fprintf(ofp,"#Gene %d is a pseudo gene - no translation possible\n",k); } else { trans = get_Protein_from_Translation(gr->gene[k]->transcript[0]->translation[0],ct); write_fasta_Sequence(trans->baseseq,ofp); } } fprintf(ofp,"%s\n",divide_str); } if( show_pep == TRUE ) { alt = alb->start; for(;alt != NULL;) { trans = Protein_from_GeneWise_AlnColumn(gent->baseseq,alt,1,&alt,ct,is_random_AlnColumn_genewise); if ( trans == NULL ) break; write_fasta_Sequence(trans->baseseq,ofp); free_Protein(trans); } fprintf(ofp,"%s\n",divide_str); } if( show_cdna == TRUE ) { for(k=0;klen;k++) { cdna = get_cDNA_from_Transcript(gr->gene[k]->transcript[0]); write_fasta_Sequence(cdna->baseseq,ofp); } fprintf(ofp,"%s\n",divide_str); } if( show_ace == TRUE ) { show_ace_GenomicRegion(gr,gent->baseseq->name,ofp); fprintf(ofp,"%s\n",divide_str); } if( show_gff == TRUE ) { show_GFF_GenomicRegion(gr,gent->baseseq->name,"GeneWise",ofp); fprintf(ofp,"%s\n",divide_str); } if( show_gene_plain == TRUE ) { show_GenomicRegion(gr,ofp); fprintf(ofp,"%s\n",divide_str); } if( show_AlnBlock == TRUE ) { mapped_ascii_AlnBlock(alb,Score2Bits,0,ofp); fprintf(ofp,"%s\n",divide_str); } if( show_PackAln == TRUE ) { show_simple_PackAln(pal,ofp); fprintf(ofp,"%s\n",divide_str); } alb = free_AlnBlock(alb); pal = free_PackAln(pal); mss = free_MatchSummarySet(mss); gr = free_GenomicRegion(gr); gent = free_Genomic(gent); temptsm = free_ThreeStateModel(temptsm); } if( complete == TRUE ) { fprintf(ofp,"\n\n#Complete Analysis\n"); fprintf(ofp,"-------------------------------------------------------------\n\n"); sort_GenomicRegion_absolute(cgr); if( complete_ace == TRUE ) { show_ace_GenomicRegion(cgr,cgr->genomic->baseseq->name,ofp); fprintf(ofp,"%s\n",divide_str); } if( complete_gene == TRUE ) { show_pretty_GenomicRegion(cgr,0,ofp); fprintf(ofp,"%s\n",divide_str); } if( complete_cdna == TRUE ) { dump_transcripts_GenomicRegion(cgr,ofp); fprintf(ofp,"%s\n",divide_str); } if( complete_trans == TRUE ) { dump_translations_GenomicRegion(cgr,ct,ofp); fprintf(ofp,"%s\n",divide_str); } if( show_diana == TRUE ) { write_Diana_FT_GenomicRegion(cgr,ofp); } if( show_embl == TRUE ) { write_Embl_FT_GenomicRegion(cgr,ofp); } if( show_acehalf == TRUE ) { show_halfwise_GenomicRegion(cgr,cgr->genomic->baseseq->name,"HALFWISE","PFAM",1,"Pfam-Sanger",ofp); } } return TRUE; } boolean show_header(FILE * ofp) { fprintf(ofp,"Wise2 - database searching mode\n"); fprintf(ofp,"Program: %s version: %s released: %s\n",program_name,VERSION_NUMBER,RELEASE_DAY); fprintf(ofp,"This program is freely distributed under a Gnu Public License.\n"); fprintf(ofp," See -version for more info on copyright\n"); fprintf(ofp,"Bugs and credits to Ewan Birney \n"); fprintf(ofp,"-----------------------------------------------------\n\n"); fprintf(ofp,"Algorithm type: GeneWise\n"); fprintf(ofp,"Search algorithm used: %s\n",alg_str); fprintf(ofp,"Implementation: %s\n",impl_string_DBSearchImpl(dbsi)); fprintf(ofp,"Search mode: %s\n",search_mode == PG_SEARCH_S2DB ? "Single protein vs genomic db" : search_mode == PG_SEARCH_DB2S ? "Single genomic vs protein db" : "Protein db vs genomic db"); fprintf(ofp,"Protein info from: %s\n",protein_file); fprintf(ofp,"Dna info from: %s\n",dna_seq_file); if( use_single_pro == TRUE || use_db_pro == TRUE ) { fprintf(ofp,"Comp Matrix: %s\n",matrix_file); fprintf(ofp,"Gap open: %d\n",gap); fprintf(ofp,"Gap extension: %d\n",ext); } fprintf(ofp,"Start/End (protein) %s\n",startend_string); fprintf(ofp,"Gene Paras: %s\n",gene_file); fprintf(ofp,"Codon Table: %s\n",codon_file); fprintf(ofp,"Subs error: %2.2g\n",subs_error); fprintf(ofp,"Indel error: %2.2g\n",indel_error); fprintf(ofp,"Model splice? %s\n",splice_string); fprintf(ofp,"Model codon bias? %s\n",cfreq_string); fprintf(ofp,"Model intron bias? %s\n",intron_string); fprintf(ofp,"Null model %s\n",null_string); fprintf(ofp,"Alignment Alg %s\n",aln_alg_str); return TRUE; } boolean search_db(void) { info("Starting search..."); hs = Hscore_from_TSM_genewise(tsmdb,gdb,gpara,rmd,rmd,use_syn,alg,search_cutoff,allN,report_stagger,FALSE,flat_insert,dbsi); if( hs == NULL ) { return FALSE; } return TRUE; } boolean build_db_objects(void) { gpara = GeneParameter21_wrap(gf,subs_error,indel_error,rmd,model_codon,model_splice,use_tied_model,ct,rnd_loop,cds_loop,rnd_to_model,link_loop,link_to_model); if( gpara == NULL ) { warn("Sorry - could not build gene parameters. Must be a bug of some sort"); return FALSE; } if( prepare_ComplexSequenceEvalSet(gpara->cses) == FALSE ) { warn("Unable to prepare complexsequenceevalset"); return FALSE; } if( alg == GWWRAP_6LITE ) { /* memory leak here. Apologies */ gpara->cses->cse[1] = codon64_number_ComplexSequenceEval(); } if( use_single_dna == TRUE ) { gdb = new_GenomicDB_from_single_seq(gen,gpara->cses,Probability2Score(rep_prob)); } else { gdb = new_GenomicDB(sdb,gpara->cses,length_of_N,Probability2Score(rep_prob)); } if( gdb == NULL ) { return FALSE; } return TRUE; } boolean build_objects(void) { boolean ret = TRUE; Protein * pro_temp; SequenceDB * psdb; startend = threestatemodel_mode_from_string(startend_string); if( startend == TSM_unknown ) { warn("String %s was unable to converted into a start/end policy\n",startend_string); ret = FALSE; } if( use_single_dna == TRUE ) { gen = read_fasta_file_Genomic(dna_seq_file,length_of_N); if( gen == NULL ) { warn("Could not open single dna sequence in %s",dna_seq_file); ret = FALSE; } } else { sdb = single_fasta_SequenceDB(dna_seq_file); if( sdb == NULL ) { warn("Could not build a sequence database on %s",dna_seq_file); ret = FALSE; } } rm = default_RandomModel(); if( (mat = read_Blast_file_CompMat(matrix_file)) == NULL) { if( use_tsm == TRUE ) { info("I could not read the Comparison matrix file in %s; however, you are using a HMM so it is not needed. Please set the WISECONFIGDIR or WISEPERSONALDIR variable correctly to prevent this message.",matrix_file); } else { warn("Could not read Comparison matrix file in %s",matrix_file); ret = FALSE; } } if( is_integer_string(gap_str,&gap) == FALSE ) { warn("Could not get gap string number %s",gap_str); ret = FALSE; } if( is_integer_string(ext_str,&ext) == FALSE ) { warn("Could not get ext string number %s",ext_str); ret = FALSE; } if( qstart_str != NULL ) { if( is_integer_string(qstart_str,&qstart) == FALSE || qstart < 0) { warn("Could not make %s out as query start",qstart); ret = FALSE; } } if( qend_str != NULL ) { if( is_integer_string(qend_str,&qend) == FALSE || qend < 0) { warn("Could not make %s out as query end",qend); ret = FALSE; } } if( aln_number_str != NULL ) { if( is_integer_string(aln_number_str,&aln_number) == FALSE || aln_number < 0) { warn("Weird aln number string %s...\n",aln_number_str); ret = FALSE; } } if( report_str != NULL ) { if( is_integer_string(report_str,&report_stagger) == FALSE ) { warn("Weird report stagger asked for %s",report_str); ret = FALSE; } } if( use_pfam1 == TRUE ) { tsmdb = new_PfamHmmer1DB_ThreeStateDB(protein_file); if( set_search_type_ThreeStateDB(tsmdb,startend_string) == FALSE) { warn("Unable to set global/local switch on threestatedb"); ret = FALSE; } } else if ( use_pfam2 == TRUE ) { tsmdb = HMMer2_ThreeStateDB(protein_file); if( set_search_type_ThreeStateDB(tsmdb,startend_string) == FALSE) { warn("Unable to set global/local switch on threestatedb"); ret = FALSE; } } else if ( use_tsm == TRUE) { /** using a HMM **/ /*tsm = read_HMMer_1_7_ascii_file(hmm_file);*/ /* tsm = Wise2_read_ThreeStateModel_from_hmmer1_file(protein_file);*/ tsm = HMMer2_read_ThreeStateModel(protein_file); if( tsm == NULL ) { warn("Could not read hmm from %s\n",protein_file); ret = FALSE; } else { display_char_in_ThreeStateModel(tsm); if( hmm_name != NULL ) { if( tsm->name != NULL ) ckfree(tsm->name); tsm->name = stringalloc(hmm_name); } else { if( tsm->name == NULL ) { tsm->name = stringalloc(protein_file); } } set_startend_policy_ThreeStateModel(tsm,startend,30,0.2); tsmdb = new_single_ThreeStateDB(tsm,rm); if( tsmdb == NULL ) { warn("Could not build a threestatemodel database from a single tsm. Weird!"); ret = FALSE; } } /* end of else tsm != NULL */ } /* end of else is tsm */ else if( use_single_pro ) { if( startend != TSM_default && startend != TSM_global && startend != TSM_local ) { warn("Proteins can only have local/global startend policies set, not %s",startend_string); ret = FALSE; } if( (pro = read_fasta_file_Protein(protein_file)) == NULL ) { ret = FALSE; warn("Could not read Protein sequence in %s",protein_file); } else { if( qstart != -1 || qend != -1 ) { if( qstart == -1 ) qstart = 0; if( qend == -1 ) qend = pro->baseseq->len; pro_temp = truncate_Protein(pro,qstart-1,qend); if( pro_temp == NULL ){ ret = FALSE; } else { free_Protein(pro); pro = pro_temp; } } if( startend == TSM_global) tsm = global_ThreeStateModel_from_half_bit_Sequence(pro,mat,rm,-gap,-ext); else tsm = ThreeStateModel_from_half_bit_Sequence(pro,mat,rm,-gap,-ext); if( tsm == NULL ) { warn("Could not build ThreeStateModel from a single protein sequence..."); ret = FALSE; } else { tsmdb = new_single_ThreeStateDB(tsm,rm); if( tsmdb == NULL ) { warn("Could not build a threestatemodel database from a single tsm. Weird!"); ret = FALSE; } } /* end of could build a TSM */ } /* else is a real protein */ } /* end of else is single protein */ else if (use_db_pro == TRUE ) { psdb = single_fasta_SequenceDB(protein_file); tsmdb = new_proteindb_ThreeStateDB(psdb,mat,-gap,-ext); if( set_search_type_ThreeStateDB(tsmdb,startend_string) == FALSE) { warn("Unable to set global/local switch on threestatedb"); ret = FALSE; } free_SequenceDB(psdb); } else { warn("No protein input file! Yikes!"); } /*** if( use_tsm == FALSE ) { } else { ****/ if( main_block_str != NULL ) { if( is_integer_string(main_block_str,&main_block) == FALSE ) { warn("Could not get maximum main_block number %s",main_block_str); ret = FALSE; } } if( alg_str != NULL ) { alg = gwrap_alg_type_from_string(alg_str); } else { alg_str = "623"; alg = gwrap_alg_type_from_string(alg_str); } if( aln_alg_str != NULL ) { aln_alg = gwrap_alg_type_from_string(aln_alg_str); } else { if( use_single_pro == TRUE || use_db_pro == TRUE) { aln_alg_str = "623"; } else { aln_alg_str = "623L"; } aln_alg = gwrap_alg_type_from_string(aln_alg_str); } if( evalue_search_str != NULL && is_double_string(evalue_search_str,&evalue_search_cutoff) == FALSE ) { warn("Could not convert %s to a double",evalue_search_str); ret = FALSE; } if( is_double_string(search_cutoff_str,&search_cutoff) == FALSE ) { warn("Could not convert %s to a double",search_cutoff_str); ret = FALSE; } if( is_double_string(subs_string,&subs_error) == FALSE ) { warn("Could not convert %s to a double",subs_error); ret = FALSE; } if( is_double_string(indel_string,&indel_error) == FALSE ) { warn("Could not convert %s to a double",indel_error); ret = FALSE; } if( is_double_string(allN_string,&allN) == FALSE ) { warn("Could not convert %s to a double",allN_string); ret = FALSE; } if( strcmp(cfreq_string,"model") == 0 ) { model_codon = TRUE; } else if ( strcmp(cfreq_string,"flat") == 0 ) { model_codon = FALSE; } else { warn("Cannot interpret [%s] as a codon modelling parameter\n",cfreq_string); ret = FALSE; } if( strcmp(splice_string,"model") == 0 ) { model_splice = TRUE; } else if ( strcmp(splice_string,"flat") == 0 ) { model_splice = FALSE; } else { warn("Cannot interpret [%s] as a splice modelling parameter\n",splice_string); ret = FALSE; } if( strcmp(null_string,"syn") == 0 ) { use_syn = TRUE; } else if ( strcmp(null_string,"flat") == 0 ) { use_syn = FALSE; } else { warn("Cannot interpret [%s] as a null model string\n",null_string); ret = FALSE; } if( strcmp(intron_string,"model") == 0 ) { use_tied_model = FALSE; } else if ( strcmp(intron_string,"tied") == 0 ) { use_tied_model = TRUE; } else { warn("Cannot interpret [%s] as a intron tieing switch\n",intron_string); ret = FALSE; } if( (rm = default_RandomModel()) == NULL) { warn("Could not make default random model\n"); ret = FALSE; } if( (gf = read_GeneFrequency21_file(gene_file)) == NULL) { ret = FALSE; warn("Could not read a GeneFrequency file in %s",gene_file); } if( (ct = read_CodonTable_file(codon_file)) == NULL) { ret = FALSE; warn("Could not read codon table file in %s",codon_file); } if( (ofp = openfile(output_file,"W")) == NULL) { warn("Could not open %s as an output file",output_file); ret = FALSE; } rmd = RandomModelDNA_std(); return ret; } void show_short_help(void) { fprintf(stdout,"%s (%s)\n",program_name,VERSION_NUMBER); fprintf(stdout,"This program is freely distributed under a GPL. See -version for more info\n"); fprintf(stdout,"Copyright (c) GRL limited: portions of the code are from separate copyrights\n\n"); fprintf(stdout,"swisepg in fasta format\n"); fprintf(stdout," Options. In any order, '-' as filename (for any input) means stdin\nDon't use stdin for databases, as on-the-fly indexing is used\n"); fprintf(stdout," Protein type [-protein,-prodb,-hmmer,-pfam] [default - protein]\n"); fprintf(stdout," Dna type [-dnas,-dnadb] [default - dnadb]\n"); fprintf(stdout," Protein [-s,-t,-g,-e,-m]\n HMM [-hmmer,-hname]\n"); fprintf(stdout," Model [-codon,-gene,-cfreq,-splice,-subs,-indel,-intron,-null]\n Alg [-kbyte,-alg,-aalg,-aln,-noh]\n"); fprintf(stdout," Output [-pretty,-genes,-para,-sum,-cdna,-trans,-ace,]\n"); fprintf(stdout," ..cont [-gff,-gener,-alb,-pal,-block,-divide]\n"); fprintf(stdout," Standard [-help,-version,-silent,-quiet,-errorlog]\n"); fprintf(stdout,"\nFor more help go %s -help.\n",program_name); fprintf(stdout,"\nSee WWW help at http://www.sanger.ac.uk/Software/Wise2/\n"); exit(63); } void show_help(FILE * ofp) { fprintf(ofp,"%s (%s)\n",program_name,VERSION_NUMBER); fprintf(ofp,"%s \n",program_name); /* program specific help */ fprintf(ofp,"Protein input type\n"); fprintf(ofp," -protein [default] single protein\n"); fprintf(ofp," -prodb protein fasta format db\n"); fprintf(ofp," -pfam pfam hmm library\n"); fprintf(ofp," -pfam2 pfam old style model directory (2.1) \n"); fprintf(ofp," -hmmer single hmmer HMM (version 2 compatible)\n"); fprintf(ofp,"DNA input type\n"); fprintf(ofp," -dnadb [default] dna fasta database\n"); fprintf(ofp," -dnas a single dna fasta sequence\n"); fprintf(ofp,"Protein comparison options\n"); fprintf(ofp," -gap [%3d] gap penalty\n",gap); fprintf(ofp," -ext [%3d] extension penalty\n",ext); fprintf(ofp," -matrix [%s] Comparison matrix\n",matrix_file); fprintf(ofp,"HMM options\n"); fprintf(ofp," -hname For single hmms, use this as the name, not filename\n"); fprintf(ofp,"Gene Model options\n"); fprintf(ofp," -init [%s] [default/global/local/wing/endbias] start-end policy\n",startend_string); fprintf(ofp," -codon [%s] Codon file\n",codon_file); fprintf(ofp," -gene [%s] Gene parameter file\n",gene_file); fprintf(ofp," -subs [%2.2g] Substitution error rate\n",subs_error); fprintf(ofp," -indel [%2.2g] Insertion/deletion error rate\n",indel_error); fprintf(ofp," -cfreq [model/flat] Using codon bias or not? [default flat]\n"); fprintf(ofp," -splice [model/flat] Using splice model or GT/AG? [default model]\n"); fprintf(ofp," -intron [model/tied] Use tied model for introns [default tied]\n"); fprintf(ofp," -null [syn/flat] Random Model as synchronous or flat [default syn]\n"); fprintf(ofp," -insert [model/flat] Use protein insert model [default flat]\n"); fprintf(ofp,"Algorithm options\n"); fprintf(ofp," -alg [623/2193/] Algorithm used for searching [default 623]\n"); fprintf(ofp," -aalg [623/623L/2193/2193L] Algorithm used for alignment [default 623/623L]\n"); fprintf(ofp," -cut [%.2f] Bits cutoff for reporting in search algorithm\n",search_cutoff); fprintf(ofp," -ecut [n/a] Evalue cutoff for single protein vs DNA searches.\n"); fprintf(ofp," -aln [%d] Max number of alignments (even if above cut)\n",aln_number); fprintf(ofp," -alln [%s] Probability of matching a NNN codon (only for single HMM vs DNAdb)\n",allN_string); fprintf(ofp," -nohis Don't show histogram on single protein/hmm vs DNA search\n"); fprintf(ofp," -report [0] Issue a report every x comparisons (default 0 comparisons)\n"); fprintf(ofp,"Output options [default -pretty -para]\n"); fprintf(ofp," -pretty show pretty ascii output\n"); fprintf(ofp," -genes show gene structure\n"); fprintf(ofp," -para show parameters\n"); fprintf(ofp," -sum show summary output\n"); fprintf(ofp," -cdna show cDNA\n"); fprintf(ofp," -trans show protein translation\n"); fprintf(ofp," -ace ace file gene structure\n"); fprintf(ofp," -gff Gene Feature Format file\n"); fprintf(ofp," -gener raw gene structure\n"); fprintf(ofp," -alb show logical AlnBlock alignment\n"); fprintf(ofp," -pal show raw matrix alignment\n"); fprintf(ofp," -block [%s] Length of main block in pretty output\n",main_block_str); fprintf(ofp," -divide [%s] divide string for multiple outputs\n",divide_str); fprintf(ofp,"Output for complete analysis (only available for single dna seq vs proteindb)\n"); fprintf(ofp," -pseudo Mark genes with frameshifts as pseudogenes\n"); fprintf(ofp," -ctrans provide all translations\n"); fprintf(ofp," -ccdna provide all cdna\n"); fprintf(ofp," -cgene provide all gene structures\n"); fprintf(ofp," -cace provide all gene structures in ace format\n"); fprintf(ofp," -cdiana provide all gene structures in diana EMBL FT format\n"); fprintf(ofp," -cembl provide all gene structures in EMBL FT format\n"); fprintf(ofp," -caceh provide all gene structures in halfwise type ace format\n"); show_help_DBSearchImpl(ofp); show_help_DPRunImpl(ofp); show_standard_options(ofp); fprintf(ofp,"\nSee WWW help at http://www.sanger.ac.uk/Software/Wise2/\n"); exit(63); } void show_version(FILE * ofp) { fprintf(ofp,"%s\nVersion: %s\nReleased: %s\nCompiled: %s\n",program_name,VERSION_NUMBER,RELEASE_DAY,COMPILE_DATE); fprintf(ofp,"\nThis program is freely distributed under a Gnu Public License\n"); fprintf(ofp,"The source code is copyright (c) GRL 1998 and others\n"); fprintf(ofp,"There is no warranty, implied or otherwise on the performance of this program\n"); fprintf(ofp,"For more information read the GNULICENSE file in the distribution\n\n"); fprintf(ofp,"Credits: Ewan Birney wrote the core code.\n"); fprintf(ofp," Portions of this code was from HMMer2, written by Sean Eddy\n"); exit(63); } void build_defaults(void) { gene_file = "human.gf"; codon_file = "codon.table"; matrix_file = "blosum62.bla"; } int main(int argc,char ** argv) { int i; char * temp; build_defaults(); strip_out_standard_options(&argc,argv,show_help,show_version); if( (temp = strip_out_assigned_argument(&argc,argv,"gap")) != NULL ) gap_str = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"g")) != NULL ) gap_str = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"ext")) != NULL ) ext_str = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"e")) != NULL ) ext_str = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"matrix")) != NULL ) matrix_file = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"m")) != NULL ) matrix_file = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"s")) != NULL ) qstart_str = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"t")) != NULL ) qend_str = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"aln")) != NULL ) aln_number_str = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"codon")) != NULL ) codon_file = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"gene")) != NULL ) gene_file = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"alg")) != NULL ) alg_str = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"aalg")) != NULL ) aln_alg_str = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"cut")) != NULL ) search_cutoff_str = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"ecut")) != NULL ) evalue_search_str = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"subs")) != NULL ) subs_string = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"indel")) != NULL ) indel_string = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"alln")) != NULL ) allN_string = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"cfreq")) != NULL ) cfreq_string = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"splice")) != NULL ) splice_string = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"init")) != NULL ) startend_string = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"null")) != NULL ) null_string = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"intron")) != NULL ) intron_string = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"insert")) != NULL ) { if( strcmp(temp,"flat") == 0 ) { flat_insert = TRUE; } else { flat_insert = FALSE; } } if( (temp = strip_out_assigned_argument(&argc,argv,"report")) != NULL ) report_str = temp; pseudo = strip_out_boolean_argument(&argc,argv,"pseudo"); if( (strip_out_boolean_argument(&argc,argv,"dnas")) == TRUE ) use_single_dna = TRUE; if( (strip_out_boolean_argument(&argc,argv,"dnadb")) == TRUE ) use_single_dna = FALSE; if( (strip_out_boolean_argument(&argc,argv,"hmmer")) == TRUE ) use_tsm = TRUE; if( (strip_out_boolean_argument(&argc,argv,"pfam2")) == TRUE ) use_pfam1 = TRUE; if( (strip_out_boolean_argument(&argc,argv,"pfam")) == TRUE ) use_pfam2 = TRUE; if( (strip_out_boolean_argument(&argc,argv,"protein")) == TRUE ) use_single_pro = TRUE; if( (strip_out_boolean_argument(&argc,argv,"prodb")) == TRUE ) use_db_pro = TRUE; if( (strip_out_boolean_argument(&argc,argv,"intie")) == TRUE ) use_tied_model = TRUE; if( (temp = strip_out_assigned_argument(&argc,argv,"hname")) != NULL ) hmm_name = temp; if( (strip_out_boolean_argument(&argc,argv,"nohis")) != FALSE ) show_histogram = FALSE; if( (strip_out_boolean_argument(&argc,argv,"pretty")) != FALSE ) show_pretty = TRUE; if( (strip_out_boolean_argument(&argc,argv,"gff")) != FALSE ) show_gff = TRUE; if( (strip_out_boolean_argument(&argc,argv,"genes")) != FALSE ) show_pretty_gene = TRUE; if( (strip_out_boolean_argument(&argc,argv,"para")) != FALSE ) show_para = TRUE; if( (strip_out_boolean_argument(&argc,argv,"trans")) != FALSE ) show_trans = TRUE; if( (strip_out_boolean_argument(&argc,argv,"pep")) != FALSE ) show_pep = TRUE; if( (strip_out_boolean_argument(&argc,argv,"cdna")) != FALSE ) show_cdna = TRUE; if( (strip_out_boolean_argument(&argc,argv,"sum")) != FALSE ) show_match_sum = TRUE; if( (strip_out_boolean_argument(&argc,argv,"alb")) != FALSE ) show_AlnBlock = TRUE; if( (strip_out_boolean_argument(&argc,argv,"ace")) != FALSE ) show_ace = TRUE; if( (strip_out_boolean_argument(&argc,argv,"pal")) != FALSE ) show_PackAln = TRUE; if( (strip_out_boolean_argument(&argc,argv,"gener")) != FALSE ) show_gene_plain = TRUE; if( (strip_out_boolean_argument(&argc,argv,"over")) != FALSE ) show_overlap = TRUE; if( (temp = strip_out_assigned_argument(&argc,argv,"divide")) != NULL ) divide_str = temp; if( (temp = strip_out_assigned_argument(&argc,argv,"block")) != NULL ) main_block_str = temp; complete_trans = strip_out_boolean_argument(&argc,argv,"ctrans") ; complete_gene = strip_out_boolean_argument(&argc,argv,"cgene") ; complete_ace = strip_out_boolean_argument(&argc,argv,"cace") ; complete_cdna = strip_out_boolean_argument(&argc,argv,"ccdna"); if( (strip_out_boolean_argument(&argc,argv,"cdiana")) != FALSE ) show_diana = TRUE; if( (strip_out_boolean_argument(&argc,argv,"cembl")) != FALSE ) show_embl = TRUE; if( (strip_out_boolean_argument(&argc,argv,"caceh")) != FALSE ) show_acehalf = TRUE; dbsi = new_DBSearchImpl_from_argv(&argc,argv); dpri = new_DPRunImpl_from_argv(&argc,argv); strip_out_remaining_options_with_warning(&argc,argv); if( argc != 3 ) { warn("Wrong number of arguments (expect 2)!\n"); if( argc > 1 ){ warn("Arg line looked like (after option processing)"); for(i=1;i