/************************************************************ * HMMER - Biological sequence analysis with profile-HMMs * Copyright (C) 1992-1998 Washington University School of Medicine * * This source code is distributed under the terms of the * GNU General Public License. See the files COPYING and * GNULICENSE for details. * ************************************************************/ /* hmmsearch.c * SRE, Tue Jan 7 17:19:20 1997 * * Search a sequence database with a profile HMM. * RCS $Id: hmmsearch.c,v 1.1.1.1 2001/06/18 13:59:50 birney Exp $ */ #include #include #include #include #include #include "structs.h" /* data structures, macros, #define's */ #include "config.h" /* compile-time configuration constants */ #include "funcs.h" /* function declarations */ #include "globals.h" /* alphabet global variables */ #include "squid.h" /* general sequence analysis library */ #ifdef MEMDEBUG #include "dbmalloc.h" #endif static char banner[] = "hmmsearch - search a sequence database with a profile HMM"; static char usage[] = "\ Usage: hmmsearch [-options] \n\ Available options are:\n\ -h : help; print brief help on version and usage\n\ -A : sets alignment output limit to best domain alignments\n\ -E : sets E value cutoff (globE) to \n\ -T : sets T bit threshold (globT) to \n\ -Z : sets Z (# seqs) for E-value calculation\n\ "; static char experts[] = "\ --domE : sets domain Eval cutoff (2nd threshold) to \n\ --domT : sets domain T bit thresh (2nd threshold) to \n\ --forward : use the full Forward() algorithm instead of Viterbi\n\ --null2 : turn OFF the post hoc second null model\n\ --xnu : turn ON XNU filtering of target protein sequences\n\ "; static struct opt_s OPTIONS[] = { { "-h", TRUE, sqdARG_NONE }, { "-A", TRUE, sqdARG_INT }, { "-E", TRUE, sqdARG_FLOAT}, { "-T", TRUE, sqdARG_FLOAT}, { "-Z", TRUE, sqdARG_INT }, { "--domE", FALSE, sqdARG_FLOAT}, { "--domT", FALSE, sqdARG_FLOAT}, { "--forward", FALSE, sqdARG_NONE }, { "--null2", FALSE, sqdARG_NONE }, { "--xnu", FALSE, sqdARG_NONE }, }; #define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) static void record_domains(struct tophit_s *h, struct plan7_s *hmm, char *dsq, SQINFO *sqinfo, struct p7trace_s *tr, double whole_pval, float whole_sc, int do_null2); int main(int argc, char **argv) { char *hmmfile; /* file to read HMM(s) from */ HMMFILE *hmmfp; /* opened hmmfile for reading */ char *seqfile; /* file to read target sequence(s) from */ SQFILE *sqfp; /* opened seqfile for reading */ int format; /* format of seqfile */ char *seq; /* target sequence */ SQINFO sqinfo; /* optional info for seq */ char *dsq; /* digitized target sequence */ int i; struct plan7_s *hmm; /* HMM to search with */ struct histogram_s *histogram;/* histogram of all scores */ struct p7trace_s *tr; /* traceback */ struct fancyali_s *ali; /* displayed alignment info */ struct tophit_s *ghit; /* list of top hits for whole sequences */ struct tophit_s *dhit; /* list of top hits for domains */ float sc; /* score of an HMM search */ double pvalue; /* pvalue of an HMM score */ double evalue; /* evalue of an HMM score */ double motherp; /* pvalue of a whole seq HMM score */ float mothersc; /* score of a whole seq parent of domain */ int sqfrom, sqto; /* coordinates in sequence */ int hmmfrom, hmmto; /* coordinate in HMM */ char *name, *desc; /* hit sequence name and description */ int sqlen; /* length of seq that was hit */ int nseq; /* number of sequences searched */ int Z; /* # of seqs for purposes of E-val calc */ int domidx; /* number of this domain */ int ndom; /* total # of domains in this seq */ int namewidth; /* max width of sequence name */ int Alimit; /* A parameter limiting output alignments */ float globT; /* T parameter: keep only hits > globT bits */ double globE; /* E parameter: keep hits < globE E-value */ float domT; /* T parameter for individual domains */ double domE; /* E parameter for individual domains */ char *optname; /* name of option found by Getopt() */ char *optarg; /* argument found by Getopt() */ int optind; /* index in argv[] */ int do_null2; /* TRUE to adjust scores with null model #2 */ int do_forward; /* TRUE to use Forward() not Viterbi() */ int do_xnu; /* TRUE to filter sequences thru XNU */ #ifdef MEMDEBUG unsigned long histid1, histid2, orig_size, current_size; orig_size = malloc_inuse(&histid1); fprintf(stderr, "[... memory debugging is ON ...]\n"); #endif /*********************************************** * Parse command line ***********************************************/ do_forward = FALSE; do_null2 = TRUE; do_xnu = FALSE; Z = 0; Alimit = INT_MAX; /* no limit on alignment output */ globE = 10.0; /* use a reasonable Eval threshold; */ globT = -FLT_MAX; /* but no bit threshold, */ domT = -FLT_MAX; /* no domain bit threshold, */ domE = FLT_MAX; /* and no domain Eval threshold. */ while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, &optind, &optname, &optarg)) { if (strcmp(optname, "-A") == 0) Alimit = atoi(optarg); else if (strcmp(optname, "-E") == 0) globE = atof(optarg); else if (strcmp(optname, "-T") == 0) globT = atof(optarg); else if (strcmp(optname, "-Z") == 0) Z = atoi(optarg); else if (strcmp(optname, "--domE") == 0) domE = atof(optarg); else if (strcmp(optname, "--domT") == 0) domT = atof(optarg); else if (strcmp(optname, "--forward") == 0) do_forward = TRUE; else if (strcmp(optname, "--null2") == 0) do_null2 = FALSE; else if (strcmp(optname, "--xnu") == 0) do_xnu = TRUE; else if (strcmp(optname, "-h") == 0) { Banner(stdout, banner); puts(usage); puts(experts); exit(0); } } if (argc - optind != 2) Die("Incorrect number of arguments.\n%s\n", usage); hmmfile = argv[optind++]; seqfile = argv[optind++]; /*********************************************** * Open sequence database (might be in BLASTDB or current directory) ***********************************************/ if (! SeqfileFormat(seqfile, &format, "BLASTDB")) switch (squid_errno) { case SQERR_NOFILE: Die("Sequence file %s could not be opened for reading", seqfile); break; case SQERR_FORMAT: default: Die("Failed to determine format of sequence file %s", seqfile); } if ((sqfp = SeqfileOpen(seqfile, format, "BLASTDB")) == NULL) Die("Failed to open sequence database file %s\n%s\n", seqfile, usage); /*********************************************** * Open HMM file (might be in HMMERDB or current directory). * Read a single HMM from it. (Config HMM, if necessary). ***********************************************/ if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL) Die("Failed to open HMM file %s\n%s", hmmfile, usage); if (!HMMFileRead(hmmfp, &hmm)) Die("Failed to read any HMMs from %s\n", hmmfile); if (hmm == NULL) Die("HMM file %s corrupt or in incorrect format? Parse failed", hmmfile); P7Logoddsify(hmm, !do_forward); /*********************************************** * Show the banner ***********************************************/ Banner(stdout, banner); printf( "HMM file: %s [%s]\n", hmmfile, hmm->name); printf( "Sequence database: %s\n", seqfile); printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n"); /*********************************************** * Search HMM against each sequence ***********************************************/ /* set up structures for storing output */ histogram = AllocHistogram(-200, 200, 100); /* keeps full histogram */ ghit = AllocTophits(200); /* per-seq hits: 200=lumpsize */ dhit = AllocTophits(200); /* domain hits: 200=lumpsize */ nseq = 0; while (ReadSeq(sqfp, format, &seq, &sqinfo)) { /* Silently skip empty sequences, which, God help us, appear * sometimes in wormpep. (AGB bug) */ if (sqinfo.len == 0) continue; nseq++; dsq = DigitizeSequence(seq, sqinfo.len); if (do_xnu) XNU(dsq, sqinfo.len); /* 1. Recover a trace by Viterbi. */ if (P7ViterbiSize(sqinfo.len, hmm->M) <= RAMLIMIT) sc = P7Viterbi(dsq, sqinfo.len, hmm, &tr); else sc = P7SmallViterbi(dsq, sqinfo.len, hmm, &tr); /* 2. If we're using Forward scores, do another DP * to get it; else, we already have a Viterbi score * in sc. */ if (do_forward) sc = P7Forward(dsq, sqinfo.len, hmm, NULL); if (do_null2) sc -= TraceScoreCorrection(hmm, tr, dsq); #if DEBUGLEVEL >= DEBUG_LOTS P7PrintTrace(stdout, tr, hmm, dsq); #endif /* 2. Store score/pvalue for global alignment; will sort on score. * Keep all domains in a significant sequence hit. * We can only make a lower bound estimate of E-value since * we don't know the final value of nseq yet. */ pvalue = PValue(hmm, sc); evalue = Z ? (double) Z * pvalue : (double) nseq * pvalue; if (sc > globT && evalue < globE) { RegisterHit(ghit, sc, pvalue, sc, 0., 0., /* no mother seq */ sqinfo.name, sqinfo.flags & SQINFO_DESC ? sqinfo.desc : NULL, 0,0,0, /* seq positions */ 0,0,0, /* HMM positions */ 0, TraceDomainNumber(tr), /* domain info */ NULL); /* alignment info */ record_domains(dhit, hmm, dsq, &sqinfo, tr, pvalue, sc, do_null2); } AddToHistogram(histogram, sc); FreeSequence(seq, &sqinfo); P7FreeTrace(tr); free(dsq); } /* We're done searching an HMM over the whole sequence database. * Set the theoretical EVD curve in our histogram using * calibration in the HMM, if available. */ if (hmm->flags & PLAN7_STATS) ExtremeValueSetHistogram(histogram, hmm->mu, hmm->lambda, histogram->lowscore, histogram->highscore, 1.0, 0); if (!Z) Z = nseq; /* set Z for good now that we're done. */ /* Now format and report our output */ /* 1. Report overall sequence hits (sorted on E-value) */ printf("\nQuery HMM: %s %s\n", hmm->name, hmm->desc != NULL ? hmm->desc : ""); if (hmm->flags & PLAN7_STATS) printf(" [HMM has been calibrated; E-values are empirical estimates]\n"); else printf(" [No calibration for HMM; E-values are upper bounds]\n"); FullSortTophits(ghit); namewidth = MAX(8, TophitsMaxName(ghit)); printf("\nScores for complete sequences (score includes all domains):\n"); printf("%-*s %-*s %7s %10s %3s\n", namewidth, "Sequence", 52-namewidth, "Description", "Score", "E-value", " N "); printf("%-*s %-*s %7s %10s %3s\n", namewidth, "--------", 52-namewidth, "-----------", "-----", "-------", "---"); for (i = 0; i < ghit->num; i++) { GetRankedHit(ghit, i, &pvalue, &sc, NULL, NULL, &name, &desc, NULL, NULL, NULL, /* sequence positions */ NULL, NULL, NULL, /* HMM positions */ NULL, &ndom, /* domain info */ NULL); /* alignment info */ evalue = pvalue * (double) Z; if (evalue < globE && sc > globT) printf("%-*s %-*.*s %7.1f %10.2g %3d\n", namewidth, name, 52-namewidth, 52-namewidth, desc != NULL ? desc : "", sc, evalue, ndom); else if (evalue >= globE) { if (i > 0) printf("\t[no more scores below E threshold]\n"); break; } else if (sc <= globT) { if (i > 0) printf("\t[no more scores above T threshold]"); break; } } if (i == 0) printf("\t[no hits above thresholds]\n"); /* 2. Report domain hits (also sorted on E-value) */ FullSortTophits(dhit); namewidth = MAX(8, TophitsMaxName(dhit)); printf("\nParsed for domains:\n"); printf("%-*s %7s %5s %5s %5s %5s %7s %8s\n", namewidth, "Sequence", "Domain ", "seq-f", "seq-t", "hmm-f", "hmm-t", "score", "E-value"); printf("%-*s %7s %5s %5s %5s %5s %7s %8s\n", namewidth, "--------", "-------", "-----", "-----", "-----", "-----", "-----", "-------"); for (i = 0; i < dhit->num; i++) { GetRankedHit(dhit, i, &pvalue, &sc, &motherp, &mothersc, &name, NULL, &sqfrom, &sqto, &sqlen, /* seq position info */ &hmmfrom, &hmmto, NULL, /* HMM position info */ &domidx, &ndom, /* domain info */ NULL); /* alignment info */ evalue = pvalue * (double) Z; if (motherp * (double) Z >= globE || mothersc <= globT) continue; else if (evalue < domE && sc > domT) printf("%-*s %3d/%-3d %5d %5d %c%c %5d %5d %c%c %7.1f %8.2g\n", namewidth, name, domidx, ndom, sqfrom, sqto, sqfrom == 1 ? '[' : '.', sqto == sqlen ? ']' : '.', hmmfrom, hmmto, hmmfrom == 1 ? '[':'.', hmmto == hmm->M ? ']' : '.', sc, evalue); else if (evalue >= domE) { if (i > 0) printf("\t[no more scores below domE threshold]\n"); break; } else if (sc <= domT) { if (i > 0) printf("\t[no more scores above domT threshold]\n"); break; } } if (i == 0) printf("\t[no hits above thresholds\n"); /* 3. Alignment output, also by domain. * dhits is already sorted and namewidth is set, from above code. * Number of displayed alignments is limited by Alimit parameter; * also by domE (evalue threshold), domT (score theshold). */ if (Alimit != 0) { printf("\nAlignments of top-scoring domains:\n"); for (i = 0; i < dhit->num; i++) { if (i == Alimit) break; /* limit to Alimit output alignments */ GetRankedHit(dhit, i, &pvalue, &sc, &motherp, &mothersc, &name, NULL, &sqfrom, &sqto, &sqlen, /* seq position info */ &hmmfrom, &hmmto, NULL, /* HMM position info */ &domidx, &ndom, /* domain info */ &ali); /* alignment info */ evalue = pvalue * (double) Z; if (motherp * (double) Z >= globE || mothersc <= globT) continue; else if (evalue < domE && sc > domT) { printf("%s: domain %d of %d, from %d to %d: score %.1f, E = %.2g\n", name, domidx, ndom, sqfrom, sqto, sc, evalue); PrintFancyAli(stdout, ali); } else if (evalue >= domE) { if (i > 0) printf("\t[no more alignments below domE threshold\n"); break; } else if (sc <= domT) { if (i > 0) printf("\t[no more alignments above domT threshold\n"); break; } } if (i == 0) printf("\t[no hits above thresholds\n"); if (i == Alimit) printf("\t[output cut off at A = %d top alignments]\n", Alimit); } /* 4. Histogram output */ printf("\nHistogram of all scores:\n"); PrintASCIIHistogram(stdout, histogram); /* 5. Tophits summaries, while developing... */ printf("\nWhole sequence top hits:\n"); TophitsReport(ghit, globE, nseq); printf("\nDomain top hits:\n"); TophitsReport(dhit, domE, nseq); /*********************************************** * Clean-up and exit. ***********************************************/ FreeHistogram(histogram); HMMFileClose(hmmfp); SeqfileClose(sqfp); FreeTophits(ghit); FreeTophits(dhit); FreePlan7(hmm); SqdClean(); #ifdef MEMDEBUG current_size = malloc_inuse(&histid2); if (current_size != orig_size) malloc_list(2, histid1, histid2); else fprintf(stderr, "[No memory leaks.]\n"); #endif return 0; } /* Function: record_domains() * Date: SRE, Tue Nov 4 11:25:14 1997 [St. Louis] * * Purpose: Decompose a trace, and register scores, P-values, alignments, * etc. for individual domains in a hitlist. * * Args: hmm - the HMM structure * dsq - digitized sequence 1..L * sqinfo - contains name of sequence, etc. * tr - traceback of the whole sequence aligned to HMM * whole_pval - P-value of complete alignment * whole_sc - score of complete alignment (bits) * do_null2 - TRUE to use post hoc null model correction * * Return: (void) */ static void record_domains(struct tophit_s *h, struct plan7_s *hmm, char *dsq, SQINFO *sqinfo, struct p7trace_s *tr, double whole_pval, float whole_sc, int do_null2) { struct p7trace_s **tarr; /* array of per-domain traces */ struct fancyali_s *ali; /* alignment of a domain */ int ntr; /* number of domain traces */ int idx; /* index for traces */ int k1, k2; /* start, stop coord in model */ int i1, i2; /* start, stop in sequence */ float score; double pvalue; TraceDecompose(tr, &tarr, &ntr); if (ntr == 0) Die("TraceDecompose() screwup"); /* "can't happen" (!) */ for (idx = 0; idx < ntr; idx++) { /* Get the score and bounds of the match. */ score = P7TraceScore(hmm, dsq, tarr[idx]); if (do_null2) score -= TraceScoreCorrection(hmm, tarr[idx], dsq); pvalue = PValue(hmm, score); TraceSimpleBounds(tarr[idx], &i1, &i2, &k1, &k2); #if DEBUGLEVEL >= DEBUG_LOTS P7PrintTrace(stdout, tarr[idx], hmm, dsq); #endif /* Record the match. Use score as the sort key. */ ali = CreateFancyAli(tarr[idx], hmm, dsq, sqinfo->name); RegisterHit(h, score, pvalue, score, whole_pval, whole_sc, sqinfo->name, sqinfo->flags & SQINFO_DESC ? sqinfo->desc : NULL, i1,i2, sqinfo->len, k1,k2, hmm->M, idx+1, ntr, ali); } for (idx = 0; idx < ntr; idx++) P7FreeTrace(tarr[idx]); free(tarr); return; }