/* * Permission to use, copy, modify, distribute, and sell this software * for any purpose and without fee, restriction or acknowledgement is * hereby granted. The author (James Knight of the Univ. of California, * Davis) places it in the public domain. * * This software is provided AS IS with no warranties of any kind. The * author shall have no liability with respect to the infringement of * copyrights, trade secrets or any patents by this software or any part * thereof. In no event will the author be liable for any lost revenue * or profits or other special, indirect and consequential damages. */ #include #include #include #include #include #include "seqio.h" #define ASCII 4 /* * The variables used in the matching algorithm. This algorithm uses * the bits of an unsigned integer to hold whether a match to that * position of the keyword occurs (i.e. if during the matching the * i'th bit is 1, then the text up to the current position matches * the first i positions of the keyword). */ char *keyword; /* the keyword string */ int num_errors; /* the number of allowed errors */ int misonly_flag; /* only mismatch errors or allow indels */ int ambiguous_flag; /* should ambiguous text chars. match */ int dna_flag; /* is the keyword valid for DNA */ unsigned int *dna_alpha_mask; /* bitmasks for the possible DNA chars. */ int prt_flag; /* is the keyword valid for Protein */ unsigned int *prt_alpha_mask;; /* bitmasks for the possible PRT chars. */ unsigned int *ascii_alpha_mask; /* bitmasks for the possible ASCII chars. (no flag because ASCII always works) */ int size; /* the size of the pattern (not necess. the length of keyword) */ unsigned int *masklist; /* dyn. allocated array used to hold the bits for current matches allowing different errors */ unsigned int accept_mask; /* bitmask used to determine if a match has been found. */ /* * The variables used to keep track of the output file, and to * make sure that the output consists of a single file format. * See function match_file for a description of their use. */ char *outfile, outformat[128]; FILE *output_fp; SEQFILE *sfpout; int isopened, origannoteflag; /* * Variables and data structures dealing with the alphabet. */ int alphabet; int dna_table[128], prt_table[128], ascii_table[128]; /* * Prototypes for the functions in this file. */ void match_file(char *filename, int list_mode); void prep_keyword(void); char *isa_match(SEQFILE *sfp, char *seq, int seqlen, int alpha, int list_mode); void init_tables(void); char *dna_expand(char ch); char *prt_expand(char ch); int guessalpha(char *seq, int seqlen); void *my_malloc(int bufsize); /* * * The Functions * */ void usage(void) { fprintf(stderr, "Usage: grepseq [-# | -m | -d | -p | -a | -l | -o filename]" " pattern files...\n"); exit(1); } int main(int argc, char *argv[]) { int i, inflag, list_mode; char *s; /* * Initialize all of the important variables. * inflag - specifies whether any files are listed on the command line. * alphabet - is the alphabet of the sequence known? */ inflag = 0; alphabet = UNKNOWN; num_errors = list_mode = 0; ambiguous_flag = 0; sfpout = NULL; output_fp = NULL; outfile = NULL; isopened = origannoteflag = 0; /* * Parse the options: * -# - The number of errors to allow. * -a - Ambiguous characters of the text should used in the matching. * -d - The text will be DNA or RNA. * -l - Only list the sequences that match. Don't output their entries. * -m - Only allow mismatch errors. No indels. * -o filename - The output file. * -p - The text will be Protein. * * The first argument without a '-' is the keyword. The rest are files. */ for (i=1; i < argc; i++) { if (argv[i][0] == '-') { if (isdigit(argv[i][1])) { for (s=argv[i]+1; *s && isdigit(*s); s++) { num_errors *= 10; num_errors += *s - '0'; } } else if (argv[i][1] == 'o') { /* * For the "-o" option, permit the filename to be specified either * immediately after the "-o" (like "-omatches.txt") or as the * next argument (like "-o matches.txt"). */ if (argv[i][2] != '\0') outfile = argv[i] + 2; else if (i+1 < argc) { outfile = argv[i+1]; i++; } else usage(); } else if (argv[i][1] == 'a' && argv[i][2] == '\0') ambiguous_flag = 1; else if (argv[i][1] == 'm' && argv[i][2] == '\0') misonly_flag = 1; else if (argv[i][1] == 'd' && argv[i][2] == '\0') alphabet = DNA; else if (argv[i][1] == 'p' && argv[i][2] == '\0') alphabet = PROTEIN; else if (argv[i][1] == 'l' && argv[i][2] == '\0') list_mode = 1; else usage(); } else if (keyword == NULL) keyword = argv[i]; else inflag = 1; } /* * A keyword must have been specified. */ if (keyword == NULL) usage(); /* * Initialize the alphabet tables and parse the keyword. */ init_tables(); prep_keyword(); /* * If no files were specified on the command line, use standard input. * Otherwise, rescan the arguments for the files. */ if (inflag == 0) match_file("-", list_mode); else { for (i=1; i < argc; i++) { if (argv[i][0] == '-' || argv[i] == keyword || argv[i] == outfile) continue; match_file(argv[i], list_mode); } } /* * Close the output file. */ if (sfpout != NULL) seqfclose(sfpout); else if (output_fp != NULL && output_fp != stdout) fclose(output_fp); return 0; } /* * match_file * * Search the sequences of a file or database for a keyword, and output * either a description (if list_mode is 1) or an annotated/converted * entry (if list_mode is 0). */ void match_file(char *filename, int list_mode) { static int buflen = 0; static char *buffer = NULL; int seqlen, entrylen, len, tempalpha, comlen, annoteflag; char *seq, *str, *entry, *format, linebuffer[81]; SEQINFO *info, tempinfo; SEQFILE *sfp; /* * Open the file or database. It must be in a format that can be * automatically determined by the SEQIO package. */ if ((sfp = seqfopen2(filename)) == NULL) return; format = seqfformat(sfp, 0); /* * Output Handling: * All of the output (from all of the calls to match_file) is * directed to the output file (or standard out). However, to * ensure that the output consists of a single file format, even * when the input formats are different, the opening of that output * must be delayed until the first input file's format is known. * * In the first call to match_file, the output handling determines * how the output should be structured. When list_mode is set, the * output file is simply opened using a stdio FILE structure. If * list_mode is off, the output handling first sets the output * format to be the format of that first file (or "FASTA" if that * format cannot be output by the SEQIO package). It also determines * whether entries in that format can be annotated. * * Then, for each file that is matched, the output handling checks * whether that file's format differs from the output format and/or * whether the entries of this file can be annotated (i.e., if the * output format supports annotation and this file's format is the * same as the output format). */ if (!isopened) { if (list_mode) { if (outfile == NULL) output_fp = stdout; else if ((output_fp = fopen(outfile, "w")) == NULL) { fprintf(stderr, "%s: %s\n", outfile, sys_errlist[errno]); exit(1); } } else { sfpout = seqfopen((outfile == NULL ? "-" : outfile), "w", (seqfcanwrite(format) ? format : "FASTA")); if (sfpout == NULL) exit(1); origannoteflag = seqfcanannotate(format); } isopened = 1; } annoteflag = (!list_mode && origannoteflag && strcmp(format, seqfformat(sfpout, 0)) == 0); /* * The main loop reading the sequences. */ while (seqfread(sfp, 0) == 0) { if ((seq = seqfsequence(sfp, &seqlen, 0)) == NULL || seqlen == 0) continue; /* * Try to figure out the alphabet, either using information from the * command line, from the entry or by looking at the sequence itself. */ if (alphabet != UNKNOWN) tempalpha = alphabet; else if ((tempalpha = seqfalphabet(sfp)) == UNKNOWN) tempalpha = guessalpha(seq, seqlen); /* * Match the keyword against the sequence. */ if ((str = isa_match(sfp, seq, seqlen, tempalpha, list_mode)) == NULL) continue; /* * If any matches occur, either annotate the entry or extract the * information about the sequence and either list the info or * convert the entry (depending on the value of list_mode). */ if (annoteflag) { entry = seqfentry(sfp, &entrylen, 0); seqfannotate(sfpout, entry, entrylen, str, 1); } else if ((info = seqfinfo(sfp, 0)) != NULL) { /* * In list mode, just output an identifier and the description * (and maybe the organism's name, if there's room). */ if (list_mode) { fputs("\n>>", output_fp); seqfoneline(info, linebuffer, 80, 0); fputs(linebuffer, output_fp); fputc('\n', output_fp); fputs(str, output_fp); fputc('\n', output_fp); } else { /* * If we're outputting an entry, append the string specifying * the matches to the comments occurring in the entry. * * Because the strings returned by the SEQIO package are dyn. * allocated and so their size cannot be increased, another * buffer may be necessary to hold both the entry's comments * and the new comment about the matches. */ if (info->comment == NULL) info->comment = str; else { comlen = strlen(info->comment); len = comlen + strlen(str) + 10; if (buflen < len) { if (buflen == 0) { buflen = len; buffer = (char *) malloc(buflen); } else { buflen += len; buffer = (char *) realloc(buffer, buflen); } if (buffer == NULL) { fprintf(stderr, "Memory Error: Ran out of memory.\n"); exit(1); } } strcpy(buffer, info->comment); buffer[comlen] = '\n'; strcpy(buffer+comlen+1, str); info->comment = buffer; } seqfwrite(sfpout, seq, seqlen, info); } } else { /* * If the entry information cannot be extracted, just create a * dummy entry and output something. */ memset(&tempinfo, 0, sizeof(SEQINFO)); tempinfo.idlist = "oth:Unknown"; tempinfo.description = "Unable to retrieve information from entry"; tempinfo.comment = str; seqfwrite(sfpout, seq, seqlen, &tempinfo); } } seqfclose(sfp); } /* * prep_keyword * * Parse the keyword and setup all of the data structures for the * bit-shifting match algorithm. * * When doing this parsing, perform the parsing for all three cases * where the alphabet is DNA/RNA, Protein and ASCII (since at this * point we don't know what the alphabet will be). Set the values * of dna_flag and prt_flag to 1 or 0 depending on whether the keyword * is valid for that alphabet (it's always valid for ASCII). */ void prep_keyword(void) { int i, num, state, word_bits, comp_mode; unsigned int mask; char *s, *t; word_bits = sizeof(unsigned int) * 8 - 1; /* * Scan the keyword string to find out how long the actual keyword is. * Since the keyword string can contain character classes, the keyword * size may not be the same as the keyword string size. */ for (s=keyword,size=0,state=0; *s; s++) { if (!state && *s == '[') state = 1; else if (state && *s == ']' && *(s-1) != '\\') { state = 0; size++; } else if (!state) size++; } if (state != 0 || size == 0) { fprintf(stderr, "Error: Cannot parse pattern `%s'.\n", keyword); exit(1); } if (num_errors >= size) { fprintf(stderr, "Error: Pattern `%s' of length %d, but %d errors specified.\n", keyword, size, num_errors); exit(1); } if (size >= word_bits) { fprintf(stderr, "Error: Cannot handle patterns whose length is longer than %d.\n", word_bits); exit(1); } /* * Allocate the bit masks for the various alphabetic characters. * A mask such as dna_alpha_mask[ch] will have a 1 in the i'th bit * (counting from least to most significant) if the i'th position * of the keyword specifies `ch' as a valid matching character. * So, for the keyword "AG[TC]A", bit 1 of dna_alpha_mask['a'] and * dna_alpha_mask['A'] are set, bit 2 of dna_alpha_mask['g'] and * dna_alpha_mask['G'] are set, and so on. * * Note that the bit counting in this comment starts from 0, and * bit 0 will never be set in any alpha_mask, because it corresponds * to the start state of the matching (i.e., the text ending at the * current position matches 0 positions of the keyword). */ dna_alpha_mask = (unsigned int *) my_malloc(128 * sizeof(unsigned int)); prt_alpha_mask = (unsigned int *) my_malloc(128 * sizeof(unsigned int)); ascii_alpha_mask = (unsigned int *) my_malloc(128 * sizeof(unsigned int)); for (i=0; i < 128; i++) dna_alpha_mask[i] = prt_alpha_mask[i] = ascii_alpha_mask[i] = 0; dna_flag = prt_flag = 1; for (s=keyword,mask=1; *s; s++) { mask <<= 1; switch (*s) { case '\\': /* * Allow backslash characters for ASCII keywords only. */ dna_flag = prt_flag = 0; s++; if (isdigit(*s) && *s != '8' && *s != '9') { num = *s - '0'; if (isdigit(s[1]) && s[1] != '8' && s[1] != '9') { num *= 10; num += *++s - '0'; if (isdigit(s[1]) && s[1] != '8' && s[1] != '9') { num *= 10; num += *++s - '0'; } } ascii_alpha_mask[num] |= mask; } else ascii_alpha_mask[(int) *s] |= mask; break; case '.': /* * A period will always match any character, regardless of the * alphabet (you can specify a period by backslashing it). */ for (i=0; i < 128; i++) { dna_alpha_mask[i] |= mask; prt_alpha_mask[i] |= mask; ascii_alpha_mask[i] |= mask; } break; case '[': /* * A '[' begins a character class. */ s++; comp_mode = 0; if (*s == '^') { comp_mode = 1; s++; } for ( ; *s != ']'; s++) { if (*s == '\\') { dna_flag = prt_flag = 0; s++; if (isdigit(*s) && *s != '8' && *s != '9') { num = *s - '0'; if (isdigit(s[1]) && s[1] != '8' && s[1] != '9') { num *= 10; num += *++s - '0'; if (isdigit(s[1]) && s[1] != '8' && s[1] != '9') { num *= 10; num += *++s - '0'; } } ascii_alpha_mask[num] |= mask; } else ascii_alpha_mask[(int) *s] |= mask; } else { if (dna_flag) { if ((t = dna_expand(*s)) == NULL) dna_flag = 0; else for ( ; *t; t++) dna_alpha_mask[(int) *t] |= mask; } if (prt_flag) { if ((t = prt_expand(*s)) == NULL) prt_flag = 0; else for ( ; *t; t++) prt_alpha_mask[(int) *t] |= mask; } ascii_alpha_mask[(int) *s] |= mask; } } if (comp_mode) { for (i=0; i < 128; i++) { if (dna_flag) dna_alpha_mask[i] ^= mask; if (prt_flag) prt_alpha_mask[i] ^= mask; ascii_alpha_mask[i] ^= mask; } } break; default: if (dna_flag) { if ((t = dna_expand(*s)) == NULL) dna_flag = 0; else for ( ; *t; t++) dna_alpha_mask[(int) *t] |= mask; } if (prt_flag) { if ((t = prt_expand(*s)) == NULL) prt_flag = 0; else for ( ; *t; t++) prt_alpha_mask[(int) *t] |= mask; } ascii_alpha_mask[(int) *s] |= mask; } /* * Set the accept_mask to have a 1 at the last bit of the * matching (i.e. for "AG[TC]A", bit 4 of accept_mask is set). * And dyn. allocate the structure to hold the current partial * matches if errors are allowed. */ accept_mask = mask; if (num_errors > 0) masklist = (unsigned int *) my_malloc((num_errors + 1) * sizeof(unsigned int)); } } /* * isa_match * * Scan the sequence for matches to the keyword. Construct a string * specifying the matches that are found, and return that string if there * are matches. Return NULL if no matches were found. * * This function only reports the first six matches that are found, and * it will stop the search upon finding that sixth match. */ char *isa_match(SEQFILE *sfp, char *seq, int seqlen, int alpha, int list_mode) { int i, j, score, mcount, lastpos, lastscore, *table; unsigned int *alpha_mask, chmask, last_mask, newmask; unsigned int masklist0, masklist1, masklist2; char ch, *s, *mptr; static char mstring[512]; /* * Check whether the keyword supports the sequence's alphabet. */ if (alpha == DNA || alpha == RNA) { if (!dna_flag) { fprintf(stderr, "Error: Invalid pattern `%s' for DNA sequences.\n", keyword); exit(1); } alpha_mask = dna_alpha_mask; table = dna_table; } else if (alpha == PROTEIN) { if (!prt_flag) { fprintf(stderr, "Error: Invalid pattern `%s' for Protein sequences.\n", keyword); exit(1); } alpha_mask = prt_alpha_mask; table = prt_table; } else { alpha_mask = ascii_alpha_mask; table = ascii_table; } /* * Setup the variables for the string being returned. */ mcount = 0; mptr = mstring; lastpos = lastscore = -1; masklist0 = masklist1 = masklist2 = 0; if (num_errors >= 3) { masklist[0] = 1; for (i=1; i <= num_errors; i++) masklist[i] = 0; } for (s=seq; *s; s++) { /* * If no errors are permitted, execute a simplified algorithm looking * for matches, using `current_mask' to hold the partial matches to * the text seen so far. * * To match the next character, first shift the current match bits * over one and then AND it with the alphabet mask for the next * text character. Thus, if the text upto the previous character * matches the first 4 positions of the keyword (and so bit 4 of * masklist0 is set) and the current character of the text * matches position 5 of the keyword (and so bit 5 of that * character's alpha_mask is set), then the shifting and ANDing * with result in bit 5 of masklist0 being set at the end of * the loop's iteration. * * When errors are allowed, the unsigned int's of `masklist' perform * the same function as current mask above, but where `masklist[j]' * gives the current set of matches allowing j errors. Thus, * masklist[2][6] is 1 when the text seen so far matches the first 6 * positions of the keyword with 2 errors. * * Initially, set bit 0 of masklist[0] to 1, and everything else to 0. * * I won't go into detail about this, except to say that `last_mask' * during the loops always holds the value of `masklist[j-1]' that * was computed during the previous iteration of the main loop * (i.e. the matches of the text ending at i-1 to the keyword with * j-1 errors). * * The variable score is used to determine the value of the best * match ending at position i of the text. */ if (num_errors == 0) { score = 0; while ((ch = *s) && table[(int) ch] && !((masklist0 = ((masklist0 << 1) & alpha_mask[(int) *s]) + 1) & accept_mask)) s++; } else if (num_errors == 1) { score = -1; if (misonly_flag) { while ((ch = *s) && table[(int) ch]) { chmask = alpha_mask[(int) ch]; last_mask = masklist0; masklist0 = ((masklist0 << 1) & chmask) + 1; masklist1 = ((masklist1 << 1) & chmask) | (last_mask << 1); if (masklist0 & accept_mask) { score = 0; break; } else if (masklist1 & accept_mask) { score = 1; break; } s++; } } else { while ((ch = *s) && table[(int) ch]) { chmask = alpha_mask[(int) ch]; last_mask = masklist0; masklist0 = ((masklist0 << 1) & chmask) + 1; masklist1 = ((masklist1 << 1) & chmask) | /* match */ (last_mask << 1) | /* mismatch */ last_mask | /* deletion of text */ (masklist0 << 1); /* insertion of pattern */ if (masklist0 & accept_mask) { score = 0; break; } else if (masklist1 & accept_mask) { score = 1; break; } s++; } } } else if (num_errors == 2) { score = -1; if (misonly_flag) { while ((ch = *s) && table[(int) ch]) { chmask = alpha_mask[(int) ch]; last_mask = masklist0; masklist0 = ((masklist0 << 1) & chmask) + 1; newmask = ((masklist1 << 1) & chmask) | (last_mask << 1); last_mask = masklist1; masklist1 = newmask; masklist2 = ((masklist2 << 1) & chmask) | (last_mask << 1); if (masklist0 & accept_mask) { score = 0; break; } else if (masklist1 & accept_mask) { score = 1; break; } else if (masklist2 & accept_mask) { score = 2; break; } s++; } } else { while ((ch = *s) && table[(int) ch]) { chmask = alpha_mask[(int) ch]; last_mask = masklist0; masklist0 = ((masklist0 << 1) & chmask) + 1; newmask = ((masklist1 << 1) & chmask) | /* match */ (last_mask << 1) | /* mismatch */ last_mask | /* deletion of text */ (masklist0 << 1); /* insertion of pattern */ last_mask = masklist1; masklist1 = newmask; masklist2 = ((masklist2 << 1) & chmask) | /* match */ (last_mask << 1) | /* mismatch */ last_mask | /* deletion of text */ (masklist1 << 1); /* insertion of pattern */ if (masklist0 & accept_mask) { score = 0; break; } else if (masklist1 & accept_mask) { score = 1; break; } else if (masklist2 & accept_mask) { score = 2; break; } s++; } } } else if (misonly_flag) { score = -1; while ((ch = *s) && table[(int) ch]) { chmask = alpha_mask[(int) ch]; last_mask = masklist[0]; masklist[0] = ((masklist[0] << 1) & chmask) + 1; score = (masklist[0] & accept_mask ? 0 : -1); for (j=1; j <= num_errors; j++) { newmask = ((masklist[j] << 1) & chmask) | (last_mask << 1); last_mask = masklist[j]; masklist[j] = newmask; if ((newmask & accept_mask) && score == -1) score = j; } if (score != -1) break; s++; } } else { score = -1; while ((ch = *s) && table[(int) ch]) { chmask = alpha_mask[(int) ch]; last_mask = masklist[0]; masklist[0] = ((masklist[0] << 1) & chmask) + 1; score = (masklist[0] & accept_mask ? 0 : -1); for (j=1; j <= num_errors; j++) { newmask = ((masklist[j] << 1) & chmask) | /* match */ (last_mask << 1) | /* mismatch */ last_mask | /* deletion of text */ (masklist[j-1] << 1); /* insertion of pattern */ last_mask = masklist[j]; masklist[j] = newmask; if ((newmask & accept_mask) && score == -1) score = j; } if (score != -1) break; s++; } } if (!ch) break; else if (!table[(int) ch]) { fprintf(stderr, "%s, entry %d: Sequence contains invalid characters.\n", seqffilename(sfp, 0), seqfentryno(sfp)); return NULL; } /* * If there's a match, add that match to the string being returned. */ i = s - seq + 1; if (lastpos == -1) { lastpos = i; lastscore = score; } else if (lastpos > i - size) { if (score < lastscore) { lastpos = i; lastscore = score; } } else { mcount++; if (mcount == 1) { if (list_mode) sprintf(mstring, "Matches: %d-%d (%d)", lastpos-size+1, lastpos, lastscore); else sprintf(mstring, "Pattern: %s\nMatches: %d-%d (%d)", keyword, lastpos-size+1, lastpos, lastscore); while (*mptr) mptr++; } else if (mcount <= 6) { sprintf(mptr, ", %d-%d (%d)", lastpos-size+1, lastpos, lastscore); while (*mptr) mptr++; if (mcount == 6) { *mptr++ = '.'; *mptr++ = '.'; *mptr++ = '.'; *mptr = '\0'; goto MATCH_END; } } lastpos = i; lastscore = score; } } if (lastpos != -1) { mcount++; if (mcount == 1) { if (list_mode) sprintf(mstring, "Matches: %d-%d (%d)", lastpos-size+1, lastpos, lastscore); else sprintf(mstring, "Pattern: %s\nMatches: %d-%d (%d)", keyword, lastpos-size+1, lastpos, lastscore); while (*mptr) mptr++; } else if (mcount <= 6) { sprintf(mptr, ", %d-%d (%d)", lastpos-size+1, lastpos, lastscore); while (*mptr) mptr++; if (mcount == 6) { *mptr++ = '.'; *mptr++ = '.'; *mptr++ = '.'; *mptr = '\0'; goto MATCH_END; } } } MATCH_END: if (mcount == 0) return NULL; else return mstring; } /* * init_tables * * This function just initializes tables specifying the value DNA, Protein * and ASCII characters (including ambiguous DNA and Protein characters). */ void init_tables(void) { int i; char *t; for (i=0; i < 128; i++) { dna_table[i] = 0; prt_table[i] = 0; ascii_table[i] = 1; } for (t="aAcCgGtTuUrRyYwWsSmMkKhHbBvVdDnN"; *t; t++) dna_table[(int) *t] = 1; for (t="aAcCdDeEfFgGhHiIkKlLmMnNpPqQrRsStTvVwWyYbBzZxX"; *t; t++) prt_table[(int) *t] = 1; } /* * dna_expand * * Given a character, it returns the set of characters that form a match * using the DNA alphabet and allowing matches of upper and lower case * characters. If the given character is not a valid DNA character, * it returns NULL. */ char *dna_expand(char ch) { if (ambiguous_flag) { switch (ch) { case 'a': case 'A': return "aArRwWmMhHvVdDnN"; case 'c': case 'C': return "cCyYsSmMhHbBvVnN"; case 'g': case 'G': return "gGrRsSkKbBvVdDnN"; case 't': case 'T': return "tTuUyYwWkKhHbBdDnN"; case 'u': case 'U': return "tTuUyYwWkKhHbBdDnN"; case 'r': case 'R': return "aAgGrRwWsSmMkKhHbBvVdDnN"; case 'y': case 'Y': return "cCtTuUyYwWsSmMkKhHbBvVdDnN"; case 'w': case 'W': return "aAtTuUrRyYwWmMkKhHbBvVdDnN"; case 's': case 'S': return "cCgGrRyYsSmMkKhHbBvVdDnN"; case 'm': case 'M': return "aAcCrRyYwWsSmMhHbBvVdDnN"; case 'k': case 'K': return "gGtTuUrRyYwWsSkKhHbBvVdDnN"; case 'h': case 'H': return "aAcCtTuUrRyYwWsSmMkKhHbBvVdDnN"; case 'b': case 'B': return "cCgGtTuUrRyYwWsSmMkKhHbBvVdDnN"; case 'v': case 'V': return "aAcCgGrRyYwWsSmMkKhHbBvVdDnN"; case 'd': case 'D': return "aAgGtTuUrRyYwWsSmMkKhHbBvVdDnN"; case 'n': case 'N': return "aAcCgGtTuUrRyYwWsSmMkKhHbBvVdDnN"; default: return NULL; } } else { switch (ch) { case 'a': case 'A': return "aA"; case 'c': case 'C': return "cC"; case 'g': case 'G': return "gG"; case 't': case 'T': return "tTuU"; case 'u': case 'U': return "tTuU"; case 'r': case 'R': return "aAgG"; case 'y': case 'Y': return "cCtTuU"; case 'w': case 'W': return "aAtTuU"; case 's': case 'S': return "cCgG"; case 'm': case 'M': return "aAcC"; case 'k': case 'K': return "gGtTuU"; case 'h': case 'H': return "aAcCtTuU"; case 'b': case 'B': return "cCgGtTuU"; case 'v': case 'V': return "aAcCgG"; case 'd': case 'D': return "aAgGtTuU"; case 'n': case 'N': return "aAcCgGtTuU"; default: return NULL; } } } /* * prt_expand * * Given a character, it returns the set of characters that form a match * using the Protein alphabet and allowing matches of upper and lower case * characters. If the given character is not a valid Protein character, * it returns NULL. */ char *prt_expand(char ch) { if (ambiguous_flag) { switch (ch) { case 'a': case 'A': return "aAxX"; case 'c': case 'C': return "cCxX"; case 'd': case 'D': return "dDbBxX"; case 'e': case 'E': return "eEzZxX"; case 'f': case 'F': return "fFxX"; case 'g': case 'G': return "gGxX"; case 'h': case 'H': return "hHxX"; case 'i': case 'I': return "iIxX"; case 'k': case 'K': return "kKxX"; case 'l': case 'L': return "lLxX"; case 'm': case 'M': return "mMxX"; case 'n': case 'N': return "nNbBxX"; case 'p': case 'P': return "pPxX"; case 'q': case 'Q': return "qQzZxX"; case 'r': case 'R': return "rRxX"; case 's': case 'S': return "sSxX"; case 't': case 'T': return "tTxX"; case 'v': case 'V': return "vVxX"; case 'w': case 'W': return "wWxX"; case 'y': case 'Y': return "yYxX"; case 'b': case 'B': return "dDnNbBxX"; case 'z': case 'Z': return "eEqQzZxX"; case 'x': case 'X':return "aAcCdDeEfFgGhHiIkKlLmMnNpPqQrRsStTvVwWyYbBzZxX"; default: return NULL; } } else { switch (ch) { case 'a': case 'A': return "aA"; case 'c': case 'C': return "cC"; case 'd': case 'D': return "dD"; case 'e': case 'E': return "eE"; case 'f': case 'F': return "fF"; case 'g': case 'G': return "gG"; case 'h': case 'H': return "hH"; case 'i': case 'I': return "iI"; case 'k': case 'K': return "kK"; case 'l': case 'L': return "lL"; case 'm': case 'M': return "mM"; case 'n': case 'N': return "nN"; case 'p': case 'P': return "pP"; case 'q': case 'Q': return "qQ"; case 'r': case 'R': return "rR"; case 's': case 'S': return "sS"; case 't': case 'T': return "tT"; case 'v': case 'V': return "vV"; case 'w': case 'W': return "wW"; case 'y': case 'Y': return "yY"; case 'b': case 'B': return "dDnN"; case 'z': case 'Z': return "eEqQ"; case 'x': case 'X': return "aAcCdDeEfFgGhHiIkKlLmMnNpPqQrRsStTvVwWyY"; default: return NULL; } } } /* * guessalpha * * This function takes a sequence and tries to determine whether the * alphabet is DNA/RNA, Protein or ASCII. */ int guessalpha(char *seq, int seqlen) { int dna, prt; char *s; dna = prt = 0; for (s=seq; *s; s++) { if (dna_table[(int) *s]) dna++; if (prt_table[(int) *s]) prt++; } if (dna == seqlen && prt == seqlen) { if ((float) dna / (float) seqlen >= 0.85) return DNA; else return PROTEIN; } else if (dna == seqlen) return DNA; else if (prt == seqlen) return PROTEIN; else return ASCII; } /* * my_malloc * * My version of malloc which tests for a NULL return and also zeroes out * the allocated memory before returning it. */ void *my_malloc(int bufsize) { char *s; if ((s = (char *) malloc(bufsize)) == NULL) { fprintf(stderr, "Memory Error: Ran out of memory.\n"); exit(1); } memset(s, 0, bufsize); return s; }