/* * Permission to use, copy, modify, distribute, and sell this software * for any purpose and without fee, restriction or acknowledgement is * hereby granted. The author (James Knight of the Univ. of California, * Davis) places it in the public domain. * * This software is provided AS IS with no warranties of any kind. The * author shall have no liability with respect to the infringement of * copyrights, trade secrets or any patents by this software or any part * thereof. In no event will the author be liable for any lost revenue * or profits or other special, indirect and consequential damages. */ #include #include #include #include #include "seqio.h" /* * This example program gives you the ability to filter some of the * entries from GenBank files and databases, based on information * contained in some of the fields of the entries. The command line * looks like the following: * * example4 [options] files... * -a string - Match an author's last name * -d string - Match a substring of definition * -e string - Match a substring in the entry * -g string - Match an element of geneaology * -j string - Match an journal name * -k string - Match a keyword * -o string - Match the formal organism name * -r string - Match a substring of reference title * * and the program will read each of the input entries (which must * be in GenBank format), will try to perform all of the matches given * by the various options specified, and will output all entries that * match all of the options (i.e., so if more than one option is given, * the program ANDs the results of the options). * * A couple notes. First, only one option of each type may be specified * (multiple "-a"'s are not allowed). Second, if the list of files * contains just a dash "-", then standard input is read (so you can * pipe the results through multiple executions of the program in order * to specify multiple options of the same type). * * Third, the "-d", "-e" and "-r" options will only match substrings that * begin at the beginning of a word (although the substring itself can * span multiple words, so "example4 -e 'RNA fragment' file" will match * an entry containing "small nuclear RNA fragment", but not * "snRNA fragment"). * * Fourth, the "-a", "-g", "-k" and "-o" options all match the complete * string of the appropriate type (author last name, keyword,...), and not * a substring of any of the strings. * * Fifth, the "-g" option looks at the taxonomic classification appearing * just below the "ORGANISM" sub-field of the "SOURCE" record. * * Sixth, all of the matching is case-insensitive. */ int match_entry(SEQFILE *sfp, char *entry, int entrylen, char *entline); int match_definition(SEQFILE *sfp, char *entry, int entrylen, char *defline); int match_organism(SEQFILE *sfp, char *entry, int entrylen, char *orgline); int match_author(SEQFILE *sfp, char *entry, int entrylen, char *autline); int match_reftitle(SEQFILE *sfp, char *entry, int entrylen, char *refline); int match_journal(SEQFILE *sfp, char *entry, int entrylen, char *jouline); int match_keyword(SEQFILE *sfp, char *entry, int entrylen, char *keyline); int match_geneaology(SEQFILE *sfp, char *entry, int entrylen, char *genline); static int mycasecmp(char *s, char *t) { int diff; for ( ; !(diff = toupper(*s) - toupper(*t)) && *s; s++,t++) ; return diff; } static int myncasecmp(char *s, char *t, int n) { int diff, i; diff = 0; for (i=0; i < n && !(diff = toupper(*s) - toupper(*t)) && *s; s++,t++,i++) ; return diff; } void usage(char *s) { if (s == NULL) fprintf(stderr, "Error: No value given for last command line option.\n"); else fprintf(stderr, "Error: Invalid option `%s'.\n", s); fprintf(stderr, " Usage: example4 [options] files...\n"); fprintf(stderr, " -a string - Match an author's last name\n"); fprintf(stderr, " -d string - Match a substring of definition\n"); fprintf(stderr, " -e string - Match a substring in the entry\n"); fprintf(stderr, " -g string - Match an element of geneaology\n"); fprintf(stderr, " -j string - Match an journal name\n"); fprintf(stderr, " -k string - Match a keyword\n"); fprintf(stderr, " -o string - Match the formal organism name\n"); fprintf(stderr, " -r string - Match a substring of reference title\n"); exit(1); } int main(int argc, char *argv[]) { int i, entrylen, flag; char *defline, *keyline, *orgline, *autline; char *entry, *entline, *refline, *jouline, *genline; SEQFILE *sfp; /* * Parse the options. */ defline = keyline = orgline = autline = NULL; entline = refline = jouline = genline = NULL; for (i=1; i < argc; i++) { if (argv[i][0] == '-' && argv[i][1] != '\0') { switch (argv[i][1]) { case 'a': if (i == argc - 1) usage(NULL); autline = argv[++i]; break; case 'd': if (i == argc - 1) usage(NULL); defline = argv[++i]; break; case 'e': if (i == argc - 1) usage(NULL); entline = argv[++i]; break; case 'g': if (i == argc - 1) usage(NULL); genline = argv[++i]; break; case 'j': if (i == argc - 1) usage(NULL); jouline = argv[++i]; break; case 'k': if (i == argc - 1) usage(NULL); keyline = argv[++i]; break; case 'o': if (i == argc - 1) usage(NULL); orgline = argv[++i]; break; case 'r': if (i == argc - 1) usage(NULL); refline = argv[++i]; break; default: usage(argv[i]); } } } /* * Read and filter the input. */ for (i=1; i < argc; i++) { if (argv[i][0] == '-' && argv[i][1] != '\0') i++; else { if ((sfp = seqfopen2(argv[i])) == NULL) continue; if (strcmp(seqfformat(sfp, 0), "GenBank") != 0) { fprintf(stderr, "%s: File is not in GenBank format.\n", argv[i]); seqfclose(sfp); continue; } while ((entry = seqfgetentry(sfp, &entrylen, 0)) != NULL) { flag = 1; if (flag && autline != NULL && !match_author(sfp, entry, entrylen, autline)) flag = 0; if (flag && defline != NULL && !match_definition(sfp, entry, entrylen, defline)) flag = 0; if (flag && entline != NULL && !match_entry(sfp, entry, entrylen, entline)) flag = 0; if (flag && genline != NULL && !match_geneaology(sfp, entry, entrylen, genline)) flag = 0; if (flag && jouline != NULL && !match_journal(sfp, entry, entrylen, jouline)) flag = 0; if (flag && keyline != NULL && !match_keyword(sfp, entry, entrylen, keyline)) flag = 0; if (flag && orgline != NULL && !match_organism(sfp, entry, entrylen, orgline)) flag = 0; if (flag && refline != NULL && !match_reftitle(sfp, entry, entrylen, refline)) flag = 0; if (flag) fwrite(entry, 1, entrylen, stdout); } seqfclose(sfp); } } return 0; } int match_entry(SEQFILE *sfp, char *entry, int entrylen, char *entline) { int len; char *s; len = strlen(entline); s = entry; while (*s) { if (myncasecmp(s, entline, len) == 0) return 1; while (*s && !isspace(*s)) s++; while (*s && isspace(*s)) s++; } return 0; } int match_definition(SEQFILE *sfp, char *entry, int entrylen, char *defline) { int len; char *s, *def; if ((def = seqfdescription(sfp, 0)) == NULL) return 0; len = strlen(defline); s = def; while (*s) { if (myncasecmp(s, defline, len) == 0) return 1; while (*s && !isspace(*s)) s++; while (*s && isspace(*s)) s++; } return 0; } int match_organism(SEQFILE *sfp, char *entry, int entrylen, char *orgline) { char *org; if ((org = seqforganism(sfp, 0)) == NULL || mycasecmp(org, orgline) != 0) return 0; else return 1; } int match_author(SEQFILE *sfp, char *entry, int entrylen, char *autline) { int len; char *s, *t; len = strlen(autline); s = entry; while ((s = strstr(s, "\n AUTHORS")) != NULL) { for (s+=10; *s == ' '; s++) ; while (*s) { for (t=s; *s && !isspace(*s) && *s != ','; s++) ; if (s - t == len && myncasecmp(t, autline, len) == 0) return 1; while (*s && !isspace(*s)) s++; while (*s && (*s == ' ' || (*s == '\n' && isspace(s[1]) && isspace(s[2]) && isspace(s[3])))) s++; if (*s == '\n') break; } } return 0; } int match_reftitle(SEQFILE *sfp, char *entry, int entrylen, char *refline) { int len; char *s; len = strlen(refline); s = entry; while ((s = strstr(s, "\n TITLE")) != NULL) { for (s+=8; *s == ' '; s++) ; while (*s) { if (myncasecmp(s, refline, len) == 0) return 1; if (*s == '\n') { if (!isspace(s[1]) || !isspace(s[2]) || !isspace(s[3])) break; for (s++; *s == ' '; s++) ; } else s++; } } return 0; } int match_journal(SEQFILE *sfp, char *entry, int entrylen, char *jouline) { int len; char *s; len = strlen(jouline); s = entry; while ((s = strstr(s, "\n JOURNAL")) != NULL) { for (s+=10; *s == ' '; s++) ; if (myncasecmp(s, jouline, len) == 0 && isspace(s[len])) return 1; } return 0; } int match_keyword(SEQFILE *sfp, char *entry, int entrylen, char *keyline) { int len; char *s, *t; len = strlen(keyline); s = entry; while ((s = strstr(s, "\nKEYWORDS")) != NULL) { for (s+=9; *s == ' '; s++) ; while (*s) { for (t=s; *s && *s != '\n' && *s != ';' && *s != '.'; s++) ; if (s - t == len && myncasecmp(t, keyline, len) == 0) return 1; if (*s == '.') break; for (s++; *s && isspace(*s); s++) ; } } return 0; } int match_geneaology(SEQFILE *sfp, char *entry, int entrylen, char *genline) { int len; char *s, *t; len = strlen(genline); s = entry; while ((s = strstr(s, "\n ORGANISM")) != NULL) { for (s++; *s != '\n'; s++) ; for (s++; *s == ' '; s++) ; while (*s) { for (t=s; *s && *s != '\n' && *s != ';' && *s != '.'; s++) ; if (s - t == len && myncasecmp(t, genline, len) == 0) return 1; if (*s == '.') break; for (s++; *s && isspace(*s); s++) ; } } return 0; }