/*
 * IDXSEQ.C  -  A Database Indexing Program  (Version 1.2)
 *
 *   Copyright (c) 1996 by James Knight at Univ. of California, Davis
 *
 *   Permission to use, copy, modify, distribute and sell this software
 *   and its documentation is hereby granted, subject to the following
 *   restrictions and understandings:
 *
 *     1) Any copy of this software or any copy of software derived
 *        from it must include this copyright notice in full.
 *
 *     2) All materials or software developed as a consequence of the
 *        use of this software or software derived from it must duly
 *        acknowledge such use, in accordance with the usual standards
 *        of acknowledging credit in academic research.
 *
 *     3) The software may be used freely by anyone for any purpose,
 *        commercial or non-commercial.  That includes, but is not
 *        limited to, its incorporation into software sold for a profit
 *        or the development of commercial software derived from it.
 *
 *     4) This software is provided AS IS with no warranties of any
 *        kind.  The author shall have no liability with respect to the
 *        infringement of copyrights, trade secrets or any patents by
 *        this software or any part thereof.  In no event will the
 *        author be liable for any lost revenue or profits or other
 *        special, indirect and consequential damages.
 */

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <stdarg.h>
#include <signal.h>
#include <sys/types.h>
#include <sys/stat.h>
#ifdef __unix
#include <unistd.h>
#endif
#include "seqio.h"

#define HEADLINE "# SEQIO Index File - Version 1.0"
#define BASE 64
#define MAXALLOC 3000000
#define TITLE "IDXSEQ v1.0 - Index Sequence Databases\n\n"

/*
 * System dependencies and utility functions.
*/ #ifdef WIN32 char dirch = '\\'; #else char dirch = '/'; #endif #if defined(__sun) && !defined(FILENAME_MAX) #include #define FILENAME_MAX MAXPATHLEN #endif static int mycasecmp(char *s, char *t); static int myncasecmp(char *s, char *t, int n); static char *mystrdup(char *s); static int myatoi(char *s, int base, char basechar); static char *myitoa(char *s, int num, int base, char basechar); static int isa_file(char *filename); static int is_absolute(char *path); static char *get_truename(char *filename); static void *mymalloc(int bufsize); static void *myrealloc(void *ptr, int bufsize); int compare(char **a, char **b) { return mycasecmp(*a, *b); } /* * * The Main Data Structure for each of the index lists. * */ typedef struct idlistnode { char idpref[8]; char **files; int files_size, num_files, fcount; char *indexfile; int tmpcount, isactive; char **idents; int idents_size, num_idents; struct idlistnode *next; } IDLIST; IDLIST *table[128]; #define MERGE_MODE 0 #define LOAD_MODE 1 #define DELETE_MODE 2 int verbose_mode; /* * * The data structure and routines which store all of the * character strings (files and idents) kept by the * main data structure above. * */ #define STRPAGESIZE 131072 typedef struct pagenode { char *page; struct pagenode *next; } PAGE; PAGE *pagelist = NULL; int pagecount = STRPAGESIZE; int pagetotal = 0; static char *store_string(char *s, char *end); static void free_space(void); /* * * The data structures and signal handler used to keep track of * the temporary files, and to clean up those files on an interrupt. 
* */ char tempfile[FILENAME_MAX+1], tempfile2[FILENAME_MAX+2]; static void handler(int sig, int code, struct sigcontext *scp) { int i; char filename[FILENAME_MAX+1]; IDLIST *node; for (i=0; i < 128; i++) { for (node=table[i]; node != NULL; node=node->next) { if (node->isactive && node->tmpcount) { sprintf(filename, "%s.tmp.%d", node->indexfile, node->tmpcount); unlink(filename); } } } if (tempfile[0]) unlink(tempfile); if (tempfile2[0]) unlink(tempfile2); exit(1); } static void prog_exit(void) { handler(0, 0, NULL); } void myputerror(char *s) { if (verbose_mode) fputc('\n', stderr); if (strstr(s, "Error:") == NULL) fputs("Error: ", stderr); fputs(s, stderr); } /* * * Forward References * */ char *process_ident(char *ident, char *idend, int bytepos, char *file, int filecount); void output_identlist(IDLIST *node, int mode); void delete_files(int argc, char *argv[], char *idlist); void flush_lists(void); void merge_files(FILE *fp, FILE *outfp, IDLIST *node, int replaceflag); void merge_files2(FILE *idxfp, FILE *tmpfp, FILE *outfp); char *get_indexfile(char *idprefix); char **get_filelist(FILE *fp, int *numfiles_out); void output_header(FILE *fp, char **files, int num_files); /* * * * The Main Functions. * * */ static void usage(char *format, ...) { va_list ap; fputs("Error: ", stderr); va_start(ap, format); vfprintf(stderr, format, ap); va_end(ap); fputs(" Usage: index [-l | -m | -d idlist | -f format | -i idprefix] " "files...\n", stderr); exit(1); } int main(int argc, char *argv[]) { int i, len, count, status, flag, update_mode; int updatemode_set, filecount, bytepos; char *s, *t, *s2, *t2, *format, *idprefix, *deleteids, *limitids; char *current_file, *current_format, *current_idprefix, buffer[16]; char *file, *name, *filelist, *idlist, *db_format, *db_idprefix; IDLIST *node; SEQFILE *sfp; /* * Initialize the variables and signal handler. 
*/ update_mode = MERGE_MODE; updatemode_set = 0; verbose_mode = 1; format = NULL; idprefix = NULL; deleteids = NULL; limitids = NULL; tempfile[0] = tempfile2[0] = '\0'; for (i=0; i < 128; i++) table[i] = NULL; signal(SIGHUP, (void (*)())handler); signal(SIGINT, (void (*)())handler); signal(SIGQUIT, (void (*)())handler); signal(SIGTERM, (void (*)())handler); /* * Check for these options: * -l - load option (remove old index files) * -m - merge option [default] (merge with old index files) * -r idlist - restrict the processing to these identifier prefixes * (load and merge mode only) * -d idlist - delete option (remove files' id's from index files * of listed identifier prefixes) * -f format - format of the input files/databases * -i idprefix - the default idprefix to use for main identifiers * -q - quiet mode (no status messages) */ for (i=1; i < argc && argv[i][0] == '-'; i++) { if (argv[i][1] == '\0') usage("Unknown option `%s'\n", argv[i]); switch (argv[i][1]) { case 'l': if (argv[i][2] != '\0') usage("Unknown option `%s'\n", argv[i]); else if (updatemode_set) usage("Only one of -l, -m and -d may be specified.\n"); update_mode = LOAD_MODE; updatemode_set = 1; break; case 'm': if (argv[i][2] != '\0') usage("Unknown option `%s'\n", argv[i]); else if (updatemode_set) usage("Only one of -l, -m and -d may be specified.\n"); updatemode_set = 1; break; case 'd': if (updatemode_set) usage("Only one of -l, -m and -d may be specified.\n"); if (argv[i][2] != '\0') deleteids = &argv[i][2]; else { if (i + 1 == argc) usage("No list of idprefixes given with -d option.\n"); deleteids = argv[++i]; } for (s=deleteids; *s; ) { for (t=s; *s && *s != ','; s++) ; if (t == s) usage("Invalid list of idprefixes `%s'.\n", deleteids); if (s - t < 2 || s - t > 4) { memcpy(buffer, t, s - t); buffer[s-t] = '\0'; usage("Invalid identifier prefix `%s'.\n", buffer); } if (*s) s++; } update_mode = DELETE_MODE; updatemode_set = 1; break; case 'f': if (format) usage("Multiple occurrences of -f 
option.\n"); if (argv[i][2] != '\0') format = &argv[i][2]; else { if (i + 1 == argc) usage("No format given with -f option.\n"); format = argv[++i]; } if (!seqfisaformat(format)) usage("Invalid file format `%s'.\n", format); break; case 'i': if (idprefix) usage("Multiple occurrences of -i option.\n"); if (argv[i][2] != '\0') idprefix = &argv[i][2]; else { if (i + 1 == argc) usage("No idprefix given with -i option.\n"); idprefix = argv[++i]; } for (s=idprefix,len=0; len < 6 && *s && isalnum(*s); s++,len++) ; if (len < 2 || len > 4 || (*s && !isalnum(*s))) usage("Invalid identifier prefix `%s'.\n", idprefix); break; case 'q': verbose_mode = 0; break; case 'r': if (limitids) usage("Multiple occurrences of -r option.\n"); if (argv[i][2] != '\0') limitids = &argv[i][2]; else { if (i + 1 == argc) usage("No list of idprefixes given with -r option.\n"); limitids = argv[++i]; } for (s=limitids; *s; ) { status = 0; for (t=s; *s && *s != ','; s++) if (!isalnum(*s)) status = 1; if (t == s) usage("Invalid list of idprefixes `%s'.\n", limitids); if (status || s - t < 2 || s - t > 4) { memcpy(buffer, t, s - t); buffer[s-t] = '\0'; usage("Invalid identifier prefix `%s'.\n", buffer); } if (*s) s++; } break; default: usage("Unknown option `%s'\n", argv[i]); } } if (i == argc) usage("No files or databases specified.\n"); if (verbose_mode) { printf("%s", TITLE); fflush(stdout); } seqfsetperror(myputerror); /* * Take care of delete mode separately. */ if (update_mode == DELETE_MODE) { delete_files(argc - i, argv + i, deleteids); return 0; } /* * The Main Loop for merging and loading * * First, construct the indexes for the new files. 
*/ filecount = 0; for ( ; i < argc; i++) { filelist = NULL; db_format = db_idprefix = NULL; current_format = current_idprefix = NULL; if ((filelist = bioseq_parse(argv[i])) == NULL) continue; for (s=filelist; *s && *s != '@'; s++) if (*s == '\n') *s = '\0'; if (*s == '@') { fprintf(stderr, "%s: Only complete files may be specified.\n", argv[i]); free(filelist); continue; } db_format = bioseq_info(argv[i], "Format"); current_format = (db_format ? db_format : format); db_idprefix = bioseq_info(argv[i], "IdPrefix"); current_idprefix = (db_idprefix ? db_idprefix : idprefix); current_file = filelist; while (*current_file != '\0') { if ((sfp = seqfopen(current_file, "r", current_format)) != NULL) { if (verbose_mode) { printf("Reading %s", current_file); fflush(stdout); } name = get_truename(current_file); if ((file = store_string(name, NULL)) == NULL) { /* * If more than MAXALLOC space is currently in use, flush all of the * ident list out to files and try to allocate again. */ flush_lists(); if ((file = store_string(name, NULL)) == NULL) { if (verbose_mode) fputc('\n', stderr); fprintf(stderr, "Memory error: Unable to allocate memory.\n"); prog_exit(); } } filecount++; if (current_idprefix != NULL) seqfsetidpref(sfp, current_idprefix); count = 0; while (seqfread(sfp, 1) == 0) { if (verbose_mode) { if (++count == 1000) { putchar('.'); fflush(stdout); count = 0; } } if ((idlist = seqfidlist(sfp, 0)) == NULL) continue; bytepos = seqfbytepos(sfp); for (s=idlist; *s; ) { for (t=s; *s && *s != '|'; s++) ; if (!limitids) file = process_ident(t, s, bytepos, file, filecount); else { flag = 0; for (s2=limitids; *s2; ) { for (t2=s2; *s2 && *s2 != ','; s2++) ; if (myncasecmp(t, t2, s2 - t2) == 0 && t[s2-t2] == ':') { flag = 1; break; } if (*s2) s2++; } if (flag) file = process_ident(t, s, bytepos, file, filecount); } if (*s) s++; } } if (verbose_mode) { if (count > 0) putchar('.'); printf("done\n"); fflush(stdout); } } status = seqferrno; seqfclose(sfp); if (status != E_EOF) { 
fprintf(stderr, "\nError while reading input. Program halting.\n"); prog_exit(); } while (*current_file) current_file++; current_file++; } if (db_idprefix != NULL) free(db_idprefix); if (db_format != NULL) free(db_format); free(filelist); } /* * Produce the output, either replacing or merging the old index files. */ for (i=0; i < 128; i++) { for (node=table[i]; node != NULL; node=node->next) { if (!node->isactive) continue; if (verbose_mode) { printf("Building index for `%s'...\n", node->idpref); fflush(stdout); } output_identlist(node, update_mode); } } return 0; } char *process_ident(char *ident, char *idend, int bytepos, char *file, int filecount) { int idpreflen; char ch, *s, *t, *ptr, buffer[256], filebuffer[FILENAME_MAX]; IDLIST *node, *newnode; if (idend == NULL) for (idend=ident; *idend; idend++) ; ch = toupper(*ident); for (s=ident; *s && *s != ':'; s++) ; if (!*s || s - ident < 2 || s - ident > 4) { if (verbose_mode) fputc('\n', stderr); fprintf(stderr, "Program error: An invalid identifier prefix found.\n"); prog_exit(); } idpreflen = s - ident; for (node=table[(int) ch]; node != NULL; node=node->next) if (myncasecmp(node->idpref, ident, idpreflen) == 0) break; if (node == NULL) { newnode = (IDLIST *) mymalloc(sizeof(IDLIST)); memcpy(newnode->idpref, ident, idpreflen); newnode->idpref[idpreflen] = '\0'; seqferrpolicy(PE_NONE); newnode->indexfile = get_indexfile(newnode->idpref); seqferrpolicy(PE_ALL); newnode->isactive = (newnode->indexfile != NULL); newnode->next = table[(int) ch]; table[(int) ch] = newnode; node = newnode; } if (!node->isactive) return file; /* * Add the file to the list of files, if not already there. 
*/ if (node->fcount < filecount) { if (node->files_size == node->num_files) { if (node->files_size == 0) { node->files_size = 128; node->files = (char **) mymalloc(node->files_size * sizeof(char *)); } else { node->files_size += node->files_size; node->files = (char **) myrealloc(node->files, node->files_size * sizeof(char *)); } } node->files[node->num_files++] = file; node->fcount = filecount; } /* * Construct the index line for the identifier. */ for (t=ident; t < idend && *t != ':'; t++) ; if (t < idend) t++; else t = ident; for (s=buffer; t < idend; ) *s++ = *t++; *s++ = '\t'; s = myitoa(s, node->num_files-1, BASE, '0'); *s++ = '\t'; s = myitoa(s, bytepos, BASE, '0'); *s++ = '\n'; *s = '\0'; /* * Add the ident line to the list of identifiers. */ if ((ptr = store_string(buffer, s)) == NULL) { /* * If more than MAXALLOC space is currently in use, flush all of the * ident list out to files and try to allocate again. */ strcpy(filebuffer, file); flush_lists(); if ((file = node->files[0] = store_string(filebuffer, NULL)) == NULL) { if (verbose_mode) fputc('\n', stderr); fprintf(stderr, "Memory error: Unable to allocate memory.\n"); prog_exit(); } node->num_files = 1; for (t=ident; t < idend && *t != ':'; t++) ; if (t < idend) t++; else t = ident; for (s=buffer; t < idend; ) *s++ = *t++; *s++ = '\t'; s = myitoa(s, node->num_files-1, BASE, '0'); *s++ = '\t'; s = myitoa(s, bytepos, BASE, '0'); *s++ = '\n'; *s = '\0'; if ((ptr = store_string(buffer, s)) == NULL) { if (verbose_mode) fputc('\n', stderr); fprintf(stderr, "Memory error: Unable to allocate memory.\n"); prog_exit(); } } if (node->idents_size == node->num_idents) { if (node->idents_size == 0) { node->idents_size = 1024; node->idents = (char **) mymalloc(node->idents_size * sizeof(char *)); } else { node->idents_size += node->idents_size; node->idents = (char **) myrealloc(node->idents, node->idents_size * sizeof(char *)); } } node->idents[node->num_idents++] = ptr; return file; } void output_identlist(IDLIST 
*node, int mode) { FILE *oldfp, *newfp, *idxfp; if (!node->isactive) return; if (node->num_idents > 0) qsort(node->idents, node->num_idents, sizeof(char *), (int (*)(const void *, const void *)) compare); if (node->tmpcount == 0) { if (node->num_idents == 0) return; if (mode == LOAD_MODE || !isa_file(node->indexfile)) oldfp = NULL; else { if ((oldfp = fopen(node->indexfile, "r")) == NULL) { fprintf(stderr, "%s: Unable to open index file for `%s'.\n", node->indexfile, node->idpref); prog_exit(); } } sprintf(tempfile, "%s.tmp", node->indexfile); if ((newfp = fopen(tempfile, "w")) == NULL) { tempfile[0] = '\0'; fprintf(stderr, "Unable to open temporary files.\n"); prog_exit(); } merge_files(oldfp, newfp, node, 1); if (oldfp != NULL) fclose(oldfp); fclose(newfp); unlink(node->indexfile); link(tempfile, node->indexfile); unlink(tempfile); tempfile[0] = '\0'; } else { /* * Create a single temporary file containing all of the new indices. */ if (node->num_idents != 0) { sprintf(tempfile, "%s.tmp.%d", node->indexfile, node->tmpcount); if ((oldfp = fopen(tempfile, "r")) == NULL) { tempfile[0] = '\0'; fprintf(stderr, "Unable to open temporary files.\n"); prog_exit(); } node->tmpcount++; sprintf(tempfile2, "%s.tmp.%d", node->indexfile, node->tmpcount); if ((newfp = fopen(tempfile2, "w")) == NULL) { tempfile2[0] = '\0'; fclose(oldfp); fprintf(stderr, "Unable to open temporary files.\n"); prog_exit(); } merge_files(oldfp, newfp, node, 0); fclose(oldfp); fclose(newfp); unlink(tempfile); tempfile[0] = tempfile2[0] = '\0'; } /* * Now either just replace the index file with the temp file (if loading), * or merge the contents of the two files. 
*/ sprintf(tempfile, "%s.tmp.%d", node->indexfile, node->tmpcount); if (mode == LOAD_MODE || !isa_file(node->indexfile)) { unlink(node->indexfile); link(tempfile, node->indexfile); unlink(tempfile); tempfile[0] = '\0'; } else { if ((idxfp = fopen(node->indexfile, "r")) == NULL) { fprintf(stderr, "%s: Unable to open index file for `%s'.\n", node->indexfile, node->idpref); prog_exit(); } if ((oldfp = fopen(tempfile, "r")) == NULL) { tempfile[0] = '\0'; fclose(idxfp); fprintf(stderr, "Unable to open temporary files.\n"); prog_exit(); } sprintf(tempfile2, "%s.tmp", node->indexfile); if ((newfp = fopen(tempfile2, "w")) == NULL) { tempfile2[0] = '\0'; fclose(oldfp); fclose(idxfp); fprintf(stderr, "Unable to open temporary files.\n"); prog_exit(); } merge_files2(idxfp, oldfp, newfp); fclose(idxfp); fclose(oldfp); fclose(newfp); unlink(tempfile); tempfile[0] = '\0'; unlink(node->indexfile); link(tempfile2, node->indexfile); unlink(tempfile2); tempfile2[0] = '\0'; } } } void delete_files(int argc, char *argv[], char *idlist) { int i, j, len, filenum, file_size, *deletelist, *remaplist; int num_files, num_oldfiles, num_newfiles, deleteflag; char *s, *t, *file, *filelist, **files, **oldfiles, **newfiles; char *idpref, *indexfile, idbuffer[16]; char line[128], linebuffer[128]; FILE *oldfp, *newfp; file_size = 128; num_files = 0; files = (char **) mymalloc(file_size * sizeof(char *)); /* * Construct a list of the files to be deleted from the index files. 
*/ for (i=0; i < argc; i++) { if ((filelist = bioseq_parse(argv[i])) == NULL) continue; for (s=filelist; *s; s++) if (*s == '\n') *s = '\0'; file = filelist; while (*file != '\0') { if (num_files == file_size) { file_size += file_size; files = (char **) myrealloc(files, file_size * sizeof(char *)); } files[num_files++] = mystrdup(get_truename(file)); while (*file) file++; file++; } free(filelist); } if (num_files == 0) { fprintf(stderr, "Error: No files to delete.\n"); return; } /* * Scan through the idlist, and for each idprefix's index file, * delete the lines containing one of the files in the list. */ for (idpref=idlist; *idpref; ) { for (t=idbuffer,len=0; *idpref && *idpref != ','; idpref++,t++,len++) *t = *idpref; *t = '\0'; if (*idpref) idpref++; if (verbose_mode) { printf("Filtering index file for `%s'...\n", idbuffer); fflush(stdout); } if ((indexfile = get_indexfile(idbuffer)) == NULL) continue; if (!isa_file(indexfile)) { fprintf(stderr, "Warning: Index file for `%s' does not exist.\n", idbuffer); free(indexfile); continue; } /* * Open the index file and retrieve its list of files. */ if ((oldfp = fopen(indexfile, "r")) == NULL) { fprintf(stderr, "%s: Unable to open index file for `%s'.\n", indexfile, idbuffer); free(indexfile); continue; } oldfiles = get_filelist(oldfp, &num_oldfiles); /* * Construct a deletelist, the new list of files and a remaplist from * the old list of files. 
*/ deletelist = (int *) mymalloc(num_oldfiles * sizeof(int)); newfiles = (char **) mymalloc(num_oldfiles * sizeof(char *)); remaplist = (int *) mymalloc(num_oldfiles * sizeof(int)); for (i=0; i < num_oldfiles; i++) remaplist[i] = -1; deleteflag = 0; for (i=0,num_newfiles=0; i < num_oldfiles; i++) { for (j=0; j < num_files; j++) if (strcmp(oldfiles[i], files[j]) == 0) break; if (j < num_files) { deletelist[i] = 1; deleteflag++; } else { if (num_newfiles != i) remaplist[i] = num_newfiles; newfiles[num_newfiles++] = oldfiles[i]; } } if (!deleteflag) { free(remaplist); free(newfiles); free(deletelist); free(oldfiles[0]); free(oldfiles); free(indexfile); fclose(oldfp); continue; } /* * If all of the files are to be deleted, just unlink the file. */ if (deleteflag == num_oldfiles) { unlink(indexfile); free(remaplist); free(newfiles); free(deletelist); free(oldfiles[0]); free(oldfiles); free(indexfile); fclose(oldfp); continue; } /* * Open a temporary file, write out the header for the new file, and * then scan the lines of the file, writing (and possibly remapping) * the lines which are not deleted. */ sprintf(tempfile, "%s.tmp", indexfile); if ((newfp = fopen(tempfile, "w")) == NULL) { tempfile[0] = '\0'; fprintf(stderr, "Unable to open temporary files.\n"); prog_exit(); } output_header(newfp, newfiles, num_newfiles); while (fgets(line, 128, oldfp) != NULL) { for (s=line; *s != '\t'; s++) ; filenum = myatoi(s+1, BASE, '0'); if (deletelist[filenum]) continue; if (remaplist[filenum] == -1) fputs(line, newfp); else { for (t=linebuffer,s=line; (*t++ = *s++) != '\t'; ) ; t = myitoa(t, remaplist[filenum], BASE, '0'); while (*s != '\t') s++; strcpy(t, s); fputs(linebuffer, newfp); } } fclose(newfp); fclose(oldfp); /* * Make the temporary file the index file. 
*/ unlink(indexfile); link(tempfile, indexfile); unlink(tempfile); tempfile[0] = '\0'; free(remaplist); free(newfiles); free(deletelist); free(oldfiles[0]); free(oldfiles); free(indexfile); } for (i=0; i < num_files; i++) free(files[i]); free(files); } void flush_lists(void) { int i; IDLIST *node; FILE *oldfp, *newfp; if (verbose_mode) { printf("Flushing..."); fflush(stdout); } for (i=0; i < 128; i++) { for (node=table[i]; node != NULL; node=node->next) { if (!node->isactive || node->num_idents == 0) continue; qsort(node->idents, node->num_idents, sizeof(char *), (int (*)(const void *, const void *)) compare); if (node->tmpcount == 0) oldfp = NULL; else { sprintf(tempfile, "%s.tmp.%d", node->indexfile, node->tmpcount); if ((oldfp = fopen(tempfile, "r")) == NULL) { tempfile[0] = '\0'; if (verbose_mode) fputc('\n', stderr); fprintf(stderr, "Error: Unable to open temporary files.\n"); prog_exit(); } } node->tmpcount++; sprintf(tempfile2, "%s.tmp.%d", node->indexfile, node->tmpcount); if ((newfp = fopen(tempfile2, "w")) == NULL) { tempfile2[0] = '\0'; fclose(oldfp); if (verbose_mode) fputc('\n', stderr); fprintf(stderr, "Error: Unable to open temporary files.\n"); prog_exit(); } merge_files(oldfp, newfp, node, 0); if (oldfp != NULL) { fclose(oldfp); unlink(tempfile); tempfile[0] = '\0'; } fclose(newfp); node->num_files = 0; node->fcount = 0; node->num_idents = 0; } } free_space(); if (verbose_mode) { printf("done"); fflush(stdout); } } void merge_files(FILE *fp, FILE *outfp, IDLIST *node, int replaceflag) { int i, j, isoldfile, num_oldfiles, num_newfiles, idcount, filenum; int *deletelist, *remaplist; char *s, *t, *idline, **oldfiles, **newfiles; char *status, line[128], linebuffer[128]; /* * Construct the list of files in the old index file (if the old * index file exists). 
*/ if (fp == NULL) { isoldfile = 0; num_oldfiles = 0; oldfiles = NULL; } else { isoldfile = 1; oldfiles = get_filelist(fp, &num_oldfiles); } /* * Compute the deletelist (the list of files whose entries in the * in the old file should be deleted during the merge) and the * remaplist (the list of files whose entries in `node' should be * remapped during the merge). */ deletelist = (int *) (isoldfile ? mymalloc(num_oldfiles*sizeof(int)) : NULL); remaplist = (int *) mymalloc(node->num_files * sizeof(int)); for (i=0; i < node->num_files; i++) remaplist[i] = -1; num_newfiles = num_oldfiles; for (i=0; i < node->num_files; i++) { for (j=0; j < num_oldfiles; j++) { if (strcmp(node->files[i], oldfiles[j]) == 0) { if (replaceflag) deletelist[j] = 1; if (i != j) remaplist[i] = j; break; } } if (j == num_oldfiles) { if (i != num_newfiles) remaplist[i] = num_newfiles; num_newfiles++; } } /* * Build the total list of files and output the header lines of * the merged file. */ if (num_newfiles == num_oldfiles && isoldfile) output_header(outfp, oldfiles, num_oldfiles); else { newfiles = (char **) mymalloc(num_newfiles * sizeof(char *)); for (i=0; i < num_oldfiles; i++) newfiles[i] = oldfiles[i]; for (i=0; i < node->num_files; i++) { if (i >= num_oldfiles && remaplist[i] == -1) newfiles[i] = node->files[i]; else if (remaplist[i] >= num_oldfiles) newfiles[remaplist[i]] = node->files[i]; } output_header(outfp, newfiles, num_newfiles); free(newfiles); } if (isoldfile) { free(oldfiles[0]); free(oldfiles); } idcount = 0; status = (isoldfile ? 
fgets(line, 128, fp) : NULL); while (idcount < node->num_idents || status != NULL) { if (status == NULL || (idcount < node->num_idents && mycasecmp(node->idents[idcount], line) <= 0)) { idline = node->idents[idcount]; for (s=idline; *s != '\t'; s++) ; filenum = myatoi(s+1, BASE, '0'); if (remaplist[filenum] == -1) fputs(idline, outfp); else { for (t=linebuffer,s=idline; (*t++ = *s++) != '\t'; ) ; t = myitoa(t, remaplist[filenum], BASE, '0'); while (*s != '\t') s++; strcpy(t, s); fputs(linebuffer, outfp); } idcount++; } else { for (s=line; *s != '\t'; s++) ; filenum = myatoi(s+1, BASE, '0'); if (!deletelist[filenum]) fputs(line, outfp); status = fgets(line, 128, fp); } } free(deletelist); free(remaplist); } void merge_files2(FILE *idxfp, FILE *tmpfp, FILE *outfp) { int i, j, num_newfiles, filenum, num_idxfiles, num_tmpfiles; int *deletelist, *remaplist; char *s, *t, **idxfiles, **tmpfiles, **newfiles; char *idxstatus, *tmpstatus, tmpline[128], idxline[128], linebuffer[128]; /* * Construct the list of files in the old index file (if the old * index file exists). */ idxfiles = get_filelist(idxfp, &num_idxfiles); tmpfiles = get_filelist(tmpfp, &num_tmpfiles); /* * Compute the deletelist (the list of files whose entries in the * in the old file should be deleted during the merge) and the * remaplist (the list of files whose entries in `node' should be * remapped during the merge). */ deletelist = (int *) mymalloc(num_idxfiles * sizeof(int)); remaplist = (int *) mymalloc(num_tmpfiles * sizeof(int)); for (i=0; i < num_tmpfiles; i++) remaplist[i] = -1; num_newfiles = num_idxfiles; for (i=0; i < num_tmpfiles; i++) { for (j=0; j < num_idxfiles; j++) { if (strcmp(tmpfiles[i], idxfiles[j]) == 0) { deletelist[j] = 1; if (i != j) remaplist[i] = j; break; } } if (j == num_idxfiles) { if (i != num_newfiles) remaplist[i] = num_newfiles; num_newfiles++; } } /* * Build the total list of files and output the header lines of * the merged file. 
*/ if (num_newfiles == num_idxfiles) output_header(outfp, idxfiles, num_idxfiles); else { newfiles = (char **) mymalloc(num_newfiles * sizeof(char *)); for (i=0; i < num_idxfiles; i++) newfiles[i] = idxfiles[i]; for (i=0; i < num_tmpfiles; i++) { if (i >= num_idxfiles && remaplist[i] == -1) newfiles[i] = tmpfiles[i]; else if (remaplist[i] >= num_idxfiles) newfiles[remaplist[i]] = tmpfiles[i]; } output_header(outfp, newfiles, num_newfiles); free(newfiles); } free(idxfiles[0]); free(idxfiles); free(tmpfiles[0]); free(tmpfiles); idxstatus = fgets(idxline, 128, idxfp); tmpstatus = fgets(tmpline, 128, tmpfp); while (idxstatus != NULL || tmpstatus != NULL) { if (idxstatus == NULL || (tmpstatus != NULL && mycasecmp(tmpline, idxline) <= 0)) { for (s=tmpline; *s != '\t'; s++) ; filenum = myatoi(s+1, BASE, '0'); if (remaplist[filenum] == -1) fputs(tmpline, outfp); else { for (t=linebuffer,s=tmpline; (*t++ = *s++) != '\t'; ) ; t = myitoa(t, remaplist[filenum], BASE, '0'); while (*s != '\t') s++; strcpy(t, s); fputs(linebuffer, outfp); } tmpstatus = fgets(tmpline, 128, tmpfp); } else { for (s=idxline; *s != '\t'; s++) ; filenum = myatoi(s+1, BASE, '0'); if (!deletelist[filenum]) fputs(idxline, outfp); idxstatus = fgets(idxline, 128, idxfp); } } free(deletelist); free(remaplist); } /* * * Common functions related to the indexfile. 
* * */ char *get_indexfile(char *idprefix) { char *t, *file, *rootdir, *indexfile, buffer[8]; indexfile = (char *) mymalloc(FILENAME_MAX+1); sprintf(buffer, "%s:-", idprefix); if ((file = bioseq_info(buffer, "Index")) == NULL) { free(indexfile); return NULL; } if (is_absolute(file) || (rootdir = bioseq_info(buffer, "Root")) == NULL) strcpy(indexfile, get_truename(file)); else { strcpy(indexfile, get_truename(rootdir)); for (t=indexfile; *t; t++) ; *t++ = dirch; strcpy(t, file); free(rootdir); } free(file); return indexfile; } char **get_filelist(FILE *fp, int *numfiles_out) { int i, startsize, linelen, num_lines, numread; char *s, *buffer, **oldfiles, line[129]; /* * Read the first line, and find out the size of the header. */ if (fgets(line, 128, fp) == NULL) { if (verbose_mode) fputc('\n', stderr); fprintf(stderr, "Error: Cannot read temporary files.\n"); prog_exit(); } line[128] = '\0'; for (s=line; *s && isspace(*s); s++) ; startsize = myatoi(s, 10, '0'); while (*s && isdigit(*s)) s++; while (*s && isspace(*s)) s++; num_lines = myatoi(s, 10, '0'); while (*s && (isdigit(*s) || isspace(*s))) s++; if (startsize <= 0 || num_lines <= 0 || !*s || strncmp(s, HEADLINE, 18) != 0) { if (verbose_mode) fputc('\n', stderr); fprintf(stderr, "Error: Cannot read temporary files.\n"); prog_exit(); } /* * Read the header lines of the old file. */ linelen = strlen(line); buffer = (char *) mymalloc(startsize - linelen + 1); numread = fread(buffer, 1, startsize - linelen, fp); if (numread != startsize - linelen || buffer[numread-1] != '\n') { if (verbose_mode) fputc('\n', stderr); fprintf(stderr, "Error: Cannot read temporary files.\n"); prog_exit(); } buffer[numread] = '\0'; /* * Build the list of filenames in the old file. 
*/ oldfiles = (char **) mymalloc((num_lines - 1) * sizeof(char *)); for (s=buffer,i=0; *s && i < num_lines - 1; s++) { oldfiles[i++] = s; while (*s != '\n') s++; *s = '\0'; } if (*s || i < num_lines - 1) { fprintf(stderr, "Error: Cannot read temporary files.\n"); prog_exit(); } if (numfiles_out) *numfiles_out = num_lines - 1; return oldfiles; } void output_header(FILE *fp, char **files, int num_files) { int i, bytecount, total; char *s, buffer[16]; if (num_files == 0) return; bytecount = 0; for (i=0; i < num_files; i++) bytecount += strlen(files[i]) + 1; s = myitoa(buffer, num_files + 1, 10, '0'); bytecount += s - buffer; bytecount += strlen(HEADLINE) + 3; s = myitoa(buffer, bytecount, 10, '0'); total = bytecount + (s - buffer); s = myitoa(buffer, total, 10, '0'); bytecount += s - buffer; fprintf(fp, "%d %d %s\n", bytecount, num_files + 1, HEADLINE); for (i=0; i < num_files; i++) fprintf(fp, "%s\n", files[i]); } /* * * Functions handling the storage of filenames and identifier lines. * * */ static char *store_string(char *s, char *end) { static PAGE *page = NULL; int len; char *ptr; if (end == NULL) for (end=s; *end; end++) ; len = end - s; if (pagecount + len + 1 >= STRPAGESIZE) { if (pagetotal + STRPAGESIZE >= MAXALLOC || (page = (PAGE *) malloc(sizeof(PAGE))) == NULL || (page->page = (char *) malloc(STRPAGESIZE)) == NULL) return NULL; page->next = pagelist; pagelist = page; pagecount = 0; pagetotal += STRPAGESIZE; } ptr = page->page + pagecount; memcpy(ptr, s, len); ptr[len] = '\0'; pagecount += len + 1; return ptr; } static void free_space(void) { PAGE *node, *next; for (node=pagelist; node != NULL; node=next) { next = node->next; free(node->page); free(node); } pagetotal = 0; pagelist = NULL; } /* * * The Utility Functions. 
* * */ static int mycasecmp(char *s, char *t) { int diff; for ( ; !(diff = toupper(*s) - toupper(*t)) && *s; s++,t++) ; return diff; } static int myncasecmp(char *s, char *t, int n) { int diff, i; diff = 0; for (i=0; i < n && !(diff = toupper(*s) - toupper(*t)) && *s; s++,t++,i++) ; return diff; } static char *mystrdup(char *s) { char *temp; temp = (char *) malloc(strlen(s)+1); return (temp == NULL ? NULL : strcpy(temp, s)); } static int myatoi(char *s, int base, char basechar) { int num, sign; while (isspace(*s)) s++; sign = 0; if (*s == '+' || *s == '-') { sign = (*s == '-'); s++; } for (num=0; *s >= basechar && *s < basechar + base; s++) { num *= base; num += *s - basechar; } return (sign ? -num : num); } static char *myitoa(char *s, int num, int base, char basechar) { int pos, digit; char buffer[128]; if (num < 0) { *s++ = '-'; num *= -1; } pos = 0; do { digit = num % base; buffer[pos++] = (char) (digit + basechar); num /= base; } while (num != 0); for (pos--; pos >= 0; pos--) *s++ = buffer[pos]; return s; } static int isa_file(char *filename) { struct stat sbuf; if (stat(filename, &sbuf) >= 0 && (sbuf.st_mode & S_IFMT) == S_IFREG) return 1; else return 0; } static int is_absolute(char *path) { int abspath; if (path[0] != '~') { #ifdef WIN32 abspath = (path[0] == dirch || (isalpha(path[0]) && path[1] == ':' && path[2] == dirch)); #else abspath = (path[0] == dirch); #endif return abspath; } return 1; } static char *get_truename(char *filename) { static char buf[FILENAME_MAX+1]; int len; char *s, *t, *s2; s = filename; t = buf; if (*s == '~' && (s2 = getenv("HOME")) != NULL) { while ((*t++ = *s2++)) ; t--; s++; if (*s == dirch) { if (*(t-1) == dirch) t--; } else if (isalpha(*s)) while (t > buf && *(t-1) != dirch) t--; else { t = buf; s = filename; } } len = strlen(s); if (len > FILENAME_MAX - (t - buf)) len = FILENAME_MAX - (t - buf); memcpy(t, s, len); t[len] = '\0'; return buf; } static void *mymalloc(int bufsize) { void *s; if ((s = malloc(bufsize)) == NULL) { 
if (verbose_mode) fputc('\n', stderr); fprintf(stderr, "Memory Error: Unable to allocate memory.\n"); prog_exit(); } memset(s, 0, bufsize); return s; } static void *myrealloc(void *ptr, int bufsize) { void *s; if ((s = realloc(ptr, bufsize)) == NULL) { if (verbose_mode) fputc('\n', stderr); fprintf(stderr, "Memory Error: Unable to allocate memory.\n"); prog_exit(); } return s; }