/*------------------------------------------------------ Maximum likelihood estimation of migration rate and effectice population size using a Metropolis-Hastings Monte Carlo algorithm ------------------------------------------------------- D A T A R O U T I N E S creates data structures, read data (Electrophoretic loci, sequences, microsats), feeds data into tree (?), prints data, destroys data. Peter Beerli 1996, Seattle beerli@scs.fsu.edu Copyright 1996-2002 Peter Beerli and Joseph Felsenstein, Seattle WA Copyright 2003-2007 Peter Beerli, Tallahassee FL This software is distributed free of charge for non-commercial use and is copyrighted. Of course, we do not guarantee that the software works and are not responsible for any damage you may cause or have. $Id: data.c 714 2007-05-22 17:06:46Z beerli $ -------------------------------------------------------*/ /*! \file data.c Data manipulation routines */ #include #include "migration.h" #include "sighandler.h" #include "tools.h" #include "migrate_mpi.h" #include "data.h" #include "sequence.h" #ifdef PRETTY #include "pretty.h" #endif extern long number_genomes (int type); #ifdef DMALLOC_FUNC_CHECK #include #endif /* prototypes ----------------------------------------- */ void create_data (data_fmt ** data); void get_data (FILE * infile, data_fmt * data, option_fmt * options); void print_data (world_fmt * world, option_fmt * options, data_fmt * data); long find_missing(data_fmt *data, long pop, long locus); void print_data_summary (FILE * file, world_fmt * world, option_fmt * options, data_fmt * data); short findAllele (data_fmt * data, char s[], long locus); void free_datapart (data_fmt * data, option_fmt * options, long locus); /*private functions */ void init_data_structure1 (data_fmt ** data, option_fmt * options); void read_header (FILE * infile, data_fmt * data, option_fmt * options); void read_sites (data_fmt * data); void init_data_structure2 (data_fmt ** data, option_fmt * options, long pop); void init_data_structure3 (data_fmt * data); void read_popheader (FILE * infile, data_fmt * data, long pop, long genomes); void read_indname (FILE * file, data_fmt * data, long pop, long ind, long nmlength); void read_popdata (FILE * file, data_fmt * data, long pop, option_fmt * options); void read_microalleles (FILE * infile, data_fmt * data, long pop, long ind); void read_alleles (FILE * infile, data_fmt * data, long pop, long ind); long read_ind_seq (FILE * infile, data_fmt * data, option_fmt * options, long locus, long pop, long ind, long baseread); void read_distance_fromfile (FILE * dfile, long tips, long nmlength, MYREAL **m); void finish_read_seq (FILE * infile, data_fmt * data, option_fmt * options, long pop, long baseread); void print_alleledata (world_fmt * world, data_fmt * data, option_fmt * options); void print_microdata (world_fmt * world, data_fmt * data, option_fmt * options); void print_seqdata (world_fmt * world, option_fmt * options, data_fmt * data); void print_header (FILE * outfile, long pop, world_fmt * world, option_fmt * options, data_fmt * data); void create_alleles (data_fmt * data, option_fmt *options); void addAllele (data_fmt * data, char s[], long locus, long *z); void set_numind (data_fmt * data); void print_seq_pop (long locus, long pop, world_fmt * world, option_fmt * options, data_fmt * data); void print_seq_ind (long locus, long pop, long ind, world_fmt * world, option_fmt * options, data_fmt * data); void print_locus_head (long locus, world_fmt * world, option_fmt * options, data_fmt * data); void find_delimiter (char *title, char *dlm); void read_geofile (data_fmt * data, option_fmt * options, long numpop); void read_datefile (data_fmt * data, option_fmt * options, long numpop); void read_uep_fromfile (FILE * uepfile, long tips, long nmlength, int **uep, long *uepsites, long datatype); void read_uepfile (data_fmt * data, option_fmt * options, long numpop); /*=====================================================*/ /// creates the data structure void create_data (data_fmt ** data) { (*data) = (data_fmt *) mycalloc (1, sizeof (data_fmt)); } /* void init_data (data_fmt * data) { } */ /// /// free the data module void destroy_data (data_fmt * data) { long ind; long indalloc; long locus; long pop; long loci = data->loci; long numpop = data->numpop; // free data from init_data_structure3 for (locus = 0; locus < loci; locus++) { myfree(data->allele[locus]); } myfree(data->allele); myfree(data->maxalleles); myfree(data->skiploci); // free data from init_data_structure2 for(pop=0; pop < numpop ; pop++) { indalloc = -1; for(locus=0; locus < loci; locus++) { if(indalloc < data->numind[pop][locus]) indalloc = data->numind[pop][locus]; } for (ind = 0; ind < indalloc; ind++) { myfree(data->indnames[pop][ind]); myfree(data->yy[pop][ind]); } myfree(data->indnames[pop]); myfree(data->yy[pop]); } myfree(data->indnames); myfree(data->yy); // data->yy were already freed in free_datapart() // free data from init_structure_1 myfree(data->popnames[0]); myfree(data->numind[0]); myfree(data->numalleles[0]); myfree(data->popnames); myfree(data->numind); myfree(data->numalleles); myfree(data->seq->sites); myfree(data->seq); myfree(data->geo); myfree(data->lgeo); if(data->ogeo != NULL) { myfree(data->ogeo[0]); myfree(data->ogeo); } #ifdef TESTINGDATE // free sampledates for(locus=0;locusloci;locus++) { for(pop=0;pop < data->numpop; pop++) { myfree(data->sampledates[pop][locus]); } } myfree(data->sampledates[0]); myfree(data->sampledates); #endif myfree(data); } void get_data (FILE * infile, data_fmt * data, option_fmt * options) { long pop; long genomes=1; data->hasghost = FALSE; read_header (infile, data, options); genomes = number_genomes (options->datatype); init_data_structure1 (&data, options); switch (options->datatype) { case 's': case 'f': read_sites (data); break; case 'n': read_sites (data); data->seq->addon = 4; break; case 'u': read_sites (data); data->seq->addon = 4; break; default: data->seq->fracchange = 1.0; break; } if (options->progress) fprintf (stdout, "\n\n"); if (options->writelog) fprintf (options->logfile, "\n\n"); for (pop = 0; pop < data->numpop; pop++) { read_popheader (infile, data, pop, genomes); if (options->progress) fprintf (stdout, "Reading (%li) %s ...\n", pop, data->popnames[pop]); if (options->writelog) fprintf (options->logfile, "Reading (%li) %s ...\n", pop, data->popnames[pop]); init_data_structure2 (&data, options, pop); read_popdata (infile, data, pop, options); } read_geofile (data, options, data->numpop); #ifdef UEP read_uepfile (data, options, data->numpop); #endif #ifdef TESTINGDATE read_datefile(data, options, data->numpop); #endif if (options->progress) fprintf (stdout, "\n\n"); init_data_structure3 (data); switch (options->datatype) { case 'a': create_alleles (data, options); break; case 'b': create_alleles (data, options); for (pop = 0; pop < data->loci; pop++) data->maxalleles[pop] = XBROWN_SIZE; break; case 'm': create_alleles (data, options); for (pop = 0; pop < data->loci; pop++) data->maxalleles[pop] = options->micro_stepnum; break; } if(options->murates_fromdata) { if (strchr (SEQUENCETYPES, options->datatype)) { find_rates_fromdata(data, options); } } } /* private functions ========================================== */ void init_data_structure1 (data_fmt ** data, option_fmt * options) { long pop; long numpop = (*data)->numpop; long loci = (*data)->loci; (*data)->ogeo = NULL; (*data)->geo = NULL; if ((*data)->yy == NULL) { (*data)->yy = (char *****) mymalloc (sizeof (char ****) * numpop); (*data)->seq = (seqmodel_fmt *) mycalloc (1, sizeof (seqmodel_fmt)); (*data)->popnames =(char **) mymalloc (sizeof (char *) * numpop); (*data)->popnames[0] =(char *) mycalloc (numpop * LINESIZE,sizeof(char)); (*data)->indnames = (char ***) mymalloc (sizeof (char **) * numpop); (*data)->numind = (long **) mymalloc (sizeof (long *) * numpop); (*data)->numind[0] = (long *) mymalloc (sizeof (long) * numpop * loci); (*data)->numalleles = (long **) mymalloc (sizeof (long *) * numpop); (*data)->numalleles[0] = (long *) mymalloc (sizeof (long) * numpop * loci); for (pop = 1; pop < numpop; pop++) { (*data)->popnames[pop] = (*data)->popnames[0] + pop * LINESIZE; (*data)->numind[pop] = (*data)->numind[0] + pop * loci; (*data)->numalleles[pop] = (*data)->numalleles[0] + pop * loci; } (*data)->seq->sites = (long *) mycalloc (1, sizeof (long) * loci); } else { error ("Problem with initialization of data matrix yy\n"); } } void init_data_structure2 (data_fmt ** data, option_fmt * options, long pop) { long ind, locus; long indalloc = -1; for(locus=0;locus<(*data)->loci;locus++) { if(indalloc < (*data)->numind[pop][locus]) indalloc = (*data)->numind[pop][locus]; } if (indalloc == 0) indalloc = 2; (*data)->yy[pop] = (char ****) mymalloc (sizeof (char ***) * indalloc); (*data)->indnames[pop] = (char **) mycalloc (1, sizeof (char *) * indalloc); for (ind = 0; ind < indalloc; ind++) { (*data)->indnames[pop][ind] = (char *) mycalloc (1, sizeof (char) * (1 + options->nmlength)); (*data)->yy[pop][ind] = (char ***) mymalloc (sizeof (char **) * (*data)->loci); for (locus = 0; locus < (*data)->loci; locus++) { if (!strchr (SEQUENCETYPES, options->datatype)) { (*data)->yy[pop][ind][locus] = (char **) mycalloc (1, sizeof (char *) * 2); (*data)->yy[pop][ind][locus][0] = (char *) mycalloc (1, sizeof (char) * (options->allelenmlength+1)); (*data)->yy[pop][ind][locus][1] = (char *) mycalloc (1, sizeof (char) * (options->allelenmlength+1)); } else { (*data)->yy[pop][ind][locus] = (char **) mycalloc (1, sizeof (char *)); (*data)->yy[pop][ind][locus][0] = (char *) mycalloc (1, sizeof (char) * ((*data)->seq->sites[locus]+1)); } } } } short findAllele (data_fmt * data, char s[], long locus) { short found = 0; while ((strcmp (s, data->allele[locus][found]) && data->allele[locus][found][0] != '\0')) found++; return found; } void free_datapart (data_fmt * data, option_fmt * options, long locus) { long ind, pop; // long genomes = number_genomes (options->datatype); for (pop = 0; pop < data->numpop; pop++) { for (ind = 0; ind < data->numind[pop][locus]; ind++) { if (!strchr (SEQUENCETYPES, options->datatype)) { myfree(data->yy[pop][ind][locus][0]); myfree(data->yy[pop][ind][locus][1]); myfree(data->yy[pop][ind][locus]); } else { myfree(data->yy[pop][ind][locus][0]); myfree(data->yy[pop][ind][locus]); } } } } void init_data_structure3 (data_fmt * data) { long locus, pop, maxi; data->allele = (allele_fmt **) mycalloc (1, sizeof (allele_fmt *) * data->loci); for (locus = 0; locus < data->loci; locus++) { maxi = 0; for (pop = 0; pop < data->numpop; pop++) maxi += data->numalleles[pop][locus]; data->allele[locus] = (allele_fmt *) mycalloc (1, sizeof (allele_fmt) * maxi); } data->maxalleles = (long *) mycalloc (1, sizeof (long) * data->loci); data->skiploci = (boolean *) mycalloc (1, sizeof (boolean) * (data->loci + 1)); } /// /// read the first line of the data file /// \param infile datafilename /// \param data data structure that holds all the data /// \param options structure that contain all option information /// \reval none void read_header (FILE * infile, data_fmt * data, option_fmt * options) { char input[LINESIZE], *p; char title[LINESIZE]; strcpy(title,"\0"); input[0] = '#'; while(input[0]=='#') { FGETS (input, sizeof (input), infile); if ((p = (char *) strpbrk (input, CRLF)) != NULL) *p = '\0'; } switch (lowercase (input[0])) { case 'a': sscanf (input, "%1s%ld%ld%[^\n]", &options->datatype, &(data->numpop), &(data->loci), title); find_delimiter (title, &data->dlm); if(!(title[0] == '\0')) strcpy(options->title,title); break; case 'b': case 'm': sscanf (input, "%1s%ld%ld%1s%[^\n]", &options->datatype, &(data->numpop), &(data->loci), &data->dlm, title); if(!(title[0] == '\0')) strcpy(options->title,title); break; case 's': case 'n': case 'u': case 'f': sscanf (input, "%1s%ld%ld%[^\n]", &options->datatype, &(data->numpop), &(data->loci), title); if(!(title[0] == '\0')) strcpy(options->title,title); break; case 'g': /* fall through if a menu change forces to analyze data instead of using the already sampled genealogies */ if (options->datatype == 'g') break; else memmove (input, input + 1, (strlen (input) - 1) * sizeof (char)); default: if(input[0]== '<') { usererror ("This data file may contain an XML or HTML tag,\nand cannot be read properly, check the data formatting section in the manual!"); exit(-1); } switch (options->datatype) { case 'a': sscanf (input, "%ld%ld%[^\n]", &(data->numpop), &(data->loci), title); find_delimiter (title, &data->dlm); if(!(title[0]==' ' || title[0] == '\0')) strcpy(options->title,title); break; case 'b': case 'm': sscanf (input, "%ld%ld%1s%[^\n]", &(data->numpop), &(data->loci), &(data->dlm), title); if(!(title[0]==' ' || title[0] == '\0')) strcpy(options->title,title); break; case 's': case 'n': case 'u': case 'f': sscanf (input, "%ld%ld%[^\n]", &(data->numpop), &(data->loci), title); if(!(title[0]==' ' || title[0] == '\0')) strcpy(options->title,title); break; default: usererror ("Datatype is wrong, please use a valid data type!"); } } options->datatype = lowercase (options->datatype); } void find_delimiter (char *title, char *dlm) { char *p = title; long z = 0; while (*p == ' ') { p++; z++; } if (isalnum (*p)) memmove (title, p, sizeof (char) * (strlen (title) - z)); else { *dlm = *p; p++; while (*p == ' ') { p++; z++; } memmove (title, p, sizeof (char) * (strlen (title) - z)); } } /* old sites reader, will be obsolete once the new is working correctly void read_sites (data_fmt * data) { long locus; char *input, *p, *a; input = (char *) mycalloc (LINESIZE, sizeof (char)); FGETS (input, LINESIZE, data->infile); if ((p = (char *) strpbrk (input, CRLF)) != NULL) *p = '\0'; p = input; for (locus = 0; locus < data->loci; locus++) { while (isspace ((int) *p)) p++; if (locus == 0) a = strtok (p, " "); else a = strtok (NULL, " "); data->seq->sites[locus] = atoi (a); if (data->seq->sites[locus] == 0) { warning ("This does look like sequence data\n"); warning ("I just read a number of sites=0\n"); warning ("If you use the wrong data type, the program\n"); usererror ("will crash anyway, so I stop now\n"); } } myfree(input); } */ //================================================================== // read the number of sites for each locus in the dataset // does not assume a fixed line length, but assumes that at the end of the line is either // a \n or \r or \r\l (similar to the sequence reader) to accommodate windows, mac and // unix line ends void read_sites (data_fmt * data) { long locus; char *input; input = (char *) mycalloc(data->loci * 20 ,sizeof(char)); for (locus = 0; locus < data->loci-1; locus++) { read_word(data->infile, input); if(input[0] == '#') { FGETS(input,LINESIZE,data->infile); locus--; continue; } data->seq->sites[locus] = atoi (input); if (data->seq->sites[locus] == 0) { warning ("This does look like sequence data\n"); warning ("I just read a number of sites=0\n"); warning ("If you use the wrong data type, the program\n"); usererror ("will crash anyway, so I stop now\n"); } } FGETS(input,LINESIZE,data->infile); while(input[0]=='#') FGETS(input,LINESIZE,data->infile); data->seq->sites[locus] = atoi (input); myfree(input); } /* void read_popheader (FILE * infile, data_fmt * data, long pop, long genomes) { char input[SUPERLINESIZE], *p, *tmp; long locus,lo; boolean havepopname=FALSE; FGETS (input, sizeof (input), infile); if ((p = (char *) strpbrk (input, CRLF)) != NULL) *p = '\0'; // allows that sequence data can have different numbers of individuals for different loci // data syntax changes: #ind1 #ind2 #IND3 .... pop_name havepopname=FALSE; if(data->loci>1) { tmp = strtok (input, " "); data->numind[pop][0] = atol(tmp); data->numalleles[pop][0] = data->numind[pop][0] * genomes; for(locus=1; locus< data->loci; locus++) { tmp = strtok(NULL," "); if(tmp!=NULL) { if(isdigit(tmp[0])) { data->numind[pop][locus] = atol(tmp); data->numalleles[pop][locus] = data->numind[pop][locus]*genomes; } else { havepopname=TRUE; strncpy(data->popnames[pop],tmp,MIN(strlen(tmp),80)); break; } } } if(!havepopname) { tmp = strtok(NULL," "); if(tmp!=NULL) strncpy(data->popnames[pop],tmp,MIN(strlen(tmp),80)); } // fills numind for additional locus in case the numind was not specified for(lo=locus; lo< data->loci; lo++) { data->numind[pop][lo] = data->numind[pop][locus-1]; data->numalleles[pop][lo] = data->numind[pop][lo] * genomes; } } else { // only one locus so we can use old scheme [see below] sscanf (input, "%ld%[^\n]", &(data->numind[pop][0]), data->popnames[pop]); data->numalleles[pop][0] = data->numind[pop][0]*genomes; } translate (data->popnames[pop], ' ', '_'); translate (data->popnames[pop], '\t', '_'); //trial // for(locus=0;locusloci; locus++) // { // if (data->numind[pop][locus] == 0) // data->hasghost = FALSE ;//TRUE; // } } */ void read_popheader (FILE * infile, data_fmt * data, long pop, long genomes) { boolean havepopname = FALSE; long minlength = 0; long lo; long locus; char *input; input = (char *) mycalloc(LINESIZE,sizeof(char)); // allows that sequence data can have different numbers of individuals for different loci // data syntax changes: #ind1 #ind2 #IND3 .... pop_name havepopname=FALSE; if(data->loci>1) { read_word(data->infile, input); while(input[0]=='#') { FGETS(input,LINESIZE,data->infile); read_word(data->infile, input); } data->numind[pop][0] = atol(input); data->numalleles[pop][0] = data->numind[pop][0] * genomes; for(locus=1; locus < data->loci; locus++) { read_word(infile, input); while(input[0]=='#') { FGETS(input,LINESIZE,data->infile); read_word(data->infile, input); } if(isdigit(input[0]) && havepopname == FALSE ) { data->numind[pop][locus] = atol(input); data->numalleles[pop][locus] = data->numind[pop][locus]*genomes; } else { unread_word(infile, input); FGETS(input,LINESIZE,infile); havepopname=TRUE; if(input!=NULL) { minlength = strlen(input); minlength = MIN(minlength,80); strncpy(data->popnames[pop],input,minlength); } break; } } if(!havepopname) { read_word(infile, input); if(input!=NULL) { minlength = strlen(input); minlength = MIN(minlength,80); strncpy(data->popnames[pop],input,minlength); } // strncpy(data->popnames[pop],input,MIN(strlen(input),80)); } // fills numind for additional locus in case the numind was not specified for(lo=locus; lo < data->loci; lo++) { data->numind[pop][lo] = data->numind[pop][locus-1]; data->numalleles[pop][lo] = data->numind[pop][lo] * genomes; } } else { // only one locus so we can use old scheme [see below] FGETS(input,LINESIZE,infile); while(input[0]=='#') { FGETS(input,LINESIZE,data->infile); } sscanf (input, "%ld%[^\n]", &(data->numind[pop][0]), data->popnames[pop]); data->numalleles[pop][0] = data->numind[pop][0]*genomes; } translate (data->popnames[pop], ' ', '_'); translate (data->popnames[pop], '\t', '_'); myfree(input); } void read_indname (FILE * file, data_fmt * data, long pop, long ind, long nmlength) { long i = 0; char ch; char input[LINESIZE]; while (i < nmlength) { ch = getc (file); while(ch =='#') { FGETS(input,LINESIZE,data->infile); ch = getc (file); } if(!strchr("\r\n",ch)) data->indnames[pop][ind][i++] = ch; if(strchr("\t",ch)) break; } data->indnames[pop][ind][nmlength] = '\0'; } void read_popdata (FILE * infile, data_fmt * data, long pop, option_fmt * options) { long ind, baseread = 0; long locus = 0; for (ind = 0; ind < data->numind[pop][0]; ind++) { read_indname (infile, data, pop, ind, options->nmlength); switch (options->datatype) { case 'a': case 'b': case 'm': if (data->dlm == '\0') read_alleles (infile, data, pop, ind); else read_microalleles (infile, data, pop, ind); break; case 's': case 'n': case 'u': case 'f': baseread = read_ind_seq (infile, data, options, locus, pop, ind, 0); break; default: usererror ("Wrong datatype, only the types a, m, s, n\n (electrophoretic alleles, \n microsatellite data,\n sequence data,\n SNP polymorphism)\n are allowed.\n"); break; } } if (!strchr (SEQUENCETYPES, options->datatype)) return; else { finish_read_seq (infile, data, options, pop, baseread); } } void read_microalleles (FILE * infile, data_fmt * data, long pop, long ind) { char *input, *isave, dlm[2], ddlm[2], *p, *a, *a1, *a2; long locus, i; input = (char *) mycalloc (1, sizeof (char) * (SUPERLINESIZE + 1)); isave = input; a = (char *) mycalloc (1, sizeof (char) * LINESIZE); a1 = (char *) mycalloc (1, sizeof (char) * LINESIZE); a2 = (char *) mycalloc (1, sizeof (char) * LINESIZE); dlm[0] = data->dlm, dlm[1] = '\0'; ddlm[0] = ' ', ddlm[1] = '\0'; FGETS (input, SUPERLINESIZE, infile); if ((p = (char *) strpbrk (input, CRLF)) != NULL) *p = '\0'; for (locus = 0; locus < data->loci; locus++) { while (isspace ((int) *input)) input++; if (input[0] == '\0') FGETS (input, SUPERLINESIZE, infile); i = 0; while (strchr(" \t",input[i])==NULL && input[i] != dlm[0]) { a1[i] = input[i]; i++; } a1[i] = '\0'; input += i; i = 0; if (input[i] == dlm[0]) { input++; while (strchr(" \t",input[i])==NULL && input[i] != '\0') { a2[i] = input[i]; i++; } a2[i] = '\0'; if (a2[0] == '\0') { strcpy (a2, a1); } input += i; } else { strcpy (a2, a1); } strcpy (data->yy[pop][ind][locus][0], a1); strcpy (data->yy[pop][ind][locus][1], a2); } myfree(a); myfree(a1); myfree(a2); myfree(isave); } void read_alleles (FILE * infile, data_fmt * data, long pop, long ind) { char *input, *isave, *p, *a; long locus; a = (char *) mycalloc (1, sizeof (char) * LINESIZE); input = (char *) mycalloc (1, sizeof (char) * SUPERLINESIZE); isave = input; FGETS (input, SUPERLINESIZE, infile); if ((p = (char *) strpbrk (input, CRLF)) != NULL) *p = '\0'; for (locus = 0; locus < data->loci; locus++) { while (isspace ((int) *input)) { input++; } if (sscanf (input, "%s", a) == 1) { input += (long) strlen (a); } data->yy[pop][ind][locus][0][0] = a[0]; data->yy[pop][ind][locus][0][1] = '\0'; if (a[1] == '\0') { data->yy[pop][ind][locus][1][0] = a[0]; data->yy[pop][ind][locus][1][1] = '\0'; } else { data->yy[pop][ind][locus][1][0] = a[1]; data->yy[pop][ind][locus][1][1] = '\0'; } } myfree(a); myfree(isave); } long read_ind_seq (FILE * infile, data_fmt * data, option_fmt * options, long locus, long pop, long ind, long baseread) { long j; char charstate; j = (options->interleaved) ? baseread : 0; charstate = getc (infile); ungetc ((int) charstate, infile); if(options->printdata) { fprintf(stdout,"%s|",data->indnames[pop][ind]); } while (j < data->seq->sites[locus] && !(options->interleaved && strchr(CRLF,charstate))) { charstate = getc (infile); if (strchr(CRLF,charstate)) { if (options->interleaved) { while(strchr(CRLF,charstate=getc(infile))) ; ungetc ((int) charstate, infile); return j; } else charstate = ' '; } if (charstate == ' ' || (charstate >= '0' && charstate <= '9') || charstate == '\\') continue; charstate = uppercase (charstate); if(options->printdata) { fprintf(stdout, "%c",charstate); } if ((strchr ("ABCDGHKMNRSTUVWXY?O-", (int) charstate)) == NULL) { printf ("ERROR: BAD BASE: %c AT POSITION %5ld OF INDIVIDUUM %3li in POPULATION %ld\n", charstate, j, ind+1, pop+1); printf ("Last complete sequences was in population %li, individual %li and locus %li:\n%s", pop + 1, ind - 1, locus, data->indnames[pop][ind - 1]); for (j = 0; j < data->seq->sites[locus]; j++) printf ("%c", data->yy[pop][ind - 1][locus][0][j]); exit (EXIT_FAILURE); } data->yy[pop][ind][locus][0][j++] = charstate; } charstate = getc (infile); /* swallow the \n or \r */ #ifndef MAC if (charstate == '\r') { charstate = getc (infile); /* swallow the \n */ if(charstate!='\n') ungetc((int) charstate, infile); } #endif if(options->printdata) { fprintf(stdout,"\n"); } return j; } void read_distance_fromfile (FILE * dfile, long tips, long nmlength, MYREAL **m) { char input[SUPERLINESIZE]; long i, j; if (dfile != NULL) { // assumes that the dfile is in PHYLIP format // and that all n x n cells are filled. FGETS (input, LONGLINESIZE, dfile); //reads header line with for (i = 0; i < tips; i++) // of individuals { //reads the population label FGETS (input, nmlength + 1, dfile); for (j = 0; j < tips; j++) { #ifdef USE_MYREAL_FLOAT fscanf (dfile, "%f", &m[i][j]); #else fscanf (dfile, "%lf", &m[i][j]); #endif if((i!=j) && (m[i][j] < EPSILON)) { warning("Reading geofile: adjusting dist[%li][%li]=%f to %f\n",i,j,m[i][j],EPSILON); m[i][j] = EPSILON; } } // reads the last \n from the // data matrix FGETS (input, LONGLINESIZE, dfile); } } } #ifdef UEP // uep function void read_uep_fromfile (FILE * uepfile, long tips, long nmlength, int **uep, long *uepsites, long datatype) { char input[LINESIZE]; long i, j; long thistips; if (uepfile != NULL) { // assumes that the uepfile is in PHYLIP format // and that all n cells are filled. FGETS (input, LINESIZE, uepfile); //reads header line with // of individuals and uep sites sscanf (input, "%li%li", &thistips, uepsites); if (thistips != tips) error ("UEP datafile and infile are inconsistent!"); if (strchr (SEQUENCETYPES, datatype)) { for (i = 0; i < tips; i++) { uep[i] = (int *) mycalloc (*uepsites, sizeof (int)); FGETS (input, nmlength + 1, uepfile); //reads each line for (j = 0; j < *uepsites; ++j) fscanf (uepfile, "%i", &uep[i][j]); // reads the last \n from the data matrix FGETS (input, LINESIZE, uepfile); } } else { for (i = 0; i < tips; i++) { uep[i] = (int *) mycalloc (*uepsites, sizeof (int)); uep[i + tips] = (int *) mycalloc (*uepsites, sizeof (int)); FGETS (input, nmlength + 1, uepfile); //reads each line for (j = 0; j < *uepsites; ++j) fscanf (uepfile, "%i", &uep[i][j]); // finished reading first allele, no onto the second for (j = 0; j < *uepsites; ++j) fscanf (uepfile, "%i", &uep[i + tips][j]); // reads the last \n from the data matrix FGETS (input, LINESIZE, uepfile); } } } } #endif void finish_read_seq (FILE * infile, data_fmt * data, option_fmt * options, long pop, long baseread) { long ind, baseread2 = 0, locus = 0; if (options->interleaved) { while (baseread < data->seq->sites[0]) { for (ind = 0; ind < data->numind[pop][0]; ind++) { baseread2 = read_ind_seq (infile, data, options, locus, pop, ind, baseread); } baseread = baseread2; } } for (locus = 1; locus < data->loci; locus++) { baseread = 0; for (ind = 0; ind < data->numind[pop][locus]; ind++) { read_indname (infile, data, pop, ind, options->nmlength); baseread = read_ind_seq (infile, data, options, locus, pop, ind, 0); } if (options->interleaved) { while (baseread < data->seq->sites[locus]) { for (ind = 0; ind < data->numind[pop][locus]; ind++) { baseread2 = read_ind_seq (infile, data, options, locus, pop, ind, baseread); } baseread = baseread2; } } } } long find_missing(data_fmt *data, long pop, long locus) { long missing = 0; long ind; for(ind=0; ind < data->numind[pop][locus]; ind++) { if(data->yy[pop][ind][locus][0][0]=='?') missing++; if(data->yy[pop][ind][locus][1][0]=='?') missing++; } return missing; } void print_data_summary (FILE * file, world_fmt * world, option_fmt * options, data_fmt * data) { long locus; long pop; long numind; long nummiss; char dstring[LINESIZE]; long *total; long *totalmiss; total = (long *) mycalloc(data->loci,sizeof(long)); totalmiss = (long *) mycalloc(data->loci,sizeof(long)); fprintf (file, "\nSummary of data:\n"); fprintf (file, "---------------\n"); switch (options->datatype) { case 'a': strcpy (dstring, "Allelic data"); break; case 'f': case 's': strcpy (dstring, "Sequence data"); break; case 'b': case 'm': strcpy (dstring, "Microsatellite data"); break; case 'n': case 'u': strcpy (dstring, "SNP data"); break; default: strcpy (dstring, "Unknown data [ERROR]"); break; } fprintf (file, "Datatype: %20s\n", dstring); fprintf (file, "Number of loci: %20li\n\n", data->loci); if (!strchr (SEQUENCETYPES, options->datatype)) { fprintf (file, "Population Locus Gene copies\n"); fprintf (file, " data (missing)\n"); } else fprintf (file, "Population Locus Gene copies\n"); fprintf (file, "----------------------------------------------------------------\n"); for (pop = 0; pop < world->numpop; pop++) { if (!strchr (SEQUENCETYPES, options->datatype)) { nummiss = find_missing(data,pop,0); numind = data->numalleles[pop][0] - nummiss; fprintf (file, "%3li %-40.40s %5li %6li (%li)\n", pop+1, data->popnames[pop], 1L , numind, nummiss); } else { nummiss = 0; numind = data->numind[pop][0]; fprintf (file, "%3li %-40.40s %5li %6li\n", pop+1, data->popnames[pop], 1L , numind); } total[0] += numind; totalmiss[0] += nummiss; for(locus=1; locus< data->loci; locus++) { if (!strchr (SEQUENCETYPES, options->datatype)) { nummiss = find_missing(data,pop,locus); numind = data->numalleles[pop][locus] - nummiss; fprintf (file," %5li %6li (%li)\n", locus+1, numind, nummiss); } else { nummiss=0; numind = data->numind[pop][locus]; fprintf (file," %5li %6li\n", locus+1, numind); } total[locus] += numind; totalmiss[locus] += nummiss; } } if (!strchr (SEQUENCETYPES, options->datatype)) { fprintf (file,"Total of all populations %5li %6li (%li)\n",1L, total[0], totalmiss[0]); for(locus=1; locus< data->loci; locus++) { fprintf (file," %5li %6li (%li)\n",locus+1, total[locus], totalmiss[locus]); } } else { fprintf (file,"Total of all populations %5li %6li\n",1L, total[0]); for(locus=1; locus< data->loci; locus++) { fprintf (file," %5li %6li\n",locus+1, total[locus]); } } fprintf(file,"\n"); myfree(total); myfree(totalmiss); fflush (file); } void print_data (world_fmt * world, option_fmt * options, data_fmt * data) { if (options->printdata) { switch (options->datatype) { case 'a': // if(options->dlm=='\0') // print_alleledata (world, data, options); //else print_microdata (world, data, options); break; case 'b': case 'm': print_microdata (world, data, options); break; case 's': case 'n': case 'u': case 'f': print_seqdata (world, options, data); break; } } } /// /// calculate allele frequency spectrum and print /// allele population1 .. populationN All void print_spectra(world_fmt * world, option_fmt * options,data_fmt * data) { long found; long locus; long a; long pop; long ind; MYREAL **total; MYREAL allfreq; long *maxalleles; long *maxallelepop; MYREAL ***freq; char *thisallele; char *thatallele; FILE *outfile = world->outfile; if (strchr (SEQUENCETYPES, options->datatype)) return; /*we do not calculate allele frequencies for DNA sequences*/ // find total number of alleles maxalleles = (long *) mycalloc(data->loci, sizeof(long)); maxallelepop = (long *) mycalloc(data->numpop, sizeof(long)); for (locus = 0; locus < data->loci; locus++) { maxalleles[locus] = findAllele(data,"\0",locus); //qsort((void *) data->allele[locus], maxalleles[locus], sizeof(char *), strcmp); } // create bins for each population and all freq = (MYREAL ***) mycalloc(data->numpop, sizeof(MYREAL **)); doublevec2d(&total,data->numpop,data->loci); for (pop = 0; pop < data->numpop; pop++) { freq[pop] = (MYREAL **) mycalloc(data->loci, sizeof(MYREAL *)); for (locus = 0; locus < data->loci; locus++) { freq[pop][locus] = (MYREAL *) mycalloc(maxalleles[locus], sizeof(MYREAL)); } } // calculate spectrum for (pop = 0; pop < data->numpop; pop++) { for (ind = 0; ind < data->numind[pop][0]; ind++) { for (locus = 0; locus < data->loci; locus++) { thisallele = data->yy[pop][ind][locus][0]; found = findAllele(data, thisallele, locus); thatallele = data->yy[pop][ind][locus][1]; if(!strcmp(thisallele,thatallele)) { freq[pop][locus][found] += 2.0 ; if(!strchr(thisallele,'?')) total[pop][locus] += 2.0 ; } else { freq[pop][locus][found] += 1.0 ; if(!strchr(thisallele,'?')) total[pop][locus] += 1.0 ; found = findAllele(data, thatallele, locus); freq[pop][locus][found] += 1.0 ; if(!strchr(thatallele,'?')) total[pop][locus] += 1.0 ; } } } } for (pop = 0; pop < data->numpop; pop++) { for (locus = 0; locus < data->loci; locus++) { for(a=0; a < maxalleles[locus]; a++) { freq[pop][locus][a] /= (MYREAL) total[pop][locus]; } } } // print in ascii fprintf(outfile,"Allele frequency spectra\n"); fprintf(outfile,"========================\n\n"); for (locus = 0; locus < data->loci; locus++) { memset(maxallelepop,'\0',sizeof(long)*data->numpop); fprintf(outfile,"Locus %li\n", locus); fprintf(outfile,"Allele "); for (pop = 0; pop < data->numpop; pop++) fprintf(outfile,"Pop%-2li ",pop+1); fprintf(outfile,"All\n-------"); for (pop = 0; pop < data->numpop+1; pop++) fprintf(outfile,"-------"); fprintf(outfile, "\n"); for(a=0; a < maxalleles[locus]; a++) { allfreq = 0.0; fprintf(outfile,"%6s ",data->allele[locus][a]); for (pop = 0; pop < data->numpop; pop++) { if(freq[pop][locus][a]>0.0) { maxallelepop[pop] += 1; fprintf(outfile," %1.3f ",freq[pop][locus][a]); allfreq += freq[pop][locus][a]; } else { fprintf(outfile," - "); } } fprintf(outfile," %1.3f\n", (MYREAL) allfreq/data->numpop); } fprintf(outfile,"Total "); for (pop = 0; pop < data->numpop; pop++) fprintf(outfile," %5li ",maxallelepop[pop]); fprintf(outfile," %5li\n\n", maxalleles[locus]); } // printd in PDF #ifdef PRETTY pdf_print_spectra(data, freq, maxalleles); #endif fflush(outfile); // cleanup for (pop = 0; pop < data->numpop; pop++) { for (locus = 0; locus < data->loci; locus++) { myfree(freq[pop][locus]); } myfree(freq[pop]); } myfree(freq); myfree(maxalleles); myfree(maxallelepop); free_doublevec2d(total); } void print_alleledata (world_fmt * world, data_fmt * data, option_fmt * options) { long i, pop, ind, locus, mult80; for (pop = 0; pop < data->numpop; pop++) { print_header (world->outfile, pop, world, options, data); for (ind = 0; ind < data->numind[pop][0]; ind++) { fprintf (world->outfile, "%-*.*s ", (int) options->nmlength, (int) options->nmlength, data->indnames[pop][ind]); mult80 = options->nmlength; for (locus = 0; locus < data->loci; locus++) { mult80 += 1 + (long) (strlen (data->yy[pop][ind][locus][0])); if (mult80 >= 80) { mult80 = 0; fprintf (world->outfile, "\n"); for (i = 0; i < options->nmlength; i++) FPRINTF(world->outfile, " "); } fprintf (world->outfile, " %c.%c", data->yy[pop][ind][locus][0][0], data->yy[pop][ind][locus][0][1]); } fprintf (world->outfile, "\n"); } fprintf (world->outfile, "\n"); } fprintf (world->outfile, "\n\n"); fflush (world->outfile); } void print_microdata (world_fmt * world, data_fmt * data, option_fmt * options) { long i, pop, ind, locus, mult80; for (pop = 0; pop < data->numpop; pop++) { print_header (world->outfile, pop, world, options, data); for (ind = 0; ind < data->numind[pop][0]; ind++) { fprintf (world->outfile, "%-*.*s ", (int) options->nmlength, (int) options->nmlength, data->indnames[pop][ind]); mult80 = options->nmlength; for (locus = 0; locus < data->loci; locus++) { mult80 += 1 + (long) (strlen (data->yy[pop][ind][locus][0]) + strlen (data->yy[pop][ind][locus][1])); if (mult80 >= 80) { mult80 = 0; fprintf (world->outfile, "\n"); for (i = 0; i < options->nmlength; i++) FPRINTF(world->outfile, " "); } fprintf (world->outfile, " %s.%-s", data->yy[pop][ind][locus][0], data->yy[pop][ind][locus][1]); } fprintf (world->outfile, "\n"); } fprintf (world->outfile, "\n"); } fprintf (world->outfile, "\n\n"); fflush (world->outfile); } void print_seqdata (world_fmt * world, option_fmt * options, data_fmt * data) { long pop, locus; for (pop = 0; pop < data->numpop; pop++) { print_header (world->outfile, pop, world, options, data); for (locus = 0; locus < data->loci; locus++) { print_locus_head (locus, world, options, data); print_seq_pop (locus, pop, world, options, data); } } fflush (world->outfile); } void print_header (FILE * outfile, long pop, world_fmt * world, option_fmt * options, data_fmt * data) { long i; long locus, mult80 = 80; char input[LINESIZE]; fprintf (outfile, "\n%-s", data->popnames[pop]); for (i = 0; i < (long) (80 - (long) strlen (data->popnames[pop])); i++) fprintf(world->outfile, "-"); fprintf (outfile, "\n\n"); if (!strchr (SEQUENCETYPES, options->datatype)) { fprintf (outfile, "%-s ", (data->loci == 1 ? "locus" : "loci ")); for (i = 0; i < (options->nmlength - 6); i++) fprintf(world->outfile, " "); for (locus = 0; locus < data->loci; locus++) { if (locus * 4 + options->nmlength > mult80) { mult80 += 80; fprintf (outfile, "\n"); for (i = 0; i < options->nmlength; i++) fprintf (outfile, " "); } fprintf (outfile, " %2li", locus + 1); } fprintf (outfile, "\n%-s\n", strncpy (input, "indiv.", options->nmlength)); } } void create_alleles (data_fmt * data, option_fmt *options) { long n=0; MYREAL mean=0.; MYREAL delta=0.; long locus, pop, ind; long z; char a1[DEFAULT_ALLELENMLENGTH]; char a2[DEFAULT_ALLELENMLENGTH]; for (locus = 0; locus < data->loci; locus++) { z = 0; for (pop = 0; pop < data->numpop; pop++) { for (ind = 0; ind < data->numind[pop][locus]; ind++) { strcpy (a1, data->yy[pop][ind][locus][0]); strcpy (a2, data->yy[pop][ind][locus][1]); if (!strcmp (a1, a2)) { addAllele (data, a1, locus, &z); } else { addAllele (data, a1, locus, &z); addAllele (data, a2, locus, &z); } } } data->maxalleles[locus] = z + 1; /* + 1: for all the unencountered alleles */ if(options->murates_fromdata) { if(options->mu_rates==NULL) { options->mu_rates = (MYREAL * ) mycalloc(data->loci,sizeof(MYREAL)); } options->mu_rates[locus] = z; n = n + 1; delta = options->mu_rates[locus] - mean; mean += delta/n; } } if(options->murates_fromdata) { options->muloci = data->loci; for (locus=0; locus < data->loci; locus++) { options->mu_rates[locus] /= mean; } } } void addAllele (data_fmt * data, char s[], long locus, long *z) { long found = 0; while ((data->allele[locus][found++][0] != '\0') && (strcmp (s, data->allele[locus][found - 1]))) ; if (found > (*z)) { strcpy (data->allele[locus][*z], s); (*z)++; } } void set_numind (data_fmt * data) { long locus, pop; for (locus = 1; locus < data->loci; locus++) { for (pop = 0; pop < data->numpop; pop++) { data->numind[pop][locus] = data->numind[pop][0]; data->numalleles[pop][locus] = data->numalleles[pop][0]; } } } void print_seq_pop (long locus, long pop, world_fmt * world, option_fmt * options, data_fmt * data) { long ind; for (ind = 0; ind < data->numalleles[pop][locus]; ind++) { print_seq_ind (locus, pop, ind, world, options, data); } } void print_seq_ind (long locus, long pop, long ind, world_fmt * world, option_fmt * options, data_fmt * data) { long site; char blank[2] = " "; fprintf (world->outfile, "%-*.*s", (int) options->nmlength, (int) options->nmlength, data->indnames[pop][ind]); fprintf (world->outfile, " %c", data->yy[pop][ind][locus][0][0]); for (site = 1; site < data->seq->sites[locus]; site++) { if ((site) % 60 == 0) { fprintf (world->outfile, "\n%-*.*s %c", (int) options->nmlength, (int) options->nmlength, blank, data->yy[pop][ind][locus][0][site]); } else { if ((site) % 10 == 0) { fprintf (world->outfile, " "); } fprintf (world->outfile, "%c", data->yy[pop][ind][locus][0][site]); } } fprintf (world->outfile, "\n"); } void print_locus_head (long locus, world_fmt * world, option_fmt * options, data_fmt * data) { char *head; head = (char *) mycalloc (1, sizeof (char) * MAX (10, options->nmlength)); sprintf (head, "Locus %li", locus); fprintf (world->outfile, "%-*.*s --------10 --------20 --------30", (int) options->nmlength, (int) options->nmlength, head); fprintf (world->outfile, " --------40 --------50 --------60\n"); myfree(head); } void read_geofile (data_fmt * data, option_fmt * options, long numpop) { long i, j, pop; long numpop2 = numpop * numpop; data->geo = (MYREAL *) mycalloc (1, sizeof (MYREAL) * numpop2); data->lgeo = (MYREAL *) mycalloc (1, sizeof (MYREAL) * numpop2); if (!options->geo) { for (i = 0; i < numpop2; i++) data->geo[i] = 1.0; } else { data->ogeo = (MYREAL **) mycalloc (1, sizeof (MYREAL *) * numpop); data->ogeo[0] = (MYREAL *) mycalloc (1, sizeof (MYREAL) * numpop2); for (pop = 1; pop < numpop; pop++) data->ogeo[pop] = data->ogeo[0] + numpop * pop; read_distance_fromfile (data->geofile, numpop, options->nmlength, data->ogeo); for (i = 0; i < numpop; i++) { for (j = 0; j < numpop; j++) { if(i!=j) { data->geo[mm2m (i, j, numpop)] = 1. / data->ogeo[i][j]; data->lgeo[mm2m (i, j, numpop)] = data->ogeo[i][j] > 0.0 ? LOG (1. / data->ogeo[i][j]) : -DBL_MAX; } } } } } /// /// read the file with the tip dates and returns the oldest date MYREAL read_date_fromfile (FILE * datefile, data_fmt *data, option_fmt *options, long nmlength) { char input[LINESIZE]; long pop; long locus; long ind; MYREAL oldest = 0. ; MYREAL temp; if (datefile != NULL) { FGETS (input, LINESIZE, datefile); //title line fprintf(stdout,"\n%i> Tip dates from file %s\n----------------------------------------\n", myID, options->datefilename); fprintf(stdout,"Generation time: %f\n", options->generation_year); for(pop=0;popnumpop;pop++) { FGETS (input, LINESIZE, datefile); //first populations for (locus=0; locus < data->loci; locus++) { fprintf(stdout,"Mutation rate of Locus %li: %g\n", locus, options->mutationrate_year[locus]); for(ind=0; ind < data->numind[pop][locus]; ind++) { FGETS (input, LINESIZE, datefile); sscanf (input+nmlength, "%lf",&temp); fprintf(stdout,"Locus %li: Tipdate %*.*s %f %f\n", locus, (int) nmlength, (int) nmlength, input, temp, temp * options->mutationrate_year[locus] * options->generation_year); data->sampledates[pop][locus][ind].date = temp; if(oldest < temp) oldest = temp; } } } } return oldest; } /// /// read the file with the tip dates void read_datefile (data_fmt * data, option_fmt * options, long numpop) { long locus, pop; data->sampledates = (tipdate_fmt ***) mycalloc (data->numpop, sizeof (tipdate_fmt **)); data->sampledates[0] = (tipdate_fmt **) mycalloc (data->numpop * data->loci, sizeof (tipdate_fmt *)); for (pop = 1; pop < data->numpop; pop++) { data->sampledates[pop] = data->sampledates[0] + data->loci * pop; } for(locus=0;locusloci;locus++) { for(pop=0;pop < data->numpop; pop++) { data->sampledates[pop][locus] = (tipdate_fmt*) mycalloc(data->numind[pop][locus],sizeof(tipdate_fmt)); } } if (options->has_datefile) { data->maxsampledate = read_date_fromfile (data->datefile, data, options, options->nmlength); // printf("%i> in data section maxsampledate=%f\n",myID, data->maxsampledate); } else { data->maxsampledate=0.0; } } #ifdef UEP void read_uepfile (data_fmt * data, option_fmt * options, long numpop) { long i; long sumtips = 0; if (!options->uep) return; for (i = 0; i < numpop; ++i) sumtips += data->numind[i][0]; //Assumes that UEP has the same number of individuals as // locus 1 (Is this OK? most dataset with UEP will have 1 locus?????) data->uep = (int **) mycalloc (number_genomes (options->datatype) * sumtips, sizeof (int *)); read_uep_fromfile (data->uepfile, sumtips, options->nmlength, data->uep, &data->uepsites, options->datatype); } #endif