/* * version 3.6. (c) Copyright 1993-2004 by the University of Washington. * Written by Joseph Felsenstein, Akiko Fuseki, Sean Lamont, Andrew Keeffe, * Mike Palczewski, Doug Buxton and Dan Fineman. Permission is granted to * copy and use this program provided no fee is charged for it and provided * that this copyright notice is not removed. */ /* * This file is a phylip to NEXUS converter that consists of code from PHYLIP * 3.6.5 (see copyright above) tweaked by Mark Holder to output NEXUS. * * This file was created by concatenating the headers, and .c files: * phylip.h, * seq.h, * discrete.h * phylip.c * seq.c, * pars.c, and * discrete.c concatenated * followed by removal of code that is unused in this simple program, and * the addition of routines for printing out NEXUS. */ #include #include #include #ifdef WIN32 # include # define DELIMITER '\\' #else # define DELIMITER '/' #endif #include #include #include /*structs and typedefs */ typedef char Char; typedef Char * MatrixRow; typedef MatrixRow * MatrixPtr; typedef unsigned char boolean; typedef struct TaxAndMatrix_t { char ** name_array; MatrixPtr matrix; unsigned long n_taxa; unsigned long n_char; unsigned int datatype; unsigned int max_name_len; } TaxAndMatrix_t; enum DatatypesEnum { kNoneDatatype = 0x00, kDNADatatype = 0x01, kRNADatatype = 0x02, kNucDatatype = 0x03, /* RNA or DNA */ kAADatatype = 0x04, kSequenceDatatype = 0x07, /* AA or nucleotide */ kDiscreteDatatype = 0x08, kUnknownDatatype = 0x0F }; enum InterleaveEnum { kIsInterleaved = 0x01, kIsNotInterleaved = 0x02, kInterleaveUnknown = 0x03 }; /* function protypes */ /* Public interface functions */ TaxAndMatrix_t * read_data(const char * infile_name, /* path to input file */ const int expectedDatatype, /* Specifies which types of data to try to read -- DatatypesEnum facets OR'd together.*/ const int interleaveStatus /* kIsInterleaved, kIsNoInterleaved or kInterleaveUnknown*/ ); /** Prints the data that has been read into tax_and_mat from the source 
`source` as NEXUS. `output_msg` is used for printing (out goes to stdout if gOutfilePtr is NULL). The `source` string is just used in a NEXUS comment, so an empty string may be passed in if the source of the data is not known. */ void print_NEXUS(const TaxAndMatrix_t * tax_and_mat, const char * source); void free_TaxAndMat(TaxAndMatrix_t *); void convert_names_to_NEXUS(const unsigned int n_taxa, Char ** name_array, const unsigned int max_name_len); /* IO */ void debug_msg(const char * msg); void output_msg(const char * msg); void error_msg(const char * msg); void error_exit(const char * msg, int); /* Utility */ void eat_separator(FILE * instream); boolean eoff(FILE *f); boolean eoln(FILE *f); char gettc(FILE* file); const char* get_command_name (const char *vektor); void open_file(FILE **fp,const char *filename, const char *filedesc, const char *mode); void * my_malloc(long x); void scan_eoln(FILE *f); static void crash_handler(int sig_num); void install_sig_handlers(); TaxAndMatrix_t * alloc_TaxAndMat(long spp, long chars); void read_name(FILE* infile, char ** taxNameArray, long i); MatrixPtr read_seq_non_interleaved(FILE * instream, TaxAndMatrix_t * taxAndMat, const char * symbols, const char * cell_name, const char other_gap); MatrixPtr read_seq_interleaved(FILE * instream, TaxAndMatrix_t * taxAndMat, const char * symbols, const char * cell_name, const char other_gap); MatrixPtr read_any_seq_non_interleaved(FILE * instream, TaxAndMatrix_t * taxAndMat); MatrixPtr read_any_seq_interleaved(FILE * instream, TaxAndMatrix_t * taxAndMat); MatrixPtr read_any_seq_data(FILE * instream, TaxAndMatrix_t * taxAndMat, boolean interleaved); MatrixPtr read_dna_data(FILE * instream, TaxAndMatrix_t * taxAndMat, boolean interleaved); MatrixPtr read_rna_data(FILE * instream, TaxAndMatrix_t * taxAndMat, boolean interleaved); MatrixPtr read_aa_data(FILE * instream, TaxAndMatrix_t * taxAndMat, boolean interleaved); MatrixPtr read_disc_data(FILE * instream, TaxAndMatrix_t * taxAndMat, 
boolean interleaved); unsigned replace_all(char *s, const char from_char, const char to_char); /* globals not used in struct definitions */ const char * gAppName = "phylip_to_nexus"; const char * gVersionStr = "0.0.1"; const char * gFlags = "-indrag"; /* unless otherwise stated, we assume the user is in drag*/ #define NMLNGTH 10 /* number of characters in species name */ #define FNMLNGTH 200 /* length of array to store a file name */ #define MAXNCH 20 const char * G_ANY_SEQ_SYMBOLS = "ABCDGHKMNRSVWXY?-"; const char * G_DNA_OR_PROT_SYMBOLS = "ABCDGHKMNRSTVWXY?-"; const char * G_JUST_AA_SYMBOLS = "EFILPQZ*"; const char * G_ALL_DNA_SYMBOLS = "ABCDGHKMNRSTVWXY?-"; const char * G_ALL_RNA_SYMBOLS = "ABCDGHKMNRSUVWXY?-"; const char * G_ALL_NUC_STATES = "ABCDGHKMNRSTUVWXY?-"; const char * G_ALL_AA_STATES = "ABCDEFGHIKLMNPQRSTVWXYZ*?-"; /* TEMPORARY: Some Phylip symbols are disallowed because they are awkward in NEXUS Really they should be escaped in a NEXUS friendly manner rather than disallowing them const char * G_ACCEPTED_SYMBOLS = "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"; */ const char * G_ACCEPTED_SYMBOLS = "!#$%&*+-0123456789?@ABCDEFGHIJKLMNOPQRSTUVWXYZ^abcdefghijklmnopqrstuvwxyz|~"; const char * G_GAP_NOT_DOT_MESSAGE = " Periods (.) 
may not be used as gap characters.\n The correct gap character is (-)"; const char * G_EMPTY_STRING = ""; #define MSG_BUFFER_LEN 512 char gLogMessage[MSG_BUFFER_LEN]; char gOutputMessage[MSG_BUFFER_LEN]; #define N_SYMBOLS 98 /*Phylip accepts up to 98 symbols for generic discrete data */ char gSymbolsSeen[N_SYMBOLS]; /*Error exit codes - these are the program's exit codes on failure */ const int kCouldNotReadFile = 1; const int kNoNumbersInfile = 2; const int kBadMemRequest = 3; const int kCouldNotAllocMem = 4; const int kUnexpectedEOLorEOF = 5; const int kBadTaxonName = 6; const int kFilePosError = 7; const int kBadCommandLineArgs = 8; const int kAssertionError = 9; const int kRemappingError = 10; /*The duplicate name disambiguation failed to produce a valid file*/ FILE * gLogfilePtr = 0L; FILE * gOutfilePtr = 0L; enum LogLevelsEnum {kDebugLevel, kWarnLevel, kErrorLevel}; int gCurrentLogThreshold = kWarnLevel; /* kErrorLevel to get errors, kWarnLevel to get warnings, kDebugLevel to get debugging output */ int kLastLogLevel = -1; /*This global is used as a flag to avoid prefixing long error messages with the program name multiple times. 
*/ const char * gLogLevelNames[] = {"DEBUG", "WARNING", "ERROR"}; void log_msg(int level, const char * msg); void log_msg(int level, const char * msg) { const char * level_snippet; if (gCurrentLogThreshold > level) return; if (kLastLogLevel == level) { if (gLogfilePtr) fprintf(gLogfilePtr, msg); else fprintf(stderr, msg); } else { assert(level >= kDebugLevel); assert(level <= kErrorLevel); level_snippet = gLogLevelNames[level]; if (gLogfilePtr) fprintf(gLogfilePtr, "%s %s: %s", gAppName, level_snippet, msg); else fprintf(stderr, "%s %s: %s", gAppName, level_snippet, msg); } kLastLogLevel = level; } void debug_msg(const char * msg) { log_msg(kDebugLevel, msg); } void warn_msg(const char * msg) { log_msg(kWarnLevel, msg); } void error_msg(const char * msg) { log_msg(kErrorLevel, msg); } void output_msg(const char * msg) { if (gOutfilePtr) fprintf(gOutfilePtr, msg); else printf(msg); kLastLogLevel = -1; /*Triggers the error level prefix in the next log event.*/ } void error_exit(const char * msg, int e_code) { error_msg(msg); exit(e_code); } unsigned replace_all(char *s, const char from_char, const char to_char) { unsigned i = 0; char *ch; ch = strchr(s, from_char); while ((long) ch != 0L) { ++i; *ch = to_char; ch = strchr(s, from_char); } return i; } /*begin hashing stuff*/ /* String hashing to allow for constant time checking if names are duplicated. Duplicate names are allowed by PHYLIP, but not NEXUS. */ /* doubly-linked list of names (strings big enough to hold PHYLIP taxa names). */ typedef struct tax_name_t { struct tax_name_t * next; struct tax_name_t * prev; char s[NMLNGTH+1]; /**The name (will be stored as sent to the insert... function, usually this will be capitalized). */ int n_occurrences; /**The number of times this name has been added to the hash*/ int first_occ_index; } tax_name_t ; /* hash table with buckets that point to tax_name_t structs Struct should be viewed as an opaque object by caller. 
*/
typedef struct str_hash_table_t {
  tax_name_t** buckets; /* array of `n_buckets` pointers; a null entry is an empty bucket */
  unsigned n_buckets; /* length of `buckets` */
  tax_name_t * name_holder_array; /* used to preallocate a block of tax_name_t structs to improve cache hits. These are only used once and then hang around until free_hash. Thus, lots of removals of strings would lead to poor performance*/
  unsigned nha_len; /* length of `name_holder_array`*/
  unsigned pos_in_nha; /*index of next free in tax_name_t in `name_holder_array`*/
} str_hash_table_t;

/*Hash table public interface */
str_hash_table_t * alloc_hash(unsigned n_buckets, unsigned n_strings);
tax_name_t * insert_str(str_hash_table_t * hash_t, const char * s);
tax_name_t * find_str(const str_hash_table_t * hash_t, const char * s);
void free_hash(str_hash_table_t * hash_t);

/*private functions */
/*String hashing function*/
unsigned long djb2_hash(const unsigned char *str);
/* Helpers that manage the per-bucket doubly-linked lists of tax_name_t. */
tax_name_t * alloc_list_entry(const char * s, str_hash_table_t * h);
tax_name_t * insert_in_linked_list(tax_name_t * list_entry, const char * s, const int move_prev, const int move_next, str_hash_table_t * hash_t);
/* NOTE: the first parameter is named `hash_t` here, but the definition takes a list entry (`list_entry`). */
tax_name_t * find_in_linked_list(tax_name_t * hash_t, const char * s, const int move_prev, const int move_next);
void free_list_entry(tax_name_t * le, const str_hash_table_t * hash_t);
void free_next_list_entry(tax_name_t * le, const str_hash_table_t * hash_t);
void free_prev_list_entry(tax_name_t * le, const str_hash_table_t * hash_t);
void init_tax_name_t(tax_name_t * le);
/* end hashing "header" */

/* begin hashing implementation */

/** Returns a pointer to a str_hash_table_t with `n_buckets` buckets.
  The pointer must be later passed to `free_hash` to free all memory.
*/ str_hash_table_t * alloc_hash(unsigned n_buckets, unsigned n_strings) { int i; tax_name_t ** b; tax_name_t *nha_ptr; assert(n_buckets > 0); assert(n_strings > 0); str_hash_table_t * h = (str_hash_table_t *) my_malloc((long)(sizeof(str_hash_table_t))); h->n_buckets = n_buckets; b = (tax_name_t **) my_malloc((long)(n_buckets*sizeof(tax_name_t *))); h->buckets = b; for (i = 0; i < n_buckets; ++i, ++b) *b = 0L; /*Create the name_holder_array that will be used when inserting new strings */ h->name_holder_array = (tax_name_t *) my_malloc((long)(n_strings*sizeof(tax_name_t))); nha_ptr = h->name_holder_array; h->nha_len = n_strings; h->pos_in_nha = 0; return h; } void free_hash(str_hash_table_t * h) { int i; tax_name_t ** b; tax_name_t * curr_bucket; if ((long)h == 0L) return; b = h->buckets; for (i = 0; i < h->n_buckets; ++i, ++b) { if (*b) { curr_bucket = *b; free_prev_list_entry(curr_bucket, h); free_next_list_entry(curr_bucket, h); free_list_entry(curr_bucket, h); } } free(h->name_holder_array); free(h->buckets); free(h); } tax_name_t * insert_str(str_hash_table_t * hash_t, const char * s) { unsigned h; tax_name_t ** b; # if defined (RECENTERING_BUCKET_LINKED_LISTS) && RECENTERING_BUCKET_LINKED_LISTS int collision_cmp; # endif tax_name_t *to_return; assert((long)hash_t != 0L); assert((long)hash_t->buckets != 0L); h = djb2_hash((unsigned char *)s) % hash_t->n_buckets; assert(h < hash_t->n_buckets); b = (hash_t->buckets + h); if ((long)*b == 0L) { *b = alloc_list_entry(s, hash_t); if (gCurrentLogThreshold <= kDebugLevel) { sprintf(gLogMessage, "New string %s with hash value=%d\n", s, h); debug_msg(gLogMessage); } return *b; } if (gCurrentLogThreshold <= kDebugLevel) { sprintf(gLogMessage, "Collision or duplicate label with %s and hash value=%d\n", s, h); debug_msg(gLogMessage); } to_return = insert_in_linked_list(*b, s, 1, 1, hash_t); # if defined (RECENTERING_BUCKET_LINKED_LISTS) && RECENTERING_BUCKET_LINKED_LISTS collision_cmp = strcmp((*b)->s, s); if (collision_cmp 
== 0) { sprintf(gLogMessage, "Repeated Name %s", s); debug_msg(gLogMessage); } else { /*move the bucket attachment point in the direction of the center of the list (this is not a good scheme, we may end up on the end if we always add in the same direction, but the linked lists should be pretty short). */ if (collision_cmp < 0) { assert((long)(*b)->next); *b = (*b)->next; } else { assert((long)(*b)->prev); *b = (*b)->prev; } } # endif return to_return; } tax_name_t * find_str(const str_hash_table_t * hash_t, const char * s) { unsigned h; tax_name_t ** b; assert((long)hash_t); assert((long)hash_t->buckets); h = djb2_hash((unsigned char *)s) % hash_t->n_buckets; assert(h < hash_t->n_buckets); b = (hash_t->buckets + h); /*empty bucket, so the string is not in the hash*/ if ((long)*b == 0L) return 0L; return find_in_linked_list(*b, s, 1, 1); } void init_tax_name_t(tax_name_t * le) { le->next = 0L; le->prev = 0L; le->n_occurrences = 1; le->first_occ_index = -1; le->s[0] = '\0'; } tax_name_t * alloc_list_entry(const char * s, str_hash_table_t * h) { tax_name_t * le; if (h->pos_in_nha >= h->nha_len) le = my_malloc((long)(sizeof(tax_name_t))); else le = (h->name_holder_array + h->pos_in_nha++); init_tax_name_t(le); strcpy(le->s, s); return le; } tax_name_t * insert_in_linked_list(tax_name_t * list_entry, const char * s, const int move_prev, const int move_next, str_hash_table_t * h) { tax_name_t * to_return; assert((long)list_entry != 0L); const int d = strcmp(list_entry->s, s); if (d == 0) { list_entry->n_occurrences += 1; return list_entry; } else if (d < 0) { if (move_next && (long)list_entry->next) { return insert_in_linked_list(list_entry->next, s, 0, 1, h); } else { to_return = alloc_list_entry(s, h); to_return->prev = list_entry; if ((long)list_entry->next) { to_return->next = list_entry->next; list_entry->next->prev = to_return; } list_entry->next = to_return; return to_return; } } else { if (move_prev && (long)list_entry->prev) { return 
insert_in_linked_list(list_entry->prev, s, 1, 0, h); } else { to_return = alloc_list_entry(s, h); to_return->next = list_entry; if ((long)list_entry->prev) { to_return->prev = list_entry->prev; list_entry->prev->next = to_return; } list_entry->prev = to_return; return to_return; } } } tax_name_t * find_in_linked_list(tax_name_t * list_entry, const char * s, const int move_prev, const int move_next) { assert((long)list_entry != 0L); const int d = strcmp(list_entry->s, s); if (d == 0) return list_entry; else if (d < 0) { if (move_next && (long)list_entry->next) return find_in_linked_list(list_entry->next, s, 0, 1); return 0L; } if (move_prev && (long)list_entry->prev) return find_in_linked_list(list_entry->prev, s, 1, 0); return 0L; } void free_prev_list_entry(tax_name_t * le, const str_hash_table_t * hash_t) { if ((long)le->prev == 0L) return; free_prev_list_entry(le->prev, hash_t); free_list_entry(le->prev, hash_t); le->prev = 0L; } void free_next_list_entry(tax_name_t * le, const str_hash_table_t * hash_t) { if ((long)le->next == 0L) return; free_next_list_entry(le->next, hash_t); free_list_entry(le->next, hash_t); le->next = 0L; } void free_list_entry(tax_name_t * le, const str_hash_table_t * hash_t) { tax_name_t * prealloc_beg = hash_t->name_holder_array; tax_name_t * prealloc_end = prealloc_beg + hash_t->pos_in_nha; if (((long)le >= (long)prealloc_beg) && ((long)le <= (long)prealloc_end)) return; /* this le was in the prealloc array, it will be freed with the hash table*/ free(le); } /* from http://www.cse.yorku.ca/~oz/hash.html this algorithm (k=33) was first reported by dan bernstein many years ago in comp.lang.c. another version of this algorithm (now favored by bernstein) uses xor: hash(i) = hash(i - 1) * 33 ^ str[i]; the magic of number 33 (why it works better than many other constants, prime or not) has never been adequately explained. 
*/ unsigned long djb2_hash(const unsigned char *str) { unsigned long hash = 5381; int c; while ((c = *str++)) hash = ((hash << 5) + hash) + c; /* hash * 33 + c */ return hash; } /*end hashing stuff*/ TaxAndMatrix_t * alloc_TaxAndMat(long spp, long chars) { int i,j; /* The "+6" is for the terminator and disambiguation # that may be appended (if there are clashes) The "2*" is a TEMPORARY hack to avoid tokenizing later (doubling ' to '' will increase the length) */ const int max_name_len = 6 + (2*NMLNGTH); Char * all_names; Char * curr_name; TaxAndMatrix_t * tnm; tnm = (TaxAndMatrix_t *) my_malloc((long)(sizeof(TaxAndMatrix_t))); tnm->max_name_len = max_name_len; tnm->n_taxa = spp; tnm->n_char = chars; tnm->datatype = kUnknownDatatype; assert(spp > 0); assert(chars > 0); tnm->matrix = (MatrixPtr) my_malloc((long)(spp * sizeof(Char *))); for (i = 0; i < spp; i++) { tnm->matrix[i] = (Char *) my_malloc((long)((1 + chars) * sizeof(Char))); tnm->matrix[i][chars] = '\0'; } tnm->name_array = (char **) my_malloc((long)(spp * sizeof(char *))); all_names = (char *) my_malloc((long)(spp * max_name_len * sizeof(Char))); tnm->name_array[0] = all_names; for (i = 0; i < spp; i++) { curr_name = all_names + (i*max_name_len); tnm->name_array[i] = curr_name; for (j = 0; j < max_name_len; ++j) { curr_name[j] = '\0'; } } return tnm; } void free_TaxAndMat(TaxAndMatrix_t *t) { int i; if ((long)t == 0L) return; if ((long)t->name_array != 0L) { free(t->name_array[0]); free(t->name_array); } if ((long)t->matrix != 0L) { for (i = 0; i < t->n_taxa; ++i) free(t->matrix[i]); free(t->matrix); } free(t); } static void crash_handler(int sig_num) { /* when we crash, lets print out something useful */ int haveSigEgv = 0; # ifdef SIGSEGV haveSigEgv = (sig_num == SIGSEGV ? 
1 : 0); # endif error_msg("ERROR: \n"); switch(sig_num) { # ifdef SIGSEGV case SIGSEGV: error_msg("This program has caused a Segmentation fault.\n"); break; # endif /* SIGSEGV */ # ifdef SIGFPE case SIGFPE: error_msg("This program has caused a Floating Point Exception\n"); break; # endif /* SIGFPE */ # ifdef SIGILL case SIGILL: error_msg("This program has attempted an illegal instruction\n"); break; # endif /* SIGILL */ # ifdef SIGPIPE case SIGPIPE: error_msg("This program tried to write to a broken pipe\n"); break; # endif /* SIGPIPE */ # ifdef SIGBUS case SIGBUS: error_msg("This program had a bus error\n"); break; # endif /* SIGBUS */ } if (haveSigEgv) { error_msg( " This may have been caused by an incorrectly formatted input file\n"); error_msg( " or input tree file. You should check those files carefully.\n"); error_msg(" If this seems to be a bug, please report it at the webpage http://ngbugz.sdsc.edu/default.php?pg=pgPublicEdit\n"); } else { error_msg(" Most likely, you have encountered a bug in the program.\n"); error_msg(" Since this seems to be a bug, please report it at the webpage http://ngbugz.sdsc.edu/default.php?pg=pgPublicEdit\n"); } error_msg(" with the name of the program, your computer system type,\n"); error_msg(" a full description of the problem, and with the input data file.\n"); error_msg(" (which should be in the body of the message, not as an Attachment).\n"); abort(); } /* eat white space -- if the separator line has spaces on it */ void eat_separator(FILE * instream) { char c; do { c = gettc(instream); } while (c == ' ' || c == '\t'); ungetc(c, instream); if (eoln(instream)) scan_eoln(instream); } boolean eoln(FILE *f) { /* check for end of line or eof*/ register int ch; ch = getc(f); if (ch == EOF) return 1; ungetc(ch, f); return ((ch == '\n') || (ch == '\r')); } /*eoln*/ boolean eoff(FILE *f) { /* check for end of file */ int ch; if (feof(f)) return 1; ch = getc(f); if (ch == EOF) { ungetc(ch, f); return 1; } ungetc(ch, f); return 0; } 
/*eoff*/ /*from phylip.c*/ const char* get_command_name (const char *vektor) { /* returns the name of the program from vektor without the whole path */ char *last_slash; /* Point to the last slash... */ last_slash = strrchr (vektor, DELIMITER); /* If there was a last slash, return the character after it */ /* If not, return the vector */ return (last_slash ? last_slash + 1: vektor); } /*get_command_name*/ char gettc(FILE* file) { /* catch eof's so that other functions not expecting an eof * won't have to worry about it */ int ch =getc(file); if (ch == EOF) error_exit("Unexpected End of File\n", kUnexpectedEOLorEOF); if ( ch == '\r' ) { ch = getc(file); if ( ch != '\n' ) ungetc(ch,file); ch = '\n'; } return ch; } /*from phylip.c*/ void install_sig_handlers() { /* initialization routine for all programs * anything done at the beginig for every program should be done here */ /* set up signal handler for * segfault,floating point exception, illeagal instruction, bad pipe, bus error * there are more signals that can cause a crash, but these are the most common * even these aren't found on all machines. 
*/ # ifdef SIGSEGV signal(SIGSEGV, crash_handler); # endif /* SIGSEGV */ # ifdef SIGFPE signal(SIGFPE, crash_handler); # endif /* SIGFPE */ # ifdef SIGILL signal(SIGILL, crash_handler); # endif /* SIGILL */ # ifdef SIGPIPE signal(SIGPIPE, crash_handler); # endif /* SIGPIPE */ # ifdef SIGBUS signal(SIGBUS, crash_handler); # endif /* SIGBUS */ } void read_name(FILE* infile, char ** taxNameArray, long i) { /* read in species name */ long j; long offset=0; char c; char * tn = taxNameArray[i]; for (j = 0; j < NMLNGTH; j++) { if (eoff(infile) | eoln(infile)) { error_msg("\n\nERROR: end-of-line or end-of-file"); sprintf(gLogMessage, " in the middle of species name for species %ld\n\n", i+1); error_exit(gLogMessage, kUnexpectedEOLorEOF); } c = gettc(infile); if ((long)strchr("():,;[]", c) != 0L) { error_msg("\nERROR: Species name may not contain characters ( ) : ; , [ ] \n"); sprintf(gLogMessage, " In name of species number %ld there is character %c\n\n", i+1, c); error_exit(gLogMessage, kBadTaxonName); } tn[j+offset] = c; } } /* if this returns NULL, then the read failed and gLogMessage will contain and error message. */ MatrixPtr read_any_seq_data(FILE * instream, TaxAndMatrix_t * taxAndMat, boolean interleaved) { MatrixPtr m; if (interleaved) m = read_any_seq_interleaved(instream, taxAndMat); else m = read_any_seq_non_interleaved(instream, taxAndMat); return m; } /* if this returns NULL, then the read failed and gLogMessage will contain and error message. */ MatrixPtr read_dna_data(FILE * instream, TaxAndMatrix_t * taxAndMat, boolean interleaved) { MatrixPtr m; if (interleaved) m = read_seq_interleaved(instream, taxAndMat, G_ALL_DNA_SYMBOLS, "base", 'O'); else m = read_seq_non_interleaved(instream, taxAndMat, G_ALL_DNA_SYMBOLS, "base", 'O'); if (m) taxAndMat->datatype = kDNADatatype; return m; } /* if this returns NULL, then the read failed and gLogMessage will contain and error message. 
*/ MatrixPtr read_rna_data(FILE * instream, TaxAndMatrix_t * taxAndMat, boolean interleaved) { MatrixPtr m; if (interleaved) m = read_seq_interleaved(instream, taxAndMat, G_ALL_RNA_SYMBOLS, "base", 'O'); else m = read_seq_non_interleaved(instream, taxAndMat, G_ALL_RNA_SYMBOLS, "base", 'O'); if (m) taxAndMat->datatype = kDNADatatype; return m; } /* if this returns NULL, then the read failed and gLogMessage will contain and error message. */ MatrixPtr read_aa_data(FILE * instream, TaxAndMatrix_t * taxAndMat, boolean interleaved) { MatrixPtr m; if (interleaved) m = read_seq_interleaved(instream, taxAndMat, G_ALL_AA_STATES, "amino acid", '\0'); else m = read_seq_non_interleaved(instream, taxAndMat, G_ALL_AA_STATES, "amino acid", '\0'); if (m) taxAndMat->datatype = kAADatatype; return m; } /* if this returns NULL, then the read failed and gLogMessage will contain and error message. */ MatrixPtr read_any_seq_non_interleaved(FILE * instream, TaxAndMatrix_t * taxAndMat) { long i, j; Char charstate; const char * err_tail; int added; const long n_taxa = taxAndMat->n_taxa; const long n_char = taxAndMat->n_char; MatrixPtr mat = taxAndMat->matrix; MatrixRow matRow = 0L; const char * curr_symbols = G_ANY_SEQ_SYMBOLS; int nuc_only_seen = 0; taxAndMat->datatype = kSequenceDatatype; eat_separator(instream); for (i = 1; i <= n_taxa; ++i) { matRow = mat[i - 1]; read_name(instream, taxAndMat->name_array, i - 1); j = 0; while (!eoff(instream)) { while (j < n_char && !(eoln(instream) || eoff(instream))) { charstate = gettc(instream); if (isspace(charstate) || (charstate >= '0' && charstate <= '9')) continue; if (islower(charstate)) charstate = toupper(charstate); if ((long)(strchr(curr_symbols, charstate)) == 0L) { added = 0; if (charstate == 'T') { /* U and T means mixed data type*/ if ((long)strchr(curr_symbols, 'U') == 0L) { if (nuc_only_seen == 0) { taxAndMat->datatype = (kAADatatype | kDNADatatype); curr_symbols = G_DNA_OR_PROT_SYMBOLS; } else { taxAndMat->datatype = kDNADatatype; 
curr_symbols = G_ALL_DNA_SYMBOLS; } } else { taxAndMat->datatype = kNucDatatype; curr_symbols = G_ALL_NUC_STATES; } added = 1; } else if (charstate == 'U') { /* U and T means mixed data type*/ if ((long)strchr(curr_symbols, 'L') == 0L) { if ((long)strchr(curr_symbols, 'T') == 0L) { taxAndMat->datatype = kRNADatatype; curr_symbols = G_ALL_RNA_SYMBOLS; } else { taxAndMat->datatype = kNucDatatype; curr_symbols = G_ALL_NUC_STATES; } added = 1; } } else if ((long)strchr(G_JUST_AA_SYMBOLS, charstate) != 0L) { /* Prot symbols and O or U means mixed data type*/ if ((long)(strchr(curr_symbols, 'U') == 0L) && (nuc_only_seen == 0)) { taxAndMat->datatype = kAADatatype; curr_symbols = G_ALL_AA_STATES; added = 1; } } else if (charstate == 'O') { if ((long)strchr(curr_symbols, 'L') == 0L) { taxAndMat->datatype = kNucDatatype; added = 1; nuc_only_seen = 1; charstate = '-'; } } if (!added) { err_tail = (charstate == '.' ? G_GAP_NOT_DOT_MESSAGE : G_EMPTY_STRING); sprintf(gLogMessage, "ERROR: bad symbol: %c at site %5ld of species %3ld\n%s\n", charstate, j + 1, i, err_tail); return 0L; } } j++; matRow[j - 1] = charstate; } if (j < n_char) scan_eoln(instream); else break; } if (j != n_char) { sprintf(gLogMessage, "\nERROR: sequences out of alignment at position %ld of species %ld\n\n", j + 1, i); return 0L; } scan_eoln(instream); } taxAndMat->datatype = kDNADatatype; return mat; } /* if this returns NULL, then the read failed and gLogMessage will contain and error message. 
*/
/* Interleaved reader for sequence data whose type is not known in advance.
   The file is read in passes over all taxa; names are read only in the first
   pass (basesread == 0). taxAndMat->datatype and the accepted symbol set are
   narrowed as type-specific symbols are seen. */
MatrixPtr read_any_seq_interleaved(FILE * instream, TaxAndMatrix_t * taxAndMat) {
  long i, j, basesread, basesnew = 0;
  Char charstate;
  const char * err_tail;
  const long n_taxa = taxAndMat->n_taxa;
  const long n_char = taxAndMat->n_char;
  MatrixPtr mat = taxAndMat->matrix;
  MatrixRow matRow = 0L;
  const char * curr_symbols = G_ANY_SEQ_SYMBOLS;
  int added;
  int nuc_only_seen = 0;
  taxAndMat->datatype = kSequenceDatatype;
  basesread = 0;
  for (;;) { /* one iteration per interleaved pass */
    eat_separator(instream);
    for (i = 1; i <= n_taxa; ++i) {
      matRow = mat[i - 1];
      if (basesread == 0)
        read_name(instream, taxAndMat->name_array, i - 1);
      j = basesread;
      while (j < n_char && !(eoln(instream) || eoff(instream))) {
        charstate = gettc(instream);
        /* whitespace and digits inside the sequence are skipped */
        if (isspace(charstate) || (charstate >= '0' && charstate <= '9'))
          continue;
        if (islower(charstate))
          charstate = toupper(charstate);
        if ((long)(strchr(curr_symbols, charstate)) == 0L) {
          /* symbol not valid for the types still possible; see whether it narrows the type */
          added = 0;
          if (charstate == 'T') {
            /* U and T means mixed data type*/
            if ((long)strchr(curr_symbols, 'U') == 0L) {
              if (nuc_only_seen == 0) {
                taxAndMat->datatype = (kAADatatype | kDNADatatype);
                curr_symbols = G_DNA_OR_PROT_SYMBOLS;
              }
              else {
                taxAndMat->datatype = kDNADatatype;
                curr_symbols = G_ALL_DNA_SYMBOLS;
              }
            }
            else {
              taxAndMat->datatype = kNucDatatype;
              curr_symbols = G_ALL_NUC_STATES;
            }
            added = 1;
          }
          else if (charstate == 'U') {
            /* U and T means mixed data type*/
            if ((long)strchr(curr_symbols, 'L') == 0L) {
              if ((long)strchr(curr_symbols, 'T') == 0L) {
                taxAndMat->datatype = kRNADatatype;
                curr_symbols = G_ALL_RNA_SYMBOLS;
              }
              else {
                taxAndMat->datatype = kNucDatatype;
                curr_symbols = G_ALL_NUC_STATES;
              }
              added = 1;
            }
          }
          else if ((long)strchr(G_JUST_AA_SYMBOLS, charstate) != 0L) {
            /* Prot symbols and O or U means mixed data type*/
            if ((long)(strchr(curr_symbols, 'U') == 0L) && (nuc_only_seen == 0)) {
              taxAndMat->datatype = kAADatatype;
              curr_symbols = G_ALL_AA_STATES;
              added = 1;
            }
          }
          else if (charstate == 'O') {
            if ((long)strchr(curr_symbols, 'L') == 0L) {
              taxAndMat->datatype = kNucDatatype;
              added = 1;
              nuc_only_seen = 1;
              charstate = '-'; /* 'O' is stored as a gap */
            }
          }
          if (!added) {
            err_tail = (charstate == '.' ? G_GAP_NOT_DOT_MESSAGE : G_EMPTY_STRING);
            sprintf(gLogMessage, "ERROR: bad symbol: %c at site %5ld of species %3ld\n%s\n", charstate, j + 1, i, err_tail);
            return 0L;
          }
        }
        j++;
        matRow[j - 1] = charstate;
      }
      /* every taxon must contribute the same number of cells in a pass */
      if (i == 1)
        basesnew = j;
      else if (j != basesnew) {
        sprintf(gLogMessage, "\nERROR: sequences out of alignment at position %ld of species %ld\n\n", j + 1, i);
        return 0L;
      }
      scan_eoln(instream);
    }
    basesread = basesnew;
    if (basesread == n_char) {
      return mat;
    }
  }
}

/* Non-interleaved reader for a fixed symbol set. `symbols` lists the accepted
   (upper-case) cell values, `cell_name` is used in error messages, and a cell
   equal to `other_gap` is stored as '-'. Sets taxAndMat->datatype to
   kDNADatatype on success.
   if this returns NULL, then the read failed and gLogMessage will contain and error message. */
MatrixPtr read_seq_non_interleaved(FILE * instream, TaxAndMatrix_t * taxAndMat, const char *symbols, const char * cell_name, const char other_gap) {
  long i, j;
  Char charstate;
  const char * err_tail;
  const long n_taxa = taxAndMat->n_taxa;
  const long n_char = taxAndMat->n_char;
  MatrixPtr mat = taxAndMat->matrix;
  MatrixRow matRow = 0L;
  eat_separator(instream);
  for (i = 1; i <= n_taxa; ++i) {
    matRow = mat[i - 1];
    read_name(instream, taxAndMat->name_array, i - 1);
    j = 0;
    /* a sequence may continue over several lines until n_char cells are read */
    while (!eoff(instream)) {
      while (j < n_char && !(eoln(instream) || eoff(instream))) {
        charstate = gettc(instream);
        if (isspace(charstate) || (charstate >= '0' && charstate <= '9'))
          continue;
        if (islower(charstate))
          charstate = toupper(charstate);
        if (((long)strchr(symbols, charstate)) == 0L) {
          if (charstate == other_gap)
            charstate = '-';
          else {
            err_tail = (charstate == '.' ? G_GAP_NOT_DOT_MESSAGE : G_EMPTY_STRING);
            sprintf(gLogMessage, "ERROR: bad %s: %c at site %5ld of species %3ld\n%s\n", cell_name, charstate, j + 1, i, err_tail);
            return 0L;
          }
        }
        j++;
        matRow[j - 1] = charstate;
      }
      if (j < n_char)
        scan_eoln(instream);
      else
        break;
    }
    if (j != n_char) {
      sprintf(gLogMessage, "\nERROR: sequences out of alignment at position %ld of species %ld\n\n", j + 1, i);
      return 0L;
    }
    scan_eoln(instream);
  }
  taxAndMat->datatype = kDNADatatype;
  return mat;
}

/* Interleaved reader for a fixed symbol set; parameters as for
   read_seq_non_interleaved. Does not modify taxAndMat->datatype.
   if this returns NULL, then the read failed and gLogMessage will contain and error message. */
MatrixPtr read_seq_interleaved(FILE * instream, TaxAndMatrix_t * taxAndMat, const char *symbols, const char * cell_name, char other_gap) {
  long i, j, basesread, basesnew = 0;
  Char charstate;
  const char * err_tail;
  const long n_taxa = taxAndMat->n_taxa;
  const long n_char = taxAndMat->n_char;
  MatrixPtr mat = taxAndMat->matrix;
  MatrixRow matRow = 0L;
  basesread = 0;
  for (;;) { /* one iteration per interleaved pass */
    eat_separator(instream);
    for (i = 1; i <= n_taxa; ++i) {
      matRow = mat[i - 1];
      if (basesread == 0) /* names appear only in the first pass */
        read_name(instream, taxAndMat->name_array, i - 1);
      j = basesread;
      while (j < n_char && !(eoln(instream) || eoff(instream))) {
        charstate = gettc(instream);
        if (isspace(charstate) || (charstate >= '0' && charstate <= '9'))
          continue;
        if (islower(charstate))
          charstate = toupper(charstate);
        if ((long)(strchr(symbols, charstate)) == 0L) {
          if (charstate == other_gap)
            charstate = '-';
          else {
            err_tail = (charstate == '.' ? G_GAP_NOT_DOT_MESSAGE : G_EMPTY_STRING);
            sprintf(gLogMessage, "ERROR: bad %s: %c at site %5ld of species %3ld\n%s\n", cell_name, charstate, j + 1, i, err_tail);
            return 0L;
          }
        }
        j++;
        matRow[j - 1] = charstate;
      }
      if (i == 1)
        basesnew = j;
      else if (j != basesnew) {
        sprintf(gLogMessage, "\nERROR: sequences out of alignment at position %ld of species %ld\n\n", j + 1, i);
        return 0L;
      }
      scan_eoln(instream);
    }
    basesread = basesnew;
    if (basesread == n_char) {
      return mat;
    }
  }
}

/* Reads generic discrete-character data (interleaved or not) and returns the
   matrix ptr of the TaxAndMatrix_t; if this returns NULL, then the read failed
   and gLogMessage will contain an error message.
   Cells are stored exactly as read (no case folding); only whitespace is
   skipped. Every symbol must appear in G_ACCEPTED_SYMBOLS.
   Fills in gSymbolsSeen as a side effect (each distinct symbol, in order of
   first appearance) and sets taxAndMat->datatype to kDiscreteDatatype on
   success. */
MatrixPtr read_disc_data(FILE * instream, TaxAndMatrix_t * taxAndMat, boolean interleaved) {
  long i, j, basesread, basesnew = 0;
  Char charstate;
  const char * symbols = G_ACCEPTED_SYMBOLS;
  const char * cell_name = "symbol";
  const char * err_tail;
  boolean allread, done;
  const long n_taxa = taxAndMat->n_taxa;
  const long n_char = taxAndMat->n_char;
  MatrixPtr mat = taxAndMat->matrix;
  int nSymbolsSeen = 0;
  for (i = 0; i < N_SYMBOLS; ++i)
    gSymbolsSeen[i] = '\0';
  basesread = 0;
  allread = 0;
  while (!(allread)) {
    eat_separator(instream);
    i = 1;
    while (i <= n_taxa) {
      /* interleaved files carry names only in the first pass */
      if (basesread == 0 || !interleaved)
        read_name(instream, taxAndMat->name_array, i - 1);
      j = (interleaved) ? basesread : 0;
      done = 0;
      while (!done && !eoff(instream)) {
        /* interleaved: exactly one line per taxon per pass */
        if (interleaved)
          done = 1;
        while (j < n_char && !(eoln(instream) || eoff(instream))) {
          charstate = gettc(instream);
          if (isspace(charstate))
            continue;
          if ((long)(strchr(gSymbolsSeen, charstate)) == 0L) {
            /* first time this symbol is seen: validate, then remember it */
            if ((long)(strchr(symbols, charstate)) == 0L) {
              err_tail = (charstate == '.' ? G_GAP_NOT_DOT_MESSAGE : G_EMPTY_STRING);
              sprintf(gLogMessage, "ERROR: bad %s: %c at site %5ld of species %3ld\n%s\n", cell_name, charstate, j + 1, i, err_tail);
              return 0L;
            }
            gSymbolsSeen[nSymbolsSeen++] = charstate;
          }
          j++;
          mat[i - 1][j - 1] = charstate;
        }
        if (interleaved)
          continue;
        if (j < n_char)
          scan_eoln(instream);
        else if (j == n_char)
          done = 1;
      }
      if (interleaved && i == 1)
        basesnew = j;
      scan_eoln(instream);
      if ((interleaved && j != basesnew) || (!interleaved && j != n_char)) {
        sprintf(gLogMessage, "\nERROR: sequences out of alignment at position %ld of species %ld\n\n", j + 1, i);
        return 0L;
      }
      i++;
    }
    if (interleaved) {
      basesread = basesnew;
      allread = (basesread == n_char);
    }
    else
      allread = (i > n_taxa);
  }
  taxAndMat->datatype = kDiscreteDatatype;
  return mat;
}

/* from phylip.c
   input the numbers of species and of characters.
   Exits with kNoNumbersInfile unless two positive integers can be read. */
void inputnumbers(FILE *instream, long *spp, long *chars) {
  if (fscanf(instream, "%ld%ld", spp, chars) != 2 || *spp <= 0 || *chars <= 0) {
    error_exit("ERROR: Unable to read the number of species or characters in data set\n"\
"The input file is incorrect (perhaps it was not saved text only).\n", kNoNumbersInfile);
  }
} /* inputnumbers */

void *my_malloc(long x) {
  /* wrapper for malloc, allowing error message if too little, too much.
     Requests of <= 0 or more than 1000000000 bytes exit with kBadMemRequest;
     an allocation failure exits with kCouldNotAllocMem, so the return value
     is never null. The block comes from calloc and is therefore zero-filled. */
  void *new_block = (void *)0L;
  if ((x <= 0) || (x > 1000000000)) {
    error_msg("ERROR: a function asked for an inappropriate amount of memory:");
    sprintf(gLogMessage, " %ld bytes\n", x);
    error_msg(gLogMessage);
    error_msg( " This can mean one of two things:\n"\
" 1. The input file is incorrect"\
" (perhaps it was not saved as Text Only),\n"\
" 2. There is a bug in the program.\n"\
" Please check your input file carefully.\n"\
" If it seems to be a bug, please mail joe@gs.washington.edu\n"\
" with the name of the program, your computer system type,\n"\
" a full description of the problem, and with the input data file.\n"\
" (which should be in the body of the message, not as an Attachment).\n");
    exit(kBadMemRequest);
  }
  new_block = (void *) calloc(1,x);
  if (!new_block)
    error_exit("Error allocating memory\n", kCouldNotAllocMem);
  return (void *) new_block;
} /* my_malloc */

void open_file( FILE **fp, const char *filename, const char *filedesc, const char *mode) {
  /* open a file, testing whether it exists etc. */
  /* NOTE(review): filemode holds at most a one-character `mode` plus the
     appended "b" and the terminator; a two-character mode such as "r+"
     would overflow it -- confirm that callers only pass "r", "w" or "a". */
  char filemode[3];
  strcpy(filemode,mode);
  strcat(filemode,"b");
  *fp = fopen(filename,filemode);
  if (!*fp) {
    switch (filemode[0]){
      case 'r':
        sprintf(gLogMessage, "Can't find %s \"%s\"\n", filedesc, filename);
        break;
      case 'w':
      case 'a':
        sprintf(gLogMessage, "Can't write %s \"%s\"\n", filedesc, filename);
        break;
      default:
        sprintf(gLogMessage, "There is some error in the call of open_file. 
Unknown mode.\n"); } error_exit(gLogMessage, kCouldNotReadFile); } } /* open_file */ void print_NEXUS(const TaxAndMatrix_t * tax_and_mat, const char * infile_name) { int i; const char * fmt_code; int std_format = 0; const char * equate; equate=""; sprintf(gOutputMessage, "#NEXUS\n"\ "\n[Converted from PHYLIP to NEXUS by the CIPRES %s tool version %s invoked with options = %s]\n", gAppName, gVersionStr, gFlags); output_msg(gOutputMessage); convert_names_to_NEXUS(tax_and_mat->n_taxa, tax_and_mat->name_array, tax_and_mat->max_name_len - 1); sprintf(gOutputMessage, "\nBEGIN Taxa;\n"\ " Dimensions NTax = %ld ;\n"\ " TaxLabels\n", tax_and_mat->n_taxa); output_msg(gOutputMessage); for (i= 0; i < tax_and_mat->n_taxa; ++i) { sprintf(gOutputMessage, " %s\n", tax_and_mat->name_array[i]); output_msg(gOutputMessage); } sprintf(gOutputMessage, " ;\nEND;\n\nBEGIN Characters;\n Dimensions NChar = %ld ;\n", tax_and_mat->n_char); output_msg(gOutputMessage); if (tax_and_mat->datatype == kNucDatatype) fmt_code = "nucleotide"; else if (tax_and_mat->datatype == kDNADatatype || tax_and_mat->datatype == (kAADatatype | kDNADatatype)) fmt_code = "DNA"; else if (tax_and_mat->datatype == kRNADatatype) fmt_code = "RNA"; else if (tax_and_mat->datatype == kAADatatype) { fmt_code = "Protein"; /* this is correct: equate="equate=\"X={ACDEFGHIKLMNPQRSTVWY*}\""; what follows is a workaround for an NCL bug: */ equate="equate=\"X={ACDEFGHIKLMNPQRSTVWY}\""; } else { std_format = 1; sprintf(gOutputMessage, " Format Datatype = Standard Symbols=\"%s\" missing = ? ;\n Matrix\n", gSymbolsSeen); } if (!std_format) sprintf(gOutputMessage, " Format Datatype = %s missing = ? 
gap = - %s ;\n Matrix\n", fmt_code, equate); output_msg(gOutputMessage); for (i= 0; i < tax_and_mat->n_taxa; ++i) { sprintf(gOutputMessage, "\n%-20s ", tax_and_mat->name_array[i]); output_msg(gOutputMessage); assert(strlen(tax_and_mat->matrix[i]) == tax_and_mat->n_char); output_msg(tax_and_mat->matrix[i]); } output_msg("\n;\nEND;\n"); } /* Assumes that names can be expanded up to length max_name_len when adding NEXUS tokenizing characters. If duplicate labels are found #1, #2,.... #n are appended */ void convert_names_to_NEXUS(const unsigned int n_taxa, char ** name_array, const unsigned max_name_len) { const unsigned int len_with_term = max_name_len + 1; char * curr_name_alias; char * scratch = (char *)my_malloc((long)(len_with_term*sizeof(char))); char * to_hash = (char *)my_malloc((long)(len_with_term*sizeof(char))); char * remap_scratch = (char *)my_malloc((long)(len_with_term*sizeof(char))); char * remap_tail = (char *)my_malloc((long)(len_with_term*sizeof(char))); char * fir_occ_name; long fir_occ_len; unsigned int i, j, k, needs_quotes, write_pos, add_now, spaces_skipped; int all_digit, prev_pound; /* will be 1 if the string is \d* or \d*# */ tax_name_t * name_entry; char raw_char, conv_char; const int n_buckets = (n_taxa > 1024 ? n_taxa : 1024); str_hash_table_t * hash_table = alloc_hash(n_buckets, n_taxa); for (i = 0; i < n_taxa; ++i) { curr_name_alias = name_array[i]; assert((long)curr_name_alias != 0L); needs_quotes = 0; write_pos = 0; spaces_skipped = 0; all_digit = 1; prev_pound = 0; for (j = 0;; ++j) { raw_char = (char) curr_name_alias[j]; if (raw_char == '\0') { to_hash[j] = '\0'; scratch[write_pos] = '\0'; break; } assert(j < len_with_term); if (j >= len_with_term) error_exit("Internal representation of the name is too long (this is a bug).", kAssertionError); add_now = 1; if (isalpha(raw_char) || raw_char == '.'){ conv_char = (islower(raw_char) ? 
toupper(raw_char) : raw_char); prev_pound = 0; all_digit = 0; } else if (isdigit(raw_char)){ conv_char = raw_char; if (prev_pound) all_digit = 0; prev_pound = 0; } else { conv_char = raw_char; if (raw_char == ' ') { spaces_skipped += 1; add_now = 0; prev_pound = 0; } else { needs_quotes = 1; if (raw_char == '#' && prev_pound == 0) prev_pound = 1; else { prev_pound = 0; all_digit = 0; } } } if (add_now != 0) { /*We deal with spaces this way so that we strip trailing whitespace (by not adding it unless there is another non-space character) later in the string. */ if (spaces_skipped > 0) { all_digit = 0; needs_quotes = 1; assert(write_pos + spaces_skipped < len_with_term); if (write_pos + spaces_skipped >= len_with_term) error_exit("Internal representation of the name is too long (this is a bug).", kAssertionError); for (k = 0; k < spaces_skipped; ++k) { scratch[write_pos + k] = ' '; to_hash[j - spaces_skipped + k] = ' '; } write_pos += spaces_skipped; spaces_skipped = 0; } scratch[write_pos++] = raw_char; if (raw_char == '\'') { assert(write_pos < len_with_term); if (write_pos >= len_with_term) error_exit("Internal representation of the name is too long (this is a bug).", kAssertionError); scratch[write_pos++] = raw_char; } to_hash[j] = conv_char; } } if (write_pos == 0) { all_digit = 0; needs_quotes = 1; } name_entry = insert_str(hash_table, to_hash); assert((long)name_entry != 0L); /* Check for things that don't fly in NEXUS: repeated names all digit names Try to fix these problems by appending # to avoid all-digit names, and #1, #2, #3,... #n to disambiguate repeated names We need to make sure that the modification does not cause a clash. If it does we bail out. 
*/ if (name_entry->n_occurrences == 1) { name_entry->first_occ_index = (int) i; /*Check for all digit taxon names and append a # */ if (all_digit == 1) { strcpy(remap_scratch, to_hash); if (prev_pound){ remap_scratch[strlen(remap_scratch) - 1] = '\0'; if ((long)find_str(hash_table, remap_scratch) != 0L) { sprintf(gLogMessage, "All digit name %s was remapped to %s#, but %s# has been encountered. Mapping to avoid all digit names failed.\n", remap_scratch, remap_scratch, remap_scratch); error_exit(gLogMessage, kRemappingError); } } else { strcat(remap_scratch, "#"); if ((long)find_str(hash_table, remap_scratch) != 0L) { sprintf(gLogMessage, "All digit name %s was remapped to %s, but %s name was already encountered. Mapping to avoid all digit names failed.\n", to_hash, remap_scratch, remap_scratch); error_exit(gLogMessage, kRemappingError); } needs_quotes = 1; strcat(scratch, "#"); sprintf(gLogMessage, "All digit names found. Mapped %s to %s#\n", curr_name_alias, curr_name_alias); warn_msg(gLogMessage); sprintf(gOutputMessage, " [ %s ] ", gLogMessage); output_msg(gOutputMessage); } } } else if (name_entry->n_occurrences == 2) { needs_quotes = 1; assert(write_pos + 2 < len_with_term); if (write_pos + 2 >= len_with_term) error_exit("Internal representation of the name is too long (this is a bug).", kAssertionError); strcat(scratch, "#2"); strcpy(remap_scratch, to_hash); strcat(remap_scratch, "#2"); if (find_str(hash_table, remap_scratch) != 0L) { sprintf(gLogMessage, "Duplicate names found and cannot remap %s, because %s already exists.\n", curr_name_alias, remap_scratch); error_exit(gLogMessage, kRemappingError); } fir_occ_name = name_array[name_entry->first_occ_index]; sprintf(gLogMessage, "Duplicate names found. 
Mapped %s to %s#1 and %s#2\n", curr_name_alias, fir_occ_name, curr_name_alias); warn_msg(gLogMessage); sprintf(gOutputMessage, " [ %s ] ", gLogMessage); output_msg(gOutputMessage); strcpy(remap_scratch, to_hash); strcat(remap_scratch, "#1"); if (find_str(hash_table, remap_scratch) != 0L) { sprintf(gLogMessage, "Duplicate names found and cannot remap %s, because %s already exists.\n", curr_name_alias, remap_scratch); error_exit(gLogMessage, kRemappingError); } fir_occ_len = strlen(fir_occ_name); assert(fir_occ_len + 4 < len_with_term); if (fir_occ_len + 4 >= len_with_term) error_exit("Internal representation of the name is too long (this is a bug).", kAssertionError); if (fir_occ_name[0] == '\'') strcpy(fir_occ_name + fir_occ_len - 1, "#1\'"); else { strcpy(remap_scratch, fir_occ_name); sprintf(fir_occ_name, "\'%s#1\'", to_hash); } } else { needs_quotes = 1; sprintf(remap_tail,"#%d", name_entry->n_occurrences); if (write_pos + strlen(remap_tail) >= len_with_term) { sprintf(gLogMessage, "Duplicate names found and cannot remap %s to %s%s. This program was not written in anticipation of this large number of name clashes\n", curr_name_alias, scratch, remap_tail); error_exit(gLogMessage, kRemappingError); } strcat(scratch, remap_tail); strcpy(remap_scratch, to_hash); strcat(remap_scratch, remap_tail); if (find_str(hash_table, remap_scratch) != 0L) { sprintf(gLogMessage, "Duplicate names found and cannot remap %s, because %s%s already exists.\n", curr_name_alias, remap_scratch, remap_tail); error_exit(gLogMessage, kRemappingError); } sprintf(gLogMessage, "Duplicate names found. 
Mapped %s to %s\n", curr_name_alias, scratch); warn_msg(gLogMessage); sprintf(gOutputMessage, " [ %s] ", gLogMessage); output_msg(gOutputMessage); } /* Now we are ready to quote those names that need them */ if (needs_quotes == 0) sprintf(curr_name_alias, "%s", scratch); else { assert(write_pos + 3 < len_with_term); if (write_pos + 3 >= len_with_term) error_exit("Internal representation of the name is too long (this is a bug).", kAssertionError); sprintf(curr_name_alias,"\'%s\'", scratch); } } free(remap_tail); free(remap_scratch); free(scratch); free(to_hash); free_hash(hash_table); } void scan_eoln(FILE *f) { /* eat everything to the end of line or eof*/ char ch; while (!eoff(f) && !eoln(f)) gettc(f); if (!eoff(f)) ch = gettc(f); } void getPosOrDie(FILE * f, fpos_t *fPos) { if (fgetpos(f,fPos)) error_exit("Error checking file position\n", kFilePosError); } void setPosOrDie(FILE * f, const fpos_t *fPos) { if (fsetpos(f, fPos)) error_exit("Error setting file position\n", kFilePosError); } TaxAndMatrix_t * read_data(const char * infile_name, const int expectedDatatype, const int interleaveStatus) { fpos_t fPos; FILE * infilePtr; TaxAndMatrix_t * taxAndMat; long n_taxa; long n_char; open_file(&infilePtr, infile_name, "input file", "r"); /* begin doinit from pars.c -- initializes variables */ inputnumbers(infilePtr, &n_taxa, &n_char); sprintf(gLogMessage, "%2ld species, %3ld sites\n\n", n_taxa, n_char); debug_msg(gLogMessage); taxAndMat = alloc_TaxAndMat(n_taxa, n_char); /* end doinit from pars.c -- initializes variables */ getPosOrDie(infilePtr, &fPos); if (kSequenceDatatype == (kSequenceDatatype & expectedDatatype)){ if (interleaveStatus & kIsInterleaved) { if (read_any_seq_data(infilePtr, taxAndMat, 1)) return taxAndMat; setPosOrDie(infilePtr, &fPos); } if (interleaveStatus & kIsNotInterleaved) { if (read_any_seq_data(infilePtr, taxAndMat, 0)) return taxAndMat; setPosOrDie(infilePtr, &fPos); } } else { if (expectedDatatype & kDNADatatype) { if (interleaveStatus & 
kIsInterleaved) { if (read_dna_data(infilePtr, taxAndMat, 1)) return taxAndMat; setPosOrDie(infilePtr, &fPos); } if (interleaveStatus & kIsNotInterleaved) { if (read_dna_data(infilePtr, taxAndMat, 0)) return taxAndMat; setPosOrDie(infilePtr, &fPos); } } if (expectedDatatype & kRNADatatype) { if (interleaveStatus & kIsInterleaved) { if (read_rna_data(infilePtr, taxAndMat, 1)) return taxAndMat; setPosOrDie(infilePtr, &fPos); } if (interleaveStatus & kIsNotInterleaved) { if (read_rna_data(infilePtr, taxAndMat, 0)) return taxAndMat; setPosOrDie(infilePtr, &fPos); } } if (expectedDatatype & kAADatatype) { if (interleaveStatus & kIsInterleaved) { if (read_aa_data(infilePtr, taxAndMat, 1)) return taxAndMat; setPosOrDie(infilePtr, &fPos); } if (interleaveStatus & kIsNotInterleaved) { if (read_aa_data(infilePtr, taxAndMat, 0)) return taxAndMat; setPosOrDie(infilePtr, &fPos); } } } if (expectedDatatype & kDiscreteDatatype) { if (interleaveStatus & kIsInterleaved) { if (read_disc_data(infilePtr, taxAndMat, 1)) return taxAndMat; setPosOrDie(infilePtr, &fPos); } if (interleaveStatus & kIsNotInterleaved) { if (read_disc_data(infilePtr, taxAndMat, 0)) return taxAndMat; setPosOrDie(infilePtr, &fPos); } } taxAndMat->matrix = 0L; /*MEMORY LEAK matrix, should be freed but we are signalling that the input was not read*/ return taxAndMat; } int main(int argc, char * argv[]) { /* * reads in spp, chars, and the data. 
Then calls maketree to * construct the tree */ const char * usage = " phylip_to_nexus [OPTIONS] INFILE [OUTFILE]\n"\ "The OPTIONS string starts with - followed by any of the following flags:\n"\ " i = support interleaved input\n"\ " n = support non-interleaved input\n"\ " a = support amino acid sequences\n"\ " d = support dna sequences\n"\ " r = support rna sequences\n"\ " g = support generic discrete output\n"\ " v = verbose output\n"\ " q = quiet mode (only output errors)\n"\ "Default behavior is to act as if all flags were requested (diagnose input type and interleaving).\n"; TaxAndMatrix_t * taxAndMat; const char * fn = 0L; const char * outfn = 0L; int i; char c; int interleaveBits = 0; int datatypeBits = 0; install_sig_handlers(); if (argc > 4) { sprintf(gLogMessage, "Expecting at most 3 command line arguments.\nUsage:\n%s", usage); error_exit(gLogMessage, kBadCommandLineArgs); } else if (argc == 4) { gFlags = argv[1]; fn = argv[2]; outfn = argv[3]; } else if (argc == 3) { if (strlen(argv[1]) > 0 && argv[1][0] == '-') { gFlags = argv[1]; fn = argv[2]; } else { fn = argv[1]; outfn = argv[2]; } } else if (argc == 2) { if (strlen(argv[1]) > 0 && argv[1][0] == '-') { gFlags = argv[1]; fn = "infile"; } else { fn = argv[1]; } } else fn = "infile"; if (strlen(gFlags) == 0 || gFlags[0] != '-') { sprintf(gLogMessage, "Expecting flags to start with -.\nUsage:\n%s", usage); error_exit(gLogMessage, kBadCommandLineArgs); } for (i = 1; i < strlen(gFlags); ++ i) { c = gFlags[i]; if (islower(c)) c = toupper(c); if (c == 'I') interleaveBits |= kIsInterleaved; else if (c == 'N') interleaveBits |= kIsNotInterleaved; else if (c == 'D') datatypeBits |= kDNADatatype; else if (c == 'R') datatypeBits |= kRNADatatype; else if (c == 'A') datatypeBits |= kAADatatype; else if (c == 'G') datatypeBits |= kDiscreteDatatype; else if (c == 'V') gCurrentLogThreshold = kDebugLevel; else if (c == 'Q') gCurrentLogThreshold = kErrorLevel; else if (c == 'H') { sprintf(gLogMessage, "%s version 
%s.\n"\ "The CIPRES project\'s tool for converting PHYLIP files to NEXUS.\n\n"\ "Based on the code base of PHYLIP 3.6.5 which is copyrighted by:\n"\ " Joseph Felsenstein, Akiko Fuseki, Sean Lamont, Andrew Keeffe, Mike Palczewski, Doug Buxton and Dan Fineman.\n"\ "Tweaks for CIPRES by Mark Holder.\n\n"\ "Usage:\n%s", gAppName, gVersionStr, usage); output_msg(gLogMessage); return 0; } else { sprintf(gLogMessage, "Unknown flag \"%c\"\n", gFlags[i]); error_exit(gLogMessage, kBadCommandLineArgs); } } if (datatypeBits == 0) { sprintf(gLogMessage, "At least one of the datatype options (d, r, a, or g) must be supplied.\nUsage:\n%s", usage); error_exit(gLogMessage, kBadCommandLineArgs); } if (outfn != 0L) open_file(&gOutfilePtr, outfn, "output file", "w"); taxAndMat = read_data(fn, datatypeBits, interleaveBits); if (taxAndMat == 0L || taxAndMat->matrix == 0L) { free_TaxAndMat(taxAndMat); error_msg(gLogMessage); sprintf(gLogMessage, "Could not parse %s.\n", fn); error_exit(gLogMessage, kCouldNotReadFile); } else { print_NEXUS(taxAndMat, fn); free_TaxAndMat(taxAndMat); } return 0; }