/*
 * version 3.6. (c) Copyright 1993-2004 by the University of Washington.
 * Written by Joseph Felsenstein, Akiko Fuseki, Sean Lamont, Andrew Keeffe,
 * Mike Palczewski, Doug Buxton and Dan Fineman. Permission is granted to
 * copy and use this program provided no fee is charged for it and provided
 * that this copyright notice is not removed.
 */


/*
 * This file is a phylip to NEXUS converter that consists of code from PHYLIP
 * 3.6.5 (see copyright above) tweaked by Mark Holder to output NEXUS.
 *
 * This file was created by concatenating the headers, and .c files:
 *	phylip.h, 
 *	seq.h,
 *	discrete.h 
 *	phylip.c
 *	seq.c, 
 *	pars.c, and 
 *	discrete.c concatenated 
 *	followed by removal of code that is unused in this simple program, and 
 * the addition of routines for printing out NEXUS.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#ifdef WIN32
#	include <windows.h>
#	define DELIMITER '\\'
#else
#	define DELIMITER '/'
#endif
#include <signal.h>
#include <ctype.h>
#include <assert.h>


/*structs and typedefs */
typedef char Char;             /* a single matrix cell / name character */
typedef Char * MatrixRow;      /* one taxon's row of cells */
typedef MatrixRow * MatrixPtr; /* array of rows, one per taxon */
typedef unsigned char boolean;
/* Taxon names plus the character matrix read from an input file.
 * Instances are created by alloc_TaxAndMat and released by free_TaxAndMat.
 */
typedef struct TaxAndMatrix_t {
  char ** name_array;        /* n_taxa pointers; alloc_TaxAndMat points them into one contiguous block of max_name_len-byte slots */
  MatrixPtr matrix;          /* n_taxa rows; alloc_TaxAndMat gives each row n_char cells plus a '\0' terminator */
  unsigned long n_taxa;      /* number of rows (species) */
  unsigned long n_char;      /* number of cells per row */
  unsigned int datatype;     /* DatatypesEnum value(s) describing the matrix contents */
  unsigned int max_name_len; /* size in bytes of each slot in name_array */
} TaxAndMatrix_t;

/* Bit flags describing the kind of data in a matrix.  The composite values
 * are the bitwise OR of the single-type flags they group together.
 */
enum DatatypesEnum {
	kNoneDatatype     = 0x00,
	kDNADatatype      = 0x01,
	kRNADatatype      = 0x02,
	kNucDatatype      = 0x03, /* RNA or DNA */
	kAADatatype       = 0x04,
	kSequenceDatatype = 0x07, /* AA or nucleotide */
	kDiscreteDatatype = 0x08,
	kUnknownDatatype  = 0x0F  /* every flag set: any of the above */
	};

/* Bit flags describing the layout of the input matrix. */
enum InterleaveEnum {
	kIsInterleaved = 0x01,
	kIsNotInterleaved = 0x02,
	kInterleaveUnknown = 0x03 /* both bits set: either layout is possible */
	};

/* function prototypes */
	/* Public interface functions */
TaxAndMatrix_t * read_data(const char * infile_name, /* path to input file */
					   const int expectedDatatype, /* Specifies which types of data to try to read -- DatatypesEnum facets OR'd together.*/
					   const int interleaveStatus /* kIsInterleaved, kIsNotInterleaved or kInterleaveUnknown */
					   );
/** Prints the data that has been read into tax_and_mat from the source `source` as NEXUS.
	`output_msg` is used for printing (output goes to stdout if gOutfilePtr is NULL).
	The `source` string is just used in a NEXUS comment, so an empty string may be passed in
		if the source of the data is not known.
*/
void print_NEXUS(const TaxAndMatrix_t * tax_and_mat, const char * source); 
/* Releases the names, the matrix rows and the struct itself; a NULL argument is ignored. */
void free_TaxAndMat(TaxAndMatrix_t *);

/* NOTE(review): the definition is outside this part of the file; presumably rewrites each name in place so that it is a legal NEXUS token -- confirm. */
void convert_names_to_NEXUS(const unsigned int n_taxa, Char ** name_array, const unsigned int max_name_len);


	/* IO */
/* The *_msg loggers forward to log_msg at the corresponding level. */
void debug_msg(const char * msg);
/* Writes msg to gOutfilePtr, or to stdout when gOutfilePtr is NULL. */
void output_msg(const char * msg);
void error_msg(const char * msg);
/* Logs msg as an error and then calls exit() with the given code. */
void error_exit(const char * msg, int);
	/* Utility */
/* Skips spaces and tabs, then consumes the end of line if one follows. */
void eat_separator(FILE * instream);
/* Non-zero when the stream is at end of file (peeks without consuming). */
boolean eoff(FILE *f);
/* Non-zero when the next character is '\n', '\r' or end of file (peeks without consuming). */
boolean eoln(FILE *f);
/* Reads one character, folding "\r" and "\r\n" into '\n'; exits the program on end of file. */
char gettc(FILE* file);
/* Returns the part of vektor that follows the last DELIMITER (all of vektor if there is none). */
const char* get_command_name (const char *vektor);
void open_file(FILE **fp,const char *filename, const char *filedesc, const char *mode);
void * my_malloc(long x);
void scan_eoln(FILE *f);

/* Signal handler that prints a diagnostic and aborts. */
static void crash_handler(int sig_num);
/* Registers crash_handler for the crash signals available on this platform. */
void install_sig_handlers();

/* Allocates a TaxAndMatrix_t for spp taxa and chars characters; release it with free_TaxAndMat. */
TaxAndMatrix_t * alloc_TaxAndMat(long spp, long chars);
/* Reads NMLNGTH name characters from infile into taxNameArray[i]. */
void read_name(FILE* infile, char ** taxNameArray, long i);
/* The read_* functions below fill taxAndMat->matrix from instream and return it,
   or return NULL after writing an explanation into gLogMessage. */
MatrixPtr read_seq_non_interleaved(FILE * instream, TaxAndMatrix_t * taxAndMat, const char * symbols, const char * cell_name, const char other_gap);
MatrixPtr read_seq_interleaved(FILE * instream, TaxAndMatrix_t * taxAndMat, const char * symbols, const char * cell_name, const char other_gap);
MatrixPtr read_any_seq_non_interleaved(FILE * instream, TaxAndMatrix_t * taxAndMat);
MatrixPtr read_any_seq_interleaved(FILE * instream, TaxAndMatrix_t * taxAndMat);
MatrixPtr read_any_seq_data(FILE * instream, TaxAndMatrix_t * taxAndMat, boolean interleaved);
MatrixPtr read_dna_data(FILE * instream, TaxAndMatrix_t * taxAndMat, boolean interleaved);
MatrixPtr read_rna_data(FILE * instream, TaxAndMatrix_t * taxAndMat, boolean interleaved);
MatrixPtr read_aa_data(FILE * instream, TaxAndMatrix_t * taxAndMat, boolean interleaved);
MatrixPtr read_disc_data(FILE * instream, TaxAndMatrix_t * taxAndMat, boolean interleaved);
/* Replaces every from_char in s with to_char and returns the number of replacements. */
unsigned replace_all(char *s, const char from_char, const char to_char);


/* globals not used in struct definitions */
const char * gAppName = "phylip_to_nexus"; /* printed by log_msg at the start of each new log entry */
const char * gVersionStr = "0.0.1";
/* NOTE(review): presumably the option letters accepted on the command line -- confirm against the argument parser. */
const char * gFlags = "-indrag"; /* unless otherwise stated, we assume the user is in drag*/

#define NMLNGTH			10	/* number of characters in species name	   */
#define FNMLNGTH 200 /* length of array to store a file name */
#define MAXNCH 20

const char * G_ANY_SEQ_SYMBOLS = "ABCDGHKMNRSVWXY?-";
const char * G_DNA_OR_PROT_SYMBOLS = "ABCDGHKMNRSTVWXY?-";
const char * G_JUST_AA_SYMBOLS = "EFILPQZ*";
const char * G_ALL_DNA_SYMBOLS = "ABCDGHKMNRSTVWXY?-";
const char * G_ALL_RNA_SYMBOLS = "ABCDGHKMNRSUVWXY?-";
const char * G_ALL_NUC_STATES = "ABCDGHKMNRSTUVWXY?-";
const char * G_ALL_AA_STATES = "ABCDEFGHIKLMNPQRSTVWXYZ*?-";
/*
	TEMPORARY:  Some Phylip symbols are disallowed because they are awkward in NEXUS
			Really they should be escaped in a NEXUS friendly manner rather than disallowing them
	const char * G_ACCEPTED_SYMBOLS = "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
*/
const char * G_ACCEPTED_SYMBOLS = "!#$%&*+-0123456789?@ABCDEFGHIJKLMNOPQRSTUVWXYZ^abcdefghijklmnopqrstuvwxyz|~";
const char * G_GAP_NOT_DOT_MESSAGE = "	   Periods (.) may not be used as gap characters.\n	   The correct gap character is (-)";
const char * G_EMPTY_STRING = "";

#define MSG_BUFFER_LEN 512
char gLogMessage[MSG_BUFFER_LEN];    /* scratch buffer in which log and error messages are formatted */
char gOutputMessage[MSG_BUFFER_LEN];


#define N_SYMBOLS 98 /*Phylip accepts up to 98 symbols for generic discrete data */
char gSymbolsSeen[N_SYMBOLS];


/*Error exit codes - these are the program's exit codes on failure */
/* Each constant below is passed to exit() (via error_exit) for one failure category. */
const int kCouldNotReadFile = 1;
const int kNoNumbersInfile = 2;
const int kBadMemRequest = 3;
const int kCouldNotAllocMem = 4;
const int kUnexpectedEOLorEOF = 5;
const int kBadTaxonName = 6;
const int kFilePosError = 7;
const int kBadCommandLineArgs = 8;
const int kAssertionError = 9;
const int kRemappingError = 10; /*The duplicate name disambiguation failed to produce a valid file*/


FILE * gLogfilePtr = 0L; /* destination for log_msg; stderr is used while this is NULL */
FILE * gOutfilePtr = 0L; /* destination for output_msg; stdout is used while this is NULL */

/* Severity levels in increasing order; also the indices into gLogLevelNames. */
enum LogLevelsEnum {kDebugLevel, kWarnLevel, kErrorLevel};
int gCurrentLogThreshold = kWarnLevel; /* kErrorLevel to get errors, 
										  kWarnLevel to get warnings, 
										  kDebugLevel to get debugging output 
									   */

int kLastLogLevel = -1; /*This global is used as a flag to avoid prefixing long error messages with the program name multiple times. */
const char * gLogLevelNames[] = {"DEBUG", "WARNING", "ERROR"};

/* Emits msg if level is at or above gCurrentLogThreshold. */
void log_msg(int level, const char * msg);

/* Writes `msg` to the log (gLogfilePtr, or stderr when that is NULL).
 *
 * Messages below gCurrentLogThreshold are dropped.  The first message of a
 * run of same-level messages is prefixed with "<app name> <LEVEL>: "; later
 * messages at that level are written as continuations without the prefix.
 *
 * `msg` is always written verbatim: it is never used as a printf format, so
 * a '%' in the text (for example a bad symbol echoed from the input file) is
 * harmless.
 */
void log_msg(int level, const char * msg)
{
	FILE * dest;

	if (gCurrentLogThreshold > level)
		return;

	dest = (gLogfilePtr ? gLogfilePtr : stderr);
	if (kLastLogLevel == level) {
		/* continuation of the previous entry: no prefix */
		fputs(msg, dest);
	}
	else {
		assert(level >= kDebugLevel);
		assert(level <= kErrorLevel);
		fprintf(dest, "%s %s: %s", gAppName, gLogLevelNames[level], msg);
	}
	kLastLogLevel = level;
}

/* Logs `msg` at debug severity. */
void debug_msg(const char * msg)
{
	const int severity = kDebugLevel;

	log_msg(severity, msg);
}

/* Logs `msg` at warning severity. */
void warn_msg(const char * msg)
{
	const int severity = kWarnLevel;

	log_msg(severity, msg);
}

/* Logs `msg` at error severity. */
void error_msg(const char * msg)
{
	const int severity = kErrorLevel;

	log_msg(severity, msg);
}

/* Writes `msg` to the program output (gOutfilePtr, or stdout when that is
 * NULL) and resets the log state so that the next log message gets its
 * "<app name> <LEVEL>: " prefix again.
 *
 * `msg` is written verbatim; it is not interpreted as a printf format, so
 * matrix symbols such as '%' are emitted unchanged.
 */
void output_msg(const char * msg)
{
	FILE * dest = (gOutfilePtr ? gOutfilePtr : stdout);

	fputs(msg, dest);
	kLastLogLevel = -1; /*Triggers the error level prefix in the next log event.*/
}

void error_exit(const char * msg, int e_code)
{
	error_msg(msg);
	exit(e_code);
}

/* Replaces every occurrence of `from_char` in the NUL-terminated string `s`
 * with `to_char`, in place, and returns the number of characters replaced.
 *
 * Edge cases:
 *   - `s` NULL or `from_char` '\0': nothing is changed and 0 is returned
 *     (the terminator is never overwritten).
 *   - `from_char == to_char`: the string is unchanged and the number of
 *     occurrences is returned.
 *   - `to_char` '\0': the string is cut at the first occurrence, 1 is returned.
 */
unsigned replace_all(char *s, const char from_char, const char to_char)
{
	unsigned n_replaced = 0;
	char *ch;

	if (s == NULL || from_char == '\0')
		return 0;

	/* resume each search just past the previous hit: one pass over the string */
	for (ch = strchr(s, from_char); ch != NULL; ch = strchr(ch + 1, from_char)) {
		*ch = to_char;
		++n_replaced;
		if (to_char == '\0')
			break; /* the string now ends here */
	}
	return n_replaced;
}


/*begin hashing stuff*/
/*
	String hashing to allow for constant time checking if names are duplicated.
	Duplicate names are allowed by PHYLIP, but not NEXUS.
*/

/* doubly-linked list of names (strings big enough to hold PHYLIP taxa names).
*/

/* One node of a bucket's doubly-linked list.  insert_in_linked_list keeps
   each list ordered by strcmp on `s`. */
typedef struct tax_name_t {
	struct tax_name_t * next; /* neighbour whose name compares greater, or NULL */
	struct tax_name_t * prev; /* neighbour whose name compares smaller, or NULL */
	char s[NMLNGTH+1]; /**The name (will be stored as sent to the insert... function, usually this will be capitalized). */
	int n_occurrences; /**The number of times this name has been added to the hash*/
	int first_occ_index; /* initialised to -1 by init_tax_name_t; NOTE(review): presumably the index of the first taxon with this name, set by the caller -- confirm */
} tax_name_t ;


/* 	hash table with buckets that point to tax_name_t structs 
	Struct should be viewed as an opaque object by caller.
*/
typedef struct str_hash_table_t {
	
	tax_name_t** buckets; /* n_buckets list attachment points; NULL marks an empty bucket */
	unsigned n_buckets;
	
	tax_name_t * name_holder_array; /* used to preallocate a block of tax_name_t structs to improve cache hits.
									   These are only used once and then hang around until free_hash.
									   Thus, lots of removals of strings would lead to poor performance*/
	unsigned nha_len;  /* length of `name_holder_array`*/
	unsigned pos_in_nha; /*index of next free in tax_name_t in `name_holder_array`*/
	
} str_hash_table_t;

	/*Hash table public interface */
/* Creates a table with n_buckets buckets and room preallocated for n_strings entries. */
str_hash_table_t * alloc_hash(unsigned n_buckets, unsigned n_strings);
/* Adds s (or bumps its n_occurrences if already present) and returns its entry. */
tax_name_t * insert_str(str_hash_table_t * hash_t, const char * s);
/* Returns the entry for s, or NULL if s has not been inserted. */
tax_name_t * find_str(const str_hash_table_t * hash_t, const char * s);
/* Releases the table and all of its entries; NULL is ignored. */
void free_hash(str_hash_table_t * hash_t);

	/*private functions */
	/*String hashing function*/
unsigned long djb2_hash(const unsigned char *str);
	/**/
/* Takes the next preallocated entry (or a heap entry once those run out) and stores s in it. */
tax_name_t * alloc_list_entry(const char * s, str_hash_table_t * h);
tax_name_t * insert_in_linked_list(tax_name_t * list_entry, const char * s, const int move_prev, const int move_next, str_hash_table_t * hash_t);
/* NB: the first parameter is a list entry, not a hash table, despite its name here. */
tax_name_t * find_in_linked_list(tax_name_t * hash_t, const char * s, const int move_prev, const int move_next);

/* Frees le unless it lives in the table's preallocated array. */
void free_list_entry(tax_name_t * le, const str_hash_table_t * hash_t);
/* Free every entry after (next) / before (prev) le; le itself is kept. */
void free_next_list_entry(tax_name_t * le, const str_hash_table_t * hash_t);
void free_prev_list_entry(tax_name_t * le, const str_hash_table_t * hash_t);
/* Resets le to an unlinked entry with an empty name and a count of 1. */
void init_tax_name_t(tax_name_t * le);


/* end  hashing "header" */
/* begin  hashing implementation */

/**	Returns a pointer to a str_hash_table_t with `n_buckets` empty buckets and
	a preallocated block of `n_strings` tax_name_t entries (entries beyond
	that number are allocated individually by alloc_list_entry).
	Both arguments must be greater than zero.
	The pointer must be later passed to `free_hash` to free all memory.
*/
str_hash_table_t * alloc_hash(unsigned n_buckets, unsigned n_strings) 
{
	unsigned i;
	str_hash_table_t * h;

	assert(n_buckets > 0);
	assert(n_strings > 0);

	h = (str_hash_table_t *) my_malloc((long)(sizeof(str_hash_table_t)));
	h->n_buckets = n_buckets;
	h->buckets = (tax_name_t **) my_malloc((long)(n_buckets*sizeof(tax_name_t *)));
	for (i = 0; i < n_buckets; ++i)
		h->buckets[i] = NULL; /* every bucket starts out empty */

		/*Create the name_holder_array that will be used when inserting new strings */
	h->name_holder_array = (tax_name_t *) my_malloc((long)(n_strings*sizeof(tax_name_t)));
	h->nha_len = n_strings;
	h->pos_in_nha = 0;

	return h;
}

/* Releases a table created by alloc_hash: every list entry reachable from a
 * bucket, the preallocated entry block, the bucket array and the table
 * itself.  A NULL table is ignored.
 */
void free_hash(str_hash_table_t * h)
{
	unsigned i;
	tax_name_t * anchor;

	if (h == NULL)
		return;
	for (i = 0; i < h->n_buckets; ++i) {
		anchor = h->buckets[i];
		if (anchor == NULL)
			continue;
		/* the bucket may point at the middle of its list: free both sides, then the anchor */
		free_prev_list_entry(anchor, h);
		free_next_list_entry(anchor, h);
		free_list_entry(anchor, h);
	}

	free(h->name_holder_array);
	free(h->buckets);
	free(h);
}


/* Inserts `s` into the table and returns its entry.
 *
 * If the bucket for `s` is empty a new entry becomes the bucket's list.
 * Otherwise the bucket's list is searched: an existing entry with the same
 * name has its n_occurrences incremented and is returned, else a new entry
 * is linked into the list in strcmp order.
 * When debug logging is enabled a line describing the outcome is logged.
 */
tax_name_t * insert_str(str_hash_table_t * hash_t, const char * s)
{
	unsigned h;
	tax_name_t ** b;
#		if defined (RECENTERING_BUCKET_LINKED_LISTS) && RECENTERING_BUCKET_LINKED_LISTS
		int collision_cmp;
#		endif
	tax_name_t *to_return;
	
	assert(hash_t != NULL);
	assert(hash_t->buckets != NULL);
	h = (unsigned)(djb2_hash((const unsigned char *)s) % hash_t->n_buckets);
	assert(h < hash_t->n_buckets);
	b = (hash_t->buckets + h);
	if (*b == NULL) {
		*b = alloc_list_entry(s, hash_t);
		if (gCurrentLogThreshold <= kDebugLevel) {
			snprintf(gLogMessage, MSG_BUFFER_LEN, "New string %s with hash value=%u\n", s, h);
			debug_msg(gLogMessage);
		}
		return *b;
	}
	if (gCurrentLogThreshold <= kDebugLevel) {
		snprintf(gLogMessage, MSG_BUFFER_LEN, "Collision or duplicate label with %s and hash value=%u\n", s, h);
		debug_msg(gLogMessage);
	}
	
	to_return = insert_in_linked_list(*b, s, 1, 1, hash_t);
#		if defined (RECENTERING_BUCKET_LINKED_LISTS) && RECENTERING_BUCKET_LINKED_LISTS
		collision_cmp = strcmp((*b)->s, s);
		if (collision_cmp == 0) {
			snprintf(gLogMessage, MSG_BUFFER_LEN, "Repeated Name %s", s);
			debug_msg(gLogMessage);
		}
		else {
			/*move the bucket attachment point in the direction of the center of the list
				(this is not a good scheme, we may end up on the end if we always add 
				in the same direction, but the linked lists should be pretty short).
			*/
			if (collision_cmp < 0) {
				assert((*b)->next != NULL);
				*b = (*b)->next;
			}
			else {
				assert((*b)->prev != NULL);
				*b = (*b)->prev;
			}
		}
#		endif
	return to_return;
}

/* Looks `s` up in the table.  Returns its entry, or NULL when the bucket for
 * `s` is empty or its list holds no entry with that name.
 */
tax_name_t * find_str(const str_hash_table_t * hash_t, const char * s)
{
	unsigned h;
	tax_name_t * anchor;
	
	assert(hash_t != NULL);
	assert(hash_t->buckets != NULL);
	h = (unsigned)(djb2_hash((const unsigned char *)s) % hash_t->n_buckets);
	assert(h < hash_t->n_buckets);
	anchor = hash_t->buckets[h];

	/*empty bucket, so the string is not in the hash*/
	if (anchor == NULL)
		return NULL;
	/* the bucket may point at the middle of its list, so search both directions */
	return find_in_linked_list(anchor, s, 1, 1);
}

/* Puts `le` into its initial state: unlinked, empty name, one occurrence,
 * and no first-occurrence index (-1).
 */
void init_tax_name_t(tax_name_t * le)
{
	le->s[0] = '\0';
	le->n_occurrences = 1;
	le->first_occ_index = -1;
	le->prev = 0L;
	le->next = 0L;
}

/* Returns an initialised, unlinked list entry holding `s`.
 *
 * The entry comes from the table's preallocated array while slots remain;
 * after that each entry is allocated individually with my_malloc (and is
 * later released by free_list_entry).
 *
 * The name field holds at most NMLNGTH characters.  A longer `s` is
 * truncated to fit instead of overrunning the struct; note that such an
 * entry can then only be matched by the truncated text.
 */
tax_name_t * alloc_list_entry(const char * s, str_hash_table_t * h)
{
	tax_name_t * le;
	if (h->pos_in_nha >= h->nha_len)
		le = my_malloc((long)(sizeof(tax_name_t)));
	else 
		le = (h->name_holder_array + h->pos_in_nha++);
	init_tax_name_t(le);
	/* bounded copy, always NUL-terminated */
	strncpy(le->s, s, sizeof(le->s) - 1);
	le->s[sizeof(le->s) - 1] = '\0';
	return le;
}
	
/* Inserts `s` into the strcmp-ordered doubly-linked list containing
 * `list_entry` and returns the entry for `s`.
 *
 * Starting at `list_entry` the list is walked towards larger names (only if
 * `move_next` is non-zero) or smaller names (only if `move_prev` is
 * non-zero).  Once a direction has been taken the walk never turns back.
 *   - an entry equal to `s` gets its n_occurrences incremented and is returned;
 *   - otherwise a new entry from alloc_list_entry is spliced in next to the
 *     entry at which the walk stopped, on the side that keeps the list ordered.
 */
tax_name_t * insert_in_linked_list(tax_name_t * list_entry, const char * s, const int move_prev, const int move_next, str_hash_table_t * h)
{
	tax_name_t * node = list_entry;
	tax_name_t * fresh;
	int may_go_prev = move_prev;
	int may_go_next = move_next;
	int d;

	for (;;) {
		assert(node != NULL);
		d = strcmp(node->s, s);
		if (d == 0) {
			node->n_occurrences += 1;
			return node;
		}
		if (d < 0) {
			/* node sorts before s: s belongs somewhere after it */
			if (may_go_next && node->next != NULL) {
				node = node->next;
				may_go_prev = 0;
				may_go_next = 1;
				continue;
			}
			fresh = alloc_list_entry(s, h);
			fresh->prev = node;
			if (node->next != NULL) {
				fresh->next = node->next;
				node->next->prev = fresh;
			}
			node->next = fresh;
			return fresh;
		}
		/* node sorts after s: s belongs somewhere before it */
		if (may_go_prev && node->prev != NULL) {
			node = node->prev;
			may_go_prev = 1;
			may_go_next = 0;
			continue;
		}
		fresh = alloc_list_entry(s, h);
		fresh->next = node;
		if (node->prev != NULL) {
			fresh->prev = node->prev;
			node->prev->next = fresh;
		}
		node->prev = fresh;
		return fresh;
	}
}

/* Searches the strcmp-ordered list containing `list_entry` for `s`.
 *
 * The walk heads towards larger names (only if `move_next` is non-zero) or
 * smaller names (only if `move_prev` is non-zero) and never reverses once a
 * direction is taken.  Returns the matching entry, or NULL if the walk ends
 * without finding one.
 */
tax_name_t * find_in_linked_list(tax_name_t * list_entry, const char * s, const int move_prev, const int move_next)
{
	tax_name_t * node = list_entry;
	int may_go_prev = move_prev;
	int may_go_next = move_next;
	int d;

	for (;;) {
		assert(node != NULL);
		d = strcmp(node->s, s);
		if (d == 0)
			return node;
		if (d < 0) {
			if (!may_go_next || node->next == NULL)
				return NULL;
			node = node->next;
			may_go_prev = 0;
			may_go_next = 1;
		}
		else {
			if (!may_go_prev || node->prev == NULL)
				return NULL;
			node = node->prev;
			may_go_prev = 1;
			may_go_next = 0;
		}
	}
}

/* Releases every entry that precedes `le` in its list (via free_list_entry)
 * and clears le->prev.  `le` itself is left untouched otherwise.
 * Iterative, so stack use does not grow with the length of the list.
 */
void free_prev_list_entry(tax_name_t * le, const str_hash_table_t * hash_t)
{
	tax_name_t * doomed = le->prev;
	tax_name_t * further;

	while (doomed != NULL) {
		further = doomed->prev; /* read the link before the node may be freed */
		free_list_entry(doomed, hash_t);
		doomed = further;
	}
	le->prev = NULL;
}

/* Releases every entry that follows `le` in its list (via free_list_entry)
 * and clears le->next.  `le` itself is left untouched otherwise.
 * Iterative, so stack use does not grow with the length of the list.
 */
void free_next_list_entry(tax_name_t * le, const str_hash_table_t * hash_t)
{
	tax_name_t * doomed = le->next;
	tax_name_t * further;

	while (doomed != NULL) {
		further = doomed->next; /* read the link before the node may be freed */
		free_list_entry(doomed, hash_t);
		doomed = further;
	}
	le->next = NULL;
}

/* Frees `le` if it was allocated individually.
 *
 * Entries handed out from the table's preallocated array occupy the slots
 * [name_holder_array, name_holder_array + pos_in_nha); those are released
 * together with the array in free_hash and must not be passed to free().
 * The upper bound is exclusive: the address one past the last used slot is
 * not part of the handed-out range.
 * Addresses are compared as uintptr_t so that no pointer bits are lost on
 * platforms where `long` is narrower than a pointer.
 */
void free_list_entry(tax_name_t * le, const str_hash_table_t * hash_t)
{
	const uintptr_t addr = (uintptr_t) le;
	const uintptr_t prealloc_beg = (uintptr_t) hash_t->name_holder_array;
	const uintptr_t prealloc_end = (uintptr_t) (hash_t->name_holder_array + hash_t->pos_in_nha);

	if (addr >= prealloc_beg && addr < prealloc_end)
		return; /* this le was in the prealloc array, it will be freed with the hash table*/
	free(le);
}
	

/* from http://www.cse.yorku.ca/~oz/hash.html 
this algorithm (k=33) was first reported by dan bernstein many years ago in
comp.lang.c. 
another version of this algorithm (now favored by bernstein) uses xor: 
	hash(i) = hash(i - 1) * 33 ^ str[i]; 
the magic of number 33 (why it works better than many other constants, 
prime or not) has never been adequately explained.
*/
/* djb2 string hash: starts from 5381 and, for each byte of the
 * NUL-terminated string, multiplies the running value by 33 and adds the
 * byte.  Arithmetic wraps in unsigned long.
 */
unsigned long djb2_hash(const unsigned char *str) {
	unsigned long acc = 5381;
	const unsigned char *p;

	for (p = str; *p != 0; ++p)
		acc = acc * 33 + *p;
	return acc;
}

/*end hashing stuff*/


/* Allocates a TaxAndMatrix_t for `spp` taxa and `chars` characters (both
 * must be greater than zero).
 *
 *   - matrix: `spp` rows of `chars` cells, each row followed by a '\0'
 *     terminator (the cells themselves are not initialised);
 *   - name_array: `spp` pointers into one contiguous, zero-filled block with
 *     max_name_len bytes per name;
 *   - datatype starts as kUnknownDatatype.
 *
 * Release the result with free_TaxAndMat.
 */
TaxAndMatrix_t * alloc_TaxAndMat(long spp, long chars)
{
	long i;
	/* The "+6" is for the terminator and  disambiguation # that may be appended (if there are clashes)
	   The "2*" is a TEMPORARY hack to avoid tokenizing later (doubling ' to '' will increase the length) 
	*/
	const long max_name_len = 6 + (2*NMLNGTH);
	Char * all_names;
	TaxAndMatrix_t * tnm;

	/* validate before the sizes are used in any computation */
	assert(spp > 0);
	assert(chars > 0);

	tnm = (TaxAndMatrix_t *) my_malloc((long)(sizeof(TaxAndMatrix_t)));
	tnm->max_name_len = (unsigned int) max_name_len;
	tnm->n_taxa = spp;
	tnm->n_char = chars;
	tnm->datatype = kUnknownDatatype;

	tnm->matrix = (MatrixPtr) my_malloc((long)(spp * sizeof(Char *)));
	for (i = 0; i < spp; i++) {
		tnm->matrix[i] = (Char *) my_malloc((long)((1 + chars) * sizeof(Char)));
		tnm->matrix[i][chars] = '\0';
	}

	tnm->name_array = (char **) my_malloc((long)(spp * sizeof(char *)));
	all_names = (char *) my_malloc((long)(spp * max_name_len * sizeof(Char)));
	memset(all_names, 0, (size_t)(spp * max_name_len) * sizeof(Char));
	/* name i lives at a fixed offset in the shared block; name_array[0] is the block itself */
	for (i = 0; i < spp; i++)
		tnm->name_array[i] = all_names + (i*max_name_len);
	return tnm;
}

/* Releases a TaxAndMatrix_t created by alloc_TaxAndMat: the shared name
 * block (reached through name_array[0]), the name pointer array, every
 * matrix row, the row array and finally the struct.  NULL is ignored.
 */
void free_TaxAndMat(TaxAndMatrix_t *t)
{
	unsigned long i;

	if (t == NULL)
		return;
	if (t->name_array != NULL) {
		free(t->name_array[0]); /* all names share this single allocation */
		free(t->name_array);
	}
	if (t->matrix != NULL) {
		for (i = 0; i < t->n_taxa; ++i)
			free(t->matrix[i]);
		free(t->matrix);
	}
	free(t);
}

/* Signal handler installed by install_sig_handlers.
 *
 * Logs (through error_msg) which signal was received and how to report the
 * problem, then calls abort().  For SIGSEGV the text suggests checking the
 * input files; for every other signal it suggests a bug in the program.
 * Signals without a case in the switch get only the generic text.
 */
static void crash_handler(int sig_num)
{ /* when we crash, lets print out something useful */
	int haveSigEgv = 0; /* set when the signal is SIGSEGV; selects the advice printed below */
#	ifdef SIGSEGV
		haveSigEgv = (sig_num == SIGSEGV ? 1 : 0);
#	endif 
	error_msg("ERROR:    \n");
	/* each case is compiled only where the platform defines that signal */
	switch(sig_num) {
#		ifdef SIGSEGV
			case SIGSEGV:
			error_msg("This program has caused a Segmentation fault.\n");
			break;
#		endif /* SIGSEGV */
#		ifdef SIGFPE
			case SIGFPE:
			error_msg("This program has caused a Floating Point Exception\n");
			break;
#		endif  /* SIGFPE */
#		ifdef SIGILL
			case SIGILL:
			error_msg("This program has attempted an illegal instruction\n");
			break;
#		endif  /* SIGILL */
#		ifdef SIGPIPE 
			case SIGPIPE:
			error_msg("This program tried to write to a broken pipe\n");
			break;
#		endif  /* SIGPIPE */
#		ifdef SIGBUS
			case SIGBUS:
			error_msg("This program had a bus error\n");
			break;
#		endif /* SIGBUS */
	}	
	if (haveSigEgv) {
		error_msg(
		"        This may have been caused by an incorrectly formatted input file\n");
		error_msg(
		"        or input tree file.	 You should check those files carefully.\n");
		error_msg("        If this seems to be a bug, please report it at the webpage http://ngbugz.sdsc.edu/default.php?pg=pgPublicEdit\n");
	}
	else {
		error_msg("        Most likely, you have encountered a bug in the program.\n");
		error_msg("        Since this seems to be a bug, please report it at the webpage http://ngbugz.sdsc.edu/default.php?pg=pgPublicEdit\n");
	}
	error_msg("        with the name of the program, your computer system type,\n");
	error_msg("        a full description of the problem, and with the input data file.\n");
	error_msg("        (which should be in the body of the message, not as an Attachment).\n");
	abort();
}


/* Skips any run of spaces and tabs on `instream`; if the skipped run is
 * followed by an end of line (or end of file), that is consumed as well via
 * scan_eoln.  The first non-blank character is pushed back onto the stream.
 * Reaching end of file while skipping ends the program (see gettc).
 */
void eat_separator(FILE * instream)
{
	char next = gettc(instream);

	while (next == ' ' || next == '\t')
		next = gettc(instream);

	ungetc(next, instream);

	if (eoln(instream))
		scan_eoln(instream);
}

/* Returns 1 if the next character of `f` is '\n' or '\r', or if the stream
 * is at end of file; 0 otherwise.  The character is only peeked at: it is
 * pushed back before returning.
 */
boolean eoln(FILE *f)
{
	const int peeked = getc(f);

	if (peeked == EOF)
		return 1;
	ungetc(peeked, f);
	if (peeked == '\n' || peeked == '\r')
		return 1;
	return 0;
}  /*eoln*/

/* Returns 1 if `f` is at end of file, 0 otherwise.  When the end-of-file
 * indicator is not yet set, one character is read and pushed back to find out.
 */
boolean eoff(FILE *f)
{
	int peeked;

	if (feof(f))
		return 1;
	peeked = getc(f);
	ungetc(peeked, f); /* has no effect when peeked is EOF */
	return (peeked == EOF) ? 1 : 0;
}  /*eoff*/


/*from phylip.c*/
/* Returns the program name contained in the path `vektor`: the text after
 * the last DELIMITER, or `vektor` itself when it contains no DELIMITER.
 * The result points into `vektor`; nothing is copied.
 */
const char* get_command_name (const char *vektor)
{
	const char *sep = strrchr(vektor, DELIMITER);

	if (sep == NULL)
		return vektor;
	return sep + 1;
}  /*get_command_name*/

/* Reads one character from `file`.
 *
 * End of file is treated as fatal: the program exits through error_exit with
 * kUnexpectedEOLorEOF, so callers need not check for it.
 * A carriage return is reported as '\n'; a line feed that directly follows
 * it is swallowed, so "\r", "\r\n" and "\n" all read as a single '\n'.
 */
char gettc(FILE* file) 
{
	int following;
	int ch = getc(file);

	if (ch == EOF)
		error_exit("Unexpected End of File\n", kUnexpectedEOLorEOF);

	if (ch != '\r')
		return ch;

	following = getc(file);
	if (following != '\n')
		ungetc(following, file);
	return '\n';
} 

/*from phylip.c*/
void install_sig_handlers() 
{ /* Registers crash_handler for segfault, floating point exception, illegal
   * instruction, broken pipe and bus error -- whichever of these signals
   * the platform defines.  There are more signals that can cause a crash,
   * but these are the most common.  */
	static const int trapped[] = {
#	ifdef SIGSEGV
		SIGSEGV,
#	endif /* SIGSEGV */
#	ifdef SIGFPE
		SIGFPE,
#	endif /* SIGFPE */
#	ifdef SIGILL
		SIGILL,
#	endif /* SIGILL */
#	ifdef SIGPIPE
		SIGPIPE,
#	endif /* SIGPIPE */
#	ifdef SIGBUS
		SIGBUS,
#	endif /* SIGBUS */
		0 /* end marker; signal numbers are positive */
	};
	const int *sig;

	for (sig = trapped; *sig != 0; ++sig)
		signal(*sig, crash_handler);
}

void read_name(FILE* infile, char ** taxNameArray, long i)
{
	/* read in species name */
	long j;
	long offset=0;
	char c;
	char * tn = taxNameArray[i];
	for (j = 0; j < NMLNGTH; j++) {
		if (eoff(infile) | eoln(infile)) {
			error_msg("\n\nERROR: end-of-line or end-of-file");
			sprintf(gLogMessage, " in the middle of species name for species %ld\n\n", i+1);
			error_exit(gLogMessage, kUnexpectedEOLorEOF);
		}
		c = gettc(infile);
		if ((long)strchr("():,;[]", c) != 0L) {
			error_msg("\nERROR: Species name may not contain characters ( ) : ; , [ ] \n");
			sprintf(gLogMessage, "		 In name of species number %ld there is character %c\n\n", i+1, c);
			error_exit(gLogMessage, kBadTaxonName);
		}
		tn[j+offset] = c;
	}
}

/* Reads sequence data of a not-yet-known type, dispatching on the layout:
	the interleaved reader when `interleaved` is non-zero, the
	non-interleaved reader otherwise.
	If this returns NULL, then the read failed and gLogMessage will contain
	an error message.
*/
MatrixPtr read_any_seq_data(FILE * instream, TaxAndMatrix_t * taxAndMat, boolean interleaved)
{
	if (interleaved)
		return read_any_seq_interleaved(instream, taxAndMat);
	return read_any_seq_non_interleaved(instream, taxAndMat);
}


/* Reads a DNA matrix (symbols from G_ALL_DNA_SYMBOLS, with 'O' accepted as
	an alternative gap) and, on success, marks taxAndMat as kDNADatatype.
	If this returns NULL, then the read failed and gLogMessage will contain
	an error message.
*/
MatrixPtr read_dna_data(FILE * instream, TaxAndMatrix_t * taxAndMat, boolean interleaved)
{
	MatrixPtr parsed = interleaved
		? read_seq_interleaved(instream, taxAndMat, G_ALL_DNA_SYMBOLS, "base", 'O')
		: read_seq_non_interleaved(instream, taxAndMat, G_ALL_DNA_SYMBOLS, "base", 'O');

	if (!parsed)
		return parsed;
	taxAndMat->datatype = kDNADatatype;
	return parsed;
}

/* Reads an RNA matrix (symbols from G_ALL_RNA_SYMBOLS, with 'O' accepted as
	an alternative gap) and, on success, marks taxAndMat as kRNADatatype.
	If this returns NULL, then the read failed and gLogMessage will contain
	an error message.
*/
MatrixPtr read_rna_data(FILE * instream, TaxAndMatrix_t * taxAndMat, boolean interleaved)
{
	MatrixPtr m;
	if (interleaved)
		m = read_seq_interleaved(instream, taxAndMat, G_ALL_RNA_SYMBOLS, "base", 'O');
	else
		m = read_seq_non_interleaved(instream, taxAndMat, G_ALL_RNA_SYMBOLS, "base", 'O');
	if (m)
		taxAndMat->datatype = kRNADatatype; /* the matrix was validated against the RNA symbol set */
	return m;	
}

/* Reads an amino-acid matrix (symbols from G_ALL_AA_STATES, no alternative
	gap character) and, on success, marks taxAndMat as kAADatatype.
	If this returns NULL, then the read failed and gLogMessage will contain
	an error message.
*/
MatrixPtr read_aa_data(FILE * instream, TaxAndMatrix_t * taxAndMat, boolean interleaved)
{
	MatrixPtr parsed = interleaved
		? read_seq_interleaved(instream, taxAndMat, G_ALL_AA_STATES, "amino acid", '\0')
		: read_seq_non_interleaved(instream, taxAndMat, G_ALL_AA_STATES, "amino acid", '\0');

	if (!parsed)
		return parsed;
	taxAndMat->datatype = kAADatatype;
	return parsed;
}

/* Reads a non-interleaved (sequential) matrix whose sequence type is not
	known in advance, narrowing taxAndMat->datatype as symbols are seen.

	For each taxon the name is read, then cells are collected (possibly over
	several lines) until n_char have been stored.  Whitespace and digits are
	skipped and letters are upper-cased.  Reading starts with
	G_ANY_SEQ_SYMBOLS; a symbol outside the current set either widens the
	set / narrows the datatype (T, U, amino-acid-only symbols, 'O' as a gap)
	or is rejected.  On success the datatype left in taxAndMat is the one
	that was detected (kSequenceDatatype if nothing narrowed it), exactly as
	in read_any_seq_interleaved.

	If this returns NULL, then the read failed and gLogMessage will contain
	an error message.
*/
MatrixPtr read_any_seq_non_interleaved(FILE * instream, TaxAndMatrix_t * taxAndMat)
{
	long			i, j;
	Char			charstate;
	const char * err_tail;
	int added;
	const long n_taxa = taxAndMat->n_taxa;
	const long n_char = taxAndMat->n_char;
	MatrixPtr mat = taxAndMat->matrix;
	MatrixRow matRow = 0L;
	const char * curr_symbols = G_ANY_SEQ_SYMBOLS;
	int nuc_only_seen = 0; /* set once 'O' (a nucleotide-only gap) has been accepted */
	taxAndMat->datatype = kSequenceDatatype;

	eat_separator(instream);
	
	
	for (i = 1; i <= n_taxa; ++i) {
		matRow = mat[i - 1];
		read_name(instream, taxAndMat->name_array, i - 1);
		j = 0;
		while (!eoff(instream)) {
			while (j < n_char && !(eoln(instream) || eoff(instream))) {
				charstate = gettc(instream);
				/* <ctype.h> functions require an unsigned char value */
				if (isspace((unsigned char) charstate) || (charstate >= '0' && charstate <= '9'))
					continue;
				if (islower((unsigned char) charstate))
					charstate = (Char) toupper((unsigned char) charstate);
				if (strchr(curr_symbols, charstate) == NULL) {
					added = 0;
					if (charstate == 'T') {
						/* U and T means mixed data type*/
						if (strchr(curr_symbols, 'U') == NULL) { 
							if (nuc_only_seen == 0) {
								taxAndMat->datatype = (kAADatatype | kDNADatatype);
								curr_symbols = G_DNA_OR_PROT_SYMBOLS;
							}
							else {
								taxAndMat->datatype = kDNADatatype;
								curr_symbols = G_ALL_DNA_SYMBOLS;
							}
						}
						else {
							taxAndMat->datatype = kNucDatatype;
							curr_symbols = G_ALL_NUC_STATES;
						}
						added = 1;
					}
					else if (charstate == 'U') {
						/* U and T means mixed data type*/
						if (strchr(curr_symbols, 'L') == NULL) { 
							if (strchr(curr_symbols, 'T') == NULL) { 
								taxAndMat->datatype = kRNADatatype;
								curr_symbols = G_ALL_RNA_SYMBOLS;
							}
							else {
								taxAndMat->datatype = kNucDatatype;
								curr_symbols = G_ALL_NUC_STATES;
							}
							added = 1;
						}
					}
					else if (strchr(G_JUST_AA_SYMBOLS, charstate) != NULL) {
						/* Prot symbols and O or U means mixed data type*/
						if ((strchr(curr_symbols, 'U') == NULL) && (nuc_only_seen == 0)) { 
							taxAndMat->datatype = kAADatatype;
							curr_symbols = G_ALL_AA_STATES;
							added = 1;
						}
					}
					else if (charstate == 'O') {
						/* 'O' is an alternative gap, allowed only while no amino-acid set is active */
						if (strchr(curr_symbols, 'L') == NULL) {
							taxAndMat->datatype = kNucDatatype;
							added = 1;
							nuc_only_seen = 1;
							charstate = '-';
						}
					}
					if (!added) {
						err_tail = (charstate == '.' ? G_GAP_NOT_DOT_MESSAGE : G_EMPTY_STRING);
						snprintf(gLogMessage, MSG_BUFFER_LEN, "ERROR: bad symbol: %c at site %5ld of species %3ld\n%s\n", charstate, j + 1, i, err_tail);
						return 0L;
					}
				}
				j++;
				matRow[j - 1] = charstate;
			}
			if (j < n_char)
				scan_eoln(instream); /* sequence continues on the next line */
			else
				break;
		}
		if (j != n_char) {
			snprintf(gLogMessage, MSG_BUFFER_LEN, "\nERROR: sequences out of alignment at position %ld of species %ld\n\n", j + 1, i);
			return 0L;
		}
		scan_eoln(instream);
	}
	/* the detected datatype is kept as is (it is not forced to DNA here) */
	return mat;
}

/* Reads an interleaved matrix whose sequence type is not known in advance,
	narrowing taxAndMat->datatype as symbols are seen.

	The file is read block by block.  In each block one line per taxon is
	consumed; names are read only in the first block.  Whitespace and digits
	are skipped and letters are upper-cased.  Every taxon must contribute the
	same number of cells to a block as the first taxon did.  Reading stops
	when n_char cells have been stored for every taxon.
	Symbol handling starts with G_ANY_SEQ_SYMBOLS; a symbol outside the
	current set either widens the set / narrows the datatype (T, U,
	amino-acid-only symbols, 'O' as a gap) or is rejected.

	If this returns NULL, then the read failed and gLogMessage will contain
	an error message.
*/
MatrixPtr read_any_seq_interleaved(FILE * instream, TaxAndMatrix_t * taxAndMat)
{
	long			i, j, basesread, basesnew = 0;
	Char			charstate;
	const char * err_tail;
	
	const long n_taxa = taxAndMat->n_taxa;
	const long n_char = taxAndMat->n_char;
	MatrixPtr mat = taxAndMat->matrix;
	MatrixRow matRow = 0L;
	const char * curr_symbols = G_ANY_SEQ_SYMBOLS;
	int added;
	int nuc_only_seen = 0; /* set once 'O' (a nucleotide-only gap) has been accepted */
	
	taxAndMat->datatype = kSequenceDatatype;

	basesread = 0;
	for (;;) {
		eat_separator(instream);

		for (i = 1; i <= n_taxa; ++i) {
			matRow = mat[i - 1];
			if (basesread == 0)
				read_name(instream, taxAndMat->name_array, i - 1);
			j = basesread;
			while (j < n_char && !(eoln(instream) || eoff(instream))) {
				charstate = gettc(instream);
				/* <ctype.h> functions require an unsigned char value */
				if (isspace((unsigned char) charstate) || (charstate >= '0' && charstate <= '9'))
					continue;
				if (islower((unsigned char) charstate))
					charstate = (Char) toupper((unsigned char) charstate);
				if (strchr(curr_symbols, charstate) == NULL) {
					added = 0;
					if (charstate == 'T') {
						/* U and T means mixed data type*/
						if (strchr(curr_symbols, 'U') == NULL) { 
							if (nuc_only_seen == 0) {
								taxAndMat->datatype = (kAADatatype | kDNADatatype);
								curr_symbols = G_DNA_OR_PROT_SYMBOLS;
							}
							else {
								taxAndMat->datatype = kDNADatatype;
								curr_symbols = G_ALL_DNA_SYMBOLS;
							}
						}
						else {
							taxAndMat->datatype = kNucDatatype;
							curr_symbols = G_ALL_NUC_STATES;
						}
						added = 1;
					}
					else if (charstate == 'U') {
						/* U and T means mixed data type*/
						if (strchr(curr_symbols, 'L') == NULL) { 
							if (strchr(curr_symbols, 'T') == NULL) { 
								taxAndMat->datatype = kRNADatatype;
								curr_symbols = G_ALL_RNA_SYMBOLS;
							}
							else {
								taxAndMat->datatype = kNucDatatype;
								curr_symbols = G_ALL_NUC_STATES;
							}
							added = 1;
						}
					}
					else if (strchr(G_JUST_AA_SYMBOLS, charstate) != NULL) {
						/* Prot symbols and O or U means mixed data type*/
						if ((strchr(curr_symbols, 'U') == NULL) && (nuc_only_seen == 0)) { 
							taxAndMat->datatype = kAADatatype;
							curr_symbols = G_ALL_AA_STATES;
							added = 1;
						}
					}
					else if (charstate == 'O') {
						/* 'O' is an alternative gap, allowed only while no amino-acid set is active */
						if (strchr(curr_symbols, 'L') == NULL) {
							taxAndMat->datatype = kNucDatatype;
							added = 1;
							nuc_only_seen = 1;
							charstate = '-';
						}
					}
					if (!added) {
						err_tail = (charstate == '.' ? G_GAP_NOT_DOT_MESSAGE : G_EMPTY_STRING);
						snprintf(gLogMessage, MSG_BUFFER_LEN, "ERROR: bad symbol: %c at site %5ld of species %3ld\n%s\n", charstate, j + 1, i, err_tail);
						return 0L;
					}
				}
				j++;
				matRow[j - 1] = charstate;
			}
			/* the first taxon fixes the width of this block; the others must match it */
			if (i == 1)
				basesnew = j;
			else if (j != basesnew) {
				snprintf(gLogMessage, MSG_BUFFER_LEN, "\nERROR: sequences out of alignment at position %ld of species %ld\n\n", j + 1, i);
				return 0L;
			}
			scan_eoln(instream);
		}
		basesread = basesnew;
		if (basesread == n_char) {
			return mat;
		}
	}
}

/* Reads a non-interleaved (sequential) matrix whose cells must all come from
	`symbols`.

	For each taxon the name is read, then cells are collected (possibly over
	several lines) until n_char have been stored.  Whitespace and digits are
	skipped and letters are upper-cased.  A character equal to `other_gap`
	is stored as '-'; any other character outside `symbols` is an error, and
	`cell_name` is the word used for the cell in that error message.

	If this returns NULL, then the read failed and gLogMessage will contain
	an error message.
*/
MatrixPtr read_seq_non_interleaved(FILE * instream, TaxAndMatrix_t * taxAndMat, const char *symbols, const char * cell_name, const char other_gap)
{
	long			i, j;
	Char			charstate;
	const char * err_tail;

	const long n_taxa = taxAndMat->n_taxa;
	const long n_char = taxAndMat->n_char;
	MatrixPtr mat = taxAndMat->matrix;
	MatrixRow matRow = 0L;

	eat_separator(instream);
	
	for (i = 1; i <= n_taxa; ++i) {
		matRow = mat[i - 1];
		read_name(instream, taxAndMat->name_array, i - 1);
		j = 0;
		while (!eoff(instream)) {
			while (j < n_char && !(eoln(instream) || eoff(instream))) {
				charstate = gettc(instream);
				/* <ctype.h> functions require an unsigned char value */
				if (isspace((unsigned char) charstate) || (charstate >= '0' && charstate <= '9'))
					continue;
				if (islower((unsigned char) charstate))
					charstate = (Char) toupper((unsigned char) charstate);
				if (strchr(symbols, charstate) == NULL) {
					if (charstate == other_gap)
						charstate = '-';
					else {
						err_tail = (charstate == '.' ? G_GAP_NOT_DOT_MESSAGE : G_EMPTY_STRING);
						snprintf(gLogMessage, MSG_BUFFER_LEN, "ERROR: bad %s: %c at site %5ld of species %3ld\n%s\n", cell_name, charstate, j + 1, i, err_tail);
						return 0L;
					}
				}
				j++;
				matRow[j - 1] = charstate;
			}
			if (j < n_char)
				scan_eoln(instream); /* sequence continues on the next line */
			else
				break;
		}
		if (j != n_char) {
			snprintf(gLogMessage, MSG_BUFFER_LEN, "\nERROR: sequences out of alignment at position %ld of species %ld\n\n", j + 1, i);
			return 0L;
		}
		scan_eoln(instream);
	}
	/* NOTE(review): the interleaved variant does not touch datatype; the
	   callers visible in this file overwrite this value after a successful read. */
	taxAndMat->datatype = kDNADatatype;
	return mat;
}

/* Reads an interleaved sequence matrix from instream into taxAndMat->matrix.
 *
 * The matrix is read in passes.  In each pass one line per taxon is consumed
 * and its cells are appended to that taxon's row; read_name() is called for
 * each taxon during the first pass only.  Every taxon must contribute the same
 * number of cells within a pass.  White space and the digits 0-9 are skipped
 * and lower-case letters are converted to upper case.  A cell that does not
 * occur in symbols is an error unless it equals other_gap, in which case it is
 * stored as '-'.  cell_name is only used to word the "bad ..." error message.
 *
 * Returns taxAndMat->matrix once taxAndMat->n_char cells have been read for
 * every taxon.  If this returns NULL, then the read failed and gLogMessage
 * will contain an error message.
 */
MatrixPtr read_seq_interleaved(FILE * instream, TaxAndMatrix_t * taxAndMat, const char *symbols, const char * cell_name, char other_gap)
{
	long			i, j, basesread, basesnew = 0;
	Char			charstate;
	const char * err_tail;

	const long n_taxa = taxAndMat->n_taxa;
	const long n_char = taxAndMat->n_char;
	MatrixPtr mat = taxAndMat->matrix;
	MatrixRow matRow = 0L;

	basesread = 0;
	for (;;) {
		eat_separator(instream);

		for (i = 1; i <= n_taxa; ++i) {
			matRow = mat[i - 1];
			if (basesread == 0)
				read_name(instream, taxAndMat->name_array, i - 1);
			j = basesread;
			while (j < n_char && !(eoln(instream) || eoff(instream))) {
				charstate = gettc(instream);
				/* strchr() matches the terminating NUL of its first argument,
				   so a NUL byte in the input would pass the membership test
				   below and truncate the row; reject it explicitly. */
				if (charstate == '\0') {
					sprintf(gLogMessage, "ERROR: bad %s: NUL byte at site %5ld of species %3ld\n", cell_name, j + 1, i);
					return 0L;
				}
				/* the <ctype.h> functions are only defined for unsigned char values */
				if (isspace((unsigned char)charstate) || (charstate >= '0' && charstate <= '9'))
					continue;
				if (islower((unsigned char)charstate))
					charstate = (Char)toupper((unsigned char)charstate);
				if (strchr(symbols, charstate) == NULL) {
					if (charstate == other_gap)
						charstate = '-';
					else {
						err_tail = (charstate == '.' ? G_GAP_NOT_DOT_MESSAGE : G_EMPTY_STRING);
						sprintf(gLogMessage, "ERROR: bad %s: %c at site %5ld of species %3ld\n%s\n", cell_name, charstate, j + 1, i, err_tail);
						return 0L;
					}
				}
				j++;
				matRow[j - 1] = charstate;
			}
			/* the first taxon fixes how many sites this pass must reach */
			if (i == 1)
				basesnew = j;
			else if (j != basesnew) {
				sprintf(gLogMessage, "\nERROR: sequences out of alignment at position %ld of species %ld\n\n", j + 1, i);
				return 0L;
			}
			scan_eoln(instream);
		}
		basesread = basesnew;
		if (basesread == n_char) {
			return mat;
		}
		/* At end of file no later pass can add sites, so without this check
		   a truncated file would keep this loop spinning forever. */
		if (eoff(instream)) {
			sprintf(gLogMessage, "\nERROR: unexpected end of file after reading %ld of %ld sites\n\n", basesread, n_char);
			return 0L;
		}
	}
}

/* Reads a matrix of discrete characters from instream into taxAndMat->matrix,
 * either interleaved or not, as selected by the interleaved argument.
 *
 * White space is skipped; every other cell must occur in G_ACCEPTED_SYMBOLS
 * and is stored unchanged (no case conversion).  On success the datatype of
 * taxAndMat is set to kDiscreteDatatype.
 *
 * Returns the matrix ptr of the TaxAndMatrix_t.  If this returns NULL, then
 * the read failed and gLogMessage will contain an error message.
 *
 * Fills in gSymbolsSeen as a side effect: it is cleared first and then
 * receives each distinct symbol in order of first appearance.
 */
MatrixPtr read_disc_data(FILE * instream, TaxAndMatrix_t * taxAndMat, boolean interleaved)
{
	long			i, j, basesread, basesnew = 0;
	Char			charstate;
	const char * symbols = G_ACCEPTED_SYMBOLS;
	const char * cell_name = "symbol";
	const char * err_tail;
	boolean			allread, done;
	const long n_taxa = taxAndMat->n_taxa;
	const long n_char = taxAndMat->n_char;
	MatrixPtr mat = taxAndMat->matrix;
	int nSymbolsSeen = 0;
	for (i = 0; i < N_SYMBOLS; ++i)
		gSymbolsSeen[i] = '\0';

	basesread = 0;
	allread = 0;
	while (!(allread)) {
		eat_separator(instream);

		i = 1;
		while (i <= n_taxa) {
			/* interleaved files carry the names in the first pass only */
			if (basesread == 0 || !interleaved)
				read_name(instream, taxAndMat->name_array, i - 1);
			j = (interleaved) ? basesread : 0;
			done = 0;
			while (!done && !eoff(instream)) {
				/* interleaved: exactly one line per taxon per pass */
				if (interleaved)
					done = 1;
				while (j < n_char && !(eoln(instream) || eoff(instream))) {
					charstate = gettc(instream);
					/* strchr() matches the terminating NUL of its first
					   argument, so a NUL byte in the input would pass both
					   membership tests below; reject it explicitly. */
					if (charstate == '\0') {
						sprintf(gLogMessage, "ERROR: bad %s: NUL byte at site %5ld of species %3ld\n", cell_name, j + 1, i);
						return 0L;
					}
					/* the <ctype.h> functions are only defined for unsigned char values */
					if (isspace((unsigned char)charstate))
						continue;
					if (strchr(gSymbolsSeen, charstate) == NULL) {
						if (strchr(symbols, charstate) == NULL) {
							err_tail = (charstate == '.' ? G_GAP_NOT_DOT_MESSAGE : G_EMPTY_STRING);
							sprintf(gLogMessage, "ERROR: bad %s: %c at site %5ld of species %3ld\n%s\n", cell_name, charstate, j + 1, i, err_tail);
							return 0L;
						}
						/* first time this symbol is seen: remember it */
						gSymbolsSeen[nSymbolsSeen++] = charstate;
					}
					j++;
					mat[i - 1][j - 1] = charstate;
				}
				if (interleaved)
					continue;
				/* non-interleaved: a row may continue on the next line */
				if (j < n_char)
					scan_eoln(instream);
				else if (j == n_char)
					done = 1;
			}
			if (interleaved && i == 1)
				basesnew = j;

			scan_eoln(instream);

			if ((interleaved && j != basesnew) || (!interleaved && j != n_char)) {
				sprintf(gLogMessage, "\nERROR: sequences out of alignment at position %ld of species %ld\n\n", j + 1, i);
				return 0L;
			}
			i++;
		}

		if (interleaved) {
			basesread = basesnew;
			allread = (basesread == n_char);
			/* At end of file no later pass can add sites, so without this
			   check a truncated file would keep this loop spinning forever. */
			if (!allread && eoff(instream)) {
				sprintf(gLogMessage, "\nERROR: unexpected end of file after reading %ld of %ld sites\n\n", basesread, n_char);
				return 0L;
			}
		} else
			allread = (i > n_taxa);
	}
	taxAndMat->datatype = kDiscreteDatatype;
	return mat;
}


/* From phylip.c: input the numbers of species and of characters.
 *
 * Two integers are read from instream into *spp and *chars.  When either one
 * cannot be read, or is not positive, error_exit() is called with
 * kNoNumbersInfile.
 */
void inputnumbers(FILE *instream, long *spp, long *chars)
{
	const int n_read = fscanf(instream, "%ld%ld", spp, chars);

	if (n_read == 2 && *spp > 0 && *chars > 0)
		return;

	error_exit("ERROR: Unable to read the number of species or characters in data set\n"\
			  "The input file is incorrect (perhaps it was not saved text only).\n", kNoNumbersInfile);
}  /* inputnumbers */


/* Wrapper for calloc that returns x zero-filled bytes.
 *
 * A request that is not positive, or that exceeds 1000000000 bytes, is
 * reported through error_msg() and ends the program with kBadMemRequest.
 * When the allocation itself fails, error_exit() is called with
 * kCouldNotAllocMem.
 */
void *my_malloc(long x)
{
	void *block = (void *)0L;

	if (x > 0 && x <= 1000000000) {
		block = (void *) calloc(1,x);
		if (!block)
			error_exit("Error allocating memory\n", kCouldNotAllocMem);
		return block;
	}

	/* the requested size is out of range: explain and give up */
	error_msg("ERROR: a function asked for an inappropriate amount of memory:");
	sprintf(gLogMessage, "	 %ld bytes\n", x);
	error_msg(gLogMessage);
	error_msg(	"		  This can mean one of two things:\n"\
				"		  1.  The input file is incorrect"\
				" (perhaps it was not saved as Text Only),\n"\
				"		  2.  There is a bug in the program.\n"\
				"		  Please check your input file carefully.\n"\
				"		  If it seems to be a bug, please mail joe@gs.washington.edu\n"\
				"		  with the name of the program, your computer system type,\n"\
				"		  a full description of the problem, and with the input data file.\n"\
				"		  (which should be in the body of the message, not as an Attachment).\n");
	exit(kBadMemRequest);
} /* my_malloc */

void open_file(
  FILE **fp,
  const char *filename,
  const char *filedesc, 
  const char *mode)
{ /* open a file, testing whether it exists etc. */
	char filemode[3];
	
	strcpy(filemode,mode);
	strcat(filemode,"b");

	*fp = fopen(filename,filemode);
	if (!*fp) {
		switch (filemode[0]){
			case 'r':
				sprintf(gLogMessage, "Can't find %s \"%s\"\n", filedesc, filename);
				break;
			case 'w':
			case 'a':
				sprintf(gLogMessage, "Can't write %s \"%s\"\n", filedesc, filename);
				break;
			default:
				sprintf(gLogMessage, "There is some error in the call of open_file. Unknown mode.\n");
		}
		error_exit(gLogMessage, kCouldNotReadFile);
	}
} /* open_file */

/* Writes tax_and_mat as a NEXUS file through output_msg(): the "#NEXUS"
 * header with a comment naming the tool, a Taxa block and a Characters block.
 *
 * The taxon names held by tax_and_mat are rewritten in place by
 * convert_names_to_NEXUS() before they are printed.  The Format command is
 * chosen from tax_and_mat->datatype; a datatype that matches none of the
 * nucleotide/DNA/RNA/protein codes tested below is written as Standard with
 * gSymbolsSeen as its symbols list.  infile_name is currently unused.
 */
void print_NEXUS(const TaxAndMatrix_t * tax_and_mat, const char * infile_name)
{
	unsigned long i;	/* same type as n_taxa, so the comparison is not mixed-sign */
	const char * fmt_code = 0L;	/* stays NULL for the Standard datatype */
	const char * equate = "";

	sprintf(gOutputMessage, "#NEXUS\n"\
	"\n[Converted from PHYLIP to NEXUS by the CIPRES %s tool version %s invoked with options = %s]\n", gAppName, gVersionStr, gFlags);
	output_msg(gOutputMessage);

	convert_names_to_NEXUS(tax_and_mat->n_taxa, tax_and_mat->name_array, tax_and_mat->max_name_len - 1);

	/* n_taxa and n_char are unsigned long, hence %lu rather than %ld */
	sprintf(gOutputMessage, "\nBEGIN Taxa;\n"\
	"    Dimensions NTax = %lu ;\n"\
	"    TaxLabels\n", tax_and_mat->n_taxa);
	output_msg(gOutputMessage);

	for (i = 0; i < tax_and_mat->n_taxa; ++i) {
		sprintf(gOutputMessage, "        %s\n", tax_and_mat->name_array[i]);
		output_msg(gOutputMessage);
	}

	sprintf(gOutputMessage, "    ;\nEND;\n\nBEGIN Characters;\n    Dimensions NChar = %lu ;\n", tax_and_mat->n_char);
	output_msg(gOutputMessage);

	if (tax_and_mat->datatype == kNucDatatype)
		fmt_code = "nucleotide";
	else if (tax_and_mat->datatype == kDNADatatype || tax_and_mat->datatype == (kAADatatype | kDNADatatype))
		fmt_code = "DNA";
	else if (tax_and_mat->datatype == kRNADatatype)
		fmt_code = "RNA";
	else if (tax_and_mat->datatype == kAADatatype) {
		fmt_code = "Protein";
		/* this is correct:
		equate="equate=\"X={ACDEFGHIKLMNPQRSTVWY*}\"";
		what follows is a workaround for an NCL bug:
		*/

		equate="equate=\"X={ACDEFGHIKLMNPQRSTVWY}\"";
	}

	if (fmt_code == 0L)
		sprintf(gOutputMessage, "    Format Datatype = Standard Symbols=\"%s\" missing = ? ;\n    Matrix\n", gSymbolsSeen);
	else
		sprintf(gOutputMessage, "    Format Datatype = %s missing = ? gap = - %s ;\n    Matrix\n", fmt_code, equate);
	output_msg(gOutputMessage);

	for (i = 0; i < tax_and_mat->n_taxa; ++i) {
		sprintf(gOutputMessage, "\n%-20s ", tax_and_mat->name_array[i]);
		output_msg(gOutputMessage);
		/* each row is expected to be a NUL-terminated string of n_char cells */
		assert(strlen(tax_and_mat->matrix[i]) == tax_and_mat->n_char);
		output_msg(tax_and_mat->matrix[i]);
	}
	output_msg("\n;\nEND;\n");
}

/* Rewrites each of the n_taxa names in name_array, in place, as a NEXUS token.
 *
 * Trailing blanks are dropped.  A name is wrapped in single quotes (with any
 * embedded single quote doubled) when it is empty or contains anything other
 * than letters, digits and '.'.  Names are compared without regard to case:
 *	- a name made of digits only gets a '#' appended;
 *	- if duplicate labels are found #1, #2,.... #n are appended.
 * Every remapping is reported with warn_msg() and as a bracketed comment via
 * output_msg().  When a remapped name collides with a name that was already
 * seen, the program ends through error_exit() with kRemappingError.
 *
 * Assumes that names can be expanded up to length max_name_len when adding
 * NEXUS tokenizing characters.
 * NOTE(review): the name buffers are allocated outside this function; the
 * length checks below do not count the "#", "#2" and "#n" suffixes when the
 * final quoting check is made -- confirm that the buffers leave enough room.
 */
void convert_names_to_NEXUS(const unsigned int n_taxa, char ** name_array, const unsigned max_name_len)
{
	const unsigned int len_with_term = max_name_len + 1;
	char * curr_name_alias;
	char * scratch = (char *)my_malloc((long)(len_with_term*sizeof(char)));       /* the name as it will be written */
	char * to_hash = (char *)my_malloc((long)(len_with_term*sizeof(char)));       /* upper-cased key used to spot duplicates */
	char * remap_scratch = (char *)my_malloc((long)(len_with_term*sizeof(char))); /* candidate remapped keys */
	char * remap_tail = (char *)my_malloc((long)(len_with_term*sizeof(char)));    /* "#n" suffix */
	char * fir_occ_name;
	long fir_occ_len;
	unsigned int i, j, k, needs_quotes, write_pos, add_now, spaces_skipped;
	int all_digit, prev_pound; /* will be 1 if the string is \d* or \d*# */
	tax_name_t * name_entry;
	char raw_char, conv_char;
	const int n_buckets =  (n_taxa > 1024 ? n_taxa : 1024);
	str_hash_table_t * hash_table = alloc_hash(n_buckets, n_taxa);

	for (i = 0; i < n_taxa; ++i) {
		curr_name_alias = name_array[i];
		assert(curr_name_alias != 0L);
		needs_quotes = 0;
		write_pos = 0;
		spaces_skipped = 0;
		all_digit = 1;
		prev_pound = 0;
		for (j = 0;; ++j) {
			raw_char = (char) curr_name_alias[j];
			if (raw_char == '\0') {
				/* Trailing blanks are never copied into to_hash, so the key
				   ends right after the last character that was copied.
				   Terminating at j instead would leave bytes of the
				   previous name inside the key. */
				to_hash[j - spaces_skipped] = '\0';
				scratch[write_pos] = '\0';
				break;
			}

			assert(j < len_with_term);
			if (j >= len_with_term)
				error_exit("Internal representation of the name is too long (this is a bug).", kAssertionError);

			add_now = 1;
			/* the <ctype.h> functions are only defined for unsigned char values */
			if (isalpha((unsigned char)raw_char) || raw_char == '.'){
				conv_char = (char)(islower((unsigned char)raw_char) ? toupper((unsigned char)raw_char) : raw_char);
				prev_pound = 0;
				all_digit = 0;
			}
			else if (isdigit((unsigned char)raw_char)){
				conv_char = raw_char;
				if (prev_pound)
					all_digit = 0;
				prev_pound = 0;
			}
			else {
				conv_char = raw_char;
				if (raw_char == ' ') {
					spaces_skipped += 1;
					add_now = 0;
					prev_pound = 0;
				}
				else {
					needs_quotes = 1;
					if (raw_char == '#' && prev_pound == 0)
						prev_pound = 1;
					else {
						prev_pound = 0;
						all_digit = 0;
					}
				}
			}
			if (add_now != 0) {
				/*We deal with spaces this way so that we strip trailing whitespace
					(by not adding it unless there is another non-space character)
					later in the string.
				*/
				if (spaces_skipped > 0) {
					all_digit = 0;
					needs_quotes = 1;
					assert(write_pos + spaces_skipped  < len_with_term);
					if (write_pos + spaces_skipped   >= len_with_term)
						error_exit("Internal representation of the name is too long (this is a bug).", kAssertionError);

					for (k = 0; k < spaces_skipped; ++k) {
						scratch[write_pos + k] = ' ';
						to_hash[j - spaces_skipped + k] = ' ';
					}
					write_pos += spaces_skipped;
					spaces_skipped = 0;
				}
				scratch[write_pos++] = raw_char;

				/* a single quote inside a quoted NEXUS token is written twice */
				if (raw_char == '\'') {
					assert(write_pos < len_with_term);
					if (write_pos >= len_with_term)
						error_exit("Internal representation of the name is too long (this is a bug).", kAssertionError);
					scratch[write_pos++] = raw_char;
				}
				to_hash[j] = conv_char;
			}
		}
		/* an empty name is written as '' */
		if (write_pos == 0) {
			all_digit = 0;
			needs_quotes = 1;
		}
		name_entry = insert_str(hash_table, to_hash);
		assert(name_entry != 0L);


		/* Check for things that don't fly in NEXUS:
			repeated names
			all digit names

		   Try to fix these problems by appending # to avoid all-digit names,
		   and #1, #2, #3,... #n to disambiguate repeated names

		   We need to make sure that the modification does not cause a clash.

		   If it does we bail out.
		*/
		if (name_entry->n_occurrences == 1) {
			name_entry->first_occ_index = (int) i;

			/*Check for all digit taxon names and append a # */
			if (all_digit == 1) {
				strcpy(remap_scratch, to_hash);
				if (prev_pound){
					/* digits followed by '#': make sure the bare digits were not seen before */
					remap_scratch[strlen(remap_scratch) - 1] = '\0';
					if (find_str(hash_table, remap_scratch) != 0L)  {
						sprintf(gLogMessage, "All digit name %s was remapped to %s#, but %s# has been encountered. Mapping to avoid all digit names failed.\n", remap_scratch, remap_scratch, remap_scratch);
						error_exit(gLogMessage, kRemappingError);
					}
				}
				else {
					/* the '#' appended below must fit in scratch and remap_scratch */
					assert(write_pos + 1 < len_with_term);
					if (write_pos + 1 >= len_with_term)
						error_exit("Internal representation of the name is too long (this is a bug).", kAssertionError);
					strcat(remap_scratch, "#");
					if (find_str(hash_table, remap_scratch) != 0L)  {
						sprintf(gLogMessage, "All digit name %s was remapped to %s, but %s name was already encountered. Mapping to avoid all digit names failed.\n", to_hash, remap_scratch, remap_scratch);
						error_exit(gLogMessage, kRemappingError);
					}
					needs_quotes = 1;
					strcat(scratch, "#");
					sprintf(gLogMessage, "All digit names found.  Mapped %s to %s#\n", curr_name_alias, curr_name_alias);
					warn_msg(gLogMessage);
					sprintf(gOutputMessage, " [ %s ] ", gLogMessage);
					output_msg(gOutputMessage);
				}
			}
		}
		else if (name_entry->n_occurrences == 2) {
			/* first repeat: this name becomes ...#2 and the first occurrence ...#1 */
			needs_quotes = 1;
			assert(write_pos + 2 < len_with_term);
			if (write_pos + 2 >= len_with_term)
				error_exit("Internal representation of the name is too long (this is a bug).", kAssertionError);
			strcat(scratch, "#2");

			strcpy(remap_scratch, to_hash);
			strcat(remap_scratch, "#2");
			if (find_str(hash_table, remap_scratch) != 0L) {
				sprintf(gLogMessage, "Duplicate names found and cannot remap %s, because %s already exists.\n", curr_name_alias, remap_scratch);
				error_exit(gLogMessage, kRemappingError);
			}
			fir_occ_name = name_array[name_entry->first_occ_index];

			sprintf(gLogMessage, "Duplicate names found.  Mapped %s to %s#1 and %s#2\n", curr_name_alias, fir_occ_name, curr_name_alias);
			warn_msg(gLogMessage);
			sprintf(gOutputMessage, " [ %s ] ", gLogMessage);
			output_msg(gOutputMessage);


			strcpy(remap_scratch, to_hash);
			strcat(remap_scratch, "#1");
			if (find_str(hash_table, remap_scratch) != 0L) {
				sprintf(gLogMessage, "Duplicate names found and cannot remap %s, because %s already exists.\n", curr_name_alias, remap_scratch);
				error_exit(gLogMessage, kRemappingError);
			}

			fir_occ_len = strlen(fir_occ_name);
			assert(fir_occ_len + 4 < len_with_term);
			if (fir_occ_len + 4 >= len_with_term)
				error_exit("Internal representation of the name is too long (this is a bug).", kAssertionError);
			if (fir_occ_name[0] == '\'')
				/* already quoted: overwrite the closing quote with #1' */
				strcpy(fir_occ_name + fir_occ_len - 1, "#1\'");
			else {
				/* Work from a copy because source and destination of sprintf
				   must not overlap.  The copy keeps the first occurrence as
				   it was written (the hash key is upper-cased), which is
				   what the message logged above announces. */
				strcpy(remap_scratch, fir_occ_name);
				sprintf(fir_occ_name, "\'%s#1\'", remap_scratch);
			}
		}
		else {
			/* third or later occurrence: append #n */
			needs_quotes = 1;
			sprintf(remap_tail,"#%d", name_entry->n_occurrences);
			if (write_pos + strlen(remap_tail) >= len_with_term) {
				sprintf(gLogMessage, "Duplicate names found and cannot remap %s to %s%s. This program was not written in anticipation of this large number of name clashes\n", curr_name_alias, scratch, remap_tail);
				error_exit(gLogMessage, kRemappingError);
			}
			strcat(scratch, remap_tail);

			strcpy(remap_scratch, to_hash);
			strcat(remap_scratch, remap_tail);
			if (find_str(hash_table, remap_scratch) != 0L) {
				/* remap_scratch already ends in remap_tail, so the message is
				   built from the key and the tail to show the suffix once */
				sprintf(gLogMessage, "Duplicate names found and cannot remap %s, because %s%s already exists.\n", curr_name_alias, to_hash, remap_tail);
				error_exit(gLogMessage, kRemappingError);
			}
			sprintf(gLogMessage, "Duplicate names found.  Mapped %s to %s\n", curr_name_alias, scratch);
			warn_msg(gLogMessage);
			sprintf(gOutputMessage, " [ %s] ", gLogMessage);
			output_msg(gOutputMessage);
		}


		/* Now we are ready to quote those names that need them */
		if (needs_quotes == 0)
			sprintf(curr_name_alias, "%s", scratch);
		else {
			assert(write_pos + 3 < len_with_term);
			if (write_pos + 3 >= len_with_term)
				error_exit("Internal representation of the name is too long (this is a bug).", kAssertionError);
			sprintf(curr_name_alias,"\'%s\'", scratch);
		}


	}
	free(remap_tail);
	free(remap_scratch);
	free(scratch);
	free(to_hash);
	free_hash(hash_table);
}


/* Eats everything up to the end of the line or the end of the file; when
 * the end of the file has not been reached, one more character is consumed
 * with gettc().
 */
void scan_eoln(FILE *f) 
{
	while (!(eoff(f) || eoln(f)))
		(void) gettc(f);

	if (eoff(f))
		return;

	(void) gettc(f);
}


/* Stores the current position of f in *fPos; when fgetpos() reports a
 * failure, error_exit() is called with kFilePosError.
 */
void getPosOrDie(FILE * f, fpos_t *fPos) 
{
	const int failed = fgetpos(f, fPos);

	if (failed) {
		error_exit("Error checking file position\n", kFilePosError);
	}
}
/* Moves f to the position held in *fPos; when fsetpos() reports a failure,
 * error_exit() is called with kFilePosError.
 */
void setPosOrDie(FILE * f, const fpos_t *fPos) 
{
	const int failed = fsetpos(f, fPos);

	if (failed) {
		error_exit("Error setting file position\n", kFilePosError);
	}
}


/* Opens infile_name, reads the numbers of species and characters, and then
 * tries the readers permitted by the arguments until one of them succeeds.
 *
 * expectedDatatype is a combination of DatatypesEnum bits and
 * interleaveStatus a combination of InterleaveEnum bits.  The order of the
 * attempts is:
 *	- read_any_seq_data() when every bit of kSequenceDatatype is requested,
 *	  otherwise read_dna_data(), read_rna_data() and read_aa_data() for the
 *	  bits that are set;
 *	- read_disc_data() when kDiscreteDatatype is set.
 * Each reader is tried interleaved first and non-interleaved second, and the
 * stream is moved back to the start of the matrix after a failed attempt.
 *
 * Returns the TaxAndMatrix_t that was allocated for the file.  Its matrix
 * field is NULL when no reader accepted the input.  The input file is closed
 * before returning.
 */
TaxAndMatrix_t * read_data(const char * infile_name, const int expectedDatatype, const int interleaveStatus)
{
	fpos_t fPos;
	FILE * infilePtr;
	TaxAndMatrix_t * taxAndMat;
	long n_taxa;
	long n_char;

	open_file(&infilePtr, infile_name, "input file", "r");

	/* begin doinit from pars.c   -- initializes variables */
	inputnumbers(infilePtr, &n_taxa, &n_char);

	sprintf(gLogMessage, "%2ld species, %3ld  sites\n\n", n_taxa, n_char);
	debug_msg(gLogMessage);	


	taxAndMat = alloc_TaxAndMat(n_taxa, n_char);
	/* end doinit from pars.c   -- initializes variables */

	/* remember where the matrix starts so that every attempt begins there */
	getPosOrDie(infilePtr, &fPos);
	if (kSequenceDatatype == (kSequenceDatatype & expectedDatatype)){
		if (interleaveStatus & kIsInterleaved) {
			if (read_any_seq_data(infilePtr, taxAndMat, 1))
				goto done;
			setPosOrDie(infilePtr, &fPos);
		}
		if (interleaveStatus & kIsNotInterleaved) {
			if (read_any_seq_data(infilePtr, taxAndMat, 0))
				goto done;
			setPosOrDie(infilePtr, &fPos);
		}
	}
	else {
		if (expectedDatatype & kDNADatatype) {
			if (interleaveStatus & kIsInterleaved) {
				if (read_dna_data(infilePtr, taxAndMat, 1))
					goto done;
				setPosOrDie(infilePtr, &fPos);
			}
			if (interleaveStatus & kIsNotInterleaved) {
				if (read_dna_data(infilePtr, taxAndMat, 0))
					goto done;
				setPosOrDie(infilePtr, &fPos);
			}
		}
		if (expectedDatatype & kRNADatatype) {
			if (interleaveStatus & kIsInterleaved) {
				if (read_rna_data(infilePtr, taxAndMat, 1))
					goto done;
				setPosOrDie(infilePtr, &fPos);
			}
			if (interleaveStatus & kIsNotInterleaved) {
				if (read_rna_data(infilePtr, taxAndMat, 0))
					goto done;
				setPosOrDie(infilePtr, &fPos);
			}
		}
		if (expectedDatatype & kAADatatype) {
			if (interleaveStatus & kIsInterleaved) {
				if (read_aa_data(infilePtr, taxAndMat, 1))
					goto done;
				setPosOrDie(infilePtr, &fPos);
			}
			if (interleaveStatus & kIsNotInterleaved) {
				if (read_aa_data(infilePtr, taxAndMat, 0))
					goto done;
				setPosOrDie(infilePtr, &fPos);
			}
		}
	}
	if (expectedDatatype & kDiscreteDatatype) {
		if (interleaveStatus & kIsInterleaved) {
			if (read_disc_data(infilePtr, taxAndMat, 1))
				goto done;
			setPosOrDie(infilePtr, &fPos);
		}
		if (interleaveStatus & kIsNotInterleaved) {
			if (read_disc_data(infilePtr, taxAndMat, 0))
				goto done;
			setPosOrDie(infilePtr, &fPos);
		}
	}
	taxAndMat->matrix = 0L; /*MEMORY LEAK matrix, should be freed but we are signalling that the input was not read*/

done:
	/* the stream is not needed once the data is in memory (or has been rejected) */
	fclose(infilePtr);
	return taxAndMat;
}

/* Entry point: phylip_to_nexus [OPTIONS] INFILE [OUTFILE]
 *
 * Parses the command line, reads the PHYLIP file with read_data() and writes
 * it as NEXUS with print_NEXUS().  The input file name defaults to "infile".
 * When OUTFILE is given it is opened for writing and stored in gOutfilePtr.
 * When neither the i nor the n flag is given, both layouts are tried.
 *
 * Returns 0 on success and after printing the help text; every error ends
 * the program through error_exit().
 */
int main(int argc, char * argv[])
{
	const char * usage = "   phylip_to_nexus [OPTIONS] INFILE [OUTFILE]\n"\
						 "The OPTIONS string starts with - followed by any of the following flags:\n"\
						 "    i = support interleaved input\n"\
						 "    n = support non-interleaved input\n"\
						 "    a = support amino acid sequences\n"\
						 "    d = support dna sequences\n"\
						 "    r = support rna sequences\n"\
						 "    g = support generic discrete output\n"\
						 "    v = verbose output\n"\
						 "    q = quiet mode (only output errors)\n"\
						 "Default behavior is to act as if all flags were requested (diagnose input type and interleaving).\n";

	TaxAndMatrix_t * taxAndMat;
	const char * fn = 0L;
	const char * outfn = 0L;
	size_t i;
	size_t n_flags;
	char c;
	int interleaveBits = 0;
	int datatypeBits = 0;
	install_sig_handlers();

	/* sort the arguments into flags, input file name and output file name */
	if (argc > 4) {
		sprintf(gLogMessage, "Expecting at most 3 command line arguments.\nUsage:\n%s", usage);
		error_exit(gLogMessage, kBadCommandLineArgs);
	}
	else if (argc == 4) {
		gFlags = argv[1];
		fn = argv[2];
		outfn = argv[3];
	}
	else if (argc == 3) {
		if (strlen(argv[1]) > 0 && argv[1][0] == '-') {
			gFlags = argv[1];
			fn = argv[2];
		}
		else {
			fn = argv[1];
			outfn = argv[2];
		}
	}
	else if (argc == 2) {
		if (strlen(argv[1]) > 0 && argv[1][0] == '-') {
			gFlags = argv[1];
			fn = "infile";
		}
		else {
			fn = argv[1];
		}
	}
	else
		fn = "infile";


	n_flags = strlen(gFlags);
	if (n_flags == 0 || gFlags[0] != '-') {
		sprintf(gLogMessage, "Expecting flags to start with -.\nUsage:\n%s", usage);
		error_exit(gLogMessage, kBadCommandLineArgs);
	}
	for (i = 1; i < n_flags; ++i) {
		c = gFlags[i];
		/* the <ctype.h> functions are only defined for unsigned char values */
		if (islower((unsigned char)c))
			c = (char)toupper((unsigned char)c);
		if (c == 'I')
			interleaveBits |= kIsInterleaved;
		else if (c == 'N')
			interleaveBits |= kIsNotInterleaved;
		else if (c == 'D')
			datatypeBits |= kDNADatatype;
		else if (c == 'R')
			datatypeBits |= kRNADatatype;
		else if (c == 'A')
			datatypeBits |= kAADatatype;
		else if (c == 'G')
			datatypeBits |= kDiscreteDatatype;
		else if (c == 'V')
			gCurrentLogThreshold = kDebugLevel;
		else if (c == 'Q')
			gCurrentLogThreshold = kErrorLevel;
		else if (c == 'H') {
			sprintf(gLogMessage, "%s  version %s.\n"\
			                     "The CIPRES project\'s tool for converting PHYLIP files to NEXUS.\n\n"\
			                     "Based on the code base of PHYLIP 3.6.5 which is copyrighted by:\n"\
			                     "    Joseph Felsenstein, Akiko Fuseki, Sean Lamont, Andrew Keeffe, Mike Palczewski, Doug Buxton and Dan Fineman.\n"\
			                     "Tweaks for CIPRES by Mark Holder.\n\n"\
			                     "Usage:\n%s", gAppName, gVersionStr, usage);
			output_msg(gLogMessage);
			return 0;
		}	
		else {
			sprintf(gLogMessage, "Unknown flag \"%c\"\n", gFlags[i]);
			error_exit(gLogMessage, kBadCommandLineArgs);
		}
	}
	if (datatypeBits == 0) {
		sprintf(gLogMessage, "At least one of the datatype options (d, r, a, or g) must be supplied.\nUsage:\n%s", usage);
		error_exit(gLogMessage, kBadCommandLineArgs);
	}
	/* With neither i nor n given, read_data() would try no reader at all and
	   the run could only fail; diagnose the layout by trying both instead. */
	if (interleaveBits == 0)
		interleaveBits = kIsInterleaved | kIsNotInterleaved;
	if (outfn != 0L)
	    open_file(&gOutfilePtr, outfn, "output file", "w");

	taxAndMat = read_data(fn, datatypeBits, interleaveBits);
	if (taxAndMat == 0L || taxAndMat->matrix == 0L) {
		free_TaxAndMat(taxAndMat);
		/* gLogMessage still holds the message of the last reader that failed */
		error_msg(gLogMessage);
		sprintf(gLogMessage, "Could not parse %s.\n", fn);
		error_exit(gLogMessage, kCouldNotReadFile);
	}
	else {
		print_NEXUS(taxAndMat, fn);
		free_TaxAndMat(taxAndMat);
	}
	return 0;
}