/* This file implements a phylogenetic tree structure. Copyright (C) 2004 The University of Texas at Austin This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. Email: usman@cs.njit.edu Mail: Usman Roshan, Department of Computer Sciences, NJIT-CCS, Computer Science Department, GITC 4400, University Heights, Newark, NJ 07102 */ #include #include #include #include #include #include #include #include #include #include static void copy_subtree(phylogenetic_tree T, phylogenetic_tree base_tree, node* sub2tree); static void copy_subtree_rec(node u, node v, edge e, phylogenetic_tree T, phylogenetic_tree base_tree, node* sub2tree); static void print_tree_rec(phylogenetic_tree T, char** num_to_name, const node root, const edge e, char* tree_string, short do_wgts); static int read_tree_rec(phylogenetic_tree T, char** num_to_name, char** newick_tree, node root, hashtable name_to_num); static int read_leaf_name(phylogenetic_tree T, char** num_to_name, char** newick_tree, node n, hashtable name_to_num); static int restrict_tree_rec(phylogenetic_tree T, int_set taxa, list edge_list, list node_list, edge e, node v); static int set_subtree_sizes_rec(phylogenetic_tree T, node v, edge e); /****************/ /*New and delete*/ /****************/ phylogenetic_tree new_phylogenetic_tree(){ phylogenetic_tree T; T = (phylogenetic_tree) malloc(sizeof(struct phylogenetic_tree_struct)); T->G = new_graph(); T->num_leaves = 0; /*for(i = 0; i < MAX_LEAVES; i++) T->num_to_name[i] = '\0';*/ return T; } void del_phylogenetic_tree(phylogenetic_tree T){ assert(T); /*just delete graph*/ del_graph(T->G); free(T); } /*****************/ void print_phylo_tree(phylogenetic_tree T, char** num_to_name) { node n; edge e; assert(T); assert(num_to_name); printf("Num of nodes = %d, num of edges = %d\n", T->G->n_nodes, T->G->n_edges); /*this commented section is used for debugging where we output actual pointer values to make sure everything is pointing to where it should.*/ /* printf("\nEdges\n"); for(e = T->G->first_edge; e; e = e->next){ printf("ptr: %d id: %d wgt: %.2f u_id: %d v_id: %d\nprev: %d next: %d\nprev_u: %d next_u: %d\nprev_v: %d next_v: %d\n\n", (unsigned int)e, e->id, ((edge_aux_vars)e->aux_vars)->wgt, e->u->id, e->v->id, (unsigned int)e->prev, (unsigned int)e->next, (unsigned int)e->u_prev, (unsigned int)e->u_next, (unsigned int)e->v_prev, (unsigned int)e->v_next); } */ for(n = T->G->first_node; n; n = n->next) { if(((node_aux_vars)n->aux_vars)->taxon_id != -1){ printf("(%-3d %d %-3s %d %d): ", n->id, n->degree, num_to_name[((node_aux_vars)n->aux_vars)->taxon_id], ((node_aux_vars)n->aux_vars)->taxon_id, ((node_aux_vars)n->aux_vars)->taxa_size); } else{ printf("(%-3d %d %-3d %d %d): ", n->id, n->degree, ((node_aux_vars)n->aux_vars)->taxon_id, ((node_aux_vars)n->aux_vars)->taxon_id, ((node_aux_vars) n->aux_vars)->taxa_size); } for(e = n->first_edge; e;){ printf(" (%d-%d):%.3f ", e->u->id, e->v->id, ((edge_aux_vars)e->aux_vars)->wgt); if(e->u == n) e = e->u_next; else if (e->v == n) e = e->v_next; else assert(0); } printf("\n"); } } /***************/ /*I/O routines*/ /***************/ void print_tree(phylogenetic_tree T, char** num_to_name, char* tree_string, short do_wgts){ node v, w; edge e; char buffer[50]; assert(T); assert(num_to_name); remove_2degreeinternal_tree(T); e = T->G->first_edge; /*v = e->u; w = e->v;*/ w = e->u; v = e->v; strcat(tree_string, "("); print_tree_rec(T, num_to_name, v, e, tree_string, do_wgts); if(do_wgts){ strcat(tree_string, ":"); /*printing only three significant digits*/ sprintf(buffer, "%.3f", ((edge_aux_vars)e->aux_vars)->wgt); strcat(tree_string, buffer); } strcat(tree_string, ","); print_tree_rec(T, num_to_name, w, e, tree_string, do_wgts); if(do_wgts) strcat(tree_string, ":0.0);"); else strcat(tree_string, ");"); } static void print_tree_rec(phylogenetic_tree T, char** num_to_name, const node root, const edge e, char* tree_string, short do_wgts){ int i, x; int_set A; char buffer[50]; short first, first_; edge f; node v; int* intset_entries; A = ((node_aux_vars)root->aux_vars)->taxaset; if(root->degree == 1){ A = ((node_aux_vars)root->aux_vars)->taxaset; assert(size_int_set(A)); first_=1; intset_entries = (int*) malloc(size_int_set(A) * sizeof(int)); get_entries_int_set(A, intset_entries); for(i=0; i < size_int_set(A); i++) { x = intset_entries[i]; assert(num_to_name[x]); if(first_) sprintf(buffer, "%s", num_to_name[x]); else sprintf(buffer, ",%s", num_to_name[x]); first_=0; strcat(tree_string, buffer); } free(intset_entries); } else { A = ((node_aux_vars)root->aux_vars)->taxaset; if(size_int_set(A)){ first_=1; intset_entries = (int*) malloc(size_int_set(A) * sizeof(int)); get_entries_int_set(A, intset_entries); for(i=0; i < size_int_set(A); i++) { x = intset_entries[i]; assert(num_to_name[x]); if(first_) sprintf(buffer, "%s", num_to_name[x]); else sprintf(buffer, ",%s", num_to_name[x]); first_=0; strcat(tree_string, buffer); } strcat(tree_string, ","); free(intset_entries); } first=1; strcat(tree_string, "("); for(f=root->first_edge; f; ){ if(f!=e){ if(first) first=0; else strcat(tree_string, ","); v = opposite(root, f); print_tree_rec(T, num_to_name, v, f, tree_string, do_wgts); if(do_wgts){ strcat(tree_string, ":"); /*printing only three significant digits*/ sprintf(buffer, "%.3f", ((edge_aux_vars)f->aux_vars)->wgt); strcat(tree_string, buffer); } } if(f->u == root) f = f->u_next; else if (f->v == root) f = f->v_next; else assert(0); } strcat(tree_string, ")"); } } int read_tree(phylogenetic_tree T, char** num_to_name, char* newick_tree, hashtable name_to_num){ int tree_correctly_read; node root; node_aux_vars node_vars; assert(T); assert(newick_tree); assert(num_to_name); /*creating a new node_aux_vars to carry auxilliary info in node*/ node_vars = new_node_aux_vars(); root = new_node(T->G, node_vars); tree_correctly_read = read_tree_rec(T, num_to_name, &newick_tree, root, name_to_num); if(!tree_correctly_read) return 0; /*print_phylo_tree(T);*/ if(root->degree == 1 && ((node_aux_vars)root->aux_vars)->taxon_id == -1){ del_node(T->G, root); root = NULL; } /*print_phylo_tree(T);*/ remove_2degreeinternal_tree(T); /*print_phylo_tree(T);*/ return 1; } static int read_tree_rec(phylogenetic_tree T, char** num_to_name, char** newick_tree, node root, hashtable name_to_num){ node n; char* endptr; double x; node_aux_vars node_vars; edge_aux_vars edge_vars; /*creating a new node_aux_vars for carrying auxilliary node info*/ node_vars = new_node_aux_vars(); n = new_node(T->G, node_vars); if(**newick_tree=='('){ /*recurse to read rest of tree*/ do { (*newick_tree)++; if(!read_tree_rec(T, num_to_name, newick_tree, n, name_to_num)) return 0; } while(**newick_tree == ','); if(**newick_tree != ')'){ printf("Error: expected: ')', found: '%c'\n", **newick_tree); return 0; } (*newick_tree)++; } else { /*found leaf; read in the name*/ if(!read_leaf_name(T, num_to_name, newick_tree, n, name_to_num)){ printf("Error: expected taxon name before: %c\n", **newick_tree); return 0; /*exit(EXIT_FAILURE);*/ } } if(**newick_tree == ':') { /*found weight; read in weight*/ (*newick_tree)++; x = strtod(*newick_tree, &endptr); if(*newick_tree == endptr) { printf("Error: expected double, found %c\n", **newick_tree); return 0; /*exit(EXIT_FAILURE);*/ } *newick_tree = endptr; edge_vars = new_edge_aux_vars(); /*convert negative edge weight to zero and print warning*/ if(x < 0) { x = 0; printf("Warning: negative edge weight found, converting to zero\n"); } edge_vars->wgt = x; new_edge(T->G, root, n, edge_vars); ((node_aux_vars)root->aux_vars)->taxa_size += ((node_aux_vars)n->aux_vars)->taxa_size; } else { edge_vars = new_edge_aux_vars(); edge_vars->wgt = 0; new_edge(T->G, root, n, edge_vars); ((node_aux_vars)root->aux_vars)->taxa_size += ((node_aux_vars)n->aux_vars)->taxa_size; } return 1; } static int read_leaf_name(phylogenetic_tree T, char** num_to_name, char** newick_tree, node n, hashtable name_to_num){ char name[MAX_NAME_LEN]; int i = 0; int taxon_id; while(!strchr(":(),", **newick_tree)){ // TL added length check if (i > (MAX_NAME_LEN - 2)) { printf("Leaf name too long, max is %d\n", MAX_NAME_LEN); return 0; } name[i++] = **newick_tree; (*newick_tree)++; if(!*newick_tree){ printf("Error: unexpected EOF while reading leaf name\n"); return 0; } } name[i] = '\0'; if(!name_to_num){ num_to_name[T->num_leaves] = (char*) malloc(strlen(name)+1); strcpy(num_to_name[T->num_leaves], name); ((node_aux_vars)n->aux_vars)->taxon_id = T->num_leaves; ((node_aux_vars)n->aux_vars)->taxa_size = 1; /*inserting taxon_id into taxaset*/ insert_int_set(((node_aux_vars)n->aux_vars)->taxaset, T->num_leaves); } else{ taxon_id = lookup_hashtable(name_to_num, name); /*printf("found name=%s at position %d\n", name,taxon_id);*/ assert(!strcmp(num_to_name[taxon_id], name)); ((node_aux_vars)n->aux_vars)->taxon_id = taxon_id; ((node_aux_vars)n->aux_vars)->taxa_size = 1; /*inserting taxon_id into taxaset*/ insert_int_set(((node_aux_vars)n->aux_vars)->taxaset, taxon_id); } T->num_leaves++; return 1; } /***************/ /*End of I/O routines*/ /***************/ /***************/ /*Miscellaneous routines*/ /***************/ void set_subtree_sizes(phylogenetic_tree T){ node v; /*first make sure all taxa sizes are set to zero*/ for(v=T->G->first_node; v; v=v->next) ((node_aux_vars)v->aux_vars)->taxa_size = 0; for(v=T->G->first_node; v; v=v->next) if(v->degree > 1){ set_subtree_sizes_rec(T, v, (edge) 0); break; } } static int set_subtree_sizes_rec(phylogenetic_tree T, node v, edge e){ edge f; if(v->degree == 1){ if(((node_aux_vars)v->aux_vars)->taxon_id != -1) { ((node_aux_vars)v->aux_vars)->taxa_size = 1; return 1; } else { /*printf("node with degree 1 has taxon_id=-1\n"); printf("%d\n", size_int_set(((node_aux_vars)v->aux_vars)->taxaset));*/ assert(0); /*degree one nodes must be leaves with a +ve taxon id*/ } } for(f=v->first_edge; f; ){ if(f!=e) ((node_aux_vars)v->aux_vars)->taxa_size += set_subtree_sizes_rec(T, opposite(v, f), f); if(f->u==v) f=f->u_next; else if(f->v==v) f=f->v_next; else assert(0); } return ((node_aux_vars)v->aux_vars)->taxa_size; } void assign_tree(phylogenetic_tree dest, phylogenetic_tree src){ assert(dest && src); assign_graph(dest->G, src->G); dest->num_leaves = src->num_leaves; } void restrict_tree(phylogenetic_tree T, int_set taxa){ node u, v; list edge_list; list_node ln; list node_list; edge_list = new_list(); node_list = new_list(); for(v = T->G->first_node; v; v=v->next) if(v->degree > 1){ /*find edges to remove, i.e. those not part of the induced tree*/ restrict_tree_rec(T, taxa, edge_list, node_list, (edge) 0, v); break; } /*remove selected edges*/ for(ln=edge_list->head; ln; ln=ln->next){ del_edge(T->G, (edge)ln->data); ln->data = NULL; } /*remove selected nodes*/ for(ln=node_list->head; ln; ln=ln->next){ /*updating num_leaves variable*/ if(((node_aux_vars)((node)ln->data)->aux_vars)->taxon_id!=-1) T->num_leaves--; del_node(T->G, (node)ln->data); ln->data = NULL; } del_list(edge_list); del_list(node_list); /*removing internal 2-degree nodes as a result of deleting edges*/ remove_2degreeinternal_tree(T); /*check for hanging nodes*/ for(v=T->G->first_node; v; ) if(v->degree==1 && ((node_aux_vars)v->aux_vars)->taxon_id==-1){ u = v->next; del_node(T->G, v); v = u; } else v = v->next; /*removing internal 2-degree nodes as a result of deleting edges. this needs to be done again to assure the tree is still binary*/ remove_2degreeinternal_tree(T); return; } static int restrict_tree_rec(phylogenetic_tree T, int_set taxa, list edge_list, list node_list, edge e, node v){ edge f; int found; found = 0; if(v->degree == 1) { if(member_int_set(taxa, ((node_aux_vars)v->aux_vars)->taxon_id)) return 1; else { /*need to delete this node*/ push_list(node_list, (void*) v); return 0; } } for(f = v->first_edge; f; ){ if(f!=e) { if(!restrict_tree_rec(T, taxa, edge_list, node_list, f, opposite(v, f))) push_list(edge_list, (void*) f); else found=1; } if(f->u==v) f=f->u_next; else if(f->v==v) f=f->v_next; else assert(0); } /*need to delete this node as well*/ if(found==0) push_list(node_list, (void*) v); return found; } void remove_2degreeinternal_tree(phylogenetic_tree T){ node u, v, w, x; edge e1, e2; edge_aux_vars edge_vars; assert(T); u = T->G->first_node; while(u){ v = u->next; /*if(u->degree == 2 && size_int_set(((node_aux_vars)u->aux_vars)->taxaset)==0) {*/ if(u->degree == 2){ if(((node_aux_vars)u->aux_vars)->taxon_id == -1) { e1 = u->first_edge; e2 = u->last_edge; if(e1->u != u) w = e1->u; else w = e1->v; if(e2->u != u) x = e2->u; else x = e2->v; edge_vars = new_edge_aux_vars(); if(e1->aux_vars && e2->aux_vars) edge_vars->wgt = ((edge_aux_vars)e1->aux_vars)->wgt + ((edge_aux_vars)e2->aux_vars)->wgt; else edge_vars->wgt = 0; new_edge(T->G, w, x, edge_vars); del_node(T->G, u); u = NULL; } else assert(0); /*internal 2-degree node should have -ve taxon id*/ } u = v; } } void uniform_random_tree(int num_leaves, phylogenetic_tree base_tree){ node u, v, uv, w; edge* edges; edge f; int i, j, count, n_edges; phylogenetic_tree T; node_aux_vars node_vars; edge_aux_vars edge_vars; srand(time(NULL)); assert(base_tree->num_leaves==0); T = new_phylogenetic_tree(); count = 0; node_vars = new_node_aux_vars(); node_vars->taxon_id=count++; u = new_node(T->G, node_vars); T->num_leaves++; node_vars = new_node_aux_vars(); node_vars->taxon_id=count++; v = new_node(T->G, node_vars); T->num_leaves++; edges = (edge*) malloc((2*num_leaves-3) * sizeof(edge)); for(i=0;i<(2*num_leaves - 3);i++) edges[i]=NULL; edge_vars = new_edge_aux_vars(); edge_vars->wgt=1; edges[0] = new_edge(T->G, u, v, edge_vars); /*default edge wgt=1*/ n_edges = 1; for(i=3; i <= num_leaves; i++){ j = (int) ((n_edges-1) * ((double) rand()/(double) RAND_MAX)); assert(j >= 0); assert(j < (2*num_leaves - 3)); f = edges[j]; u = f->u; v = f->v; assert(f); node_vars = new_node_aux_vars(); uv = new_node(T->G, node_vars); node_vars = new_node_aux_vars(); node_vars->taxon_id=count++; w = new_node(T->G, node_vars); T->num_leaves++; del_edge(T->G, f); edge_vars = new_edge_aux_vars(); edge_vars->wgt=1; edges[j] = new_edge(T->G, w, uv, edge_vars); edge_vars = new_edge_aux_vars(); edge_vars->wgt=1; edges[n_edges++] = new_edge(T->G, v, uv, edge_vars); edge_vars = new_edge_aux_vars(); edge_vars->wgt=1; edges[n_edges++] = new_edge(T->G, u, uv, edge_vars); } assert(T->num_leaves == num_leaves); free(edges); assign_tree(base_tree, T); del_phylogenetic_tree(T); } void random_refine(phylogenetic_tree T){ node v; node* sub2tree; int i; list node_list; phylogenetic_tree base_tree; list_node ln; edge f; node_list=new_list(); for(v = T->G->first_node; v; v = v->next) if(v->degree > 3) push_list(node_list, (void*)v); for(ln=node_list->head; ln; ln=ln->next){ v = (node)ln->data; sub2tree = (node*) malloc(v->degree * sizeof(node)); i = 0; for(i=0;idegree;i++)sub2tree[i]=NULL; i = 0; for(f = v->first_edge; f; ){ sub2tree[i++] = (void*) (opposite(v, f)); if(f->u == v) f=f->u_next; else f=f->v_next; } base_tree = new_phylogenetic_tree(); uniform_random_tree(v->degree, base_tree); copy_subtree(T, base_tree, sub2tree); del_phylogenetic_tree(base_tree); del_node(T->G, v); free(sub2tree); } for(ln=node_list->head; ln; ln=ln->next) ln->data=NULL; del_list(node_list); } void random_refine_edgelens(phylogenetic_tree T){ node v, w; node* sub2tree; int i; list node_list; phylogenetic_tree base_tree; list_node ln; edge f; node_list=new_list(); for(v = T->G->first_node; v; v = v->next) if(v->degree > 3) push_list(node_list, (void*)v); for(ln=node_list->head; ln; ln=ln->next){ v = (node)ln->data; sub2tree = (node*) malloc(v->degree * sizeof(node)); i = 0; for(i=0;idegree;i++)sub2tree[i]=NULL; i = 0; for(f = v->first_edge; f; ){ sub2tree[i++] = (void*) (opposite(v, f)); if(f->u == v) f=f->u_next; else f=f->v_next; } base_tree = new_phylogenetic_tree(); uniform_random_tree(v->degree, base_tree); /*fixing edge lengths*/ for(f = base_tree->G->first_edge; f; f=f->next){ if(f->u->degree==1 || f->v->degree==1){ if(f->u->degree==1) w = f->u; else w = f->v; assert(((node_aux_vars)w->aux_vars)->taxon_id != -1); ((edge_aux_vars)f->aux_vars)->wgt = ((edge_aux_vars) sub2tree[((node_aux_vars)w->aux_vars)->taxon_id]->first_edge->aux_vars)->wgt; } else ((edge_aux_vars)f->aux_vars)->wgt = 0; } copy_subtree(T, base_tree, sub2tree); del_phylogenetic_tree(base_tree); del_node(T->G, v); free(sub2tree); } for(ln=node_list->head; ln; ln=ln->next) ln->data=NULL; del_list(node_list); } static void copy_subtree(phylogenetic_tree T, phylogenetic_tree base_tree, node* sub2tree){ node s, t, v; for(v=base_tree->G->first_node; v; v=v->next) if(v->degree == 1 && ((node_aux_vars)v->aux_vars)->taxon_id==0){ s = v; break; } assert(s); t = sub2tree[0]; assert(t); /*printf("base_tree has %d leaves\n", base_tree->num_leaves);*/ copy_subtree_rec(s, t, (edge) 0, T, base_tree, sub2tree); } static void copy_subtree_rec(node u, node v, edge e, phylogenetic_tree T, phylogenetic_tree base_tree, node* sub2tree){ edge f; node_aux_vars nvars; edge_aux_vars evars; node n, opp; for(f = u->first_edge; f; ){ if(f!=e){ opp = opposite(u, f); if(((node_aux_vars)opp->aux_vars)->taxon_id == -1){ nvars = new_node_aux_vars(); n = new_node(T->G, nvars); evars = new_edge_aux_vars(); /*evars->wgt=1;*/ evars->wgt = ((edge_aux_vars)f->aux_vars)->wgt; new_edge(T->G, v, n, evars); copy_subtree_rec(opp, n, f, T, base_tree, sub2tree); } else { evars = new_edge_aux_vars(); /*evars->wgt=1;*/ evars->wgt = ((edge_aux_vars)f->aux_vars)->wgt; new_edge(T->G, v, sub2tree[((node_aux_vars)opp->aux_vars)->taxon_id], evars); } } if(f->u == u) {f=f->u_next; } else {f=f->v_next;} } }