# $Id$ package Bio::Phylo::Parsers::Newick; use strict; use Bio::Phylo::IO; use Bio::Phylo; use vars qw(@ISA); @ISA=qw(Bio::Phylo::IO); no warnings 'recursion'; my $logger = Bio::Phylo->get_logger; *_from_handle = \&_from_both; *_from_string = \&_from_both; =head1 NAME Bio::Phylo::Parsers::Newick - Parses newick trees. No serviceable parts inside. =head1 DESCRIPTION This module parses tree descriptions in parenthetical format. It is called by the L facade, don't call it directly. =begin comment Type : Constructor Title : _new Usage : my $newick = Bio::Phylo::Parsers::Newick->_new; Function: Initializes a Bio::Phylo::Parsers::Newick object. Returns : A Bio::Phylo::Parsers::Newick object. Args : none. =end comment =cut sub _new { my $class = $_[0]; $logger->debug("instantiating newick parser"); my $self = {}; bless( $self, $class ); return $self; } =begin comment Type : Wrapper Title : _from_both(%options) Usage : $newick->_from_both(%options); Function: Extracts trees from file, sends strings to _parse_string() Returns : Bio::Phylo::Forest Args : -handle => (\*FH) or -string => (scalar). Comments: =end comment =cut sub _from_both { my $self = shift; my %args = @_; # turn string into pseudo-handle if ( $args{'-string'} ) { require IO::String; $args{'-handle'} = IO::String->new( $args{'-string'} ); $logger->debug("creating handle from string"); } # just concatenate my $string; while ( my $line = $args{-handle}->getline ) { chomp( $line ); $string .= $line; } $logger->debug("concatenated lines"); # remove comments, split on trees my @trees = $self->_split( $string ); # lazy loading, we only want the forest *now* require Bio::Phylo::Forest; my $forest = Bio::Phylo::Forest->new; # parse trees for my $tree ( @trees ) { $forest->insert( $self->_parse_string( $tree ) ); } # adding labels to untagged nodes if ( $args{'-label'} ) { for my $tree ( @{ $forest->get_entities } ) { my $i = 1; for my $node ( @{ $tree->get_entities } ) { if ( not $node->get_name ) { $node->set_name( 'n' . $i++ ); } } } } # done return $forest; } =begin comment Type : Parser Title : _split($string) Usage : my @strings = $newick->_split($string); Function: Creates an array of (decommented) tree descriptions Returns : A Bio::Phylo::Forest::Tree object. Args : $string = concatenated tree descriptions =end comment =cut sub _split { my ( $self, $string ) = @_; my ( $QUOTED, $COMMENTED ) = ( 0, 0 ); my $decommented = ''; my @trees; TOKEN: for my $i ( 0 .. length( $string ) ) { if ( ! $QUOTED && ! $COMMENTED && substr($string,$i,1) eq "'" ) { $QUOTED++; } elsif ( ! $QUOTED && ! $COMMENTED && substr($string,$i,1) eq "[" ) { $COMMENTED++; next TOKEN; } elsif ( ! $QUOTED && $COMMENTED && substr($string,$i,1) eq "]" ) { $COMMENTED--; next TOKEN; } elsif ( $QUOTED && ! $COMMENTED && substr($string,$i,1) eq "'" && substr($string,$i,2) ne "''" ) { $QUOTED--; } $decommented .= substr($string,$i,1) unless $COMMENTED; if ( ! $QUOTED && ! $COMMENTED && substr($string,$i,1) eq ';' ) { push @trees, $decommented; $decommented = ''; } } $logger->debug("removed comments, split on tree descriptions"); return @trees; } =begin comment Type : Parser Title : _parse_string($string) Usage : my $tree = $newick->_parse_string($string); Function: Creates a populated Bio::Phylo::Forest::Tree object from a newick string. Returns : A Bio::Phylo::Forest::Tree object. Args : $string = a newick tree description =end comment =cut sub _parse_string { my ( $self, $string ) = @_; $logger->debug("going to parse tree string '$string'"); require Bio::Phylo::Forest::Tree; require Bio::Phylo::Forest::Node; my $tree = Bio::Phylo::Forest::Tree->new; my $remainder = $string; my $token; my @tokens; while ( ( $token, $remainder ) = $self->_next_token( $remainder ) ) { last if ( ! defined $token || ! defined $remainder ); $logger->debug("fetched token '$token'"); push @tokens, $token; } my $i; for ( $i = $#tokens; $i >= 0; $i-- ) { last if $tokens[$i] eq ';'; } my $root = Bio::Phylo::Forest::Node->new; $tree->insert( $root ); $self->_parse_node_data( $root, @tokens[ 0 .. ( $i - 1 ) ] ); $self->_parse_clade( $tree, $root, @tokens[ 0 .. ( $i - 1 ) ] ); return $tree; } sub _parse_clade { my ( $self, $tree, $root, @tokens ) = @_; $logger->debug("recursively parsing clade '@tokens'"); my ( @clade, $depth, @remainder ); TOKEN: for my $i ( 0 .. $#tokens ) { if ( $tokens[$i] eq '(' ) { if ( not defined $depth ) { $depth = 1; next TOKEN; } else { $depth++; } } elsif ( $tokens[$i] eq ',' && $depth == 1 ) { my $node = Bio::Phylo::Forest::Node->new; $root->set_child( $node ); $tree->insert( $node ); $self->_parse_node_data( $node, @clade ); $self->_parse_clade( $tree, $node, @clade ); @clade = (); next TOKEN; } elsif ( $tokens[$i] eq ')' ) { $depth--; if ( $depth == 0 ) { @remainder = @tokens[ ( $i + 1 ) .. $#tokens ]; my $node = Bio::Phylo::Forest::Node->new; $root->set_child( $node ); $tree->insert( $node ); $self->_parse_node_data( $node, @clade ); $self->_parse_clade( $tree, $node, @clade ); last TOKEN; } } push @clade, $tokens[$i]; } } sub _parse_node_data { my ( $self, $node, @clade ) = @_; $logger->debug("parsing name and branch length for node"); my @tail; PARSE_TAIL: for ( my $i = $#clade; $i >= 0; $i-- ) { if ( $clade[$i] eq ')' ) { @tail = @clade[ ( $i + 1 ) .. $#clade ]; last PARSE_TAIL; } elsif ( $i == 0 ) { @tail = @clade; } } # name only if ( scalar @tail == 1 ) { $node->set_name( $tail[0] ); } elsif ( scalar @tail == 2 ) { $node->set_branch_length( $tail[-1] ); } elsif ( scalar @tail == 3 ) { $node->set_name( $tail[0] ); $node->set_branch_length( $tail[-1] ); } } sub _next_token { my ( $self, $string ) = @_; $logger->debug("tokenizing string '$string'"); my $QUOTED = 0; my $token = ''; my $TOKEN_DELIMITER = qr/[():,;]/; TOKEN: for my $i ( 0 .. length( $string ) ) { $token .= substr($string,$i,1); $logger->debug("growing token: '$token'"); if ( ! $QUOTED && $token =~ $TOKEN_DELIMITER ) { my $length = length( $token ); if ( $length == 1 ) { $logger->debug("single char token: '$token'"); return $token, substr($string,($i+1)); } else { $logger->debug(sprintf("range token: %s", substr($token,0,$length-1))); return substr($token,0,$length-1),substr($token,$length-1,1).substr($string,($i+1)); } } if ( ! $QUOTED && substr($string,$i,1) eq "'" ) { $QUOTED++; } elsif ( $QUOTED && substr($string,$i,1) eq "'" && substr($string,$i,2) ne "''" ) { $QUOTED--; } } } =begin comment Type : Internal method. Title : _nodelabels($string) Usage : my $labelled = $newick->_nodelabels($string); Function: Returns a newick string with labelled nodes Returns : SCALAR = a labelled newick tree description Args : $string = a newick tree description Notes : Node labels are now optional, determined by the -labels => 1 switch. =end comment =cut sub _nodelabels { my ( $self, $string ) = @_; my ( $x, @x ); while ( $string =~ /\)[:|,|;|\)]/o ) { foreach ( split( /[:|,|;|\)]/o, $string ) ) { if (/n([0-9]+)/) { push( @x, $1 ); } } @x = sort { $a <=> $b } @x; $x = $x[-1]; $string =~ s/(\))([:|,|;|\)])/$1.'n'.++$x.$2/ose; } return $string; } =begin comment Type : Internal method. Title : _parse Usage : my $labelled = $newick->_nodelabels($string); Function: Recursive newick parser function Returns : (Modifies caller's tree object) Args : $substr (a newick subtree), $tree (a tree object), $parent (root of subtree) Notes : =end comment =cut sub _parse { my ( $substr, $tree, $parent ) = @_; my @clades; my ( $level, $token ) = ( 0, '' ); for my $i ( 0 .. length($substr) ) { my $c = substr( $substr, $i, 1 ); $level++ if $c eq '('; $level-- if $c eq ')'; if ( !$level && $c eq ',' || $i == length($substr) ) { my ( $node, $clade ) = &_token_handler($token); if ($clade) { push( @clades, [ $node, $clade ] ); } else { push( @clades, [$node] ); } $token = ''; } else { $token .= $c; } } $parent->set_first_daughter( $clades[0][0] ) ->set_last_daughter( $clades[-1][0] ); $clades[0][0]->set_parent($parent); $tree->insert( $clades[0][0] ); &_parse( $clades[0][1], $tree, $clades[0][0] ) if $clades[0][1]; for my $i ( 1 .. $#clades ) { $clades[$i][0]->set_parent($parent) ->set_previous_sister( $clades[ $i - 1 ][0]->set_next_sister( $clades[$i][0] ) ); $tree->insert( $clades[$i][0] ); &_parse( $clades[$i][1], $tree, $clades[$i][0] ) if $clades[$i][1]; } } =begin comment Type : Internal subroutine. Title : _token_handler Usage : my ( $node, $substring ) = &_token_handler($string); Function: Tokenizes current substring, instantiates node objects. Returns : L, SCALAR substring Args : $token (a newick subtree) Notes : =end comment =cut sub _token_handler { my $token = shift; my ( $node, $name, $clade ); if ( $token =~ m/^\((.*)\)([^()]*)$/o ) { ( $clade, $name ) = ( $1, $2 ); } else { $name = $token; } if ( $name =~ m/^([^:()]*?)\s*:\s*(.*)$/o ) { $node = Bio::Phylo::Forest::Node->new( '-name' => $1, '-branch_length' => $2, ); } else { $node = Bio::Phylo::Forest::Node->new( '-name' => $name, ); } return $node, $clade; } =head1 SEE ALSO =over =item L The newick parser is called by the L object. Look there to learn how to parse newick strings. =item L Also see the manual: L and L. =back =head1 REVISION $Id$ =cut 1;