// Hennig86Reader.cpp #if defined(_WIN32) || defined(_WIN64) // C4290 - the compiler ignores exception specifications #pragma warning(disable: 4290) #endif #include #include #include "CharacterCodes.hpp" #include "Hennig86Reader.hpp" namespace Cipres { namespace SequenceFormats { // Hennig86Reader void Hennig86Reader::Read(std::istream &input_stream, NexusWriter &writer) throw(std::bad_alloc, std::runtime_error, std::ios_base::failure) { Parser parser(input_stream, writer); parser.Parse(); } // Hennig86Reader::Lexer unsigned int Hennig86Reader::Lexer::GetToken(Token &token) throw(std::bad_alloc, std::ios_base::failure) { if(std::isspace(m_input)) { do { if(m_input == '\n') { m_line_num += 1; m_col_num = 0; } ReadNextChar(); } while(std::isspace(m_input)); token.prev_space = true; } else token.prev_space = false; token.line_num = m_line_num; token.col_num = m_col_num; if(m_input == EOF) token.type = END_OF_FILE; else if(std::isdigit(m_input)) { do { token.value.push_back(m_input); ReadNextChar(); } while(std::isalnum(m_input)); token.type = INTEGER; } else if(std::isalpha(m_input)) { do { token.value.push_back(m_input); ReadNextChar(); } while(std::isalnum(m_input) || m_input == '_'); token.type = STRING; } else { token.value.push_back(m_input); switch(m_input) { case ',': token.type = COMMA; break; case '.': token.type = PERIOD; break; case '&': token.type = AMPERSAND; break; case ';': token.type = SEMICOLON; break; case '_': token.type = UNDERSCORE; break; case '+': token.type = PLUS_SIGN; break; case '-': token.type = MINUS_SIGN; break; case '=': token.type = EQUALS_SIGN; break; case '\'': token.type = SINGLE_QUOTE; break; case '/': token.type = FORWARD_SLASH; break; case '?': token.type = QUESTION_MARK; break; case '[': token.type = OPEN_BRACKET; break; case ']': token.type = CLOSE_BRACKET; break; case '{': token.type = OPEN_BRACE; break; case '}': token.type = CLOSE_BRACE; break; case '(': token.type = OPEN_PARENS; break; case ')': token.type = CLOSE_PARENS; break; default: token.type = OTHER_NON_ALNUM; } ReadNextChar(); } return token.type; } bool Hennig86Reader::Lexer::GetBlock(std::string &block, char delim) throw(std::bad_alloc, std::ios_base::failure) { while(m_input != delim) { if(m_input == EOF) return false; else if(m_input == '\n') { m_line_num += 1; m_col_num = 0; } block.push_back(m_input); ReadNextChar(); } ReadNextChar(); return true; } bool Hennig86Reader::Lexer::SkipBlock(char delim) throw(std::ios_base::failure) { while(m_input != delim) { if(m_input == EOF) return false; else if(m_input == '\n') { m_line_num += 1; m_col_num = 0; } ReadNextChar(); } ReadNextChar(); return true; } void Hennig86Reader::Lexer::ReadNextChar() throw(std::ios_base::failure) { if(m_input_stream.eof()) { m_input = EOF; return; } m_input = m_input_stream.get(); m_col_num += 1; } // Hennig86Reader::Parser bool Hennig86Reader::Parser::MatchesKeyword(const char *keyword, std::string &value) throw() { assert(keyword); for(std::string::const_iterator i = value.begin() ; i != value.end() ; i++, keyword++) { assert(std::tolower(*keyword) == *keyword); if(std::tolower(*i) != *keyword) return false; } return *keyword == '\0'; } void Hennig86Reader::Parser::Parse() throw(std::bad_alloc, std::runtime_error, std::ios_base::failure) { while(GetToken() != Lexer::END_OF_FILE) { if(m_token.type == Lexer::STRING) { if(MatchesKeyword("ag", m_token.value) || MatchesKeyword("agroup", m_token.value)) ParsedIgnoredCommmand(); else if(MatchesKeyword("cc", m_token.value) || MatchesKeyword("ccode", m_token.value)) ParsedIgnoredCommmand(); else if(MatchesKeyword("cn", m_token.value) || MatchesKeyword("cnames", m_token.value)) ParseCnamesCommand(); else if(MatchesKeyword("comments", m_token.value)) ParseCommentsCommand(); else if(MatchesKeyword("ns", m_token.value) || MatchesKeyword("nstates", m_token.value)) { ParseNstatesCommand(); if(GetToken() == Lexer::END_OF_FILE) std::cerr << "warning: line " << m_token.line_num << ", character " << m_token.col_num << ": no terminating semicolon" << std::endl; else if(m_token.type != Lexer::SEMICOLON) throw SyntaxError(m_token); } else if(MatchesKeyword("p", m_token.value) || MatchesKeyword("proc", m_token.value)) ParsedIgnoredCommmand(); else if(MatchesKeyword("tg", m_token.value) || MatchesKeyword("tgroup", m_token.value)) ParsedIgnoredCommmand(); else if(MatchesKeyword("tr", m_token.value) || MatchesKeyword("tread", m_token.value)) ParsedIgnoredCommmand(); else if(MatchesKeyword("xg", m_token.value) || MatchesKeyword("xgroup", m_token.value)) ParsedIgnoredCommmand(); else if(MatchesKeyword("xr", m_token.value) || MatchesKeyword("xread", m_token.value)) ParseXreadCommand(); else ParsedIgnoredCommmand(); } else { std::cerr << "warning: line " << m_token.line_num << ", character " << m_token.col_num << ": ignoring unrecognized token " << m_token.value << std::endl; if(!m_lexer.SkipBlock(';')) { std::cerr << "warning: line " << m_token.line_num << ", character " << m_token.col_num << ": no terminating semicolon " << m_token.value << std::endl; break; } } } } void Hennig86Reader::Parser::ParsedIgnoredCommmand() throw(std::bad_alloc, std::runtime_error, std::ios_base::failure) { std::string command; command.reserve(m_token.value.size()); for(std::string::const_iterator i = m_token.value.begin() ; i != m_token.value.end() ; i++) command.push_back(std::toupper(*i)); std::cerr.write("warning: line ", 14); std::cerr << m_token.line_num; std::cerr.write(", character ", 12); std::cerr << m_token.col_num; std::cerr.write(": ignoring ", 11); std::cerr << command; std::cerr.write(" command", 8); std::cerr << std::endl; m_lexer.SkipBlock(';'); } void Hennig86Reader::Parser::ParseCnamesCommand() throw(std::bad_alloc, std::runtime_error, std::ios_base::failure) { std::cerr.write("warning: line ", 14); std::cerr << m_token.line_num; std::cerr.write(", character ", 12); std::cerr << m_token.col_num; std::cerr.write(": ignoring CNAMES command", 25); std::cerr << std::endl; while(1) { m_lexer.SkipBlock(';'); if(GetToken() == Lexer::SEMICOLON || m_token.type == Lexer::END_OF_FILE) return; } } void Hennig86Reader::Parser::ParseCommentsCommand() throw(std::bad_alloc, std::runtime_error, std::ios_base::failure) { std::cerr.write("warning: line ", 14); std::cerr << m_token.line_num; std::cerr.write(", character ", 12); std::cerr << m_token.col_num; std::cerr.write(": ignoring COMMENTS command", 27); std::cerr << std::endl; while(1) { m_lexer.SkipBlock(';'); if(GetToken() == Lexer::SEMICOLON || m_token.type == Lexer::END_OF_FILE) return; } } void Hennig86Reader::Parser::ParseNstatesCommand() throw(std::bad_alloc, std::runtime_error, std::ios_base::failure) { if(GetToken() == Lexer::STRING) { if(MatchesKeyword("dna", m_token.value)) { m_default_data_type = DNA; m_max_char_states = 8; return; } else if(MatchesKeyword("prot", m_token.value)) { m_default_data_type = PROTEIN; m_max_char_states = 32; return; } else if(MatchesKeyword("cont", m_token.value)) { m_default_data_type = CONTINUOUS; m_max_char_states = 66; return; } else if(MatchesKeyword("min", m_token.value)) { std::cerr << "warning: line " << m_token.line_num << ", character " << m_token.col_num << ": ignoring NSTATES MIN command" << std::endl; return; } else if(MatchesKeyword("num", m_token.value)) GetToken(); else throw SyntaxError(m_token); } if(m_token.type != Lexer::INTEGER) throw SyntaxError(m_token); int max_states = ToInteger(m_token.value); if(max_states < 1 || max_states > 32) throw std::runtime_error("error: invalid value"); m_default_data_type = NUMERIC; m_max_char_states = max_states; } void Hennig86Reader::Parser::ParseXreadCommand() throw(std::bad_alloc, std::runtime_error, std::ios_base::failure) { if(m_writer.NumDataBlocks() > 0) throw std::runtime_error("more than one XREAD command"); if(GetToken() == Lexer::SINGLE_QUOTE) { std::string title; if(!m_lexer.GetBlock(title, '\'')) throw std::runtime_error("unterminated title"); m_writer.SetTitle(title); GetToken(); } if(m_token.type != Lexer::INTEGER) throw SyntaxError(m_token); m_num_chars = ToInteger(m_token.value); if(m_num_chars < 1) throw std::runtime_error("error: invalid value"); if(GetToken() != Lexer::INTEGER) throw SyntaxError(m_token); m_num_taxa = ToInteger(m_token.value); if(m_num_taxa < 1) throw std::runtime_error("error: invalid value"); if(GetToken() == Lexer::AMPERSAND) ParseInterleavedData(); else { if(m_default_data_type == DNA) ParseDnaDataBlock(false); else if(m_default_data_type == PROTEIN) ParseProteinDataBlock(false); else if(m_default_data_type == CONTINUOUS) ParseContinuousDataBlock(false); else { assert(m_default_data_type == NUMERIC); if(m_max_char_states <= 10) ParseNumDataBlock(false); else ParseAlphaNumDataBlock(false); } } } void Hennig86Reader::Parser::ParseInterleavedData() throw(std::bad_alloc, std::runtime_error, std::ios_base::failure) { while(1) { unsigned int data_type = m_default_data_type; if(GetToken() == Lexer::OPEN_BRACKET) { if(GetToken() != Lexer::STRING) throw SyntaxError(m_token); if(MatchesKeyword("numeric", m_token.value)) data_type = NUMERIC; else if(MatchesKeyword("dna", m_token.value)) data_type = DNA; else if(MatchesKeyword("proteins", m_token.value)) data_type = PROTEIN; else if(MatchesKeyword("continuous", m_token.value)) data_type = CONTINUOUS; else throw SyntaxError(m_token); if(GetToken() != Lexer::CLOSE_BRACKET) throw SyntaxError(m_token); GetToken(); } if(data_type == DNA) ParseDnaDataBlock(true); else if(data_type == PROTEIN) ParseProteinDataBlock(true); else if(data_type == CONTINUOUS) ParseContinuousDataBlock(true); else { assert(data_type == NUMERIC); if(m_max_char_states <= 10) ParseNumDataBlock(true); else ParseAlphaNumDataBlock(true); } if(m_token.type == Lexer::SEMICOLON || m_token.type == Lexer::END_OF_FILE) return; else if(m_token.type != Lexer::AMPERSAND) throw SyntaxError(m_token); } } void Hennig86Reader::Parser::ParseDnaDataBlock(bool interleaved) throw(std::bad_alloc, std::runtime_error, std::ios_base::failure) { std::string taxon_name; NexusWriter::MolecularDataBlock &block = m_writer.AddDnaDataBlock(); while(1) { ParseTaxonName(taxon_name); NexusWriter::DiscreteTaxon &taxon = block.AddTaxon(taxon_name); while(1) { if(m_token.type == Lexer::PERIOD || m_token.type == Lexer::MINUS_SIGN) taxon.AddChar(GAP); else if(m_token.type == Lexer::QUESTION_MARK) taxon.AddChar(MISSING); else if(m_token.type == Lexer::STRING) { for(std::string::const_iterator i = m_token.value.begin() ; i != m_token.value.end() ; i++) { int code = NucleotideSymbolToCode(*i); if(code < 0) throw std::runtime_error("unrecognized symbol"); taxon.AddChar(code); } } else if(m_token.type == Lexer::OPEN_BRACKET) { NexusWriter::DiscreteTaxon::DiscreteCharList states; while(1) { GetToken(); if(m_token.prev_space || m_token.type == Lexer::SEMICOLON || m_token.type == Lexer::END_OF_FILE) throw std::runtime_error("unterminated polymorphic character"); if(m_token.type == Lexer::PERIOD || m_token.type == Lexer::MINUS_SIGN) states.push_back(GAP); else if(m_token.type == Lexer::QUESTION_MARK) states.push_back(MISSING); else if(m_token.type == Lexer::STRING) { for(std::string::const_iterator i = m_token.value.begin() ; i != m_token.value.end() ; i++) { int code = NucleotideSymbolToCode(*i); if(code < 0) throw std::runtime_error("unrecognized symbol"); states.push_back(code); } } else if(m_token.type == Lexer::CLOSE_BRACKET) break; else throw SyntaxError(m_token); } if(states.empty()) throw std::runtime_error("empty polymorphic character"); taxon.AddPolyChar(states); } else throw SyntaxError(m_token); if(GetToken() == Lexer::SEMICOLON || m_token.type == Lexer::END_OF_FILE || (m_token.type == Lexer::AMPERSAND && interleaved)) return; if(m_token.prev_space) break; } } } void Hennig86Reader::Parser::ParseProteinDataBlock(bool interleaved) throw(std::bad_alloc, std::runtime_error, std::ios_base::failure) { std::string taxon_name; NexusWriter::MolecularDataBlock &block = m_writer.AddProteinDataBlock(); while(1) { ParseTaxonName(taxon_name); NexusWriter::DiscreteTaxon &taxon = block.AddTaxon(taxon_name); while(1) { if(m_token.type == Lexer::PERIOD || m_token.type == Lexer::MINUS_SIGN) taxon.AddChar(GAP); else if(m_token.type == Lexer::QUESTION_MARK) taxon.AddChar(MISSING); else if(m_token.type == Lexer::STRING) { for(std::string::const_iterator i = m_token.value.begin() ; i != m_token.value.end() ; i++) { int code = AminoAcidSymbolToCode(*i); if(code < 0) throw std::runtime_error("unrecognized symbol"); taxon.AddChar(code); } } else if(m_token.type == Lexer::OPEN_BRACKET) { NexusWriter::DiscreteTaxon::DiscreteCharList states; while(1) { GetToken(); if(m_token.prev_space || m_token.type == Lexer::SEMICOLON || m_token.type == Lexer::END_OF_FILE) throw std::runtime_error("unterminated polymorphic character"); if(m_token.type == Lexer::PERIOD || m_token.type == Lexer::MINUS_SIGN) states.push_back(GAP); else if(m_token.type == Lexer::QUESTION_MARK) states.push_back(MISSING); else if(m_token.type == Lexer::STRING) { for(std::string::const_iterator i = m_token.value.begin() ; i != m_token.value.end() ; i++) { int code = AminoAcidSymbolToCode(*i); if(code < 0) throw std::runtime_error("unrecognized symbol"); states.push_back(code); } } else if(m_token.type == Lexer::CLOSE_BRACKET) break; else throw SyntaxError(m_token); } if(states.empty()) throw std::runtime_error("empty polymorphic character"); taxon.AddPolyChar(states); } else throw SyntaxError(m_token); if(GetToken() == Lexer::SEMICOLON || m_token.type == Lexer::END_OF_FILE || (m_token.type == Lexer::AMPERSAND && interleaved)) return; if(m_token.prev_space) break; } } } void Hennig86Reader::Parser::ParseNumDataBlock(bool interleaved) throw(std::bad_alloc, std::runtime_error, std::ios_base::failure) { int max_observed_state = -1; std::string taxon_name; NexusWriter::StandardDataBlock &block = m_writer.AddStandardDataBlock(); while(1) { ParseTaxonName(taxon_name); NexusWriter::DiscreteTaxon &taxon = block.AddTaxon(taxon_name); while(1) { if(m_token.type == Lexer::PERIOD || m_token.type == Lexer::MINUS_SIGN) taxon.AddChar(GAP); else if(m_token.type == Lexer::QUESTION_MARK) taxon.AddChar(MISSING); else if(m_token.type == Lexer::INTEGER) { for(std::string::const_iterator i = m_token.value.begin() ; i != m_token.value.end() ; i++) { int code = SymbolToState(*i); if(code < 0) throw std::runtime_error("unrecognized symbol"); else if(code >= m_max_char_states) throw std::runtime_error("state exceeds maximum state value"); if(code > max_observed_state) max_observed_state = code; taxon.AddChar(code); } } else if(m_token.type == Lexer::OPEN_BRACKET) { NexusWriter::DiscreteTaxon::DiscreteCharList states; while(1) { GetToken(); if(m_token.prev_space || m_token.type == Lexer::SEMICOLON || m_token.type == Lexer::END_OF_FILE) throw std::runtime_error("unterminated polymorphic character"); if(m_token.type == Lexer::PERIOD || m_token.type == Lexer::MINUS_SIGN) states.push_back(GAP); else if(m_token.type == Lexer::QUESTION_MARK) states.push_back(MISSING); else if(m_token.type == Lexer::INTEGER) { for(std::string::const_iterator i = m_token.value.begin() ; i != m_token.value.end() ; i++) { int code = SymbolToState(*i); if(code < 0) throw std::runtime_error("unrecognized symbol"); else if(code >= m_max_char_states) throw std::runtime_error("state exceeds maximum state value"); if(code > max_observed_state) max_observed_state = code; states.push_back(code); } } else if(m_token.type == Lexer::CLOSE_BRACKET) break; else throw SyntaxError(m_token); } if(states.empty()) throw std::runtime_error("empty polymorphic character"); taxon.AddPolyChar(states); } else throw SyntaxError(m_token); if(GetToken() == Lexer::SEMICOLON || m_token.type == Lexer::END_OF_FILE || (m_token.type == Lexer::AMPERSAND && interleaved)) { std::string symbols; for(int i = 0 ; i <= max_observed_state ; i++) symbols.push_back(StateToSymbol(i)); block.SetSymbols(symbols); return; } if(m_token.type == Lexer::STRING) break; } } } void Hennig86Reader::Parser::ParseAlphaNumDataBlock(bool interleaved) throw(std::bad_alloc, std::runtime_error, std::ios_base::failure) { int max_observed_state = -1; std::string taxon_name; NexusWriter::StandardDataBlock &block = m_writer.AddStandardDataBlock(); while(1) { ParseTaxonName(taxon_name); NexusWriter::DiscreteTaxon &taxon = block.AddTaxon(taxon_name); while(1) { if(m_token.type == Lexer::PERIOD || m_token.type == Lexer::MINUS_SIGN) taxon.AddChar(GAP); else if(m_token.type == Lexer::QUESTION_MARK) taxon.AddChar(MISSING); else if(m_token.type == Lexer::INTEGER || m_token.type == Lexer::STRING) { for(std::string::const_iterator i = m_token.value.begin() ; i != m_token.value.end() ; i++) { int code = SymbolToState(*i); if(code < 0) throw std::runtime_error("unrecognized symbol"); else if(code >= m_max_char_states) throw std::runtime_error("state exceeds maximum state value"); if(code > max_observed_state) max_observed_state = code; taxon.AddChar(code); } } else if(m_token.type == Lexer::OPEN_BRACKET) { NexusWriter::DiscreteTaxon::DiscreteCharList states; while(1) { GetToken(); if(m_token.prev_space || m_token.type == Lexer::SEMICOLON || m_token.type == Lexer::END_OF_FILE) throw std::runtime_error("unterminated polymorphic character"); if(m_token.type == Lexer::PERIOD || m_token.type == Lexer::MINUS_SIGN) states.push_back(GAP); else if(m_token.type == Lexer::QUESTION_MARK) states.push_back(MISSING); else if(m_token.type == Lexer::INTEGER || m_token.type == Lexer::STRING) { for(std::string::const_iterator i = m_token.value.begin() ; i != m_token.value.end() ; i++) { int code = SymbolToState(*i); if(code < 0) throw std::runtime_error("unrecognized symbol"); else if(code >= m_max_char_states) throw std::runtime_error("state exceeds maximum state value"); if(code > max_observed_state) max_observed_state = code; states.push_back(code); } } else if(m_token.type == Lexer::CLOSE_BRACKET) break; else throw SyntaxError(m_token); } if(states.empty()) throw std::runtime_error("empty polymorphic character"); taxon.AddPolyChar(states); } else throw SyntaxError(m_token); if(GetToken() == Lexer::SEMICOLON || m_token.type == Lexer::END_OF_FILE || (m_token.type == Lexer::AMPERSAND && interleaved)) { std::string symbols; for(int i = 0 ; i <= max_observed_state ; i++) symbols.push_back(StateToSymbol(i)); block.SetSymbols(symbols); return; } if(m_token.prev_space) break; } } } void Hennig86Reader::Parser::ParseContinuousDataBlock(bool interleaved) throw(std::bad_alloc, std::runtime_error, std::ios_base::failure) { std::string taxon_name; std::string character; NexusWriter::ContinuousDataBlock &block = m_writer.AddContinuousDataBlock(); while(1) { ParseTaxonName(taxon_name); NexusWriter::ContinuousTaxon &taxon = block.AddTaxon(taxon_name); while(1) { if(m_token.type == Lexer::QUESTION_MARK) taxon.AddChar(MISSING); else if(m_token.type == Lexer::INTEGER) { character.assign(m_token.value); if(GetToken() != Lexer::PERIOD || m_token.prev_space) throw SyntaxError(m_token); character.push_back('.'); if(GetToken() != Lexer::INTEGER || m_token.prev_space) throw SyntaxError(m_token); character.append(m_token.value); float value = ToFloat(character); taxon.AddChar(value); } else throw SyntaxError(m_token); if(GetToken() == Lexer::SEMICOLON || m_token.type == Lexer::END_OF_FILE || (m_token.type == Lexer::AMPERSAND && interleaved)) return; if(m_token.type == Lexer::STRING) break; } } } void Hennig86Reader::Parser::ParseTaxonName(std::string &name) throw(std::bad_alloc, std::runtime_error, std::ios_base::failure) { if(m_token.type != Lexer::STRING) throw SyntaxError(m_token); name.assign(m_token.value); while(1) { GetToken(); if(m_token.prev_space) break; if(m_token.type != Lexer::STRING && m_token.type != Lexer::INTEGER && m_token.type != Lexer::PERIOD && m_token.type != Lexer::UNDERSCORE) throw SyntaxError(m_token); name.append(m_token.value); } } // Hennig86Reader::Parser::SyntaxError Hennig86Reader::Parser::SyntaxError::SyntaxError(Lexer::Token &token) throw() : std::runtime_error(token.value) { try { std::ostringstream printer; printer.write("error: line ", 12); printer << token.line_num; printer.write(", column ", 9); printer << token.col_num; printer.write(": unexpected token ", 19); printer << token.value; printer.put('\n'); m_error_message.assign(printer.str()); } catch(...) { // we're presumably already handling an error, and it's unlikely that an exception // thrown from within the handler will be caught correctly, so we swallow it and // rely on std::runtime_error to provide a message } } } // namespace SequenceFormats } // namespace Cipres