// This file is part of BULL, a program for phylogenetic simulations // most of the code was written by Mark T. Holder. // This program is for internal use by the lab of Dr. Tandy Warnow only. // Do not redistribute the code. It is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. // // Some of the code is from publically available source by Paul Lewis, Ziheng Yang, // John Huelsenbeck, David Swofford , and others (as noted in the code). // In fact the main structure of the program was created by modifying Paul Lewis' // basiccmdline.cpp from his NCL // // This code was used in Mark's dissertation, some changes were made in order to // get it to compile on gcc. It is possible that this porting introduced bugs (very // little debugging has been done on UNIX platforms). I would suggest checking // the simulator by generating data on trees with short branches, etc. // This file is part of BULL, a program for phylogenetic simulations // most of the code was written by Mark T. Holder. // This program is for internal use by the lab of Dr. Tandy Warnow only. // Do not redistribute the code. It is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. // // Some of the code is from publically available source by Paul Lewis, Ziheng Yang, // John Huelsenbeck, David Swofford , and others (as noted in the code). // In fact the main structure of the program was created by modifying Paul Lewis' // basiccmdline.cpp from his NCL // // This code was used in Mark's dissertation, some changes were made in order to // get it to compile on gcc. It is possible that this porting introduced bugs (very // little debugging has been done on UNIX platforms). I would suggest checking // the simulator by generating data on trees with short branches, etc. 
#include "nexus_defs.hpp" #include "xbull.hpp" #include "nexus_token.hpp" /** * @class NexusToken * @file nexus_token.hpp * @file nexustoken.cpp * @author Paul O. Lewis * @copyright Copyright © 1999. All Rights Reserved. * @variable atEOF [bool:private] true if last character read resulted in eof() returning true for input stream * @variable atEOL [bool:private] true if newline encountered while newlineIsToken labile flag set * @variable comment [std::string:private] temporary buffer used to store output comments while they are being built * @variable filecol [long:private] current column in current line (refers to column immediately following token just read) * @variable fileline [long:private] current file line * @variable filepos [long:private] current file position (for Metrowerks compiler, type is streampos rather than long) * @variable in [istream&:private] reference to input stream from which tokens will be read * @variable labileFlags [int:private] storage for labile flags (see labile enum) * @variable saved [char:private] either '\0' or is last character read from input stream * @variable special [char:private] ad hoc punctuation character; default value is '\0' * @variable token [std::string:private] the character buffer used to store the current token * @see NexusReader * @see XBull * * This class is used to read tokens from a Nexus data file. If the token * object is not attached to an input stream, calls to GetNextToken will have no * effect. If the token object is not attached to an output stream, output * comments will be discarded (i.e., not output anywhere) and calls to Write * or Writeln will be ineffective. If input and output streams have been * attached to the token object, however, tokens are read one at a time from * the input stream, and comments are correctly read and either written to * the output stream (if an output comment) or ignored (if not an output * comment). 
Sequences of characters surrounded by single quotes are read in * as single tokens. A pair of adjacent single quotes are stored as a single * quote, and underscore characters are stored as blanks. */ /** * @enumeration * @enumitem saveCommandComment [0x0001] if set, command comments of the form [&X] are not ignored but are instead saved as regular tokens (without the square brackets, however) * @enumitem parentheticalToken [0x0002] if set, and if next character encountered is a left parenthesis, token will include everything up to the matching right parenthesis * @enumitem curlyBracketedToken [0x0004] if set, and if next character encountered is a left curly bracket, token will include everything up to the matching right curly bracket * @enumitem doubleQuotedToken [0x0008] if set, grabs entire phrase surrounded by double quotes * @enumitem singleCharacterToken [0x0010] if set, next non-whitespace character returned as token * @enumitem newlineIsToken [0x0020] if set, newline character treated as a token and atEOL set if newline encountered * @enumitem tildeIsPunctuation [0x0040] if set, tilde character treated as punctuation and returned as a separate token * @enumitem useSpecialPunctuation [0x0080] if set, "special" character treated as punctuation and returned as a separate token * * For use with the variable labileFlags. */ /** * @constructor * @param i [istream&] the istream object to which the token is to be associated * * Performs the following initializations: *
 *
 *   Variable    | Initial Value
 *   ------------|--------------
 *   atEOF       | false
 *   atEOL       | false
 *   comment     | ""
 *   filecol     | 1L
 *   fileline    | 1L
 *   filepos     | 0L
 *   in          | i
 *   labileFlags | 0
 *   saved       | '\0'
 *   special     | '\0'
 *   token       | ""
 *
Needs work: need to handle exceptions (e.g., a * minus sign is not considered punctuation if we are reading in a number) */ bool NexusToken::IsPunctuation( char ch ) { char punctuation[21]; punctuation[0] = '('; punctuation[1] = ')'; punctuation[2] = '['; punctuation[3] = ']'; punctuation[4] = '{'; punctuation[5] = '}'; punctuation[6] = '/'; punctuation[7] = '\\'; punctuation[8] = ','; punctuation[9] = ';'; punctuation[10] = ':'; punctuation[11] = '='; punctuation[12] = '*'; punctuation[13] = '\''; punctuation[14] = '"'; punctuation[15] = '`'; punctuation[16] = '+'; punctuation[17] = '-'; punctuation[18] = '<'; punctuation[19] = '>'; punctuation[20] = '\0'; bool is_punctuation = false; if ( strchr( punctuation, ch ) != NULL ) is_punctuation = true; if ( labileFlags & tildeIsPunctuation && ch == '~' ) is_punctuation = true; if ( labileFlags & useSpecialPunctuation && ch == special ) is_punctuation = true; return is_punctuation; } bool NexusToken::IsInteger() { for (int i=0;i < token.size();i++) {if(!isdigit(token[i])) if (i>0 || token[0]!='-') //make sure it isn't the negative sign return false; } return true; } long NexusToken::GetLongEquivalent() { long temp=0; int j=0; bool negative=false; if (token[0] == '-') {negative=true; j++; } assert(IsInteger()); for (int i=j;i < token.size();i++) {if(temp>214748363L) throw XBull("Number is too big", *this); temp*=10; temp+=(token[i]-'0'); } if (negative) temp*=-1; return temp; } /** * @method IsWhitespace [bool:protected] * @param ch [char] the character in question * * Returns true if character supplied is considered a whitespace character. * Note: treats '\n' as darkspace if labile flag newlineIsToken is in effect. 
*/ bool NexusToken::IsWhitespace( char ch ) { char whitespace[4]; whitespace[0] = ' '; whitespace[1] = '\t'; whitespace[2] = '\n'; whitespace[3] = '\0'; bool ws = false; // if ch is found in the whitespace array, it's whitespace // if ( strchr( whitespace, ch ) != NULL ) ws = true; // unless of course ch is the newline character and we're currently // treating newlines as darkspace! // if ( labileFlags & newlineIsToken && ch == '\n' ) ws = false; return ws; } /** * @method Abbreviation [bool:public] * @param s [std::string] the comparison string * * Returns true if token begins with the capitalized portion of s * and, if token is longer than s, the remaining characters match * those in the lower-case portion of s. The comparison is case * insensitive. This function should be used instead of the * Begins function if you wish to allow for abbreviations of commands * and also want to ensure that user does not type in a word that * does not correspond to any command. */ bool NexusToken::Abbreviation( std::string s ) { int k; int slen = s.size(); int tlen = token.size(); char tokenChar, otherChar; // The variable mlen refers to the "mandatory" portion // that is the upper-case portion of s // int mlen; for ( mlen = 0; mlen < slen; mlen++ ) { if ( !isupper(s[mlen]) ) break; } // User must have typed at least mlen characters in // for there to even be a chance at a match // if ( tlen < mlen ) return false; // If user typed in more characters than are contained in s, // then there must be a mismatch // if ( tlen > slen ) return false; // Check the mandatory portion for mismatches // for ( k = 0; k < mlen; k++ ) { tokenChar = (char)toupper( token[k] ); otherChar = s[k]; if ( tokenChar != otherChar ) return false; } // Check the auxiliary portion for mismatches (if necessary) // for ( k = mlen; k < tlen; k++ ) { tokenChar = (char)toupper( token[k] ); otherChar = (char)toupper( s[k] ); if ( tokenChar != otherChar ) return false; } return true; } /** * @method 
BlanksToUnderscores [void:public] * * Converts all blanks in token to underscore characters. Normally, * underscores found in the tokens read from a NEXUS file are converted * to blanks automatically as they are read; this function reverts * the blanks back to underscores. */ void NexusToken::BlanksToUnderscores() { char tmp[256]; int len = token.length(); assert( len < 256 ); strcpy( tmp, token.c_str() ); for ( int i = 0; i < len; i++ ) { if ( tmp[i] == ' ' ) tmp[i] = '_'; } token = tmp; } /** * @method Begins [bool:public] * @param s [std::string] the comparison string * @param respect_case [bool] determines whether comparison is case sensitive (false by default) * * Returns true if token std::string begins with the std::string s. * The comparison is case insensitive by default. This function should * be used instead of the Equals function if you wish to * allow for abbreviations of commands. */ bool NexusToken::Begins( std::string s, bool respect_case /* = false */ ) { int k; char tokenChar, otherChar; int slen = s.size(); if ( slen > token.size() ) return false; for ( k = 0; k < slen; k++ ) { if ( respect_case ) { tokenChar = token[k]; otherChar = s[k]; } else { tokenChar = (char)toupper( token[k] ); otherChar = (char)toupper( s[k] ); } if ( tokenChar != otherChar ) return false; } return true; } /** * @method Equals [bool:public] * @param s [std::string] the comparison string * @param respect_case [bool] determines whether or not comparison is case sensitive (default is false) * * Returns true if token std::string exactly equals s. The comparison * is case insensitive by default. If abbreviations are to be allowed, * either Begins or Abbreviation should be used instead of Equals. 
*/
bool NexusToken::Equals( std::string s, bool respect_case /* = false */ )
{
    const std::string::size_type len = token.size();

    // strings of different length can never be equal
    if ( s.size() != len )
        return false;

    for ( std::string::size_type k = 0; k < len; k++ ) {
        char tokenChar = token[k];
        char otherChar = s[k];
        if ( !respect_case ) {
            // the unsigned char casts keep toupper() well defined for
            // bytes outside 0..127
            tokenChar = static_cast<char>( toupper( static_cast<unsigned char>( tokenChar ) ) );
            otherChar = static_cast<char>( toupper( static_cast<unsigned char>( otherChar ) ) );
        }
        if ( tokenChar != otherChar )
            return false;
    }

    return true;
}

/**
 * @method GetNextToken [void:public]
 * @throws XBull
 *
 * Reads characters from in until a complete token has been read and
 * stored in token.
 *
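GetNextToken performs a number of useful operations in the process
 * of retrieving tokens:
 *   - leading whitespace is skipped;
 *   - underscore characters are stored as blanks;
 *   - a sequence of characters surrounded by single quotes is read as a
 *     single token;
 *   - comments in square brackets are read and dealt with by GetComment;
 *   - a punctuation character that begins a token is returned as a
 *     one-character token.
 *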
The behavior of GetNextToken may be altered by using labile flags. * For example, the labile flag saveCommandComments can be set using * the member function SetLabileFlagBit. This will cause comments * of the form [&X] to be saved as tokens (without the square brackets), * but only for the aquisition of the next token. Labile flags are cleared * after each application. */ void NexusToken::GetNextToken() { ResetToken(); char ch = ' '; if ( saved == '\0' || IsWhitespace(saved) ) { // skip leading whitespace while ( IsWhitespace(ch) && !atEOF ) ch = GetNextChar(); saved = ch; } for (;;) { // break now if singleCharacterToken mode on and token length > 0 if ( labileFlags & singleCharacterToken && token.size() > 0 ) break; // get next character either from saved or from input stream if ( saved != '\0' ) { ch = saved; saved = '\0'; } else ch = GetNextChar(); // break now if we've hit EOF if ( atEOF ) break; if ( ch == '\n' && labileFlags & newlineIsToken ) { if ( token.size() > 0 ) { // newline came after token, save newline until // next time when it will be reported as a separate // token atEOL = 0; saved = ch; } else { atEOL = 1; AppendToToken(ch); } break; } else if ( IsWhitespace(ch) ) { // break only if we've begun adding to token // (remember, if we hit a comment before a token, // there might be further white space between the comment and // the next token) if ( token.size() > 0 ) break; } else if ( ch == '_' ) { ch = ' '; AppendToToken(ch); } else if ( ch == '[' ) { // get rest of comment and deal with it, but notice // that we only break if the comment ends a token, // not if it starts one (comment counts as whitespace) // // in the case of command comments (if saveCommandComment) GetComment will // add to the token std::string, causing us to break because // token.size() will be greater than 0 GetComment(); if ( token.size() > 0 ) break; } else if ( ch == '(' && labileFlags & parentheticalToken ) { AppendToToken(ch); // get rest of parenthetical token 
GetParentheticalToken(); break; } else if ( ch == '{' && labileFlags & curlyBracketedToken ) { AppendToToken(ch); // get rest of curly-bracketed token GetCurlyBracketedToken(); break; } else if ( ch == '\"' && labileFlags & doubleQuotedToken ) { // get rest of double-quoted token GetDoubleQuotedToken(); break; } else if ( ch == '\'' ) { if ( token.size() > 0 ) { // we've encountered a single quote after a token has // already begun to be read; should be another tandem // single quote character immediately following ch = GetNextChar(); if ( ch == '\'' ) AppendToToken(ch); else { errormsg = "Expecting second single quote character"; throw XBull(errormsg, *this); } } else { // get rest of quoted Nexus word and break, since // we will have eaten one token after calling GetQuoted GetQuoted(); } break; } else if ( IsPunctuation(ch) ) { if ( token.size() > 0 ) { // if we've already begun reading the token, encountering // a punctuation character means we should stop, saving // the punctuation character for the next token //MTH modification to allow getting token numbers in Sci Not if (ch == '-' && IsConsistentWithFirstPartOfSciNotation()) AppendToToken(ch); else {saved = ch; break; } } else { // if we haven't already begun reading the token, encountering // a punctuation character means we should stop and return // the punctuation character as this token (i.e., the token // is just the single punctuation character AppendToToken(ch); break; } } else { AppendToToken(ch); } } labileFlags = 0; } /** * @method IsPlusMinusToken [bool:public] * * Returns true if current token is a single character and this character * is either '+' or '-'. */ bool NexusToken::IsPlusMinusToken() { if ( token.size() == 1 && ( token[0] == '+' || token[0] == '-' ) ) return true; else return false; } /** * @method IsPunctuationToken [bool:public] * * Returns true if current token is a single character and this character * is a punctuation character (as defined in IsPunctuation function). 
*/ bool NexusToken::IsPunctuationToken() { if ( token.size() == 1 && IsPunctuation( token[0] ) ) return true; else return false; } /** * @method IsWhitespaceToken [bool:public] * * Returns true if current token is a single character and this character * is a whitespace character (as defined in IsWhitespace function). */ bool NexusToken::IsWhitespaceToken() { if ( token.size() == 1 && IsWhitespace( token[0] ) ) return true; else return false; } /** * @method ReplaceToken [void:public] * @param s [const std::string] std::string to replace current token std::string * * Replaces current token std::string with s. */ void NexusToken::ReplaceToken( const std::string s ) { token = s; } /** * @method ResetToken [void:public] * * Sets token to the empty std::string (""). */ void NexusToken::ResetToken() { token = ""; } /** * @method SetSpecialPunctuationCharacter [void:public] * @param c [char] the character to which special is set * * Sets the special punctuation character to c. If the labile bit * useSpecialPunctuation is set, this character will be added to the * standard list of punctuation symbols, and will be returned as a * separate token like the other punctuation characters. */ void NexusToken::SetSpecialPunctuationCharacter( char c ) { special = c; } /** * @method SetLabileFlagBit [void:public] * @param bit [int] the bit (see enum) to set in labileFlags * * Sets the bit specified in the variable labileFlags. The available * bits are specified in the enumeration starting with saveCommandComments. * All bits in labileFlags are cleared after each token is read. */ void NexusToken::SetLabileFlagBit( int bit ) { labileFlags |= bit; } /** * @method StoppedOn [void:public] * @param ch [char] the character to compare with saved character * * Checks character stored in the variable saved to see if it * matches supplied character ch. 
Good for checking such things * as whether token stopped reading characters because it encountered * a newline (and labileFlags bit newlineIsToken was set): *
StoppedOn('\n'); *
or whether token stopped reading characters because of a * punctuation character such as a comma: *
StoppedOn(','); */ bool NexusToken::StoppedOn( char ch ) { if ( saved == ch ) return true; else return false; } /** * @method StripWhitespace [void:public] * * Strips whitespace from currently-stored token. Removes leading, * trailing, and embedded whitespace characters. */ void NexusToken::StripWhitespace() { std::string s = ""; for ( int j = 0; j < token.size(); j++ ) { if ( IsWhitespace( token[j] ) ) continue; s += token[j]; } token = s; } /** * @method Write [void:public] * @param out [ostream&] the output stream to which to write token std::string * * Simply outputs the current std::string stored in token to the output * stream out. Does not send a newline to the output stream afterwards. */ void NexusToken::Write(std::ostream& out ) { out << token; } /** * @method Writeln [void:public] * @param out [ostream&] the output stream to which to write token std::string * * Simply outputs the current std::string stored in token to the output * stream out. Sends a newline to the output stream afterwards. */ void NexusToken::Writeln(std::ostream& out ) { out << token << std::endl; } /** * @method OutputComment [virtual void:public] * @param msg [std::string&] the output comment to be displayed * * This function is called whenever an output comment (i.e., a comment * beginning with an exclamation point) is found in the data file. * This version of OutputComment does nothing; override this virtual * function to display the output comment in the most appropriate way * for the platform you are supporting. */ void NexusToken::OutputComment( std::string& /* msg */ ) { } //MTH bool NexusToken::IsConsistentWithFirstPartOfSciNotation() { assert(token.size()); int s; s=token.size(); s--; if (token[s]!='E' && token[s]!='e') return false; int n=0; if (token[n] == '-') n++; bool decipt=false; while (n < s) {if(token[n]<'0' || token[n]>'9') {if(token[n]!='.' || decipt) return false; decipt=true; } n++; } return true; }