#include "phycas/phycas.h"
#if !defined (ALL_BIND_USING_CPPS_IN_ONE_FILE) || defined(INCLUDED_FROM_OTHER_BIND_USING_FILE)
// This file inclusion avoids a bizarre anonymous namespace multiple definition link error that TL is getting on Mac 10.3.9 (gcc 3.3)
//#warning using macro to include nxs_command_output.cpp TEMPORARY HACK!
//#include "ncl/command/nxs_command_output.cpp"
#include "ncl/nxs_token.hpp"
#include "ncl/output/nxs_output.hpp"
#include "ncl/nxs_defs.hpp"
#include "ncl/characters/nxs_characters_block.hpp"
#include "ncl/characters/nxs_characters_manager.hpp"
#include "ncl/taxa/nxs_taxa_manager.hpp"
#include "ncl/nxs_exception.hpp"
#include "ncl/nxs_token.hpp"
#include "ncl/command/nxs_cmd_param.hpp" // needed because we have to check if commands have been read in HandleFormat
#include "ncl/misc/nxs_data_type_inl.hpp"
#include "ncl/output/nxs_output_stream.hpp"
using std::map;
using std::vector;
using std::string;
using std::set;
using std::pair;
using ncl::DiscreteMatrixShPtr;
using ncl::NxsDiscreteMatrix;

// NOTE(review): the template arguments of several declarations in this file
// (vector, set, pair, map) appear to have been lost in a text-extraction step.
// They are restored below as vector<unsigned> and set<string> where the usage in
// this file implies the element type -- confirm against
// ncl/characters/nxs_characters_block.hpp. The map returned by
// NxsDataType::GetEquates() is left as found in Report() because its key and
// value types cannot be determined from this file.

/**
 * Writes a human-readable summary of this block to outS: the number of taxa
 * and characters, the FORMAT settings, character and state labels, equate
 * macros, eliminated characters and finally the data matrix itself (via
 * DebugShowMatrix).
 *
 * @param outS stream that receives the report
 */
void NxsCharactersBlock::Report(
    NxsOutputStream& outS) const
{
    outS << '\n' << GetID() << " block contains ";
    if (GetNumTaxaWithData() == 0)
        outS << "no taxa";
    else if (GetNumTaxaWithData() == 1)
        outS << "one taxon";
    else
        outS << GetNumTaxaWithData() << " taxa";
    outS << " and ";
    string s;
    AppendNumberThenWord(s, GetTotalNumCharacters(), "character");
    outS << s << "\n Data type is \"";
    switch (formatSettings.dataTypeIndex) {
        case NxsDataType::kDNA:
            outS << "DNA\"\n";
            break;
        case NxsDataType::kRNA:
            outS << "RNA\"\n";
            break;
        case NxsDataType::kNucleotide:
            outS << "nucleotide\"\n";
            break;
        case NxsDataType::kProtein:
            outS << "protein\"\n";
            break;
        case NxsDataType::kContinuous:
            outS << "continuous\"\n";
            break;
        default:
            outS << "standard\"\n";
    }

    if (formatSettings.respectingCase)
        outS << " Respecting case\n";
    else
        outS << " Ignoring case\n";

    if (formatSettings.tokens)
        outS << " Multicharacter tokens allowed in data matrix\n";
    else
        outS << " Data matrix entries are expected to be single symbols\n";

    if (formatSettings.labels && formatSettings.transposing)
        outS << " Character labels are expected on left side of matrix\n";
    else if (formatSettings.labels && !formatSettings.transposing)
        outS << " Taxon labels are expected on left side of matrix\n";
    else
        outS << " No labels are expected on left side of matrix\n";

    if (!charLabels.empty()) {
        outS << " Character and character state labels:\n";
        for (unsigned k = 0; k < GetTotalNumCharacters(); ++k) {
            const string charKLabel = GetLabel(k);
            outS << '\t' << (1 + k) << '\t' << charKLabel << "\n";

            // output state labels if any are defined for this character
            StateLabelMap::const_iterator cib = charStates.find(k);
            if (cib != charStates.end()) {
                unsigned ns = (unsigned) (*cib).second.size();
                // The loop must advance m (the state index). It used to
                // advance k, which never terminated on m and corrupted the
                // outer character loop.
                for (unsigned m = 0; m < ns; ++m)
                    outS << "\t\t" << (*cib).second[m] << "\n";
            }
        }
    }

    if (formatSettings.transposing && formatSettings.interleaving)
        outS << " Matrix transposed and interleaved\n";
    else if (formatSettings.transposing && !formatSettings.interleaving)
        outS << " Matrix transposed but not interleaved\n";
    else if (!formatSettings.transposing && formatSettings.interleaving)
        outS << " Matrix interleaved but not transposed\n";
    else
        outS << " Matrix neither transposed nor interleaved\n";

    outS << " Missing data symbol is '" << formatSettings.missingSymbol << "\'\n";

    if (formatSettings.matchSymbol != '\0')
        outS << " Match character is '" << formatSettings.matchSymbol << "\'\n";
    else
        outS << " No match character specified\n";

    if (formatSettings.gapSymbol != '\0')
        outS << " Gap character specified is '" << formatSettings.gapSymbol << "\'\n";
    else
        outS << " No gap character specified\n";

    const char *symbolsList = dataType.GetSymbols();
    outS << " Valid symbols are: " << symbolsList << "\n";

    // NOTE(review): template arguments of this map were lost in extraction and
    // cannot be inferred here -- restore them from NxsDataType::GetEquates().
    const map eqMap = dataType.GetEquates();
    if (!eqMap.empty()) {
        outS << " Equate macros in effect:\n";
        for (map::const_iterator i = eqMap.begin(); i != eqMap.end(); ++i)
            outS << " " << (*i).first << " = " << (*i).second << '\n';
    }
    else
        outS << " No equate macros have been defined\n";

    if (nEliminated == 0)
        outS << " No characters were eliminated\n";
    else {
        outS << " The following characters were eliminated:\n";
        NxsIndexSet::const_iterator k(eliminated.begin());
        for (; k != eliminated.end(); ++k)
            outS << " " << ((*k)+1) << "\n";
    }

    outS << " Data matrix:" << "\n";
    DebugShowMatrix(outS, false, " ");
}

/**
 * @method DebugShowMatrix [void:protected]
 * @param outS [NxsOutputStream&] output stream on which to print matrix
 * @param use_matchchar [bool] if true, the first taxon with data is passed to
 *        ShowStateLabels as the reference row for match characters; otherwise
 *        states are shown for all taxa
 * @param marginText [const char*] string to print first on each line; may be NULL,
 *        in which case nothing is printed before the line
 *
 * Provides a dump of the contents of the matrix variable. Useful for testing
 * whether data is being read as expected. Taxa that have a row in the matrix
 * are written first (label, padding, then one entry per stored character),
 * followed by one "No data for ..." line per taxon without a row.
 */
void NxsCharactersBlock::DebugShowMatrix(
    NxsOutputStream& outS,
    bool use_matchchar,
    const char* marginText) const
{
    const unsigned width = NxsAlternativeTaxaBlock::taxLabels.GetMaxLabelLength();
    unsigned first_taxon = UINT_MAX;
    const unsigned nCharTotal = GetTotalNumCharacters();

    for (unsigned j = 0; j < GetNumTaxa(); ++j) {
        unsigned taxPos = GetTaxPos(j);
        if (taxPos != UINT_MAX) {
            if (marginText != NULL)
                outS << marginText;
            const string &currTaxonLabel = NxsAlternativeTaxaBlock::taxLabels.GetLabel(j);
            outS << currTaxonLabel;

            // print out enough spaces to even up the left edge of the matrix output
            unsigned currTaxonLabelLen = (unsigned) currTaxonLabel.size();
            unsigned diff = width - currTaxonLabelLen;
            for (unsigned k = 0; k < diff + 5; ++k)
                outS << ' ';

            if (first_taxon == UINT_MAX)
                first_taxon = taxPos;

            for (unsigned currChar = 0; currChar < nCharTotal; ++currChar) {
                // UINT_MAX means there is no stored column for this character
                unsigned k = charIndexAdjuster.GetElementIndexFromLocalIndex(currChar);
                if (k != UINT_MAX)
                    ShowStateLabels(outS, taxPos, k, (use_matchchar ? first_taxon : UINT_MAX));
            }
            outS << '\n';
        }
    }

    for (unsigned j = 0; j < GetNumTaxa(); ++j) {
        if (GetTaxPos(j) == UINT_MAX) {
            // Same NULL guard as in the loop above: marginText is allowed to
            // be NULL and must not be streamed in that case.
            if (marginText != NULL)
                outS << marginText;
            outS << "No data for " << NxsAlternativeTaxaBlock::taxLabels.GetLabel(j) << '\n';
        }
    }
    outS << ncl::endl;
}

/**
 * @method ShowStateLabels [void:protected]
 * @param outS [NxsOutputStream&] the output stream on which to write
 * @param rowIndex [unsigned] the row (taxon) in the data matrix
 * @param colIndex [unsigned] the column (character) in the data matrix
 * @param first_taxon [unsigned] the row of the first taxon (if UINT_MAX, don't use matchchar)
 *
 * Looks up the state(s) at (rowIndex, colIndex) of the matrix and writes it
 * (or them) to outS. Nothing is written if either index is UINT_MAX. If there
 * is uncertainty or polymorphism, the list of states is surrounded by the
 * appropriate set of symbols (i.e., parentheses for polymorphism, curly
 * brackets for uncertainty). If 'tokens' is in effect, the output takes the
 * form of the defined state labels; otherwise, the NEXUS string for the state
 * is obtained from dataType.
 */
void NxsCharactersBlock::ShowStateLabels(
    NxsOutputStream& outS,
    unsigned rowIndex,
    unsigned colIndex,
    unsigned first_taxon) const
{
    if (rowIndex == UINT_MAX || colIndex == UINT_MAX)
        return;

    if (dataMatrix->IsMissing(rowIndex, colIndex)) {
        outS << formatSettings.missingSymbol;
        return;
    }
    else if (dataMatrix->IsGap(rowIndex, colIndex)) {
        outS << formatSettings.gapSymbol;
        return;
    }

    const DataStorageType *stateCode = dataMatrix->GetState(rowIndex, colIndex);
    if (formatSettings.tokens) {
        unsigned n = dataType.CountStates(stateCode);
        if (n == 1) {
            bool use_matchchar = false;
            if (first_taxon != UINT_MAX && rowIndex != first_taxon) {
                const DataStorageType * firsts = dataMatrix->GetState(first_taxon, colIndex);
                if (dataType.StatesComp(firsts, stateCode) == 0)
                    use_matchchar = true;
            }

            if (use_matchchar)
                outS << formatSettings.matchSymbol;
            else {
                StateLabelMap::const_iterator ci = charStates.find(colIndex);
                // OPEN ISSUE: need to eliminate state labels for characters that have
                // been eliminated
                bool printed = false;
                if (ci != charStates.end()) {
                    // element type inferred from use as an index -- see NOTE(review) at top of file
                    vector<unsigned> ords = dataType.GetOrdinationsOfStates(stateCode);
                    if (ci->second[ords[0]] != " ") {
                        printed = true;
                        outS << " " << ci->second[ords[0]];
                    }
                }
                if (!printed)
                    outS << " " << dataType.GetStateChar(stateCode) << "[<-no label found]";
            }
        }
        else {
            // element type inferred from use as an index -- see NOTE(review) at top of file
            vector<unsigned> ords = dataType.GetOrdinationsOfStates(stateCode);
            //TODO: handle matchchar possibility here too
            //
            if (dataMatrix->IsPolymorphic(rowIndex, colIndex))
                outS << " (";
            else
                outS << " {";

            for (vector<unsigned>::iterator s = ords.begin(); s != ords.end(); ++s) {
                StateLabelMap::const_iterator ci = charStates.find(colIndex);
                if (ci == charStates.end() || (*ci).second[*s] == " ")
                    outS << " " << dataType.GetStateCharFromIndex(*s) << "[<-no label found]";
                else // show label at index number s in LabelList at ci
                    outS << " " << (*ci).second[*s];
            }

            if (dataMatrix->IsPolymorphic(rowIndex, colIndex))
                outS << ')';
            else
                outS << '}';
        }
    }
    else {
        if (first_taxon != UINT_MAX && rowIndex > first_taxon && (dataType.StatesComp(stateCode, dataMatrix->GetState(first_taxon, colIndex)) == 0))
            outS << '.';
        else
            outS << dataType.GetStatesAsNexusString(stateCode, dataMatrix->IsPolymorphic(rowIndex,colIndex), false);
    }
}

/**
 * Called when the end of the block is reached: tells the taxa interface that
 * taxon manipulation is finished and hands this block to the characters
 * manager. Returns whatever NewBlockRead returns.
 */
CmdResult NxsCharactersBlock::EndEncountered()
{
    NxsAlternativeTaxaBlock::FinishedManipulatingTaxa();
    return charactersMgr.NewBlockRead(this);
}

/**
 * Replaces dataType with a freshly constructed NxsDataType for dataTypeIndex.
 * The second constructor argument is true only for the STANDARD datatype.
 */
void NxsCharactersBlock::ResetDataType(NxsDataType::NxsDatatypesEnum dataTypeIndex)
{
    dataType = NxsDataType(dataTypeIndex, dataTypeIndex == NxsDataType::kStandard);
    //std::cout <<" end of ResetDataType" << std::endl; //@POL 27-Oct-2005 looked like debugging code, so I commented it out
}

/**
 * Stores the settings parsed from a FORMAT command, validates the
 * inter-dependent options (MISSING / GAP / MATCHCHAR / SYMBOLS / TOKENS /
 * RESPECTCASE / DATATYPE) and configures dataType accordingly.
 *
 * @param s the settings read from the FORMAT command (copied into formatSettings)
 * @return kCmdSucceeded when every check passes
 * @throws NxsException on any illegal or contradictory setting, and always
 *         for DATATYPE=CONTINUOUS (not implemented)
 */
CmdResult NxsCharactersBlock::HandleFormat(NxsFormatCmdSettings *s)
{
    formatSettings = *s;
    if (formatSettings.dataTypeIndex == NxsDataType::kContinuous) {
        errorMsg = "Sorry, continuous character matrices have not yet been implemented";
        throw NxsException(errorMsg);
    }

    // check all of the difficult inter-dependencies in the format command
    //
    if (!IsLegalNexusChar(formatSettings.missingSymbol))
        ThrowIllegalNexusChar(formatSettings.missingSymbol, "MISSING");

    if (formatSettings.gapSymbol != '\0') {
        if (!IsLegalNexusChar(formatSettings.gapSymbol))
            ThrowIllegalNexusChar(formatSettings.gapSymbol, "GAP");
        if (formatSettings.gapSymbol == formatSettings.missingSymbol)
            throw NxsException("the GAP character and MISSING character cannot be identical");
        if (formatSettings.gapSymbol == formatSettings.matchSymbol)
            throw NxsException("the GAP character and MATCHCHAR cannot be identical");
    }

    if (formatSettings.matchSymbol != '\0') {
        if (!IsLegalNexusChar(formatSettings.matchSymbol))
            ThrowIllegalNexusChar(formatSettings.matchSymbol, "MATCHCHAR");
        if (formatSettings.matchSymbol == formatSettings.missingSymbol)
            throw NxsException("the missing character and MATCHCHAR character cannot be identical");
    }

    bool allowRC = false;
    if (formatSettings.dataTypeIndex == NxsDataType::kContinuous) {
        // Currently unreachable: CONTINUOUS is rejected at the top of this
        // function. Kept so the checks are in place once it is implemented.
        if (formatSettings.rawSymbols.length() > 0)
            throw NxsException("the SYMBOLS keyword is not allowed if the DATATYPE is CONTINUOUS");
        if (!statesFormatCI.get()->HasBeenRead()) {
            // Individuals is the default for the states format command if datatype = continuous
            //
            formatSettings.stateFormIndex = 1; // depends on "STATESPRESENT|INDIVIDUALS|COUNT|FREQUENCY" order of choices
            formatSettings.stateFormName = "INDIVIDUALS";
        }
        if (!formatSettings.tokens) {
            if (tokensCI.get()->HasBeenRead())
                throw NxsException("TOKENS must be used when the DATATYPE is CONTINUOUS");
            // TOKENS was not given explicitly, so switch it on. (This used to
            // assign false, a no-op that contradicted the error text above.)
            formatSettings.tokens = true;
        }
    }
    else if (formatSettings.dataTypeIndex == NxsDataType::kDNA || formatSettings.dataTypeIndex == NxsDataType::kRNA || formatSettings.dataTypeIndex == NxsDataType::kNucleotide) {
        if (formatSettings.tokens)
            throw NxsException("TOKENS cannot be used with NUCLEOTIDE data");
    }
    else
        allowRC = true;

    if (!allowRC && formatSettings.respectingCase)
        throw NxsException("the RESPECTCASE option can only be used with the STANDARD DATATYPE");

    // get the symbols list ready for reading in the matrix
    //
    ResetDataType((NxsDataType::NxsDatatypesEnum) formatSettings.dataTypeIndex);
    dataType.SetGapChar('\0');
    dataType.SetMatchChar('\0');
    dataType.SetMissingChar('\0');
    if (formatSettings.rawSymbols.length() > 0) {
        if (formatSettings.dataTypeIndex == NxsDataType::kStandard)
            dataType.ReplaceSymbols(formatSettings.rawSymbols, formatSettings.respectingCase);
        else
            dataType.AddSymbols(formatSettings.rawSymbols, formatSettings.respectingCase);
    }

    // missing, match and gap characters can't be in the symbols list.
    //
    const string fullSymbolsList = string(dataType.GetSymbols());
    string errCharName;
    char errorChar = '\0';
    if (fullSymbolsList.find(formatSettings.gapSymbol) != string::npos) {
        errCharName = "GAP";
        errorChar = formatSettings.gapSymbol;
    }
    else if (fullSymbolsList.find(formatSettings.matchSymbol) != string::npos) {
        errCharName = "MATCH";
        errorChar = formatSettings.matchSymbol;
    }
    else if (fullSymbolsList.find(formatSettings.missingSymbol) != string::npos) {
        errCharName = "MISSING";
        errorChar = formatSettings.missingSymbol;
    }
    if (!errCharName.empty()) {
        errorMsg << "The " << errCharName << " character(" << errorChar << ") cannot be identical to a state's symbol";
        throw NxsException(errorMsg);
    }

    dataType.SetGapChar(formatSettings.gapSymbol);
    dataType.SetMatchChar(formatSettings.matchSymbol);
    dataType.SetMissingChar(formatSettings.missingSymbol);
    if (formatSettings.rawEquate.length() > 0)
        dataType.AddEquates(formatSettings.rawEquate);
    return kCmdSucceeded;
}

/**
 * @method ReadTokenState [unsigned:protected]
 * @param token [NxsToken&] the token whose current text names the state
 * @param origInd [unsigned] the character index used to look up state labels in charStates
 * @throws NxsException if no states are defined for the character or the
 *         token does not match any of them
 *
 * Called from ReadNextStateToken to interpret the current token when 'tokens'
 * is in effect. Looks up the token in the state labels stored for the
 * character (case-insensitively unless RESPECTCASE is set) and returns the
 * position of the matching label (0, 1, 2, ...).
 * Note: does NOT store anything in the matrix; the caller does that.
 */
unsigned NxsCharactersBlock::ReadTokenState(
    NxsToken &token,
    unsigned origInd) const
{
    // token should be one of the character states listed for character origInd
    // in charStates
    //
    StateLabelMap::const_iterator bagIter = charStates.find(origInd);
    if (bagIter == charStates.end()) {
        errorMsg << "No states were defined for character " << (1 + origInd);
        throw NxsException(errorMsg, token);
    }
    const VecString &statesVec(bagIter->second);

    // TO DO: this section is very UGLY - need to find some cleaner way of comparing
    // the token string to the strings representing valid characters states
    // in the LabelList associated with character j
    //
    VecString::const_iterator cit;
    if (formatSettings.respectingCase)
        cit = find(statesVec.begin(), statesVec.end(), token.GetTokenReference());
    else {
        NStrCaseInsensitiveEquals compF(token.GetTokenReference());
        cit = find_if(statesVec.begin(), statesVec.end(), compF);
    }

    if (cit == statesVec.end()) {
        errorMsg << "Character state " << token.GetTokenReference() << " not defined for character " << (1 + origInd);
        throw NxsException(errorMsg, token);
    }

    // ok, the state has been identified, so return the state's internal
    // representation. That is, if the list of state labels was
    // "small medium large" and "small" was specified in the data file,
    // state saved in matrix would be 0 (it would be 1 if "medium" were
    // specified in the data file, and 2 if "large" were specified in the
    // data file).
    return (unsigned) distance(statesVec.begin(), cit);
}

/**
 * Reads the next matrix entry when TOKENS is in effect and stores it at
 * (i, j) of dataMatrix. The entry is either a single state label or a list of
 * labels (optionally containing '~' ranges) enclosed in "( )" for
 * polymorphism or "{ }" for uncertainty.
 *
 * @param token   token reader; advanced past the entry
 * @param i       row in dataMatrix
 * @param j       column in dataMatrix
 * @param origInd character index used to look up state labels
 * @return always true
 * @throws NxsException on an unknown state label or a malformed range
 */
bool NxsCharactersBlock::ReadNextStateToken(
    NxsToken& token,
    unsigned i,
    unsigned j,
    unsigned origInd)
{
    assert(formatSettings.tokens);
    ++token;

    // handle case in which TOKENS was specified in the FORMAT command
    // token should be in one of the following forms: "{" "a" "bb"
    //
    bool polymorphism = (token.GetTokenReference() == '(');
    bool uncertainty = (token.GetTokenReference() == '{');

    if (!uncertainty && !polymorphism) {
        unsigned k = ReadTokenState(token, origInd);
        dataMatrix->SetStateIndex(i, j, k);
    }
    else {
        bool prevTokenWasTilde = false;
        unsigned first = UINT_MAX;  // start of a pending '~' range; UINT_MAX if none
        unsigned last;
        dataMatrix->ZeroState(i,j);
        for (;;) {
            // OPEN ISSUE: What about newlines if interleaving? I'm assuming
            // that the newline must come between characters to count.
            ++token;
            if (polymorphism && token.GetTokenReference() == ')') {
                if (prevTokenWasTilde) {
                    errorMsg = "Range of states still being specified when ')' encountered";
                    throw NxsException(errorMsg, token);
                }
                break;
            }
            else if (uncertainty && token.GetTokenReference() == '}') {
                if (prevTokenWasTilde) {
                    errorMsg = "Range of states still being specified when '}' encountered";
                    throw NxsException(errorMsg, token);
                }
                break;
            }
            else if (token.GetTokenReference() == '~') {
                if (first == UINT_MAX) {
                    errorMsg = "Tilde character ('~') cannot precede token indicating beginning of range";
                    throw NxsException(errorMsg, token);
                }
                prevTokenWasTilde = true;
            }
            else if (prevTokenWasTilde) {
                // Add all states from first+1 to last, then reset prevTokenWasTilde to 0
                //
                last = ReadTokenState(token, origInd);
                if (last <= first) {
                    errorMsg << "Last state in specified range (" << token.GetTokenReference() << ") must be greater than the first";
                    throw NxsException(errorMsg, token);
                }
                for (unsigned k = first+1; k <= last; ++k)
                    dataMatrix->AddStateIndex(i, j, k);
                prevTokenWasTilde = false;
                first = UINT_MAX;
            }
            else {
                // Add current state, then set first to that state's value
                // State's value is its position within the list of states
                // for that character
                //
                first = ReadTokenState(token, origInd);
                dataMatrix->AddStateIndex(i, j, first);
            }
        }
        if (polymorphism)
            dataMatrix->SetPolymorphic(i, j);
    }
    return true;
}

/**
 * Reads a TRANSPOSEd matrix (one character per line, taxa across), possibly
 * interleaved, up to and including the terminating ';'.
 *
 * @param token token reader positioned just before the matrix body
 * @return true on success
 * @throws NxsException on duplicate or unknown character labels, inconsistent
 *         interleave pages, or a missing ';'
 */
bool NxsCharactersBlock::ReadTransposedMatrix(NxsToken& token)
{
    const unsigned nCharTotal = GetTotalNumCharacters();
    const unsigned totalNTax = GetNumTaxa();
    const bool readLabels = formatSettings.labels;
    const bool interleaving = formatSettings.interleaving;
    const bool tokens = formatSettings.tokens;
    unsigned currTaxon = UINT_MAX;
    unsigned firstTaxonInPage = 0;
    unsigned lastTaxonInPage = totalNTax;
    bool newCharLabels = (needCharLabels && charLabels.empty());
    // element type inferred from the assignment to an unsigned below -- see NOTE(review) at top of file
    vector<unsigned> indToCharPosMap;
    charIndexAdjuster.BuildOldToNewIndexMap(&indToCharPosMap, nCharTotal);

    // if currTaxon equals nTaxInMatrix, then we've just finished reading the last
    // interleave page and thus should break from the outer loop
    // Note that if we are not interleaving, this will still work since
    // lastTaxon is initialized to ntaxTotal and never changed
    //
    for (unsigned page = 0; currTaxon != totalNTax; ++page) {
        for (unsigned currChar = 0; currChar < nCharTotal; ++currChar) {
            if (readLabels) {
                // this should be the character label
                //
                ++token;
                unsigned charInd = FindIndexFromCharLabels(token.GetTokenReference());
                bool isANumber;
                if (page == 0 && newCharLabels) {
                    if (charInd != UINT_MAX) {
                        errorMsg << "Data for this character (" << token.GetTokenReference() << ") has already been saved";
                        throw NxsException(errorMsg, token);
                    }

                    // Labels provided, need to add them to charLabels list.
                    // saving character labels even for characters that have been eliminated.
                    //
                    if (!IsALegalNexusLabelForObjectN(token.GetTokenReference(), currChar, &isANumber)) {
                        errorMsg << token.GetTokenReference() << " is an illegal " << GetCharLabelsCmdName() << " for " << GetDatumName() << " number " << (currChar+1) << '.';
                        if (isANumber)
                            errorMsg << " If a number is used as a" << GetCharLabelsCmdName() << ", it must identical to the number of that " << GetDatumName();
                        throw NxsException(errorMsg, token);
                    }
                    if (!isANumber)
                        charLabels[currChar] = token.GetTokenReference();
                }
                else {
                    // either not first interleaved page or character labels previously defined
                    //
                    if (charInd == UINT_MAX) {
                        if (!IsALegalNexusLabelForObjectN(token.GetTokenReference(), currChar, &isANumber)) {
                            errorMsg << token.GetTokenReference() << " is an illegal " << GetCharLabelsCmdName() << " for " << GetDatumName() << " number " << (currChar+1) << '.';
                            throw NxsException(errorMsg, token);
                        }
                        if (!isANumber) {
                            errorMsg << "Could not find character named " << token.GetTokenReference() << " among stored character labels";
                            throw NxsException(errorMsg, token);
                        }
                        charInd = currChar;
                    }
                    else if (charInd != currChar) {
                        // make sure user has not duplicated data for a single character or
                        // changed the order in which characters appear in different interleave
                        // pages
                        //
                        if (page == 0)
                            errorMsg << "Data for this character (" << token.GetTokenReference() << ") has already been saved";
                        else
                            errorMsg = "Ordering of characters must be identical to that in first interleave page";
                        throw NxsException(errorMsg, token);
                    }
                }
            }

            //************************************************
            //******** Beginning of loop through taxa ********
            //************************************************
            if (interleaving)
                token.AlterTokenReading(NxsToken::kNewlineIsToken);
            try {
                // it is possible that character currChar has been ELIMINATEd, we need to keep track
                // of the positon in characters matrix by iterating posInStored
                //
                unsigned charPosInMatrix = indToCharPosMap[currChar];
                NxsDiscreteMatrix *bareDataMatrix = dataMatrix.get();
                for (currTaxon = firstTaxonInPage; currTaxon < lastTaxonInPage; ++currTaxon) {
                    // ok will be 0 only if a newline character is encountered before
                    // taxon i processed
                    //
                    bool ok ;
                    if (!tokens) {
                        token.ReadSingleCharacter();
                        assert(token.GetTokenLength() > 0);
                        if (token.GetTokenReference()[0] == '\n')
                            ok = false;
                        else {
                            ok = true;
                            dataType.ReadNextState(bareDataMatrix, token, currTaxon, charPosInMatrix);
                        }
                    }
                    else
                        ok = ReadNextStateToken(token, currTaxon, charPosInMatrix, currChar);

                    if (!ok) {
                        assert(interleaving);
                        if (lastTaxonInPage < totalNTax && currTaxon != lastTaxonInPage) {
                            errorMsg = "Each line within an interleave page must comprise the same number of taxa";
                            throw NxsException(errorMsg, token);
                        }
                        lastTaxonInPage = currTaxon;
                    }
                } // innermost loop (over taxa)
            }
            catch (...) {
                // restore normal newline handling before propagating the error
                token.AlterTokenReading(NxsToken::kNewlineIsNotToken);
                throw;
            }
            if (interleaving)
                token.AlterTokenReading(NxsToken::kNewlineIsNotToken);
        } // middle loop (over characters)
        firstTaxonInPage = lastTaxonInPage;
        lastTaxonInPage = totalNTax;
    } // outer loop (over interleave pages)

    ++token;
    token.ThrowIfNot(";");
    return true;
}

/**
 * Reads a non-transposed matrix (one taxon per line, characters across),
 * possibly interleaved, up to and including the terminating ';'. When labels
 * are being read this also records the row each taxon occupies in the matrix.
 *
 * @param token token reader positioned just before the matrix body
 * @return true on success (including the early return when a ';' is met before
 *         all taxa of the taxa block have been read on the first page)
 * @throws NxsException on duplicate, unknown or illegal taxon labels,
 *         inconsistent interleave pages, or a missing ';'
 */
bool NxsCharactersBlock::ReadStdMatrix(
    NxsToken& token)
{
    const unsigned nCharTotal = GetTotalNumCharacters();
    unsigned maxNTax = NxsAlternativeTaxaBlock::GetNumTaxa();
    const bool readLabels = formatSettings.labels;
    const bool interleaving = formatSettings.interleaving;
    const bool tokens = formatSettings.tokens;
    bool labelIsNumber;
    unsigned currChar = 0;
    unsigned firstCharInCurrPage = 0;
    bool needNextTaxonName = true; // used in case we accidentally read a taxon name from the next interleaved page (will only happen when interleaving, without newtaxa and when not all of the taxa from the taxa block are present)
    // element type inferred from use as a matrix column index -- see NOTE(review) at top of file
    vector<unsigned> indToCharPosMap;
    charIndexAdjuster.BuildOldToNewIndexMap(&indToCharPosMap, nCharTotal);
    vector<unsigned>::const_iterator thisPagesFirstPosInMatrix = indToCharPosMap.begin();
    vector<unsigned>::const_iterator currPosInMatrix = indToCharPosMap.begin();

    // if currChar equals ncharTotal, then we've just finished reading the last
    // interleave page and thus should break from the outer loop
    // Note that if we are not interleaving, this will still work since
    // lastCharInCurrPage is initialized to ncharTotal and never changed
    // so currChar == nCharTotal when it exits the loop over taxa
    //
    for (unsigned page = 0; currChar != nCharTotal; ++page) {
        unsigned lastCharInCurrPage = nCharTotal;
        for (unsigned rowIndex = 0; rowIndex < maxNTax; ++rowIndex) {
            unsigned positionInTaxaBlock = rowIndex;
            if (readLabels) {
                if (needNextTaxonName)
                    ++token; // This should be the taxon label
                if (token.GetTokenReference() == ';' && lastCharInCurrPage == nCharTotal && !IsAddingNewTaxa() && page == 0) {
                    // we hit a semicolon before readin all of the taxa from the taxa block. This is tolerated as long as the user didn't specify ntax
                    // and we aren't partially through reading the data for the taxa that are included.
                    //
                    SetNewNumTaxaAfterReadingData(rowIndex);
                    return true;
                }
                needNextTaxonName = true;
                if (page == 0 && IsAddingNewTaxa()) {
                    // the label supplied should be unique (or a number identical to the taxon number)
                    //
                    unsigned prevInd = GetTaxPosDontThrow(token.GetTokenReference());
                    if (prevInd != UINT_MAX && prevInd != rowIndex) {
                        errorMsg << "Data for this taxon (" << token.GetTokenReference() << ") has already been saved";
                        throw NxsException(errorMsg, token);
                    }

                    // replacing the default name (#) with a name
                    //
                    if (!IsALegalNexusLabelForObjectN(token.GetTokenReference(), rowIndex, &labelIsNumber)) {
                        errorMsg << token.GetTokenReference() << " is not a legal taxon name.";
                        if (labelIsNumber)
                            errorMsg << " If a number is used as a taxon label, it must identical to the number of that taxon in the matrix.";
                        throw NxsException(errorMsg, token);
                    }
                    SetTaxonLabel(rowIndex, token.GetTokenReference());
                }
                else {
                    // Cannot assume taxon in same position in
                    // taxa block. Set up taxonPos array so that we can look up
                    // the correct row in matrix for any given taxon
                    //
                    positionInTaxaBlock = FindExternalIndexForTaxon(token.GetTokenReference());
                    if (positionInTaxaBlock == UINT_MAX) {
                        if (token.GetTokenReference() == ';' && rowIndex == 0)
                            errorMsg << "Unexpected ; (after only " << currChar << " characters were read)";
                        else
                            errorMsg << "Could not find taxon named " << token.GetTokenReference() << " among stored taxon labels";
                        throw NxsException(errorMsg, token);
                    }

                    if (page == 0) {
                        if (GetTaxPos(positionInTaxaBlock) != UINT_MAX) {
                            if (interleaving && GetTaxPos(positionInTaxaBlock) == 0) {
                                // we've repeated a taxon name without reaching the rowIndex == maxNTax.
                                // This is only allowed if we are interleaving (and he user didn't specify ntax)
                                //
                                needNextTaxonName = false; //flags the fact that we've already read the taxon name
                                maxNTax = rowIndex;
                                SetNewNumTaxaAfterReadingData(rowIndex);
                                break; // breaks to the loop over interleave pages
                            }
                            else {
                                errorMsg << "Data for this taxon (" << token.GetTokenReference() << ") has already been saved";
                                throw NxsException(errorMsg, token);
                            }
                        }
                    }
                    else {
                        if (GetTaxPos(positionInTaxaBlock) != rowIndex) {
                            errorMsg = "Ordering of taxa must be identical to that in first interleave page";
                            throw NxsException(errorMsg, token);
                        }
                    }
                }
                if (page == 0)
                    SetTaxPosInMatrix(positionInTaxaBlock, rowIndex);
            }

            //******************************************************
            //******** Beginning of loop through characters ********
            //******************************************************
            if (interleaving)
                token.AlterTokenReading(NxsToken::kNewlineIsToken);
            try {
                NxsDiscreteMatrix *bareDataMatrix = dataMatrix.get();
                currPosInMatrix = thisPagesFirstPosInMatrix;
                for (currChar = firstCharInCurrPage; currChar < lastCharInCurrPage; ++currChar, ++currPosInMatrix) {
                    // Because some characters might be eliminated, posInStored does not
                    // necessarily equal currChar
                    //
                    bool ok;
                    if (!tokens) {
                        token.ReadSingleCharacter();
                        assert(token.GetTokenLength() > 0);
                        if (token.GetTokenReference()[0] != '\n') {
                            ok = true;
                            dataType.ReadNextState(bareDataMatrix, token, rowIndex, *currPosInMatrix);
                        }
                        else
                            ok = false;
                    }
                    else
                        ok = ReadNextStateToken(token, rowIndex, *currPosInMatrix, currChar);

                    if (!ok) {
                        // ok will be false only if a newline character is encountered
                        //
                        assert(interleaving);
                        if (lastCharInCurrPage < nCharTotal && currChar != lastCharInCurrPage) {
                            // lastCharInCurrPage == nCharTotal for the first taxon in a page.
                            //
                            errorMsg = "Each line within an interleave page must comprise the same number of characters";
                            throw NxsException(errorMsg, token);
                        }
                        lastCharInCurrPage = currChar;
                    }
                } // innermost loop (over characters)
            }
            catch (...) {
                // restore normal newline handling before propagating the error
                token.AlterTokenReading(NxsToken::kNewlineIsNotToken);
                throw;
            }
            if (interleaving)
                token.AlterTokenReading(NxsToken::kNewlineIsNotToken);
        } // middle loop (over taxa)

        // to read the next page, advance charPosIt and firstCharInCurrPage
        //
        thisPagesFirstPosInMatrix = currPosInMatrix;
        firstCharInCurrPage = lastCharInCurrPage;
    } // outer loop (over interleave pages)

    ++token;
    token.ThrowIfNot(";");
    return true;
}

/**
 * Handles the MATRIX command: allocates dataMatrix, reads it in standard or
 * transposed layout according to the FORMAT settings, and removes the rows of
 * taxa for which no data was read.
 *
 * @param token token reader positioned at the MATRIX command
 * @throws NxsException if no taxa are known yet, or on any error raised while
 *         reading the matrix
 */
bool NxsCharactersBlock::ParseMatrix(
    NxsToken & token)
{
    StartingCommandThatUsesTaxa("Matrix");
    unsigned expectedNTax = GetNumTaxa();
    if (expectedNTax == 0) {
        errorMsg << "Must precede " << GetID() << " block with a TAXA block or specify NEWTAXA and NTAX in the DIMENSIONS command";
        throw NxsException(errorMsg, token);
    }
    dataMatrix = DiscreteMatrixShPtr(new NxsDiscreteMatrix(expectedNTax, nChar, dataType));

    // if we're reading a non-transposed matrix with labels then we need to build the taxonPos array while reading the matrix
    // if not the ordering has to be identical to the taxablock ordering
    //
    BuildTaxonPosArray(!(formatSettings.labels && !(formatSettings.transposing)));

    bool matRead;
    if (formatSettings.transposing)
        matRead = ReadTransposedMatrix(token);
    else
        matRead = ReadStdMatrix(token);
    assert(matRead);

    // now the ordering of the taxa in the matrix is known, so we alert the AlternativeTaxaBlock interface
    // that we are done introducing new taxa, or deciding on the order of taxa
    //
    FinishedManipulatingTaxonPos();
    if (GetNumTaxaWithData() < expectedNTax) {
        // Remove the unused trailing rows, last row first. The index runs one
        // past the row being removed so that the comparison stays valid when
        // GetNumTaxaWithData() is 0; counting an unsigned row index down with
        // "i >= 0" would wrap around and never terminate.
        for (unsigned i = expectedNTax; i > GetNumTaxaWithData(); --i)
            dataMatrix->RemoveRow(i - 1);
        dataMatrix->TrimExcessCapacity();
    }
    // NOTE(review): this bool function returns the CmdResult constant
    // kCmdSucceeded while the other Parse* functions return true -- confirm
    // that the constant converts to true.
    return kCmdSucceeded;
}

/**
 * Handles the ELIMINATE command: records the eliminated character indices,
 * informs charIndexAdjuster and reduces nChar by the number eliminated.
 */
CmdResult NxsCharactersBlock::HandleEliminate(NxsEliminateCmdSettings *s)
{
    eliminated = s->toEliminate;
    nEliminated = eliminated.size();
    charIndexAdjuster.SetEliminated(eliminated, GetTotalNumCharacters());
    nChar -= nEliminated;
    return kCmdSucceeded;
}

/**
 * Validates the character label held by the current token, optionally stores
 * it in charLabels[index], and advances the token.
 *
 * @param token     token reader whose current text is the label
 * @param uniqNames labels seen so far in this command (capitalized unless
 *                  RESPECTCASE is set); the new label is added to it
 * @param index     character index the label belongs to
 * @param save      if false the label is checked but not stored
 * @throws NxsException if the label repeats an earlier one or is not a legal
 *         label for this index
 */
void NxsCharactersBlock::ReadCharLabel(
    NxsToken &token,
    set<string> &uniqNames,  // element type inferred from insert(string) -- see NOTE(review) at top of file
    unsigned index,
    bool save)
{
    if (token.GetTokenReference() != ' ') {
        string s(token.GetTokenReference());
        if (!formatSettings.respectingCase)
            Capitalize(s);
        pair< set<string>::iterator ,bool> ret = uniqNames.insert(s);
        if (!ret.second) {
            errorMsg << GetCharLabelsCmdName() << "s must be unique (" << s << " was repeated)";
            throw NxsException(errorMsg, token);
        }
    }

    bool isANumber;
    if (!IsALegalNexusLabelForObjectN(token.GetTokenReference(), index, &isANumber)) {
        errorMsg << token.GetTokenReference() << " is an illegal " << GetCharLabelsCmdName() << " for " << GetDatumName() << " number " << (index+1) << '.';
        if (isANumber)
            errorMsg << " If a number is used as a" << GetCharLabelsCmdName() << ", it must identical to the number of that " << GetDatumName();
        throw NxsException(errorMsg, token);
    }
    if (save && !isANumber )
        charLabels[index] = token.GetTokenReference();
    ++token;
}

/**
 * Handles the character-labels command: reads one label per character until
 * ';'. Labels of eliminated characters are validated but not stored.
 *
 * @throws NxsException if labels were already defined, if more labels than
 *         characters are supplied, or if a label is illegal or repeated
 */
bool NxsCharactersBlock::ParseCharLabels(
    NxsToken &token)
{
    if (NoAdvancedCommandsHaveBeenRead())
        StartingAdvancedCommand();
    if (!charLabels.empty()) {
        errorMsg << "The " << GetCharLabelsCmdName() << "s command cannot follow a " << GetCharStateLabelsCmdName() << "s command.";
        throw NxsException(errorMsg, token);
    }

    set<string> uniqNames;
    ++token;
    for (unsigned index = 0; token.GetTokenReference() != ';'; ++index) {
        if (index >= GetTotalNumCharacters()) {
            errorMsg << "The number of " << GetCharLabelsCmdName() << "s supplied cannot exceed " << GetTotalNumCharacters() << " (the " << GetNumCharsName() << " specified in the DIMENSIONS command)";
            throw NxsException(errorMsg, token);
        }
        bool save = !IsEliminated(index);
        ReadCharLabel(token, uniqNames, index, save);
    }
    return true;
}

/**
 * Reads the state labels of one character, up to the next ',' or ';', and
 * (if save is true) stores them in charStates[index].
 *
 * @param token            token reader positioned at the first state label
 * @param index            character index the labels belong to
 * @param nStatesInSymbols maximum number of labels allowed
 * @param save             if false the labels are checked but not stored
 * @return true if a ';' was encountered, false otherwise
 * @throws NxsException on too many or repeated state labels
 */
bool NxsCharactersBlock::ReadStateLabels(
    NxsToken &token,
    unsigned index,
    unsigned nStatesInSymbols,
    bool save)
{
    LabelList stList;
    set<string> stSet;  // element type inferred from insert(string) -- see NOTE(review) at top of file
    bool semiColon = false;
    const bool respectCase = formatSettings.respectingCase;

    for (unsigned x = 1; token.GetTokenReference() != ','; ++x) {
        if (x > nStatesInSymbols) {
            errorMsg << "There only " << nStatesInSymbols << " states according to the FORMAT command, but at least " << x << " state labels were supplied for " << GetDatumName() << ' ' << (index+1) << " in the " << GetCharStateLabelsCmdName() << " command.";
            throw NxsException(errorMsg, token);
        }
        stList.push_back(token.GetTokenReference());
        if (token.GetTokenReference() != ' ') {
            string s(token.GetTokenReference());
            if (!respectCase)
                Capitalize(s);
            pair< set<string>::iterator ,bool> ret = stSet.insert(s);
            if (!ret.second) {
                errorMsg << "The " << GetStateLabelsCmdName() << " for a " << GetDatumName() << " must be unique (" << s << " was repeated).";
                throw NxsException(errorMsg, token);
            }
        }
        ++token;
        if (token.GetTokenReference() == ';') {
            semiColon = true;
            break;
        }
    }

    if (save)
        charStates[index] = stList;
    ++token;
    return semiColon;
}

/**
 * Handles the state-labels command: repeatedly reads a character number
 * followed by its state labels until ';'. Clears needCharLabels on success.
 *
 * @throws NxsException if state labels were already defined or any entry is invalid
 */
bool NxsCharactersBlock::ParseStateLabels(
    NxsToken &token)
{
    if (NoAdvancedCommandsHaveBeenRead())
        StartingAdvancedCommand();
    if (!charStates.empty()) {
        errorMsg << "The " << GetStateLabelsCmdName() << " command cannot follow a " << GetCharStateLabelsCmdName() << " command.";
        throw NxsException(errorMsg, token);
    }

    const unsigned nStatesInSymbols = dataType.GetNumStates();
    ++token;
    while (token.GetTokenReference() != ';') {
        const unsigned n = ReadCharacterIndex(token);
        if (ReadStateLabels(token, n, nStatesInSymbols, !IsEliminated(n)))
            break;
    }
    needCharLabels = false;
    return true;
}

/**
 * Interprets the current token as a 1-based character number, advances the
 * token and returns the corresponding 0-based index.
 *
 * @throws NxsException if the token is not an unsigned number or lies outside
 *         [1, GetTotalNumCharacters()]
 */
unsigned NxsCharactersBlock::ReadCharacterIndex(
    NxsToken &token)
{
    // token should be the character number; create a new association
    UInt u;
    if (!IsAnUnsigned(token.GetTokenReference(), &u)) {
        errorMsg << "Expecting a " << GetDatumName() << " number, but found " << token.GetTokenReference() << " in the " << GetCharStateLabelsCmdName() << "command.";
        throw NxsException(errorMsg, token);
    }
    if (u > GetTotalNumCharacters() || u < 1) {
        errorMsg << "Invalid " << GetDatumName() << " number (" << token.GetTokenReference() << ") found in " << GetCharStateLabelsCmdName() << " command (";
        // Pick the explanation that matches the failed bound. (The test used
        // to be "u < total", which attached each message to the wrong case.)
        if (u > GetTotalNumCharacters())
            errorMsg << "greater than the " << GetNumCharsName() << " specified in the DIMENSIONS command)";
        else
            errorMsg << GetDatumName() << " numbers must be positive)";
        throw NxsException(errorMsg, token);
    }
    ++token;
    return (u-1);
}

/**
 * Handles the combined character/state-labels command. Each entry is a
 * character number, an optional character label, and an optional '/' followed
 * by state labels; entries are separated by ',' and the command ends at ';'.
 *
 * @throws NxsException if labels were already defined or an entry is malformed
 */
bool NxsCharactersBlock::ParseCharStateLabels(
    NxsToken &token)
{
    if (NoAdvancedCommandsHaveBeenRead())
        StartingAdvancedCommand();
    if (!charStates.empty() || !charLabels.empty()) {
        errorMsg << "The " << GetCharStateLabelsCmdName() << " command cannot follow a " << GetCharLabelsCmdName() << " or a " << GetStateLabelsCmdName() << " command.";
        throw NxsException(errorMsg, token);
    }

    set<string> uniqNames;
    const unsigned nStatesInSymbols = dataType.GetNumStates();
    ++token;
    for (;;) {
        if (token.GetTokenReference() == ';')
            break;
        unsigned n = ReadCharacterIndex(token);
        bool save = !IsEliminated(n);
        if (token.GetTokenReference() != '/')
            ReadCharLabel(token, uniqNames, n, save);

        if (token.GetTokenReference() == '/') {
            ++token;
            // ReadStateLabels returns true when it consumed the closing ';'
            if (ReadStateLabels(token, n, nStatesInSymbols, save)) {
                needCharLabels = false;
                return true;
            }
        }
        else if (token.GetTokenReference() == ';') {
            needCharLabels = false;
            return true;
        }
        else if (token.GetTokenReference() == ',')
            ++token;
        else {
            errorMsg << "Expecting a comma or semicolon here, but found (" << token.GetTokenReference() << ") instead";
            throw NxsException(errorMsg, token);
        }
    }
    return true;
}

/**
 * Returns the block to its freshly constructed state: standard datatype, no
 * labels, no matrix, no eliminated characters, and taxa information reset.
 */
void NxsCharactersBlock::ResetCmdMgrNxsBlock()
{
    dataType = NxsDataType(NxsDataType::kStandard, false);
    charLabels.clear();
    charStates.clear();
    dataMatrix = DiscreteMatrixShPtr((NxsDiscreteMatrix *)NULL);
    nChar = 0;
    needCharLabels = true;
    NxsAlternativeTaxaBlock::ResetTaxaInfo();
    eliminated.clear();
    nEliminated = 0;
    charIndexAdjuster.clear();
}

/*----------------------------------------------------------------------------------------------------------------------
|	Returns the length of the longest stored character label, or the number of digits in the total number of
|	characters if that is longer.
*/
unsigned NxsCharactersBlock::GetMaxLabelLength() const
{
    string s;
    s << GetTotalNumCharacters();
    unsigned maxN = (unsigned)s.length();
    for (LabelMap::const_iterator labIt = charLabels.begin(); labIt != charLabels.end(); ++labIt) {
        if (labIt->second.length() > maxN)
            maxN = (unsigned)labIt->second.length();
    }
    return maxN;
}

/**
 * Completes handling of the DIMENSIONS command by taking the number of
 * characters from the settings' second dimension.
 */
CmdResult NxsCharactersBlock::FinishHandlingDimensions(NxsDimensionsSettings *s)
{
    nChar = s->secondDimension;
    assert(charLabels.empty() && charStates.empty());
    return kCmdSucceeded;
}

/**
 * Reports whether this object can read a block named s (compared
 * case-insensitively). "CHARACTERS" and "DATA" are accepted; the block ID and
 * the newTaxa dimension setting are set to match.
 */
bool NxsCharactersBlock::CanReadBlockType(
    const string &s)
{
    if (EqualsCaseInsensitive(s, "CHARACTERS")) {
        SetID("CHARACTERS");
        NxsAlternativeTaxaBlock::dimensionSettings->newTaxa = false;
        NxsAlternativeTaxaBlock::dimensionSettings->nTaxa = NxsAlternativeTaxaBlock::taxaMgr.GetSize();
    }
    else if (EqualsCaseInsensitive(s, "DATA")) {
        SetID("DATA");
        NxsAlternativeTaxaBlock::dimensionSettings->newTaxa = true;
    }
    else
        return false;
    return true;
}

/**
 * Returns the phrase used in messages to describe this block's advanced
 * commands: "a MATRIX or " followed by the char-state-labels command name.
 */
string NxsCharactersBlock::GetAdvancedCommandName() const
{
    string s;
    s << "a MATRIX or " << GetCharStateLabelsCmdName();
    return s;
}

NxsCharactersBlock::NxsCharactersBlock(NxsTaxaManager & inTaxaMgr, NxsCharactersManager & inCharMgr)
    :NxsCommandManagerBlock("CHARACTERS"),
    NxsAlternativeTaxaBlock(inTaxaMgr),
    dataMatrix(),
dataType(NxsDataType::kStandard, false), charactersMgr(inCharMgr) { ConstructorInitialization(); // call this virtual function which is overridden in AllelesBlock to avoid prematurely calling InitializeRecognizedCommands before AllelesBlock is constructed } void NxsCharactersBlock::ConstructorInitialization() { InitializeRecognizedCommands(); Reset(); } #endif