#!/usr/bin/python # Copyright (c) 2005 by Mark T. Holder, Florida State University. (see end of file) '''Quick and dirty sequence class written to help Gavin out with a script. \temp should be replaced with Bio.Seq (perhaps)''' import string, copy, sys class MthSeq: def findSubSequenceStartPoint(self, potentialSubSet): # TEMP hideous algorithm def matchFunc(x, s, p): c = s.seqList[x] d = p[x] inter = c&d return inter == c or inter == d # Note we may want "return inter != 0" here and dispense with all of the determining which is a subset and just look for an intersection at every site if potentialSubSet._isMatchingSubset(self, matchFunc): return 0 shorterCopy = copy.copy(self) maxOffSet = len(self.seqList) - len(potentialSubSet) if maxOffSet < 1: return None for i in xrange(1, maxOffSet + 1): shorterCopy.pop(0) if potentialSubSet._isMatchingSubset(shorterCopy, matchFunc): return i return None def isMoreResolvedSubset(self, potentialSuperSet): return self._isMatchingSubset(potentialSuperSet, lambda x, s , p : s.seqList[x] & p[x] == s.seqList[x]) def _isMatchingSubset(self, potentialSuperSet, matchFunc): ls = len(self.seqList) if ls > len(potentialSuperSet): if ls > len(potentialSuperSet) + 1 or self.seqList[ls-1] != self.stopSeqCode: return False ls -= 1 for i in range(ls): if not matchFunc(i, self, potentialSuperSet): #print >>sys.stderr, self.seqList[i], 'does not match', potentialSuperSet[i], 'in _isMatchingSubset()' return i == ls - 1 and potentialSuperSet[i] == self.stopSeqCode return True def isValidSymbol(self, n): return n <= self.missingCode def appendCode(self, n): if n > self.missingCode: s = '%s is not a valid numeric code for %s' % (n, self.__class__.__name__) raise ValueError self.seqList.append(n) def insertCode(self, i, n): if n > self.missingCode: s = '%s is not a valid numeric code for %s' % (n, self.__class__.__name__) raise ValueError, s self.seqList.insert(i, n) def removeGaps(self): gapN = self.symbolDict['-'] tmp = self.seqList self.seqList = [] for i in tmp: if i & gapN: if i != gapN: self.seqList.append(i - gapN) else: self.seqList.append(i) def pop(self, i): self.seqList.pop(i) def __delitem__(self, k): del self.seqList[k] def __str__(self): return ''.join([self.translateCode(i) for i in self.seqList]) def __len__(self): return len(self.seqList) def __getitem__(self, k): return self.seqList[k] def __contains__(self, potentialSubSet): # TEMP hideous algorithm if potentialSubSet.isMoreResolvedSubset(self): return True shorterCopy = copy.copy(self) maxOffSet = len(self.seqList) - len(potentialSubSet) if maxOffSet < 1: return False for i in xrange(maxOffSet): shorterCopy.pop(0) if potentialSubSet.isMoreResolvedSubset(shorterCopy): return True return False def __iter__(self): return iter(self.seqList) class DNASeq(MthSeq): symbols = 'ACGT-' equates = [ ('R', '{AG}'), ('Y', '{CT}'), ('M', '{AC}'), ('K', '{GT}'), ('S', '{CG}'), ('W', '{AT}'), ('H', '{ACT}'), ('B', '{CGT}'), ('V', '{ACG}'), ('D', '{AGT}'), ('N', '{ACGT}'), ('X', '{ACGT}')] listBitLists = [ [], [1], [2], [1, 2], [4], [1, 4], [2, 4], [1, 2, 4], [8], [1, 8], [2, 8], [1, 2, 8], [4, 8], [1, 4, 8], [2, 4, 8], [1, 2, 4, 8]] def __init__(self, transSeqList, stringForm = ''): self.seqList = [i for i in transSeqList] def __copy__(self): return DNASeq(copy.copy(self.seqList)) def translateCode(self, n): return DNASeq.codeToSymbolDict.get(n, '?') def toCodon(codonString): assert(len(codonString) == 3) nList = [DNASeq.symbolDict[i] for i in codonString[:3]] return DNASeq.numericCodesToCodon(nList) toCodon = staticmethod(toCodon) def numericCodesToCodon(nList): codonCode = 0 for n in nList[0:3]: if n == DNASeq.gapCode: raise ValueError, 'Gap code found in ...ToCodon() method.' if n > DNASeq.gapCode: n -= DNASeq.gapCode # we'll tolerate possible gaps and just mask off the gap codonCode <<= 4 codonCode += n return codonCode numericCodesToCodon = staticmethod(numericCodesToCodon) def codonCodeToStr(codonNumberCode): #print hex(codonNumberCode) tCode = codonNumberCode & 15 sCode = (codonNumberCode >> 4) & 15 fCode = (codonNumberCode >> 8) & 15 return ''.join([DNASeq.codeToSymbolDict[fCode], DNASeq.codeToSymbolDict[sCode], DNASeq.codeToSymbolDict[tCode]]) codonCodeToStr = staticmethod(codonCodeToStr) def iter_next_nucleotide(self): '''skips gaps and masks off the gap code of positions that have the gap bit''' for i in self.seqList: if i >= DNASeq.gapCode: if i != DNASeq.gapCode: yield i - DNASeq.gapCode else: yield i def iter_codons(self): '''skips gap sites, returns triples. Pads the end with 'N' codes''' retList = [0, 0] for i in self.iter_next_nucleotide(): if retList[0] == 0: retList[0] = i elif retList[1] == 0: retList[1] = i else: yield retList[0], retList[1], i retList = [0, 0] if retList[0] != 0: if retList[1] == 0: yield retList[0], DNASeq.anyNonGapCode, DNASeq.anyNonGapCode else: yield retList[0], retList[1], DNASeq.anyNonGapCode def getSubsetCodons(f, s, t): all_unambig = [] for f_unambig in DNASeq.listBitLists[f]: for s_unambig in DNASeq.listBitLists[s]: for t_unambig in DNASeq.listBitLists[t]: all_unambig.append(DNASeq.numericCodesToCodon((f_unambig, s_unambig, t_unambig))) return all_unambig getSubsetCodons = staticmethod(getSubsetCodons) def iter_ambig_codons(self): for f, s, t in self.iter_codons(): yield DNASeq.getSubsetCodons(f, s, t) class AASeq(MthSeq): symbols = 'ACDEFGHIKLMNPQRSTVWY*-' equates = [ ('B', '{DN}'), ('Z', '{EQ}'), ('X', 'ACDEFGHIKLMNPQRSTVWY*')] def __init__(self, nucList): self.seqList = nucList def __copy__(self): return AASeq(copy.copy(self.seqList)) def translateCode(self, n): c = AASeq.codeToSymbolDict.get(n) if c is None: return n > self.gapCode and '?' or 'X' return c def addLowerCaseEquateKeys(eList): lcEquates = [(i[0].lower(), i[1]) for i in eList] eList.extend(lcEquates) def constructNexusSymbolsTranslation(blockObj, respectCase, missing, useEquatesInOutput, tok = None): identityTrans = string.maketrans('', '') blockObj.charShift = 1 blockObj.symbolDict = {} blockObj.codeToSymbolDict = {} for s in blockObj.symbols: if respectCase: blockObj.symbolDict[s] = blockObj.charShift blockObj.codeToSymbolDict[blockObj.charShift] = s else: # assumes that the symbols list will not have both 'a' and 'A' blockObj.symbolDict[s.upper()] = blockObj.charShift blockObj.symbolDict[s.lower()] = blockObj.charShift blockObj.codeToSymbolDict[blockObj.charShift] = s.upper() blockObj.charShift = blockObj.charShift << 1 blockObj.missingCode = blockObj.charShift - 1 blockObj.symbolDict[missing] = blockObj.missingCode for e, v in blockObj.equates: if v.count('(') + v.count(')') > 0: raise NexusAfterTokenError(tok, 'Polymorphic Equates are not supported yet') eqSyms = v.translate(identityTrans, '{}()') n = 0 try: for es in eqSyms: n += blockObj.symbolDict[es] blockObj.symbolDict[e] = n if useEquatesInOutput: blockObj.codeToSymbolDict[n] = e.upper() # bio python IUPAC for protein is slightly different than NEXUS (X is {SC}), so we do not use protein equates except KeyError, x: raise NexusAfterTokenError(tok, 'Symbol %s in expansion of EQUATE macro "%s => %s" is invalid' % (x, e, v)) # for k,v in blockObj.symbolDict.iteritems(): # print k, '= %20s' % hex(v) # make sure when we are outputting we use IUPAC in preference to whatever missing character was assigned (if there is a clash) if blockObj.symbols.startswith(DNASeq.symbols[:-1]) and blockObj.symbolDict.has_key('N'): blockObj.codeToSymbolDict[blockObj.symbolDict['N']] = 'N' elif blockObj.symbols.startswith(AASeq.symbols[:-1]) and blockObj.symbolDict.has_key('X'): blockObj.codeToSymbolDict[blockObj.symbolDict['X']] = 'X' addLowerCaseEquateKeys(DNASeq.equates) constructNexusSymbolsTranslation(DNASeq, False, '?', True) DNASeq.missingCode = DNASeq.symbolDict['?'] DNASeq.gapCode = DNASeq.symbolDict['-'] DNASeq.anyNonGapCode = DNASeq.symbolDict['N'] DNASeq.stopSeqCode = 0 addLowerCaseEquateKeys(AASeq.equates) constructNexusSymbolsTranslation(AASeq, False, '?', False) AASeq.missingCode = AASeq.symbolDict['?'] AASeq.gapCode = AASeq.symbolDict['-'] AASeq.anyNonGapCode = AASeq.symbolDict['X'] AASeq.stopSeqCode = AASeq.symbolDict['*']