""" Parser for (new) ACE files output by PHRAP. version 1.3, 05/06/2004 Written by Frank Kauff (fkauff@duke.edu) and Cymon J. Cox (cymon@duke.edu) Uses the Biopython Parser interface: ParserSupport.py Usage: There are two ways of reading an ace file: The ACEParser() reads the whole file at once and the RecordParser() reads contig after contig. 1) Parse whole ace file at once: aceparser=Ace.ACEParser() acefilerecord=aceparser.parse(open('my_ace_file.ace','r')) gives you acefilerecord.ncontigs (the number of contigs in the ace file) acefilerecord.nreads (the number of reads in the ace file) acefilerecord.contigs[] (one instance of the Contig class for each contig) The Contig class holds the info of the CO tag, CT and WA tags, and all the reads used for this contig in a list of instances of the Read class, e.g.: contig3=acefilerecord.contigs[2] read4=contig3.reads[3] RD_of_read4=read4.rd DS_of_read4=read4.ds CT, WA, RT tags from the end of the file can appear anywhere are automatically sorted into the right place. see _RecordConsumer for details. 2) *** DEPRECATED: not entirely suitable for ACE files! Or you can iterate over the contigs of an ace file one by one in the ususal way: recordparser=Ace.RecordParser() iterator=Ace.Iterator(open('my_ace_file.ace','r'),recordparser) while 1: contig=iterator.next() if contig is None: break ... if WA, CT, RT, WR tags are at the end and the iterator is used, they will be returned with the last contig record. This is is necessarily the case when using an interator. Thus an ace file does not entirerly suit the concept of iterating. If WA, CT, RT, WR tags are needed, the ACEParser instead of the RecordParser might be appropriate. """ import os from types import * from Bio import File from Bio import Index #from Bio import Seq #from Bio import SeqRecord from Bio.ParserSupport import * from Bio.Alphabet import IUPAC class rd: def __init__(self): self.name='' self.padded_bases=None self.info_items=None self.read_tags=None self.sequence='' class qa: def __init__(self): self.qual_clipping_start=None self.qual_clipping_end=None self.align_clipping_start=None self.align_clipping_end=None class ds: def __init__(self): self.chromat_file='' self.phd_file='' self.time='' self.chem='' self.dye='' self.template='' self.direction='' class af: def __init__(self): self.name='' self.coru=None self.padded_start=None class bs: def __init__(self): self.name='' self.padded_start=None self.padded_end=None class rt: def __init__(self): self.name='' self.tag_type='' self.program='' self.padded_start=None self.padded_end=None self.date='' class ct: def __init__(self): self.name='' self.tag_type='' self.program='' self.padded_start=None self.padded_end=None self.date='' self.notrans='' self.info=[] class wa: def __init__(self): self.tag_type='' self.program='' self.date='' self.info=[] class wr: def __init__(self): self.name='' self.aligned='' self.program='' self.date=[] class Reads: def __init__(self): self.rd=None # one per read self.qa=None # one per read self.ds=None # none or one per read self.rt=None # none or many per read self.wr=None # none or many per read class Contig: """Hold information from a ACE record """ def __init__(self): self.name = '' self.nbases=None self.nreads=None self.nsegments=None self.uorc=None self.sequence=None self.quality=None self.af=[] self.bs=[] self.reads=[] self.ct=None # none or many self.wa=None # none or many class Iterator: """Iterates over a ACE-file with multiple contigs Methods: next Return the next record from the stream, or None. """ def __init__(self, handle, parser=None): """__init__(self, handle, parser=None) Create a new iterator. handle is a file-like object. parser is an optional Parser object to change the results into another form. If set to None, then the raw contents of the file will be returned. """ if type(handle) is not FileType and type(handle) is not InstanceType: raise ValueError, "I expected a file handle or file-like object" self._uhandle = File.UndoHandle(handle) self._parser = parser def next(self): """next(self) -> object Return the next contig record from the file. If no more records return None. """ lines = [] while 1: # if at beginning, skip the AS and look for first CO command line=self._uhandle.readline() if not line: # empty or corrupt file return None if line[:2]=='CO': lines.append(line) break while 1: line = self._uhandle.readline() if not line: break # If a new record, then put the line back and stop. if lines and line[:2] == 'CO': self._uhandle.saveline(line) break lines.append(line) if not lines: return None data = ''.join(lines) if self._parser is not None: return self._parser.parse(File.StringHandle(data)) return data class RecordParser(AbstractParser): """Parses ACE file data into a Record object """ def __init__(self): self._scanner = _Scanner() self._consumer = _RecordConsumer() def parse(self, handle): if isinstance(handle, File.UndoHandle): uhandle = handle else: uhandle = File.UndoHandle(handle) self._scanner.feed(uhandle, self._consumer) return self._consumer.data class ACEFileRecord: """Holds data of ACE file. """ def __init__(self): self.ncontigs=None self.nreads=None self.contigs=[] self.wa=None # none or many def sort(self): """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible. """ ct=[] rt=[] wr=[] # search for tags that aren't in the right position for i in range(len(self.contigs)): c = self.contigs[i] if c.wa: if not self.wa: self.wa=[] self.wa.extend(c.wa) if c.ct: newcts=[ct_tag for ct_tag in c.ct if ct_tag.name!=c.name] map(self.contigs[i].ct.remove,newcts) ct.extend(newcts) for j in range(len(c.reads)): r = c.reads[j] if r.rt: newrts=[rt_tag for rt_tag in r.rt if rt_tag.name!=r.rd.name] map(self.contigs[i].reads[j].rt.remove,newrts) rt.extend(newrts) if r.wr: newwrs=[wr_tag for wr_tag in r.wr if wr_tag.name!=r.rd.name] map(self.contigs[i].reads[j].wr.remove,newwrs) wr.extend(newwrs) # now sort them into their proper place for i in range(len(self.contigs)): c = self.contigs[i] for ct_tag in ct: if ct_tag.name==c.name: if self.contigs[i].ct is None: self.contigs[i].ct=[] self.contigs[i].ct.append(ct_tag) if rt or wr: for j in range(len(c.reads)): r = c.reads[j] for rt_tag in rt: if rt_tag.name==r.rd.name: if self.contigs[i].reads[j].rt is None: self.contigs[i].reads[j].rt=[] self.contigs[i].reads[j].rt.append(rt_tag) for wr_tag in wr: if wr_tag.name==r.rd.name: if self.contigs[i].reads[j].wr is None: self.contigs[i].reads[j].wr=[] self.contigs[i].reads[j].wr.append(wr_tag) class ACEParser(AbstractParser): """Parses full ACE file in list of contigs. """ def __init__(self): self.data=ACEFileRecord() def parse(self,handle): firstline=handle.readline() # check if the file starts correctly if firstline[:2]!='AS': raise SyntaxError, "File does not start with 'AS'." self.data.ncontigs=eval(firstline.split()[1]) self.data.nreads=eval(firstline.split()[2]) # now read all the records recparser=RecordParser() iter=Iterator(handle,recparser) while 1: rec=iter.next() if not rec: break self.data.contigs.append(rec) # wa, ct, rt rags are usually at the end of the file, but not necessarily (correct?). # If the iterator is used, the tags are returned with the contig or the read after which they appear, # if all tags are at the end, they are read with the last contig. The concept of an # iterator leaves no other choice. But if the user uses the ACEParser, we can check # them and put them into the appropriate contig/read instance. # Conclusion: An ACE file is not a filetype for which iteration is 100% suitable... self.data.sort() return self.data class _Scanner: """Scans a ACE-formatted file Methods: feed - Feed one ACE record. """ def feed(self, handle, consumer): """feed(self, handle, consumer) Feed in ACE data for scanning. handle is a file-like object containing ACE data. consumer is a Consumer object that will receive events as the ACE data is scanned. """ assert isinstance(handle, File.UndoHandle), \ "handle must be an UndoHandle" if handle.peekline(): self._scan_record(handle, consumer) def _scan_record(self, uhandle, consumer): consumer.begin_contig() read_and_call(uhandle,consumer.co_header,start='CO ') consumer.co_data(self._scan_sequence_data(uhandle)) read_and_call_while(uhandle,consumer.noevent,blank=1) read_and_call(uhandle,consumer.bq_header,start='BQ') consumer.bq_data(self._scan_bq_data(uhandle, consumer)) read_and_call_while(uhandle,consumer.noevent,blank=1) read_and_call_while(uhandle,consumer.af,start='AF ') read_and_call_while(uhandle,consumer.noevent,blank=1) read_and_call_while(uhandle,consumer.bs,start='BS ') # now read all the read data # it starts with a 'RD', and then a mandatory QA # then follows an optional DS # CT,RT,WA,WR may or may not be there in unlimited quantity. They might refer to the actial read or contig, # or, if encountered at the end of file, to any previous read or contig. the sort() method deals # with that later. while 1: # each read must have a rd and qa read_and_call_until(uhandle,consumer.noevent,start='RD ') read_and_call(uhandle,consumer.rd_header,start='RD ') consumer.rd_data(self._scan_sequence_data(uhandle)) read_and_call_while(uhandle,consumer.noevent,blank=1) read_and_call(uhandle,consumer.qa,start='QA ') # now one ds can follow try: read_and_call_while(uhandle,consumer.noevent,blank=1) attempt_read_and_call(uhandle,consumer.ds,start='DS ') except SyntaxError: # file ends consumer.end_contig() return # the file could just end, or there's some more stuff. In ace files, everything can happen. # the following tags are interspersed between reads and can ap[pear multiple times. while 1: # something left try: read_and_call_while(uhandle,consumer.noevent,blank=1) except SyntaxError: # file ends here consumer.end_contig() return else: if attempt_read_and_call(uhandle,consumer.rt_start,start='RT'): consumer.rt_data(self._scan_bracket_tags(uhandle)) elif attempt_read_and_call(uhandle,consumer.wr_start,start='WR'): consumer.wr_data(self._scan_bracket_tags(uhandle)) elif attempt_read_and_call(uhandle,consumer.wa_start,start='WA'): consumer.wa_data(self._scan_bracket_tags(uhandle)) elif attempt_read_and_call(uhandle,consumer.ct_start,start='CT'): consumer.ct_data(self._scan_bracket_tags(uhandle)) else: line=safe_peekline(uhandle) break if not line.startswith('RD'): # another read? consumer.end_contig() break def _scan_bq_data(self, uhandle, consumer): """Scans multiple lines of quality data and concatenates them.""" qual='' while 1: line=uhandle.readline() if is_blank_line(line): uhandle.saveline(line) break qual+=' '+line return qual def _scan_sequence_data(self,uhandle): """Scans multiple lines of sequence data and concatenates them.""" seq='' while 1: line=uhandle.readline() if is_blank_line(line): uhandle.saveline(line) break seq+=line.strip() return seq def _scan_bracket_tags(self,uhandle): """Reads the data lines of a {} tag.""" fulltag=[] while 1: line=uhandle.readline().strip() fulltag.append(line) if line.endswith('}'): fulltag[-1]=fulltag[-1][:-1] # delete the ending } if fulltag[-1]=='': fulltag=fulltag[:-1] # delete empty line break return fulltag class _RecordConsumer(AbstractConsumer): """Reads the ace tags into data records.""" def __init__(self): self.data = None def begin_contig(self): self.data = Contig() def end_contig(self): pass def co_header(self,line): header=line.split() self.data.name=header[1] self.data.nbases=eval(header[2]) self.data.nreads=eval(header[3]) self.data.nsegments=eval(header[4]) self.data.uorc=header[5] def co_data(self,seq): self.data.sequence=seq def bq_header(self,line): pass def bq_data(self,qual): self.data.quality=map(eval,qual.split()) def af(self,line): header=line.split() afdata=af() afdata.name=header[1] afdata.coru=header[2] afdata.padded_start=eval(header[3]) self.data.af.append(afdata) def bs(self,line): header=line.split() bsdata=bs() bsdata.padded_start=eval(header[1]) bsdata.padded_end=eval(header[2]) bsdata.name=header[3] self.data.bs.append(bsdata) def rd_header(self,line): header=line.split() # Reads start with rd, so we create a new read record here self.data.reads.append(Reads()) rddata=rd() rddata.name=header[1] rddata.padded_bases=eval(header[2]) rddata.info_items=eval(header[3]) rddata.read_tags=eval(header[4]) self.data.reads[-1].rd=rddata def rd_data(self,seq): self.data.reads[-1].rd.sequence=seq def qa(self,line): header=map(eval,line.split()[1:]) qadata=qa() qadata.qual_clipping_start=header[0] qadata.qual_clipping_end=header[1] qadata.align_clipping_start=header[2] qadata.align_clipping_end=header[3] self.data.reads[-1].qa=qadata def ds(self,line): dsdata=ds() tags=['CHROMAT_FILE','PHD_FILE','TIME','CHEM','DYE','TEMPLATE','DIRECTION'] poss=map(line.find,tags) tagpos=dict(zip(poss,tags)) if tagpos.has_key(-1): del tagpos[-1] ps=tagpos.keys() ps.sort() for (p1,p2) in zip(ps,ps[1:]+[len(line)+1]): setattr(dsdata,tagpos[p1].lower(),line[p1+len(tagpos[p1])+1:p2].strip()) self.data.reads[-1].ds=dsdata def ct_start(self,line): if not line.strip().endswith('{'): print line raise SyntaxError, 'CT tag does not start with CT{' ctdata=ct() if self.data.ct is None: self.data.ct=[] self.data.ct.append(ctdata) def ct_data(self,taglines): if len(taglines)<1: raise SyntaxError, 'Missing header line in CT tag' header=taglines[0].split() self.data.ct[-1].name=header[0] self.data.ct[-1].tag_type=header[1] self.data.ct[-1].program=header[2] self.data.ct[-1].padded_start=eval(header[3]) self.data.ct[-1].padded_end=eval(header[4]) self.data.ct[-1].date=header[5] if len(header)==7: self.data.ct[-1].notrans=header[6] self.data.ct[-1].info=taglines[1:] def rt_start(self,line): if not line.strip().endswith('{'): raise SyntaxError, 'RT tag does not start with RT{' rtdata=rt() # now if we're at the end of the file, this rt could belong to a previous read, not the actual one # we store it here were it appears, the user can sort later. if self.data.reads[-1].rt is None: self.data.reads[-1].rt=[] self.data.reads[-1].rt.append(rtdata) def rt_data(self,taglines): if len(taglines)<1: raise SyntaxError, 'Missing header line in RT tag' header=taglines[0].split() self.data.reads[-1].rt[-1].name=header[0] self.data.reads[-1].rt[-1].tag_type=header[1] self.data.reads[-1].rt[-1].program=header[2] self.data.reads[-1].rt[-1].padded_start=eval(header[3]) self.data.reads[-1].rt[-1].padded_end=eval(header[4]) self.data.reads[-1].rt[-1].date=header[5] def wa_start(self,line): if not line.strip().endswith('{'): raise SyntaxError, 'WA tag does not start with WA{' wadata=wa() if self.data.wa is None: self.data.wa=[] self.data.wa.append(wadata) def wa_data(self,taglines): if len(taglines)<1: raise SyntaxError, 'Missing header line in WA tag' header=taglines[0].split() self.data.wa[-1].tag_type=header[0] self.data.wa[-1].program=header[1] self.data.wa[-1].date=header[2] self.data.wa[-1].info=taglines[1:] def wr_start(self,line): if not line.strip().endswith('{'): raise SyntaxError, 'WR tag does not start with WR{' wrdata=wr() if self.data.reads[-1].wr is None: self.data.reads[-1].wr=[] self.data.reads[-1].wr.append(wrdata) def wr_data(self,taglines): if len(taglines)<1: raise SyntaxError, 'Missing header line in WR tag' header=taglines[0].split() self.data.reads[-1].wr[-1].name=header[0] self.data.reads[-1].wr[-1].aligned=header[1] self.data.reads[-1].wr[-1].program=header[2] self.data.reads[-1].wr[-1].date=header[3]