# Copyright 2009 by Peter Cock. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Optimised sequence conversion code (PRIVATE).

You are not expected to access this module, or any of its code, directly. This
is all handled internally by the Bio.SeqIO.convert(...) function which is the
public interface for this.

The idea here is that while doing this will work::

    from Bio import SeqIO
    records = SeqIO.parse(in_handle, in_format)
    count = SeqIO.write(records, out_handle, out_format)

it is shorter to write::

    from Bio import SeqIO
    count = SeqIO.convert(in_handle, in_format, out_handle, out_format)

Also, the convert function can take a number of special case optimisations. This
means that using Bio.SeqIO.convert() may be faster, as well as more convenient.
All these file format specific optimisations are handled by this (private) module.
"""

from Bio import SeqIO
# NOTE - Lots of lazy imports further on...

__docformat__ = "restructuredtext en"

# NOTE(review): the ``def`` line of this function is missing from the reviewed
# extract. The name and the optional, unused ``alphabet`` argument are
# reconstructed from the docstring and body - confirm against the full file.
def _fastq_solexa_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
    """Fast Solexa FASTQ to Solexa FASTQ conversion (PRIVATE).

    Useful for removing line wrapping and the redundant second identifier
    on the plus lines. Will also check the quality string is valid.
    Avoids creating SeqRecord and Seq objects in order to speed up this
    conversion.

    A 256 character translation table is built in which the characters with
    codes 59 to 126 map to themselves and every other code maps to the null
    character. The table and both handles are then passed to
    ``_fastq_generic``, whose return value is returned unchanged.
    """
    # Map unexpected chars to null
    null = chr(0)
    mapping = "".join(
        [null for _ in range(0, 59)]
        + [chr(code) for code in range(59, 127)]
        + [null for _ in range(127, 256)]
    )
    # Sanity check: exactly one table entry per possible 8-bit code.
    assert len(mapping) == 256
    return _fastq_generic(in_handle, out_handle, mapping)

# NOTE(review): the ``def`` line of this function is missing from the reviewed
# extract. The name and the optional, unused ``alphabet`` argument are
# reconstructed from the docstring and body - confirm against the full file.
def _fastq_illumina_convert_fastq_illumina(in_handle, out_handle,
                                           alphabet=None):
    """Fast Illumina 1.3+ FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE).

    Useful for removing line wrapping and the redundant second identifier
    on the plus lines. Will also check the quality string is valid.
    Avoids creating SeqRecord and Seq objects in order to speed up this
    conversion.

    A 256 character translation table is built in which the characters with
    codes 64 to 126 map to themselves and every other code maps to the null
    character. The table and both handles are then passed to
    ``_fastq_generic``, whose return value is returned unchanged.
    """
    # Map unexpected chars to null
    null = chr(0)
    mapping = "".join(
        [null for _ in range(0, 64)]
        + [chr(code) for code in range(64, 127)]
        + [null for _ in range(127, 256)]
    )
    # Sanity check: exactly one table entry per possible 8-bit code.
    assert len(mapping) == 256
    return _fastq_generic(in_handle, out_handle, mapping)

# NOTE(review): the ``def`` line of this function is missing from the reviewed
# extract. The name and the optional, unused ``alphabet`` argument are
# reconstructed from the docstring and body - confirm against the full file.
def _fastq_sanger_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
    """Fast Sanger FASTQ to Solexa FASTQ conversion (PRIVATE).

    Avoids creating SeqRecord and Seq objects in order to speed up this
    conversion. Will issue a warning if the scores had to be truncated at 62
    (maximum possible in the Solexa FASTQ format).

    The translation table built here has four regions:

    - codes 0 to 32 map to the null character,
    - codes 33 to 95 (PHRED scores 0 to 62) map to ``chr(64 + s)`` where
      ``s`` is the rounded result of ``solexa_quality_from_phred``,
    - codes 96 to 126 map to the truncation marker ``chr(1)``,
    - codes 127 to 255 map to the null character.

    The table, the truncation marker and the warning text are passed to
    ``_fastq_generic2``, whose return value is returned unchanged.
    """
    from Bio.SeqIO.QualityIO import solexa_quality_from_phred
    null = chr(0)
    # Marker for scores above the maximum the output format allows.
    trunc_char = chr(1)
    # Map unexpected chars to null
    mapping = "".join(
        [null for _ in range(0, 33)]
        + [chr(64 + int(round(solexa_quality_from_phred(q))))
           for q in range(0, 62 + 1)]
        + [trunc_char for _ in range(96, 127)]
        + [null for _ in range(127, 256)]
    )
    # Sanity check: exactly one table entry per possible 8-bit code.
    assert len(mapping) == 256
    return _fastq_generic2(in_handle, out_handle, mapping, trunc_char,
                           "Data loss - max Solexa quality 62 in Solexa FASTQ")

# NOTE(review): the ``def`` line of this function is missing from the reviewed
# extract. The name and the optional, unused ``alphabet`` argument are
# reconstructed from the docstring and body - confirm against the full file.
def _fastq_convert_fasta(in_handle, out_handle, alphabet=None):
    """Fast FASTQ to FASTA conversion (PRIVATE).

    Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and
    Seq objects in order to speed up this conversion.

    NOTE - This does NOT check the characters used in the FASTQ quality string
    are valid!

    Each record is written as a ``>title`` line followed by the sequence
    wrapped at 60 characters per line. Returns the number of records
    written.
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
    # For real speed, don't even make SeqRecord and Seq objects!
    count = 0
    # The quality string is deliberately ignored (see NOTE in the docstring).
    for title, seq, _qual in FastqGeneralIterator(in_handle):
        count += 1
        out_handle.write(">%s\n" % title)
        # Do line wrapping
        for start in range(0, len(seq), 60):
            out_handle.write(seq[start:start + 60] + "\n")
    return count

# NOTE(review): the ``def`` line of this function is missing from the reviewed
# extract. The name and the optional, unused ``alphabet`` argument are
# reconstructed from the docstring and body - confirm against the full file.
def _fastq_convert_tab(in_handle, out_handle, alphabet=None):
    """Fast FASTQ to simple tabbed conversion (PRIVATE).

    Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and
    Seq objects in order to speed up this conversion.

    NOTE - This does NOT check the characters used in the FASTQ quality string
    are valid!

    Each record is written as one line holding the first whitespace
    delimited word of the title, a tab, and the sequence. Returns the number
    of records written.
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
    # For real speed, don't even make SeqRecord and Seq objects!
    count = 0
    # The quality string is deliberately ignored (see NOTE in the docstring).
    for title, seq, _qual in FastqGeneralIterator(in_handle):
        count += 1
        # Only the first word of the title is kept as the identifier.
        out_handle.write("%s\t%s\n" % (title.split(None, 1)[0], seq))
    return count

# NOTE(review): the ``def`` line of this function is missing from the reviewed
# extract. The name and parameter order are reconstructed from the docstring
# and the names used in the body - confirm against the full file.
def _fastq_convert_qual(in_handle, out_handle, mapping):
    """FASTQ helper function for QUAL output (PRIVATE).

    Mapping should be a dictionary mapping expected ASCII characters from the
    FASTQ quality string to PHRED quality scores (as strings).

    Each record is written as a ``>title`` line followed by the space
    separated scores, wrapped so that no line is longer than 60 characters
    and no score is split across lines. Returns the number of records
    written.

    Raises ValueError if a quality character is not a key of the mapping.
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
    # For real speed, don't even make SeqRecord and Seq objects!
    count = 0
    for title, _seq, qual in FastqGeneralIterator(in_handle):
        count += 1
        out_handle.write(">%s\n" % title)
        # map the qual... note even with Sanger encoding max 2 digits
        try:
            qualities_strs = [mapping[char] for char in qual]
        except KeyError:
            raise ValueError("Invalid character in quality string")
        data = " ".join(qualities_strs)
        while len(data) > 60:
            # Know quality scores are either 1 or 2 digits, so there
            # must be a space in any three consecutive characters.
            if data[60] == " ":
                out_handle.write(data[:60] + "\n")
                data = data[61:]
            elif data[59] == " ":
                out_handle.write(data[:59] + "\n")
                data = data[60:]
            else:
                assert data[58] == " ", "Internal logic failure in wrapping"
                out_handle.write(data[:58] + "\n")
                data = data[59:]
        # Whatever is left (60 characters or fewer) forms the final line.
        out_handle.write(data + "\n")
    return count