pybam.py 52KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756
  1. '''
  2. Pybam from commit ba460f1 converted for Python3 by Hannes Luidalepp.
  3. Currently only dynamic parsing is functional.
  4. http://github.com/luidale/pybam
  5. Original description:
  6. Awesome people who have directly contributed to the project:
  7. Jon Palmer - Bug finder & advice on project direction
  8. Mahmut Uludag - Bug finder
  9. Help: print pybam.wat
  10. Github: http://github.com/JohnLonginotto/pybam
  11. This code was written by John Longinotto, a PhD student of the Pospisilik Lab at the Max Planck Institute of Immunbiology & Epigenetics, Freiburg.
  12. My PhD is funded by the Deutsches Epigenom Programm (DEEP), and the Max Planck IMPRS Program.
  13. I study Adipose Biology and Circadian Rhythm in mice, although it seems these days I spend most of my time at the computer :-)
  14. '''
  15. import os
  16. import sys
  17. import zlib
  18. import time
  19. import tempfile
  20. import subprocess
  21. from array import array
  22. from struct import unpack
  23. CtoPy = { 'A':'<c', 'c':'<b', 'C':'<B', 's':'<h', 'S':'<H', 'i':'<i', 'I':'<I', 'f':'<f' }
  24. py4py = { 'A': 1 , 'c': 1 , 'C': 1 , 's': 2 , 'S': 2 , 'i': 4 , 'I': 4 , 'f': 4 }
  25. dna_codes = '=ACMGRSVTWYHKDBN'
  26. cigar_codes = 'MIDNSHP=X'
  27. parse_codes = {
  28. 'sam': ' The current alignment in SAM format.',
  29. 'bam': ' All the bytes that make up the current alignment ("read"),\n still in binary just as it was in the BAM file. Useful\n when creating a new BAM file of filtered alignments.',
  30. 'sam_qname': ' [1st column in SAM] The QNAME (fragment ID) of the alignment.',
  31. 'bam_qname': ' The original bytes before decoding to sam_qname.',
  32. 'sam_flag': ' [2nd column in SAM] The FLAG number of the alignment.',
  33. 'bam_flag': ' The original bytes before decoding to sam_flag.',
  34. 'sam_refID': ' The chromosome ID (not the same as the name!).\n Chromosome names are stored in the BAM header (file_chromosomes),\n so to convert refIDs to chromsome names one needs to do:\n "my_bam.file_chromosomes[read.sam_refID]" (or use sam_rname)\n But for comparisons, using the refID is much faster that using\n the actual chromosome name (for example, when reading through a\n sorted BAM file and looking for where last_refID != this_refID)\n Note that when negative the alignment is not aligned, and thus one\n must not perform my_bam.file_chromosomes[read.sam_refID]\n without checking that the value is positive first.',
  35. 'sam_rname': ' [3rd column in SAM] The actual chromosome/contig name for the\n alignment. Will return "*" if refID is negative.',
  36. 'bam_refID': ' The original bytes before decoding to sam_refID.',
  37. 'sam_pos1': ' [4th column in SAM] The 1-based position of the alignment. Note\n that in SAM format values less than 1 are converted to "0" for\n "no data" and sam_pos1 will also do this.',
  38. 'sam_pos0': ' The 0-based position of the alignment. Note that in SAM all\n positions are 1-based, but in BAM they are stored as 0-based.\n Unlike sam_pos1, negative values are kept as negative values,\n essentially giving one the decoded value as it was stored.',
  39. 'bam_pos': ' The original bytes before decoding to sam_pos*.',
  40. 'sam_mapq': ' [5th column in SAM] The Mapping Quality of the current alignment.',
  41. 'bam_mapq': ' The original bytes before decoding to sam_mapq.',
  42. 'sam_cigar_string': ' [6th column in SAM] The CIGAR string, as per the SAM format.\n Allowed values are "MIDNSHP=X".',
  43. 'sam_cigar_list': ' A list of tuples with 2 values per tuple:\n the number of bases, and the CIGAR operation applied to those\n bases. Faster to calculate than sam_cigar_string.',
  44. 'bam_cigar': ' The original bytes before decoding to sam_cigar_*.',
  45. 'sam_next_refID': ' The sam_refID of the alignment\'s mate (if any). Note that as per\n sam_refID, this value can be negative and is not the actual\n chromosome name (see sam_pnext1).',
  46. 'sam_rnext': ' [7th column in SAM] The chromosome name of the alignment\'s mate.\n Value is "*" if unmapped. Note that in a SAM file this value\n is "=" if it is the same as the sam_rname, however pybam will\n only do this if the user prints the whole SAM entry with "sam".',
  47. 'bam_next_refID': ' The original bytes before decoding to sam_next_refID.',
  48. 'sam_pnext1': ' [8th column in SAM] The 1-based position of the alignment\'s mate.\n Note that in SAM format values less than 1 are converted to "0"\n for "no data", and sam_pnext1 will also do this.',
  49. 'sam_pnext0': ' The 0-based position of the alignment\'s mate. Note that in SAM all\n positions are 1-based, but in BAM they are stored as 0-based.\n Unlike sam_pnext1, negative values are kept as negative values\n here, essentially giving you the value as it was stored in BAM.',
  50. 'bam_pnext': ' The original bytes before decoding to sam_pnext0.',
  51. 'sam_tlen': ' [9th column in SAM] The TLEN value.',
  52. 'bam_tlen': ' The original bytes before decoding to sam_tlen.',
  53. 'sam_seq': ' [10th column in SAM] The SEQ value (DNA sequence of the alignment).\n Allowed values are "ACGTMRSVWYHKDBN and =".',
  54. 'bam_seq': ' The original bytes before decoding to sam_seq.',
  55. 'sam_qual': ' [11th column in SAM] The QUAL value (quality scores per DNA base\n in SEQ) of the alignment.',
  56. 'bam_qual': ' The original bytes before decoding to sam_qual.',
  57. 'sam_tags_list': ' A list of tuples with 3 values per tuple: a two-letter TAG ID, the\n type code used to describe the data in the TAG value (see SAM spec.\n for details), and the value of the TAG. Note that the BAM format\n has type codes like "c" for a number in the range -127 to +127,\n and "C" for a number in the range of 0 to 255.\n In a SAM file however, all numerical codes appear to just be stored\n using "i", which is a number in the range -2147483647 to +2147483647.\n sam_tags_list will therefore return the code used in the BAM file,\n and not "i" for all numbers.',
  58. 'sam_tags_string': ' [12th column a SAM] Returns the TAGs in the same format as would be found \n in a SAM file (with all numbers having a signed 32bit code of "i").',
  59. 'bam_tags': ' The original bytes before decoding to sam_tags_*.',
  60. 'sam_bin': ' The bin value of the alignment (used for indexing reads).\n Please refer to section 5.3 of the SAM spec for how this\n value is calculated.',
  61. 'bam_bin': ' The original bytes before decoding to sam_bin.',
  62. 'sam_block_size': ' The number of bytes the current alignment takes up in the BAM\n file minus the four bytes used to store the block_size value\n itself. Essentially sam_block_size +4 == bytes needed to store\n the current alignment.',
  63. 'bam_block_size': ' The original bytes before decoding to sam_block_size.',
  64. 'sam_l_read_name': ' The length of the QNAME plus 1 because the QNAME is terminated\n with a NUL byte.',
  65. 'bam_l_read_name': ' The original bytes before decoding to sam_l_read_name.',
  66. 'sam_l_seq': ' The number of bases in the seq. Useful if you just want to know\n how many bases are in the SEQ but do not need to know what those\n bases are (which requires more decoding effort).',
  67. 'bam_l_seq': ' The original bytes before decoding to sam_l_seq.',
  68. 'sam_n_cigar_op': ' The number of CIGAR operations in the CIGAR field. Useful if one\n wants to know how many CIGAR operations there are, but does not\n need to know what they are.',
  69. 'bam_n_cigar_op': ' The original bytes before decoding to sam_n_cigar_op.',
  70. 'file_alignments_read': ' A running counter of the number of alignments ("reads"),\n processed thus far. Note the BAM format does not store\n how many reads are in a file, so the usefulness of this\n metric is somewhat limited unless one already knows how\n many reads are in the file.',
  71. 'file_binary_header': ' From the first byte in the file, until the first byte of\n the first read. The original binary header.',
  72. 'file_bytes_read': ' A running counter of the bytes read from the file. Note\n that as data is read in arbitary chunks, this is literally\n the amount of data read from the file/pipe by pybam.',
  73. 'file_chromosome_lengths': ' The binary header of the BAM file includes chromosome names\n and chromosome lengths. This is a dictionary of chromosome-name\n keys and chromosome-length values.',
  74. 'file_chromosomes': ' A list of chromosomes from the binary header.',
  75. 'file_decompressor': ' BAM files are compressed with bgzip. The value here reflects\n the decompressor used. "internal" if pybam\'s internal\n decompressor is being used, "gzip" or "pigz" if the system\n has these binaries installed and pybam can find them.\n Any other value reflects a custom decompression command.',
  76. 'file_directory': ' The directory the input BAM file can be found in. This will be\n correct if the input file is specified via a string or python\n file object, however if the input is a pipe such as sys.stdin, \n then the current working directory will be used.',
  77. 'file_header': ' The ASCII portion of the BAM header. This is the typical header\n users of samtools will be familiar with.',
  78. 'file_name': ' The file name (base name) of input file if input is a string or\n python file object. If input is via stdin this will be "<stdin>"'
  79. }
  80. wat = '''
  81. Main class: pybam.read
  82. Github: http://github.com/JohnLonginotto/pybam
  83. [ Dynamic Parser Example ]
  84. for alignment in pybam.read('/my/data.bam'):
  85. print alignment.sam_seq
  86. [ Static Parser Example ]
  87. for seq,mapq in pybam.read('/my/data.bam',['sam_seq','sam_mapq']):
  88. print seq
  89. print mapq
  90. [ Mixed Parser Example ]
  91. my_bam = pybam.read('/my/data.bam',['sam_seq','sam_mapq'])
  92. print my_bam._static_parser_code
  93. for seq,mapq in my_bam:
  94. if seq.startswith('ACGT') and mapq > 10:
  95. print my_bam.sam
  96. [ Custom Decompressor (from file path) Example ]
  97. my_bam = pybam.read('/my/data.bam.lzma',decompressor='lzma --decompress --stdout /my/data.bam.lzma')
  98. [ Custom Decompressor (from file object) Example ]
  99. my_bam = pybam.read(sys.stdin,decompressor='lzma --decompress --stdout') # data given to lzma via stdin
  100. [ Force Internal bgzip Decompressor ]
  101. my_bam = pybam.read('/my/data.bam',decompressor='internal')
  102. [ Parse Words (hah) ]'''
  103. wat += '\n'+''.join([('\n===============================================================================================\n\n ' if code is 'file_alignments_read' or code is 'sam' else ' ')+(code+' ').ljust(25,'-')+description+'\n' for code,description in sorted(parse_codes.items())]) + '\n'
  104. class read():
  105. '''
  106. [ Dynamic Parser Example ]
  107. for alignment in pybam.read('/my/data.bam'):
  108. print alignment.sam_seq
  109. [ Static Parser Example ]
  110. for seq,mapq in pybam.read('/my/data.bam',['sam_seq','sam_mapq']):
  111. print seq
  112. print mapq
  113. [ Mixed Parser Example ]
  114. my_bam = pybam.read('/my/data.bam',['sam_seq','sam_mapq'])
  115. print my_bam._static_parser_code
  116. for seq,mapq in my_bam:
  117. if seq.startswith('ACGT') and mapq > 10:
  118. print my_bam.sam
  119. [ Custom Decompressor (from file path) Example ]
  120. my_bam = pybam.read('/my/data.bam.lzma',decompressor='lzma --decompress --stdout /my/data.bam.lzma')
  121. [ Custom Decompressor (from file object) Example ]
  122. my_bam = pybam.read(sys.stdin,decompressor='lzma --decompress --stdout') # data given to lzma via stdin
  123. [ Force Internal bgzip Decompressor ]
  124. my_bam = pybam.read('/my/data.bam',decompressor='internal')
  125. "print pybam.wat" in the python terminal to see the possible parsable values,
  126. or visit http://github.com/JohnLonginotto/pybam for the latest info.
  127. '''
  128. def __init__(self,f,fields=False,decompressor=False):
  129. self.file_bytes_read = 0
  130. self.file_chromosomes = []
  131. self.file_alignments_read = 0
  132. self.file_chromosome_lengths = {}
  133. if fields is not False:
  134. print(fields)
  135. if type(fields) is not list or len(fields) is 0:
  136. raise PybamError('\n\nFields for the static parser must be provided as a non-empty list. You gave a ' + str(type(fields)) + '\n')
  137. else:
  138. for field in fields:
  139. if field.startswith('sam') or field.startswith('bam'):
  140. if field not in list(parse_codes.keys()):
  141. raise PybamError('\n\nStatic parser field "' + str(field) + '" from fields ' + str(fields) + ' is not known to this version of pybam!\nPrint "pybam.wat" to see available field names with explinations.\n')
  142. else:
  143. raise PybamError('\n\nStatic parser field "' + str(field) + '" from fields ' + str(fields) + ' does not start with "sam" or "bam" and thus is not an avaliable field for the static parsing.\nPrint "pybam.wat" in interactive python to see available field names with explinations.\n')
  144. if decompressor:
  145. if type(decompressor) is str:
  146. if decompressor is not 'internal' and '{}' not in decompressor: raise PybamError('\n\nWhen a custom decompressor is used and the input file is a string, the decompressor string must contain at least one occurence of "{}" to be substituted with a filepath by pybam.\n')
  147. else: raise PybamError('\n\nUser-supplied decompressor must be a string that when run on the command line decompresses a named file (or stdin), to stdout:\ne.g. "lzma --decompress --stdout {}" if pybam is provided a path as input file, where {} is substituted for that path.\nor just "lzma --decompress --stdout" if pybam is provided a file object instead of a file path, as data from that file object will be piped via stdin to the decompression program.\n')
  148. ## First we make a generator that will return chunks of uncompressed data, regardless of how we choose to decompress:
  149. def generator():
  150. DEVNULL = open(os.devnull, 'wb')
  151. # First we need to figure out what sort of file we have - whether it's gzip compressed, uncompressed, or something else entirely!
  152. if type(f) is str:
  153. try: self._file = open(f,'rb')
  154. except: raise PybamError('\n\nCould not open "' + str(self._file.name) + '" for reading!\n')
  155. try: magic = os.read(self._file.fileno(),4)
  156. except: raise PybamError('\n\nCould not read from "' + str(self._file.name) + '"!\n')
  157. elif type(f) is file:
  158. self._file = f
  159. try: magic = os.read(self._file.fileno(),4)
  160. except: raise PybamError('\n\nCould not read from "' + str(self._file.name) + '"!\n')
  161. else: raise PybamError('\n\nInput file was not a string or a file object. It was: "' + str(f) + '"\n')
  162. self.file_name = os.path.basename(os.path.realpath(self._file.name))
  163. self.file_directory = os.path.dirname(os.path.realpath(self._file.name))
  164. if magic == b'BAM\1':
  165. # The user has passed us already unzipped BAM data! Job done :)
  166. data = b'BAM\1' + self._file.read(35536)
  167. self.file_bytes_read += len(data)
  168. self.file_decompressor = 'None'
  169. while data:
  170. yield data
  171. data = self._file.read(35536)
  172. self.file_bytes_read += len(data)
  173. self._file.close()
  174. DEVNULL.close()
  175. return
  176. elif magic == b"\x1f\x8b\x08\x04": # The user has passed us compressed gzip/bgzip data, which is typical for a BAM file
  177. # use custom decompressor if provided:
  178. if decompressor is not False and decompressor is not 'internal':
  179. if type(f) is str: self._subprocess = subprocess.Popen( decompressor.replace('{}',f), shell=True, stdout=subprocess.PIPE, stderr=DEVNULL)
  180. else: self._subprocess = subprocess.Popen('{ printf "'+magic+'"; cat; } | ' + decompressor, stdin=self._file, shell=True, stdout=subprocess.PIPE, stderr=DEVNULL)
  181. self.file_decompressor = decompressor
  182. data = self._subprocess.stdout.read(35536)
  183. self.file_bytes_read += len(data)
  184. while data:
  185. yield data
  186. data = self._subprocess.stdout.read(35536)
  187. self.file_bytes_read += len(data)
  188. self._file.close()
  189. DEVNULL.close()
  190. return
  191. # else look for pigz or gzip:
  192. else:
  193. try:
  194. self._subprocess = subprocess.Popen(["pigz"],stdin=DEVNULL,stdout=DEVNULL,stderr=DEVNULL)
  195. if self._subprocess.returncode is None: self._subprocess.kill()
  196. use = 'pigz'
  197. except OSError:
  198. try:
  199. self._subprocess = subprocess.Popen(["gzip"],stdin=DEVNULL,stdout=DEVNULL,stderr=DEVNULL)
  200. if self._subprocess.returncode is None: self._subprocess.kill()
  201. use = 'gzip'
  202. except OSError:
  203. use = 'internal'
  204. if use is not 'internal' and decompressor is not 'internal':
  205. if type(f) is str: self._subprocess = subprocess.Popen([ use , '--decompress','--stdout', f ], stdout=subprocess.PIPE, stderr=DEVNULL)
  206. else: self._subprocess = subprocess.Popen('{ printf "'+magic+'"; cat; } | ' + use + ' --decompress --stdout', stdin=f, shell=True, stdout=subprocess.PIPE, stderr=DEVNULL)
  207. time.sleep(1)
  208. if self._subprocess.poll() == None:
  209. data = self._subprocess.stdout.read(35536)
  210. self.file_decompressor = use
  211. self.file_bytes_read += len(data)
  212. while data:
  213. yield data
  214. data = self._subprocess.stdout.read(35536)
  215. self.file_bytes_read += len(data)
  216. self._file.close()
  217. DEVNULL.close()
  218. return
  219. # Python's gzip module can't read from a stream that doesn't support seek(), and the zlib module cannot read the bgzip format without a lot of help:
  220. self.file_decompressor = 'internal'
  221. raw_data = magic + self._file.read(65536)
  222. self.file_bytes_read = len(raw_data)
  223. internal_cache = []
  224. blocks_left_to_grab = 50
  225. bs = 0
  226. checkpoint = 0
  227. decompress = zlib.decompress
  228. while raw_data:
  229. if len(raw_data) - bs < 35536:
  230. raw_data = raw_data[bs:] + self._file.read(65536)
  231. self.file_bytes_read += len(raw_data) - bs
  232. bs = 0
  233. magic = raw_data[bs:bs+4]
  234. if not magic: break # a child's heart
  235. if magic != b"\x1f\x8b\x08\x04": raise PybamError('\n\nThe input file is not in a format I understand. First four bytes: ' + repr(magic) + '\n')
  236. try:
  237. more_bs = bs + unpack("<H", raw_data[bs+16:bs+18])[0] +1
  238. internal_cache.append(decompress(raw_data[bs+18:more_bs-8],-15))
  239. bs = more_bs
  240. except: ## zlib doesnt have a nice exception for when things go wrong. just "error"
  241. header_data = magic + raw_data[bs+4:bs+12]
  242. header_size = 12
  243. extra_len = unpack("<H", header_data[-2:])[0]
  244. while header_size-12 < extra_len:
  245. header_data += raw_data[bs+12:bs+16]
  246. subfield_id = header_data[-4:-2]
  247. subfield_len = unpack("<H", header_data[-2:])[0]
  248. subfield_data = raw_data[bs+16:bs+16+subfield_len]
  249. header_data += subfield_data
  250. header_size += subfield_len + 4
  251. if subfield_id == 'BC': block_size = unpack("<H", subfield_data)[0]
  252. raw_data = raw_data[bs+16+subfield_len:bs+16+subfield_len+block_size-extra_len-19]
  253. crc_data = raw_data[bs+16+subfield_len+block_size-extra_len-19:bs+16+subfield_len+block_size-extra_len-19+8] # I have left the numbers in verbose, because the above try is the optimised code.
  254. bs = bs+16+subfield_len+block_size-extra_len-19+8
  255. zipped_data = header_data + raw_data + crc_data
  256. internal_cache.append(decompress(zipped_data,47)) # 31 works the same as 47.
  257. # Although the following in the bgzip code from biopython, its not needed if you let zlib decompress the whole zipped_data, header and crc, because it checks anyway (in C land)
  258. # I've left the manual crc checks in for documentation purposes:
  259. '''
  260. expected_crc = crc_data[:4]
  261. expected_size = unpack("<I", crc_data[4:])[0]
  262. if len(unzipped_data) != expected_size: print 'ERROR: Failed to unpack due to a Type 1 CRC error. Could the BAM be corrupted?'; exit()
  263. crc = zlib.crc32(unzipped_data)
  264. if crc < 0: crc = pack("<i", crc)
  265. else: crc = pack("<I", crc)
  266. if expected_crc != crc: print 'ERROR: Failed to unpack due to a Type 2 CRC error. Could the BAM be corrupted?'; exit()
  267. '''
  268. blocks_left_to_grab -= 1
  269. if blocks_left_to_grab == 0:
  270. yield b''.join(internal_cache)
  271. internal_cache = []
  272. blocks_left_to_grab = 50
  273. self._file.close()
  274. DEVNULL.close()
  275. if internal_cache != b'':
  276. yield b''.join(internal_cache)
  277. return
  278. elif decompressor is not False and decompressor is not 'internal':
  279. # It wouldn't be safe to just print to the shell four random bytes from the beginning of a file, so instead it's
  280. # written to a temp file and cat'd. The idea here being that we trust the decompressor string as it was written by
  281. # someone with access to python, so it has system access anyway. The file/data, however, should not be trusted.
  282. magic_file = os.path.join(tempfile.mkdtemp(),'magic')
  283. with open(magic_file,'wb') as mf: mf.write(magic)
  284. if type(f) is str: self._subprocess = subprocess.Popen( decompressor.replace('{}',f), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  285. else: self._subprocess = subprocess.Popen('{ cat "'+magic_file+'"; cat; } | ' + decompressor, stdin=self._file, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  286. self.file_decompressor = decompressor
  287. data = self._subprocess.stdout.read(35536)
  288. self.file_bytes_read += len(data)
  289. while data:
  290. yield data
  291. data = self._subprocess.stdout.read(35536)
  292. self.file_bytes_read += len(data)
  293. self._file.close()
  294. DEVNULL.close()
  295. return
  296. else:
  297. raise PybamError('\n\nThe input file is not in a format I understand. First four bytes: ' + repr(magic) + '\n')
  298. ## At this point, we know that whatever decompression method was used, a call to self._generator will return some uncompressed data.
  299. self._generator = generator()
  300. ## So lets parse the BAM header:
  301. header_cache = b''
  302. while len(header_cache) < 4:
  303. header_cache += next(self._generator)
  304. p_from = 0; p_to = 4
  305. if header_cache[p_from:p_to] != b'BAM\x01':
  306. raise PybamError('\n\nInput file ' + self.file_name + ' does not appear to be a BAM file.\n')
  307. ## Parse the BAM header:
  308. p_from = p_to; p_to += 4
  309. length_of_header = unpack('<i',header_cache[p_from:p_to])[0]
  310. p_from = p_to; p_to += length_of_header
  311. while len(header_cache) < p_to: header_cache += str(next(self._generator))
  312. self.file_header = header_cache[p_from:p_to]
  313. p_from = p_to; p_to += 4
  314. while len(header_cache) < p_to: header_cache += str(next(self._generator))
  315. number_of_reference_sequences = unpack('<i',header_cache[p_from:p_to])[0]
  316. for _ in range(number_of_reference_sequences):
  317. p_from = p_to; p_to += 4
  318. while len(header_cache) < p_to: header_cache += next(self._generator)
  319. l_name = unpack('<l',header_cache[p_from:p_to])[0]
  320. p_from = p_to; p_to += l_name
  321. while len(header_cache) < p_to: header_cache += next(self._generator)
  322. self.file_chromosomes.append(header_cache[p_from:p_to -1].decode('ascii'))
  323. p_from = p_to; p_to += 4
  324. while len(header_cache) < p_to: header_cache += next(self._generator)
  325. self.file_chromosome_lengths[self.file_chromosomes[-1]] = unpack('<l',header_cache[p_from:p_to])[0]
  326. self.file_bytes_read = p_to
  327. self.file_binary_header = memoryview(header_cache[:p_to])
  328. header_cache = header_cache[p_to:]
  329. # A quick check to make sure the header of this BAM file makes sense:
  330. chromosomes_from_header = []
  331. for line in str(self.file_header).split('\\n'):
  332. if line.startswith('@SQ\\tSN:'):
  333. chromosomes_from_header.append(line.split('\\t')[1][3:])
  334. if chromosomes_from_header != self.file_chromosomes:
  335. raise PybamWarn('For some reason the BAM format stores the chromosome names in two locations,\n the ASCII text header we all know and love, viewable with samtools view -H, and another special binary header\n which is used to translate the chromosome refID (a number) into a chromosome RNAME when you do bam -> sam.\n\nThese two headers should always be the same, but apparently they are not:\nThe ASCII header looks like: ' + self.file_header + '\nWhile the binary header has the following chromosomes: ' + self.file_chromosomes + '\n')
  336. #HL - decoding header
  337. self.file_header = self.file_header.decode()
  338. ## Variable parsing:
  339. def new_entry(header_cache):
  340. cache = header_cache # we keep a small cache of X bytes of decompressed BAM data, to smoothen out disk access.
  341. p = 0 # where the next alignment/entry starts in the cache
  342. while True:
  343. try:
  344. while len(cache) < p + 4: cache = cache[p:] + next(self._generator); p = 0 # Grab enough bytes to parse blocksize
  345. self.sam_block_size = unpack('<i',cache[p:p+4])[0]
  346. self.file_alignments_read += 1
  347. while len(cache) < p + 4 + self.sam_block_size:
  348. cache = cache[p:] + next(self._generator); p = 0 # Grab enough bytes to parse entry
  349. except StopIteration: break
  350. self.bam = cache[p:p + 4 + self.sam_block_size]
  351. p = p + 4 + self.sam_block_size
  352. yield self
  353. self._new_entry = new_entry(header_cache)
  354. def compile_parser(self,fields):
  355. temp_code = ''
  356. end_of_qname = False
  357. end_of_cigar = False
  358. end_of_seq = False
  359. end_of_qual = False
  360. dependencies = set(fields)
  361. if 'bam' in fields:
  362. fields[fields.index('bam')] = 'self.bam'
  363. if 'sam_block_size' in fields:
  364. fields[fields.index('sam_block_size')] = 'self.sam_block_size'
  365. if 'sam' in dependencies:
  366. dependencies.update(['sam_qname','sam_flag','sam_rname','sam_pos1','sam_mapq','sam_cigar_string','bam_refID','bam_next_refID','sam_rnext','sam_pnext1','sam_tlen','sam_seq','sam_qual','sam_tags_string'])
  367. if 'sam_tags_string' in dependencies:
  368. dependencies.update(['sam_tags_list'])
  369. if 'sam_pos1' in dependencies:
  370. temp_code += "\n sam_pos1 = (0 if sam_pos0 < 0 else sam_pos0 + 1)"
  371. dependencies.update(['sam_pos0'])
  372. if 'sam_pnext1' in dependencies:
  373. temp_code += "\n sam_pnext1 = (0 if sam_pnext0 < 0 else sam_pnext0 + 1)"
  374. dependencies.update(['sam_pnext0'])
  375. if 'sam_qname' in dependencies or 'bam_qname' in dependencies:
  376. temp_code += "\n _end_of_qname = 36 + sam_l_read_name"
  377. dependencies.update(['sam_l_read_name'])
  378. end_of_qname = True
  379. if 'sam_cigar_string' in dependencies or 'sam_cigar_list' in dependencies or 'bam_cigar' in dependencies:
  380. if end_of_qname:
  381. pass
  382. else:
  383. temp_code += "\n _end_of_qname = 36 + sam_l_read_name"
  384. temp_code += "\n _end_of_cigar = _end_of_qname + (4*sam_n_cigar_op)"
  385. dependencies.update(['sam_l_read_name','sam_n_cigar_op'])
  386. end_of_cigar = True
  387. if 'sam_seq' in dependencies or 'bam_seq' in dependencies:
  388. if end_of_cigar:
  389. pass
  390. elif end_of_qname:
  391. temp_code += "\n _end_of_cigar = _end_of_qname + (4*sam_n_cigar_op)"
  392. else:
  393. temp_code += "\n _end_of_cigar = 36 + sam_l_read_name + (4*sam_n_cigar_op)"
  394. temp_code += "\n _end_of_seq = _end_of_cigar + (-((-sam_l_seq)//2))"
  395. dependencies.update(['sam_l_seq','sam_n_cigar_op','sam_l_read_name'])
  396. end_of_seq = True
  397. if 'sam_qual' in dependencies or 'bam_qual' in dependencies:
  398. if end_of_seq:
  399. pass
  400. elif end_of_cigar:
  401. temp_code += "\n _end_of_seq = _end_of_cigar + (-((-sam_l_seq)//2))"
  402. elif end_of_qname:
  403. temp_code += "\n _end_of_seq = _end_of_qname + (4*sam_n_cigar_op) + (-((-sam_l_seq)//2))"
  404. else:
  405. temp_code += "\n _end_of_seq = 36 + sam_l_read_name + (4*sam_n_cigar_op) + (-((-sam_l_seq)//2))"
  406. temp_code += "\n _end_of_qual = _end_of_seq + sam_l_seq"
  407. dependencies.update(['sam_l_seq','sam_n_cigar_op','sam_l_read_name'])
  408. end_of_qual = True
  409. if 'sam_tags_list' in dependencies or 'bam_tags' in dependencies:
  410. if end_of_qual:
  411. pass
  412. elif end_of_seq:
  413. temp_code += "\n _end_of_qual = _end_of_seq + sam_l_seq"
  414. elif end_of_cigar:
  415. temp_code += "\n _end_of_qual = _end_of_cigar + (-((-sam_l_seq)//2)) + sam_l_seq"
  416. elif end_of_qname:
  417. temp_code += "\n _end_of_qual = _end_of_qname + (4*sam_n_cigar_op) + (-((-sam_l_seq)//2)) + sam_l_seq"
  418. else:
  419. temp_code += "\n _end_of_qual = 36 + sam_l_read_name + (4*sam_n_cigar_op) + (-((-sam_l_seq)//2)) + sam_l_seq"
  420. dependencies.update(['sam_l_seq','sam_n_cigar_op','sam_l_read_name'])
  421. if 'sam_rname' in dependencies:
  422. temp_code += "\n sam_rname = '*' if sam_refID < 0 else self.file_chromosomes[sam_refID]"
  423. dependencies.update(['sam_refID'])
  424. if 'sam_rnext' in dependencies:
  425. temp_code += "\n sam_rnext = '*' if sam_next_refID < 0 else self.file_chromosomes[sam_next_refID]"
  426. dependencies.update(['sam_next_refID'])
  427. ## First we figure out what data from the static portion of the BAM entry we'll need:
  428. tmp = {}
  429. tmp['code'] = 'def parser(self):\n from array import array\n from struct import unpack\n for _ in self._new_entry:'
  430. tmp['last_start'] = None
  431. tmp['name_list'] = []
  432. tmp['dtype_list'] = []
  433. def pack_up(name,dtype,length,end,tmp):
  434. if name in dependencies:
  435. if tmp['last_start'] is None:
  436. tmp['last_start'] = end - length
  437. tmp['name_list'].append(name)
  438. tmp['dtype_list'].append(dtype)
  439. elif tmp['last_start'] is not None:
  440. tmp['code'] += '\n ' + ', '.join(tmp['name_list']) + ' = unpack("<' + ''.join(tmp['dtype_list']) + '",self.bam[' + str(tmp['last_start']) + ':' + str(end-length) + '])'
  441. if len(tmp['dtype_list']) == 1:
  442. tmp['code'] += '[0]'
  443. tmp['last_start'] = None
  444. tmp['name_list'] = []
  445. tmp['dtype_list'] = []
  446. pack_up('sam_refID', 'i',4, 8,tmp)
  447. pack_up('sam_pos0', 'i',4,12,tmp)
  448. pack_up('sam_l_read_name', 'B',1,13,tmp)
  449. pack_up('sam_mapq', 'B',1,14,tmp)
  450. pack_up('sam_bin', 'H',2,16,tmp)
  451. pack_up('sam_n_cigar_op', 'H',2,18,tmp)
  452. pack_up('sam_flag', 'H',2,20,tmp)
  453. pack_up('sam_l_seq', 'i',4,24,tmp)
  454. pack_up('sam_next_refID', 'i',4,28,tmp)
  455. pack_up('sam_pnext0', 'i',4,32,tmp)
  456. pack_up('sam_tlen', 'i',4,36,tmp)
  457. pack_up( None, None,0,36,tmp) # To add anything not yet added.
  458. code = tmp['code']
  459. del tmp
  460. code += temp_code
  461. # Fixed-length BAM data (where we just grab the bytes, we dont unpack) can, however, be grabbed individually.
  462. if 'bam_block_size' in dependencies: code += "\n bam_block_size = self.bam[0 : 4 ]"
  463. if 'bam_refID' in dependencies: code += "\n bam_refID = self.bam[4 : 8 ]"
  464. if 'bam_pos' in dependencies: code += "\n bam_pos = self.bam[8 : 12 ]"
  465. if 'bam_l_read_name' in dependencies: code += "\n bam_l_read_name = self.bam[12 : 13 ]"
  466. if 'bam_mapq' in dependencies: code += "\n bam_mapq = self.bam[13 : 14 ]"
  467. if 'bam_bin' in dependencies: code += "\n bam_bin = self.bam[14 : 16 ]"
  468. if 'bam_n_cigar_op' in dependencies: code += "\n bam_n_cigar_op = self.bam[16 : 18 ]"
  469. if 'bam_flag' in dependencies: code += "\n bam_flag = self.bam[18 : 20 ]"
  470. if 'bam_l_seq' in dependencies: code += "\n bam_l_seq = self.bam[20 : 24 ]"
  471. if 'bam_next_refID' in dependencies: code += "\n bam_next_refID = self.bam[24 : 28 ]"
  472. if 'bam_pnext' in dependencies: code += "\n bam_pnext = self.bam[28 : 32 ]"
  473. if 'bam_tlen' in dependencies: code += "\n bam_tlen = self.bam[32 : 36 ]"
  474. if 'bam_qname' in dependencies: code += "\n bam_qname = self.bam[36 : _end_of_qname ]"
  475. if 'bam_cigar' in dependencies: code += "\n bam_cigar = self.bam[_end_of_qname : _end_of_cigar ]"
  476. if 'bam_seq' in dependencies: code += "\n bam_seq = self.bam[_end_of_cigar : _end_of_seq ]"
  477. if 'bam_qual' in dependencies: code += "\n bam_qual = self.bam[_end_of_seq : _end_of_qual ]"
  478. if 'bam_tags' in dependencies: code += "\n bam_tags = self.bam[_end_of_qual : ]"
  479. if 'sam_qname' in dependencies:
  480. if 'bam_qname' in dependencies: code += "\n sam_qname = bam_qname[:-1]"
  481. else: code += "\n sam_qname = self.bam[36 : _end_of_qname -1 ]"
  482. if 'sam_cigar_list' in dependencies:
  483. if 'bam_cigar' in dependencies: code += "\n sam_cigar_list = [( cig >> 4 , cigar_codes[cig & 0b1111]) for cig in array('I', bam_cigar) ]"
  484. else: code += "\n sam_cigar_list = [( cig >> 4 , cigar_codes[cig & 0b1111]) for cig in array('I', self.bam[_end_of_qname : _end_of_cigar]) ]"
  485. if 'sam_cigar_string'in dependencies:
  486. if 'bam_cigar' in dependencies: code += "\n sam_cigar_string = ''.join([ str(cig >> 4) + cigar_codes[cig & 0b1111] for cig in array('I', bam_cigar)])"
  487. else: code += "\n sam_cigar_string = ''.join([ str(cig >> 4) + cigar_codes[cig & 0b1111] for cig in array('I', self.bam[_end_of_qname : _end_of_cigar]) ])"
  488. if 'sam_seq' in dependencies:
  489. if 'bam_seq' in dependencies: code += "\n sam_seq = ''.join( [ dna_codes[dna >> 4] + dna_codes[dna & 0b1111] for dna in array('B', bam_seq)])[:sam_l_seq]"
  490. else: code += "\n sam_seq = ''.join( [ dna_codes[dna >> 4] + dna_codes[dna & 0b1111] for dna in array('B', self.bam[_end_of_cigar : _end_of_seq])])[:sam_l_seq]"
  491. if 'sam_qual' in dependencies:
  492. if 'bam_qual' in dependencies: code += "\n sam_qual = b''.join( [ chr(ord(quality) + 33) for quality in bam_qual ])"
  493. else: code += "\n sam_qual = b''.join( [ chr(ord(quality) + 33) for quality in self.bam[_end_of_seq : _end_of_qual ]])"
  494. if 'sam_tags_list' in dependencies:
  495. code += '''
  496. sam_tags_list = []
  497. offset = _end_of_qual
  498. while offset != len(self.bam):
  499. tag_name = self.bam[offset:offset+2]
  500. tag_type = self.bam[offset+2]
  501. if tag_type == 'Z':
  502. offset_end = self.bam.index('\\0',offset+3)+1
  503. tag_data = self.bam[offset+3:offset_end-1]
  504. elif tag_type in CtoPy:
  505. offset_end = offset+3+py4py[tag_type]
  506. tag_data = unpack(CtoPy[tag_type],self.bam[offset+3:offset_end])[0]
  507. elif tag_type == 'B':
  508. offset_end = offset+8+(unpack('<i',self.bam[offset+4:offset+8])[0]*py4py[self.bam[offset+3]])
  509. tag_data = array(self.bam[offset+3] , self.bam[offset+8:offset_end] )
  510. else:
  511. print 'PYBAM ERROR: I dont know how to parse BAM tags in this format: ',repr(tag_type)
  512. print ' This is simply because I never saw this kind of tag during development.'
  513. print ' If you could mail the following chunk of text to john at john.uk.com, i will fix this up for everyone :)'
  514. print repr(tag_type),repr(self.bam[offset+3:end])
  515. exit()
  516. sam_tags_list.append((tag_name,tag_type,tag_data))
  517. offset = offset_end'''
  518. if 'sam_tags_string' in dependencies:
  519. code += "\n sam_tags_string = '\t'.join(A + ':' + ('i' if B in 'cCsSI' else B) + ':' + ((C.typecode + ',' + ','.join(map(str,C))) if type(C)==array else str(C)) for A,B,C in self.sam_tags_list)"
  520. if 'sam' in dependencies:
  521. code += "\n sam = sam_qname + '\t' + str(sam_flag) + '\t' + sam_rname + '\t' + str(sam_pos1) + '\t' + str(sam_mapq) + '\t' + ('*' if sam_cigar_string == '' else sam_cigar_string) + '\t' + ('=' if bam_refID == bam_next_refID else sam_rnext) + '\t' + str(sam_pnext1) + '\t' + str(sam_tlen) + '\t' + sam_seq + '\t' + sam_qual + '\t' + sam_tags_string"
  522. code += '\n yield ' + ','.join([x for x in fields]) + '\n'
  523. self._static_parser_code = code # "code" is the static parser's code as a string (a function called "parser")
  524. exec_dict = { # This dictionary stores things the exec'd code needs to know about, and will store the compiled function after exec()
  525. 'unpack':unpack,
  526. 'array':array,
  527. 'dna_codes':dna_codes,
  528. 'CtoPy':CtoPy,
  529. 'py4py':py4py,
  530. 'cigar_codes':cigar_codes
  531. }
  532. exec(code, exec_dict) # exec() compiles "code" to real code, creating the "parser" function and adding it to exec_dict['parser']
  533. return exec_dict['parser']
  534. if fields:
  535. static_parser = compile_parser(self,fields)(self)
  536. def next_read(): return next(self._new_entry)
  537. else:
  538. def next_read(): return next(self._new_entry)
  539. self.next = next_read
  540. def __next__(self):
  541. return next(self._new_entry)
  542. def __iter__(self): return self
  543. def __str__(self): return self.sam
  544. ## Methods to pull out raw bam data from entry (so still in its binary encoding). This can be helpful in some scenarios.
  545. @property
  546. def bam_block_size(self): return self.bam[ : 4 ]
  547. @property
  548. def bam_refID(self): return self.bam[ 4 : 8 ]
  549. @property
  550. def bam_pos(self): return self.bam[ 8 : 12 ]
  551. @property
  552. def bam_l_read_name(self): return self.bam[ 12 : 13 ]
  553. @property
  554. def bam_mapq(self): return self.bam[ 13 : 14 ]
  555. @property
  556. def bam_bin(self): return self.bam[ 14 : 16 ]
  557. @property
  558. def bam_n_cigar_op(self): return self.bam[ 16 : 18 ]
  559. @property
  560. def bam_flag(self): return self.bam[ 18 : 20 ]
  561. @property
  562. def bam_l_seq(self): return self.bam[ 20 : 24 ]
  563. @property
  564. def bam_next_refID(self): return self.bam[ 24 : 28 ]
  565. @property
  566. def bam_pnext(self): return self.bam[ 28 : 32 ]
  567. @property
  568. def bam_tlen(self): return self.bam[ 32 : 36 ]
  569. @property
  570. def bam_qname(self): return self.bam[ 36 : self._end_of_qname ]
  571. @property
  572. def bam_cigar(self): return self.bam[ self._end_of_qname : self._end_of_cigar ]
  573. @property
  574. def bam_seq(self): return self.bam[ self._end_of_cigar : self._end_of_seq ]
  575. @property
  576. def bam_qual(self): return self.bam[ self._end_of_seq : self._end_of_qual ]
  577. @property
  578. def bam_tags(self): return self.bam[ self._end_of_qual : ]
  579. @property
  580. def sam_refID(self): return unpack( '<i', self.bam[ 4 : 8 ] )[0]
  581. @property
  582. def sam_pos0(self): return unpack( '<i', self.bam[ 8 : 12 ] )[0]
  583. @property
  584. def sam_l_read_name(self): return unpack( '<B', self.bam[ 12 : 13 ] )[0]
  585. @property
  586. def sam_mapq(self): return unpack( '<B', self.bam[ 13 : 14 ] )[0]
  587. @property
  588. def sam_bin(self): return unpack( '<H', self.bam[ 14 : 16 ] )[0]
  589. @property
  590. def sam_n_cigar_op(self): return unpack( '<H', self.bam[ 16 : 18 ] )[0]
  591. @property
  592. def sam_flag(self): return unpack( '<H', self.bam[ 18 : 20 ] )[0]
  593. @property
  594. def sam_l_seq(self): return unpack( '<i', self.bam[ 20 : 24 ] )[0]
  595. @property
  596. def sam_next_refID(self): return unpack( '<i', self.bam[ 24 : 28 ] )[0]
  597. @property
  598. def sam_pnext0(self): return unpack( '<i', self.bam[ 28 : 32 ] )[0]
  599. @property
  600. def sam_tlen(self): return unpack( '<i', self.bam[ 32 : 36 ] )[0]
  601. @property
  602. def sam_qname(self): return self.bam[ 36 : self._end_of_qname -1 ].decode() # -1 to remove trailing NUL byte
  603. @property
  604. def sam_cigar_list(self): return [ (cig >> 4 , cigar_codes[cig & 0b1111] ) for cig in array('I', self.bam[self._end_of_qname : self._end_of_cigar ])]
  605. @property
  606. def sam_cigar_string(self): return ''.join( [ str(cig >> 4) + cigar_codes[cig & 0b1111] for cig in array('I', self.bam[self._end_of_qname : self._end_of_cigar ])])
  607. @property
  608. def sam_seq(self): return ''.join( [ dna_codes[dna >> 4] + dna_codes[dna & 0b1111] for dna in array('B', self.bam[self._end_of_cigar : self._end_of_seq ])])[:self.sam_l_seq] # As DNA is 4 bits packed 2-per-byte, there might be a trailing '0000', so we can either
  609. @property
  610. def sam_qual(self):
  611. return ''.join( [ chr(quality + 33) for quality in self.bam[self._end_of_seq : self._end_of_qual ]])
  612. @property
  613. def sam_tags_list(self):
  614. result = []
  615. offset = self._end_of_qual
  616. while offset != len(self.bam):
  617. tag_name = self.bam[offset:offset+2].decode()
  618. tag_type = chr(self.bam[offset+2])
  619. if tag_type == 'Z':
  620. offset_end = self.bam.index(b'\x00',offset+3)+1
  621. tag_data = self.bam[offset+3:offset_end-1].decode()
  622. elif tag_type in CtoPy:
  623. offset_end = offset+3+py4py[tag_type]
  624. tag_data = unpack(CtoPy[tag_type],self.bam[offset+3:offset_end])[0]
  625. elif tag_type == 'B':
  626. offset_end = offset+8+(unpack('<i',self.bam[offset+4:offset+8])[0]*py4py[self.bam[offset+3]])
  627. tag_data = array(self.bam[offset+3] , self.bam[offset+8:offset_end] )
  628. else:
  629. print('PYBAM ERROR: I dont know how to parse BAM tags in this format: ',repr(tag_type))
  630. print(' This is simply because I never saw this kind of tag during development.')
  631. print(' If you could mail the following chunk of text to john at john.uk.com, ill fix this up :)')
  632. print(repr(tag_type),repr(self.bam[offset+3:end]))
  633. exit()
  634. result.append((tag_name,tag_type,tag_data))
  635. offset = offset_end
  636. return result
  637. @property
  638. def sam_tags_string(self):
  639. return '\t'.join(A + ':' + ('i' if B in 'cCsSI' else B) + ':' + ((C.typecode + ',' + ','.join(map(str,C))) if type(C)==array else str(C)) for A,B,C in self.sam_tags_list)
  640. ## BONUS methods - methods that mimic how samtools works.
  641. @property
  642. def sam_pos1(self): return 0 if self.sam_pos0 < 0 else self.sam_pos0 + 1
  643. @property
  644. def sam_pnext1(self): return 0 if self.sam_pnext0 < 0 else self.sam_pnext0 + 1
  645. @property
  646. def sam_rname(self): return '*' if self.sam_refID < 0 else self.file_chromosomes[self.sam_refID ]
  647. @property
  648. def sam_rnext(self): return '*' if self.sam_next_refID < 0 else self.file_chromosomes[self.sam_next_refID]
  649. @property
  650. def sam(self): return (
  651. self.sam_qname + '\t' +
  652. str(self.sam_flag) + '\t' +
  653. self.sam_rname + '\t' +
  654. str(self.sam_pos1) + '\t' +
  655. str(self.sam_mapq) + '\t' +
  656. ('*' if self.sam_cigar_string == '' else self.sam_cigar_string) + '\t' +
  657. ('=' if self.bam_refID == self.bam_next_refID else self.sam_rnext) + '\t' +
  658. str(self.sam_pnext1) + '\t' +
  659. str(self.sam_tlen) + '\t' +
  660. self.sam_seq + '\t' +
  661. self.sam_qual + '\t' +
  662. self.sam_tags_string
  663. )
  664. ## Internal methods - methods used to calculate where variable-length blocks start/end
  665. @property
  666. def _end_of_qname(self): return self.sam_l_read_name + 36 # fixed-length stuff at the beginning takes up 36 bytes.
  667. @property
  668. def _end_of_cigar(self): return self._end_of_qname + (4*self.sam_n_cigar_op) # 4 bytes per n_cigar_op
  669. @property
  670. def _end_of_seq(self): return self._end_of_cigar + (-((-self.sam_l_seq)//2)) # {blurgh}
  671. @property
  672. def _end_of_qual(self): return self._end_of_seq + self.sam_l_seq # qual has the same length as seq
  673. def __del__(self):
  674. if self._subprocess.returncode is None: self._subprocess.kill()
  675. self._file.close()
  676. class PybamWarn(Exception): pass
  677. class PybamError(Exception): pass