__init__.py 47 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305
  1. # pylint: disable=R0913, R0914, C0301
  2. """
  3. Fasta file -> Faidx -> Fasta -> FastaRecord -> Sequence
  4. """
  5. from __future__ import division
  6. import os
  7. import re
  8. import string
  9. import sys
  10. import shutil
  11. import warnings
  12. from collections import namedtuple
  13. from itertools import islice
  14. from math import ceil
  15. from os.path import getmtime
  16. from threading import Lock
  17. from six import PY2, PY3, integer_types, string_types
  18. from six.moves import zip_longest
  19. try:
  20. from collections import OrderedDict
  21. except ImportError: #python 2.6
  22. from ordereddict import OrderedDict
  23. if sys.version_info > (3, ):
  24. buffer = memoryview
  25. dna_bases = re.compile(r'([ACTGNactgnYRWSKMDVHBXyrwskmdvhbx]+)')
  26. __version__ = '0.6.2'
  27. class KeyFunctionError(ValueError):
  28. """Raised if the key_function argument is invalid."""
  29. class FastaIndexingError(Exception):
  30. """Raised if we encounter malformed FASTA that prevents indexing."""
  31. class IndexNotFoundError(IOError):
  32. """Raised if read_fai cannot open the index file."""
  33. class FastaNotFoundError(IOError):
  34. """Raised if the fasta file cannot be opened."""
  35. class FetchError(IndexError):
  36. """Raised if a request to fetch a FASTA sequence cannot be fulfilled."""
  37. class BedError(ValueError):
  38. """Indicates a malformed BED entry."""
  39. class RegionError(Exception):
  40. # This exception class is currently unused, but has been retained for
  41. # backwards compatibility.
  42. """A region error occurred."""
  43. class UnsupportedCompressionFormat(IOError):
  44. """
  45. Raised when a FASTA file is given with a recognized but unsupported
  46. compression extension.
  47. """
  48. class Sequence(object):
  49. """
  50. name = FASTA entry name
  51. seq = FASTA sequence
  52. start, end = coordinates of subsequence (optional)
  53. comp = boolean switch for complement property
  54. """
  55. def __init__(self, name='', seq='', start=None, end=None, comp=False):
  56. self.name = name
  57. self.seq = seq
  58. self.start = start
  59. self.end = end
  60. self.comp = comp
  61. assert isinstance(name, string_types)
  62. assert isinstance(seq, string_types)
  63. def __getitem__(self, n):
  64. """ Returns a sliced version of Sequence
  65. >>> x = Sequence(name='chr1', seq='ATCGTA', start=1, end=6)
  66. >>> x
  67. >chr1:1-6
  68. ATCGTA
  69. >>> x[:3]
  70. >chr1:1-3
  71. ATC
  72. >>> x[3:]
  73. >chr1:4-6
  74. GTA
  75. >>> x[1:-1]
  76. >chr1:2-5
  77. TCGT
  78. >>> x[::-1]
  79. >chr1:6-1
  80. ATGCTA
  81. >>> x[::-3]
  82. >chr1
  83. AC
  84. >>> x = Sequence(name='chr1', seq='ATCGTA', start=0, end=6)
  85. >>> x
  86. >chr1:0-6
  87. ATCGTA
  88. >>> x[:3]
  89. >chr1:0-3
  90. ATC
  91. >>> x[3:]
  92. >chr1:3-6
  93. GTA
  94. >>> x[1:-1]
  95. >chr1:1-5
  96. TCGT
  97. >>> x[::-1]
  98. >chr1:6-0
  99. ATGCTA
  100. >>> x[::-3]
  101. >chr1
  102. AC
  103. """
  104. if self.start is None or self.end is None or len(self.seq) == 0:
  105. correction_factor = 0
  106. elif len(
  107. self.seq
  108. ) == abs(self.end - self.start) + 1: # determine coordinate system
  109. one_based = True
  110. correction_factor = -1
  111. elif len(self.seq) == abs(self.end - self.start):
  112. one_based = False
  113. correction_factor = 0
  114. elif len(self.seq) != abs(self.end - self.start):
  115. raise ValueError(
  116. "Coordinates (Sequence.start=%s and Sequence.end=%s) imply a different length than Sequence.seq (len=%s). Did you modify Sequence.seq?"
  117. % (self.start, self.end, len(self.seq)))
  118. if isinstance(n, slice):
  119. slice_start, slice_stop, slice_step = n.indices(len(self))
  120. if self.start is None or self.end is None: # there should never be self.start != self.end == None
  121. start = None
  122. end = None
  123. return self.__class__(self.name, self.seq[n], start, end,
  124. self.comp)
  125. self_end, self_start = (self.end, self.start)
  126. if abs(slice_step) > 1:
  127. start = None
  128. end = None
  129. elif slice_step == -1: # flip the coordinates when we reverse
  130. if slice_stop == -1:
  131. slice_stop = 0
  132. start = self_end - slice_stop
  133. end = self_start + slice_stop
  134. #print(locals())
  135. else:
  136. start = self_start + slice_start
  137. end = self_start + slice_stop + correction_factor
  138. return self.__class__(self.name, self.seq[n], start, end,
  139. self.comp)
  140. elif isinstance(n, integer_types):
  141. if n < 0:
  142. n = len(self) + n
  143. if self.start:
  144. return self.__class__(self.name, self.seq[n], self.start + n,
  145. self.start + n, self.comp)
  146. else:
  147. return self.__class__(self.name, self.seq[n], self.comp)
  148. def __str__(self):
  149. return self.seq
  150. def __neg__(self):
  151. """ Returns the reverse compliment of sequence
  152. >>> x = Sequence(name='chr1', seq='ATCGTA', start=1, end=6)
  153. >>> x
  154. >chr1:1-6
  155. ATCGTA
  156. >>> y = -x
  157. >>> y
  158. >chr1:6-1 (complement)
  159. TACGAT
  160. >>> -y
  161. >chr1:1-6
  162. ATCGTA
  163. """
  164. return self[::-1].complement
  165. def __repr__(self):
  166. return '\n'.join([''.join(['>', self.fancy_name]), self.seq])
  167. def __len__(self):
  168. """
  169. >>> len(Sequence('chr1', 'ACT'))
  170. 3
  171. """
  172. return len(self.seq)
  173. def __eq__(self, other):
  174. """
  175. >>> Sequence('chr1', 'ACT') == 'ACT'
  176. True
  177. """
  178. return str(self) == str(other)
  179. @property
  180. def fancy_name(self):
  181. """ Return the fancy name for the sequence, including start, end, and complementation.
  182. >>> x = Sequence(name='chr1', seq='ATCGTA', start=1, end=6, comp=True)
  183. >>> x.fancy_name
  184. 'chr1:1-6 (complement)'
  185. """
  186. name = self.name
  187. if self.start is not None and self.end is not None:
  188. name = ':'.join([name, '-'.join([str(self.start), str(self.end)])])
  189. if self.comp:
  190. name += ' (complement)'
  191. return name
  192. @property
  193. def long_name(self):
  194. """ DEPRECATED: Use fancy_name instead.
  195. Return the fancy name for the sequence, including start, end, and complementation.
  196. >>> x = Sequence(name='chr1', seq='ATCGTA', start=1, end=6, comp=True)
  197. >>> x.long_name
  198. 'chr1:1-6 (complement)'
  199. """
  200. msg = "The `Sequence.long_name` property is deprecated, and will be removed in future versions. Please use `Sequence.fancy_name` instead."
  201. warnings.warn(msg, DeprecationWarning, stacklevel=2)
  202. return self.fancy_name
  203. @property
  204. def complement(self):
  205. """ Returns the compliment of self.
  206. >>> x = Sequence(name='chr1', seq='ATCGTA')
  207. >>> x.complement
  208. >chr1 (complement)
  209. TAGCAT
  210. """
  211. comp = self.__class__(
  212. self.name, complement(self.seq), start=self.start, end=self.end)
  213. comp.comp = False if self.comp else True
  214. return comp
  215. @property
  216. def reverse(self):
  217. """ Returns the reverse of self.
  218. >>> x = Sequence(name='chr1', seq='ATCGTA')
  219. >>> x.reverse
  220. >chr1
  221. ATGCTA
  222. """
  223. return self[::-1]
  224. @property
  225. def orientation(self):
  226. """ get the orientation forward=1, reverse=-1
  227. >>> x = Sequence(name='chr1', seq='ATCGTA', start=1, end=6)
  228. >>> x.orientation
  229. 1
  230. >>> x.complement.orientation is None
  231. True
  232. >>> x[::-1].orientation is None
  233. True
  234. >>> x = -x
  235. >>> x.orientation
  236. -1
  237. """
  238. if self.start < self.end and not self.comp:
  239. return 1
  240. elif self.start > self.end and self.comp:
  241. return -1
  242. else:
  243. return None
  244. @property
  245. def gc(self):
  246. """ Return the GC content of seq as a float
  247. >>> x = Sequence(name='chr1', seq='ATCGTA')
  248. >>> y = round(x.gc, 2)
  249. >>> y == 0.33
  250. True
  251. """
  252. g = self.seq.count('G')
  253. g += self.seq.count('g')
  254. c = self.seq.count('C')
  255. c += self.seq.count('c')
  256. return (g + c) / len(self.seq)
  257. class IndexRecord(
  258. namedtuple('IndexRecord',
  259. ['rlen', 'offset', 'lenc', 'lenb', 'bend', 'prev_bend'])):
  260. __slots__ = ()
  261. def __getitem__(self, key):
  262. if type(key) == str:
  263. return getattr(self, key)
  264. return tuple.__getitem__(self, key)
  265. def __str__(self):
  266. return "{rlen:d}\t{offset:d}\t{lenc:d}\t{lenb:d}".format(
  267. **self._asdict())
  268. def __len__(self):
  269. return self.rlen
  270. class Faidx(object):
  271. """ A python implementation of samtools faidx FASTA indexing """
  272. def __init__(self,
  273. filename,
  274. default_seq=None,
  275. key_function=lambda x: x,
  276. as_raw=False,
  277. strict_bounds=False,
  278. read_ahead=None,
  279. mutable=False,
  280. split_char=None,
  281. duplicate_action="stop",
  282. filt_function=lambda x: True,
  283. one_based_attributes=True,
  284. read_long_names=False,
  285. sequence_always_upper=False,
  286. rebuild=True,
  287. build_index=True):
  288. """
  289. filename: name of fasta file
  290. key_function: optional callback function which should return a unique
  291. key for the self.index dictionary when given rname.
  292. as_raw: optional parameter to specify whether to return sequences as a
  293. Sequence() object or as a raw string.
  294. Default: False (i.e. return a Sequence() object).
  295. """
  296. self.filename = filename
  297. if filename.lower().endswith('.bgz') or filename.lower().endswith(
  298. '.gz'):
  299. # Only try to import Bio if we actually need the bgzf reader.
  300. try:
  301. from Bio import bgzf
  302. from Bio import __version__ as bgzf_version
  303. from distutils.version import LooseVersion
  304. if LooseVersion(bgzf_version) < LooseVersion('1.73'):
  305. raise ImportError
  306. except ImportError:
  307. raise ImportError(
  308. "BioPython >= 1.73 must be installed to read block gzip files.")
  309. else:
  310. self._fasta_opener = bgzf.open
  311. self._bgzf = True
  312. elif filename.lower().endswith('.bz2') or filename.lower().endswith(
  313. '.zip'):
  314. raise UnsupportedCompressionFormat(
  315. "Compressed FASTA is only supported in BGZF format. Use "
  316. "bgzip to compresss your FASTA.")
  317. else:
  318. self._fasta_opener = open
  319. self._bgzf = False
  320. try:
  321. self.file = self._fasta_opener(filename, 'r+b'
  322. if mutable else 'rb')
  323. except (ValueError, IOError) as e:
  324. if str(e).find('BGZF') > -1:
  325. raise UnsupportedCompressionFormat(
  326. "Compressed FASTA is only supported in BGZF format. Use "
  327. "the samtools bgzip utility (instead of gzip) to "
  328. "compress your FASTA.")
  329. else:
  330. raise FastaNotFoundError(
  331. "Cannot read FASTA file %s" % filename)
  332. self.indexname = filename + '.fai'
  333. self.read_long_names = read_long_names
  334. self.key_function = key_function
  335. try:
  336. key_fn_test = self.key_function(
  337. "TestingReturnType of_key_function")
  338. if not isinstance(key_fn_test, string_types):
  339. raise KeyFunctionError(
  340. "key_function argument should return a string, not {0}".
  341. format(type(key_fn_test)))
  342. except Exception as e:
  343. pass
  344. self.filt_function = filt_function
  345. assert duplicate_action in ("stop", "first", "last", "longest",
  346. "shortest", "drop")
  347. self.duplicate_action = duplicate_action
  348. self.as_raw = as_raw
  349. self.default_seq = default_seq
  350. if self._bgzf and self.default_seq is not None:
  351. raise FetchError(
  352. "The default_seq argument is not supported with using BGZF compression. Please decompress your FASTA file and try again."
  353. )
  354. if self._bgzf:
  355. self.strict_bounds = True
  356. else:
  357. self.strict_bounds = strict_bounds
  358. self.split_char = split_char
  359. self.one_based_attributes = one_based_attributes
  360. self.sequence_always_upper = sequence_always_upper
  361. self.index = OrderedDict()
  362. self.lock = Lock()
  363. self.buffer = dict((('seq', None), ('name', None), ('start', None),
  364. ('end', None)))
  365. if not read_ahead or isinstance(read_ahead, integer_types):
  366. self.read_ahead = read_ahead
  367. elif not isinstance(read_ahead, integer_types):
  368. raise ValueError("read_ahead value must be int, not {0}".format(
  369. type(read_ahead)))
  370. self.mutable = mutable
  371. with self.lock: # lock around index generation so only one thread calls method
  372. try:
  373. if os.path.exists(self.indexname) and getmtime(
  374. self.indexname) >= getmtime(self.filename):
  375. self.read_fai()
  376. elif os.path.exists(self.indexname) and getmtime(
  377. self.indexname) < getmtime(
  378. self.filename) and not rebuild:
  379. self.read_fai()
  380. warnings.warn(
  381. "Index file {0} is older than FASTA file {1}.".format(
  382. self.indexname, self.filename), RuntimeWarning)
  383. elif build_index:
  384. self.build_index()
  385. self.read_fai()
  386. else:
  387. self.read_fai()
  388. except FastaIndexingError:
  389. self.file.close()
  390. os.remove(self.indexname + '.tmp')
  391. raise
  392. except Exception:
  393. # Handle potential exceptions other than 'FastaIndexingError'
  394. self.file.close()
  395. raise
  396. def __contains__(self, region):
  397. if not self.buffer['name']:
  398. return False
  399. name, start, end = region
  400. if self.buffer['name'] == name and self.buffer['start'] <= start and self.buffer['end'] >= end:
  401. return True
  402. else:
  403. return False
  404. def __repr__(self):
  405. return 'Faidx("%s")' % (self.filename)
  406. def _index_as_string(self):
  407. """ Returns the string representation of the index as iterable """
  408. for k, v in self.index.items():
  409. yield '{k}\t{v}\n'.format(k=k, v=str(v))
  410. def read_fai(self):
  411. try:
  412. with open(self.indexname) as index:
  413. prev_bend = 0
  414. drop_keys = []
  415. for line in index:
  416. line = line.rstrip()
  417. rname, rlen, offset, lenc, lenb = line.split('\t')
  418. rlen, offset, lenc, lenb = map(int,
  419. (rlen, offset, lenc, lenb))
  420. newlines = int(ceil(rlen / lenc) * (lenb - lenc)) if lenc else 0
  421. bend = offset + newlines + rlen
  422. rec = IndexRecord(rlen, offset, lenc, lenb, bend,
  423. prev_bend)
  424. if self.read_long_names:
  425. rname = self._long_name_from_index_record(rec)
  426. if self.split_char:
  427. rname = filter(self.filt_function,
  428. self.key_function(rname).split(
  429. self.split_char))
  430. else:
  431. # filter must act on an iterable
  432. rname = filter(self.filt_function,
  433. [self.key_function(rname)])
  434. for key in rname: # mdshw5/pyfaidx/issues/64
  435. if key in self.index:
  436. if self.duplicate_action == "stop":
  437. raise ValueError('Duplicate key "%s"' % key)
  438. elif self.duplicate_action == "first":
  439. continue
  440. elif self.duplicate_action == "last":
  441. self.index[key] = rec
  442. elif self.duplicate_action == "longest":
  443. if len(rec) > len(self.index[key]):
  444. self.index[key] = rec
  445. elif self.duplicate_action == "shortest":
  446. if len(rec) < len(self.index[key]):
  447. self.index[key] = rec
  448. elif self.duplicate_action == "drop":
  449. if key not in drop_keys:
  450. drop_keys.append(key)
  451. else:
  452. self.index[key] = rec
  453. prev_bend = bend
  454. for dup in drop_keys:
  455. self.index.pop(dup, None)
  456. except IOError:
  457. raise IndexNotFoundError(
  458. "Could not read index file %s" % self.indexname)
  459. def build_index(self):
  460. try:
  461. with self._fasta_opener(self.filename, 'rb') as fastafile:
  462. with open(self.indexname + '.tmp', 'w') as indexfile:
  463. rname = None # reference sequence name
  464. offset = 0 # binary offset of end of current line
  465. rlen = 0 # reference character length
  466. blen = 0 # binary line length (includes newline)
  467. clen = 0 # character line length
  468. bad_lines = [] # lines > || < than blen
  469. thisoffset = offset
  470. valid_entry = False
  471. lastline = None
  472. for i, line in enumerate(fastafile):
  473. line_blen = len(line)
  474. line = line.decode()
  475. line_clen = len(line.rstrip('\n\r'))
  476. lastline = i
  477. # write an index line
  478. if line[0] == '>':
  479. valid_entry = check_bad_lines(
  480. rname, bad_lines, i - 1)
  481. if valid_entry and i > 0:
  482. indexfile.write(
  483. "{0}\t{1:d}\t{2:d}\t{3:d}\t{4:d}\n".format(
  484. rname, rlen, thisoffset, clen, blen))
  485. elif not valid_entry:
  486. raise FastaIndexingError(
  487. "Line length of fasta"
  488. " file is not "
  489. "consistent! "
  490. "Inconsistent line found in >{0} at "
  491. "line {1:n}.".format(
  492. rname, bad_lines[0][0] + 1))
  493. blen = 0
  494. rlen = 0
  495. clen = 0
  496. bad_lines = []
  497. try: # must catch empty deflines (actually these might be okay: https://github.com/samtools/htslib/pull/258)
  498. rname = line.rstrip('\n\r')[1:].split()[
  499. 0] # duplicates are detected with read_fai
  500. except IndexError:
  501. raise FastaIndexingError(
  502. "Bad sequence name %s at line %s." %
  503. (line.rstrip('\n\r'), str(i)))
  504. offset += line_blen
  505. thisoffset = fastafile.tell(
  506. ) if self._bgzf else offset
  507. else: # check line and advance offset
  508. if not blen:
  509. blen = line_blen
  510. if not clen:
  511. clen = line_clen
  512. # only one short line should be allowed
  513. # before we hit the next header, and it
  514. # should be the last line in the entry
  515. if line_blen != blen or line_blen == 1:
  516. bad_lines.append((i, line_blen))
  517. offset += line_blen
  518. rlen += line_clen
  519. # check that we find at least 1 valid FASTA record
  520. if not valid_entry:
  521. raise FastaIndexingError(
  522. "The FASTA file %s does not contain a valid sequence. "
  523. "Check that sequence definition lines start with '>'." % self.filename)
  524. # write the final index line, if there is one.
  525. if lastline is not None:
  526. valid_entry = check_bad_lines(
  527. rname, bad_lines, lastline
  528. ) # advance index since we're at the end of the file
  529. if valid_entry:
  530. indexfile.write(
  531. "{0:s}\t{1:d}\t{2:d}\t{3:d}\t{4:d}\n".format(
  532. rname, rlen, thisoffset, clen, blen))
  533. else:
  534. raise FastaIndexingError(
  535. "Line length of fasta"
  536. " file is not "
  537. "consistent! "
  538. "Inconsistent line found in >{0} at "
  539. "line {1:n}.".format(rname,
  540. bad_lines[0][0] + 1))
  541. shutil.move(self.indexname + '.tmp', self.indexname)
  542. except (IOError, FastaIndexingError) as e:
  543. if isinstance(e, IOError):
  544. raise IOError(
  545. "%s may not be writable. Please use Fasta(rebuild=False), Faidx(rebuild=False) or faidx --no-rebuild."
  546. % self.indexname)
  547. elif isinstance(e, FastaIndexingError):
  548. raise e
  549. def write_fai(self):
  550. with self.lock:
  551. with open(self.indexname, 'w') as outfile:
  552. for line in self._index_as_string():
  553. outfile.write(line)
  554. def from_buffer(self, start, end):
  555. i_start = start - self.buffer['start'] # want [0, 1) coordinates from [1, 1] coordinates
  556. i_end = end - self.buffer['start'] + 1
  557. return self.buffer['seq'][i_start:i_end]
  558. def fill_buffer(self, name, start, end):
  559. try:
  560. seq = self.from_file(name, start, end)
  561. self.buffer['seq'] = seq
  562. self.buffer['start'] = start
  563. self.buffer['end'] = end
  564. self.buffer['name'] = name
  565. except FetchError:
  566. pass
  567. def fetch(self, name, start, end):
  568. if self.read_ahead and not (name, start, end) in self:
  569. self.fill_buffer(name, start, end + self.read_ahead)
  570. if (name, start, end) in self:
  571. seq = self.from_buffer(start, end)
  572. else:
  573. seq = self.from_file(name, start, end)
  574. return self.format_seq(seq, name, start, end)
  575. def from_file(self, rname, start, end, internals=False):
  576. """ Fetch the sequence ``[start:end]`` from ``rname`` using 1-based coordinates
  577. 1. Count newlines before start
  578. 2. Count newlines to end
  579. 3. Difference of 1 and 2 is number of newlines in [start:end]
  580. 4. Seek to start position, taking newlines into account
  581. 5. Read to end position, return sequence
  582. """
  583. assert start == int(start)
  584. assert end == int(end)
  585. try:
  586. i = self.index[rname]
  587. except KeyError:
  588. raise FetchError("Requested rname {0} does not exist! "
  589. "Please check your FASTA file.".format(rname))
  590. start0 = start - 1 # make coordinates [0,1)
  591. if start0 < 0:
  592. raise FetchError(
  593. "Requested start coordinate must be greater than 1.")
  594. seq_len = end - start0
  595. # Calculate offset (https://github.com/samtools/htslib/blob/20238f354894775ed22156cdd077bc0d544fa933/faidx.c#L398)
  596. newlines_before = int(
  597. (start0 - 1) / i.lenc) if start0 > 0 and i.lenc else 0
  598. newlines_to_end = int(end / i.lenc) if i.lenc else 0
  599. newlines_inside = newlines_to_end - newlines_before
  600. newline_blen = i.lenb - i.lenc
  601. seq_blen = newlines_inside * newline_blen + seq_len
  602. bstart = i.offset + newlines_before * newline_blen + start0
  603. if seq_blen < 0 and self.strict_bounds:
  604. raise FetchError("Requested coordinates start={0:n} end={1:n} are "
  605. "invalid.\n".format(start, end))
  606. elif end > i.rlen and self.strict_bounds:
  607. raise FetchError("Requested end coordinate {0:n} outside of {1}. "
  608. "\n".format(end, rname))
  609. with self.lock:
  610. if self._bgzf: # We can't add to virtual offsets, so we need to read from the beginning of the record and trim the beginning if needed
  611. self.file.seek(i.offset)
  612. chunk = start0 + (newlines_before * newline_blen) + (newlines_inside * newline_blen) + seq_len
  613. chunk_seq = self.file.read(chunk).decode()
  614. seq = chunk_seq[start0 + newlines_before:]
  615. else:
  616. self.file.seek(bstart)
  617. # If the requested sequence exceeds len(FastaRecord), return as much as possible
  618. if bstart + seq_blen > i.bend and not self.strict_bounds:
  619. seq_blen = i.bend - bstart
  620. # Otherwise it should be safe to read the sequence
  621. if seq_blen > 0:
  622. seq = self.file.read(seq_blen).decode()
  623. # If the requested sequence is negative, we will pad the empty string with default_seq.
  624. # This was changed to support #155 with strict_bounds=True.
  625. elif seq_blen <= 0:
  626. seq = ''
  627. if not internals:
  628. return seq.replace('\n', '').replace('\r', '')
  629. else:
  630. return (seq, locals())
  631. def format_seq(self, seq, rname, start, end):
  632. start0 = start - 1
  633. if len(
  634. seq
  635. ) < end - start0 and self.default_seq: # Pad missing positions with default_seq
  636. pad_len = end - start0 - len(seq)
  637. seq = ''.join([seq, pad_len * self.default_seq])
  638. else: # Return less than requested range
  639. end = start0 + len(seq)
  640. if self.sequence_always_upper:
  641. seq = seq.upper()
  642. if not self.one_based_attributes:
  643. start = start0
  644. if self.as_raw:
  645. return seq
  646. else:
  647. return Sequence(
  648. name=rname, start=int(start), end=int(end), seq=seq)
  649. def to_file(self, rname, start, end, seq):
  650. """ Write sequence in region from start-end, overwriting current
  651. contents of the FASTA file. """
  652. if not self.mutable:
  653. raise IOError(
  654. "Write attempted for immutable Faidx instance. Set mutable=True to modify original FASTA."
  655. )
  656. file_seq, internals = self.from_file(rname, start, end, internals=True)
  657. with self.lock:
  658. if len(seq) != len(file_seq) - internals['newlines_inside']:
  659. raise IOError(
  660. "Specified replacement sequence needs to have the same length as original."
  661. )
  662. elif len(seq) == len(file_seq) - internals['newlines_inside']:
  663. line_len = internals['i'].lenc
  664. if '\r\n' in file_seq:
  665. newline_char = '\r\n'
  666. elif '\r' in file_seq:
  667. newline_char = '\r'
  668. else:
  669. newline_char = '\n'
  670. self.file.seek(internals['bstart'])
  671. if internals['newlines_inside'] == 0:
  672. self.file.write(seq.encode())
  673. elif internals['newlines_inside'] > 0:
  674. n = 0
  675. m = file_seq.index(newline_char)
  676. while m < len(seq):
  677. self.file.write(''.join([seq[n:m], newline_char]).encode())
  678. n = m
  679. m += line_len
  680. self.file.write(seq[n:].encode())
  681. self.file.flush()
  682. def get_long_name(self, rname):
  683. """ Return the full sequence defline and description. External method using the self.index """
  684. index_record = self.index[rname]
  685. if self._bgzf:
  686. return self._long_name_from_bgzf(index_record)
  687. else:
  688. return self._long_name_from_index_record(index_record)
  689. def _long_name_from_index_record(self, index_record):
  690. """ Return the full sequence defline and description. Internal method passing IndexRecord """
  691. prev_bend = index_record.prev_bend
  692. defline_end = index_record.offset
  693. self.file.seek(prev_bend)
  694. return self.file.read(defline_end - prev_bend).decode()[1:-1]
  695. def _long_name_from_bgzf(self, index_record):
  696. """ Return the full sequence defline and description. Internal method passing IndexRecord
  697. This method is present for compatibility with BGZF files, since we cannot subtract their offsets.
  698. It may be possible to implement a more efficient method. """
  699. raise NotImplementedError(
  700. "FastaRecord.long_name and Fasta(read_long_names=True) "
  701. "are not supported currently for BGZF compressed files.")
  702. prev_bend = index_record.prev_bend
  703. self.file.seek(prev_bend)
  704. defline = []
  705. while True:
  706. chunk = self.file.read(4096).decode()
  707. defline.append(chunk)
  708. if '\n' in chunk or '\r' in chunk:
  709. break
  710. return ''.join(defline)[1:].split('\n\r')[0]
  711. def close(self):
  712. self.__exit__()
  713. def __enter__(self):
  714. return self
  715. def __exit__(self, *args):
  716. self.file.close()
  717. class FastaRecord(object):
  718. __slots__ = ['name', '_fa']
  719. def __init__(self, name, fa):
  720. self.name = name
  721. self._fa = fa
  722. def __getitem__(self, n):
  723. """Return sequence from region [start, end)
  724. Coordinates are 0-based, end-exclusive."""
  725. try:
  726. if isinstance(n, slice):
  727. start, stop, step = n.start, n.stop, n.step
  728. if start is None:
  729. start = 0
  730. if stop is None:
  731. stop = len(self)
  732. if stop < 0:
  733. stop = len(self) + stop
  734. if start < 0:
  735. start = len(self) + start
  736. return self._fa.get_seq(self.name, start + 1, stop)[::step]
  737. elif isinstance(n, integer_types):
  738. if n < 0:
  739. n = len(self) + n
  740. return self._fa.get_seq(self.name, n + 1, n + 1)
  741. except FetchError:
  742. raise
  743. def __iter__(self):
  744. """ Construct a line-based generator that respects the original line lengths. """
  745. line_len = self._fa.faidx.index[self.name].lenc
  746. start = 0
  747. while True:
  748. end = start + line_len
  749. if end < len(self):
  750. yield self[start:end]
  751. else:
  752. yield self[start:]
  753. return
  754. start += line_len
  755. def __reversed__(self):
  756. """ Reverse line-based generator """
  757. line_len = self._fa.faidx.index[self.name].lenc
  758. # have to determine last line length
  759. last_line = len(self) % line_len
  760. if last_line == 0:
  761. last_line = line_len
  762. end = len(self)
  763. start = end - last_line
  764. while True:
  765. if start > 0:
  766. yield self[start:end][::-1]
  767. else:
  768. yield self[:end][::-1]
  769. return
  770. if end == len(self): # first iteration
  771. end -= last_line
  772. else:
  773. end -= line_len
  774. start = end - line_len
  775. def __repr__(self):
  776. return 'FastaRecord("%s")' % (self.name)
  777. def __len__(self):
  778. return self._fa.faidx.index[self.name].rlen
  779. @property
  780. def unpadded_len(self):
  781. """ Returns the length of the contig without 5' and 3' N padding.
  782. Functions the same as contigNonNSize in Fasta.cpp at
  783. https://github.com/Illumina/hap.py/blob/master/src/c%2B%2B/lib/tools/Fasta.cpp#L284
  784. """
  785. length = len(self)
  786. stop = False
  787. for line in iter(self):
  788. if stop:
  789. break
  790. if isinstance(line, Sequence):
  791. line = line.seq
  792. for base in line.upper():
  793. if base == 'N':
  794. length -= 1
  795. else:
  796. stop = True
  797. break
  798. stop = False
  799. for line in reversed(self):
  800. if stop:
  801. break
  802. if isinstance(line, Sequence):
  803. line = line.seq
  804. for base in line.upper():
  805. if base == 'N':
  806. length -= 1
  807. else:
  808. stop = True
  809. break
  810. return length
  811. def __str__(self):
  812. return str(self[:])
  813. @property
  814. def variant_sites(self):
  815. if isinstance(self._fa, FastaVariant):
  816. pos = []
  817. var = self._fa.vcf.fetch(self.name, 0, len(self))
  818. for site in var:
  819. if site.is_snp:
  820. sample = site.genotype(self._fa.sample)
  821. if sample.gt_type in self._fa.gt_type and eval(
  822. self._fa.filter):
  823. pos.append(site.POS)
  824. return tuple(pos)
  825. else:
  826. raise NotImplementedError(
  827. "variant_sites() only valid for FastaVariant.")
  828. @property
  829. def long_name(self):
  830. """ Read the actual defline from self._fa.faidx mdshw5/pyfaidx#54 """
  831. return self._fa.faidx.get_long_name(self.name)
  832. @property
  833. def __array_interface__(self):
  834. """ Implement numpy array interface for issue #139"""
  835. return {
  836. 'shape': (len(self), ),
  837. 'typestr': '|S1',
  838. 'version': 3,
  839. 'data': buffer(str(self).encode('ascii'))
  840. }
  841. class MutableFastaRecord(FastaRecord):
  842. def __init__(self, name, fa):
  843. super(MutableFastaRecord, self).__init__(name, fa)
  844. if self._fa.faidx._fasta_opener != open:
  845. raise UnsupportedCompressionFormat(
  846. "BGZF compressed FASTA is not supported for MutableFastaRecord. "
  847. "Please decompress your FASTA file.")
  848. def __setitem__(self, n, value):
  849. """Mutate sequence in region [start, end)
  850. to value.
  851. Coordinates are 0-based, end-exclusive."""
  852. try:
  853. if isinstance(n, slice):
  854. start, stop, step = n.start, n.stop, n.step
  855. if step:
  856. raise IndexError("Step operator is not implemented.")
  857. if not start:
  858. start = 0
  859. if not stop:
  860. stop = len(self)
  861. if stop < 0:
  862. stop = len(self) + stop
  863. if start < 0:
  864. start = len(self) + start
  865. self._fa.faidx.to_file(self.name, start + 1, stop, value)
  866. elif isinstance(n, integer_types):
  867. if n < 0:
  868. n = len(self) + n
  869. return self._fa.faidx.to_file(self.name, n + 1, n + 1, value)
  870. except (FetchError, IOError):
  871. raise
  872. class Fasta(object):
  873. def __init__(self,
  874. filename,
  875. default_seq=None,
  876. key_function=lambda x: x,
  877. as_raw=False,
  878. strict_bounds=False,
  879. read_ahead=None,
  880. mutable=False,
  881. split_char=None,
  882. filt_function=lambda x: True,
  883. one_based_attributes=True,
  884. read_long_names=False,
  885. duplicate_action="stop",
  886. sequence_always_upper=False,
  887. rebuild=True,
  888. build_index=True):
  889. """
  890. An object that provides a pygr compatible interface.
  891. filename: name of fasta file
  892. """
  893. self.filename = filename
  894. self.mutable = mutable
  895. self.faidx = Faidx(
  896. filename,
  897. key_function=key_function,
  898. as_raw=as_raw,
  899. default_seq=default_seq,
  900. strict_bounds=strict_bounds,
  901. read_ahead=read_ahead,
  902. mutable=mutable,
  903. split_char=split_char,
  904. filt_function=filt_function,
  905. one_based_attributes=one_based_attributes,
  906. read_long_names=read_long_names,
  907. duplicate_action=duplicate_action,
  908. sequence_always_upper=sequence_always_upper,
  909. rebuild=rebuild,
  910. build_index=build_index)
  911. _record_constructor = MutableFastaRecord if self.mutable else FastaRecord
  912. self.records = OrderedDict([(rname, _record_constructor(rname, self)) for rname in self.faidx.index.keys()])
  913. def __contains__(self, rname):
  914. """Return True if genome contains record."""
  915. return rname in self.faidx.index
  916. def __getitem__(self, rname):
  917. """Return a chromosome by its name, or its numerical index."""
  918. if isinstance(rname, integer_types):
  919. rname = next(islice(self.records.keys(), rname, None))
  920. try:
  921. return self.records[rname]
  922. except KeyError:
  923. raise KeyError("{0} not in {1}.".format(rname, self.filename))
  924. def __repr__(self):
  925. return 'Fasta("%s")' % (self.filename)
  926. def __iter__(self):
  927. return iter(self.records.values())
  928. def __len__(self):
  929. """Return the cumulative length of all FastaRecords in self.records."""
  930. return sum(len(record) for record in self)
  931. def get_seq(self, name, start, end, rc=False):
  932. """Return a sequence by record name and interval [start, end).
  933. Coordinates are 1-based, end-exclusive.
  934. If rc is set, reverse complement will be returned.
  935. """
  936. # Get sequence from real genome object and save result.
  937. seq = self.faidx.fetch(name, start, end)
  938. if rc:
  939. return -seq
  940. else:
  941. return seq
  942. def get_spliced_seq(self, name, intervals, rc=False):
  943. """Return a sequence by record name and list of intervals
  944. Interval list is an iterable of [start, end].
  945. Coordinates are 1-based, end-exclusive.
  946. If rc is set, reverse complement will be returned.
  947. """
  948. # Get sequence for all intervals
  949. chunks = [self.faidx.fetch(name, s, e) for s, e in intervals]
  950. start = chunks[0].start
  951. end = chunks[-1].end
  952. # reverce complement
  953. if rc:
  954. seq = "".join([(-chunk).seq for chunk in chunks[::-1]])
  955. else:
  956. seq = "".join([chunk.seq for chunk in chunks])
  957. # Sequence coordinate validation wont work since
  958. # len(Sequence.seq) != end - start
  959. return Sequence(name=name, seq=seq, start=None, end=None)
  960. def keys(self):
  961. return self.records.keys()
  962. def values(self):
  963. return self.records.values()
  964. def items(self):
  965. return self.records.items()
  966. def close(self):
  967. self.__exit__()
  968. def __enter__(self):
  969. return self
  970. def __exit__(self, *args):
  971. self.faidx.__exit__(*args)
  972. class FastaVariant(Fasta):
  973. """ Return consensus sequence from FASTA and VCF inputs
  974. """
  975. expr = set(('>', '<', '=', '!'))
  976. def __init__(self,
  977. filename,
  978. vcf_file,
  979. sample=None,
  980. het=True,
  981. hom=True,
  982. call_filter=None,
  983. **kwargs):
  984. super(FastaVariant, self).__init__(filename, **kwargs)
  985. try:
  986. import pysam
  987. except ImportError:
  988. raise ImportError("pysam must be installed for FastaVariant.")
  989. try:
  990. import vcf
  991. except ImportError:
  992. raise ImportError("PyVCF must be installed for FastaVariant.")
  993. if call_filter is not None:
  994. try:
  995. key, expr, value = call_filter.split() # 'GQ > 30'
  996. except IndexError:
  997. raise ValueError(
  998. "call_filter must be a string in the format 'XX <>!= NN'")
  999. assert all([x in self.expr for x in list(expr)])
  1000. assert all([x in string.ascii_uppercase for x in list(key)])
  1001. assert all([x in string.printable for x in list(value)])
  1002. self.filter = "sample['{key}'] {expr} {value}".format(**locals())
  1003. else:
  1004. self.filter = 'True'
  1005. if os.path.exists(vcf_file):
  1006. self.vcf = vcf.Reader(filename=vcf_file)
  1007. else:
  1008. raise IOError("File {0} does not exist.".format(vcf_file))
  1009. if sample is not None:
  1010. self.sample = sample
  1011. else:
  1012. self.sample = self.vcf.samples[0]
  1013. if len(self.vcf.samples) > 1 and sample is None:
  1014. warnings.warn("Using sample {0} genotypes.".format(
  1015. self.sample), RuntimeWarning)
  1016. if het and hom:
  1017. self.gt_type = set((1, 2))
  1018. elif het:
  1019. self.gt_type = set((1, ))
  1020. elif hom:
  1021. self.gt_type = set((2, ))
  1022. else:
  1023. self.gt_type = set()
  1024. def __repr__(self):
  1025. return 'FastaVariant("%s", "%s", gt="%s")' % (self.filename,
  1026. self.vcf.filename,
  1027. str(self.gt_type))
  1028. def get_seq(self, name, start, end):
  1029. """Return a sequence by record name and interval [start, end).
  1030. Replace positions with polymorphism with variant.
  1031. Coordinates are 0-based, end-exclusive.
  1032. """
  1033. seq = self.faidx.fetch(name, start, end)
  1034. if self.faidx.as_raw:
  1035. seq_mut = list(seq)
  1036. del seq
  1037. else:
  1038. seq_mut = list(seq.seq)
  1039. del seq.seq
  1040. var = self.vcf.fetch(name, start - 1, end)
  1041. for record in var:
  1042. if record.is_snp: # skip indels
  1043. sample = record.genotype(self.sample)
  1044. if sample.gt_type in self.gt_type and eval(self.filter):
  1045. alt = record.ALT[0]
  1046. i = (record.POS - 1) - (start - 1)
  1047. seq_mut[i:i + len(alt)] = str(alt)
  1048. # slice the list in case we added an MNP in last position
  1049. if self.faidx.as_raw:
  1050. return ''.join(seq_mut[:end - start + 1])
  1051. else:
  1052. seq.seq = ''.join(seq_mut[:end - start + 1])
  1053. return seq
  1054. def wrap_sequence(n, sequence, fillvalue=''):
  1055. args = [iter(sequence)] * n
  1056. for line in zip_longest(fillvalue=fillvalue, *args):
  1057. yield ''.join(line + ("\n", ))
  1058. # To take a complement, we map each character in the first string in this pair
  1059. # to the corresponding character in the second string.
  1060. complement_map = ('ACTGNactgnYRWSKMDVHBXyrwskmdvhbx',
  1061. 'TGACNtgacnRYWSMKHBDVXrywsmkhbdvx')
  1062. invalid_characters_set = set(
  1063. chr(x) for x in range(256) if chr(x) not in complement_map[0])
  1064. invalid_characters_string = ''.join(invalid_characters_set)
  1065. if PY3:
  1066. complement_table = str.maketrans(complement_map[0], complement_map[1],
  1067. invalid_characters_string)
  1068. translate_arguments = (complement_table, )
  1069. elif PY2:
  1070. complement_table = string.maketrans(complement_map[0], complement_map[1])
  1071. translate_arguments = (complement_table, invalid_characters_string)
  1072. def complement(seq):
  1073. """ Returns the complement of seq.
  1074. >>> seq = 'ATCGTA'
  1075. >>> complement(seq)
  1076. 'TAGCAT'
  1077. """
  1078. seq = str(seq)
  1079. result = seq.translate(*translate_arguments)
  1080. if len(result) != len(seq):
  1081. first_invalid_position = next(
  1082. i for i in range(len(seq)) if seq[i] in invalid_characters_set)
  1083. raise ValueError(
  1084. "Sequence contains non-DNA character '{0}' at position {1:n}\n".
  1085. format(seq[first_invalid_position], first_invalid_position + 1))
  1086. return result
  1087. def translate_chr_name(from_name, to_name):
  1088. chr_name_map = dict(zip(from_name, to_name))
  1089. def map_to_function(rname):
  1090. return chr_name_map[rname]
  1091. return map_to_function
  1092. def bed_split(bed_entry):
  1093. try:
  1094. rname, start, end = bed_entry.rstrip().split()[:3]
  1095. except (IndexError, ValueError):
  1096. raise BedError('Malformed BED entry! {0}\n'.format(bed_entry.rstrip()))
  1097. start, end = (int(start), int(end))
  1098. return (rname, start, end)
  1099. def ucsc_split(region):
  1100. try:
  1101. rname, interval = region.split(':')
  1102. except ValueError:
  1103. rname = region
  1104. interval = None
  1105. try:
  1106. start, end = interval.split('-')
  1107. start, end = (int(start) - 1, int(end))
  1108. except (AttributeError, ValueError):
  1109. start, end = (None, None)
  1110. return (rname, start, end)
  1111. def check_bad_lines(rname, bad_lines, i):
  1112. """ Find inconsistent line lengths in the middle of an
  1113. entry. Allow blank lines between entries, and short lines
  1114. occurring at the last line of an entry. Returns boolean
  1115. validating the entry.
  1116. >>> check_bad_lines('chr0', [(10, 79)], 10)
  1117. True
  1118. >>> check_bad_lines('chr0', [(9, 79)], 10)
  1119. False
  1120. >>> check_bad_lines('chr0', [(9, 79), (10, 1)], 10)
  1121. True
  1122. """
  1123. if len(bad_lines) == 0:
  1124. return True
  1125. elif len(bad_lines) == 1:
  1126. if bad_lines[0][0] == i: # must be last line
  1127. return True
  1128. else:
  1129. return False
  1130. elif len(bad_lines) == 2:
  1131. if bad_lines[0][0] == i: # must not be last line
  1132. return False
  1133. elif bad_lines[1][0] == i and bad_lines[1][1] == 1: # blank last line
  1134. if bad_lines[0][0] + 1 == i and bad_lines[0][1] > 1: # non-blank line
  1135. return True
  1136. else:
  1137. return False
  1138. if len(bad_lines) > 2:
  1139. return False
  1140. raise RuntimeError("Unhandled exception during fasta indexing at entry " + rname + \
  1141. "Please report this issue at https://github.com/mdshw5/pyfaidx/issues " + \
  1142. str(bad_lines))
  1143. def get_valid_filename(s):
  1144. """
  1145. From https://github.com/django/django/blob/efc3e32d6d7fb9bb41be73b80c8607b653c1fbd6/django/utils/text.py#L222-L232
  1146. Return the given string converted to a string that can be used for a clean
  1147. filename. Remove leading and trailing spaces; convert other spaces to
  1148. underscores; and remove anything that is not an alphanumeric, dash,
  1149. underscore, or dot.
  1150. >>> get_valid_filename("HPV16_144-1.fa")
  1151. 'HPV16_144-1.fa'
  1152. >>> get_valid_filename("chromosome 6.fa")
  1153. 'chromosome_6.fa'
  1154. """
  1155. s = str(s).strip().replace(' ', '_')
  1156. return re.sub(r'(?u)[^-\w.]', '', s)
  1157. if __name__ == "__main__":
  1158. import doctest
  1159. doctest.testmod()