#!/usr/bin/env python
import argparse
import os.path
import re
import sys
from collections import defaultdict
from heapq import nlargest
from itertools import repeat
from operator import itemgetter

from pyfaidx import Fasta, wrap_sequence, FetchError, ucsc_split, bed_split, get_valid_filename
  8. def write_sequence(args):
  9. _, ext = os.path.splitext(args.fasta)
  10. if ext:
  11. ext = ext[1:] # remove the dot from extension
  12. filt_function = re.compile(args.regex).search
  13. if args.invert_match:
  14. filt_function = lambda x: not re.compile(args.regex).search(x)
  15. fasta = Fasta(args.fasta, default_seq=args.default_seq, key_function=eval(args.header_function), strict_bounds=not args.lazy, split_char=args.delimiter, filt_function=filt_function, read_long_names=args.long_names, rebuild=not args.no_rebuild)
  16. regions_to_fetch, split_function = split_regions(args)
  17. if not regions_to_fetch:
  18. regions_to_fetch = fasta.keys()
  19. header = False
  20. for region in regions_to_fetch:
  21. name, start, end = split_function(region)
  22. if args.size_range:
  23. if start is not None and end is not None:
  24. sequence_len = end - start
  25. else:
  26. sequence_len = len(fasta[name])
  27. if args.size_range[0] > sequence_len or args.size_range[1] < sequence_len:
  28. continue
  29. if args.split_files: # open output file based on sequence name
  30. filename = '.'.join(str(e) for e in (name, start, end, ext) if e)
  31. filename = get_valid_filename(filename)
  32. outfile = open(filename, 'w')
  33. elif args.out:
  34. outfile = args.out
  35. else:
  36. outfile = sys.stdout
  37. try:
  38. if args.transform:
  39. if not header and args.transform == 'nucleotide':
  40. outfile.write("name\tstart\tend\tA\tT\tC\tG\tN\tothers\n")
  41. header = True
  42. outfile.write(transform_sequence(args, fasta, name, start, end))
  43. else:
  44. for line in fetch_sequence(args, fasta, name, start, end):
  45. outfile.write(line)
  46. except FetchError as e:
  47. raise FetchError(str(e) + " Try setting --lazy.\n")
  48. if args.split_files:
  49. outfile.close()
  50. fasta.__exit__()
  51. def fetch_sequence(args, fasta, name, start=None, end=None):
  52. try:
  53. line_len = fasta.faidx.index[name].lenc
  54. if args.auto_strand and start > end and start is not None and end is not None:
  55. # flip (0, 1] coordinates
  56. sequence = fasta[name][end - 1:start + 1]
  57. sequence = sequence.reverse.complement
  58. else:
  59. sequence = fasta[name][start:end]
  60. except KeyError:
  61. sys.stderr.write("warning: {name} not found in file\n".format(**locals()))
  62. return
  63. if args.complement:
  64. sequence = sequence.complement
  65. if args.reverse:
  66. sequence = sequence.reverse
  67. if args.no_output:
  68. return
  69. if args.no_names:
  70. pass
  71. else:
  72. if (start or end) and not args.no_coords:
  73. yield ''.join(['>', sequence.fancy_name, '\n'])
  74. else:
  75. yield ''.join(['>', sequence.name, '\n'])
  76. for line in wrap_sequence(line_len, sequence.seq):
  77. yield line
  78. def mask_sequence(args):
  79. fasta = Fasta(args.fasta, mutable=True, split_char=args.delimiter)
  80. regions_to_fetch, split_function = split_regions(args)
  81. for region in regions_to_fetch:
  82. rname, start, end = split_function(region)
  83. if args.mask_with_default_seq:
  84. if start and end:
  85. span = end - start
  86. elif not start and not end:
  87. span = len(fasta[rname])
  88. else:
  89. span = len(fasta[rname][start:end])
  90. fasta[rname][start:end] = span * args.default_seq
  91. elif args.mask_by_case:
  92. fasta[rname][start:end] = fasta[rname][start:end].lowercase()
  93. def split_regions(args):
  94. if args.bed:
  95. regions_to_fetch = args.bed
  96. split_function = bed_split
  97. else:
  98. regions_to_fetch = args.regions
  99. split_function = ucsc_split
  100. return (regions_to_fetch, split_function)
  101. def transform_sequence(args, fasta, name, start=None, end=None):
  102. line_len = fasta.faidx.index[name].lenc
  103. s = fasta[name][start:end]
  104. if args.complement:
  105. s = s.complement
  106. if args.reverse:
  107. s = s.reverse
  108. if args.no_output:
  109. return
  110. if args.transform == 'bed':
  111. return '{name}\t{start}\t{end}\n'.format(name=s.name, start=s.start - 1 , end=s.end)
  112. elif args.transform == 'chromsizes':
  113. return '{name}\t{length}\n'.format(name=s.name, length=len(s))
  114. elif args.transform == 'nucleotide':
  115. ss = str(s).upper()
  116. nucs = defaultdict(int)
  117. nucs.update([(c, ss.count(c)) for c in set(ss)])
  118. A = nucs.pop('A', 0)
  119. T = nucs.pop('T', 0)
  120. C = nucs.pop('C', 0)
  121. G = nucs.pop('G', 0)
  122. N = nucs.pop('N', 0)
  123. others = '|'.join([':'.join((k, str(v))) for k, v in nucs.items()])
  124. return '{sname}\t{sstart}\t{send}\t{A}\t{T}\t{C}\t{G}\t{N}\t{others}\n'.format(sname=s.name, sstart=s.start, send=s.end, **locals())
  125. elif args.transform == 'transposed':
  126. return '{name}\t{start}\t{end}\t{seq}\n'.format(name=s.name, start=s.start, end=s.end, seq=str(s))
def main(ext_args=None):
    """CLI entry point: build the argument parser, parse ext_args (or
    sys.argv), then dispatch to mask_sequence or write_sequence."""
    from pyfaidx import __version__
    parser = argparse.ArgumentParser(description="Fetch sequences from FASTA. If no regions are specified, all entries in the input file are returned. Input FASTA file must be consistently line-wrapped, and line wrapping of output is based on input line lengths.",
                                     epilog="Please cite: Shirley MD, Ma Z, Pedersen BS, Wheelan SJ. (2015) Efficient \"pythonic\" access to FASTA files using pyfaidx. PeerJ PrePrints 3:e1196 https://dx.doi.org/10.7287/peerj.preprints.970v1")
    parser.add_argument('fasta', type=str, help='FASTA file')
    parser.add_argument('regions', type=str, nargs='*', help="space separated regions of sequence to fetch e.g. chr1:1-1000")
    # argument groups only affect --help layout
    _input = parser.add_argument_group('input options')
    output = parser.add_argument_group('output options')
    header = parser.add_argument_group('header options')
    _input.add_argument('-b', '--bed', type=argparse.FileType('r'), help="bed file of regions (zero-based start coordinate)")
    output.add_argument('-o', '--out', type=argparse.FileType('w'), help="output file name (default: stdout)")
    output.add_argument('-i', '--transform', type=str, choices=('bed', 'chromsizes', 'nucleotide', 'transposed'), help="transform the requested regions into another format. default: %(default)s")
    output.add_argument('-c', '--complement', action="store_true", default=False, help="complement the sequence. default: %(default)s")
    output.add_argument('-r', '--reverse', action="store_true", default=False, help="reverse the sequence. default: %(default)s")
    output.add_argument('-y', '--auto-strand', action="store_true", default=False, help="reverse complement the sequence when start > end coordinate. default: %(default)s")
    output.add_argument('-a', '--size-range', type=parse_size_range, default=None, help='selected sequences are in the size range [low, high]. example: 1,1000 default: %(default)s')
    # -n and -f are mutually exclusive: either omit names or print long ones
    names = header.add_mutually_exclusive_group()
    names.add_argument('-n', '--no-names', action="store_true", default=False, help="omit sequence names from output. default: %(default)s")
    names.add_argument('-f', '--long-names', action="store_true", default=False, help="output full (long) names from the input fasta headers. default: headers are truncated after the first whitespace")
    header.add_argument('-t', '--no-coords', action="store_true", default=False, help="omit coordinates (e.g. chr:start-end) from output headers. default: %(default)s")
    output.add_argument('-x', '--split-files', action="store_true", default=False, help="write each region to a separate file (names are derived from regions)")
    output.add_argument('-l', '--lazy', action="store_true", default=False, help="fill in --default-seq for missing ranges. default: %(default)s")
    output.add_argument('-s', '--default-seq', type=check_seq_length, default=None, help='default base for missing positions and masking. default: %(default)s')
    header.add_argument('-d', '--delimiter', type=str, default=None, help='delimiter for splitting names to multiple values (duplicate names will be discarded). default: %(default)s')
    header.add_argument('-e', '--header-function', type=str, default='lambda x: x.split()[0]', help='python function to modify header lines e.g: "lambda x: x.split("|")[0]". default: %(default)s')
    # NOTE(review): --duplicates-action is parsed but never consumed by
    # write_sequence/mask_sequence in this file — confirm whether it should be
    # forwarded to Fasta
    header.add_argument('-u', '--duplicates-action', type=str, default="stop", choices=("stop", "first", "last", "longest", "shortest"), help='entry to take when duplicate sequence names are encountered. default: %(default)s')
    matcher = parser.add_argument_group('matching arguments')
    matcher.add_argument('-g', '--regex', type=str, default='.*', help='selected sequences are those matching regular expression. default: %(default)s')
    matcher.add_argument('-v', '--invert-match', action="store_true", default=False, help="selected sequences are those not matching 'regions' argument. default: %(default)s")
    masking = output.add_mutually_exclusive_group()
    masking.add_argument('-m', '--mask-with-default-seq', action="store_true", default=False, help="mask the FASTA file using --default-seq default: %(default)s")
    masking.add_argument('-M', '--mask-by-case', action="store_true", default=False, help="mask the FASTA file by changing to lowercase. default: %(default)s")
    output.add_argument('--no-output', action="store_true", default=False, help="do not output any sequence. default: %(default)s")
    parser.add_argument('--no-rebuild', action="store_true", default=False, help="do not rebuild the .fai index even if it is out of date. default: %(default)s")
    parser.add_argument('--version', action="version", version=__version__, help="print pyfaidx version number")
    # print help usage if no arguments are supplied
    if len(sys.argv)==1 and not ext_args:
        parser.print_help()
        sys.exit(1)
    elif ext_args:
        args = parser.parse_args(ext_args)
    else:
        args = parser.parse_args()
    # warn on option combinations that are legal but probably unintended
    if args.auto_strand:
        if args.complement:
            sys.stderr.write("--auto-strand and --complement are both set. Are you sure this is what you want?\n")
        if args.reverse:
            sys.stderr.write("--auto-strand and --reverse are both set. Are you sure this is what you want?\n")
    if args.mask_with_default_seq or args.mask_by_case:
        mask_sequence(args)
    else:
        write_sequence(args)
  179. def check_seq_length(value):
  180. if value is None:
  181. pass # default value
  182. elif len(value) != 1:
  183. # user is passing a single character
  184. raise argparse.ArgumentTypeError("--default-seq value must be a single character!")
  185. return value
  186. def parse_size_range(value):
  187. """ Size range argument should be in the form start,end and is end-inclusive. """
  188. if value is None:
  189. return value
  190. try:
  191. start, end = value.replace(' ', '').replace('\t', '').split(',')
  192. except (TypeError, ValueError, IndexError):
  193. raise ValueError
  194. return (int(start), int(end))
  195. class Counter(dict):
  196. '''Dict subclass for counting hashable objects. Sometimes called a bag
  197. or multiset. Elements are stored as dictionary keys and their counts
  198. are stored as dictionary values.
  199. '''
  200. def __init__(self, iterable=None, **kwds):
  201. '''Create a new, empty Counter object. And if given, count elements
  202. from an input iterable. Or, initialize the count from another mapping
  203. of elements to their counts.
  204. '''
  205. self.update(iterable, **kwds)
  206. def __missing__(self, key):
  207. return 0
  208. def most_common(self, n=None):
  209. '''List the n most common elements and their counts from the most
  210. common to the least. If n is None, then list all element counts.
  211. '''
  212. if n is None:
  213. return sorted(self.iteritems(), key=itemgetter(1), reverse=True)
  214. return nlargest(n, self.iteritems(), key=itemgetter(1))
  215. def elements(self):
  216. '''Iterator over elements repeating each as many times as its count.
  217. If an element's count has been set to zero or is a negative number,
  218. elements() will ignore it.
  219. '''
  220. for elem, count in self.iteritems():
  221. for _ in repeat(None, count):
  222. yield elem
  223. # Override dict methods where the meaning changes for Counter objects.
  224. @classmethod
  225. def fromkeys(cls, iterable, v=None):
  226. raise NotImplementedError(
  227. 'Counter.fromkeys() is undefined. Use Counter(iterable) instead.')
  228. def update(self, iterable=None, **kwds):
  229. '''Like dict.update() but add counts instead of replacing them.
  230. Source can be an iterable, a dictionary, or another Counter instance.
  231. '''
  232. if iterable is not None:
  233. if hasattr(iterable, 'iteritems'):
  234. if self:
  235. self_get = self.get
  236. for elem, count in iterable.iteritems():
  237. self[elem] = self_get(elem, 0) + count
  238. else:
  239. dict.update(self, iterable) # fast path when counter is empty
  240. else:
  241. self_get = self.get
  242. for elem in iterable:
  243. self[elem] = self_get(elem, 0) + 1
  244. if kwds:
  245. self.update(kwds)
  246. def copy(self):
  247. 'Like dict.copy() but returns a Counter instance instead of a dict.'
  248. return Counter(self)
  249. def __delitem__(self, elem):
  250. 'Like dict.__delitem__() but does not raise KeyError for missing values.'
  251. if elem in self:
  252. dict.__delitem__(self, elem)
  253. def __repr__(self):
  254. if not self:
  255. return '%s()' % self.__class__.__name__
  256. items = ', '.join(map('%r: %r'.__mod__, self.most_common()))
  257. return '%s({%s})' % (self.__class__.__name__, items)
  258. # Multiset-style mathematical operations discussed in:
  259. # Knuth TAOCP Volume II section 4.6.3 exercise 19
  260. # and at http://en.wikipedia.org/wiki/Multiset
  261. #
  262. # Outputs guaranteed to only include positive counts.
  263. #
  264. # To strip negative and zero counts, add-in an empty counter:
  265. # c += Counter()
  266. def __add__(self, other):
  267. '''Add counts from two counters.
  268. '''
  269. if not isinstance(other, Counter):
  270. return NotImplemented
  271. result = Counter()
  272. for elem in set(self) | set(other):
  273. newcount = self[elem] + other[elem]
  274. if newcount > 0:
  275. result[elem] = newcount
  276. return result
  277. def __sub__(self, other):
  278. ''' Subtract count, but keep only results with positive counts.
  279. '''
  280. if not isinstance(other, Counter):
  281. return NotImplemented
  282. result = Counter()
  283. for elem in set(self) | set(other):
  284. newcount = self[elem] - other[elem]
  285. if newcount > 0:
  286. result[elem] = newcount
  287. return result
  288. def __or__(self, other):
  289. '''Union is the maximum of value in either of the input counters.
  290. '''
  291. if not isinstance(other, Counter):
  292. return NotImplemented
  293. _max = max
  294. result = Counter()
  295. for elem in set(self) | set(other):
  296. newcount = _max(self[elem], other[elem])
  297. if newcount > 0:
  298. result[elem] = newcount
  299. return result
  300. def __and__(self, other):
  301. ''' Intersection is the minimum of corresponding counts.
  302. '''
  303. if not isinstance(other, Counter):
  304. return NotImplemented
  305. _min = min
  306. result = Counter()
  307. if len(self) < len(other):
  308. self, other = other, self
  309. for elem in filter(self.__contains__, other):
  310. newcount = _min(self[elem], other[elem])
  311. if newcount > 0:
  312. result[elem] = newcount
  313. return result
if __name__ == "__main__":
    # CLI entry point when executed as a script
    main()