models.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152
  1. """
  2. Models for representing genomic elements.
  3. """
  4. from __future__ import unicode_literals
  5. from collections import namedtuple
  6. class Position(object):
  7. """A position in the genome."""
  8. def __init__(self, chrom, chrom_start, chrom_stop, is_forward_strand):
  9. self.chrom = chrom
  10. self.chrom_start = chrom_start
  11. self.chrom_stop = chrom_stop
  12. self.is_forward_strand = is_forward_strand
  13. def __repr__(self):
  14. return "<Position %s[%d:%d]>" % (
  15. self.chrom, self.chrom_start, self.chrom_stop)
  16. class Gene(object):
  17. def __init__(self, name):
  18. self.name = name
  19. class Transcript(object):
  20. """RefGene Transcripts for hg19
  21. A gene may have multiple transcripts with different combinations of exons.
  22. """
  23. def __init__(self, name, version, gene, tx_position, cds_position,
  24. is_default=False, exons=None):
  25. self.name = name
  26. self.version = version
  27. self.gene = Gene(gene)
  28. self.tx_position = tx_position
  29. self.cds_position = cds_position
  30. self.is_default = is_default
  31. self.exons = exons if exons else []
  32. @property
  33. def full_name(self):
  34. if self.version is not None:
  35. return '%s.%d' % (self.name, self.version)
  36. else:
  37. return self.name
  38. @property
  39. def is_coding(self):
  40. # Coding transcripts have CDS with non-zero length.
  41. return (self.cds_position.chrom_stop -
  42. self.cds_position.chrom_start > 0)
  43. @property
  44. def strand(self):
  45. return ('+' if self.tx_position.is_forward_strand else '-')
  46. @property
  47. def coding_exons(self):
  48. return [exon.get_as_interval(coding_only=True)
  49. for exon in self.exons]
  50. BED6Interval_base = namedtuple(
  51. "BED6Interval_base", (
  52. "chrom",
  53. "chrom_start",
  54. "chrom_end",
  55. "name",
  56. "score",
  57. "strand"))
  58. class BED6Interval(BED6Interval_base):
  59. def distance(self, offset):
  60. """Return the distance to the interval.
  61. if offset is inside the exon, distance is zero.
  62. otherwise, distance is the distance to the nearest edge.
  63. distance is positive if the exon comes after the offset.
  64. distance is negative if the exon comes before the offset.
  65. """
  66. start = self.chrom_start + 1
  67. end = self.chrom_end
  68. if start <= offset <= end:
  69. return 0
  70. start_distance = start - offset
  71. end_distance = offset - end
  72. if abs(start_distance) < abs(end_distance):
  73. return start_distance
  74. else:
  75. return -end_distance
  76. class Exon(object):
  77. def __init__(self, transcript, tx_position, exon_number):
  78. self.transcript = transcript
  79. self.tx_position = tx_position
  80. self.exon_number = exon_number
  81. @property
  82. def get_exon_name(self):
  83. return "%s.%d" % (self.transcript.name, self.exon_number)
  84. def get_as_interval(self, coding_only=False):
  85. """Returns the coding region for this exon as a BED6Interval.
  86. This function returns a BED6Interval objects containing position
  87. information for this exon. This may be used as input for
  88. pybedtools.create_interval_from_list() after casting chrom_start
  89. and chrom_end as strings.
  90. coding_only: only include exons in the coding region
  91. """
  92. exon_start = self.tx_position.chrom_start
  93. exon_stop = self.tx_position.chrom_stop
  94. # Get only exon coding region if requested
  95. if coding_only:
  96. if (exon_stop <= self.transcript.cds_position.chrom_start or
  97. exon_start >= self.transcript.cds_position.chrom_stop):
  98. return None
  99. exon_start = max(exon_start,
  100. self.transcript.cds_position.chrom_start)
  101. exon_stop = min(
  102. max(exon_stop, self.transcript.cds_position.chrom_start),
  103. self.transcript.cds_position.chrom_stop)
  104. return BED6Interval(
  105. self.tx_position.chrom,
  106. exon_start,
  107. exon_stop,
  108. self.get_exon_name,
  109. '.',
  110. self.strand,
  111. )
  112. @property
  113. def strand(self):
  114. strand = '+' if self.tx_position.is_forward_strand else '-'
  115. return strand