SearchDoublePoints.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395
  1. class SearchDoublePoints:
  2. def __init__(self):
  3. self.BASE_DICT={'A':'T','G':'C','T':'A','C':'G','N':'N'}
  4. def GetMapInfo(self,info_str):
  5. #info_str:99M51S
  6. #return {0:'M99',1:'S51'}
  7. num_list=[]
  8. map_dict={}
  9. count=0
  10. for item in info_str:
  11. if item.isalpha():
  12. num=len(num_list)
  13. index=0
  14. for i in range(num):
  15. index += int(num_list[i])*10**(num-i-1)
  16. map_dict[count]=item+":"+str(index)
  17. count +=1
  18. num_list=[]
  19. elif item.isdigit():
  20. num_list.append(item)
  21. return map_dict
  22. #adjust D or I
  23. def AdjInfo(self,seq,map_info):
  24. flag = True
  25. for key in map_info.keys():
  26. item = map_info[key]
  27. if 'D' in item or 'I' in item:
  28. flag = False
  29. if 'H' in map_info[0]:
  30. flag = False
  31. #不需要调整
  32. if flag:
  33. return seq,map_info
  34. else:
  35. new_seq_list=[]
  36. new_info1={}
  37. start=0
  38. old_item=''
  39. for key in map_info.keys():
  40. item = map_info[key]
  41. temp_len=int(item.split(":")[1])
  42. end = start+temp_len
  43. if item.startswith('M') or item.startswith('S'):
  44. for index in range(start,end):
  45. new_seq_list.append(seq[index])
  46. old_item=item
  47. elif item.startswith('D') and old_item.startswith('M'):
  48. end = end-temp_len
  49. for index in range(temp_len):
  50. new_seq_list.append('-')
  51. elif item.startswith('H'):
  52. end = end-temp_len
  53. start = end
  54. #
  55. num=len(map_info)
  56. index =0
  57. new_index= 0
  58. while index+3 <= num:
  59. item1=map_info[index]
  60. item2=map_info[index+1]
  61. item3=map_info[index+2]
  62. if item1.startswith("M") and item2.startswith("D") and item3.startswith("M"):
  63. pos1=int(item1.split(":")[1])
  64. pos2=int(item2.split(":")[1])
  65. pos3=int(item3.split(":")[1])
  66. new_info1[new_index] = 'M:'+str(pos1+pos2+pos3)
  67. new_index +=1
  68. index +=3
  69. elif item1.startswith("M") and item2.startswith("I") and item3.startswith("M"):
  70. pos1=int(item1.split(":")[1])
  71. pos3=int(item3.split(":")[1])
  72. new_info1[new_index] = 'M:'+str(pos1+pos3)
  73. new_index +=1
  74. index +=3
  75. elif item1.startswith("H") and item2.startswith("M"):
  76. new_info1[new_index] = item2
  77. new_index +=1
  78. index +=2
  79. else:
  80. new_info1[new_index] = item1
  81. new_index += 1
  82. index += 1
  83. while index < num:
  84. new_info1[new_index]=map_info[index]
  85. new_index += 1
  86. index += 1
  87. return "".join(new_seq_list),new_info1
  88. def GetPoints(self,soft_info,hard_info,OVERLAP=10):
  89. #不分析有缺省值的
  90. if len(soft_info)<7 or len(hard_info)<7:
  91. return False,1
  92. #map_info:{0:'M:90',1:'S:60'}
  93. #seq:ATGC.....
  94. map_info1=self.GetMapInfo(soft_info[5])
  95. map_info2=self.GetMapInfo(hard_info[5])
  96. seq1=soft_info[6]
  97. seq2=hard_info[6]
  98. #矫正map info中有D,I的
  99. if 'D' in soft_info[5] or 'I' in soft_info[5]:
  100. info1_temp=self.AdjInfo(seq1,map_info1)
  101. seq1=info1_temp[0]
  102. map_info1=info1_temp[1]
  103. if 'D' in hard_info[5] or 'I' in hard_info[5]:
  104. info2_temp=self.AdjInfo(seq2,map_info2)
  105. seq2=info2_temp[0]
  106. map_info2=info2_temp[1]
  107. chr1=soft_info[2]
  108. pos1=int(soft_info[3])
  109. chr2=hard_info[2]
  110. pos2=int(hard_info[3])
  111. #point1
  112. soft_stat=""
  113. match_ms=0
  114. point_ms=0
  115. match_sm=0
  116. point_sm=0
  117. match_sms=0
  118. skip_sms_f=0
  119. skip_sms_e=0
  120. point_sms_f=0
  121. point_sms_e=0
  122. if len(map_info1)==2:
  123. #{0: 'M:131', 1: 'S:19'}
  124. if 'S' in map_info1[1]:
  125. soft_stat="MS"
  126. match_ms=int(map_info1[0].split(":")[1])
  127. point_ms=pos1+match_ms-1
  128. #{0:'S:19',1:'M:131'}
  129. elif 'S' in map_info1[0]:
  130. soft_stat="SM"
  131. match_sm=int(map_info1[0].split(":")[1])
  132. point_sm=pos1
  133. else:
  134. return False,2
  135. elif len(map_info1)==3:
  136. #{0: 'S:51', 1: 'M:99', 2:'S:5'}
  137. if 'S' in map_info1[0] and 'M' in map_info1[1] and 'S' in map_info1[2]:
  138. soft_stat="SMS"
  139. match_sms=int(map_info1[1].split(":")[1])
  140. skip_sms_f=int(map_info1[0].split(":")[1])
  141. skip_sms_e=int(map_info1[2].split(":")[1])
  142. point_sms_f=pos1
  143. point_sms_e=pos1+match_sms-1
  144. else:
  145. return False,3
  146. else:
  147. return False,4
  148. #point2
  149. hard_stat=""
  150. match_mh=0
  151. skip_mh=0
  152. point_mh=0
  153. skip_hm=0
  154. point_hm=0
  155. match_hmh=0
  156. skip_hmh_f=0
  157. skip_hmh_e=0
  158. point_hmh_f=0
  159. point_hmh_e=0
  160. if len(map_info2)==2:
  161. #{0:'M:31',1:'H:50'}
  162. if 'H' in map_info2[1]:
  163. hard_stat="MH"
  164. match_mh=int(map_info2[0].split(":")[1])
  165. skip_mh=int(map_info2[1].split(":")[1])
  166. point_mh=pos2+match_mh-1
  167. #{0:'H:50',1:'M:31'}
  168. elif 'H' in map_info2[1]:
  169. hard_stat="HM"
  170. skip_hm=int(map_info2[0].split(":")[1])
  171. point_hm=pos2
  172. else:
  173. return False,5
  174. elif len(map_info2)==3:
  175. #{0:'H:30',1:"M:100",2:"H:15"}
  176. if 'H' in map_info2[0] and 'M' in map_info2[1] and 'H' in map_info2[2]:
  177. hard_stat="HMH"
  178. match_hmh=int(map_info2[1].split(":")[1])
  179. skip_hmh_f=int(map_info2[0].split(":")[1])
  180. skip_hmh_e=int(map_info2[2].split(":")[1])
  181. point_hmh_f=pos2
  182. point_hmh_e=pos2+match_hmh-1
  183. else:
  184. return False,6
  185. else:
  186. return False,7
  187. seq1_rev="".join([self.BASE_DICT[base] for base in seq1][::-1])
  188. #确定points和fusion_seq
  189. point1=0
  190. point1_e=0
  191. point2=0
  192. point2_e=0
  193. overlap=0
  194. fusion_seq=""
  195. if soft_stat =="MS":
  196. point1=point_ms
  197. point1_e=point1-30
  198. fusion_seq=seq1[match_ms-30:match_ms+30]
  199. if hard_stat == "MH":
  200. if match_ms >= skip_mh:
  201. overlap = match_ms - skip_mh
  202. else:
  203. return False,8
  204. point2=point_mh-overlap
  205. point2_e=point2-30
  206. elif hard_stat == "HM":
  207. if match_ms >= skip_hm:
  208. overlap = match_ms - skip_hm
  209. else:
  210. return False,9
  211. point2=point_hm+overlap
  212. point2_e=point2+30
  213. elif hard_stat == "HMH":
  214. seq1_temp=seq1[skip_hmh_f:skip_hmh_f+match_hmh]
  215. seq1_rev_temp=seq1_rev[skip_hmh_f:skip_hmh_f+match_hmh]
  216. if seq2 == seq1_temp:
  217. if match_ms >= skip_hmh_f:
  218. overlap = match_ms - skip_hmh_f
  219. else:
  220. return False,10
  221. point2=point_hmh_f+overlap
  222. point2_e=point2+30
  223. elif seq2 == seq1_rev_temp:
  224. if match_ms >= skip_hmh_e:
  225. overlap = match_ms -skip_hmh_e
  226. else:
  227. return False,11
  228. point2=point_hmh_e-overlap
  229. point2_e=point2-30
  230. else:
  231. return False,12
  232. else:
  233. return False,13
  234. elif soft_stat == "SM":
  235. fusion_seq=seq1[match_sm-30:match_sm+30]
  236. point1=point_sm
  237. point1_e=point1+30
  238. if hard_stat == "HM":
  239. if match_sm >=skip_hm:
  240. overlap = match_sm - skip_hm
  241. else:
  242. return False,14
  243. point2=point_hm+overlap
  244. point2_e=point2+30
  245. elif hard_stat == "MH":
  246. if match_sm >= skip_mh:
  247. overlap = match_sm - skip_mh
  248. else:
  249. return False,15
  250. point2=point_mh-overlap
  251. point2_e=point2-30
  252. elif hard_stat == "HMH":
  253. seq1_temp=seq1[skip_hmh_f:skip_hmh_f+match_hmh]
  254. seq1_rev_temp=seq1_rev[skip_hmh_f:skip_hmh_f+match_hmh]
  255. if seq2 == seq1_temp:
  256. if match_ms >= skip_hmh_e:
  257. overlap = match_ms - skip_hmh_e
  258. else:
  259. return False,16
  260. point2=point_hmh_e-overlap
  261. point2_e=point2-30
  262. elif seq2 == seq1_rev_temp:
  263. if match_ms >= skip_hmh_f:
  264. overlap = match_ms - skip_hmh_f
  265. else:
  266. return False,17
  267. point2=point_hmh_f+overlap
  268. point2_e=point2+30
  269. else:
  270. return False,18
  271. elif soft_stat == "SMS":
  272. if hard_stat == "HM":
  273. seq1_temp=seq1[skip_hm:]
  274. seq1_rev_temp=seq1_rev[skip_hm:]
  275. if seq2 == seq1_temp:
  276. point1=point_sms_e
  277. point1_e=point1-30
  278. fusion_seq=seq1[skip_sms_f+match_sms-30:skip_sms_f+match_sms+30]
  279. if skip_sms_f+match_sms >= skip_hm:
  280. overlap = skip_sms_f+match_sms - skip_hm
  281. else:
  282. return False,19
  283. elif seq2 == seq1_rev_temp:
  284. point1=point_sms_f
  285. point1_e=point1+30
  286. fusion_seq=seq1[skip_sms_f-30:skip_sms_f+30]
  287. if skip_sms_e+match_sms >= skip_hm:
  288. overlap = skip_sms_e+match_sms - skip_hm
  289. else:
  290. return False,20
  291. else:
  292. return False,21
  293. point2=point_hm+overlap
  294. point2_e=point2+30
  295. elif hard_stat == "MH":
  296. seq1_temp=seq1[:match_mh]
  297. seq1_rev_temp=seq1_rev[:match_mh]
  298. if seq2 == seq1_temp:
  299. point1=point_sms_f
  300. point1_e=point1+30
  301. fusion_seq=seq1[skip_sms_f-30:skip_sms_f+30]
  302. if skip_sms_e+match_sms >= skip_mh:
  303. overlap = skip_sms_e+match_sms - skip_mh
  304. else:
  305. return False,22
  306. elif seq2 == seq1_rev_temp:
  307. point1=point_sms_e
  308. point1_e=point1-30
  309. fusion_seq=seq1[skip_sms_f+match_sms-30:skip_sms_f+match_sms+30]
  310. if skip_sms_f+match_sms >= skip_hm:
  311. overlap = skip_sms_f+match_sms - skip_hm
  312. else:
  313. return False,23
  314. else:
  315. return False,24
  316. point2=point_hm-overlap
  317. point2_e=point2-30
  318. elif hard_stat == "HMH":
  319. if skip_hmh_f < skip_sms_f and skip_sms_f <= skip_hmh_f + match_hmh:
  320. seq_temp = seq1[skip_hmh_f:skip_hmh_f + match_hmh]
  321. if seq2 == seq_temp:
  322. point1 = point_sms_f
  323. point1_e = point1 + 30
  324. fusion_seq=seq1[skip_sms_f-30:skip_sms_f+30]
  325. overlap = skip_hmh_f + match_hmh - skip_sms_f
  326. point2 = point_hmh_e - overlap
  327. point2_e = point2 -30
  328. else:
  329. return False,25
  330. elif skip_hmh_f >= skip_sms_f and skip_hmh_f <= skip_sms_f + match_sms:
  331. seq_temp = seq1[skip_hmh_f:skip_hmh_f + match_hmh]
  332. if seq2 == seq_temp:
  333. point1 = point_sms_e
  334. point1_e = point1 - 30
  335. fusion_seq=seq1[skip_sms_f+match_sms-30:skip_sms_f+match_sms+30]
  336. overlap = skip_sms_f + match_sms - skip_hmh_f
  337. point2 = point_hmh_f + overlap
  338. point2_e = point2 +30
  339. return False,26
  340. elif skip_sms_e > skip_hmh_e and skip_sms_e <= skip_hmh_f+match_hmh:
  341. seq_temp = seq1_rev[skip_hmh_f:skip_hmh_f + match_hmh]
  342. if seq2 == seq_temp:
  343. point1 = point_sms_e
  344. point1_e = point1 - 30
  345. fusion_seq=seq1[skip_sms_f+match_sms-30:skip_sms_f+match_sms+30]
  346. overlap = skip_hmh_f+match_hmh - skip_sms_e
  347. point2 = point_hmh_e - overlap
  348. point2_e = point2 - 30
  349. else:
  350. return False,27
  351. elif skip_hmh_f > skip_sms_e and skip_hmh_f <= skip_sms_e+match_sms:
  352. seq_temp = seq1_rev[skip_hmh_f:skip_hmh_f + match_hmh]
  353. if seq2 == seq_temp:
  354. point1 = point_sms_f
  355. point1_e = point1 + 30
  356. fusion_seq=seq1[skip_sms_f-30:skip_sms_f+30]
  357. overlap = skip_sms_e+match_sms - skip_hmh_f
  358. point2 = point_hmh_e + overlap
  359. point2_e = point2 + 30
  360. else:
  361. return False,28
  362. return False,29
  363. else:
  364. return False,30
  365. else:
  366. return False,31
  367. if abs(overlap) <= OVERLAP and point1 !=0:
  368. return True,chr1+":"+str(point1),chr2+":"+str(point2),chr1+":"+str(point1_e),fusion_seq,str(overlap),chr1+":"+str(point1_e),chr2+":"+str(point2_e)
  369. else:
  370. return False,32