SearchReadsFusion.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658
  1. class ReadsFusion:
  2. def __init__(self):
  3. pass
  4. ###输入primer序列和reads序列
  5. ###返回primer在reads上的位置,Front or End or NO
  6. def PrimerLoc(self,primer,seq):
  7. primer_len=len(primer)
  8. seq_primer=seq[:primer_len]
  9. seq_rev_comp="".join([self.BASE_DICT[base] for base in seq][::-1])
  10. seq_rev_comp_primer=seq_rev_comp[:primer_len]
  11. if primer == seq_primer:
  12. return "Front"
  13. elif primer == seq_rev_comp_primer:
  14. return "End"
  15. return "NO"
  16. #####输入map info,例如99M51S
  17. #####返回{0:'M99',1:'S51'}
  18. def GetMapInfo(self,info_str):
  19. num_list=[]
  20. map_dict={}
  21. count=0
  22. if not info_str:
  23. return map_dict
  24. for item in info_str:
  25. if item.isalpha():
  26. num=len(num_list)
  27. index=0
  28. for i in range(num):
  29. index += int(num_list[i])*10**(num-i-1)
  30. map_dict[count]=item+":"+str(index)
  31. count +=1
  32. num_list=[]
  33. elif item.isdigit():
  34. num_list.append(item)
  35. return map_dict
  36. ######输入reads序列,map info,目的是矫正map info中的 D or I
  37. ###返回矫正后的reads序列和矫正后的map info
  38. def AdjInfo(self,seq,map_info):
  39. flag = True
  40. for key in map_info.keys():
  41. item = map_info[key]
  42. if 'D' in item or 'I' in item:
  43. flag = False
  44. if 'H' in map_info[0]:
  45. flag = False
  46. #不需要调整
  47. if flag:
  48. return seq,map_info
  49. else:
  50. new_seq_list=[]
  51. new_info1={}
  52. start=0
  53. old_item=''
  54. for key in map_info.keys():
  55. item = map_info[key]
  56. temp_len=int(item.split(":")[1])
  57. end = start+temp_len
  58. if item.startswith('M') or item.startswith('S'):
  59. for index in range(start,end):
  60. new_seq_list.append(seq[index])
  61. old_item=item
  62. elif item.startswith('D') and old_item.startswith('M'):
  63. end = end-temp_len
  64. for index in range(temp_len):
  65. new_seq_list.append('-')
  66. elif item.startswith('H'):
  67. end = end-temp_len
  68. start = end
  69. #
  70. num=len(map_info)
  71. index =0
  72. new_index= 0
  73. while index+3 <= num:
  74. item1=map_info[index]
  75. item2=map_info[index+1]
  76. item3=map_info[index+2]
  77. if item1.startswith("M") and item2.startswith("D") and item3.startswith("M"):
  78. pos1=int(item1.split(":")[1])
  79. pos2=int(item2.split(":")[1])
  80. pos3=int(item3.split(":")[1])
  81. new_info1[new_index] = 'M:'+str(pos1+pos2+pos3)
  82. new_index +=1
  83. index +=3
  84. elif item1.startswith("M") and item2.startswith("I") and item3.startswith("M"):
  85. pos1=int(item1.split(":")[1])
  86. pos3=int(item3.split(":")[1])
  87. new_info1[new_index] = 'M:'+str(pos1+pos3)
  88. new_index +=1
  89. index +=3
  90. elif item1.startswith("H") and item2.startswith("M"):
  91. new_info1[new_index] = item2
  92. new_index +=1
  93. index +=2
  94. else:
  95. new_info1[new_index] = item1
  96. new_index += 1
  97. index += 1
  98. while index < num:
  99. new_info1[new_index]=map_info[index]
  100. new_index += 1
  101. index += 1
  102. return "".join(new_seq_list),new_info1
  103. ###对于double reads,同一条reads上同时比对到位置,输入同一reads两种比对情况
  104. ###输出比对位置断点坐标
  105. def GetPoints(self,soft_info,hard_info,OVERLAP=10):
  106. #不分析有缺省值的
  107. if len(soft_info)<7 or len(hard_info)<7:
  108. return False,1
  109. #map_info:{0:'M:90',1:'S:60'}
  110. #seq:ATGC.....
  111. map_info1=self.GetMapInfo(soft_info[5])
  112. map_info2=self.GetMapInfo(hard_info[5])
  113. seq1=soft_info[6]
  114. seq2=hard_info[6]
  115. #矫正map info中有D,I的
  116. if 'D' in soft_info[5] or 'I' in soft_info[5]:
  117. info1_temp=self.AdjInfo(seq1,map_info1)
  118. seq1=info1_temp[0]
  119. map_info1=info1_temp[1]
  120. if 'D' in hard_info[5] or 'I' in hard_info[5]:
  121. info2_temp=self.AdjInfo(seq2,map_info2)
  122. seq2=info2_temp[0]
  123. map_info2=info2_temp[1]
  124. chr1=soft_info[2]
  125. pos1=int(soft_info[3])
  126. chr2=hard_info[2]
  127. pos2=int(hard_info[3])
  128. #point1
  129. soft_stat=""
  130. match_ms=0
  131. point_ms=0
  132. match_sm=0
  133. point_sm=0
  134. match_sms=0
  135. skip_sms_f=0
  136. skip_sms_e=0
  137. point_sms_f=0
  138. point_sms_e=0
  139. if len(map_info1)==2:
  140. #{0: 'M:131', 1: 'S:19'}
  141. if 'S' in map_info1[1]:
  142. soft_stat="MS"
  143. match_ms=int(map_info1[0].split(":")[1])
  144. point_ms=pos1+match_ms-1
  145. #{0:'S:19',1:'M:131'}
  146. elif 'S' in map_info1[0]:
  147. soft_stat="SM"
  148. match_sm=int(map_info1[1].split(":")[1])
  149. point_sm=pos1
  150. else:
  151. return False,2
  152. elif len(map_info1)==3:
  153. #{0: 'S:51', 1: 'M:99', 2:'S:5'}
  154. if 'S' in map_info1[0] and 'M' in map_info1[1] and 'S' in map_info1[2]:
  155. soft_stat="SMS"
  156. match_sms=int(map_info1[1].split(":")[1])
  157. skip_sms_f=int(map_info1[0].split(":")[1])
  158. skip_sms_e=int(map_info1[2].split(":")[1])
  159. point_sms_f=pos1
  160. point_sms_e=pos1+match_sms-1
  161. else:
  162. return False,3
  163. else:
  164. return False,4
  165. #point2
  166. hard_stat=""
  167. match_mh=0
  168. skip_mh=0
  169. point_mh=0
  170. skip_hm=0
  171. point_hm=0
  172. match_hmh=0
  173. skip_hmh_f=0
  174. skip_hmh_e=0
  175. point_hmh_f=0
  176. point_hmh_e=0
  177. if len(map_info2)==2:
  178. #{0:'M:31',1:'H:50'}
  179. if 'H' in map_info2[1]:
  180. hard_stat="MH"
  181. match_mh=int(map_info2[0].split(":")[1])
  182. skip_mh=int(map_info2[1].split(":")[1])
  183. point_mh=pos2+match_mh-1
  184. #{0:'H:50',1:'M:31'}
  185. elif 'H' in map_info2[0]:
  186. hard_stat="HM"
  187. skip_hm=int(map_info2[0].split(":")[1])
  188. point_hm=pos2
  189. else:
  190. return False,5
  191. elif len(map_info2)==3:
  192. #{0:'H:30',1:"M:100",2:"H:15"}
  193. if 'H' in map_info2[0] and 'M' in map_info2[1] and 'H' in map_info2[2]:
  194. hard_stat="HMH"
  195. match_hmh=int(map_info2[1].split(":")[1])
  196. skip_hmh_f=int(map_info2[0].split(":")[1])
  197. skip_hmh_e=int(map_info2[2].split(":")[1])
  198. point_hmh_f=pos2
  199. point_hmh_e=pos2+match_hmh-1
  200. else:
  201. return False,6
  202. else:
  203. return False,7
  204. seq1_rev="".join([self.BASE_DICT[base] for base in seq1][::-1])
  205. #确定points和fusion_seq
  206. point1=0
  207. point1_e=0
  208. point2=0
  209. point2_e=0
  210. overlap=0
  211. fusion_seq=""
  212. if soft_stat =="MS":
  213. point1=point_ms
  214. point1_e=point1-30
  215. fusion_seq=seq1[match_ms-30:match_ms+30]
  216. if hard_stat == "MH":
  217. if match_ms >= skip_mh:
  218. overlap = match_ms - skip_mh
  219. else:
  220. return False,8
  221. point2=point_mh-overlap
  222. point2_e=point2-30
  223. elif hard_stat == "HM":
  224. if match_ms >= skip_hm:
  225. overlap = match_ms - skip_hm
  226. else:
  227. return False,9
  228. point2=point_hm+overlap
  229. point2_e=point2+30
  230. elif hard_stat == "HMH":
  231. seq1_temp=seq1[skip_hmh_f:skip_hmh_f+match_hmh]
  232. seq1_rev_temp=seq1_rev[skip_hmh_f:skip_hmh_f+match_hmh]
  233. if seq2 == seq1_temp:
  234. if match_ms >= skip_hmh_f:
  235. overlap = match_ms - skip_hmh_f
  236. else:
  237. return False,10
  238. point2=point_hmh_f+overlap
  239. point2_e=point2+30
  240. elif seq2 == seq1_rev_temp:
  241. if match_ms >= skip_hmh_e:
  242. overlap = match_ms -skip_hmh_e
  243. else:
  244. return False,11
  245. point2=point_hmh_e-overlap
  246. point2_e=point2-30
  247. else:
  248. return False,12
  249. else:
  250. return False,13
  251. elif soft_stat == "SM":
  252. fusion_seq=seq1[match_sm-30:match_sm+30]
  253. point1=point_sm
  254. point1_e=point1+30
  255. if hard_stat == "HM":
  256. if match_sm >=skip_hm:
  257. overlap = match_sm - skip_hm
  258. else:
  259. return False,14
  260. point2=point_hm+overlap
  261. point2_e=point2+30
  262. elif hard_stat == "MH":
  263. if match_sm >= skip_mh:
  264. overlap = match_sm - skip_mh
  265. else:
  266. return False,15
  267. point2=point_mh-overlap
  268. point2_e=point2-30
  269. elif hard_stat == "HMH":
  270. seq1_temp=seq1[skip_hmh_f:skip_hmh_f+match_hmh]
  271. seq1_rev_temp=seq1_rev[skip_hmh_f:skip_hmh_f+match_hmh]
  272. if seq2 == seq1_temp:
  273. if match_sm >= skip_hmh_e:
  274. overlap = match_sm - skip_hmh_e
  275. else:
  276. return False,16
  277. point2=point_hmh_e-overlap
  278. point2_e=point2-30
  279. elif seq2 == seq1_rev_temp:
  280. if match_sm >= skip_hmh_f:
  281. overlap = match_sm - skip_hmh_f
  282. else:
  283. return False,17
  284. point2=point_hmh_f+overlap
  285. point2_e=point2+30
  286. else:
  287. return False,18
  288. elif soft_stat == "SMS":
  289. if hard_stat == "HM":
  290. seq1_temp=seq1[skip_hm:]
  291. seq1_rev_temp=seq1_rev[skip_hm:]
  292. if seq2 == seq1_temp:
  293. point1=point_sms_e
  294. point1_e=point1-30
  295. fusion_seq=seq1[skip_sms_f+match_sms-30:skip_sms_f+match_sms+30]
  296. if skip_sms_f+match_sms >= skip_hm:
  297. overlap = skip_sms_f+match_sms - skip_hm
  298. else:
  299. return False,19
  300. elif seq2 == seq1_rev_temp:
  301. point1=point_sms_f
  302. point1_e=point1+30
  303. fusion_seq=seq1[skip_sms_f-30:skip_sms_f+30]
  304. if skip_sms_e+match_sms >= skip_hm:
  305. overlap = skip_sms_e+match_sms - skip_hm
  306. else:
  307. return False,20
  308. else:
  309. return False,21
  310. point2=point_hm+overlap
  311. point2_e=point2+30
  312. elif hard_stat == "MH":
  313. seq1_temp=seq1[:match_mh]
  314. seq1_rev_temp=seq1_rev[:match_mh]
  315. if seq2 == seq1_temp:
  316. point1=point_sms_f
  317. point1_e=point1+30
  318. fusion_seq=seq1[skip_sms_f-30:skip_sms_f+30]
  319. if skip_sms_e+match_sms >= skip_mh:
  320. overlap = skip_sms_e+match_sms - skip_mh
  321. else:
  322. return False,22
  323. elif seq2 == seq1_rev_temp:
  324. point1=point_sms_e
  325. point1_e=point1-30
  326. fusion_seq=seq1[skip_sms_f+match_sms-30:skip_sms_f+match_sms+30]
  327. if skip_sms_f+match_sms >= skip_mh:
  328. overlap = skip_sms_f+match_sms - skip_mh
  329. else:
  330. return False,23
  331. else:
  332. return False,24
  333. point2=point_mh-overlap
  334. point2_e=point2-30
  335. elif hard_stat == "HMH":
  336. if skip_hmh_f < skip_sms_f and skip_sms_f <= skip_hmh_f + match_hmh:
  337. seq_temp = seq1[skip_hmh_f:skip_hmh_f + match_hmh]
  338. if seq2 == seq_temp:
  339. point1 = point_sms_f
  340. point1_e = point1 + 30
  341. fusion_seq=seq1[skip_sms_f-30:skip_sms_f+30]
  342. overlap = skip_hmh_f + match_hmh - skip_sms_f
  343. point2 = point_hmh_e - overlap
  344. point2_e = point2 -30
  345. else:
  346. return False,25
  347. elif skip_hmh_f >= skip_sms_f and skip_hmh_f <= skip_sms_f + match_sms:
  348. seq_temp = seq1[skip_hmh_f:skip_hmh_f + match_hmh]
  349. if seq2 == seq_temp:
  350. point1 = point_sms_e
  351. point1_e = point1 - 30
  352. fusion_seq=seq1[skip_sms_f+match_sms-30:skip_sms_f+match_sms+30]
  353. overlap = skip_sms_f + match_sms - skip_hmh_f
  354. point2 = point_hmh_f + overlap
  355. point2_e = point2 +30
  356. return False,26
  357. elif skip_sms_e > skip_hmh_e and skip_sms_e <= skip_hmh_f+match_hmh:
  358. seq_temp = seq1_rev[skip_hmh_f:skip_hmh_f + match_hmh]
  359. if seq2 == seq_temp:
  360. point1 = point_sms_e
  361. point1_e = point1 - 30
  362. fusion_seq=seq1[skip_sms_f+match_sms-30:skip_sms_f+match_sms+30]
  363. overlap = skip_hmh_f+match_hmh - skip_sms_e
  364. point2 = point_hmh_e - overlap
  365. point2_e = point2 - 30
  366. else:
  367. return False,27
  368. elif skip_hmh_f > skip_sms_e and skip_hmh_f <= skip_sms_e+match_sms:
  369. seq_temp = seq1_rev[skip_hmh_f:skip_hmh_f + match_hmh]
  370. if seq2 == seq_temp:
  371. point1 = point_sms_f
  372. point1_e = point1 + 30
  373. fusion_seq=seq1[skip_sms_f-30:skip_sms_f+30]
  374. overlap = skip_sms_e+match_sms - skip_hmh_f
  375. point2 = point_hmh_e + overlap
  376. point2_e = point2 + 30
  377. else:
  378. return False,28
  379. return False,29
  380. else:
  381. return False,30
  382. else:
  383. return False,31
  384. if abs(overlap) <= OVERLAP and point1 !=0 and point2 !=0:
  385. return True,chr1+":"+str(point1),chr2+":"+str(point2),chr1+":"+str(point1_e),fusion_seq,str(overlap),chr1+":"+str(point1_e),chr2+":"+str(point2_e)
  386. else:
  387. return False,32
  388. ###输入含有soft clip 的reads
  389. ###输出比对到的位置坐标
  390. def GetSinglePoints(self,R1_read_info,R2_read_info):
  391. ID_info = R1_read_info[0].split("_")
  392. ID = R1_read_info[0]
  393. R1_seq=R1_read_info[6]
  394. breakpoint1=0
  395. breakpoint1_e=0
  396. breakpoint2=0
  397. breakpoint2_e=0
  398. R1_loc=""
  399. R1_map_info={}
  400. map_str=R1_read_info[5]
  401. chr1=R1_read_info[2]
  402. R1_mismatch_seq=""
  403. R1_match_seq=""
  404. #R1 primer存在
  405. if len(ID_info)>=4:
  406. R1_primer=ID_info[3]
  407. R1_map_info=self.GetMapInfo(R1_read_info[5])
  408. R1_loc=self.PrimerLoc(R1_primer,R1_seq)
  409. R1_primer_len=len(R1_primer)
  410. #SMS
  411. if len(R1_map_info)==3 and R1_map_info[0][0] == "S" and R1_map_info[2][0]=="S":
  412. end_mismatch_len=int(R1_map_info[2].split(":")[1])
  413. start_mismatch_len=int(R1_map_info[0].split(":")[1])
  414. match_len=int(R1_map_info[1].split(":")[1])
  415. if R1_loc == "End":
  416. #mismatch in end
  417. if end_mismatch_len > R1_primer_len:
  418. mismatch_len=end_mismatch_len
  419. R1_seq="".join([self.BASE_DICT[base] for base in R1_seq][::-1])
  420. R1_mismatch_seq=R1_seq[:mismatch_len]
  421. R1_match_seq=R1_seq[mismatch_len:]
  422. breakpoint1=R1_read_info[3]+match_len-1
  423. breakpoint1_e=breakpoint1-30
  424. #mismatch in front
  425. else:
  426. mismatch_len=start_mismatch_len
  427. R1_mismatch_seq=R1_seq[:mismatch_len]
  428. R1_match_seq=R1_seq[mismatch_len:]
  429. breakpoint1=R1_read_info[3]
  430. breakpoint1_e=breakpoint1+30
  431. elif R1_loc == "Front":
  432. #mismatch in front
  433. if start_mismatch_len > R1_primer_len:
  434. mismatch_len=start_mismatch_len
  435. R1_mismatch_seq=R1_seq[:mismatch_len]
  436. R1_match_seq=R1_seq[mismatch_len:]
  437. breakpoint1=R1_read_info[3]
  438. breakpoint1_e=breakpoint1+30
  439. #mismatch in end
  440. else:
  441. mismatch_len=end_mismatch_len
  442. R1_seq="".join([self.BASE_DICT[base] for base in R1_seq][::-1])
  443. R1_mismatch_seq=R1_seq[:mismatch_len]
  444. R1_match_seq=R1_seq[mismatch_len:]
  445. breakpoint1=R1_read_info[3]+match_len-1
  446. breakpoint1_e=breakpoint1-30
  447. #SM or MS
  448. elif len(R1_map_info)==2:
  449. #SM
  450. #mismatch in front
  451. if R1_map_info[0][0]=="S":
  452. mismatch_len=int(R1_map_info[0].split(":")[1])
  453. if R1_loc == "End" or (R1_loc == "Front" and R1_primer_len < mismatch_len):
  454. R1_mismatch_seq=R1_seq[:mismatch_len]
  455. R1_match_seq=R1_seq[mismatch_len:]
  456. breakpoint1=R1_read_info[3]
  457. breakpoint1_e=breakpoint1+30
  458. #else:
  459. # return [],[]
  460. #MS
  461. #mismatch in end
  462. elif R1_map_info[1][0]=="S":
  463. mismatch_len=int(R1_map_info[1].split(":")[1])
  464. if R1_loc == "Front" or (R1_loc=="End" and R1_primer_len < mismatch_len):
  465. R1_seq="".join([self.BASE_DICT[base] for base in R1_seq][::-1])
  466. match_len=int(R1_map_info[0].split(":")[1])
  467. R1_mismatch_seq=R1_seq[:mismatch_len]
  468. R1_match_seq=R1_seq[mismatch_len:]
  469. breakpoint1=R1_read_info[3]+match_len-1
  470. breakpoint1_e=breakpoint1-30
  471. #else:
  472. # return [],[]
  473. R2_seq=R2_read_info[6]
  474. R2_loc=""
  475. R2_map_info={}
  476. map_str=R2_read_info[5]
  477. chr2=R2_read_info[2]
  478. R2_mismatch_seq=""
  479. R2_match_seq=""
  480. #R1,R2都存在primer
  481. if len(ID_info)==5:
  482. R2_primer=ID_info[4]
  483. R2_map_info=self.GetMapInfo(map_str)
  484. R2_loc=self.PrimerLoc(R2_primer,R2_seq)
  485. R2_primer_len=len(R2_primer)
  486. #SMS
  487. if len(R2_map_info)==3 and R2_map_info[0][0] == "S" and R2_map_info[2][0]=="S":
  488. end_mismatch_len=int(R2_map_info[2].split(":")[1])
  489. start_mismatch_len=int(R2_map_info[0].split(":")[1])
  490. match_len=int(R2_map_info[1].split(":")[1])
  491. if R2_loc == "End":
  492. #mismatch in end
  493. if end_mismatch_len > R2_primer_len:
  494. R2_seq="".join([self.BASE_DICT[base] for base in R2_seq][::-1])
  495. mismatch_len=end_mismatch_len
  496. R2_mismatch_seq=R2_seq[:mismatch_len]
  497. R2_match_seq=R2_seq[mismatch_len:]
  498. breakpoint2=R2_read_info[3]+match_len-1
  499. breakpoint2_e=breakpoint2-30
  500. #mismatch in front
  501. else:
  502. mismatch_len=start_mismatch_len
  503. R2_mismatch_seq=R2_seq[:mismatch_len]
  504. R2_match_seq=R2_seq[mismatch_len:]
  505. breakpoint2=R2_read_info[3]
  506. breakpoint2_e=breakpoint2+30
  507. if R2_loc == "Front":
  508. #mismatch in front
  509. if start_mismatch_len > R2_primer_len:
  510. mismatch_len=start_mismatch_len
  511. R2_mismatch_seq=R2_seq[:mismatch_len]
  512. R2_match_seq=R2_seq[mismatch_len:]
  513. breakpoint2=R2_read_info[3]
  514. breakpoint2_e=breakpoint2+30
  515. #mismatch in end
  516. else:
  517. mismatch_len=end_mismatch_len
  518. R2_seq="".join([self.BASE_DICT[base] for base in R2_seq][::-1])
  519. R2_mismatch_seq=R2_seq[:mismatch_len]
  520. R2_match_seq=R2_seq[mismatch_len:]
  521. breakpoint2=R2_read_info[3]+match_len-1
  522. breakpoint2_e=breakpoint2-30
  523. elif len(R2_map_info)==2:
  524. #SM
  525. #mismatch in front
  526. if R2_map_info[0][0]=="S":
  527. mismatch_len=int(R2_map_info[0].split(":")[1])
  528. if R2_loc == "End" or (R2_loc == "Front" and R2_primer_len<mismatch_len):
  529. R2_mismatch_seq=R2_seq[:mismatch_len]
  530. R2_match_seq=R2_seq[mismatch_len:]
  531. breakpoint2=R2_read_info[3]
  532. breakpoint2_e=breakpoint2+30
  533. #else:
  534. # return [],[]
  535. #MS
  536. #mismatch in end
  537. elif R2_map_info[1][0]=="S":
  538. mismatch_len=int(R2_map_info[1].split(":")[1])
  539. if R2_loc == "Front" or (R2_loc == "End" and R2_primer_len<mismatch_len):
  540. match_len=int(R2_map_info[0].split(":")[1])
  541. R2_seq="".join([self.BASE_DICT[base] for base in R2_seq][::-1])
  542. R2_mismatch_seq=R2_seq[:mismatch_len]
  543. R2_match_seq=R2_seq[mismatch_len:]
  544. breakpoint2=R2_read_info[3]+match_len-1
  545. breakpoint2_e=breakpoint2-30
  546. #else:
  547. # return [],[]
  548. R1_point_list=[]
  549. R2_point_list=[]
  550. if breakpoint1 !=0 and len(R1_mismatch_seq) >=10:
  551. R1_point_list=[chr1+":"+str(breakpoint1),R1_mismatch_seq,R1_match_seq,chr1+":"+str(breakpoint1_e),ID]
  552. if breakpoint2 !=0 and len(R2_mismatch_seq)>=10:
  553. R2_point_list=[chr2+":"+str(breakpoint2),R2_mismatch_seq,R2_match_seq,chr2+":"+str(breakpoint2_e),ID]
  554. return R1_point_list,R2_point_list
  555. class DoubleReadsFusion(ReadsFusion):
  556. def __init__(self):
  557. self.BASE_DICT={'A':'T','T':'A','G':'C','C':'G','N':'N','-':'-'}
  558. def HardClipFusion(self,reads_info):
  559. R1_soft_info=[]
  560. R1_hard_info=[]
  561. R2_soft_info=[]
  562. R2_hard_info=[]
  563. for reads in reads_info:
  564. if reads[1]<128:
  565. R1_soft_info=reads
  566. elif reads[1]<256:
  567. R2_soft_info=reads
  568. elif reads[1]<2176:
  569. R1_hard_info=reads
  570. elif reads[1]<2304:
  571. R2_hard_info=reads
  572. if len(R1_soft_info)>0 and len(R1_hard_info)>0 and R1_soft_info[4]>=30:
  573. results=self.GetPoints(R1_soft_info,R1_hard_info)
  574. if results[0]:
  575. out_list=[results[1]+"-"+results[2],results[3],results[2],results[4],results[5],results[6],results[7]]
  576. return out_list
  577. #R2:hard-clip
  578. if len(R2_hard_info) > 0 and len(R2_soft_info) > 0 and R2_soft_info[4]>=30:
  579. results=self.GetPoints(R2_soft_info,R2_hard_info)
  580. if results[0]:
  581. out_list=[results[1]+"-"+results[2],results[3],results[2],results[4],results[5],results[6],results[7]]
  582. return out_list
  583. return []
  584. class SingleReadsFusion(ReadsFusion):
  585. def __init__(self):
  586. self.BASE_DICT={'A':'T','T':'A','G':'C','C':'G','N':'N','-':'-'}
  587. def SoftClipFusion(self,reads_info):
  588. if len(reads_info)==1:
  589. item = reads_info[0]
  590. reads_info.append(item)
  591. R1_read_info=[]
  592. R2_read_info=[]
  593. for reads in reads_info:
  594. if reads[1] > 128:
  595. R2_read_info=reads
  596. else:
  597. R1_read_info=reads
  598. if len(R1_read_info) == 0:
  599. R1_read_info=R2_read_info
  600. if len(R2_read_info) == 0:
  601. R2_read_info=R1_read_info
  602. #比对质量值至少一个MAQ>=60
  603. if int(R1_read_info[4])<60 and int(R2_read_info[4])<60:
  604. return [],[]
  605. return self.GetSinglePoints(R1_read_info,R2_read_info)