breakpoint_search_v6.1.py 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771
  1. #!/usr/bin/python
  2. # -*- coding:utf-8 -*-
  3. import argparse,time,os
  4. BASE_DICT={'A':'T','G':'C','T':'A','C':'G','N':'N'}
  5. def read_sam(insam):
  6. ID_dict={}
  7. with open(insam,'r') as inputSam:
  8. for line in inputSam:
  9. if not line.startswith("@"):
  10. info=line.strip("\n").split("\t")
  11. #ID_info = info[0].split("_")
  12. #hrad clip reads MAP>0
  13. if int(info[1])>256 and int(info[4])==0:
  14. continue
  15. ID = info[0]
  16. if ID not in ID_dict.keys():
  17. ID_dict[ID]=[info]
  18. else:
  19. ID_dict[ID].append(info)
  20. return ID_dict
  21. def check_clip(reads_info):
  22. if len(reads_info)>=3:
  23. return "hard_clip"
  24. elif len(reads_info)==2:
  25. for item in reads_info:
  26. if "S" in item[5]:
  27. return "soft_clip"
  28. return "no_clip"
  29. def get_map_info(map_str):
  30. #99M51S
  31. num_list=[]
  32. map_dict={}
  33. count=0
  34. for item in map_str:
  35. if item.isalpha():
  36. num=len(num_list)
  37. index=0
  38. for i in range(num):
  39. index += int(num_list[i])*10**(num-i-1)
  40. map_dict[count]=item+":"+str(index)
  41. count +=1
  42. num_list=[]
  43. elif item.isdigit():
  44. num_list.append(item)
  45. return map_dict
  46. def check_match(match1,match2):
  47. if match1<30 or match2<30:
  48. return False
  49. return True
  50. def adj_info(seq,info1):
  51. flag = True
  52. for key in info1.keys():
  53. item = info1[key]
  54. if 'D' in item or 'I' in item:
  55. flag = False
  56. if 'H' in info1[0]:
  57. flag = False
  58. #不需要调整
  59. if flag:
  60. return seq,info1
  61. else:
  62. new_seq_list=[]
  63. new_info1={}
  64. start=0
  65. old_item=''
  66. for key in info1.keys():
  67. item = info1[key]
  68. temp_len=int(item.split(":")[1])
  69. end = start+temp_len
  70. if item.startswith('M') or item.startswith('S'):
  71. for index in range(start,end):
  72. new_seq_list.append(seq[index])
  73. old_item=item
  74. elif item.startswith('D') and old_item.startswith('M'):
  75. end = end-temp_len
  76. for index in range(temp_len):
  77. new_seq_list.append('-')
  78. elif item.startswith('H'):
  79. end = end-temp_len
  80. start = end
  81. #
  82. num=len(info1)
  83. index =0
  84. new_index= 0
  85. while index+3 <= num:
  86. item1=info1[index]
  87. item2=info1[index+1]
  88. item3=info1[index+2]
  89. if item1.startswith("M") and item2.startswith("D") and item3.startswith("M"):
  90. pos1=int(item1.split(":")[1])
  91. pos2=int(item2.split(":")[1])
  92. pos3=int(item3.split(":")[1])
  93. new_info1[new_index] = 'M:'+str(pos1+pos2+pos3)
  94. new_index +=1
  95. index +=3
  96. elif item1.startswith("M") and item2.startswith("I") and item3.startswith("M"):
  97. pos1=int(item1.split(":")[1])
  98. pos3=int(item3.split(":")[1])
  99. new_info1[new_index] = 'M:'+str(pos1+pos3)
  100. new_index +=1
  101. index +=3
  102. elif item1.startswith("H") and item2.startswith("M"):
  103. new_info1[new_index] = item2
  104. new_index +=1
  105. index +=2
  106. else:
  107. new_info1[new_index] = item1
  108. new_index += 1
  109. index += 1
  110. while index < num:
  111. new_info1[new_index]=info1[index]
  112. new_index += 1
  113. index += 1
  114. return "".join(new_seq_list),new_info1
  115. def get_points(info1,info2,OVERLAP=10):
  116. #OVERLAP默认值是10,默认不用这个参数值
  117. if len(info1)>=5 and len(info2)>=5:
  118. map_info1=get_map_info(info1[5])
  119. map_info2=get_map_info(info2[5])
  120. seq1=info1[9]
  121. seq2=info2[9]
  122. #print(map_info1)
  123. #print(seq1)
  124. info1_temp=adj_info(seq1,map_info1)
  125. seq=info1_temp[0]
  126. map_info1=info1_temp[1]
  127. #print(map_info1)
  128. #print(seq)
  129. #print(map_info2)
  130. info2_temp=adj_info(seq2,map_info2)
  131. seq2=info2_temp[0]
  132. map_info2=info2_temp[1]
  133. #print(map_info2)
  134. chr1=info1[2]
  135. pos1=int(info1[3])
  136. chr2=info2[2]
  137. pos2=int(info2[3])
  138. flag = int(info1[1])
  139. overlap=0
  140. breakpoint1=0
  141. breakpoint1_1=0
  142. breakpoint2=0
  143. breakpoint1_e=0
  144. breakpoint2_e=0
  145. fusion_seq=""
  146. if len(map_info1)==2 and len(map_info2)==2:
  147. #soft-clip end,hard-clip front
  148. #{0: 'M:131', 1: 'S:19'}
  149. #{0: 'H:118', 1: 'M:32'}
  150. if 'S' in map_info1[1] and 'H' in map_info2[0]:
  151. match1=int(map_info1[0].split(":")[1])
  152. skip1=int(map_info1[1].split(":")[1])
  153. match2=int(map_info2[1].split(":")[1])
  154. if not check_match(match1,match2):
  155. return False,0
  156. overlap=match2-skip1
  157. breakpoint1=pos1+match1-1-overlap
  158. breakpoint2=pos2
  159. breakpoint1_1=pos1+match1-1
  160. if skip1 >30:
  161. len2=30
  162. else:
  163. len2=skip1
  164. fusion_seq=seq[match1-30:match1+len2]
  165. breakpoint1_e=breakpoint1_1-30
  166. breakpoint2_e=breakpoint2+30
  167. #soft-clip end,hard-clip end
  168. #{0: 'M:99', 1: 'S:51'}
  169. #{0: 'M:53', 1: 'H:97'}
  170. elif 'S' in map_info1[1] and 'H' in map_info2[1]:
  171. match1=int(map_info1[0].split(":")[1])
  172. skip1=int(map_info1[1].split(":")[1])
  173. match2=int(map_info2[0].split(":")[1])
  174. hide2=int(map_info2[1].split(":")[1])
  175. if not check_match(match1,match2):
  176. return False,0
  177. overlap=abs(abs(match2-skip1)-abs(match1+skip1-match2-hide2))
  178. if match2 > skip1:
  179. breakpoint1=pos1+match1-1-overlap
  180. else:
  181. breakpoint1=pos1+match1+overlap
  182. breakpoint2=pos2+match2-1
  183. breakpoint1_1=pos1+match1-1
  184. if skip1 >30:
  185. len2=30
  186. else:
  187. len2=skip1
  188. #print(match1)
  189. fusion_seq=seq[match1-30:match1+len2]
  190. #print(fusion_seq)
  191. breakpoint1_e=breakpoint1_1-30
  192. breakpoint2_e=breakpoint2-30
  193. #soft-clip front,hard-clip end
  194. #{0: 'S:51', 1: 'M:99'}
  195. #{0: 'M:53', 1: 'H:97'}
  196. elif 'S' in map_info1[0] and 'H' in map_info2[1]:
  197. match1=int(map_info1[1].split(":")[1])
  198. skip1=int(map_info1[0].split(":")[1])
  199. match2=int(map_info2[0].split(":")[1])
  200. hide2=int(map_info2[1].split(":")[1])
  201. if not check_match(match1,match2):
  202. return False,0
  203. overlap=abs(abs(match2-skip1)-abs(match1+skip1-match2-hide2))
  204. breakpoint1=pos1+overlap
  205. breakpoint2=pos2+match2-1
  206. breakpoint1_1=pos1
  207. fusion_seq=seq[skip1-30:skip1+30] if skip1 > 30 else seq[:skip1+30]
  208. breakpoint1_e=breakpoint1_1+30
  209. breakpoint2_e=breakpoint2-30
  210. #soft-clip front,hard-clip front
  211. #{0: 'S:51', 1: 'M:99'}
  212. #{0: 'H:97', 1: 'M:53'}
  213. elif 'S' in map_info1[0] and 'H' in map_info2[0]:
  214. match1=int(map_info1[1].split(":")[1])
  215. skip1=int(map_info1[0].split(":")[1])
  216. match2=int(map_info2[1].split(":")[1])
  217. if not check_match(match1,match2):
  218. return False,0
  219. overlap=abs(match2-skip1)
  220. breakpoint1=pos1+overlap
  221. breakpoint2=pos2
  222. breakpoint1_1=pos1
  223. fusion_seq=seq[skip1-30:skip1+30] if skip1 > 30 else seq[:skip1+30]
  224. breakpoint1_e=breakpoint1_1+30
  225. breakpoint2_e=breakpoint2+30
  226. elif len(map_info1)==3 and len(map_info2)==2:
  227. flag_bin=bin(flag)
  228. if flag_bin[-5] == '1':
  229. #soft-clip front
  230. if 'H' in map_info2[0]:
  231. #hard-clip front
  232. #{0: 'S:51', 1: 'M:99', 2:'S:5'}
  233. #{0: 'H:97', 1: 'M:53'}
  234. match1=int(map_info1[1].split(":")[1])
  235. skip1=int(map_info1[0].split(":")[1])
  236. match2=int(map_info2[1].split(":")[1])
  237. if not check_match(match1,match2):
  238. return False,0
  239. overlap=abs(match2-skip1)
  240. breakpoint1=pos1+overlap
  241. breakpoint2=pos2
  242. breakpoint1_1=pos1
  243. fusion_seq=seq[skip1-30:skip1+30] if skip1 > 30 else seq[:skip1+30]
  244. breakpoint1_e=breakpoint1_1+30
  245. breakpoint2_e=breakpoint2+30
  246. elif 'H' in map_info2[1]:
  247. #hard-clip end
  248. #{0: 'S:51', 1: 'M:99', 2:'S:5'}
  249. #{0: 'M:53', 1: 'H:97'}
  250. match1=int(map_info1[1].split(":")[1])
  251. skip1=int(map_info1[0].split(":")[1])
  252. match2=int(map_info2[0].split(":")[1])
  253. if not check_match(match1,match2):
  254. return False,0
  255. overlap=abs(match2-skip1)
  256. breakpoint1=pos1+overlap
  257. breakpoint2=pos2+match2-1
  258. breakpoint1_1=pos1
  259. fusion_seq=seq[skip1-30:skip1+30] if skip1 > 30 else seq[:skip1+30]
  260. breakpoint1_e=breakpoint1_1+30
  261. breakpoint2_e=breakpoint2-30
  262. elif flag_bin[-5] == '0':
  263. #soft-clip end
  264. if 'H' in map_info2[0]:
  265. #hard-clip front
  266. #{0: 'S:5', 1: 'M:99', 2:'S:51'}
  267. #{0: 'H:97', 1: 'M:53'}
  268. former_skip=int(map_info1[0].split(":")[1])
  269. match1=int(map_info1[1].split(":")[1])
  270. skip1=int(map_info1[2].split(":")[1])
  271. match2=int(map_info2[1].split(":")[1])
  272. if not check_match(match1,match2):
  273. return False,0
  274. overlap=abs(match2-skip1)
  275. breakpoint1=pos1+match1-1-overlap
  276. breakpoint2=pos2
  277. breakpoint1_1=pos1+match1-1
  278. if skip1 >30:
  279. len2=30
  280. else:
  281. len2=skip1
  282. fusion_seq=seq[former_skip+match1-30:former_skip+match1+len2]
  283. breakpoint1_e=breakpoint1_1-30
  284. breakpoint2_e=breakpoint2+30
  285. elif 'H' in map_info2[1]:
  286. #hard-clip end
  287. #{0: 'S:5', 1: 'M:99', 2:'S:51'}
  288. #{0: 'M:53', 1: 'H:97'}
  289. former_skip=int(map_info1[0].split(":")[1])
  290. match1=int(map_info1[1].split(":")[1])
  291. skip1=int(map_info1[2].split(":")[1])
  292. match2=int(map_info2[0].split(":")[1])
  293. if not check_match(match1,match2):
  294. return False,0
  295. overlap=abs(match2-skip1)
  296. breakpoint1=pos1+match1-1-overlap
  297. breakpoint2=pos2+match2-1
  298. breakpoint1_1=pos1+match1-1
  299. if skip1 >30:
  300. len2=30
  301. else:
  302. len2=skip1
  303. fusion_seq=seq[former_skip+match1-30:+former_skip+match1+len2]
  304. breakpoint1_e=breakpoint1_1-30
  305. breakpoint2_e=breakpoint2-30
  306. if abs(overlap) <= OVERLAP and breakpoint1 !=0:
  307. return True,chr1+":"+str(breakpoint1),chr2+":"+str(breakpoint2),chr1+":"+str(breakpoint1_1),fusion_seq,str(overlap),chr1+":"+str(breakpoint1_e),chr2+":"+str(breakpoint2_e)
  308. return False,0
  309. return False,0
  310. def hard_clip_fusion(reads_info):
  311. #R1:hard-clip
  312. R1_soft_info=reads_info[0]
  313. R1_hard_info=reads_info[1]
  314. if int(R1_hard_info[1])>256 and int(R1_soft_info[4])>=60:
  315. results=get_points(R1_soft_info,R1_hard_info)
  316. if results[0]:
  317. out_list=[results[1]+"-"+results[2],results[3],results[2],results[4],results[5],results[6],results[7]]
  318. return out_list
  319. #R2:hard-clip
  320. R2_soft_info=reads_info[-2]
  321. R2_hard_info=reads_info[-1]
  322. if int(R2_hard_info[1])>256 and int(R2_soft_info[4])>=60:
  323. results=get_points(R2_soft_info,R2_hard_info)
  324. if results[0]:
  325. out_list=[results[1]+"-"+results[2],results[3],results[2],results[4],results[5],results[6],results[7]]
  326. return out_list
  327. return []
  328. def primer_loc(primer,seq):
  329. primer_num=len(primer)
  330. seq_primer=seq[:primer_num]
  331. seq_rev_comp="".join([BASE_DICT[base] for base in seq][::-1])
  332. seq_rev_comp_primer=seq_rev_comp[:primer_num]
  333. if primer == seq_primer:
  334. return "Front"
  335. elif primer == seq_rev_comp_primer:
  336. return "End"
  337. return "NO"
  338. def get_pos_end(map_info,pos):
  339. chr=pos.split(":")[0]
  340. pos_temp=int(pos.split(":")[1])
  341. new_pos=0
  342. correct_pos=pos_temp
  343. if map_info[0][0]=='M':
  344. match_len=int(map_info[0].split(":")[1])
  345. pos_temp=pos_temp+match_len-1
  346. correct_pos=pos_temp
  347. new_pos=pos_temp-30
  348. elif map_info[0][0]=='S':
  349. new_pos=pos_temp+30
  350. else:
  351. return pos
  352. return chr+":"+str(correct_pos),chr+":"+str(new_pos)
  353. def IS_overlap(seq1,seq2,msimatch=2):
  354. num = len(seq1)
  355. count =0
  356. for i in range(num):
  357. if seq1[i] !=seq2[i]:
  358. count +=1
  359. if count >=msimatch:
  360. return False
  361. return True
  362. def soft_clip_fusion(reads_info):
  363. R1_read_info=reads_info[0]
  364. R2_read_info=reads_info[1]
  365. ID_info = R1_read_info[0].split("_")
  366. ID = R1_read_info[0]
  367. R1_primer_loc=""
  368. R1_seq=R1_read_info[9]
  369. R1_mismatch_seq=""
  370. #比对质量值至少一个MAQ>=60
  371. if int(R1_read_info[4])<60 and int(R2_read_info[4])<60:
  372. return False,"",""
  373. R1_loc=""
  374. R1_map_info={}
  375. R1_map_info_adj={}
  376. #R1 primer存在
  377. if len(ID_info)>=4:
  378. R1_primer=ID_info[3]
  379. R1_map_info=get_map_info(R1_read_info[5])
  380. R1_loc=primer_loc(R1_primer,R1_read_info[9])
  381. if len(R1_map_info)==3 and R1_map_info[0][0] == "S" and R1_map_info[2][0]=="S":
  382. end_mismatch_len=int(R1_map_info[2].split(":")[1])
  383. start_mismatch_len=int(R1_map_info[0].split(":")[1])
  384. R1_primer_len=len(R1_primer)
  385. if R1_loc == "End":
  386. if end_mismatch_len > R1_primer_len:
  387. R1_map_info_adj[0]=R1_map_info[1]
  388. R1_map_info_adj[1]=R1_map_info[2]
  389. R1_primer_loc="Same"
  390. mismatch_len=end_mismatch_len
  391. R1_seq="".join([BASE_DICT[base] for base in R1_seq][::-1])
  392. R1_mismatch_seq=R1_seq[:mismatch_len]
  393. else:
  394. R1_map_info_adj[0]=R1_map_info[0]
  395. R1_map_info_adj[1]=R1_map_info[1]
  396. R1_primer_loc="Not the Same"
  397. mismatch_len=start_mismatch_len
  398. R1_seq="".join([BASE_DICT[base] for base in R1_seq][::-1])
  399. R1_mismatch_seq=R1_seq[-1*mismatch_len:]
  400. elif R1_loc == "Front":
  401. if start_mismatch_len > R1_primer_len:
  402. R1_map_info_adj[0]=R1_map_info[0]
  403. R1_map_info_adj[1]=R1_map_info[1]
  404. R1_primer_loc="Same"
  405. mismatch_len=start_mismatch_len
  406. R1_mismatch_seq=R1_seq[:mismatch_len]
  407. else:
  408. R1_map_info_adj[0]=R1_map_info[1]
  409. R1_map_info_adj[1]=R1_map_info[2]
  410. R1_primer_loc="Not the Same"
  411. mismatch_len=end_mismatch_len
  412. R1_mismatch_seq=R1_seq[-1*mismatch_len:]
  413. elif len(R1_map_info)==2:
  414. R1_map_info_adj=R1_map_info
  415. if R1_loc == "End" and R1_map_info[0][0]=="S":
  416. R1_primer_loc="Not the Same"
  417. R1_seq="".join([BASE_DICT[base] for base in R1_seq][::-1])
  418. mismatch_len=int(R1_map_info[0].split(":")[1])
  419. R1_mismatch_seq=R1_seq[-1*mismatch_len:]
  420. elif R1_loc == "End" and R1_map_info[1][0]=="S":
  421. R1_primer_loc="Same"
  422. R1_seq="".join([BASE_DICT[base] for base in R1_seq][::-1])
  423. mismatch_len=int(R1_map_info[1].split(":")[1])
  424. #if len(R1_primer)+5 >=mismatch_len:
  425. if len(R1_primer) >=mismatch_len:
  426. return False,"",""
  427. R1_mismatch_seq=R1_seq[:mismatch_len]
  428. elif R1_loc == "Front" and R1_map_info[0][0]=="S":
  429. R1_primer_loc="Same"
  430. mismatch_len=int(R1_map_info[0].split(":")[1])
  431. #if len(R1_primer)+5 >=mismatch_len:
  432. if len(R1_primer) >=mismatch_len:
  433. return False,"",""
  434. R1_mismatch_seq=R1_seq[:mismatch_len]
  435. elif R1_loc == "Front" and R1_map_info[1][0] =="S":
  436. R1_primer_loc="Not the Same"
  437. mismatch_len=int(R1_map_info[1].split(":")[1])
  438. R1_mismatch_seq=R1_seq[-1*mismatch_len:]
  439. #no soft clip
  440. else:
  441. R1_primer_loc="None"
  442. R2_primer_loc=""
  443. R2_seq=R2_read_info[9]
  444. R2_mismatch_seq=""
  445. R2_loc=""
  446. R2_map_info={}
  447. R2_map_info_adj={}
  448. #R1,R2都存在primer
  449. if len(ID_info)==5:
  450. R2_primer=ID_info[4]
  451. R2_map_info=get_map_info(R2_read_info[5])
  452. R2_loc=primer_loc(R2_primer,R2_read_info[9])
  453. if len(R2_map_info)==3 and R2_map_info[0][0] == "S" and R2_map_info[2][0]=="S":
  454. end_mismatch_len=int(R2_map_info[2].split(":")[1])
  455. start_mismatch_len=int(R2_map_info[0].split(":")[1])
  456. R2_primer_len=len(R2_primer)
  457. if R2_loc == "End":
  458. if end_mismatch_len > R2_primer_len:
  459. R2_map_info_adj[0]=R2_map_info[1]
  460. R2_map_info_adj[1]=R2_map_info[2]
  461. R2_primer_loc="Same"
  462. mismatch_len=end_mismatch_len
  463. R2_mismatch_seq=R2_seq[-1*mismatch_len:]
  464. else:
  465. R2_map_info_adj[0]=R2_map_info[0]
  466. R2_map_info_adj[1]=R2_map_info[1]
  467. R2_primer_loc="Not the Same"
  468. mismatch_len=start_mismatch_len
  469. R2_mismatch_seq=R2_seq[:mismatch_len:]
  470. if R2_loc == "Front":
  471. if start_mismatch_len > R2_primer_len:
  472. R2_map_info_adj[0]=R2_map_info[0]
  473. R2_map_info_adj[1]=R2_map_info[1]
  474. R2_primer_loc="Same"
  475. mismatch_len=start_mismatch_len
  476. R2_seq="".join([BASE_DICT[base] for base in R2_seq][::-1])
  477. R2_mismatch_seq=R2_seq[-1*mismatch_len:]
  478. else:
  479. R2_map_info_adj[0]=R2_map_info[1]
  480. R2_map_info_adj[1]=R2_map_info[2]
  481. R2_primer_loc="Not the Same"
  482. mismatch_len=end_mismatch_len
  483. R2_seq="".join([BASE_DICT[base] for base in R2_seq][::-1])
  484. R2_mismatch_seq=R2_seq[:mismatch_len]
  485. elif len(R2_map_info)==2:
  486. R2_map_info_adj=R2_map_info
  487. if R2_loc == "End" and R2_map_info[0][0]=="S":
  488. R2_primer_loc="Not the Same"
  489. mismatch_len=int(R2_map_info[0].split(":")[1])
  490. R2_mismatch_seq=R2_seq[:mismatch_len]
  491. elif R2_loc == "End" and R2_map_info[1][0]=="S":
  492. R2_primer_loc="Same"
  493. mismatch_len=int(R2_map_info[1].split(":")[1])
  494. #if len(R2_primer)+5 >=mismatch_len:
  495. if len(R2_primer) >=mismatch_len:
  496. return False,"",""
  497. R2_mismatch_seq=R2_seq[-1*mismatch_len:]
  498. elif R2_loc == "Front" and R2_map_info[0][0]=="S":
  499. R2_primer_loc="Same"
  500. R2_seq="".join([BASE_DICT[base] for base in R2_seq][::-1])
  501. mismatch_len=int(R2_map_info[0].split(":")[1])
  502. #if len(R2_primer)+5 >= mismatch_len:
  503. if len(R2_primer) >= mismatch_len:
  504. return False,"",""
  505. R2_mismatch_seq=R2_seq[-1*mismatch_len:]
  506. elif R2_loc == "Front" and R2_map_info[1][0] =="S":
  507. R2_primer_loc="Not the Same"
  508. R2_seq="".join([BASE_DICT[base] for base in R2_seq][::-1])
  509. mismatch_len=int(R2_map_info[1].split(":")[1])
  510. R2_mismatch_seq=R2_seq[:mismatch_len]
  511. else:
  512. R2_primer_loc="None"
  513. if R1_primer_loc=="Not the Same" and R2_primer_loc == "Not the Same":
  514. R1_mismatch_len=len(R1_mismatch_seq)
  515. R2_mismatch_len=len(R2_mismatch_seq)
  516. if R1_mismatch_len>=5 and R2_mismatch_len >=5:
  517. for index in range(5):
  518. R2_match_seq = R2_seq[index+R2_mismatch_len:index+R2_mismatch_len+R1_mismatch_len]
  519. if R2_match_seq == R1_mismatch_seq:
  520. R2_match_seq_tmp = R2_seq[:index+R2_mismatch_len+R1_mismatch_len]
  521. R1_match_seq_tmp = R1_seq[-1*len(R2_match_seq_tmp):]
  522. if R2_match_seq_tmp == R1_match_seq_tmp:
  523. FusionSeq=R1_seq[-1*R1_mismatch_len-30:-1*R1_mismatch_len]+R2_seq[R2_mismatch_len+index:R2_mismatch_len+index+30]
  524. R1_point=R1_read_info[2]+":"+R1_read_info[3]
  525. if R2_loc == "End":
  526. R2_point=R2_read_info[2]+":"+str(int(R2_read_info[3])+index)
  527. elif R2_loc == "Front":
  528. R2_point=R2_read_info[2]+":"+str(int(R2_read_info[3])-index)
  529. else:
  530. return True,"Double",outstr
  531. R1_point_list=get_pos_end(R1_map_info_adj,R1_point)
  532. R2_point_list=get_pos_end(R2_map_info_adj,R2_point)
  533. outstr=R1_point+"-"+R2_point+"\t"+R1_read_info[2]+":"+R1_read_info[3]+"\t"+R2_read_info[2]+":"+R2_read_info[3]+"\t"+FusionSeq+"\t"+str(index)+"\t"+R1_point_list[1]+"\t"+R2_point_list[1]+"\t"+ID+"\n"
  534. return True,"Double",outstr
  535. elif R1_primer_loc=="Not the Same" and R2_primer_loc == "Same":
  536. R1_mismatch_len=len(R1_mismatch_seq)
  537. R2_mismatch_len=len(R2_mismatch_seq)
  538. common_len=min(R1_mismatch_len,R2_mismatch_len)
  539. R1_mismatch_tmp=R1_mismatch_seq[-1*R1_mismatch_len:-1*common_len]
  540. R2_mismatch_tmp=R2_mismatch_seq[-1*R2_mismatch_len:-1*common_len]
  541. R2_match_seq=R2_seq[:-1*R2_mismatch_len]
  542. if R1_mismatch_tmp==R2_mismatch_tmp:
  543. R2_point=R2_read_info[2]+":"+R2_read_info[3]
  544. R2_point_list=get_pos_end(R2_map_info_adj,R2_point)
  545. otustr=R2_point_list[0]+"\tDown\t"+R2_mismatch_seq+"\t"+R2_match_seq+"\t"+R2_point_list[1]+"\t"+ID+"\n"
  546. return True,"Single",otustr
  547. elif R1_primer_loc=="Same" and R2_primer_loc == "Same":
  548. return False,"",""
  549. elif R1_primer_loc=="Same" and R2_primer_loc == "Not the Same":
  550. R1_mismatch_len=len(R1_mismatch_seq)
  551. R2_mismatch_len=len(R2_mismatch_seq)
  552. common_len=min(R1_mismatch_len,R2_mismatch_len)
  553. R1_mismatch_tmp=R1_mismatch_seq[R1_mismatch_len-1*common_len:R1_mismatch_len]
  554. R2_mismatch_tmp=R2_mismatch_seq[R2_mismatch_len-1*common_len:R2_mismatch_len]
  555. R1_match_seq=R1_seq[R1_mismatch_len:]
  556. if IS_overlap(R1_mismatch_tmp,R2_mismatch_tmp):
  557. R1_point=R1_read_info[2]+":"+R1_read_info[3]
  558. R1_point_list=get_pos_end(R1_map_info_adj,R1_point)
  559. otustr=R1_point_list[0]+"\tUp\t"+R1_mismatch_seq+"\t"+R1_match_seq+"\t"+R1_point_list[1]+"\t"+ID+"\n"
  560. return True,"Single",otustr
  561. else:
  562. return False,"",""
  563. '''
  564. elif R1_primer_loc=="Same" and R2_primer_loc == "None":
  565. R1_point=R1_read_info[2]+":"+R1_read_info[3]
  566. R1_point_list=get_pos_end(R1_map_info_adj,R1_point)
  567. R1_mismatch_len=len(R1_mismatch_seq)
  568. R1_match_seq=R1_seq[R1_mismatch_len:]
  569. otustr=R1_point_list[0]+"\tUp\t"+R1_mismatch_seq+"\t"+R1_match_seq+"\t"+R1_point_list[1]+"\t"+ID+"\n"
  570. return True,"Single",otustr
  571. elif R1_primer_loc=="Not the Same" and R2_primer_loc == "None":
  572. R1_point=R1_read_info[2]+":"+R1_read_info[3]
  573. R1_point_list=get_pos_end(R1_map_info_adj,R1_point)
  574. R1_mismatch_len=len(R1_mismatch_seq)
  575. R1_match_seq=R1_seq[:-1*R1_mismatch_len]
  576. otustr=R1_point_list[0]+"\tDown\t"+R1_mismatch_seq+"\t"+R1_match_seq+"\t"+R1_point_list[1]+"\t"+ID+"\n"
  577. return True,"Single",otustr
  578. elif R1_primer_loc=="None" and R2_primer_loc == "Same":
  579. R2_point=R2_read_info[2]+":"+R2_read_info[3]
  580. R2_point_list=get_pos_end(R2_map_info_adj,R2_point)
  581. R2_mismatch_len=len(R2_mismatch_seq)
  582. R2_match_seq=R2_seq[:-1*R2_mismatch_len]
  583. otustr=R2_point_list[0]+"\tDown\t"+R2_mismatch_seq+"\t"+R2_match_seq+"\t"+R2_point_list[1]+"\t"+ID+"\n"
  584. return True,"Single",otustr
  585. elif R1_primer_loc=="None" and R2_primer_loc == "Not the Same":
  586. R2_point=R2_read_info[2]+":"+R2_read_info[3]
  587. R2_point_list=get_pos_end(R2_map_info_adj,R2_point)
  588. R2_mismatch_len=len(R2_mismatch_seq)
  589. R2_match_seq=R2_seq[R2_mismatch_len:]
  590. otustr=R2_point_list[0]+"\tUp\t"+R2_mismatch_seq+"\t"+R2_match_seq+"\t"+R2_point_list[1]+"\t"+ID+"\n"
  591. return True,"Single",otustr
  592. '''
  593. return False,"",""
  594. def main(insam,outdir):
  595. t1=time.time()
  596. #将sam内容按照ID区分,放到ID_info中
  597. ID_dict=read_sam(insam)
  598. #存放soft_clip的reads ID
  599. soft_clip_ID=[]
  600. #输出到本地hard clip fusion
  601. hard_clip_file=outdir+"/hard_clip_fusion_breakpoints.txt"
  602. hard_clip_out=open(hard_clip_file,'w')
  603. header=["Site","Site1","Site2","Seq","Overlap","Site1_Down","Site2_Down","ReadsID"]
  604. hard_clip_out.write("\t".join(header)+"\n")
  605. for ID in ID_dict.keys():
  606. reads_info = ID_dict[ID]
  607. clip_stat=check_clip(reads_info)
  608. if clip_stat == "soft_clip":
  609. soft_clip_ID.append(ID)
  610. elif clip_stat == "hard_clip":
  611. out_list=hard_clip_fusion(reads_info)
  612. if len(out_list) !=0:
  613. out_list.append(ID)
  614. hard_clip_out.write("\t".join(out_list)+"\n")
  615. hard_clip_out.close()
  616. #输出到本地single
  617. single_points=outdir+"/soft_clip_single_breakpoints.txt"
  618. SingleInput=open(single_points,'w')
  619. header=["Site","Up_Down","MismatchSeq","MatchSeq","Site_Down","ReadsID"]
  620. SingleInput.write("\t".join(header)+"\n")
  621. #输出到本地double
  622. double_points=outdir+"/soft_clip_double_breakpoints.txt"
  623. DoubleInput=open(double_points,'w')
  624. #DoubleInput.write("\t".join(header)+"\n")
  625. #
  626. for soft_ID in soft_clip_ID:
  627. reads_info = ID_dict[soft_ID]
  628. soft_clip_results=soft_clip_fusion(reads_info)
  629. if soft_clip_results[0]:
  630. if soft_clip_results[1]=="Double":
  631. DoubleInput.write(soft_clip_results[2])
  632. elif soft_clip_results[1]=="Single":
  633. SingleInput.write(soft_clip_results[2])
  634. SingleInput.close()
  635. DoubleInput.close()
  636. double_points_file=outdir+"/double_breakpoints.txt"
  637. os.system("cat "+hard_clip_file+" > "+double_points_file+" && cat "+double_points+" >> "+double_points_file+" && rm "+hard_clip_file+" && rm "+double_points)
  638. single_points_file=outdir+"/single_breakpoints.txt"
  639. os.system("cp "+single_points+" "+single_points_file+" && rm "+single_points)
  640. t2=time.time()
  641. print("Times: "+str(int(t2-t1))+"s")
  642. print("BreakPoints Search Done!")
  643. if __name__ == '__main__':
  644. parser = argparse.ArgumentParser(description='get reads mapping location')
  645. parser.add_argument('-i', required=True, type=str, help="insam")
  646. parser.add_argument('-o', required=True, type=str, help="outdir")
  647. args = parser.parse_args()
  648. main(args.i, args.o)