breakpoint_search_v6.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663
  1. #!/usr/bin/python
  2. # -*- coding:utf-8 -*-
  3. import argparse,time,os
  4. BASE_DICT={'A':'T','G':'C','T':'A','C':'G','N':'N'}
  5. def read_sam(insam):
  6. ID_dict={}
  7. with open(insam,'r') as inputSam:
  8. for line in inputSam:
  9. if not line.startswith("@"):
  10. info=line.strip("\n").split("\t")
  11. #ID_info = info[0].split("_")
  12. ID = info[0]
  13. if ID not in ID_dict.keys():
  14. ID_dict[ID]=[info]
  15. else:
  16. ID_dict[ID].append(info)
  17. return ID_dict
  18. def check_clip(reads_info):
  19. if len(reads_info)>=3:
  20. return "hard_clip"
  21. elif len(reads_info)==2:
  22. for item in reads_info:
  23. if "S" in item[5]:
  24. return "soft_clip"
  25. return "no_clip"
  26. def get_map_info(map_str):
  27. #99M51S
  28. num_list=[]
  29. map_dict={}
  30. count=0
  31. for item in map_str:
  32. if item.isalpha():
  33. num=len(num_list)
  34. index=0
  35. for i in range(num):
  36. index += int(num_list[i])*10**(num-i-1)
  37. map_dict[count]=item+":"+str(index)
  38. count +=1
  39. num_list=[]
  40. elif item.isdigit():
  41. num_list.append(item)
  42. return map_dict
  43. def check_match(match1,match2):
  44. if match1<30 or match2<30:
  45. return False
  46. return True
  47. def adj_info(seq,info1):
  48. flag = True
  49. for key in info1.keys():
  50. item = info1[key]
  51. if 'D' in item or 'I' in item:
  52. flag = False
  53. if 'H' in info1[0]:
  54. flag = False
  55. #不需要调整
  56. if flag:
  57. return seq,info1
  58. else:
  59. new_seq_list=[]
  60. new_info1={}
  61. start=0
  62. old_item=''
  63. for key in info1.keys():
  64. item = info1[key]
  65. temp_len=int(item.split(":")[1])
  66. end = start+temp_len
  67. if item.startswith('M') or item.startswith('S'):
  68. for index in range(start,end):
  69. new_seq_list.append(seq[index])
  70. old_item=item
  71. elif item.startswith('D') and old_item.startswith('M'):
  72. end = end-temp_len
  73. for index in range(temp_len):
  74. new_seq_list.append('-')
  75. elif item.startswith('H'):
  76. end = end-temp_len
  77. start = end
  78. #
  79. num=len(info1)
  80. index =0
  81. new_index= 0
  82. while index+3 <= num:
  83. item1=info1[index]
  84. item2=info1[index+1]
  85. item3=info1[index+2]
  86. if item1.startswith("M") and item2.startswith("D") and item3.startswith("M"):
  87. pos1=int(item1.split(":")[1])
  88. pos2=int(item2.split(":")[1])
  89. pos3=int(item3.split(":")[1])
  90. new_info1[new_index] = 'M:'+str(pos1+pos2+pos3)
  91. new_index +=1
  92. index +=3
  93. elif item1.startswith("M") and item2.startswith("I") and item3.startswith("M"):
  94. pos1=int(item1.split(":")[1])
  95. pos3=int(item3.split(":")[1])
  96. new_info1[new_index] = 'M:'+str(pos1+pos3)
  97. new_index +=1
  98. index +=3
  99. elif item1.startswith("H") and item2.startswith("M"):
  100. new_info1[new_index] = item2
  101. new_index +=1
  102. index +=2
  103. else:
  104. new_info1[new_index] = item1
  105. new_index += 1
  106. index += 1
  107. while index < num:
  108. new_info1[new_index]=info1[index]
  109. new_index += 1
  110. index += 1
  111. return "".join(new_seq_list),new_info1
  112. def get_points(info1,info2,OVERLAP=10):
  113. #OVERLAP默认值是10,默认不用这个参数值
  114. if len(info1)>=5 and len(info2)>=5:
  115. map_info1=get_map_info(info1[5])
  116. map_info2=get_map_info(info2[5])
  117. seq1=info1[9]
  118. seq2=info2[9]
  119. #print(map_info1)
  120. #print(seq1)
  121. info1_temp=adj_info(seq1,map_info1)
  122. seq=info1_temp[0]
  123. map_info1=info1_temp[1]
  124. #print(map_info1)
  125. #print(seq)
  126. #print(map_info2)
  127. info2_temp=adj_info(seq2,map_info2)
  128. seq2=info2_temp[0]
  129. map_info2=info2_temp[1]
  130. #print(map_info2)
  131. chr1=info1[2]
  132. pos1=int(info1[3])
  133. chr2=info2[2]
  134. pos2=int(info2[3])
  135. flag = int(info1[1])
  136. overlap=0
  137. breakpoint1=0
  138. breakpoint1_1=0
  139. breakpoint2=0
  140. breakpoint1_e=0
  141. breakpoint2_e=0
  142. fusion_seq=""
  143. if len(map_info1)==2 and len(map_info2)==2:
  144. #soft-clip end,hard-clip front
  145. #{0: 'M:131', 1: 'S:19'}
  146. #{0: 'H:118', 1: 'M:32'}
  147. if 'S' in map_info1[1] and 'H' in map_info2[0]:
  148. match1=int(map_info1[0].split(":")[1])
  149. skip1=int(map_info1[1].split(":")[1])
  150. match2=int(map_info2[1].split(":")[1])
  151. if not check_match(match1,match2):
  152. return False,0
  153. overlap=abs(match2-skip1)
  154. breakpoint1=pos1+match1-1-overlap
  155. breakpoint2=pos2
  156. breakpoint1_1=pos1+match1-1
  157. if skip1 >30:
  158. len2=30
  159. else:
  160. len2=skip1
  161. fusion_seq=seq[match1-30:match1+len2]
  162. breakpoint1_e=breakpoint1_1-30
  163. breakpoint2_e=breakpoint2+30
  164. #soft-clip end,hard-clip end
  165. #{0: 'M:99', 1: 'S:51'}
  166. #{0: 'M:53', 1: 'H:97'}
  167. elif 'S' in map_info1[1] and 'H' in map_info2[1]:
  168. match1=int(map_info1[0].split(":")[1])
  169. skip1=int(map_info1[1].split(":")[1])
  170. match2=int(map_info2[0].split(":")[1])
  171. hide2=int(map_info2[1].split(":")[1])
  172. if not check_match(match1,match2):
  173. return False,0
  174. overlap=abs(abs(match2-skip1)-abs(match1+skip1-match2-hide2))
  175. breakpoint1=pos1+match1-1-overlap
  176. breakpoint2=pos2+match2-1
  177. breakpoint1_1=pos1+match1-1
  178. if skip1 >30:
  179. len2=30
  180. else:
  181. len2=skip1
  182. #print(match1)
  183. fusion_seq=seq[match1-30:match1+len2]
  184. #print(fusion_seq)
  185. breakpoint1_e=breakpoint1_1-30
  186. breakpoint2_e=breakpoint2-30
  187. #soft-clip front,hard-clip end
  188. #{0: 'S:51', 1: 'M:99'}
  189. #{0: 'M:53', 1: 'H:97'}
  190. elif 'S' in map_info1[0] and 'H' in map_info2[1]:
  191. match1=int(map_info1[1].split(":")[1])
  192. skip1=int(map_info1[0].split(":")[1])
  193. match2=int(map_info2[0].split(":")[1])
  194. hide2=int(map_info2[1].split(":")[1])
  195. if not check_match(match1,match2):
  196. return False,0
  197. overlap=abs(abs(match2-skip1)-abs(match1+skip1-match2-hide2))
  198. breakpoint1=pos1+overlap
  199. breakpoint2=pos2+match2-1
  200. breakpoint1_1=pos1
  201. fusion_seq=seq[skip1-30:skip1+30] if skip1 > 30 else seq[:skip1+30]
  202. breakpoint1_e=breakpoint1_1+30
  203. breakpoint2_e=breakpoint2-30
  204. #soft-clip front,hard-clip front
  205. #{0: 'S:51', 1: 'M:99'}
  206. #{0: 'H:97', 1: 'M:53'}
  207. elif 'S' in map_info1[0] and 'H' in map_info2[0]:
  208. match1=int(map_info1[1].split(":")[1])
  209. skip1=int(map_info1[0].split(":")[1])
  210. match2=int(map_info2[1].split(":")[1])
  211. if not check_match(match1,match2):
  212. return False,0
  213. overlap=abs(match2-skip1)
  214. breakpoint1=pos1+overlap
  215. breakpoint2=pos2
  216. breakpoint1_1=pos1
  217. fusion_seq=seq[skip1-30:skip1+30] if skip1 > 30 else seq[:skip1+30]
  218. breakpoint1_e=breakpoint1_1+30
  219. breakpoint2_e=breakpoint2+30
  220. elif len(map_info1)==3 and len(map_info2)==2:
  221. flag_bin=bin(flag)
  222. if flag_bin[-5] == '1':
  223. #soft-clip front
  224. if 'H' in map_info2[0]:
  225. #hard-clip front
  226. #{0: 'S:51', 1: 'M:99', 2:'S:5'}
  227. #{0: 'H:97', 1: 'M:53'}
  228. match1=int(map_info1[1].split(":")[1])
  229. skip1=int(map_info1[0].split(":")[1])
  230. match2=int(map_info2[1].split(":")[1])
  231. if not check_match(match1,match2):
  232. return False,0
  233. overlap=abs(match2-skip1)
  234. breakpoint1=pos1+overlap
  235. breakpoint2=pos2
  236. breakpoint1_1=pos1
  237. fusion_seq=seq[skip1-30:skip1+30] if skip1 > 30 else seq[:skip1+30]
  238. breakpoint1_e=breakpoint1_1+30
  239. breakpoint2_e=breakpoint2+30
  240. elif 'H' in map_info2[1]:
  241. #hard-clip end
  242. #{0: 'S:51', 1: 'M:99', 2:'S:5'}
  243. #{0: 'M:53', 1: 'H:97'}
  244. match1=int(map_info1[1].split(":")[1])
  245. skip1=int(map_info1[0].split(":")[1])
  246. match2=int(map_info2[0].split(":")[1])
  247. if not check_match(match1,match2):
  248. return False,0
  249. overlap=abs(match2-skip1)
  250. breakpoint1=pos1+overlap
  251. breakpoint2=pos2+match2-1
  252. breakpoint1_1=pos1
  253. fusion_seq=seq[skip1-30:skip1+30] if skip1 > 30 else seq[:skip1+30]
  254. breakpoint1_e=breakpoint1_1+30
  255. breakpoint2_e=breakpoint2-30
  256. elif flag_bin[-5] == '0':
  257. #soft-clip end
  258. if 'H' in map_info2[0]:
  259. #hard-clip front
  260. #{0: 'S:5', 1: 'M:99', 2:'S:51'}
  261. #{0: 'H:97', 1: 'M:53'}
  262. former_skip=int(map_info1[0].split(":")[1])
  263. match1=int(map_info1[1].split(":")[1])
  264. skip1=int(map_info1[2].split(":")[1])
  265. match2=int(map_info2[1].split(":")[1])
  266. if not check_match(match1,match2):
  267. return False,0
  268. overlap=abs(match2-skip1)
  269. breakpoint1=pos1+match1-1-overlap
  270. breakpoint2=pos2
  271. breakpoint1_1=pos1+match1-1
  272. if skip1 >30:
  273. len2=30
  274. else:
  275. len2=skip1
  276. fusion_seq=seq[former_skip+match1-30:former_skip+match1+len2]
  277. breakpoint1_e=breakpoint1_1-30
  278. breakpoint2_e=breakpoint2+30
  279. elif 'H' in map_info2[1]:
  280. #hard-clip end
  281. #{0: 'S:5', 1: 'M:99', 2:'S:51'}
  282. #{0: 'M:53', 1: 'H:97'}
  283. former_skip=int(map_info1[0].split(":")[1])
  284. match1=int(map_info1[1].split(":")[1])
  285. skip1=int(map_info1[2].split(":")[1])
  286. match2=int(map_info2[0].split(":")[1])
  287. if not check_match(match1,match2):
  288. return False,0
  289. overlap=abs(match2-skip1)
  290. breakpoint1=pos1+match1-1-overlap
  291. breakpoint2=pos2+match2-1
  292. breakpoint1_1=pos1+match1-1
  293. if skip1 >30:
  294. len2=30
  295. else:
  296. len2=skip1
  297. fusion_seq=seq[former_skip+match1-30:+former_skip+match1+len2]
  298. breakpoint1_e=breakpoint1_1-30
  299. breakpoint2_e=breakpoint2-30
  300. if overlap <= OVERLAP and breakpoint1 !=0:
  301. return True,chr1+":"+str(breakpoint1),chr2+":"+str(breakpoint2),chr1+":"+str(breakpoint1_1),fusion_seq,str(overlap),chr1+":"+str(breakpoint1_e),chr2+":"+str(breakpoint2_e)
  302. return False,0
  303. return False,0
  304. def hard_clip_fusion(reads_info):
  305. #R1:hard-clip
  306. R1_soft_info=reads_info[0]
  307. R1_hard_info=reads_info[1]
  308. if int(R1_hard_info[1])>256 and int(R1_soft_info[4])>=30:
  309. results=get_points(R1_soft_info,R1_hard_info)
  310. if results[0]:
  311. out_list=[results[1]+"-"+results[2],results[3],results[2],results[4],results[5],results[6],results[7]]
  312. return out_list
  313. #R2:hard-clip
  314. R2_soft_info=reads_info[-2]
  315. R2_hard_info=reads_info[-1]
  316. if int(R2_hard_info[1])>256 and int(R2_soft_info[4])>=30:
  317. results=get_points(R2_soft_info,R2_hard_info)
  318. if results[0]:
  319. out_list=[results[1]+"-"+results[2],results[3],results[2],results[4],results[5],results[6],results[7]]
  320. return out_list
  321. return []
  322. def primer_loc(primer,seq):
  323. primer_num=len(primer)
  324. seq_primer=seq[:primer_num]
  325. seq_rev_comp="".join([BASE_DICT[base] for base in seq][::-1])
  326. seq_rev_comp_primer=seq_rev_comp[:primer_num]
  327. if primer == seq_primer:
  328. return "Front"
  329. elif primer == seq_rev_comp_primer:
  330. return "End"
  331. return "NO"
  332. def get_pos_end(map_info,pos):
  333. chr=pos.split(":")[0]
  334. pos_temp=int(pos.split(":")[1])
  335. new_pos=0
  336. correct_pos=pos_temp
  337. if map_info[0][0]=='M':
  338. match_len=int(map_info[0].split(":")[1])
  339. pos_temp=pos_temp+match_len-1
  340. correct_pos=pos_temp
  341. new_pos=pos_temp-30
  342. elif map_info[0][0]=='S':
  343. new_pos=pos_temp+30
  344. else:
  345. return pos
  346. return chr+":"+str(correct_pos),chr+":"+str(new_pos)
  347. def soft_clip_fusion(reads_info):
  348. R1_read_info=reads_info[0]
  349. R2_read_info=reads_info[1]
  350. ID_info = R1_read_info[0].split("_")
  351. ID = R1_read_info[0]
  352. R1_primer_loc=""
  353. R1_seq=R1_read_info[9]
  354. R1_mismatch_seq=""
  355. #比对质量值至少一个MAQ>=60
  356. if int(R1_read_info[4])<60 and int(R2_read_info[4])<60:
  357. return False,"",""
  358. R1_loc=""
  359. R1_map_info={}
  360. R2_map_info={}
  361. #R1 primer存在
  362. if len(ID_info)>=4:
  363. R1_primer=ID_info[3]
  364. R1_map_info=get_map_info(R1_read_info[5])
  365. R1_loc=primer_loc(R1_primer,R1_read_info[9])
  366. if len(R1_map_info)==3 and R1_map_info[0][0] == "S" and R1_map_info[2][0]=="S":
  367. if R1_loc == "End":
  368. R1_primer_loc="Not the Same"
  369. R1_seq="".join([BASE_DICT[base] for base in R1_seq][::-1])
  370. mismatch_len=int(R1_map_info[0].split(":")[1])
  371. R1_mismatch_seq=R1_seq[-1*mismatch_len:]
  372. if R1_loc == "Front":
  373. R1_primer_loc="Not the Same"
  374. mismatch_len=int(R1_map_info[2].split(":")[1])
  375. R1_mismatch_seq=R1_seq[-1*mismatch_len:]
  376. elif len(R1_map_info)==2:
  377. if R1_loc == "End" and R1_map_info[0][0]=="S":
  378. R1_primer_loc="Not the Same"
  379. R1_seq="".join([BASE_DICT[base] for base in R1_seq][::-1])
  380. mismatch_len=int(R1_map_info[0].split(":")[1])
  381. R1_mismatch_seq=R1_seq[-1*mismatch_len:]
  382. elif R1_loc == "End" and R1_map_info[1][0]=="S":
  383. R1_primer_loc="Same"
  384. R1_seq="".join([BASE_DICT[base] for base in R1_seq][::-1])
  385. mismatch_len=int(R1_map_info[1].split(":")[1])
  386. if len(R1_primer)+5 >=mismatch_len:
  387. return False,"",""
  388. R1_mismatch_seq=R1_seq[:mismatch_len]
  389. elif R1_loc == "Front" and R1_map_info[0][0]=="S":
  390. R1_primer_loc="Same"
  391. mismatch_len=int(R1_map_info[0].split(":")[1])
  392. if len(R1_primer)+5 >=mismatch_len:
  393. return False,"",""
  394. R1_mismatch_seq=R1_seq[:mismatch_len]
  395. elif R1_loc == "Front" and R1_map_info[1][0] =="S":
  396. R1_primer_loc="Not the Same"
  397. mismatch_len=int(R1_map_info[1].split(":")[1])
  398. R1_mismatch_seq=R1_seq[-1*mismatch_len:]
  399. R2_primer_loc=""
  400. R2_seq=R2_read_info[9]
  401. R2_mismatch_seq=""
  402. R2_loc=""
  403. R2_map_info={}
  404. #R1,R2都存在primer
  405. if len(ID_info)==5:
  406. R2_primer=ID_info[4]
  407. R2_map_info=get_map_info(R2_read_info[5])
  408. R2_loc=primer_loc(R2_primer,R2_read_info[9])
  409. if len(R2_map_info)==3 and R2_map_info[0][0] == "S" and R2_map_info[2][0]=="S":
  410. if R2_loc == "End":
  411. R2_primer_loc="Not the Same"
  412. mismatch_len=int(R2_map_info[0].split(":")[1])
  413. R2_mismatch_seq=R2_seq[:mismatch_len]
  414. if R2_loc == "Front":
  415. R2_primer_loc="Not the Same"
  416. R2_seq="".join([BASE_DICT[base] for base in R2_seq][::-1])
  417. mismatch_len=int(R2_map_info[2].split(":")[1])
  418. R2_mismatch_seq=R2_seq[:mismatch_len]
  419. elif len(R2_map_info)==2:
  420. if R2_loc == "End" and R2_map_info[0][0]=="S":
  421. R2_primer_loc="Not the Same"
  422. mismatch_len=int(R2_map_info[0].split(":")[1])
  423. R2_mismatch_seq=R2_seq[:mismatch_len]
  424. elif R2_loc == "End" and R2_map_info[1][0]=="S":
  425. R2_primer_loc="Same"
  426. mismatch_len=int(R2_map_info[1].split(":")[1])
  427. if len(R2_primer)+5 >=mismatch_len:
  428. return False,"",""
  429. R2_mismatch_seq=R2_seq[-1*mismatch_len:]
  430. elif R2_loc == "Front" and R2_map_info[0][0]=="S":
  431. R2_primer_loc="Same"
  432. R2_seq="".join([BASE_DICT[base] for base in R2_seq][::-1])
  433. mismatch_len=int(R2_map_info[0].split(":")[1])
  434. if len(R2_primer)+5 >= mismatch_len:
  435. return False,"",""
  436. R2_mismatch_seq=R2_seq[-1*mismatch_len:]
  437. elif R2_loc == "Front" and R2_map_info[1][0] =="S":
  438. R2_primer_loc="Not the Same"
  439. R2_seq="".join([BASE_DICT[base] for base in R2_seq][::-1])
  440. mismatch_len=int(R2_map_info[1].split(":")[1])
  441. R2_mismatch_seq=R2_seq[:mismatch_len]
  442. if R1_primer_loc=="Not the Same" and R2_primer_loc == "Not the Same":
  443. R1_mismatch_len=len(R1_mismatch_seq)
  444. R2_mismatch_len=len(R2_mismatch_seq)
  445. if R1_mismatch_len>=5 and R2_mismatch_len >=5:
  446. for index in range(5):
  447. R2_match_seq = R2_seq[index+R2_mismatch_len:index+R2_mismatch_len+R1_mismatch_len]
  448. if R2_match_seq == R1_mismatch_seq:
  449. R2_match_seq_tmp = R2_seq[:index+R2_mismatch_len+R1_mismatch_len]
  450. R1_match_seq_tmp = R1_seq[-1*len(R2_match_seq_tmp):]
  451. if R2_match_seq_tmp == R1_match_seq_tmp:
  452. FusionSeq=R1_seq[-1*R1_mismatch_len-30:-1*R1_mismatch_len]+R2_seq[R2_mismatch_len+index:R2_mismatch_len+index+30]
  453. R1_point=R1_read_info[2]+":"+R1_read_info[3]
  454. if R2_loc == "End":
  455. R2_point=R2_read_info[2]+":"+str(int(R2_read_info[3])+index)
  456. elif R2_loc == "Front":
  457. R2_point=R2_read_info[2]+":"+str(int(R2_read_info[3])-index)
  458. else:
  459. return True,"Double",outstr
  460. R1_point_list=get_pos_end(R1_map_info,R1_point)
  461. R2_point_list=get_pos_end(R2_map_info,R2_point)
  462. outstr=R1_point+"-"+R2_point+"\t"+R1_read_info[2]+":"+R1_read_info[3]+"\t"+R2_read_info[2]+":"+R2_read_info[3]+"\t"+FusionSeq+"\t"+str(index)+"\t"+R1_point_list[1]+"\t"+R2_point_list[1]+"\t"+ID+"\n"
  463. return True,"Double",outstr
  464. elif R1_primer_loc=="Not the Same" and R2_primer_loc == "Same":
  465. R1_mismatch_len=len(R1_mismatch_seq)
  466. R2_mismatch_len=len(R2_mismatch_seq)
  467. common_len=min(R1_mismatch_len,R2_mismatch_len)
  468. R1_mismatch_tmp=R1_mismatch_seq[-1*R1_mismatch_len:-1*common_len]
  469. R2_mismatch_tmp=R2_mismatch_seq[-1*R2_mismatch_len:-1*common_len]
  470. R2_match_seq=R2_seq[:-1*R2_mismatch_len]
  471. if R1_mismatch_tmp==R2_mismatch_tmp:
  472. R2_point=R2_read_info[2]+":"+R2_read_info[3]
  473. R2_point_list=get_pos_end(R2_map_info,R2_point)
  474. otustr=R2_point_list[0]+"\tDown\t"+R2_mismatch_seq+"\t"+R2_match_seq+"\t"+R2_point_list[1]+"\t"+ID+"\n"
  475. return True,"Single",otustr
  476. elif R1_primer_loc=="Same" and R2_primer_loc == "Same":
  477. return False,"",""
  478. elif R1_primer_loc=="Same" and R2_primer_loc == "Not the Same":
  479. R1_mismatch_len=len(R1_mismatch_seq)
  480. R2_mismatch_len=len(R2_mismatch_seq)
  481. common_len=min(R1_mismatch_len,R2_mismatch_len)
  482. R1_mismatch_tmp=R1_mismatch_seq[R1_mismatch_len-1*common_len:R1_mismatch_len]
  483. R2_mismatch_tmp=R2_mismatch_seq[R2_mismatch_len-1*common_len:R2_mismatch_len]
  484. R1_match_seq=R1_seq[R1_mismatch_len:]
  485. if R1_mismatch_tmp == R2_mismatch_tmp:
  486. R1_point=R1_read_info[2]+":"+R1_read_info[3]
  487. R1_point_list=get_pos_end(R1_map_info,R1_point)
  488. otustr=R1_point_list[0]+"\tUp\t"+R1_mismatch_seq+"\t"+R1_match_seq+"\t"+R1_point_list[1]+"\t"+ID+"\n"
  489. return True,"Single",otustr
  490. else:
  491. return False,"",""
  492. return False,"",""
  493. def main(insam,outdir):
  494. t1=time.time()
  495. #将sam内容按照ID区分,放到ID_info中
  496. ID_dict=read_sam(insam)
  497. #存放soft_clip的reads ID
  498. soft_clip_ID=[]
  499. #输出到本地hard clip fusion
  500. hard_clip_file=outdir+"/hard_clip_fusion_breakpoints.txt"
  501. hard_clip_out=open(hard_clip_file,'w')
  502. for ID in ID_dict.keys():
  503. reads_info = ID_dict[ID]
  504. clip_stat=check_clip(reads_info)
  505. if clip_stat == "soft_clip":
  506. soft_clip_ID.append(ID)
  507. elif clip_stat == "hard_clip":
  508. out_list=hard_clip_fusion(reads_info)
  509. if len(out_list) !=0:
  510. out_list.append(ID)
  511. hard_clip_out.write("\t".join(out_list)+"\n")
  512. hard_clip_out.close()
  513. #输出到本地single
  514. single_points=outdir+"/soft_clip_single_breakpoints.txt"
  515. SingleInput=open(single_points,'w')
  516. #输出到本地double
  517. double_points=outdir+"/soft_clip_double_breakpoints.txt"
  518. DoubleInput=open(double_points,'w')
  519. #
  520. for soft_ID in soft_clip_ID:
  521. reads_info = ID_dict[soft_ID]
  522. soft_clip_results=soft_clip_fusion(reads_info)
  523. if soft_clip_results[0]:
  524. if soft_clip_results[1]=="Double":
  525. DoubleInput.write(soft_clip_results[2])
  526. elif soft_clip_results[1]=="Single":
  527. SingleInput.write(soft_clip_results[2])
  528. SingleInput.close()
  529. DoubleInput.close()
  530. double_points_file=outdir+"/double_breakpoints.txt"
  531. os.system("cat "+hard_clip_file+" > "+double_points_file+" && cat "+double_points+" >> "+double_points_file)
  532. single_points_file=outdir+"/single_breakpoints.txt"
  533. os.system("cp "+single_points+" "+single_points_file)
  534. t2=time.time()
  535. print("Times: "+str(int(t2-t1))+"s")
  536. print("BreakPoints Search Done!")
  537. if __name__ == '__main__':
  538. parser = argparse.ArgumentParser(description='get reads mapping location')
  539. parser.add_argument('-i', required=True, type=str, help="insam")
  540. parser.add_argument('-o', required=True, type=str, help="outdir")
  541. args = parser.parse_args()
  542. main(args.i, args.o)