breakpoint_stat_v6.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583
  1. #!/usr/bin/python
  2. # -*- coding:utf-8 -*-
  3. import argparse,time
  4. BASE_DICT={'A':'T','G':'C','T':'A','C':'G','N':'N','-':'-'}
  5. def get_site_pos(site):
  6. f_chr=site.split("-")[0].split(':')[0]
  7. f_pos=site.split("-")[0].split(':')[1]
  8. e_chr=site.split("-")[1].split(':')[0]
  9. e_pos=site.split("-")[1].split(':')[1]
  10. return f_chr,f_pos,e_chr,e_pos
  11. def check_site(key_site,select_site):
  12. key_site_info=get_site_pos(key_site)
  13. select_site_info=get_site_pos(select_site)
  14. if key_site_info[0] == select_site_info[0] and abs(int(key_site_info[1])-int(select_site_info[1])) <=5:
  15. if key_site_info[2] == select_site_info[2] and abs(int(key_site_info[3])-int(select_site_info[3])) <=5:
  16. return True
  17. elif key_site_info[0] == select_site_info[2] and abs(int(key_site_info[1])-int(select_site_info[3])) <=5:
  18. if key_site_info[2] == select_site_info[0] and abs(int(key_site_info[3])-int(select_site_info[1])) <=5:
  19. return True
  20. #elif key_site_info[2] == select_site_info[0] and abs(int(key_site_info[3])-int(select_site_info[1])) <=5:
  21. # return True
  22. #elif key_site_info[2] == select_site_info[2] and abs(int(key_site_info[3])-int(select_site_info[3])) <=5:
  23. # return True
  24. else:
  25. return False
  26. def uniq_site_ID(site_list):
  27. site_dict_num={}
  28. total_num = len(site_list)
  29. for item in site_list:
  30. if item not in site_dict_num.keys():
  31. site_dict_num[item]=1
  32. else:
  33. site_dict_num[item]+=1
  34. seq_site_sort= sorted(site_dict_num.items(), key=lambda d:d[1], reverse = True)
  35. site_num=len(seq_site_sort)
  36. key_site =seq_site_sort[0]
  37. site_list_new=[key_site]
  38. count = int(key_site[1])
  39. for index in range(1,site_num):
  40. select_site=seq_site_sort[index]
  41. if not check_site(key_site[0],select_site[0]):
  42. site_list_new.append(select_site)
  43. else:
  44. count += int(select_site[1])
  45. if 'chr2:29223740-chr1:13397210' in site_list_new[0][0]:
  46. print("TEST")
  47. print(seq_site_sort[0])
  48. print(count)
  49. print(total_num)
  50. #不超过1半,则不合并
  51. if count*2 >= total_num:
  52. return site_list_new
  53. else:
  54. return seq_site_sort
  55. def Seq_Overlap(seq1,seq2,MISMATCH=4):
  56. seq_len=min(len(seq1),len(seq2))
  57. count =0
  58. for index in range(seq_len):
  59. if seq1[index] != seq2[index]:
  60. count +=1
  61. if count >MISMATCH:
  62. return False
  63. return True
  64. def IS_seq(key_seq,select_seq):
  65. select_seq_rev_comp="".join([BASE_DICT[base] for base in select_seq][::-1])
  66. #select seq former
  67. for index in range(5):
  68. select_seq_temp=select_seq[index:]
  69. temp_len=len(select_seq_temp)
  70. key_seq_temp=key_seq[:temp_len]
  71. if Seq_Overlap(select_seq_temp,key_seq_temp):
  72. return True
  73. #select seq latter
  74. for index in range(5):
  75. key_seq_temp=key_seq[index:]
  76. temp_len=len(key_seq_temp)
  77. select_seq_temp=select_seq[:temp_len]
  78. if Seq_Overlap(select_seq_temp,key_seq_temp):
  79. return True
  80. #rev_comp select seq former
  81. for index in range(5):
  82. select_seq_rev_comp_temp=select_seq_rev_comp[index:]
  83. temp_len=len(select_seq_rev_comp_temp)
  84. key_seq_temp=key_seq[:temp_len]
  85. if Seq_Overlap(select_seq_rev_comp_temp,key_seq_temp):
  86. return True
  87. #rev_comp select seq latter
  88. for index in range(5):
  89. key_seq_temp=key_seq[index:]
  90. temp_len=len(key_seq_temp)
  91. select_seq_rev_comp_temp=select_seq_rev_comp[:temp_len]
  92. if Seq_Overlap(select_seq_rev_comp_temp,key_seq_temp):
  93. return True
  94. return False
  95. def get_combine_seq(key_seq,check_index,seq_count_sort):
  96. total_seq_num=len(seq_count_sort)
  97. key_seq_list=[]
  98. for index in range(total_seq_num):
  99. if index not in check_index:
  100. select_seq=seq_count_sort[index][0]
  101. if IS_seq(key_seq,select_seq):
  102. key_seq_list.append(select_seq)
  103. check_index.append(index)
  104. return key_seq_list
  105. def stat_double_fusion(infile,readsID_dict,site_dict):
  106. seq_site_dict={}
  107. seq_readsID_dict={}
  108. seq_site_info={}
  109. with open(infile,'r') as inputF:
  110. for line in inputF:
  111. info = line.strip("\n").split("\t")
  112. seq = info[3]
  113. ID = info[7]
  114. site = info[0]
  115. seq_rev_comp="".join([BASE_DICT[base] for base in seq][::-1])
  116. if seq == "":
  117. continue
  118. seq_site_info[site]=[info[5],info[6]]
  119. if seq not in seq_site_dict.keys() and seq_rev_comp not in seq_site_dict.keys():
  120. seq_site_dict[seq]=[site]
  121. seq_readsID_dict[seq]=[ID]
  122. elif seq_rev_comp in seq_site_dict.keys():
  123. site_temp=site.split("-")
  124. site=site_temp[1]+'-'+site_temp[0]
  125. seq_site_dict[seq_rev_comp].append(site)
  126. seq_readsID_dict[seq_rev_comp].append(ID)
  127. elif seq in seq_site_dict.keys():
  128. seq_site_dict[seq].append(site)
  129. seq_readsID_dict[seq].append(ID)
  130. #sort by count
  131. seq_count_dict={}
  132. for seq in seq_site_dict.keys():
  133. count = len(seq_site_dict[seq])
  134. seq_count_dict[seq]=count
  135. seq_count_sort= sorted(seq_count_dict.items(), key=lambda d:d[1], reverse = True)
  136. seq_site_detail={}
  137. #combine site
  138. for item in seq_count_sort:
  139. seq=item[0]
  140. #print(seq_site_dict[seq])
  141. seq_site_dict[seq]=uniq_site_ID(seq_site_dict[seq])
  142. #print(seq_site_dict[seq])
  143. site=seq_site_dict[seq][0][0]
  144. site_rev_temp=site.split("-")
  145. site_rev=site_rev_temp[1]+'-'+site_rev_temp[0]
  146. seq_site_detail[seq]=seq_site_info[site] if site in seq_site_info else seq_site_info[site_rev]
  147. #combine seq
  148. total_seq_num=len(seq_count_sort)
  149. check_index=[]
  150. for i_key in seq_count_sort:
  151. print(i_key)
  152. for index in range(total_seq_num):
  153. if index not in check_index:
  154. key_seq=seq_count_sort[index][0]
  155. check_index.append(index)
  156. combine_seq_list=get_combine_seq(key_seq,check_index,seq_count_sort)
  157. print(key_seq)
  158. site_dict[key_seq]=[seq_site_dict[key_seq][0][0],seq_site_detail[key_seq][0],seq_site_detail[key_seq][1]]
  159. seq_ID_list=seq_readsID_dict[key_seq]
  160. for seq in combine_seq_list:
  161. seq_ID_list.extend(seq_readsID_dict[seq])
  162. readsID_dict[key_seq]=seq_ID_list
  163. #out_test(site_dict)
  164. def check_single_fusion(info,FusionSeq_readsID_dict,FusionSeq_site_dict,single_readsID_dict,single_site_dict):
  165. site = info[0]+"-"+info[0]
  166. if info[1] == "Down":
  167. seq = info[3][-10:]+info[2][:10]
  168. elif info[1] == "Up":
  169. seq = info[2][-10:]+info[3][:10]
  170. for key_seq in FusionSeq_readsID_dict.keys():
  171. key_seq_len_half=int(len(key_seq)/2)
  172. key_seq_temp=key_seq[key_seq_len_half-10:key_seq_len_half+10]
  173. if IS_seq(key_seq_temp,seq) and check_site(site,FusionSeq_site_dict[key_seq][0]):
  174. if key_seq not in single_readsID_dict.keys():
  175. single_readsID_dict[key_seq]=[info[-1]]
  176. single_site_dict[key_seq]=FusionSeq_site_dict[key_seq]
  177. else:
  178. single_readsID_dict[key_seq].append(info[-1])
  179. return True
  180. return False
  181. def fusion_seq_overlap(fusionseq1,fusionseq2):
  182. if Seq_Overlap(fusionseq1,fusionseq2):
  183. return True,0
  184. for i in range(1,6):
  185. fusionseq1_tmp=fusionseq1[i:]
  186. fusionseq2_tmp=fusionseq2[:-i]
  187. if Seq_Overlap(fusionseq1_tmp,fusionseq2_tmp):
  188. return True,i
  189. for i in range(1,6):
  190. fusionseq2_tmp=fusionseq2[i:]
  191. fusionseq1_tmp=fusionseq1[:-i]
  192. if Seq_Overlap(fusionseq1_tmp,fusionseq2_tmp):
  193. return True,-1*i
  194. return False,0
  195. def check_fusion_seq(down_list,up_list):
  196. #up_list
  197. #['CCCAGGCTGGAGTGCAGTGGTATGATCTTGGCTCATGGCAATCTCTGCCTCCGGGGTTCAAGCAATTCTTATGCCTCAGCCT', 'TTGAGATGGAGTCTCACTTTGTTG']
  198. #down_list
  199. #['CCGGGTGCACACCATTCTCCTGCCTCAGCCTCCCCGGTAGGTAGCTGGGACTACAGGCGCCCACCCCCATGCCC', 'AGGAGTCTAG']
  200. up_mismatch_len=min(len(up_list[1]),len(down_list[0]))
  201. down_mismatch_len=min(len(down_list[1]),len(up_list[0]))
  202. if up_mismatch_len > 25:
  203. up_mismatch_len=min(up_mismatch_len-10,25)
  204. else:
  205. up_mismatch_len=10
  206. if down_mismatch_len > 25:
  207. down_mismatch_len=min(down_mismatch_len-10,25)
  208. else:
  209. down_mismatch_len=10
  210. up_seq=up_list[1][-up_mismatch_len:]+up_list[0][:down_mismatch_len]
  211. down_seq=down_list[0][-up_mismatch_len:]+down_list[1][:down_mismatch_len]
  212. if len(up_seq)>=25:
  213. fusion_seq_overlap_results=fusion_seq_overlap(up_seq,down_seq)
  214. mismatch=fusion_seq_overlap_results[1]
  215. if mismatch >0:
  216. down_seq=down_list[0][-up_mismatch_len-mismatch:]+down_list[1][:down_mismatch_len-mismatch]
  217. fusion_seq_overlap_results_tmp=fusion_seq_overlap(up_seq,down_seq)
  218. if fusion_seq_overlap_results_tmp[1] !=0:
  219. return False,0,""
  220. return [fusion_seq_overlap_results[0],fusion_seq_overlap_results[1],up_seq]
  221. else:
  222. return False,0,""
  223. def check_adj_pos(p_fusion_seq,fusion_seq):
  224. for i in range(5):
  225. p_fusion_seq_tmp=p_fusion_seq[i:]
  226. tmp_len = len(p_fusion_seq_tmp)
  227. fusion_seq_tmp=fusion_seq[:tmp_len]
  228. if p_fusion_seq_tmp == fusion_seq_tmp:
  229. return True,i
  230. return False,0
  231. def adj_pos(pos1,pos2,index):
  232. pos1_info=pos1.split(":")
  233. pos2_info=pos2.split(":")
  234. if index != 0:
  235. pos2_info[1]=str(int(pos2_info[1])+index)
  236. return ":".join(pos1_info)+"-"+":".join(pos2_info)
  237. def get_single_fusion(down_dict,up_dict,known_point,know_list,up_strand):
  238. fusion_seq_dict={}
  239. if up_strand == "+":
  240. index = 1
  241. elif up_strand == "-":
  242. index = -1
  243. for item in down_dict.keys():
  244. for i in range(5):
  245. match_seq=down_dict[item][0][-5-i:]
  246. mismatch_seq=down_dict[item][1][:5-i]
  247. fusion_seq=match_seq+mismatch_seq
  248. #0 base
  249. if fusion_seq not in fusion_seq_dict.keys():
  250. fusion_seq_dict[fusion_seq]=[item]
  251. else:
  252. fusion_seq_dict[fusion_seq].append(item)
  253. for item in up_dict.keys():
  254. match_seq=up_dict[item][0][:5]
  255. mismatch_seq=up_dict[item][1][-5:]
  256. p_fusion_seq=mismatch_seq+match_seq
  257. #total
  258. if p_fusion_seq in fusion_seq_dict.keys():
  259. for p_item in fusion_seq_dict[p_fusion_seq]:
  260. if item+"-"+p_item in know_list:
  261. continue
  262. check_fusion_seq_results=check_fusion_seq(down_dict[p_item],up_dict[item])
  263. if check_fusion_seq_results[0]:
  264. fusion_seq=down_dict[p_item][0][-5:]+down_dict[p_item][1][:5]
  265. pos_results = check_adj_pos(p_fusion_seq,fusion_seq)
  266. if pos_results[0]:
  267. mismatch=index*int(pos_results[1])
  268. known_point[adj_pos(p_item,item,mismatch)]=[item,p_item,check_fusion_seq_results[2],str(abs(mismatch))]
  269. know_list.append(item+"-"+p_item)
  270. know_list.append(p_item+"-"+item)
  271. continue
  272. def count_base(base):
  273. tmp_dict={'A':0,'T':0,'G':0,'C':0,'N':0}
  274. tmp_dict[base] +=1
  275. return tmp_dict
  276. def base_dict(seq):
  277. tmp_dict={}
  278. index =0
  279. for base in seq:
  280. tmp_dict[index]=count_base(base)
  281. index +=1
  282. return tmp_dict
  283. def combine_dict(BASE_COUNT_DICT,temp_dict):
  284. num1=len(BASE_COUNT_DICT)
  285. num2=len(temp_dict)
  286. num=min(num1,num2)
  287. for pos in range(num):
  288. total_tmp_dict=BASE_COUNT_DICT[pos]
  289. tmp_tmp_dict=temp_dict[pos]
  290. for base in tmp_tmp_dict.keys():
  291. total_tmp_dict[base] +=tmp_tmp_dict[base]
  292. if num2>num1:
  293. for pos in range(num,num2):
  294. BASE_COUNT_DICT[pos]=temp_dict[pos]
  295. def get_common_seq(seq_list,flag='Front'):
  296. BASE_COUNT_DICT={0:{'A':0,'T':0,'G':0,'C':0,'N':0}}
  297. for seq in seq_list:
  298. if flag == 'End':
  299. seq=seq[::-1]
  300. index =0
  301. temp_dict=base_dict(seq)
  302. combine_dict(BASE_COUNT_DICT,temp_dict)
  303. common_seq=[]
  304. for index in BASE_COUNT_DICT.keys():
  305. temp_dict = BASE_COUNT_DICT[index]
  306. temp_count=0
  307. temp_base=''
  308. for base in temp_dict.keys():
  309. if temp_dict[base]>temp_count:
  310. temp_count=temp_dict[base]
  311. temp_base=base
  312. common_seq.append(temp_base)
  313. if flag =='End':
  314. return "".join(common_seq[::-1])
  315. return "".join(common_seq)
  316. def stat_single_fusion(single_points_file,FusionSeq_readsID_dict,FusionSeq_site_dict,single_readsID_dict,single_site_dict,indir):
  317. single_points_list=[]
  318. with open(single_points_file,'r') as spf:
  319. for line in spf:
  320. info = line.strip("\n").split("\t")
  321. if check_single_fusion(info,FusionSeq_readsID_dict,FusionSeq_site_dict,single_readsID_dict,single_site_dict):
  322. continue
  323. single_points_list.append(info)
  324. #single points
  325. #从reads i 根据比对结果找基因A,readss j上根据比对结果找基因B
  326. #同一个site,获取match seq和mismatch seq的commonc seq
  327. up_match_dict={}
  328. up_mismatch_dict={}
  329. down_match_dict={}
  330. down_mismatch_dict={}
  331. for item in single_points_list:
  332. match_seq=item[3]
  333. mismatch_seq=item[2]
  334. if item[1] == "Up":
  335. if item[0] not in up_match_dict.keys():
  336. up_match_dict[item[0]]=[match_seq]
  337. up_mismatch_dict[item[0]]=[mismatch_seq]
  338. else:
  339. up_match_dict[item[0]].append(match_seq)
  340. up_mismatch_dict[item[0]].append(mismatch_seq)
  341. elif item[1] == "Down":
  342. if item[0] not in down_match_dict.keys():
  343. down_match_dict[item[0]]=[match_seq]
  344. down_mismatch_dict[item[0]]=[mismatch_seq]
  345. else:
  346. down_match_dict[item[0]].append(match_seq)
  347. down_mismatch_dict[item[0]].append(mismatch_seq)
  348. #获取common seq
  349. up_dict={}
  350. up_rev_comp_dict={}
  351. down_dict={}
  352. down_rev_comp_dict={}
  353. for site in up_match_dict.keys():
  354. match_seq=get_common_seq(up_match_dict[site],'Front')
  355. mismatch_seq=get_common_seq(up_mismatch_dict[site],'End')
  356. up_dict[site]=[mismatch_seq,match_seq]
  357. match_seq_rev_comp="".join([BASE_DICT[base] for base in match_seq])[::-1]
  358. mismatch_seq_rev_comp="".join([BASE_DICT[base] for base in mismatch_seq])[::-1]
  359. up_rev_comp_dict[site]=[match_seq_rev_comp,mismatch_seq_rev_comp]
  360. for site in down_match_dict.keys():
  361. match_seq=get_common_seq(down_match_dict[site],'End')
  362. mismatch_seq=get_common_seq(down_mismatch_dict[site],'Front')
  363. down_dict[site]=[match_seq,mismatch_seq]
  364. match_seq_rev_comp="".join([BASE_DICT[base] for base in match_seq])[::-1]
  365. mismatch_seq_rev_comp="".join([BASE_DICT[base] for base in mismatch_seq])[::-1]
  366. down_rev_comp_dict[site]=[mismatch_seq_rev_comp,match_seq_rev_comp]
  367. known_point_dict={}
  368. know_list=[]#避免重复比较up_dict和down_dict
  369. #up-down
  370. if len(up_dict)>0 and len(down_dict)>0:
  371. get_single_fusion(down_dict,up_dict,known_point_dict,know_list,"+")
  372. #up-up_rev_comp
  373. if len(up_dict)>0:
  374. get_single_fusion(up_rev_comp_dict,up_dict,known_point_dict,know_list,"+")
  375. #down-down_rev_comp
  376. if len(down_dict)>0:
  377. get_single_fusion(down_dict,down_rev_comp_dict,known_point_dict,know_list,"-")
  378. #将known_point_dict中内容放到single_site_dict中
  379. for site in known_point_dict.keys():
  380. seq=known_point_dict[site][2]
  381. if seq not in single_site_dict.keys():
  382. single_site_dict[seq]=site
  383. #记录每个readsID对应可能的fusion site,之后从reads i上根据比对结果找基因A,primer坐标上找基因B
  384. single_points_primerPre_file=indir+"/single_points_PrimersPre_readsInfo.txt"
  385. single_points_primerPre=open(single_points_primerPre_file,'w')
  386. single_points_PrimersPre_site={}
  387. for item in single_points_list:
  388. match_seq=item[3]
  389. mismatch_seq=item[2]
  390. select_site=item[0]+"-"+item[0]
  391. ID=item[-1]
  392. if item[1] == "Up":
  393. select_seq=mismatch_seq[-10:]+match_seq[:10]
  394. elif item[1] == "Down":
  395. select_seq=match_seq[-10:]+mismatch_seq[:10]
  396. else:
  397. continue
  398. for key_seq in single_site_dict.keys():
  399. key_site=single_site_dict[key_seq]
  400. if IS_seq(key_seq,select_seq) and check_site(select_site,key_site[0]):
  401. if key_seq not in single_readsID_dict.keys():
  402. single_readsID_dict[key_seq]=[ID]
  403. else:
  404. single_readsID_dict[key_seq].append(ID)
  405. continue
  406. single_points_PrimersPre_site[item[0]]=[item[0],'Up',up_dict[item[0]][0],up_dict[item[0]][1],item[-2],ID] if item[1] == "Up" else [item[0],'Down',down_dict[item[0]][0],down_dict[item[0]][1],item[-2],ID]
  407. single_points_primerPre.write("\t".join(item)+"\n")
  408. single_points_primerPre.close()
  409. #记录每个fusion site对应的fusion seq
  410. single_points_primerPre_site_file=indir+"/single_points_PrimersPre_site.txt"
  411. ingle_points_primerPre_site=open(single_points_primerPre_site_file,'w')
  412. for site in single_points_PrimersPre_site.keys():
  413. ingle_points_primerPre_site.write("\t".join(single_points_PrimersPre_site[site])+"\n")
  414. ingle_points_primerPre_site.close()
  415. def get_Uniq_UMIs(ID_list):
  416. UMIs_dict={}
  417. for ID in ID_list:
  418. temp=ID.split("_")
  419. if len(temp)>=3:
  420. UMI=ID.split("_")[2]
  421. UMIs_dict[UMI]=0
  422. return len(UMIs_dict)
  423. def out_fusion_stat_results(FusionSeq_readsID_dict,FusionSeq_site_dict,single_readsID_dict,single_site_dict,indir):
  424. #out fusionseq and stat double and single readsID
  425. Final_FusionSeq_dict={}
  426. for key_seq in FusionSeq_readsID_dict.keys():
  427. site_list = FusionSeq_site_dict[key_seq]
  428. ID_list=FusionSeq_readsID_dict[key_seq]
  429. doubleID=len(ID_list)
  430. singleID=0
  431. if key_seq in single_readsID_dict.keys():
  432. singleID=len(single_readsID_dict[key_seq])
  433. ID_list.extend(single_readsID_dict[key_seq])
  434. UMIs=get_Uniq_UMIs(ID_list)
  435. site_list.append(str(doubleID))
  436. site_list.append(str(singleID))
  437. site_list.append(str(UMIs))
  438. Final_FusionSeq_dict[key_seq]=site_list
  439. for key_seq in single_readsID_dict.keys():
  440. if key_seq not in Final_FusionSeq_dict.keys():
  441. site_dict=single_site_dict[key_seq]
  442. doubleID=0
  443. singleID=len(single_readsID_dict[key_seq])
  444. ID_list=single_readsID_dict[key_seq]
  445. UMIs=get_Uniq_UMIs(ID_list)
  446. site_list.append(str(doubleID))
  447. site_list.append(str(singleID))
  448. site_list.append(str(UMIs))
  449. Final_FusionSeq_dict[key_seq]=site_dict
  450. fusion_points_file=indir+"/fusion_breakpoints_stat.txt"
  451. fusion_points=open(fusion_points_file,'w')
  452. fusion_points.write("\t".join(['Points','Point1_End','Point2_End','DoubleReads','SingleReads','UMIkinds'])+"\n")
  453. for key_seq in Final_FusionSeq_dict.keys():
  454. fusion_points.write("\t".join(Final_FusionSeq_dict[key_seq])+"\n")
  455. fusion_points.close()
  456. #输出每个ID对应fusion
  457. fusion_points_readsID_file=indir+"/fusion_breakpoints_readsID.txt"
  458. fusion_points_readsID=open(fusion_points_readsID_file,'w')
  459. fusion_points_readsID.write("\t".join(['Points','FusionSeq','ReadsID'])+"\n")
  460. for key_seq in FusionSeq_readsID_dict.keys():
  461. ID_list=FusionSeq_readsID_dict[key_seq]
  462. site=FusionSeq_site_dict[key_seq][0]
  463. for ID in ID_list:
  464. fusion_points_readsID.write("\t".join([site,key_seq,ID,"Double"])+"\n")
  465. for key_seq in single_readsID_dict.keys():
  466. ID_list=single_readsID_dict[key_seq]
  467. site=single_site_dict[key_seq][0]
  468. for ID in ID_list:
  469. fusion_points_readsID.write("\t".join([site,key_seq,ID,"Single"])+"\n")
  470. fusion_points_readsID.close()
  471. def out_test(item_dict):
  472. for item in item_dict.keys():
  473. print("TEST")
  474. print(item)
  475. print(item_dict[item])
  476. def main(indir):
  477. t1=time.time()
  478. FusionSeq_readsID_dict={}
  479. FusionSeq_site_dict={}
  480. single_readsID_dict={}
  481. single_site_dict={}
  482. #double fusion
  483. double_points_file=indir+"/double_breakpoints.txt"
  484. stat_double_fusion(double_points_file,FusionSeq_readsID_dict,FusionSeq_site_dict)
  485. #single fusion
  486. single_points_file=indir+"/single_breakpoints.txt"
  487. stat_single_fusion(single_points_file,FusionSeq_readsID_dict,FusionSeq_site_dict,single_readsID_dict,single_site_dict,indir)
  488. #out results
  489. out_fusion_stat_results(FusionSeq_readsID_dict,FusionSeq_site_dict,single_readsID_dict,single_site_dict,indir)
  490. t2=time.time()
  491. print("Times: "+str(int(t2-t1))+"s")
  492. print("BreakPoints Stat Done!")
  493. if __name__ == '__main__':
  494. parser = argparse.ArgumentParser(description='get reads mapping location')
  495. parser.add_argument('-i', required=True, type=str, help="indir")
  496. args = parser.parse_args()
  497. main(args.i)