breakpoint_stat_v6.1.py 22 KB

#!/usr/bin/python
# -*- coding:utf-8 -*-
import argparse
import time

BASE_DICT={'A':'T','G':'C','T':'A','C':'G','N':'N','-':'-'}
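# A breakpoint "site" is encoded as "chr:pos-chr:pos", e.g. "chr1:12345-chr2:67890"
# (illustrative value, not taken from the input data). get_site_pos() splits such a
# string into (first_chr, first_pos, second_chr, second_pos); positions stay strings.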
def get_site_pos(site):
    f_chr=site.split("-")[0].split(':')[0]
    f_pos=site.split("-")[0].split(':')[1]
    e_chr=site.split("-")[1].split(':')[0]
    e_pos=site.split("-")[1].split(':')[1]
    return f_chr,f_pos,e_chr,e_pos
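# check_site() treats two sites as the same breakpoint when their first ends lie on the
# same chromosome within 5 bp; unless FLAG is set, the second ends must also match within
# 5 bp. The elif branch repeats the test with the two ends of select_site swapped, so
# "chrA:x-chrB:y" can also match "chrB:y'-chrA:x'". Illustrative example (not from data):
#     check_site("chr1:100-chr2:200", "chr2:203-chr1:98")  -> True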
def check_site(key_site,select_site,FLAG=False):
    key_site_info=get_site_pos(key_site)
    select_site_info=get_site_pos(select_site)
    if key_site_info[0] == select_site_info[0] and abs(int(key_site_info[1])-int(select_site_info[1])) <=5:
        if FLAG or (key_site_info[2] == select_site_info[2] and abs(int(key_site_info[3])-int(select_site_info[3])) <=5):
            return True
    elif key_site_info[0] == select_site_info[2] and abs(int(key_site_info[1])-int(select_site_info[3])) <=5:
        if FLAG or (key_site_info[2] == select_site_info[0] and abs(int(key_site_info[3])-int(select_site_info[1])) <=5):
            return True
    # first ends matched but second ends did not, or neither end matched
    return False
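# uniq_site_ID() counts how often each site string occurs, then merges sites that
# check_site() considers identical into the most frequent one. If the merged top site
# accounts for at least half of all supporting records, the collapsed list is returned;
# otherwise the full count-sorted list is kept.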
def uniq_site_ID(site_list):
    site_dict_num={}
    total_num = len(site_list)
    for item in site_list:
        if item not in site_dict_num.keys():
            site_dict_num[item]=1
        else:
            site_dict_num[item]+=1
    seq_site_sort= sorted(site_dict_num.items(), key=lambda d:d[1], reverse = True)
    site_num=len(seq_site_sort)
    key_site =seq_site_sort[0]
    site_list_new=[key_site]
    count = int(key_site[1])
    for index in range(1,site_num):
        select_site=seq_site_sort[index]
        if not check_site(key_site[0],select_site[0]):
            site_list_new.append(select_site)
        else:
            count += int(select_site[1])
    # if the merged top site does not account for at least half of the records, do not merge
    if count*2 >= total_num:
        return site_list_new
    else:
        return seq_site_sort
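# Seq_Overlap() compares two sequences position by position over their shared length and
# accepts them as overlapping when at most MISMATCH bases differ. Illustrative example
# (not from the data): Seq_Overlap("ACGTACGT", "ACGTACGA") -> True (1 mismatch <= 4).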
def Seq_Overlap(seq1,seq2,MISMATCH=4):
    seq_len=min(len(seq1),len(seq2))
    count =0
    for index in range(seq_len):
        if seq1[index] != seq2[index]:
            count +=1
            if count >MISMATCH:
                return False
    return True
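# IS_seq() tests whether select_seq (or its reverse complement) overlaps key_seq after
# trimming up to a few leading bases from either sequence, calling Seq_Overlap() on the
# aligned prefixes at each offset. The "if flag:" blocks are leftover debugging hooks;
# flag is always False here.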
def IS_seq(key_seq,select_seq):
    select_seq_rev_comp="".join([BASE_DICT[base] for base in select_seq][::-1])
    #flag = True if key_seq == "AGTGGTTTTCTACTCTAAATATAGTAACTAGTATCCTCCTGGCTGATCAGGGGGTGGGGA" and select_seq == "GTCTCTGAGTGGTTTTCTACTCTAAATATATTAAATAGTATCCTCCTGGCTGATCAGGGG" else False
    flag =False
    #select seq former
    for index in range(8):
        select_seq_temp=select_seq[index:]
        temp_len=len(select_seq_temp)
        key_seq_temp=key_seq[:temp_len]
        if flag:
            print("A")
            print(select_seq_temp)
            print(key_seq_temp)
            print(Seq_Overlap(select_seq_temp,key_seq_temp,8))
        if Seq_Overlap(select_seq_temp,key_seq_temp,8):
            return True
    #select seq latter
    for index in range(8):
        key_seq_temp=key_seq[index:]
        temp_len=len(key_seq_temp)
        select_seq_temp=select_seq[:temp_len]
        if flag:
            print("B")
            print(Seq_Overlap(select_seq_temp,key_seq_temp))
        if Seq_Overlap(select_seq_temp,key_seq_temp):
            return True
    #rev_comp select seq former
    for index in range(6):
        select_seq_rev_comp_temp=select_seq_rev_comp[index:]
        temp_len=len(select_seq_rev_comp_temp)
        key_seq_temp=key_seq[:temp_len]
        if flag:
            print("C")
            print(Seq_Overlap(select_seq_rev_comp_temp,key_seq_temp))
        if Seq_Overlap(select_seq_rev_comp_temp,key_seq_temp):
            return True
    #rev_comp select seq latter
    for index in range(5):
        key_seq_temp=key_seq[index:]
        temp_len=len(key_seq_temp)
        select_seq_rev_comp_temp=select_seq_rev_comp[:temp_len]
        if flag:
            print("D")
            print(Seq_Overlap(select_seq_rev_comp_temp,key_seq_temp))
        if Seq_Overlap(select_seq_rev_comp_temp,key_seq_temp):
            return True
    return False
def get_combine_seq(key_seq,check_index,seq_count_sort):
    total_seq_num=len(seq_count_sort)
    key_seq_list=[]
    for index in range(total_seq_num):
        if index not in check_index:
            select_seq=seq_count_sort[index][0]
            if IS_seq(key_seq,select_seq):
                key_seq_list.append(select_seq)
                check_index.append(index)
    return key_seq_list
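# stat_double_fusion() reads the double-breakpoint table (tab separated, with at least the
# columns Seq, ReadsID, Site, Site1_Down, Site2_Down and Overlap), groups reads whose
# fusion sequence (or its reverse complement) is identical, merges near-identical
# sequences with IS_seq(), and collapses the supporting sites with uniq_site_ID().
# Results are written into the readsID_dict and site_dict passed in by the caller.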
def stat_double_fusion(infile,readsID_dict,site_dict):
    seq_site_dict={}
    seq_readsID_dict={}
    seq_site_info={}
    with open(infile,'r') as inputF:
        header=inputF.readline().strip("\n").split("\t")
        seq_index=header.index("Seq")
        ReadsID_index=header.index("ReadsID")
        site_index=header.index("Site")
        Site1_Down_index=header.index("Site1_Down")
        Site2_Down_index=header.index("Site2_Down")
        Overlap_index=header.index("Overlap")
        for line in inputF:
            info = line.strip("\n").split("\t")
            seq = info[seq_index]
            ID = info[ReadsID_index]
            site = info[site_index]
            Site1_Down = info[Site1_Down_index]
            Site2_Down = info[Site2_Down_index]
            Overlap = info[Overlap_index]
            seq_rev_comp="".join([BASE_DICT[base] for base in seq][::-1])
            if seq == "":
                continue
            seq_site_info[site]=[Site1_Down,Site2_Down,Overlap]
            if seq not in seq_site_dict.keys() and seq_rev_comp not in seq_site_dict.keys():
                seq_site_dict[seq]=[site]
                seq_readsID_dict[seq]=[ID]
            elif seq_rev_comp in seq_site_dict.keys():
                site_temp=site.split("-")
                site=site_temp[1]+'-'+site_temp[0]
                seq_site_dict[seq_rev_comp].append(site)
                seq_readsID_dict[seq_rev_comp].append(ID)
            elif seq in seq_site_dict.keys():
                seq_site_dict[seq].append(site)
                seq_readsID_dict[seq].append(ID)
    #sort by count
    seq_count_dict={}
    for seq in seq_site_dict.keys():
        count = len(seq_site_dict[seq])
        seq_count_dict[seq]=count
    seq_count_sort= sorted(seq_count_dict.items(), key=lambda d:d[1], reverse = True)
    #combine seq
    total_seq_num=len(seq_count_sort)
    check_index=[]
    site_dict_temp={}
    for index in range(total_seq_num):
        if index not in check_index:
            key_seq=seq_count_sort[index][0]
            check_index.append(index)
            combine_seq_list=get_combine_seq(key_seq,check_index,seq_count_sort)
            site_dict_temp[key_seq]=seq_site_dict[key_seq]
            seq_ID_list=seq_readsID_dict[key_seq]
            for seq in combine_seq_list:
                seq_ID_list.extend(seq_readsID_dict[seq])
                site_list=seq_site_dict[seq]
                for site_item in site_list:
                    site_dict_temp[key_seq].append(site_item)
            readsID_dict[key_seq]=seq_ID_list
    #combine site
    for key_seq in site_dict_temp.keys():
        site_list=site_dict_temp[key_seq]
        temp = uniq_site_ID(site_list)
        site=temp[0][0]
        site_rev_temp=site.split("-")
        site_rev=site_rev_temp[1]+'-'+site_rev_temp[0]
        site_temp=seq_site_info[site] if site in seq_site_info else seq_site_info[site_rev]
        site_dict[key_seq]=[site]
        for item in site_temp:
            site_dict[key_seq].append(item)
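# check_single_fusion() takes a single-breakpoint record (site, Up/Down flag, unaligned
# seq, aligned seq, ..., readsID; column order inferred from how the fields are used) and
# tries to assign it to a fusion sequence already found from double-breakpoint reads: it
# builds a 20 bp junction around the breakpoint and compares it against the middle of each
# known fusion sequence with IS_seq() and check_site(). On a hit, the read ID is added to
# single_readsID_dict.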
def check_single_fusion(info,FusionSeq_readsID_dict,FusionSeq_site_dict,single_readsID_dict,single_site_dict):
    site = info[0]+"-"+info[0]
    if info[1] == "Down":
        seq = info[3][-10:]+info[2][:10]
    elif info[1] == "Up":
        seq = info[2][-10:]+info[3][:10]
    else:
        # neither Up nor Down: nothing to compare against
        return False
    for key_seq in FusionSeq_readsID_dict.keys():
        key_seq_len_half=int(len(key_seq)/2)
        key_seq_temp=key_seq[key_seq_len_half-10:key_seq_len_half+10]
        if IS_seq(key_seq_temp,seq) and check_site(site,FusionSeq_site_dict[key_seq][0],True):
            if key_seq not in single_readsID_dict.keys():
                single_readsID_dict[key_seq]=[info[-1]]
                single_site_dict[key_seq]=FusionSeq_site_dict[key_seq]
            else:
                single_readsID_dict[key_seq].append(info[-1])
            return True
    return False
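# fusion_seq_overlap() first tries a direct Seq_Overlap() of the two junction sequences,
# then slides them against each other by 1-5 bases in both directions. It returns
# (matched, shift), where shift is the offset (positive or negative) at which the two
# sequences line up; (False, 0) means no acceptable overlap was found.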
def fusion_seq_overlap(fusionseq1,fusionseq2):
    if Seq_Overlap(fusionseq1,fusionseq2):
        return True,0
    for i in range(1,6):
        fusionseq1_tmp=fusionseq1[i:]
        fusionseq2_tmp=fusionseq2[:-i]
        if Seq_Overlap(fusionseq1_tmp,fusionseq2_tmp):
            return True,i
    for i in range(1,6):
        fusionseq2_tmp=fusionseq2[i:]
        fusionseq1_tmp=fusionseq1[:-i]
        if Seq_Overlap(fusionseq1_tmp,fusionseq2_tmp):
            return True,-1*i
    return False,0
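# check_fusion_seq() stitches candidate junction sequences from a downstream record
# (down_list) and an upstream record (up_list), each holding an aligned part and an
# unaligned part, and checks whether they overlap consistently via fusion_seq_overlap().
# It returns (matched, shift, up_seq); junctions shorter than 25 bp are rejected.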
def check_fusion_seq(down_list,up_list):
    #up_list
    #['CCCAGGCTGGAGTGCAGTGGTATGATCTTGGCTCATGGCAATCTCTGCCTCCGGGGTTCAAGCAATTCTTATGCCTCAGCCT', 'TTGAGATGGAGTCTCACTTTGTTG']
    #down_list
    #['CCGGGTGCACACCATTCTCCTGCCTCAGCCTCCCCGGTAGGTAGCTGGGACTACAGGCGCCCACCCCCATGCCC', 'AGGAGTCTAG']
    up_mismatch_len=min(len(up_list[1]),len(down_list[0]))
    down_mismatch_len=min(len(down_list[1]),len(up_list[0]))
    if up_mismatch_len > 25:
        up_mismatch_len=min(up_mismatch_len-10,25)
    else:
        up_mismatch_len=10
    if down_mismatch_len > 25:
        down_mismatch_len=min(down_mismatch_len-10,25)
    else:
        down_mismatch_len=10
    up_seq=up_list[1][-up_mismatch_len:]+up_list[0][:down_mismatch_len]
    down_seq=down_list[0][-up_mismatch_len:]+down_list[1][:down_mismatch_len]
    if len(up_seq)>=25:
        fusion_seq_overlap_results=fusion_seq_overlap(up_seq,down_seq)
        mismatch=fusion_seq_overlap_results[1]
        if mismatch >0:
            down_seq=down_list[0][-up_mismatch_len-mismatch:]+down_list[1][:down_mismatch_len-mismatch]
            fusion_seq_overlap_results_tmp=fusion_seq_overlap(up_seq,down_seq)
            if fusion_seq_overlap_results_tmp[1] !=0:
                return False,0,""
        return [fusion_seq_overlap_results[0],fusion_seq_overlap_results[1],up_seq]
    else:
        return False,0,""
def check_adj_pos(p_fusion_seq,fusion_seq):
    for i in range(5):
        p_fusion_seq_tmp=p_fusion_seq[i:]
        tmp_len = len(p_fusion_seq_tmp)
        fusion_seq_tmp=fusion_seq[:tmp_len]
        if p_fusion_seq_tmp == fusion_seq_tmp:
            return True,i
    return False,0

def adj_pos(pos1,pos2,index):
    pos1_info=pos1.split(":")
    pos2_info=pos2.split(":")
    if index != 0:
        pos2_info[1]=str(int(pos2_info[1])+index)
    return ":".join(pos1_info)+"-"+":".join(pos2_info)
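# get_single_fusion() pairs "down" records with "up" records that share a 10 bp seed
# around the breakpoint (5 bases from each side of the junction), validates each candidate
# pair with check_fusion_seq() and check_adj_pos(), and stores confirmed junctions in
# known_point; up_strand ("+" or "-") decides in which direction a detected shift adjusts
# the reported position via adj_pos(). know_list records already-paired site combinations
# so they are not compared twice.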
def get_single_fusion(down_dict,up_dict,known_point,know_list,up_strand):
    fusion_seq_dict={}
    if up_strand == "+":
        index = 1
    elif up_strand == "-":
        index = -1
    for item in down_dict.keys():
        for i in range(5):
            match_seq=down_dict[item][0][-5-i:]
            mismatch_seq=down_dict[item][1][:5-i]
            fusion_seq=match_seq+mismatch_seq
            #0 base
            if fusion_seq not in fusion_seq_dict.keys():
                fusion_seq_dict[fusion_seq]=[item]
            else:
                fusion_seq_dict[fusion_seq].append(item)
    for item in up_dict.keys():
        match_seq=up_dict[item][0][:5]
        mismatch_seq=up_dict[item][1][-5:]
        p_fusion_seq=mismatch_seq+match_seq
        #total
        if p_fusion_seq in fusion_seq_dict.keys():
            for p_item in fusion_seq_dict[p_fusion_seq]:
                if item+"-"+p_item in know_list:
                    continue
                check_fusion_seq_results=check_fusion_seq(down_dict[p_item],up_dict[item])
                if check_fusion_seq_results[0]:
                    fusion_seq=down_dict[p_item][0][-5:]+down_dict[p_item][1][:5]
                    pos_results = check_adj_pos(p_fusion_seq,fusion_seq)
                    if pos_results[0]:
                        mismatch=index*int(pos_results[1])
                        known_point[adj_pos(p_item,item,mismatch)]=[item,p_item,check_fusion_seq_results[2],str(abs(mismatch))]
                        know_list.append(item+"-"+p_item)
                        know_list.append(p_item+"-"+item)
                        continue
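# count_base()/base_dict()/combine_dict()/get_common_seq() together build a per-position
# base-count profile over a list of sequences and call the most supported base at each
# position to form a consensus ("common") sequence; flag='End' aligns the sequences from
# their right ends instead of their left ends.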
def count_base(base):
    tmp_dict={'A':0,'T':0,'G':0,'C':0,'N':0}
    tmp_dict[base] +=1
    return tmp_dict

def base_dict(seq):
    tmp_dict={}
    index =0
    for base in seq:
        tmp_dict[index]=count_base(base)
        index +=1
    return tmp_dict

def combine_dict(BASE_COUNT_DICT,temp_dict):
    num1=len(BASE_COUNT_DICT)
    num2=len(temp_dict)
    num=min(num1,num2)
    for pos in range(num):
        total_tmp_dict=BASE_COUNT_DICT[pos]
        tmp_tmp_dict=temp_dict[pos]
        for base in tmp_tmp_dict.keys():
            total_tmp_dict[base] +=tmp_tmp_dict[base]
    if num2>num1:
        for pos in range(num,num2):
            BASE_COUNT_DICT[pos]=temp_dict[pos]

def get_common_seq(seq_list,flag='Front'):
    BASE_COUNT_DICT={0:{'A':0,'T':0,'G':0,'C':0,'N':0}}
    for seq in seq_list:
        if flag == 'End':
            seq=seq[::-1]
        index =0
        temp_dict=base_dict(seq)
        combine_dict(BASE_COUNT_DICT,temp_dict)
    common_seq=[]
    temp_count = len(seq_list)
    for index in BASE_COUNT_DICT.keys():
        temp_dict = BASE_COUNT_DICT[index]
        temp_base=''
        for base in temp_dict.keys():
            if temp_dict[base]>temp_count*0.5:
                temp_count=temp_dict[base]
                temp_base=base
        common_seq.append(temp_base)
    if flag =='End':
        return "".join(common_seq[::-1])
    return "".join(common_seq)
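# stat_single_fusion() processes the single-breakpoint table. Records that match an
# already known double-breakpoint fusion are absorbed via check_single_fusion(); the
# remaining records are grouped by site into Up/Down match- and mismatch-sequence lists,
# collapsed to consensus sequences with get_common_seq(), and paired into new candidate
# junctions with get_single_fusion(). The remaining records and their per-site summaries
# are also written to single_points_PrimersPre_readsInfo.txt and
# single_points_PrimersPre_site.txt in indir, presumably for a later primer-based step.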
def stat_single_fusion(single_points_file,FusionSeq_readsID_dict,FusionSeq_site_dict,single_readsID_dict,single_site_dict,indir):
    single_points_list=[]
    with open(single_points_file,'r') as spf:
        header = spf.readline()
        for line in spf:
            info = line.strip("\n").split("\t")
            if check_single_fusion(info,FusionSeq_readsID_dict,FusionSeq_site_dict,single_readsID_dict,single_site_dict):
                continue
            single_points_list.append(info)
    #single points
    # from read i, the alignment result gives gene A; from read j, the alignment result gives gene B
    # for each site, build the common seq of the match seqs and of the mismatch seqs
    up_match_dict={}
    up_mismatch_dict={}
    down_match_dict={}
    down_mismatch_dict={}
    for item in single_points_list:
        match_seq=item[3]
        mismatch_seq=item[2]
        if item[1] == "Up":
            if item[0] not in up_match_dict.keys():
                up_match_dict[item[0]]=[match_seq]
                up_mismatch_dict[item[0]]=[mismatch_seq]
            else:
                up_match_dict[item[0]].append(match_seq)
                up_mismatch_dict[item[0]].append(mismatch_seq)
        elif item[1] == "Down":
            if item[0] not in down_match_dict.keys():
                down_match_dict[item[0]]=[match_seq]
                down_mismatch_dict[item[0]]=[mismatch_seq]
            else:
                down_match_dict[item[0]].append(match_seq)
                down_mismatch_dict[item[0]].append(mismatch_seq)
    # build the common seq for every site
    up_dict={}
    up_rev_comp_dict={}
    down_dict={}
    down_rev_comp_dict={}
    for site in up_match_dict.keys():
        match_seq=get_common_seq(up_match_dict[site],'Front')
        mismatch_seq=get_common_seq(up_mismatch_dict[site],'End')
        up_dict[site]=[mismatch_seq,match_seq]
        match_seq_rev_comp="".join([BASE_DICT[base] for base in match_seq])[::-1]
        mismatch_seq_rev_comp="".join([BASE_DICT[base] for base in mismatch_seq])[::-1]
        up_rev_comp_dict[site]=[match_seq_rev_comp,mismatch_seq_rev_comp]
    for site in down_match_dict.keys():
        match_seq=get_common_seq(down_match_dict[site],'End')
        mismatch_seq=get_common_seq(down_mismatch_dict[site],'Front')
        down_dict[site]=[match_seq,mismatch_seq]
        match_seq_rev_comp="".join([BASE_DICT[base] for base in match_seq])[::-1]
        mismatch_seq_rev_comp="".join([BASE_DICT[base] for base in mismatch_seq])[::-1]
        down_rev_comp_dict[site]=[mismatch_seq_rev_comp,match_seq_rev_comp]
    known_point_dict={}
    know_list=[]  # avoid comparing the same up_dict/down_dict pair twice
    #up-down
    if len(up_dict)>0 and len(down_dict)>0:
        get_single_fusion(down_dict,up_dict,known_point_dict,know_list,"+")
    #up-up_rev_comp
    if len(up_dict)>0:
        get_single_fusion(up_rev_comp_dict,up_dict,known_point_dict,know_list,"+")
    #down-down_rev_comp
    if len(down_dict)>0:
        get_single_fusion(down_dict,down_rev_comp_dict,known_point_dict,know_list,"-")
    # move the junctions collected in known_point_dict into single_site_dict
    for site in known_point_dict.keys():
        seq=known_point_dict[site][2]
        if seq not in single_site_dict.keys():
            single_site_dict[seq]=[site]
    # record the candidate fusion site for every readsID; a later step looks up gene A from
    # the read alignment and gene B from the primer coordinates
    single_points_primerPre_file=indir+"/single_points_PrimersPre_readsInfo.txt"
    single_points_primerPre=open(single_points_primerPre_file,'w')
    single_points_PrimersPre_site={}
    for item in single_points_list:
        match_seq=item[3]
        mismatch_seq=item[2]
        select_site=item[0]+"-"+item[0]
        ID=item[-1]
        if item[1] == "Up":
            select_seq=mismatch_seq[-10:]+match_seq[:10]
        elif item[1] == "Down":
            select_seq=match_seq[-10:]+mismatch_seq[:10]
        else:
            continue
        for key_seq in single_site_dict.keys():
            key_site=single_site_dict[key_seq]
            if IS_seq(key_seq,select_seq) and check_site(select_site,key_site[0],True):
                if key_seq not in single_readsID_dict.keys():
                    single_readsID_dict[key_seq]=[ID]
                else:
                    single_readsID_dict[key_seq].append(ID)
                continue
        single_points_PrimersPre_site[item[0]]=[item[0],'Up',up_dict[item[0]][0],up_dict[item[0]][1],item[-2],ID] if item[1] == "Up" else [item[0],'Down',down_dict[item[0]][0],down_dict[item[0]][1],item[-2],ID]
        single_points_primerPre.write("\t".join(item)+"\n")
    single_points_primerPre.close()
    # record the fusion seq for every fusion site
    single_points_primerPre_site_file=indir+"/single_points_PrimersPre_site.txt"
    single_points_primerPre_site=open(single_points_primerPre_site_file,'w')
    for site in single_points_PrimersPre_site.keys():
        single_points_primerPre_site.write("\t".join(single_points_PrimersPre_site[site])+"\n")
    single_points_primerPre_site.close()
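# get_Uniq_UMIs() counts distinct UMIs among the supporting reads; the UMI is taken to be
# the third "_"-separated field of the read ID (IDs with fewer fields are ignored).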
def get_Uniq_UMIs(ID_list):
    UMIs_dict={}
    for ID in ID_list:
        temp=ID.split("_")
        if len(temp)>=3:
            UMI=ID.split("_")[2]
            UMIs_dict[UMI]=0
    return len(UMIs_dict)
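# out_fusion_stat_results() merges the double- and single-read evidence for each fusion
# sequence and writes two tab-separated files into indir:
#   fusion_breakpoints_stat.txt    - one line per fusion sequence with its breakpoint,
#                                    flanking info, read counts and distinct UMI count
#   fusion_breakpoints_readsID.txt - one line per supporting read, labelled Double or Single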
def out_fusion_stat_results(FusionSeq_readsID_dict,FusionSeq_site_dict,single_readsID_dict,single_site_dict,indir):
    #out fusionseq and stat double and single readsID
    Final_FusionSeq_dict={}
    for key_seq in FusionSeq_readsID_dict.keys():
        site_list = FusionSeq_site_dict[key_seq]
        ID_list=FusionSeq_readsID_dict[key_seq]
        doubleID=len(ID_list)
        singleID=0
        if key_seq in single_readsID_dict.keys():
            singleID=len(single_readsID_dict[key_seq])
            ID_list.extend(single_readsID_dict[key_seq])
        UMIs=get_Uniq_UMIs(ID_list)
        site_list.append(str(doubleID))
        site_list.append(str(singleID))
        site_list.append(str(UMIs))
        Final_FusionSeq_dict[key_seq]=site_list
    for key_seq in single_readsID_dict.keys():
        if key_seq not in Final_FusionSeq_dict.keys():
            site_list=single_site_dict[key_seq]
            doubleID=0
            singleID=len(single_readsID_dict[key_seq])
            ID_list=single_readsID_dict[key_seq]
            UMIs=get_Uniq_UMIs(ID_list)
            site_temp=site_list[0].split("-")
            site1 = site_temp[0]
            site2 = site_temp[1]
            site_list.append(site1)
            site_list.append(site2)
            site_list.append(str(0))
            site_list.append(str(doubleID))
            site_list.append(str(singleID))
            site_list.append(str(UMIs))
            Final_FusionSeq_dict[key_seq]=site_list
    fusion_points_file=indir+"/fusion_breakpoints_stat.txt"
    fusion_points=open(fusion_points_file,'w')
    fusion_points.write("\t".join(['Points','Point1_End','Point2_End','Overlap','DoubleReads','SingleReads','UMIkinds','FusionSeq'])+"\n")
    for key_seq in Final_FusionSeq_dict.keys():
        fusion_points.write("\t".join(Final_FusionSeq_dict[key_seq])+"\t"+key_seq+"\n")
    fusion_points.close()
    # output the fusion supported by each read ID
    fusion_points_readsID_file=indir+"/fusion_breakpoints_readsID.txt"
    fusion_points_readsID=open(fusion_points_readsID_file,'w')
    fusion_points_readsID.write("\t".join(['Points','FusionSeq','ReadsID','Type'])+"\n")
    for key_seq in FusionSeq_readsID_dict.keys():
        ID_list=FusionSeq_readsID_dict[key_seq]
        site=FusionSeq_site_dict[key_seq][0]
        for ID in ID_list:
            fusion_points_readsID.write("\t".join([site,key_seq,ID,"Double"])+"\n")
    for key_seq in single_readsID_dict.keys():
        ID_list=single_readsID_dict[key_seq]
        site=single_site_dict[key_seq][0]
        for ID in ID_list:
            fusion_points_readsID.write("\t".join([site,key_seq,ID,"Single"])+"\n")
    fusion_points_readsID.close()

def out_test(item_dict):
    for item in item_dict.keys():
        print("TEST")
        print(item)
        print(item_dict[item])
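# main() expects indir to contain double_breakpoints.txt and single_breakpoints.txt and
# writes the fusion_breakpoints_* result files back into the same directory.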
def main(indir):
    t1=time.time()
    FusionSeq_readsID_dict={}
    FusionSeq_site_dict={}
    single_readsID_dict={}
    single_site_dict={}
    #double fusion
    double_points_file=indir+"/double_breakpoints.txt"
    stat_double_fusion(double_points_file,FusionSeq_readsID_dict,FusionSeq_site_dict)
    #single fusion
    single_points_file=indir+"/single_breakpoints.txt"
    stat_single_fusion(single_points_file,FusionSeq_readsID_dict,FusionSeq_site_dict,single_readsID_dict,single_site_dict,indir)
    #out results
    out_fusion_stat_results(FusionSeq_readsID_dict,FusionSeq_site_dict,single_readsID_dict,single_site_dict,indir)
    t2=time.time()
    print("Time: "+str(int(t2-t1))+"s")
    print("BreakPoints Stat Done!")

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='stat fusion breakpoints from double/single breakpoint tables')
    parser.add_argument('-i', required=True, type=str, help="input directory containing double_breakpoints.txt and single_breakpoints.txt")
    args = parser.parse_args()
    main(args.i)