breakpoint_stat_single_v6.3.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. #!/usr/bin/python
  2. # -*- coding:utf-8 -*-
  3. import time,argparse
  4. from StatReadsFusion import SingleReadsFusion as SRF
  5. def GetHomo(homo_file,homo_dict):
  6. with open(homo_file,'r') as hf:
  7. for line in hf:
  8. info = line.strip("\n").split("\t")
  9. primer = info[0]
  10. info_list = info[1].split(",")
  11. homo_dict[primer]=info_list
  12. def GetInfo(info_file,info_dict,primer_seq_dict):
  13. with open(info_file,'r') as inf:
  14. for line in inf:
  15. info = line.strip("\n").split("\t")
  16. primer = info[0]
  17. info_dict[primer]=info
  18. seq = info[1]
  19. if seq not in primer_seq_dict.keys():
  20. primer_seq_dict[seq]=[info[0]]
  21. else:
  22. primer_seq_dict[seq].append(info[0])
  23. def GetUniqUMIPrimer(ID_list,readsPrimer_dict):
  24. UMIs_dict={}
  25. Primer_dict={}
  26. for ID in ID_list:
  27. temp=ID.split("_")
  28. if len(temp)>=5:
  29. UMI_tmp=ID.split("_")
  30. UMI = UMI_tmp[2]
  31. ReadsID = UMI_tmp[0]
  32. primer = readsPrimer_dict[ReadsID]
  33. UMIs_dict[UMI]=0
  34. Primer_dict[primer]=0
  35. return len(UMIs_dict),len(Primer_dict)
def main(indir, readsIDFile, homo_file, info_file):
    """Merge single-breakpoint evidence into double-fusion statistics.

    Reads fusion_stat.txt / fusion_readsID.txt / single_breakpoint.txt from
    `indir`, clusters single-breakpoint reads via SingleReadsFusion (SRF),
    attaches supporting single reads to matching double-fusion sites, and
    writes fusion_stat_final.txt, fusion_readsID_final.txt and
    single_stat.txt back into `indir`.
    """
    t1 = time.time()
    homo_dict = {}
    info_dict = {}
    primer_seq_dict = {}
    # double fusion: site -> full stat row (tab-split, header kept verbatim)
    fusion_file = indir + "/fusion_stat.txt"
    fusion_dict = {}
    fusion_header = ""
    with open(fusion_file, 'r') as dff:
        fusion_header = dff.readline()
        for line in dff:
            info = line.strip("\n").split("\t")
            site = info[0]
            fusion_dict[site] = info
    # double fusion readsID: site -> list of supporting read IDs
    fusion_ID_file = indir + "/fusion_readsID.txt"
    fusion_ID_dict = {}
    fusion_ID_info_list = []
    ID_header = ""
    with open(fusion_ID_file, 'r') as dff:
        ID_header = dff.readline()
        for line in dff:
            info = line.strip("\n").split("\t")
            site = info[0]
            readsID = info[1]
            if site not in fusion_ID_dict.keys():
                fusion_ID_dict[site] = [readsID]
            else:
                fusion_ID_dict[site].append(readsID)
            fusion_ID_info_list.append(info)
    GetHomo(homo_file, homo_dict)
    GetInfo(info_file, info_dict, primer_seq_dict)
    single_points_file = indir + "/single_breakpoint.txt"
    single_site_info_dict = {}   # site -> list of full rows
    single_site_ID_dict = {}     # site -> list of read IDs
    single_site_total_list = []  # every row, in file order
    readsPrimer_dict = {}        # read ID -> primer (filled by StatPrimer)
    srf = SRF(homo_dict, info_dict, primer_seq_dict)
    srf.StatPrimer(readsIDFile, readsPrimer_dict)
    with open(single_points_file, 'r') as spf:
        # Column positions are taken from the header, not hard-coded.
        header = spf.readline().strip("\n").split("\t")
        Reads_index = header.index("ReadsID")
        Site_index = header.index("Site")
        for line in spf:
            info = line.strip("\n").split("\t")
            site = info[Site_index]
            ID = info[Reads_index]
            single_site_total_list.append(info)
            if site not in single_site_info_dict.keys():
                single_site_info_dict[site] = [info]
                single_site_ID_dict[site] = [ID]
            else:
                single_site_info_dict[site].append(info)
                single_site_ID_dict[site].append(ID)
    single_site_cluster_dict = {}
    single_double_site_dict = {}  # double-fusion site -> matching single sites
    single_select_list = []       # single sites already matched to a fusion
    # combine sequences per single site; match clusters against known fusions
    for site in single_site_info_dict.keys():
        results = srf.GetSingleCombine(single_site_info_dict[site])
        # Skip clusters whose combined sequence is shorter than 10 bases.
        if len(results[0]) < 10:
            continue
        single_site_cluster_dict[site] = list(results)
        for double_site in fusion_dict.keys():
            if srf.CheckSite(site, double_site):
                double_temp_list = fusion_dict[double_site]
                # last column of the fusion row holds the fusion sequence
                fusion_seq = double_temp_list[-1]
                if srf.CheckFusionSeq(results[0] + results[1], fusion_seq):
                    single_select_list.append(site)
                    if double_site not in single_double_site_dict.keys():
                        single_double_site_dict[double_site] = [site]
                    else:
                        single_double_site_dict[double_site].append(site)
    # add matched single clusters' read support into fusion_dict
    for ds in single_double_site_dict.keys():
        site_list = single_double_site_dict[ds]
        temp_ID = []
        for site in site_list:
            ID_list = single_site_ID_dict[site]
            temp_ID.extend(ID_list)
            for ID_temp in ID_list:
                fusion_ID_info_list.append([ds, ID_temp, 'Single'])
        singleNum = len(temp_ID)
        info = fusion_dict[ds]
        # Columns 5/6/7 of the fusion row: single-read count, UMI count,
        # primer-pair count — recomputed over single + double read IDs.
        info[5] = str(singleNum)
        temp_ID.extend(fusion_ID_dict[ds])
        results = GetUniqUMIPrimer(temp_ID, readsPrimer_dict)
        UMIs = results[0]
        primerNum = results[1]
        info[6] = str(UMIs)
        info[7] = str(primerNum)
        fusion_dict[ds] = info
    # single sites not matched above: look for breakpoints on the same strand
    # and synthesize new fusion rows from them
    site_ops = []
    for item in single_site_total_list:
        site = item[0]
        if site in single_select_list:
            continue
        mismatch_seq = item[1]
        match_seq = item[2]
        readsID = item[4]
        results = srf.GetSinglePoint(readsID, mismatch_seq, match_seq)
        if results[0] and len(results[1]) > 0:
            for item_list in results[1]:
                site_ops.append(site)
                # item_list example:
                # ['GGCATGGTGGTGGGCGTCTGTAGTCCCAGCTACTCAGGAGGCTGAGG', 'chr10:59897395', 'chr10:59897425']
                fusion_seq = item_list[0]
                site1 = item_list[1]
                site1_e = item_list[2]
                site2 = site
                site2_e = item[3]
                singleNum = len(single_site_ID_dict[site])
                results_NUM = GetUniqUMIPrimer(single_site_ID_dict[site], readsPrimer_dict)
                UMIs = results_NUM[0]
                primerNum = results_NUM[1]
                # New row in the same column layout as fusion_stat.txt;
                # double-read / double-UMI columns are zeroed.
                out_str = [site1 + "-" + site2, site1_e, site2_e, '0', '0', str(singleNum), str(UMIs), str(primerNum), fusion_seq]
                fusion_dict[out_str[0]] = out_str
    # write final fusion stats
    fusion_final_file = indir + "/fusion_stat_final.txt"
    fusion_final = open(fusion_final_file, 'w')
    fusion_final.write(fusion_header)
    for site in fusion_dict.keys():
        fusion_final.write("\t".join(fusion_dict[site]) + "\n")
    fusion_final.close()
    # write final fusion read IDs
    fusion_readsID_file = indir + "/fusion_readsID_final.txt"
    fusion_readsID = open(fusion_readsID_file, 'w')
    fusion_readsID.write(ID_header)
    for info in fusion_ID_info_list:
        fusion_readsID.write("\t".join(info) + "\n")
    fusion_readsID.close()
    # write single-cluster results, sorted by descending read support
    single_cluster_file = indir + "/single_stat.txt"
    single_cluster = open(single_cluster_file, 'w')
    single_cluster.write("\t".join(["Point", "PointDown", "SingleNum", "UMI", "PrimerPair", "MismatchSeq", "MatchSeq"]) + "\n")
    site_count = {}
    site_out = {}
    for site in single_site_cluster_dict.keys():
        # Sites turned into synthetic fusion rows above are excluded here.
        if site not in site_ops:
            singleNum = len(single_site_ID_dict[site])
            results = GetUniqUMIPrimer(single_site_ID_dict[site], readsPrimer_dict)
            PointDown = single_site_info_dict[site][0][3]
            UMIs = results[0]
            primerNum = results[1]
            site_count[site] = singleNum
            info = [site, PointDown, str(singleNum), str(UMIs), str(primerNum)]
            info.extend(single_site_cluster_dict[site])
            site_out[site] = info
    site_count_sort = sorted(site_count.items(), key=lambda d: d[1], reverse=True)
    num = len(site_count_sort)
    for index in range(num):
        site = site_count_sort[index][0]
        single_cluster.write("\t".join(site_out[site]) + "\n")
    single_cluster.close()
    t2 = time.time()
    print("Times: " + str(int(t2 - t1)) + "s")
    print("BreakPoints Stat Single Done!")
  195. if __name__ == '__main__':
  196. parser = argparse.ArgumentParser(description='single breakpoints primers')
  197. parser.add_argument('-i', required=True, type=str, help="indir")
  198. parser.add_argument('-r', required=True, type=str, help="reads primer file")
  199. parser.add_argument('-m', required=True, type=str, help="primer homo file")
  200. parser.add_argument('-f', required=True, type=str, help="primer info file")
  201. args = parser.parse_args()
  202. main(args.i,args.r,args.m,args.f)