breakpoint_stat_single_v6.3.py

#!/usr/bin/python
# -*- coding:utf-8 -*-
import time
import argparse

from StatReadsFusion import SingleReadsFusion as SRF
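
# Overview: this script folds single-breakpoint evidence into the double-fusion
# results produced earlier in the QAseq-Fusion pipeline. It reads fusion_stat.txt,
# fusion_readsID.txt and single_breakpoints.txt from the input directory (plus the
# two primer DB files referenced below) and writes fusion_stat_final.txt,
# fusion_readsID_final.txt and single_stat.txt back into the same directory.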


def GetHomo(homo_file, homo_dict):
    # Load the primer homology table: one primer per line, tab-separated,
    # with a comma-separated list in the second column.
    with open(homo_file, 'r') as hf:
        for line in hf:
            info = line.strip("\n").split("\t")
            primer = info[0]
            info_list = info[1].split(",")
            homo_dict[primer] = info_list
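
# As implied by the parsing above, each line of primer_home_stat.txt is expected to
# hold a primer name, a tab, then a comma-separated list. Purely illustrative
# placeholder values:
#   PRIMER_X<TAB>valueA,valueB,valueC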


def GetInfo(info_file, info_dict, primer_seq_dict):
    # Load the primer info table and, in addition, index primer names by their
    # sequence (column 2) so primers sharing a sequence can be looked up together.
    with open(info_file, 'r') as inf:
        for line in inf:
            info = line.strip("\n").split("\t")
            primer = info[0]
            info_dict[primer] = info
            seq = info[1]
            if seq not in primer_seq_dict.keys():
                primer_seq_dict[seq] = [info[0]]
            else:
                primer_seq_dict[seq].append(info[0])


def GetUniqUMIPrimer(ID_list, readsPrimer_dict):
    # Count the unique UMIs and unique primer names among the given read IDs.
    UMIs_dict = {}
    Primer_dict = {}
    for ID in ID_list:
        temp = ID.split("_")
        if len(temp) >= 5:
            UMI_tmp = ID.split("_")
            UMI = UMI_tmp[2]
            ReadsID = UMI_tmp[0]
            primer = readsPrimer_dict[ReadsID]
            UMIs_dict[UMI] = 0
            Primer_dict[primer] = 0
    return len(UMIs_dict), len(Primer_dict)
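
# Note on the read-ID shape assumed by GetUniqUMIPrimer: IDs are split on "_",
# only IDs with at least five fields are counted, the raw read name is taken from
# field 1 and the UMI from field 3. A hypothetical example (placeholder values):
#   "read001_S1_ACGTACGT_F_R"  ->  read name "read001", UMI "ACGTACGT"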


def main(indir, readsIDFile):
    t1 = time.time()
    homo_file = "/cgdata/IVDRD/script/QAseq-Fusion_v2/DB/primer_home_stat.txt"
    info_file = "/cgdata/IVDRD/script/QAseq-Fusion_v2/DB/primer_info_final.txt"
    homo_dict = {}
    info_dict = {}
    primer_seq_dict = {}
    # double fusion
    fusion_file = indir + "/fusion_stat.txt"
    fusion_dict = {}
    fusion_header = ""
    with open(fusion_file, 'r') as dff:
        fusion_header = dff.readline()
        for line in dff:
            info = line.strip("\n").split("\t")
            site = info[0]
            fusion_dict[site] = info
    # double fusion readsID
    fusion_ID_file = indir + "/fusion_readsID.txt"
    fusion_ID_dict = {}
    fusion_ID_info_list = []
    ID_header = ""
    with open(fusion_ID_file, 'r') as dff:
        ID_header = dff.readline()
        for line in dff:
            info = line.strip("\n").split("\t")
            site = info[0]
            readsID = info[1]
            if site not in fusion_ID_dict.keys():
                fusion_ID_dict[site] = [readsID]
            else:
                fusion_ID_dict[site].append(readsID)
            fusion_ID_info_list.append(info)
    GetHomo(homo_file, homo_dict)
    GetInfo(info_file, info_dict, primer_seq_dict)
    single_points_file = indir + "/single_breakpoints.txt"
    single_site_info_dict = {}
    single_site_ID_dict = {}
    single_site_total_list = []
    readsPrimer_dict = {}
    srf = SRF(homo_dict, info_dict, primer_seq_dict)
    srf.StatPrimer(readsIDFile, readsPrimer_dict)
    with open(single_points_file, 'r') as spf:
        header = spf.readline().strip("\n").split("\t")
        Reads_index = header.index("ReadsID")
        Site_index = header.index("Site")
        for line in spf:
            info = line.strip("\n").split("\t")
            site = info[Site_index]
            ID = info[Reads_index]
            single_site_total_list.append(info)
            if site not in single_site_info_dict.keys():
                single_site_info_dict[site] = [info]
                single_site_ID_dict[site] = [ID]
            else:
                single_site_info_dict[site].append(info)
                single_site_ID_dict[site].append(ID)
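
    # single_breakpoints.txt column layout, as implied by the index use here and in
    # the loops further down: Site in the first column, then the mismatch sequence,
    # the match sequence, a downstream point (reported later as "PointDown"), and
    # ReadsID in the fifth column. Only "ReadsID" and "Site" are named in the header.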
    single_site_cluster_dict = {}
    single_double_site_dict = {}
    single_select_list = []
    # combine the sequences for each single-breakpoint site and match the result
    # against the known double-fusion sites
    for site in single_site_info_dict.keys():
        results = srf.GetSingleCombine(single_site_info_dict[site])
        if len(results[0]) < 10:
            continue
        single_site_cluster_dict[site] = list(results)
        for double_site in fusion_dict.keys():
            if srf.CheckSite(site, double_site):
                double_temp_list = fusion_dict[double_site]
                fusion_seq = double_temp_list[-1]
                if srf.CheckFusionSeq(results[0] + results[1], fusion_seq):
                    single_select_list.append(site)
                    if double_site not in single_double_site_dict.keys():
                        single_double_site_dict[double_site] = [site]
                    else:
                        single_double_site_dict[double_site].append(site)
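
    # Layout of a fusion_dict record, as inferred from the index accesses and the
    # out_str construction below (the authoritative header comes from fusion_stat.txt):
    # column 6 holds the single-read support count, column 7 the UMI count, column 8
    # the primer count, and the last column the fusion sequence.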
    # add the matched single clusters into fusion_dict (update the double-fusion records)
    for ds in single_double_site_dict.keys():
        site_list = single_double_site_dict[ds]
        temp_ID = []
        for site in site_list:
            ID_list = single_site_ID_dict[site]
            temp_ID.extend(ID_list)
            for ID_temp in ID_list:
                fusion_ID_info_list.append([ds, ID_temp, 'Single'])
        singleNum = len(temp_ID)
        info = fusion_dict[ds]
        info[5] = str(singleNum)
        temp_ID.extend(fusion_ID_dict[ds])
        results = GetUniqUMIPrimer(temp_ID, readsPrimer_dict)
        UMIs = results[0]
        primerNum = results[1]
        info[6] = str(int(info[6]) + UMIs)
        info[7] = str(primerNum)
        fusion_dict[ds] = info
    # add the remaining single breakpoints into fusion_dict as new records
    for item in single_site_total_list:
        site = item[0]
        if site in single_select_list:
            continue
        mismatch_seq = item[1]
        match_seq = item[2]
        readsID = item[4]
        results = srf.GetSinglePoint(readsID, mismatch_seq, match_seq)
        if len(results) > 0:
            single_select_list.append(site)
            for item_list in results:
                # e.g. ['GGCATGGTGGTGGGCGTCTGTAGTCCCAGCTACTCAGGAGGCTGAGG', 'chr10:59897395', 'chr10:59897425']
                fusion_seq = item_list[0]
                site1 = item_list[1]
                site1_e = item_list[2]
                site2 = site
                site2_e = item[3]
                singleNum = len(single_site_ID_dict[site])
                results = GetUniqUMIPrimer(single_site_ID_dict[site], readsPrimer_dict)
                UMIs = results[0]
                primerNum = results[1]
                out_str = [site1 + "-" + site2, site1_e, site2_e, '0', '0', str(singleNum), str(UMIs), str(primerNum), fusion_seq]
                fusion_dict[out_str[0]] = out_str
    # write the final fusion stat table
    fusion_final_file = indir + "/fusion_stat_final.txt"
    fusion_final = open(fusion_final_file, 'w')
    fusion_final.write(fusion_header)
    for site in fusion_dict.keys():
        fusion_final.write("\t".join(fusion_dict[site]) + "\n")
    fusion_final.close()
    # write the final fusion readsID table
    fusion_readsID_file = indir + "/fusion_readsID_final.txt"
    fusion_readsID = open(fusion_readsID_file, 'w')
    fusion_readsID.write(ID_header)
    for info in fusion_ID_info_list:
        fusion_readsID.write("\t".join(info) + "\n")
    fusion_readsID.close()
    # write the single cluster results
    single_cluster_file = indir + "/single_stat.txt"
    single_cluster = open(single_cluster_file, 'w')
    single_cluster.write("\t".join(["Point", "PointDown", "SingleNum", "UMIs", "PrimerPairs", "MismatchSeq", "MatchSeq"]) + "\n")
    site_count = {}
    site_out = {}
    for site in single_site_cluster_dict.keys():
        if site not in single_select_list:
            singleNum = len(single_site_ID_dict[site])
            results = GetUniqUMIPrimer(single_site_ID_dict[site], readsPrimer_dict)
            PointDown = single_site_info_dict[site][0][3]
            UMIs = results[0]
            primerNum = results[1]
            site_count[site] = singleNum
            info = [site, PointDown, str(singleNum), str(UMIs), str(primerNum)]
            info.extend(single_site_cluster_dict[site])
            site_out[site] = info
    # sort remaining sites by supporting read count, descending
    site_count_sort = sorted(site_count.items(), key=lambda d: d[1], reverse=True)
    num = len(site_count_sort)
    for index in range(num):
        site = site_count_sort[index][0]
        single_cluster.write("\t".join(site_out[site]) + "\n")
    single_cluster.close()
    t2 = time.time()
    print("Times: " + str(int(t2 - t1)) + "s")
    print("BreakPoints Stat Single Done!")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='single breakpoints primers')
    parser.add_argument('-i', required=True, type=str, help="indir")
    parser.add_argument('-r', required=True, type=str, help="reads primer file")
    args = parser.parse_args()
    main(args.i, args.r)
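
# Example invocation (placeholder paths):
#   python breakpoint_stat_single_v6.3.py -i /path/to/sample_outdir -r /path/to/reads_primer_file.txt
# The -i directory must already contain fusion_stat.txt, fusion_readsID.txt and
# single_breakpoints.txt; -r is the reads/primer file passed to srf.StatPrimer.
# Outputs (fusion_stat_final.txt, fusion_readsID_final.txt, single_stat.txt) are
# written back into the same directory.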