breakpoint_stat_single_v6.1.py 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224
  1. #!/usr/bin/python
  2. # -*- coding:utf-8 -*-
  3. import argparse,time
  4. gene_info = {}
  5. gene_strand = {}
  6. DB_DIR="/cgdata/IVDRD/18gene_DB/qaseq_fusion"
  7. primer_homo_file=DB_DIR+"/sample_primer_stat.txt"
  8. primer_seq_file=DB_DIR+"/select_primer_info_total.txt"
  9. ref=DB_DIR+"/hg38.fa"
  10. db=DB_DIR+"/hg38"
  11. transcript_nm=DB_DIR+"/big_panel_transcript_nm_hg38.xls"
  12. PRIMER_INFO={}
  13. PRIMER_ID={}
  14. PRIMER_HOMO={}
  15. BASE_DICT={'A':'T','T':'A','G':'C','C':'G','N':'N'}
  16. def get_primer_info():
  17. #PRIMER HOMO
  18. #{'primer':['primer1','primer2','primer3',...],}
  19. with open(primer_homo_file,'r') as phf:
  20. for line in phf:
  21. info = line.strip("\n").split("\t")
  22. PRIMER_HOMO[info[0]]=info[1].split(",")
  23. #PRIMER ID
  24. #{'primer':'ATGC','primer1':'ATGCCC'}
  25. #PRIMER INFO
  26. with open(primer_seq_file,'r') as psf:
  27. for line in psf:
  28. info = line.strip("\n").split("\t")
  29. PRIMER_ID[info[0]]=info[1]
  30. PRIMER_INFO[info[0]]=info
  31. def get_PrimerPre_Site(PrimerPre_Site_Info_file,reads_Primer_dict):
  32. Info_dict={}
  33. with open(PrimerPre_Site_Info_file,'r') as PPSI:
  34. for line in PPSI:
  35. info = line.strip("\n").split("\t")
  36. ID=info[-1].split("_")[0]
  37. if ID == "":
  38. continue
  39. primerID=reads_Primer_dict[ID][0]
  40. if 'IPC' in primerID:
  41. continue
  42. mismatch_seq=info[2]
  43. match_seq=info[3]
  44. if info[1] =="Down":
  45. primerID=reads_Primer_dict[ID][1]
  46. if 'IPC' in primerID:
  47. continue
  48. mismatch_seq="".join([BASE_DICT[base] for base in mismatch_seq][::-1])
  49. match_seq="".join([BASE_DICT[base] for base in match_seq][::-1])
  50. Info_dict[info[0]]=[info[0],primerID,mismatch_seq,match_seq,info[-2]]
  51. return Info_dict
  52. def out_test(dict):
  53. for key in dict.keys():
  54. print("TEST:")
  55. print(key)
  56. print(dict[key])
  57. def OvarLap(seq1,seq2,MISMATCH=1):
  58. num = len(seq1)
  59. count=0
  60. for index in range(num):
  61. if seq1[index] !=seq2[index]:
  62. count +=1
  63. if count >MISMATCH:
  64. return False
  65. return True
  66. def check(primer_seq,primer_down_seq,mismatch_seq):
  67. mismatch_len=len(mismatch_seq)
  68. print("TEST")
  69. print(primer_seq)
  70. print(primer_down_seq)
  71. print(mismatch_seq)
  72. for index in range(mismatch_len,5,-1):
  73. num=min(len(primer_seq),mismatch_len-index)
  74. for match_index in range(num,5,-1):
  75. primer_temp_seq=primer_seq[-1*match_index:]+primer_down_seq[:index]
  76. mismatch_temp_seq=mismatch_seq[-1*index-match_index:]
  77. if OvarLap(primer_temp_seq,mismatch_temp_seq):
  78. return True,index,match_index+index
  79. return False,0,0
  80. def get_primer_Site(primer_list,mismatch_seq,match_seq):
  81. primer_candidate_fusion_dict={}
  82. for primer in primer_list:
  83. primer_temp=PRIMER_INFO[primer]
  84. primer_down_seq=primer_temp[-1]
  85. primer_up_seq=primer_temp[-2]
  86. primer_seq=primer_temp[1]
  87. check_results=check(primer_seq,primer_down_seq,mismatch_seq)
  88. print(check_results)
  89. if check_results[0]:
  90. match_common_len=int(check_results[1])
  91. total_match_common_len=int(check_results[2])
  92. strand = primer_temp[5]
  93. primer_chr=primer_temp[2]
  94. primer_start=int(primer_temp[3])
  95. primer_end=int(primer_temp[4])
  96. breakpoints=primer_end+match_common_len
  97. site_e=primer_chr+":"+str(breakpoints-30)
  98. if match_common_len>=30:
  99. fusion_seq=primer_down_seq[match_common_len-30:match_common_len]+match_seq[:30]
  100. else:
  101. if match_common_len+len(primer_seq)>=30:
  102. fusion_seq=primer_seq[-1*(30-match_common_len):]+primer_down_seq[:match_common_len]+match_seq[:30]
  103. elif match_common_len+len(primer_seq)>=15:
  104. primer_seq_len=len(primer_seq)
  105. fusion_seq=primer_up_seq[-1*(30-match_common_len-primer_seq_len):]+primer_seq+primer_down_seq[:match_common_len]+match_seq[:30]
  106. if strand == '-1':
  107. breakpoints=primer_start-match_common_len
  108. site_e=primer_chr+":"+str(breakpoints+30)
  109. site=primer_chr+":"+str(breakpoints)
  110. primer_candidate_fusion_dict[primer]=[primer,site,site_e,fusion_seq,str(total_match_common_len)]
  111. return primer_candidate_fusion_dict
  112. def get_Uniq_UMIs(ID_list):
  113. UMIs_dict={}
  114. for ID in ID_list:
  115. UMI=ID.split("_")[2]
  116. UMIs_dict[UMI]=0
  117. return len(UMIs_dict)
  118. def get_single_primer_points(PrimerPre_Site_Info,indir):
  119. #记录每个readsID 对应site
  120. single_points_PrimersPre_readsInfo_file=indir+"/single_points_PrimersPre_readsInfo.txt"
  121. single_points_PrimersPre_readsInfo_dict={}
  122. with open(single_points_PrimersPre_readsInfo_file,'r') as sppr:
  123. for line in sppr:
  124. info=line.strip("\n").split("\t")
  125. if info[0] not in single_points_PrimersPre_readsInfo_dict.keys():
  126. single_points_PrimersPre_readsInfo_dict[info[0]]=[info[-1]]
  127. else:
  128. single_points_PrimersPre_readsInfo_dict[info[0]].append(info[-1])
  129. fusion_breakpoints_detail_file=indir+"/single_fusion_breakpoints_detail.txt"
  130. fusion_breakpoints_detail=open(fusion_breakpoints_detail_file,'w')
  131. fusion_breakpoints_detail.write("\t".join(['KeyPoints','Points','Point1_End','Point2_End','FusionSeq','Primer'])+"\n")
  132. fusion_breakpoints_readsID_file=indir+"/single_fusion_breakpoints_readsID.txt"
  133. fusion_breakpoints_readsID=open(fusion_breakpoints_readsID_file,'w')
  134. fusion_breakpoints_readsID.write("\t".join(['Points','FusionSeq','ReadsID'])+"\n")
  135. fusion_breakpoints_stat_file=indir+"/single_fusion_breakpoints_stat.txt"
  136. fusion_breakpoints_stat=open(fusion_breakpoints_stat_file,'w')
  137. fusion_breakpoints_stat.write("\t".join(['Points','Point1_End','Point2_End',"Overlap",'DoubleReads','SingleReads','UMIkinds','FusionSeq'])+"\n")
  138. for key_site in PrimerPre_Site_Info.keys():
  139. temp_info_dict=PrimerPre_Site_Info[key_site]
  140. primerID=temp_info_dict[1]
  141. key_site_end=temp_info_dict[-1]
  142. if primerID in PRIMER_HOMO.keys():
  143. primer_list=PRIMER_HOMO[primerID]
  144. else:
  145. primer_list=[]
  146. primer_list.append(primerID)
  147. primer_candidate_fusion_dict=get_primer_Site(primer_list,temp_info_dict[2],temp_info_dict[3])
  148. for item in primer_candidate_fusion_dict.keys():
  149. item_temp=primer_candidate_fusion_dict[item]
  150. site=item_temp[1]+"-"+key_site
  151. fusion_breakpoints_detail.write("\t".join([key_site,site,item_temp[2],key_site_end,item_temp[3],item])+"\n")
  152. ID_list=single_points_PrimersPre_readsInfo_dict[key_site]
  153. SingleID=len(ID_list)
  154. UMIkinds=get_Uniq_UMIs(ID_list)
  155. fusion_breakpoints_stat.write("\t".join([site,item_temp[2],key_site_end,str(0),str(0),str(SingleID),str(UMIkinds),item_temp[3]])+"\n")
  156. for ID in ID_list:
  157. fusion_breakpoints_readsID.write("\t".join([site,item_temp[-2],ID])+"\n")
  158. fusion_breakpoints_detail.close()
  159. fusion_breakpoints_readsID.close()
  160. fusion_breakpoints_stat.close()
  161. def get_reads_primers_info(readsPrimer):
  162. reads_Primer_dict={}
  163. with open(readsPrimer,'r') as rp:
  164. for line in rp:
  165. info=line.strip("\n").split("\t")
  166. if len(info)==10:
  167. reads_Primer_dict[info[0]]=[info[3],info[7]]
  168. return reads_Primer_dict
  169. def main(indir,readsPrimer):
  170. t1=time.time()
  171. get_primer_info()
  172. #读取reads primer信息
  173. reads_Primer_dict=get_reads_primers_info(readsPrimer)
  174. PrimerPre_Site_Info_file=indir+"/single_points_PrimersPre_site.txt"
  175. PrimerPre_Site_Info=get_PrimerPre_Site(PrimerPre_Site_Info_file,reads_Primer_dict)
  176. print(PrimerPre_Site_Info)
  177. get_single_primer_points(PrimerPre_Site_Info,indir)
  178. t2=time.time()
  179. print("Times: "+str(int(t2-t1))+"s")
  180. print("BreakPoints Stat Single Done!")
  181. if __name__ == '__main__':
  182. parser = argparse.ArgumentParser(description='single breakpoints primers')
  183. parser.add_argument('-i', required=True, type=str, help="indir")
  184. parser.add_argument('-r', required=True, type=str, help="reads primers file")
  185. args = parser.parse_args()
  186. main(args.i,args.r)