123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228 |
- #!/usr/bin/python
- # -*- coding:utf-8 -*-
- import time,argparse
- from StatReadsFusion import SingleReadsFusion as SRF
def GetHomo(homo_file, homo_dict):
    """Load a primer-homology table into *homo_dict* (mutated in place).

    Each line of *homo_file* is tab-separated: column 0 is the primer
    name, column 1 is a comma-separated list of homologous entries.
    Maps primer -> list of entries. Returns None.
    """
    with open(homo_file, 'r') as handle:
        for raw in handle:
            fields = raw.strip("\n").split("\t")
            # fields[0] = primer name, fields[1] = comma-joined homolog list
            homo_dict[fields[0]] = fields[1].split(",")
def GetInfo(info_file, info_dict, primer_seq_dict):
    """Load a primer info table; fill both output dicts in place.

    Each line of *info_file* is tab-separated: column 0 is the primer
    name, column 1 is the primer sequence.

    info_dict:       primer name -> full field list of its line.
    primer_seq_dict: sequence    -> list of primer names sharing it.
    Returns None.
    """
    with open(info_file, 'r') as inf:
        for line in inf:
            info = line.strip("\n").split("\t")
            primer = info[0]
            info_dict[primer] = info
            # setdefault replaces the "in keys() / else append" branch:
            # one lookup, no duplicated append logic.
            primer_seq_dict.setdefault(info[1], []).append(primer)
def GetUniqUMIPrimer(ID_list, readsPrimer_dict):
    """Count distinct UMIs and distinct primers over a list of read IDs.

    A read ID is "_"-separated; field 0 is the read name (the key into
    *readsPrimer_dict*) and field 2 is the UMI. IDs with fewer than 5
    fields carry no UMI information and are skipped.

    Returns (number of unique UMIs, number of unique primers).
    Raises KeyError if a qualifying read name is missing from
    readsPrimer_dict (same as the original behavior).
    """
    umis = set()
    primers = set()
    for read_id in ID_list:
        # Original code split each ID twice; split once and reuse.
        fields = read_id.split("_")
        if len(fields) < 5:
            continue
        umis.add(fields[2])
        primers.add(readsPrimer_dict[fields[0]])
    # Sets replace the original dicts-with-dummy-zero-values idiom.
    return len(umis), len(primers)
def main(indir,readsIDFile,homo_file,info_file):
    """Merge single-breakpoint evidence into the fusion statistics tables.

    Reads <indir>/fusion_stat.txt, <indir>/fusion_readsID.txt and
    <indir>/single_breakpoint.txt, attributes single-breakpoint reads to
    existing double-fusion sites (or promotes them to new entries), then
    writes <indir>/fusion_stat_final.txt, <indir>/fusion_readsID_final.txt
    and <indir>/single_stat.txt.

    indir:       working directory holding the input/output files.
    readsIDFile: reads->primer mapping file, consumed by SRF.StatPrimer.
    homo_file:   primer homology table (see GetHomo).
    info_file:   primer info table (see GetInfo).
    """
    t1=time.time()
    homo_dict={}
    info_dict={}
    primer_seq_dict={}
    # double fusion summary table: site -> full field list of its row
    fusion_file=indir+"/fusion_stat.txt"
    fusion_dict={}
    fusion_header=""
    with open(fusion_file,'r') as dff:
        fusion_header = dff.readline()
        for line in dff:
            info = line.strip("\n").split("\t")
            site = info[0]
            fusion_dict[site]=info
    # double fusion readsID table: site -> list of supporting read IDs
    fusion_ID_file=indir+"/fusion_readsID.txt"
    fusion_ID_dict={}
    fusion_ID_info_list=[]
    ID_header=""
    with open(fusion_ID_file,'r') as dff:
        ID_header = dff.readline()
        for line in dff:
            info = line.strip("\n").split("\t")
            site = info[0]
            readsID = info[1]
            if site not in fusion_ID_dict.keys():
                fusion_ID_dict[site]=[readsID]
            else:
                fusion_ID_dict[site].append(readsID)
            fusion_ID_info_list.append(info)
    GetHomo(homo_file,homo_dict)
    GetInfo(info_file,info_dict,primer_seq_dict)
    single_points_file=indir+"/single_breakpoint.txt"
    single_site_info_dict={}
    single_site_ID_dict={}
    single_site_total_list=[]
    readsPrimer_dict={}
    # SRF (project class) drives clustering/matching; StatPrimer fills
    # readsPrimer_dict with read name -> primer from readsIDFile.
    srf=SRF(homo_dict,info_dict,primer_seq_dict)
    srf.StatPrimer(readsIDFile,readsPrimer_dict)
    # group single-breakpoint rows and read IDs by site; column positions
    # of ReadsID/Site are taken from the header, other columns are
    # accessed positionally below (item[1..4]).
    with open(single_points_file,'r') as spf:
        header = spf.readline().strip("\n").split("\t")
        Reads_index = header.index("ReadsID")
        Site_index = header.index("Site")
        for line in spf:
            info = line.strip("\n").split("\t")
            site = info[Site_index]
            ID = info[Reads_index]
            single_site_total_list.append(info)
            if site not in single_site_info_dict.keys():
                single_site_info_dict[site]=[info]
                single_site_ID_dict[site]=[ID]
            else:
                single_site_info_dict[site].append(info)
                single_site_ID_dict[site].append(ID)

    single_site_cluster_dict={}
    single_double_site_dict={}
    single_select_list=[]
    # build a combined consensus per single site and try to match it
    # against each known double-fusion site's sequence
    for site in single_site_info_dict.keys():
        results=srf.GetSingleCombine(single_site_info_dict[site])
        # consensus shorter than 10 bases: too little evidence, skip
        if len(results[0])<10:
            continue
        single_site_cluster_dict[site]=list(results)
        for double_site in fusion_dict.keys():
            if srf.CheckSite(site,double_site):
                double_temp_list=fusion_dict[double_site]
                fusion_seq = double_temp_list[-1]
                if srf.CheckFusionSeq(results[0]+results[1],fusion_seq):
                    single_select_list.append(site)
                    if double_site not in single_double_site_dict.keys():
                        single_double_site_dict[double_site]=[site]
                    else:
                        single_double_site_dict[double_site].append(site)

    # fold the matched single clusters into fusion_dict: update the
    # single-read count (col 5), unique-UMI count (col 6) and
    # unique-primer count (col 7) of the matching double-fusion row
    for ds in single_double_site_dict.keys():
        site_list = single_double_site_dict[ds]
        temp_ID = []
        for site in site_list:
            ID_list = single_site_ID_dict[site]
            temp_ID.extend(ID_list)
            for ID_temp in ID_list:
                fusion_ID_info_list.append([ds,ID_temp,'Single'])
        singleNum = len(temp_ID)
        info = fusion_dict[ds]
        info[5] = str(singleNum)
        # UMI/primer counts are computed over single + double read IDs
        temp_ID.extend(fusion_ID_dict[ds])
        results=GetUniqUMIPrimer(temp_ID,readsPrimer_dict)
        UMIs = results[0]
        primerNum = results[1]
        info[6] = str(UMIs)
        info[7] = str(primerNum)
        fusion_dict[ds]=info
    # promote remaining (unmatched) single breakpoints to new fusion rows
    # when SRF.GetSinglePoint can pair them with a partner breakpoint
    site_ops=[]
    for item in single_site_total_list:
        site = item[0]
        if site in single_select_list:
            continue
        mismatch_seq= item[1]
        match_seq = item[2]
        readsID = item[4]
        results=srf.GetSinglePoint(readsID,mismatch_seq,match_seq)
        if results[0] and len(results[1])>0:
            for item_list in results[1]:
                site_ops.append(site)
                # item_list layout, e.g.:
                # ['GGCATGGTGG...', 'chr10:59897395', 'chr10:59897425']
                fusion_seq=item_list[0]
                site1 = item_list[1]
                site1_e = item_list[2]
                site2 = site
                site2_e = item[3]
                singleNum = len(single_site_ID_dict[site])
                results_NUM=GetUniqUMIPrimer(single_site_ID_dict[site],readsPrimer_dict)
                UMIs = results_NUM[0]
                primerNum = results_NUM[1]
                # same column layout as fusion_stat rows; double counts are '0'
                out_str=[site1+"-"+site2,site1_e,site2_e,'0','0',str(singleNum),str(UMIs),str(primerNum),fusion_seq]
                fusion_dict[out_str[0]] = out_str

    # write the final fusion statistics table
    fusion_final_file=indir+"/fusion_stat_final.txt"
    fusion_final=open(fusion_final_file,'w')
    fusion_final.write(fusion_header)
    for site in fusion_dict.keys():
        fusion_final.write("\t".join(fusion_dict[site])+"\n")
    fusion_final.close()
    # write the final fusion readsID table (original + 'Single' rows)
    fusion_readsID_file=indir+"/fusion_readsID_final.txt"
    fusion_readsID=open(fusion_readsID_file,'w')
    fusion_readsID.write(ID_header)
    for info in fusion_ID_info_list:
        fusion_readsID.write("\t".join(info)+"\n")
    fusion_readsID.close()
    # write the single-cluster table, sorted by supporting-read count
    single_cluster_file=indir+"/single_stat.txt"
    single_cluster=open(single_cluster_file,'w')
    single_cluster.write("\t".join(["Point","PointDown","SingleNum","UMI","PrimerPair","MismatchSeq","MatchSeq"])+"\n")
    site_count={}
    site_out={}
    for site in single_site_cluster_dict.keys():
        # only sites that were not promoted to fusion rows above
        if site not in site_ops:
            singleNum = len(single_site_ID_dict[site])
            results = GetUniqUMIPrimer(single_site_ID_dict[site],readsPrimer_dict)
            PointDown=single_site_info_dict[site][0][3]
            UMIs = results[0]
            primerNum = results[1]
            site_count[site] = singleNum
            info = [site,PointDown,str(singleNum),str(UMIs),str(primerNum)]
            info.extend(single_site_cluster_dict[site])
            site_out[site]=info
    # descending order by number of supporting single reads
    site_count_sort= sorted(site_count.items(), key=lambda d:d[1], reverse = True)
    num = len(site_count_sort)
    for index in range(num):
        site = site_count_sort[index][0]
        single_cluster.write("\t".join(site_out[site])+"\n")
    single_cluster.close()
    t2=time.time()
    print("Times: "+str(int(t2-t1))+"s")
    print("BreakPoints Stat Single Done!")
if __name__ == '__main__':
    # CLI entry point: four required flags, all plain strings.
    arg_parser = argparse.ArgumentParser(description='single breakpoints primers')
    for flag, helptext in (('-i', "indir"),
                           ('-r', "reads primer file"),
                           ('-m', "primer homo file"),
                           ('-f', "primer info file")):
        arg_parser.add_argument(flag, required=True, type=str, help=helptext)
    opts = arg_parser.parse_args()
    main(opts.i, opts.r, opts.m, opts.f)
|