#!/usr/bin/python
# -*- coding:utf-8 -*-
"""Merge single-breakpoint evidence into double-fusion statistics.

Reads fusion_stat.txt, fusion_readsID.txt and single_breakpoint.txt from an
input directory, attaches supporting single-breakpoint reads to existing
fusion sites (or promotes unmatched single points to new fusion records),
and writes fusion_stat_final.txt, fusion_readsID_final.txt and
single_stat.txt back into the same directory.
"""
import argparse
import time

from StatReadsFusion import SingleReadsFusion as SRF


def GetHomo(homo_file, homo_dict):
    """Load primer -> homologous-primer list from a tab-separated file.

    Each line is: primer<TAB>comma-separated names. homo_dict is mutated
    in place.
    """
    with open(homo_file, 'r') as hf:
        for line in hf:
            info = line.strip("\n").split("\t")
            homo_dict[info[0]] = info[1].split(",")


def GetInfo(info_file, info_dict, primer_seq_dict):
    """Load primer records; also index primer names by sequence (column 2).

    info_dict: primer name -> full record (list of columns).
    primer_seq_dict: primer sequence -> list of primer names sharing it.
    Both dicts are mutated in place.
    """
    with open(info_file, 'r') as inf:
        for line in inf:
            info = line.strip("\n").split("\t")
            primer = info[0]
            info_dict[primer] = info
            # Several primers may share one sequence; group them together.
            primer_seq_dict.setdefault(info[1], []).append(primer)


def GetUniqUMIPrimer(ID_list, readsPrimer_dict):
    """Count distinct UMIs and distinct primers among the given read IDs.

    IDs are '_'-separated; only IDs with at least 5 fields are counted
    (field 0 = read ID used to look up its primer, field 2 = UMI).

    Returns (unique_UMI_count, unique_primer_count).
    """
    umis = set()
    primers = set()
    for ID in ID_list:
        parts = ID.split("_")  # split once and reuse (original split twice)
        if len(parts) >= 5:
            umis.add(parts[2])
            primers.add(readsPrimer_dict[parts[0]])
    return len(umis), len(primers)


def main(indir, readsIDFile, homo_file, info_file):
    """Attach single-breakpoint support to fusion calls and write outputs.

    indir: directory holding the three input tables; results are written
        back into it.
    readsIDFile: reads -> primer mapping consumed by SRF.StatPrimer.
    homo_file: primer homology table (see GetHomo).
    info_file: primer info table (see GetInfo).
    """
    t1 = time.time()
    homo_dict = {}
    info_dict = {}
    primer_seq_dict = {}

    # Double-fusion statistics: site -> full record (list of columns).
    fusion_file = indir + "/fusion_stat.txt"
    fusion_dict = {}
    with open(fusion_file, 'r') as dff:
        fusion_header = dff.readline()
        for line in dff:
            info = line.strip("\n").split("\t")
            fusion_dict[info[0]] = info

    # Double-fusion supporting reads: site -> [readsID], plus the raw rows
    # (kept in order so they can be re-emitted verbatim later).
    fusion_ID_file = indir + "/fusion_readsID.txt"
    fusion_ID_dict = {}
    fusion_ID_info_list = []
    with open(fusion_ID_file, 'r') as dff:
        ID_header = dff.readline()
        for line in dff:
            info = line.strip("\n").split("\t")
            site, readsID = info[0], info[1]
            fusion_ID_dict.setdefault(site, []).append(readsID)
            fusion_ID_info_list.append(info)

    GetHomo(homo_file, homo_dict)
    GetInfo(info_file, info_dict, primer_seq_dict)

    single_points_file = indir + "/single_breakpoint.txt"
    single_site_info_dict = {}
    single_site_ID_dict = {}
    single_site_total_list = []
    readsPrimer_dict = {}
    srf = SRF(homo_dict, info_dict, primer_seq_dict)
    srf.StatPrimer(readsIDFile, readsPrimer_dict)

    # Single-breakpoint rows, grouped by site; column positions are taken
    # from the header so the file layout may vary.
    with open(single_points_file, 'r') as spf:
        header = spf.readline().strip("\n").split("\t")
        Reads_index = header.index("ReadsID")
        Site_index = header.index("Site")
        for line in spf:
            info = line.strip("\n").split("\t")
            site = info[Site_index]
            single_site_total_list.append(info)
            single_site_info_dict.setdefault(site, []).append(info)
            single_site_ID_dict.setdefault(site, []).append(info[Reads_index])

    single_site_cluster_dict = {}
    single_double_site_dict = {}
    # Sites whose combined sequence matched a known double fusion (set for
    # O(1) membership tests below; the original used a list).
    single_select_set = set()

    # Combine reads per single site; attach matching clusters to known fusions.
    for site in single_site_info_dict.keys():
        results = srf.GetSingleCombine(single_site_info_dict[site])
        # Require a minimum combined-sequence length of 10.
        if len(results[0]) < 10:
            continue
        single_site_cluster_dict[site] = list(results)
        for double_site in fusion_dict.keys():
            if srf.CheckSite(site, double_site):
                fusion_seq = fusion_dict[double_site][-1]
                if srf.CheckFusionSeq(results[0] + results[1], fusion_seq):
                    single_select_set.add(site)
                    single_double_site_dict.setdefault(double_site, []).append(site)

    # Fold the selected single-site reads into their matching fusion records.
    # fusion_dict columns 5/6/7 are rewritten: single-read count, unique UMI
    # count, unique primer count (UMI/primer counts include the original
    # fusion reads as well).
    for ds in single_double_site_dict.keys():
        temp_ID = []
        for site in single_double_site_dict[ds]:
            ID_list = single_site_ID_dict[site]
            temp_ID.extend(ID_list)
            for ID_temp in ID_list:
                fusion_ID_info_list.append([ds, ID_temp, 'Single'])
        info = fusion_dict[ds]
        info[5] = str(len(temp_ID))
        temp_ID.extend(fusion_ID_dict[ds])
        UMIs, primerNum = GetUniqUMIPrimer(temp_ID, readsPrimer_dict)
        info[6] = str(UMIs)
        info[7] = str(primerNum)

    # Re-examine unselected single points: SRF may resolve one into a new
    # fusion site named "site1-site2" which is appended to fusion_dict.
    site_ops = set()  # sites consumed here; excluded from single_stat.txt
    for item in single_site_total_list:
        site = item[0]
        if site in single_select_set:
            continue
        mismatch_seq = item[1]
        match_seq = item[2]
        readsID = item[4]
        results = srf.GetSinglePoint(readsID, mismatch_seq, match_seq)
        if results[0] and len(results[1]) > 0:
            for item_list in results[1]:
                site_ops.add(site)
                # item_list layout, e.g.:
                # ['GGCATGGTGG...', 'chr10:59897395', 'chr10:59897425']
                fusion_seq = item_list[0]
                site1 = item_list[1]
                site1_e = item_list[2]
                singleNum = len(single_site_ID_dict[site])
                UMIs, primerNum = GetUniqUMIPrimer(single_site_ID_dict[site],
                                                   readsPrimer_dict)
                out_str = [site1 + "-" + site, site1_e, item[3], '0', '0',
                           str(singleNum), str(UMIs), str(primerNum), fusion_seq]
                fusion_dict[out_str[0]] = out_str

    # Write the final fusion table.
    with open(indir + "/fusion_stat_final.txt", 'w') as fusion_final:
        fusion_final.write(fusion_header)
        for site in fusion_dict.keys():
            fusion_final.write("\t".join(fusion_dict[site]) + "\n")

    # Write the final fusion-supporting reads (original rows plus the
    # 'Single' additions made above).
    with open(indir + "/fusion_readsID_final.txt", 'w') as fusion_readsID:
        fusion_readsID.write(ID_header)
        for info in fusion_ID_info_list:
            fusion_readsID.write("\t".join(info) + "\n")

    # Write the single-breakpoint clusters that were NOT promoted to fusion
    # sites, sorted by supporting-read count (descending).
    with open(indir + "/single_stat.txt", 'w') as single_cluster:
        single_cluster.write("\t".join(["Point", "PointDown", "SingleNum",
                                        "UMI", "PrimerPair", "MismatchSeq",
                                        "MatchSeq"]) + "\n")
        site_count = {}
        site_out = {}
        for site in single_site_cluster_dict.keys():
            if site in site_ops:
                continue
            IDs = single_site_ID_dict[site]
            UMIs, primerNum = GetUniqUMIPrimer(IDs, readsPrimer_dict)
            PointDown = single_site_info_dict[site][0][3]
            site_count[site] = len(IDs)
            info = [site, PointDown, str(len(IDs)), str(UMIs), str(primerNum)]
            info.extend(single_site_cluster_dict[site])
            site_out[site] = info
        for site, _num in sorted(site_count.items(), key=lambda d: d[1],
                                 reverse=True):
            single_cluster.write("\t".join(site_out[site]) + "\n")

    t2 = time.time()
    print("Times: " + str(int(t2 - t1)) + "s")
    print("BreakPoints Stat Single Done!")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='single breakpoints primers')
    parser.add_argument('-i', required=True, type=str, help="indir")
    parser.add_argument('-r', required=True, type=str, help="reads primer file")
    parser.add_argument('-m', required=True, type=str, help="primer homo file")
    parser.add_argument('-f', required=True, type=str, help="primer info file")
    args = parser.parse_args()
    main(args.i, args.r, args.m, args.f)