#!/usr/bin/python # -*- coding:utf-8 -*- import argparse,time BASE_DICT={'A':'T','G':'C','T':'A','C':'G','N':'N','-':'-'} def get_site_pos(site): f_chr=site.split("-")[0].split(':')[0] f_pos=site.split("-")[0].split(':')[1] e_chr=site.split("-")[1].split(':')[0] e_pos=site.split("-")[1].split(':')[1] return f_chr,f_pos,e_chr,e_pos def check_site(key_site,select_site): key_site_info=get_site_pos(key_site) select_site_info=get_site_pos(select_site) if key_site_info[0] == select_site_info[0] and abs(int(key_site_info[1])-int(select_site_info[1])) <=5: if key_site_info[2] == select_site_info[2] and abs(int(key_site_info[3])-int(select_site_info[3])) <=5: return True elif key_site_info[0] == select_site_info[2] and abs(int(key_site_info[1])-int(select_site_info[3])) <=5: if key_site_info[2] == select_site_info[0] and abs(int(key_site_info[3])-int(select_site_info[1])) <=5: return True #elif key_site_info[2] == select_site_info[0] and abs(int(key_site_info[3])-int(select_site_info[1])) <=5: # return True #elif key_site_info[2] == select_site_info[2] and abs(int(key_site_info[3])-int(select_site_info[3])) <=5: # return True else: return False def uniq_site_ID(site_list): site_dict_num={} total_num = len(site_list) for item in site_list: if item not in site_dict_num.keys(): site_dict_num[item]=1 else: site_dict_num[item]+=1 seq_site_sort= sorted(site_dict_num.items(), key=lambda d:d[1], reverse = True) site_num=len(seq_site_sort) key_site =seq_site_sort[0] site_list_new=[key_site] count = int(key_site[1]) for index in range(1,site_num): select_site=seq_site_sort[index] if not check_site(key_site[0],select_site[0]): site_list_new.append(select_site) else: count += int(select_site[1]) if 'chr2:29223740-chr1:13397210' in site_list_new[0][0]: print("TEST") print(seq_site_sort[0]) print(count) print(total_num) #不超过1半,则不合并 if count*2 >= total_num: return site_list_new else: return seq_site_sort def Seq_Overlap(seq1,seq2,MISMATCH=4): seq_len=min(len(seq1),len(seq2)) count =0 for index in range(seq_len): if seq1[index] != seq2[index]: count +=1 if count >MISMATCH: return False return True def IS_seq(key_seq,select_seq): select_seq_rev_comp="".join([BASE_DICT[base] for base in select_seq][::-1]) #select seq former for index in range(5): select_seq_temp=select_seq[index:] temp_len=len(select_seq_temp) key_seq_temp=key_seq[:temp_len] if Seq_Overlap(select_seq_temp,key_seq_temp): return True #select seq latter for index in range(5): key_seq_temp=key_seq[index:] temp_len=len(key_seq_temp) select_seq_temp=select_seq[:temp_len] if Seq_Overlap(select_seq_temp,key_seq_temp): return True #rev_comp select seq former for index in range(5): select_seq_rev_comp_temp=select_seq_rev_comp[index:] temp_len=len(select_seq_rev_comp_temp) key_seq_temp=key_seq[:temp_len] if Seq_Overlap(select_seq_rev_comp_temp,key_seq_temp): return True #rev_comp select seq latter for index in range(5): key_seq_temp=key_seq[index:] temp_len=len(key_seq_temp) select_seq_rev_comp_temp=select_seq_rev_comp[:temp_len] if Seq_Overlap(select_seq_rev_comp_temp,key_seq_temp): return True return False def get_combine_seq(key_seq,check_index,seq_count_sort): total_seq_num=len(seq_count_sort) key_seq_list=[] for index in range(total_seq_num): if index not in check_index: select_seq=seq_count_sort[index][0] if IS_seq(key_seq,select_seq): key_seq_list.append(select_seq) check_index.append(index) return key_seq_list def stat_double_fusion(infile,readsID_dict,site_dict): seq_site_dict={} seq_readsID_dict={} seq_site_info={} with open(infile,'r') as inputF: for line in inputF: info = line.strip("\n").split("\t") seq = info[3] ID = info[7] site = info[0] seq_rev_comp="".join([BASE_DICT[base] for base in seq][::-1]) if seq == "": continue seq_site_info[site]=[info[5],info[6]] if seq not in seq_site_dict.keys() and seq_rev_comp not in seq_site_dict.keys(): seq_site_dict[seq]=[site] seq_readsID_dict[seq]=[ID] elif seq_rev_comp in seq_site_dict.keys(): site_temp=site.split("-") site=site_temp[1]+'-'+site_temp[0] seq_site_dict[seq_rev_comp].append(site) seq_readsID_dict[seq_rev_comp].append(ID) elif seq in seq_site_dict.keys(): seq_site_dict[seq].append(site) seq_readsID_dict[seq].append(ID) #sort by count seq_count_dict={} for seq in seq_site_dict.keys(): count = len(seq_site_dict[seq]) seq_count_dict[seq]=count seq_count_sort= sorted(seq_count_dict.items(), key=lambda d:d[1], reverse = True) seq_site_detail={} #combine site for item in seq_count_sort: seq=item[0] #print(seq_site_dict[seq]) seq_site_dict[seq]=uniq_site_ID(seq_site_dict[seq]) #print(seq_site_dict[seq]) site=seq_site_dict[seq][0][0] site_rev_temp=site.split("-") site_rev=site_rev_temp[1]+'-'+site_rev_temp[0] seq_site_detail[seq]=seq_site_info[site] if site in seq_site_info else seq_site_info[site_rev] #combine seq total_seq_num=len(seq_count_sort) check_index=[] for i_key in seq_count_sort: print(i_key) for index in range(total_seq_num): if index not in check_index: key_seq=seq_count_sort[index][0] check_index.append(index) combine_seq_list=get_combine_seq(key_seq,check_index,seq_count_sort) print(key_seq) site_dict[key_seq]=[seq_site_dict[key_seq][0][0],seq_site_detail[key_seq][0],seq_site_detail[key_seq][1]] seq_ID_list=seq_readsID_dict[key_seq] for seq in combine_seq_list: seq_ID_list.extend(seq_readsID_dict[seq]) readsID_dict[key_seq]=seq_ID_list #out_test(site_dict) def check_single_fusion(info,FusionSeq_readsID_dict,FusionSeq_site_dict,single_readsID_dict,single_site_dict): site = info[0]+"-"+info[0] if info[1] == "Down": seq = info[3][-10:]+info[2][:10] elif info[1] == "Up": seq = info[2][-10:]+info[3][:10] for key_seq in FusionSeq_readsID_dict.keys(): key_seq_len_half=int(len(key_seq)/2) key_seq_temp=key_seq[key_seq_len_half-10:key_seq_len_half+10] if IS_seq(key_seq_temp,seq) and check_site(site,FusionSeq_site_dict[key_seq][0]): if key_seq not in single_readsID_dict.keys(): single_readsID_dict[key_seq]=[info[-1]] single_site_dict[key_seq]=FusionSeq_site_dict[key_seq] else: single_readsID_dict[key_seq].append(info[-1]) return True return False def fusion_seq_overlap(fusionseq1,fusionseq2): if Seq_Overlap(fusionseq1,fusionseq2): return True,0 for i in range(1,6): fusionseq1_tmp=fusionseq1[i:] fusionseq2_tmp=fusionseq2[:-i] if Seq_Overlap(fusionseq1_tmp,fusionseq2_tmp): return True,i for i in range(1,6): fusionseq2_tmp=fusionseq2[i:] fusionseq1_tmp=fusionseq1[:-i] if Seq_Overlap(fusionseq1_tmp,fusionseq2_tmp): return True,-1*i return False,0 def check_fusion_seq(down_list,up_list): #up_list #['CCCAGGCTGGAGTGCAGTGGTATGATCTTGGCTCATGGCAATCTCTGCCTCCGGGGTTCAAGCAATTCTTATGCCTCAGCCT', 'TTGAGATGGAGTCTCACTTTGTTG'] #down_list #['CCGGGTGCACACCATTCTCCTGCCTCAGCCTCCCCGGTAGGTAGCTGGGACTACAGGCGCCCACCCCCATGCCC', 'AGGAGTCTAG'] up_mismatch_len=min(len(up_list[1]),len(down_list[0])) down_mismatch_len=min(len(down_list[1]),len(up_list[0])) if up_mismatch_len > 25: up_mismatch_len=min(up_mismatch_len-10,25) else: up_mismatch_len=10 if down_mismatch_len > 25: down_mismatch_len=min(down_mismatch_len-10,25) else: down_mismatch_len=10 up_seq=up_list[1][-up_mismatch_len:]+up_list[0][:down_mismatch_len] down_seq=down_list[0][-up_mismatch_len:]+down_list[1][:down_mismatch_len] if len(up_seq)>=25: fusion_seq_overlap_results=fusion_seq_overlap(up_seq,down_seq) mismatch=fusion_seq_overlap_results[1] if mismatch >0: down_seq=down_list[0][-up_mismatch_len-mismatch:]+down_list[1][:down_mismatch_len-mismatch] fusion_seq_overlap_results_tmp=fusion_seq_overlap(up_seq,down_seq) if fusion_seq_overlap_results_tmp[1] !=0: return False,0,"" return [fusion_seq_overlap_results[0],fusion_seq_overlap_results[1],up_seq] else: return False,0,"" def check_adj_pos(p_fusion_seq,fusion_seq): for i in range(5): p_fusion_seq_tmp=p_fusion_seq[i:] tmp_len = len(p_fusion_seq_tmp) fusion_seq_tmp=fusion_seq[:tmp_len] if p_fusion_seq_tmp == fusion_seq_tmp: return True,i return False,0 def adj_pos(pos1,pos2,index): pos1_info=pos1.split(":") pos2_info=pos2.split(":") if index != 0: pos2_info[1]=str(int(pos2_info[1])+index) return ":".join(pos1_info)+"-"+":".join(pos2_info) def get_single_fusion(down_dict,up_dict,known_point,know_list,up_strand): fusion_seq_dict={} if up_strand == "+": index = 1 elif up_strand == "-": index = -1 for item in down_dict.keys(): for i in range(5): match_seq=down_dict[item][0][-5-i:] mismatch_seq=down_dict[item][1][:5-i] fusion_seq=match_seq+mismatch_seq #0 base if fusion_seq not in fusion_seq_dict.keys(): fusion_seq_dict[fusion_seq]=[item] else: fusion_seq_dict[fusion_seq].append(item) for item in up_dict.keys(): match_seq=up_dict[item][0][:5] mismatch_seq=up_dict[item][1][-5:] p_fusion_seq=mismatch_seq+match_seq #total if p_fusion_seq in fusion_seq_dict.keys(): for p_item in fusion_seq_dict[p_fusion_seq]: if item+"-"+p_item in know_list: continue check_fusion_seq_results=check_fusion_seq(down_dict[p_item],up_dict[item]) if check_fusion_seq_results[0]: fusion_seq=down_dict[p_item][0][-5:]+down_dict[p_item][1][:5] pos_results = check_adj_pos(p_fusion_seq,fusion_seq) if pos_results[0]: mismatch=index*int(pos_results[1]) known_point[adj_pos(p_item,item,mismatch)]=[item,p_item,check_fusion_seq_results[2],str(abs(mismatch))] know_list.append(item+"-"+p_item) know_list.append(p_item+"-"+item) continue def count_base(base): tmp_dict={'A':0,'T':0,'G':0,'C':0,'N':0} tmp_dict[base] +=1 return tmp_dict def base_dict(seq): tmp_dict={} index =0 for base in seq: tmp_dict[index]=count_base(base) index +=1 return tmp_dict def combine_dict(BASE_COUNT_DICT,temp_dict): num1=len(BASE_COUNT_DICT) num2=len(temp_dict) num=min(num1,num2) for pos in range(num): total_tmp_dict=BASE_COUNT_DICT[pos] tmp_tmp_dict=temp_dict[pos] for base in tmp_tmp_dict.keys(): total_tmp_dict[base] +=tmp_tmp_dict[base] if num2>num1: for pos in range(num,num2): BASE_COUNT_DICT[pos]=temp_dict[pos] def get_common_seq(seq_list,flag='Front'): BASE_COUNT_DICT={0:{'A':0,'T':0,'G':0,'C':0,'N':0}} for seq in seq_list: if flag == 'End': seq=seq[::-1] index =0 temp_dict=base_dict(seq) combine_dict(BASE_COUNT_DICT,temp_dict) common_seq=[] for index in BASE_COUNT_DICT.keys(): temp_dict = BASE_COUNT_DICT[index] temp_count=0 temp_base='' for base in temp_dict.keys(): if temp_dict[base]>temp_count: temp_count=temp_dict[base] temp_base=base common_seq.append(temp_base) if flag =='End': return "".join(common_seq[::-1]) return "".join(common_seq) def stat_single_fusion(single_points_file,FusionSeq_readsID_dict,FusionSeq_site_dict,single_readsID_dict,single_site_dict,indir): single_points_list=[] with open(single_points_file,'r') as spf: for line in spf: info = line.strip("\n").split("\t") if check_single_fusion(info,FusionSeq_readsID_dict,FusionSeq_site_dict,single_readsID_dict,single_site_dict): continue single_points_list.append(info) #single points #从reads i 根据比对结果找基因A,readss j上根据比对结果找基因B #同一个site,获取match seq和mismatch seq的commonc seq up_match_dict={} up_mismatch_dict={} down_match_dict={} down_mismatch_dict={} for item in single_points_list: match_seq=item[3] mismatch_seq=item[2] if item[1] == "Up": if item[0] not in up_match_dict.keys(): up_match_dict[item[0]]=[match_seq] up_mismatch_dict[item[0]]=[mismatch_seq] else: up_match_dict[item[0]].append(match_seq) up_mismatch_dict[item[0]].append(mismatch_seq) elif item[1] == "Down": if item[0] not in down_match_dict.keys(): down_match_dict[item[0]]=[match_seq] down_mismatch_dict[item[0]]=[mismatch_seq] else: down_match_dict[item[0]].append(match_seq) down_mismatch_dict[item[0]].append(mismatch_seq) #获取common seq up_dict={} up_rev_comp_dict={} down_dict={} down_rev_comp_dict={} for site in up_match_dict.keys(): match_seq=get_common_seq(up_match_dict[site],'Front') mismatch_seq=get_common_seq(up_mismatch_dict[site],'End') up_dict[site]=[mismatch_seq,match_seq] match_seq_rev_comp="".join([BASE_DICT[base] for base in match_seq])[::-1] mismatch_seq_rev_comp="".join([BASE_DICT[base] for base in mismatch_seq])[::-1] up_rev_comp_dict[site]=[match_seq_rev_comp,mismatch_seq_rev_comp] for site in down_match_dict.keys(): match_seq=get_common_seq(down_match_dict[site],'End') mismatch_seq=get_common_seq(down_mismatch_dict[site],'Front') down_dict[site]=[match_seq,mismatch_seq] match_seq_rev_comp="".join([BASE_DICT[base] for base in match_seq])[::-1] mismatch_seq_rev_comp="".join([BASE_DICT[base] for base in mismatch_seq])[::-1] down_rev_comp_dict[site]=[mismatch_seq_rev_comp,match_seq_rev_comp] known_point_dict={} know_list=[]#避免重复比较up_dict和down_dict #up-down if len(up_dict)>0 and len(down_dict)>0: get_single_fusion(down_dict,up_dict,known_point_dict,know_list,"+") #up-up_rev_comp if len(up_dict)>0: get_single_fusion(up_rev_comp_dict,up_dict,known_point_dict,know_list,"+") #down-down_rev_comp if len(down_dict)>0: get_single_fusion(down_dict,down_rev_comp_dict,known_point_dict,know_list,"-") #将known_point_dict中内容放到single_site_dict中 for site in known_point_dict.keys(): seq=known_point_dict[site][2] if seq not in single_site_dict.keys(): single_site_dict[seq]=site #记录每个readsID对应可能的fusion site,之后从reads i上根据比对结果找基因A,primer坐标上找基因B single_points_primerPre_file=indir+"/single_points_PrimersPre_readsInfo.txt" single_points_primerPre=open(single_points_primerPre_file,'w') single_points_PrimersPre_site={} for item in single_points_list: match_seq=item[3] mismatch_seq=item[2] select_site=item[0]+"-"+item[0] ID=item[-1] if item[1] == "Up": select_seq=mismatch_seq[-10:]+match_seq[:10] elif item[1] == "Down": select_seq=match_seq[-10:]+mismatch_seq[:10] else: continue for key_seq in single_site_dict.keys(): key_site=single_site_dict[key_seq] if IS_seq(key_seq,select_seq) and check_site(select_site,key_site[0]): if key_seq not in single_readsID_dict.keys(): single_readsID_dict[key_seq]=[ID] else: single_readsID_dict[key_seq].append(ID) continue single_points_PrimersPre_site[item[0]]=[item[0],'Up',up_dict[item[0]][0],up_dict[item[0]][1],item[-2],ID] if item[1] == "Up" else [item[0],'Down',down_dict[item[0]][0],down_dict[item[0]][1],item[-2],ID] single_points_primerPre.write("\t".join(item)+"\n") single_points_primerPre.close() #记录每个fusion site对应的fusion seq single_points_primerPre_site_file=indir+"/single_points_PrimersPre_site.txt" ingle_points_primerPre_site=open(single_points_primerPre_site_file,'w') for site in single_points_PrimersPre_site.keys(): ingle_points_primerPre_site.write("\t".join(single_points_PrimersPre_site[site])+"\n") ingle_points_primerPre_site.close() def get_Uniq_UMIs(ID_list): UMIs_dict={} for ID in ID_list: temp=ID.split("_") if len(temp)>=3: UMI=ID.split("_")[2] UMIs_dict[UMI]=0 return len(UMIs_dict) def out_fusion_stat_results(FusionSeq_readsID_dict,FusionSeq_site_dict,single_readsID_dict,single_site_dict,indir): #out fusionseq and stat double and single readsID Final_FusionSeq_dict={} for key_seq in FusionSeq_readsID_dict.keys(): site_list = FusionSeq_site_dict[key_seq] ID_list=FusionSeq_readsID_dict[key_seq] doubleID=len(ID_list) singleID=0 if key_seq in single_readsID_dict.keys(): singleID=len(single_readsID_dict[key_seq]) ID_list.extend(single_readsID_dict[key_seq]) UMIs=get_Uniq_UMIs(ID_list) site_list.append(str(doubleID)) site_list.append(str(singleID)) site_list.append(str(UMIs)) Final_FusionSeq_dict[key_seq]=site_list for key_seq in single_readsID_dict.keys(): if key_seq not in Final_FusionSeq_dict.keys(): site_dict=single_site_dict[key_seq] doubleID=0 singleID=len(single_readsID_dict[key_seq]) ID_list=single_readsID_dict[key_seq] UMIs=get_Uniq_UMIs(ID_list) site_list.append(str(doubleID)) site_list.append(str(singleID)) site_list.append(str(UMIs)) Final_FusionSeq_dict[key_seq]=site_dict fusion_points_file=indir+"/fusion_breakpoints_stat.txt" fusion_points=open(fusion_points_file,'w') fusion_points.write("\t".join(['Points','Point1_End','Point2_End','DoubleReads','SingleReads','UMIkinds'])+"\n") for key_seq in Final_FusionSeq_dict.keys(): fusion_points.write("\t".join(Final_FusionSeq_dict[key_seq])+"\n") fusion_points.close() #输出每个ID对应fusion fusion_points_readsID_file=indir+"/fusion_breakpoints_readsID.txt" fusion_points_readsID=open(fusion_points_readsID_file,'w') fusion_points_readsID.write("\t".join(['Points','FusionSeq','ReadsID'])+"\n") for key_seq in FusionSeq_readsID_dict.keys(): ID_list=FusionSeq_readsID_dict[key_seq] site=FusionSeq_site_dict[key_seq][0] for ID in ID_list: fusion_points_readsID.write("\t".join([site,key_seq,ID,"Double"])+"\n") for key_seq in single_readsID_dict.keys(): ID_list=single_readsID_dict[key_seq] site=single_site_dict[key_seq][0] for ID in ID_list: fusion_points_readsID.write("\t".join([site,key_seq,ID,"Single"])+"\n") fusion_points_readsID.close() def out_test(item_dict): for item in item_dict.keys(): print("TEST") print(item) print(item_dict[item]) def main(indir): t1=time.time() FusionSeq_readsID_dict={} FusionSeq_site_dict={} single_readsID_dict={} single_site_dict={} #double fusion double_points_file=indir+"/double_breakpoints.txt" stat_double_fusion(double_points_file,FusionSeq_readsID_dict,FusionSeq_site_dict) #single fusion single_points_file=indir+"/single_breakpoints.txt" stat_single_fusion(single_points_file,FusionSeq_readsID_dict,FusionSeq_site_dict,single_readsID_dict,single_site_dict,indir) #out results out_fusion_stat_results(FusionSeq_readsID_dict,FusionSeq_site_dict,single_readsID_dict,single_site_dict,indir) t2=time.time() print("Times: "+str(int(t2-t1))+"s") print("BreakPoints Stat Done!") if __name__ == '__main__': parser = argparse.ArgumentParser(description='get reads mapping location') parser.add_argument('-i', required=True, type=str, help="indir") args = parser.parse_args() main(args.i)