#!/usr/bin/python # -*- coding:utf-8 -*- import argparse,time,os BASE_DICT={'A':'T','G':'C','T':'A','C':'G','N':'N'} def read_sam(insam): ID_dict={} with open(insam,'r') as inputSam: for line in inputSam: if not line.startswith("@"): info=line.strip("\n").split("\t") #ID_info = info[0].split("_") #hrad clip reads MAP>0 if int(info[1])>256 and int(info[4])==0: continue ID = info[0] if ID not in ID_dict.keys(): ID_dict[ID]=[info] else: ID_dict[ID].append(info) return ID_dict def check_clip(reads_info): if len(reads_info)>=3: return "hard_clip" elif len(reads_info)==2: for item in reads_info: if "S" in item[5]: return "soft_clip" return "no_clip" def get_map_info(map_str): #99M51S num_list=[] map_dict={} count=0 for item in map_str: if item.isalpha(): num=len(num_list) index=0 for i in range(num): index += int(num_list[i])*10**(num-i-1) map_dict[count]=item+":"+str(index) count +=1 num_list=[] elif item.isdigit(): num_list.append(item) return map_dict def check_match(match1,match2): if match1<30 or match2<30: return False return True def adj_info(seq,info1): flag = True for key in info1.keys(): item = info1[key] if 'D' in item or 'I' in item: flag = False if 'H' in info1[0]: flag = False #不需要调整 if flag: return seq,info1 else: new_seq_list=[] new_info1={} start=0 old_item='' for key in info1.keys(): item = info1[key] temp_len=int(item.split(":")[1]) end = start+temp_len if item.startswith('M') or item.startswith('S'): for index in range(start,end): new_seq_list.append(seq[index]) old_item=item elif item.startswith('D') and old_item.startswith('M'): end = end-temp_len for index in range(temp_len): new_seq_list.append('-') elif item.startswith('H'): end = end-temp_len start = end # num=len(info1) index =0 new_index= 0 while index+3 <= num: item1=info1[index] item2=info1[index+1] item3=info1[index+2] if item1.startswith("M") and item2.startswith("D") and item3.startswith("M"): pos1=int(item1.split(":")[1]) pos2=int(item2.split(":")[1]) pos3=int(item3.split(":")[1]) new_info1[new_index] = 'M:'+str(pos1+pos2+pos3) new_index +=1 index +=3 elif item1.startswith("M") and item2.startswith("I") and item3.startswith("M"): pos1=int(item1.split(":")[1]) pos3=int(item3.split(":")[1]) new_info1[new_index] = 'M:'+str(pos1+pos3) new_index +=1 index +=3 elif item1.startswith("H") and item2.startswith("M"): new_info1[new_index] = item2 new_index +=1 index +=2 else: new_info1[new_index] = item1 new_index += 1 index += 1 while index < num: new_info1[new_index]=info1[index] new_index += 1 index += 1 return "".join(new_seq_list),new_info1 def get_points(info1,info2,OVERLAP=10): #OVERLAP默认值是10,默认不用这个参数值 if len(info1)>=5 and len(info2)>=5: map_info1=get_map_info(info1[5]) map_info2=get_map_info(info2[5]) seq1=info1[9] seq2=info2[9] #print(map_info1) #print(seq1) info1_temp=adj_info(seq1,map_info1) seq=info1_temp[0] map_info1=info1_temp[1] #print(map_info1) #print(seq) #print(map_info2) info2_temp=adj_info(seq2,map_info2) seq2=info2_temp[0] map_info2=info2_temp[1] #print(map_info2) chr1=info1[2] pos1=int(info1[3]) chr2=info2[2] pos2=int(info2[3]) flag = int(info1[1]) overlap=0 breakpoint1=0 breakpoint1_1=0 breakpoint2=0 breakpoint1_e=0 breakpoint2_e=0 fusion_seq="" if len(map_info1)==2 and len(map_info2)==2: #soft-clip end,hard-clip front #{0: 'M:131', 1: 'S:19'} #{0: 'H:118', 1: 'M:32'} if 'S' in map_info1[1] and 'H' in map_info2[0]: match1=int(map_info1[0].split(":")[1]) skip1=int(map_info1[1].split(":")[1]) match2=int(map_info2[1].split(":")[1]) if not check_match(match1,match2): return False,0 overlap=match2-skip1 breakpoint1=pos1+match1-1-overlap breakpoint2=pos2 breakpoint1_1=pos1+match1-1 if skip1 >30: len2=30 else: len2=skip1 fusion_seq=seq[match1-30:match1+len2] breakpoint1_e=breakpoint1_1-30 breakpoint2_e=breakpoint2+30 #soft-clip end,hard-clip end #{0: 'M:99', 1: 'S:51'} #{0: 'M:53', 1: 'H:97'} elif 'S' in map_info1[1] and 'H' in map_info2[1]: match1=int(map_info1[0].split(":")[1]) skip1=int(map_info1[1].split(":")[1]) match2=int(map_info2[0].split(":")[1]) hide2=int(map_info2[1].split(":")[1]) if not check_match(match1,match2): return False,0 overlap=abs(abs(match2-skip1)-abs(match1+skip1-match2-hide2)) if match2 > skip1: breakpoint1=pos1+match1-1-overlap else: breakpoint1=pos1+match1+overlap breakpoint2=pos2+match2-1 breakpoint1_1=pos1+match1-1 if skip1 >30: len2=30 else: len2=skip1 #print(match1) fusion_seq=seq[match1-30:match1+len2] #print(fusion_seq) breakpoint1_e=breakpoint1_1-30 breakpoint2_e=breakpoint2-30 #soft-clip front,hard-clip end #{0: 'S:51', 1: 'M:99'} #{0: 'M:53', 1: 'H:97'} elif 'S' in map_info1[0] and 'H' in map_info2[1]: match1=int(map_info1[1].split(":")[1]) skip1=int(map_info1[0].split(":")[1]) match2=int(map_info2[0].split(":")[1]) hide2=int(map_info2[1].split(":")[1]) if not check_match(match1,match2): return False,0 overlap=abs(abs(match2-skip1)-abs(match1+skip1-match2-hide2)) breakpoint1=pos1+overlap breakpoint2=pos2+match2-1 breakpoint1_1=pos1 fusion_seq=seq[skip1-30:skip1+30] if skip1 > 30 else seq[:skip1+30] breakpoint1_e=breakpoint1_1+30 breakpoint2_e=breakpoint2-30 #soft-clip front,hard-clip front #{0: 'S:51', 1: 'M:99'} #{0: 'H:97', 1: 'M:53'} elif 'S' in map_info1[0] and 'H' in map_info2[0]: match1=int(map_info1[1].split(":")[1]) skip1=int(map_info1[0].split(":")[1]) match2=int(map_info2[1].split(":")[1]) if not check_match(match1,match2): return False,0 overlap=abs(match2-skip1) breakpoint1=pos1+overlap breakpoint2=pos2 breakpoint1_1=pos1 fusion_seq=seq[skip1-30:skip1+30] if skip1 > 30 else seq[:skip1+30] breakpoint1_e=breakpoint1_1+30 breakpoint2_e=breakpoint2+30 elif len(map_info1)==3 and len(map_info2)==2: flag_bin=bin(flag) if flag_bin[-5] == '1': #soft-clip front if 'H' in map_info2[0]: #hard-clip front #{0: 'S:51', 1: 'M:99', 2:'S:5'} #{0: 'H:97', 1: 'M:53'} match1=int(map_info1[1].split(":")[1]) skip1=int(map_info1[0].split(":")[1]) match2=int(map_info2[1].split(":")[1]) if not check_match(match1,match2): return False,0 overlap=abs(match2-skip1) breakpoint1=pos1+overlap breakpoint2=pos2 breakpoint1_1=pos1 fusion_seq=seq[skip1-30:skip1+30] if skip1 > 30 else seq[:skip1+30] breakpoint1_e=breakpoint1_1+30 breakpoint2_e=breakpoint2+30 elif 'H' in map_info2[1]: #hard-clip end #{0: 'S:51', 1: 'M:99', 2:'S:5'} #{0: 'M:53', 1: 'H:97'} match1=int(map_info1[1].split(":")[1]) skip1=int(map_info1[0].split(":")[1]) match2=int(map_info2[0].split(":")[1]) if not check_match(match1,match2): return False,0 overlap=abs(match2-skip1) breakpoint1=pos1+overlap breakpoint2=pos2+match2-1 breakpoint1_1=pos1 fusion_seq=seq[skip1-30:skip1+30] if skip1 > 30 else seq[:skip1+30] breakpoint1_e=breakpoint1_1+30 breakpoint2_e=breakpoint2-30 elif flag_bin[-5] == '0': #soft-clip end if 'H' in map_info2[0]: #hard-clip front #{0: 'S:5', 1: 'M:99', 2:'S:51'} #{0: 'H:97', 1: 'M:53'} former_skip=int(map_info1[0].split(":")[1]) match1=int(map_info1[1].split(":")[1]) skip1=int(map_info1[2].split(":")[1]) match2=int(map_info2[1].split(":")[1]) if not check_match(match1,match2): return False,0 overlap=abs(match2-skip1) breakpoint1=pos1+match1-1-overlap breakpoint2=pos2 breakpoint1_1=pos1+match1-1 if skip1 >30: len2=30 else: len2=skip1 fusion_seq=seq[former_skip+match1-30:former_skip+match1+len2] breakpoint1_e=breakpoint1_1-30 breakpoint2_e=breakpoint2+30 elif 'H' in map_info2[1]: #hard-clip end #{0: 'S:5', 1: 'M:99', 2:'S:51'} #{0: 'M:53', 1: 'H:97'} former_skip=int(map_info1[0].split(":")[1]) match1=int(map_info1[1].split(":")[1]) skip1=int(map_info1[2].split(":")[1]) match2=int(map_info2[0].split(":")[1]) if not check_match(match1,match2): return False,0 overlap=abs(match2-skip1) breakpoint1=pos1+match1-1-overlap breakpoint2=pos2+match2-1 breakpoint1_1=pos1+match1-1 if skip1 >30: len2=30 else: len2=skip1 fusion_seq=seq[former_skip+match1-30:+former_skip+match1+len2] breakpoint1_e=breakpoint1_1-30 breakpoint2_e=breakpoint2-30 if abs(overlap) <= OVERLAP and breakpoint1 !=0: return True,chr1+":"+str(breakpoint1),chr2+":"+str(breakpoint2),chr1+":"+str(breakpoint1_1),fusion_seq,str(overlap),chr1+":"+str(breakpoint1_e),chr2+":"+str(breakpoint2_e) return False,0 return False,0 def hard_clip_fusion(reads_info): #R1:hard-clip R1_soft_info=reads_info[0] R1_hard_info=reads_info[1] if int(R1_hard_info[1])>256 and int(R1_soft_info[4])>=60: results=get_points(R1_soft_info,R1_hard_info) if results[0]: out_list=[results[1]+"-"+results[2],results[3],results[2],results[4],results[5],results[6],results[7]] return out_list #R2:hard-clip R2_soft_info=reads_info[-2] R2_hard_info=reads_info[-1] if int(R2_hard_info[1])>256 and int(R2_soft_info[4])>=60: results=get_points(R2_soft_info,R2_hard_info) if results[0]: out_list=[results[1]+"-"+results[2],results[3],results[2],results[4],results[5],results[6],results[7]] return out_list return [] def primer_loc(primer,seq): primer_num=len(primer) seq_primer=seq[:primer_num] seq_rev_comp="".join([BASE_DICT[base] for base in seq][::-1]) seq_rev_comp_primer=seq_rev_comp[:primer_num] if primer == seq_primer: return "Front" elif primer == seq_rev_comp_primer: return "End" return "NO" def get_pos_end(map_info,pos): chr=pos.split(":")[0] pos_temp=int(pos.split(":")[1]) new_pos=0 correct_pos=pos_temp if map_info[0][0]=='M': match_len=int(map_info[0].split(":")[1]) pos_temp=pos_temp+match_len-1 correct_pos=pos_temp new_pos=pos_temp-30 elif map_info[0][0]=='S': new_pos=pos_temp+30 else: return pos return chr+":"+str(correct_pos),chr+":"+str(new_pos) def IS_overlap(seq1,seq2,msimatch=2): num = len(seq1) count =0 for i in range(num): if seq1[i] !=seq2[i]: count +=1 if count >=msimatch: return False return True def soft_clip_fusion(reads_info): R1_read_info=reads_info[0] R2_read_info=reads_info[1] ID_info = R1_read_info[0].split("_") ID = R1_read_info[0] R1_primer_loc="" R1_seq=R1_read_info[9] R1_mismatch_seq="" #比对质量值至少一个MAQ>=60 if int(R1_read_info[4])<60 and int(R2_read_info[4])<60: return False,"","" R1_loc="" R1_map_info={} R1_map_info_adj={} #R1 primer存在 if len(ID_info)>=4: R1_primer=ID_info[3] R1_map_info=get_map_info(R1_read_info[5]) R1_loc=primer_loc(R1_primer,R1_read_info[9]) if len(R1_map_info)==3 and R1_map_info[0][0] == "S" and R1_map_info[2][0]=="S": end_mismatch_len=int(R1_map_info[2].split(":")[1]) start_mismatch_len=int(R1_map_info[0].split(":")[1]) R1_primer_len=len(R1_primer) if R1_loc == "End": if end_mismatch_len > R1_primer_len: R1_map_info_adj[0]=R1_map_info[1] R1_map_info_adj[1]=R1_map_info[2] R1_primer_loc="Same" mismatch_len=end_mismatch_len R1_seq="".join([BASE_DICT[base] for base in R1_seq][::-1]) R1_mismatch_seq=R1_seq[:mismatch_len] else: R1_map_info_adj[0]=R1_map_info[0] R1_map_info_adj[1]=R1_map_info[1] R1_primer_loc="Not the Same" mismatch_len=start_mismatch_len R1_seq="".join([BASE_DICT[base] for base in R1_seq][::-1]) R1_mismatch_seq=R1_seq[-1*mismatch_len:] elif R1_loc == "Front": if start_mismatch_len > R1_primer_len: R1_map_info_adj[0]=R1_map_info[0] R1_map_info_adj[1]=R1_map_info[1] R1_primer_loc="Same" mismatch_len=start_mismatch_len R1_mismatch_seq=R1_seq[:mismatch_len] else: R1_map_info_adj[0]=R1_map_info[1] R1_map_info_adj[1]=R1_map_info[2] R1_primer_loc="Not the Same" mismatch_len=end_mismatch_len R1_mismatch_seq=R1_seq[-1*mismatch_len:] elif len(R1_map_info)==2: R1_map_info_adj=R1_map_info if R1_loc == "End" and R1_map_info[0][0]=="S": R1_primer_loc="Not the Same" R1_seq="".join([BASE_DICT[base] for base in R1_seq][::-1]) mismatch_len=int(R1_map_info[0].split(":")[1]) R1_mismatch_seq=R1_seq[-1*mismatch_len:] elif R1_loc == "End" and R1_map_info[1][0]=="S": R1_primer_loc="Same" R1_seq="".join([BASE_DICT[base] for base in R1_seq][::-1]) mismatch_len=int(R1_map_info[1].split(":")[1]) #if len(R1_primer)+5 >=mismatch_len: if len(R1_primer) >=mismatch_len: return False,"","" R1_mismatch_seq=R1_seq[:mismatch_len] elif R1_loc == "Front" and R1_map_info[0][0]=="S": R1_primer_loc="Same" mismatch_len=int(R1_map_info[0].split(":")[1]) #if len(R1_primer)+5 >=mismatch_len: if len(R1_primer) >=mismatch_len: return False,"","" R1_mismatch_seq=R1_seq[:mismatch_len] elif R1_loc == "Front" and R1_map_info[1][0] =="S": R1_primer_loc="Not the Same" mismatch_len=int(R1_map_info[1].split(":")[1]) R1_mismatch_seq=R1_seq[-1*mismatch_len:] #no soft clip else: R1_primer_loc="None" R2_primer_loc="" R2_seq=R2_read_info[9] R2_mismatch_seq="" R2_loc="" R2_map_info={} R2_map_info_adj={} #R1,R2都存在primer if len(ID_info)==5: R2_primer=ID_info[4] R2_map_info=get_map_info(R2_read_info[5]) R2_loc=primer_loc(R2_primer,R2_read_info[9]) if len(R2_map_info)==3 and R2_map_info[0][0] == "S" and R2_map_info[2][0]=="S": end_mismatch_len=int(R2_map_info[2].split(":")[1]) start_mismatch_len=int(R2_map_info[0].split(":")[1]) R2_primer_len=len(R2_primer) if R2_loc == "End": if end_mismatch_len > R2_primer_len: R2_map_info_adj[0]=R2_map_info[1] R2_map_info_adj[1]=R2_map_info[2] R2_primer_loc="Same" mismatch_len=end_mismatch_len R2_mismatch_seq=R2_seq[-1*mismatch_len:] else: R2_map_info_adj[0]=R2_map_info[0] R2_map_info_adj[1]=R2_map_info[1] R2_primer_loc="Not the Same" mismatch_len=start_mismatch_len R2_mismatch_seq=R2_seq[:mismatch_len:] if R2_loc == "Front": if start_mismatch_len > R2_primer_len: R2_map_info_adj[0]=R2_map_info[0] R2_map_info_adj[1]=R2_map_info[1] R2_primer_loc="Same" mismatch_len=start_mismatch_len R2_seq="".join([BASE_DICT[base] for base in R2_seq][::-1]) R2_mismatch_seq=R2_seq[-1*mismatch_len:] else: R2_map_info_adj[0]=R2_map_info[1] R2_map_info_adj[1]=R2_map_info[2] R2_primer_loc="Not the Same" mismatch_len=end_mismatch_len R2_seq="".join([BASE_DICT[base] for base in R2_seq][::-1]) R2_mismatch_seq=R2_seq[:mismatch_len] elif len(R2_map_info)==2: R2_map_info_adj=R2_map_info if R2_loc == "End" and R2_map_info[0][0]=="S": R2_primer_loc="Not the Same" mismatch_len=int(R2_map_info[0].split(":")[1]) R2_mismatch_seq=R2_seq[:mismatch_len] elif R2_loc == "End" and R2_map_info[1][0]=="S": R2_primer_loc="Same" mismatch_len=int(R2_map_info[1].split(":")[1]) #if len(R2_primer)+5 >=mismatch_len: if len(R2_primer) >=mismatch_len: return False,"","" R2_mismatch_seq=R2_seq[-1*mismatch_len:] elif R2_loc == "Front" and R2_map_info[0][0]=="S": R2_primer_loc="Same" R2_seq="".join([BASE_DICT[base] for base in R2_seq][::-1]) mismatch_len=int(R2_map_info[0].split(":")[1]) #if len(R2_primer)+5 >= mismatch_len: if len(R2_primer) >= mismatch_len: return False,"","" R2_mismatch_seq=R2_seq[-1*mismatch_len:] elif R2_loc == "Front" and R2_map_info[1][0] =="S": R2_primer_loc="Not the Same" R2_seq="".join([BASE_DICT[base] for base in R2_seq][::-1]) mismatch_len=int(R2_map_info[1].split(":")[1]) R2_mismatch_seq=R2_seq[:mismatch_len] else: R2_primer_loc="None" if R1_primer_loc=="Not the Same" and R2_primer_loc == "Not the Same": R1_mismatch_len=len(R1_mismatch_seq) R2_mismatch_len=len(R2_mismatch_seq) if R1_mismatch_len>=5 and R2_mismatch_len >=5: for index in range(5): R2_match_seq = R2_seq[index+R2_mismatch_len:index+R2_mismatch_len+R1_mismatch_len] if R2_match_seq == R1_mismatch_seq: R2_match_seq_tmp = R2_seq[:index+R2_mismatch_len+R1_mismatch_len] R1_match_seq_tmp = R1_seq[-1*len(R2_match_seq_tmp):] if R2_match_seq_tmp == R1_match_seq_tmp: FusionSeq=R1_seq[-1*R1_mismatch_len-30:-1*R1_mismatch_len]+R2_seq[R2_mismatch_len+index:R2_mismatch_len+index+30] R1_point=R1_read_info[2]+":"+R1_read_info[3] if R2_loc == "End": R2_point=R2_read_info[2]+":"+str(int(R2_read_info[3])+index) elif R2_loc == "Front": R2_point=R2_read_info[2]+":"+str(int(R2_read_info[3])-index) else: return True,"Double",outstr R1_point_list=get_pos_end(R1_map_info_adj,R1_point) R2_point_list=get_pos_end(R2_map_info_adj,R2_point) outstr=R1_point+"-"+R2_point+"\t"+R1_read_info[2]+":"+R1_read_info[3]+"\t"+R2_read_info[2]+":"+R2_read_info[3]+"\t"+FusionSeq+"\t"+str(index)+"\t"+R1_point_list[1]+"\t"+R2_point_list[1]+"\t"+ID+"\n" return True,"Double",outstr elif R1_primer_loc=="Not the Same" and R2_primer_loc == "Same": R1_mismatch_len=len(R1_mismatch_seq) R2_mismatch_len=len(R2_mismatch_seq) common_len=min(R1_mismatch_len,R2_mismatch_len) R1_mismatch_tmp=R1_mismatch_seq[-1*R1_mismatch_len:-1*common_len] R2_mismatch_tmp=R2_mismatch_seq[-1*R2_mismatch_len:-1*common_len] R2_match_seq=R2_seq[:-1*R2_mismatch_len] if R1_mismatch_tmp==R2_mismatch_tmp: R2_point=R2_read_info[2]+":"+R2_read_info[3] R2_point_list=get_pos_end(R2_map_info_adj,R2_point) otustr=R2_point_list[0]+"\tDown\t"+R2_mismatch_seq+"\t"+R2_match_seq+"\t"+R2_point_list[1]+"\t"+ID+"\n" return True,"Single",otustr elif R1_primer_loc=="Same" and R2_primer_loc == "Same": return False,"","" elif R1_primer_loc=="Same" and R2_primer_loc == "Not the Same": R1_mismatch_len=len(R1_mismatch_seq) R2_mismatch_len=len(R2_mismatch_seq) common_len=min(R1_mismatch_len,R2_mismatch_len) R1_mismatch_tmp=R1_mismatch_seq[R1_mismatch_len-1*common_len:R1_mismatch_len] R2_mismatch_tmp=R2_mismatch_seq[R2_mismatch_len-1*common_len:R2_mismatch_len] R1_match_seq=R1_seq[R1_mismatch_len:] if IS_overlap(R1_mismatch_tmp,R2_mismatch_tmp): R1_point=R1_read_info[2]+":"+R1_read_info[3] R1_point_list=get_pos_end(R1_map_info_adj,R1_point) otustr=R1_point_list[0]+"\tUp\t"+R1_mismatch_seq+"\t"+R1_match_seq+"\t"+R1_point_list[1]+"\t"+ID+"\n" return True,"Single",otustr else: return False,"","" ''' elif R1_primer_loc=="Same" and R2_primer_loc == "None": R1_point=R1_read_info[2]+":"+R1_read_info[3] R1_point_list=get_pos_end(R1_map_info_adj,R1_point) R1_mismatch_len=len(R1_mismatch_seq) R1_match_seq=R1_seq[R1_mismatch_len:] otustr=R1_point_list[0]+"\tUp\t"+R1_mismatch_seq+"\t"+R1_match_seq+"\t"+R1_point_list[1]+"\t"+ID+"\n" return True,"Single",otustr elif R1_primer_loc=="Not the Same" and R2_primer_loc == "None": R1_point=R1_read_info[2]+":"+R1_read_info[3] R1_point_list=get_pos_end(R1_map_info_adj,R1_point) R1_mismatch_len=len(R1_mismatch_seq) R1_match_seq=R1_seq[:-1*R1_mismatch_len] otustr=R1_point_list[0]+"\tDown\t"+R1_mismatch_seq+"\t"+R1_match_seq+"\t"+R1_point_list[1]+"\t"+ID+"\n" return True,"Single",otustr elif R1_primer_loc=="None" and R2_primer_loc == "Same": R2_point=R2_read_info[2]+":"+R2_read_info[3] R2_point_list=get_pos_end(R2_map_info_adj,R2_point) R2_mismatch_len=len(R2_mismatch_seq) R2_match_seq=R2_seq[:-1*R2_mismatch_len] otustr=R2_point_list[0]+"\tDown\t"+R2_mismatch_seq+"\t"+R2_match_seq+"\t"+R2_point_list[1]+"\t"+ID+"\n" return True,"Single",otustr elif R1_primer_loc=="None" and R2_primer_loc == "Not the Same": R2_point=R2_read_info[2]+":"+R2_read_info[3] R2_point_list=get_pos_end(R2_map_info_adj,R2_point) R2_mismatch_len=len(R2_mismatch_seq) R2_match_seq=R2_seq[R2_mismatch_len:] otustr=R2_point_list[0]+"\tUp\t"+R2_mismatch_seq+"\t"+R2_match_seq+"\t"+R2_point_list[1]+"\t"+ID+"\n" return True,"Single",otustr ''' return False,"","" def main(insam,outdir): t1=time.time() #将sam内容按照ID区分,放到ID_info中 ID_dict=read_sam(insam) #存放soft_clip的reads ID soft_clip_ID=[] #输出到本地hard clip fusion hard_clip_file=outdir+"/hard_clip_fusion_breakpoints.txt" hard_clip_out=open(hard_clip_file,'w') header=["Site","Site1","Site2","Seq","Overlap","Site1_Down","Site2_Down","ReadsID"] hard_clip_out.write("\t".join(header)+"\n") for ID in ID_dict.keys(): reads_info = ID_dict[ID] clip_stat=check_clip(reads_info) if clip_stat == "soft_clip": soft_clip_ID.append(ID) elif clip_stat == "hard_clip": out_list=hard_clip_fusion(reads_info) if len(out_list) !=0: out_list.append(ID) hard_clip_out.write("\t".join(out_list)+"\n") hard_clip_out.close() #输出到本地single single_points=outdir+"/soft_clip_single_breakpoints.txt" SingleInput=open(single_points,'w') header=["Site","Up_Down","MismatchSeq","MatchSeq","Site_Down","ReadsID"] SingleInput.write("\t".join(header)+"\n") #输出到本地double double_points=outdir+"/soft_clip_double_breakpoints.txt" DoubleInput=open(double_points,'w') #DoubleInput.write("\t".join(header)+"\n") # for soft_ID in soft_clip_ID: reads_info = ID_dict[soft_ID] soft_clip_results=soft_clip_fusion(reads_info) if soft_clip_results[0]: if soft_clip_results[1]=="Double": DoubleInput.write(soft_clip_results[2]) elif soft_clip_results[1]=="Single": SingleInput.write(soft_clip_results[2]) SingleInput.close() DoubleInput.close() double_points_file=outdir+"/double_breakpoints.txt" os.system("cat "+hard_clip_file+" > "+double_points_file+" && cat "+double_points+" >> "+double_points_file+" && rm "+hard_clip_file+" && rm "+double_points) single_points_file=outdir+"/single_breakpoints.txt" os.system("cp "+single_points+" "+single_points_file+" && rm "+single_points) t2=time.time() print("Times: "+str(int(t2-t1))+"s") print("BreakPoints Search Done!") if __name__ == '__main__': parser = argparse.ArgumentParser(description='get reads mapping location') parser.add_argument('-i', required=True, type=str, help="insam") parser.add_argument('-o', required=True, type=str, help="outdir") args = parser.parse_args() main(args.i, args.o)