123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771 |
- #!/usr/bin/python
- # -*- coding:utf-8 -*-
- import argparse,time,os
- BASE_DICT={'A':'T','G':'C','T':'A','C':'G','N':'N'}
- def read_sam(insam):
- ID_dict={}
- with open(insam,'r') as inputSam:
- for line in inputSam:
- if not line.startswith("@"):
- info=line.strip("\n").split("\t")
- #ID_info = info[0].split("_")
- #hrad clip reads MAP>0
- if int(info[1])>256 and int(info[4])==0:
- continue
- ID = info[0]
- if ID not in ID_dict.keys():
- ID_dict[ID]=[info]
- else:
- ID_dict[ID].append(info)
- return ID_dict
- def check_clip(reads_info):
- if len(reads_info)>=3:
- return "hard_clip"
- elif len(reads_info)==2:
- for item in reads_info:
- if "S" in item[5]:
- return "soft_clip"
- return "no_clip"
- def get_map_info(map_str):
- #99M51S
- num_list=[]
- map_dict={}
- count=0
- for item in map_str:
- if item.isalpha():
- num=len(num_list)
- index=0
- for i in range(num):
- index += int(num_list[i])*10**(num-i-1)
- map_dict[count]=item+":"+str(index)
- count +=1
- num_list=[]
- elif item.isdigit():
- num_list.append(item)
- return map_dict
- def check_match(match1,match2):
- if match1<30 or match2<30:
- return False
- return True
- def adj_info(seq,info1):
- flag = True
- for key in info1.keys():
- item = info1[key]
- if 'D' in item or 'I' in item:
- flag = False
- if 'H' in info1[0]:
- flag = False
- #不需要调整
- if flag:
- return seq,info1
- else:
- new_seq_list=[]
- new_info1={}
- start=0
- old_item=''
- for key in info1.keys():
- item = info1[key]
- temp_len=int(item.split(":")[1])
- end = start+temp_len
- if item.startswith('M') or item.startswith('S'):
- for index in range(start,end):
- new_seq_list.append(seq[index])
- old_item=item
- elif item.startswith('D') and old_item.startswith('M'):
- end = end-temp_len
- for index in range(temp_len):
- new_seq_list.append('-')
- elif item.startswith('H'):
- end = end-temp_len
- start = end
- #
- num=len(info1)
- index =0
- new_index= 0
- while index+3 <= num:
- item1=info1[index]
- item2=info1[index+1]
- item3=info1[index+2]
- if item1.startswith("M") and item2.startswith("D") and item3.startswith("M"):
- pos1=int(item1.split(":")[1])
- pos2=int(item2.split(":")[1])
- pos3=int(item3.split(":")[1])
- new_info1[new_index] = 'M:'+str(pos1+pos2+pos3)
- new_index +=1
- index +=3
- elif item1.startswith("M") and item2.startswith("I") and item3.startswith("M"):
- pos1=int(item1.split(":")[1])
- pos3=int(item3.split(":")[1])
- new_info1[new_index] = 'M:'+str(pos1+pos3)
- new_index +=1
- index +=3
- elif item1.startswith("H") and item2.startswith("M"):
- new_info1[new_index] = item2
- new_index +=1
- index +=2
- else:
- new_info1[new_index] = item1
- new_index += 1
- index += 1
-
- while index < num:
- new_info1[new_index]=info1[index]
- new_index += 1
- index += 1
- return "".join(new_seq_list),new_info1
- def get_points(info1,info2,OVERLAP=10):
-
- #OVERLAP默认值是10,默认不用这个参数值
- if len(info1)>=5 and len(info2)>=5:
- map_info1=get_map_info(info1[5])
- map_info2=get_map_info(info2[5])
- seq1=info1[9]
- seq2=info2[9]
- #print(map_info1)
- #print(seq1)
- info1_temp=adj_info(seq1,map_info1)
- seq=info1_temp[0]
- map_info1=info1_temp[1]
- #print(map_info1)
- #print(seq)
- #print(map_info2)
- info2_temp=adj_info(seq2,map_info2)
- seq2=info2_temp[0]
- map_info2=info2_temp[1]
- #print(map_info2)
- chr1=info1[2]
- pos1=int(info1[3])
- chr2=info2[2]
- pos2=int(info2[3])
- flag = int(info1[1])
-
- overlap=0
- breakpoint1=0
- breakpoint1_1=0
- breakpoint2=0
- breakpoint1_e=0
- breakpoint2_e=0
- fusion_seq=""
- if len(map_info1)==2 and len(map_info2)==2:
- #soft-clip end,hard-clip front
- #{0: 'M:131', 1: 'S:19'}
- #{0: 'H:118', 1: 'M:32'}
- if 'S' in map_info1[1] and 'H' in map_info2[0]:
- match1=int(map_info1[0].split(":")[1])
- skip1=int(map_info1[1].split(":")[1])
- match2=int(map_info2[1].split(":")[1])
- if not check_match(match1,match2):
- return False,0
-
- overlap=match2-skip1
- breakpoint1=pos1+match1-1-overlap
- breakpoint2=pos2
- breakpoint1_1=pos1+match1-1
- if skip1 >30:
- len2=30
- else:
- len2=skip1
- fusion_seq=seq[match1-30:match1+len2]
- breakpoint1_e=breakpoint1_1-30
- breakpoint2_e=breakpoint2+30
-
- #soft-clip end,hard-clip end
- #{0: 'M:99', 1: 'S:51'}
- #{0: 'M:53', 1: 'H:97'}
- elif 'S' in map_info1[1] and 'H' in map_info2[1]:
- match1=int(map_info1[0].split(":")[1])
- skip1=int(map_info1[1].split(":")[1])
- match2=int(map_info2[0].split(":")[1])
- hide2=int(map_info2[1].split(":")[1])
- if not check_match(match1,match2):
- return False,0
-
- overlap=abs(abs(match2-skip1)-abs(match1+skip1-match2-hide2))
- if match2 > skip1:
- breakpoint1=pos1+match1-1-overlap
- else:
- breakpoint1=pos1+match1+overlap
- breakpoint2=pos2+match2-1
- breakpoint1_1=pos1+match1-1
- if skip1 >30:
- len2=30
- else:
- len2=skip1
- #print(match1)
- fusion_seq=seq[match1-30:match1+len2]
- #print(fusion_seq)
- breakpoint1_e=breakpoint1_1-30
- breakpoint2_e=breakpoint2-30
- #soft-clip front,hard-clip end
- #{0: 'S:51', 1: 'M:99'}
- #{0: 'M:53', 1: 'H:97'}
- elif 'S' in map_info1[0] and 'H' in map_info2[1]:
- match1=int(map_info1[1].split(":")[1])
- skip1=int(map_info1[0].split(":")[1])
- match2=int(map_info2[0].split(":")[1])
- hide2=int(map_info2[1].split(":")[1])
- if not check_match(match1,match2):
- return False,0
-
- overlap=abs(abs(match2-skip1)-abs(match1+skip1-match2-hide2))
- breakpoint1=pos1+overlap
- breakpoint2=pos2+match2-1
- breakpoint1_1=pos1
- fusion_seq=seq[skip1-30:skip1+30] if skip1 > 30 else seq[:skip1+30]
-
- breakpoint1_e=breakpoint1_1+30
- breakpoint2_e=breakpoint2-30
- #soft-clip front,hard-clip front
- #{0: 'S:51', 1: 'M:99'}
- #{0: 'H:97', 1: 'M:53'}
- elif 'S' in map_info1[0] and 'H' in map_info2[0]:
- match1=int(map_info1[1].split(":")[1])
- skip1=int(map_info1[0].split(":")[1])
- match2=int(map_info2[1].split(":")[1])
- if not check_match(match1,match2):
- return False,0
- overlap=abs(match2-skip1)
- breakpoint1=pos1+overlap
- breakpoint2=pos2
- breakpoint1_1=pos1
- fusion_seq=seq[skip1-30:skip1+30] if skip1 > 30 else seq[:skip1+30]
- breakpoint1_e=breakpoint1_1+30
- breakpoint2_e=breakpoint2+30
- elif len(map_info1)==3 and len(map_info2)==2:
- flag_bin=bin(flag)
- if flag_bin[-5] == '1':
- #soft-clip front
- if 'H' in map_info2[0]:
- #hard-clip front
- #{0: 'S:51', 1: 'M:99', 2:'S:5'}
- #{0: 'H:97', 1: 'M:53'}
- match1=int(map_info1[1].split(":")[1])
- skip1=int(map_info1[0].split(":")[1])
- match2=int(map_info2[1].split(":")[1])
- if not check_match(match1,match2):
- return False,0
- overlap=abs(match2-skip1)
- breakpoint1=pos1+overlap
- breakpoint2=pos2
- breakpoint1_1=pos1
- fusion_seq=seq[skip1-30:skip1+30] if skip1 > 30 else seq[:skip1+30]
- breakpoint1_e=breakpoint1_1+30
- breakpoint2_e=breakpoint2+30
- elif 'H' in map_info2[1]:
- #hard-clip end
- #{0: 'S:51', 1: 'M:99', 2:'S:5'}
- #{0: 'M:53', 1: 'H:97'}
- match1=int(map_info1[1].split(":")[1])
- skip1=int(map_info1[0].split(":")[1])
- match2=int(map_info2[0].split(":")[1])
- if not check_match(match1,match2):
- return False,0
-
- overlap=abs(match2-skip1)
- breakpoint1=pos1+overlap
- breakpoint2=pos2+match2-1
- breakpoint1_1=pos1
- fusion_seq=seq[skip1-30:skip1+30] if skip1 > 30 else seq[:skip1+30]
- breakpoint1_e=breakpoint1_1+30
- breakpoint2_e=breakpoint2-30
- elif flag_bin[-5] == '0':
- #soft-clip end
- if 'H' in map_info2[0]:
- #hard-clip front
- #{0: 'S:5', 1: 'M:99', 2:'S:51'}
- #{0: 'H:97', 1: 'M:53'}
- former_skip=int(map_info1[0].split(":")[1])
- match1=int(map_info1[1].split(":")[1])
- skip1=int(map_info1[2].split(":")[1])
- match2=int(map_info2[1].split(":")[1])
- if not check_match(match1,match2):
- return False,0
- overlap=abs(match2-skip1)
- breakpoint1=pos1+match1-1-overlap
- breakpoint2=pos2
- breakpoint1_1=pos1+match1-1
- if skip1 >30:
- len2=30
- else:
- len2=skip1
- fusion_seq=seq[former_skip+match1-30:former_skip+match1+len2]
- breakpoint1_e=breakpoint1_1-30
- breakpoint2_e=breakpoint2+30
- elif 'H' in map_info2[1]:
- #hard-clip end
- #{0: 'S:5', 1: 'M:99', 2:'S:51'}
- #{0: 'M:53', 1: 'H:97'}
- former_skip=int(map_info1[0].split(":")[1])
- match1=int(map_info1[1].split(":")[1])
- skip1=int(map_info1[2].split(":")[1])
- match2=int(map_info2[0].split(":")[1])
- if not check_match(match1,match2):
- return False,0
- overlap=abs(match2-skip1)
- breakpoint1=pos1+match1-1-overlap
- breakpoint2=pos2+match2-1
- breakpoint1_1=pos1+match1-1
- if skip1 >30:
- len2=30
- else:
- len2=skip1
- fusion_seq=seq[former_skip+match1-30:+former_skip+match1+len2]
- breakpoint1_e=breakpoint1_1-30
- breakpoint2_e=breakpoint2-30
- if abs(overlap) <= OVERLAP and breakpoint1 !=0:
- return True,chr1+":"+str(breakpoint1),chr2+":"+str(breakpoint2),chr1+":"+str(breakpoint1_1),fusion_seq,str(overlap),chr1+":"+str(breakpoint1_e),chr2+":"+str(breakpoint2_e)
- return False,0
- return False,0
- def hard_clip_fusion(reads_info):
- #R1:hard-clip
- R1_soft_info=reads_info[0]
- R1_hard_info=reads_info[1]
- if int(R1_hard_info[1])>256 and int(R1_soft_info[4])>=60:
- results=get_points(R1_soft_info,R1_hard_info)
- if results[0]:
- out_list=[results[1]+"-"+results[2],results[3],results[2],results[4],results[5],results[6],results[7]]
- return out_list
- #R2:hard-clip
- R2_soft_info=reads_info[-2]
- R2_hard_info=reads_info[-1]
- if int(R2_hard_info[1])>256 and int(R2_soft_info[4])>=60:
- results=get_points(R2_soft_info,R2_hard_info)
- if results[0]:
- out_list=[results[1]+"-"+results[2],results[3],results[2],results[4],results[5],results[6],results[7]]
- return out_list
- return []
- def primer_loc(primer,seq):
- primer_num=len(primer)
- seq_primer=seq[:primer_num]
- seq_rev_comp="".join([BASE_DICT[base] for base in seq][::-1])
- seq_rev_comp_primer=seq_rev_comp[:primer_num]
- if primer == seq_primer:
- return "Front"
- elif primer == seq_rev_comp_primer:
- return "End"
- return "NO"
- def get_pos_end(map_info,pos):
-
- chr=pos.split(":")[0]
- pos_temp=int(pos.split(":")[1])
- new_pos=0
- correct_pos=pos_temp
- if map_info[0][0]=='M':
- match_len=int(map_info[0].split(":")[1])
- pos_temp=pos_temp+match_len-1
- correct_pos=pos_temp
- new_pos=pos_temp-30
- elif map_info[0][0]=='S':
- new_pos=pos_temp+30
- else:
- return pos
- return chr+":"+str(correct_pos),chr+":"+str(new_pos)
- def IS_overlap(seq1,seq2,msimatch=2):
- num = len(seq1)
- count =0
- for i in range(num):
- if seq1[i] !=seq2[i]:
- count +=1
- if count >=msimatch:
- return False
- return True
- def soft_clip_fusion(reads_info):
- R1_read_info=reads_info[0]
- R2_read_info=reads_info[1]
- ID_info = R1_read_info[0].split("_")
- ID = R1_read_info[0]
- R1_primer_loc=""
- R1_seq=R1_read_info[9]
- R1_mismatch_seq=""
- #比对质量值至少一个MAQ>=60
- if int(R1_read_info[4])<60 and int(R2_read_info[4])<60:
- return False,"",""
-
- R1_loc=""
- R1_map_info={}
- R1_map_info_adj={}
- #R1 primer存在
- if len(ID_info)>=4:
- R1_primer=ID_info[3]
- R1_map_info=get_map_info(R1_read_info[5])
- R1_loc=primer_loc(R1_primer,R1_read_info[9])
- if len(R1_map_info)==3 and R1_map_info[0][0] == "S" and R1_map_info[2][0]=="S":
- end_mismatch_len=int(R1_map_info[2].split(":")[1])
- start_mismatch_len=int(R1_map_info[0].split(":")[1])
- R1_primer_len=len(R1_primer)
- if R1_loc == "End":
- if end_mismatch_len > R1_primer_len:
- R1_map_info_adj[0]=R1_map_info[1]
- R1_map_info_adj[1]=R1_map_info[2]
- R1_primer_loc="Same"
- mismatch_len=end_mismatch_len
- R1_seq="".join([BASE_DICT[base] for base in R1_seq][::-1])
- R1_mismatch_seq=R1_seq[:mismatch_len]
-
- else:
- R1_map_info_adj[0]=R1_map_info[0]
- R1_map_info_adj[1]=R1_map_info[1]
- R1_primer_loc="Not the Same"
- mismatch_len=start_mismatch_len
- R1_seq="".join([BASE_DICT[base] for base in R1_seq][::-1])
- R1_mismatch_seq=R1_seq[-1*mismatch_len:]
- elif R1_loc == "Front":
- if start_mismatch_len > R1_primer_len:
- R1_map_info_adj[0]=R1_map_info[0]
- R1_map_info_adj[1]=R1_map_info[1]
- R1_primer_loc="Same"
- mismatch_len=start_mismatch_len
- R1_mismatch_seq=R1_seq[:mismatch_len]
- else:
- R1_map_info_adj[0]=R1_map_info[1]
- R1_map_info_adj[1]=R1_map_info[2]
- R1_primer_loc="Not the Same"
- mismatch_len=end_mismatch_len
- R1_mismatch_seq=R1_seq[-1*mismatch_len:]
- elif len(R1_map_info)==2:
- R1_map_info_adj=R1_map_info
- if R1_loc == "End" and R1_map_info[0][0]=="S":
- R1_primer_loc="Not the Same"
- R1_seq="".join([BASE_DICT[base] for base in R1_seq][::-1])
- mismatch_len=int(R1_map_info[0].split(":")[1])
- R1_mismatch_seq=R1_seq[-1*mismatch_len:]
- elif R1_loc == "End" and R1_map_info[1][0]=="S":
- R1_primer_loc="Same"
- R1_seq="".join([BASE_DICT[base] for base in R1_seq][::-1])
- mismatch_len=int(R1_map_info[1].split(":")[1])
- #if len(R1_primer)+5 >=mismatch_len:
- if len(R1_primer) >=mismatch_len:
- return False,"",""
- R1_mismatch_seq=R1_seq[:mismatch_len]
- elif R1_loc == "Front" and R1_map_info[0][0]=="S":
- R1_primer_loc="Same"
- mismatch_len=int(R1_map_info[0].split(":")[1])
- #if len(R1_primer)+5 >=mismatch_len:
- if len(R1_primer) >=mismatch_len:
- return False,"",""
- R1_mismatch_seq=R1_seq[:mismatch_len]
- elif R1_loc == "Front" and R1_map_info[1][0] =="S":
- R1_primer_loc="Not the Same"
- mismatch_len=int(R1_map_info[1].split(":")[1])
- R1_mismatch_seq=R1_seq[-1*mismatch_len:]
- #no soft clip
- else:
- R1_primer_loc="None"
-
- R2_primer_loc=""
- R2_seq=R2_read_info[9]
- R2_mismatch_seq=""
- R2_loc=""
- R2_map_info={}
- R2_map_info_adj={}
- #R1,R2都存在primer
- if len(ID_info)==5:
- R2_primer=ID_info[4]
- R2_map_info=get_map_info(R2_read_info[5])
- R2_loc=primer_loc(R2_primer,R2_read_info[9])
- if len(R2_map_info)==3 and R2_map_info[0][0] == "S" and R2_map_info[2][0]=="S":
- end_mismatch_len=int(R2_map_info[2].split(":")[1])
- start_mismatch_len=int(R2_map_info[0].split(":")[1])
- R2_primer_len=len(R2_primer)
- if R2_loc == "End":
- if end_mismatch_len > R2_primer_len:
- R2_map_info_adj[0]=R2_map_info[1]
- R2_map_info_adj[1]=R2_map_info[2]
- R2_primer_loc="Same"
- mismatch_len=end_mismatch_len
- R2_mismatch_seq=R2_seq[-1*mismatch_len:]
- else:
- R2_map_info_adj[0]=R2_map_info[0]
- R2_map_info_adj[1]=R2_map_info[1]
- R2_primer_loc="Not the Same"
- mismatch_len=start_mismatch_len
- R2_mismatch_seq=R2_seq[:mismatch_len:]
- if R2_loc == "Front":
- if start_mismatch_len > R2_primer_len:
- R2_map_info_adj[0]=R2_map_info[0]
- R2_map_info_adj[1]=R2_map_info[1]
- R2_primer_loc="Same"
- mismatch_len=start_mismatch_len
- R2_seq="".join([BASE_DICT[base] for base in R2_seq][::-1])
- R2_mismatch_seq=R2_seq[-1*mismatch_len:]
- else:
- R2_map_info_adj[0]=R2_map_info[1]
- R2_map_info_adj[1]=R2_map_info[2]
- R2_primer_loc="Not the Same"
- mismatch_len=end_mismatch_len
- R2_seq="".join([BASE_DICT[base] for base in R2_seq][::-1])
- R2_mismatch_seq=R2_seq[:mismatch_len]
- elif len(R2_map_info)==2:
- R2_map_info_adj=R2_map_info
- if R2_loc == "End" and R2_map_info[0][0]=="S":
- R2_primer_loc="Not the Same"
- mismatch_len=int(R2_map_info[0].split(":")[1])
- R2_mismatch_seq=R2_seq[:mismatch_len]
- elif R2_loc == "End" and R2_map_info[1][0]=="S":
- R2_primer_loc="Same"
- mismatch_len=int(R2_map_info[1].split(":")[1])
- #if len(R2_primer)+5 >=mismatch_len:
- if len(R2_primer) >=mismatch_len:
- return False,"",""
- R2_mismatch_seq=R2_seq[-1*mismatch_len:]
- elif R2_loc == "Front" and R2_map_info[0][0]=="S":
- R2_primer_loc="Same"
- R2_seq="".join([BASE_DICT[base] for base in R2_seq][::-1])
- mismatch_len=int(R2_map_info[0].split(":")[1])
- #if len(R2_primer)+5 >= mismatch_len:
- if len(R2_primer) >= mismatch_len:
- return False,"",""
- R2_mismatch_seq=R2_seq[-1*mismatch_len:]
- elif R2_loc == "Front" and R2_map_info[1][0] =="S":
- R2_primer_loc="Not the Same"
- R2_seq="".join([BASE_DICT[base] for base in R2_seq][::-1])
- mismatch_len=int(R2_map_info[1].split(":")[1])
- R2_mismatch_seq=R2_seq[:mismatch_len]
- else:
- R2_primer_loc="None"
- if R1_primer_loc=="Not the Same" and R2_primer_loc == "Not the Same":
- R1_mismatch_len=len(R1_mismatch_seq)
- R2_mismatch_len=len(R2_mismatch_seq)
- if R1_mismatch_len>=5 and R2_mismatch_len >=5:
- for index in range(5):
- R2_match_seq = R2_seq[index+R2_mismatch_len:index+R2_mismatch_len+R1_mismatch_len]
- if R2_match_seq == R1_mismatch_seq:
- R2_match_seq_tmp = R2_seq[:index+R2_mismatch_len+R1_mismatch_len]
- R1_match_seq_tmp = R1_seq[-1*len(R2_match_seq_tmp):]
- if R2_match_seq_tmp == R1_match_seq_tmp:
- FusionSeq=R1_seq[-1*R1_mismatch_len-30:-1*R1_mismatch_len]+R2_seq[R2_mismatch_len+index:R2_mismatch_len+index+30]
- R1_point=R1_read_info[2]+":"+R1_read_info[3]
- if R2_loc == "End":
- R2_point=R2_read_info[2]+":"+str(int(R2_read_info[3])+index)
- elif R2_loc == "Front":
- R2_point=R2_read_info[2]+":"+str(int(R2_read_info[3])-index)
- else:
- return True,"Double",outstr
- R1_point_list=get_pos_end(R1_map_info_adj,R1_point)
- R2_point_list=get_pos_end(R2_map_info_adj,R2_point)
- outstr=R1_point+"-"+R2_point+"\t"+R1_read_info[2]+":"+R1_read_info[3]+"\t"+R2_read_info[2]+":"+R2_read_info[3]+"\t"+FusionSeq+"\t"+str(index)+"\t"+R1_point_list[1]+"\t"+R2_point_list[1]+"\t"+ID+"\n"
- return True,"Double",outstr
-
- elif R1_primer_loc=="Not the Same" and R2_primer_loc == "Same":
- R1_mismatch_len=len(R1_mismatch_seq)
- R2_mismatch_len=len(R2_mismatch_seq)
- common_len=min(R1_mismatch_len,R2_mismatch_len)
- R1_mismatch_tmp=R1_mismatch_seq[-1*R1_mismatch_len:-1*common_len]
- R2_mismatch_tmp=R2_mismatch_seq[-1*R2_mismatch_len:-1*common_len]
- R2_match_seq=R2_seq[:-1*R2_mismatch_len]
- if R1_mismatch_tmp==R2_mismatch_tmp:
- R2_point=R2_read_info[2]+":"+R2_read_info[3]
- R2_point_list=get_pos_end(R2_map_info_adj,R2_point)
- otustr=R2_point_list[0]+"\tDown\t"+R2_mismatch_seq+"\t"+R2_match_seq+"\t"+R2_point_list[1]+"\t"+ID+"\n"
- return True,"Single",otustr
- elif R1_primer_loc=="Same" and R2_primer_loc == "Same":
- return False,"",""
-
- elif R1_primer_loc=="Same" and R2_primer_loc == "Not the Same":
- R1_mismatch_len=len(R1_mismatch_seq)
- R2_mismatch_len=len(R2_mismatch_seq)
- common_len=min(R1_mismatch_len,R2_mismatch_len)
- R1_mismatch_tmp=R1_mismatch_seq[R1_mismatch_len-1*common_len:R1_mismatch_len]
- R2_mismatch_tmp=R2_mismatch_seq[R2_mismatch_len-1*common_len:R2_mismatch_len]
- R1_match_seq=R1_seq[R1_mismatch_len:]
- if IS_overlap(R1_mismatch_tmp,R2_mismatch_tmp):
- R1_point=R1_read_info[2]+":"+R1_read_info[3]
- R1_point_list=get_pos_end(R1_map_info_adj,R1_point)
- otustr=R1_point_list[0]+"\tUp\t"+R1_mismatch_seq+"\t"+R1_match_seq+"\t"+R1_point_list[1]+"\t"+ID+"\n"
- return True,"Single",otustr
- else:
- return False,"",""
- '''
- elif R1_primer_loc=="Same" and R2_primer_loc == "None":
- R1_point=R1_read_info[2]+":"+R1_read_info[3]
- R1_point_list=get_pos_end(R1_map_info_adj,R1_point)
- R1_mismatch_len=len(R1_mismatch_seq)
- R1_match_seq=R1_seq[R1_mismatch_len:]
- otustr=R1_point_list[0]+"\tUp\t"+R1_mismatch_seq+"\t"+R1_match_seq+"\t"+R1_point_list[1]+"\t"+ID+"\n"
- return True,"Single",otustr
- elif R1_primer_loc=="Not the Same" and R2_primer_loc == "None":
- R1_point=R1_read_info[2]+":"+R1_read_info[3]
- R1_point_list=get_pos_end(R1_map_info_adj,R1_point)
- R1_mismatch_len=len(R1_mismatch_seq)
- R1_match_seq=R1_seq[:-1*R1_mismatch_len]
- otustr=R1_point_list[0]+"\tDown\t"+R1_mismatch_seq+"\t"+R1_match_seq+"\t"+R1_point_list[1]+"\t"+ID+"\n"
- return True,"Single",otustr
- elif R1_primer_loc=="None" and R2_primer_loc == "Same":
- R2_point=R2_read_info[2]+":"+R2_read_info[3]
- R2_point_list=get_pos_end(R2_map_info_adj,R2_point)
- R2_mismatch_len=len(R2_mismatch_seq)
- R2_match_seq=R2_seq[:-1*R2_mismatch_len]
- otustr=R2_point_list[0]+"\tDown\t"+R2_mismatch_seq+"\t"+R2_match_seq+"\t"+R2_point_list[1]+"\t"+ID+"\n"
- return True,"Single",otustr
- elif R1_primer_loc=="None" and R2_primer_loc == "Not the Same":
- R2_point=R2_read_info[2]+":"+R2_read_info[3]
- R2_point_list=get_pos_end(R2_map_info_adj,R2_point)
- R2_mismatch_len=len(R2_mismatch_seq)
- R2_match_seq=R2_seq[R2_mismatch_len:]
- otustr=R2_point_list[0]+"\tUp\t"+R2_mismatch_seq+"\t"+R2_match_seq+"\t"+R2_point_list[1]+"\t"+ID+"\n"
- return True,"Single",otustr
- '''
- return False,"",""
- def main(insam,outdir):
- t1=time.time()
- #将sam内容按照ID区分,放到ID_info中
- ID_dict=read_sam(insam)
- #存放soft_clip的reads ID
- soft_clip_ID=[]
- #输出到本地hard clip fusion
- hard_clip_file=outdir+"/hard_clip_fusion_breakpoints.txt"
- hard_clip_out=open(hard_clip_file,'w')
- header=["Site","Site1","Site2","Seq","Overlap","Site1_Down","Site2_Down","ReadsID"]
- hard_clip_out.write("\t".join(header)+"\n")
- for ID in ID_dict.keys():
- reads_info = ID_dict[ID]
- clip_stat=check_clip(reads_info)
- if clip_stat == "soft_clip":
- soft_clip_ID.append(ID)
- elif clip_stat == "hard_clip":
- out_list=hard_clip_fusion(reads_info)
- if len(out_list) !=0:
- out_list.append(ID)
- hard_clip_out.write("\t".join(out_list)+"\n")
- hard_clip_out.close()
- #输出到本地single
- single_points=outdir+"/soft_clip_single_breakpoints.txt"
- SingleInput=open(single_points,'w')
- header=["Site","Up_Down","MismatchSeq","MatchSeq","Site_Down","ReadsID"]
- SingleInput.write("\t".join(header)+"\n")
- #输出到本地double
- double_points=outdir+"/soft_clip_double_breakpoints.txt"
- DoubleInput=open(double_points,'w')
- #DoubleInput.write("\t".join(header)+"\n")
-
- #
- for soft_ID in soft_clip_ID:
- reads_info = ID_dict[soft_ID]
- soft_clip_results=soft_clip_fusion(reads_info)
- if soft_clip_results[0]:
- if soft_clip_results[1]=="Double":
- DoubleInput.write(soft_clip_results[2])
- elif soft_clip_results[1]=="Single":
- SingleInput.write(soft_clip_results[2])
- SingleInput.close()
- DoubleInput.close()
- double_points_file=outdir+"/double_breakpoints.txt"
- os.system("cat "+hard_clip_file+" > "+double_points_file+" && cat "+double_points+" >> "+double_points_file+" && rm "+hard_clip_file+" && rm "+double_points)
- single_points_file=outdir+"/single_breakpoints.txt"
- os.system("cp "+single_points+" "+single_points_file+" && rm "+single_points)
- t2=time.time()
- print("Times: "+str(int(t2-t1))+"s")
- print("BreakPoints Search Done!")
- if __name__ == '__main__':
- parser = argparse.ArgumentParser(description='get reads mapping location')
- parser.add_argument('-i', required=True, type=str, help="insam")
- parser.add_argument('-o', required=True, type=str, help="outdir")
- args = parser.parse_args()
- main(args.i, args.o)
|