123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126 |
- #!/usr/bin/python
- # -*- coding:utf-8 -*-
- import argparse,time,pysam,os
- from SearchReadsFusion import DoubleReadsFusion as DRF
- from SearchReadsFusion import SingleReadsFusion as SRF
- def ReadBam(inbam):
- ID_dict={}
- bf=pysam.AlignmentFile(inbam,'rb')
-
- for read in bf:
- ID = read.qname
- POS = read.pos+1
- CHR = read.reference_name
- FLAG = read.flag
- MAP_INFO = read.cigarstring
- MAPQ = read.mapq
- SEQ = read.seq
- #过滤掉比对质量低的hardclip reads
- if FLAG >245 and MAPQ <30:
- continue
- read_list=[ID,FLAG,CHR,POS,MAPQ,MAP_INFO,SEQ]
- if ID not in ID_dict.keys():
- ID_dict[ID]=[read_list]
- else:
- ID_dict[ID].append(read_list)
- return ID_dict
- def CheckClip(reads_info):
- flag = False
- #[ID,flag,chr,pos,MAPQ,MAP_info,SEQ]
- for reads in reads_info:
- MAP_info = reads[5]
- if MAP_info == None:
- continue
- if 'H' in MAP_info:
- return "HARD_CLIP"
- elif 'S' in MAP_info:
- flag = True
- if flag:
- return "SOFT_CLIP"
- else:
- return "NO_CLIP"
- def main(inbam,outdir):
- t1=time.time()
- #将sam内容按照ID区分,放到ID_info中
- ID_dict=ReadBam(inbam)
- #存放soft_clip的reads ID
- soft_clip_ID=[]
- #输出到本地hard clip fusion
- hard_clip_file=outdir+"/hard_clip_fusion_breakpoint.txt"
- hard_clip_out=open(hard_clip_file,'w')
- header=["Site","Site1","Site2","Seq","Overlap","Site1_Down","Site2_Down","ReadsID"]
- hard_clip_out.write("\t".join(header)+"\n")
-
- #找double reads 类
- drf =DRF()
- for ID in ID_dict.keys():
- reads_info = ID_dict[ID]
- clip_stat=CheckClip(reads_info)
- if clip_stat == "SOFT_CLIP":
- soft_clip_ID.append(ID)
- elif clip_stat == "HARD_CLIP":
- out_list=drf.HardClipFusion(reads_info)
- if len(out_list) !=0:
- out_list.append(ID)
- hard_clip_out.write("\t".join(out_list)+"\n")
- hard_clip_out.close()
- #输出到本地single
- single_points=outdir+"/soft_clip_single_breakpoint.txt"
- SingleInput=open(single_points,'w')
- header=["Site","MismatchSeq","MatchSeq","Site_Down","ReadsID"]
- SingleInput.write("\t".join(header)+"\n")
- #输出到本地double
- double_points=outdir+"/soft_clip_double_breakpoint.txt"
- DoubleInput=open(double_points,'w')
- #DoubleInput.write("\t".join(header)+"\n")
-
- #找single reads 类
- srf=SRF()
- #
- for soft_ID in soft_clip_ID:
- reads_info = ID_dict[soft_ID]
- soft_clip_results=srf.SoftClipFusion(reads_info)
- if len(soft_clip_results[0]) !=0 and len(soft_clip_results[1]) !=0:
- if soft_clip_results[0][0] != soft_clip_results[1][0]:
- pass
- else:
- SingleInput.write("\t".join(soft_clip_results[0])+"\n")
- else:
- if len(soft_clip_results[0]) !=0:
- SingleInput.write("\t".join(soft_clip_results[0])+"\n")
- elif len(soft_clip_results[1]) !=0:
- SingleInput.write("\t".join(soft_clip_results[1])+"\n")
- SingleInput.close()
- DoubleInput.close()
- double_points_file=outdir+"/double_breakpoint.txt"
- os.system("cat "+hard_clip_file+" > "+double_points_file+" && cat "+double_points+" >> "+double_points_file+" && rm "+hard_clip_file+" && rm "+double_points)
- single_points_file=outdir+"/single_breakpoint.txt"
- os.system("cp "+single_points+" "+single_points_file+" && rm "+single_points)
- t2=time.time()
- print("Times: "+str(int(t2-t1))+"s")
- print("BreakPoints Search Done!")
- if __name__ == '__main__':
- parser = argparse.ArgumentParser(description='get reads mapping location')
- parser.add_argument('-i', required=True, type=str, help="inbam")
- parser.add_argument('-o', required=True, type=str, help="outdir")
- args = parser.parse_args()
- main(args.i, args.o)
|