breakpoint_search_v6.3.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. #!/usr/bin/python
  2. # -*- coding:utf-8 -*-
  3. import argparse,time,pysam,os
  4. from SearchReadsFusion import DoubleReadsFusion as DRF
  5. from SearchReadsFusion import SingleReadsFusion as SRF
  6. def ReadBam(inbam):
  7. ID_dict={}
  8. bf=pysam.AlignmentFile(inbam,'rb')
  9. for read in bf:
  10. ID = read.qname
  11. POS = read.pos+1
  12. CHR = read.reference_name
  13. FLAG = read.flag
  14. MAP_INFO = read.cigarstring
  15. MAPQ = read.mapq
  16. SEQ = read.seq
  17. #过滤掉比对质量低的hardclip reads
  18. if FLAG >245 and MAPQ <30:
  19. continue
  20. read_list=[ID,FLAG,CHR,POS,MAPQ,MAP_INFO,SEQ]
  21. if ID not in ID_dict.keys():
  22. ID_dict[ID]=[read_list]
  23. else:
  24. ID_dict[ID].append(read_list)
  25. return ID_dict
  26. def CheckClip(reads_info):
  27. flag = False
  28. #[ID,flag,chr,pos,MAPQ,MAP_info,SEQ]
  29. for reads in reads_info:
  30. MAP_info = reads[5]
  31. if MAP_info == None:
  32. continue
  33. if 'H' in MAP_info:
  34. return "HARD_CLIP"
  35. elif 'S' in MAP_info:
  36. flag = True
  37. if flag:
  38. return "SOFT_CLIP"
  39. else:
  40. return "NO_CLIP"
  41. def main(inbam,outdir):
  42. t1=time.time()
  43. #将sam内容按照ID区分,放到ID_info中
  44. ID_dict=ReadBam(inbam)
  45. #存放soft_clip的reads ID
  46. soft_clip_ID=[]
  47. #输出到本地hard clip fusion
  48. hard_clip_file=outdir+"/hard_clip_fusion_breakpoint.txt"
  49. hard_clip_out=open(hard_clip_file,'w')
  50. header=["Site","Site1","Site2","Seq","Overlap","Site1_Down","Site2_Down","ReadsID"]
  51. hard_clip_out.write("\t".join(header)+"\n")
  52. #找double reads 类
  53. drf =DRF()
  54. for ID in ID_dict.keys():
  55. reads_info = ID_dict[ID]
  56. clip_stat=CheckClip(reads_info)
  57. if clip_stat == "SOFT_CLIP":
  58. soft_clip_ID.append(ID)
  59. elif clip_stat == "HARD_CLIP":
  60. out_list=drf.HardClipFusion(reads_info)
  61. if len(out_list) !=0:
  62. out_list.append(ID)
  63. hard_clip_out.write("\t".join(out_list)+"\n")
  64. hard_clip_out.close()
  65. #输出到本地single
  66. single_points=outdir+"/soft_clip_single_breakpoint.txt"
  67. SingleInput=open(single_points,'w')
  68. header=["Site","MismatchSeq","MatchSeq","Site_Down","ReadsID"]
  69. SingleInput.write("\t".join(header)+"\n")
  70. #输出到本地double
  71. double_points=outdir+"/soft_clip_double_breakpoint.txt"
  72. DoubleInput=open(double_points,'w')
  73. #DoubleInput.write("\t".join(header)+"\n")
  74. #找single reads 类
  75. srf=SRF()
  76. #
  77. for soft_ID in soft_clip_ID:
  78. reads_info = ID_dict[soft_ID]
  79. soft_clip_results=srf.SoftClipFusion(reads_info)
  80. if len(soft_clip_results[0]) !=0 and len(soft_clip_results[1]) !=0:
  81. if soft_clip_results[0][0] != soft_clip_results[1][0]:
  82. pass
  83. else:
  84. SingleInput.write("\t".join(soft_clip_results[0])+"\n")
  85. else:
  86. if len(soft_clip_results[0]) !=0:
  87. SingleInput.write("\t".join(soft_clip_results[0])+"\n")
  88. elif len(soft_clip_results[1]) !=0:
  89. SingleInput.write("\t".join(soft_clip_results[1])+"\n")
  90. SingleInput.close()
  91. DoubleInput.close()
  92. double_points_file=outdir+"/double_breakpoint.txt"
  93. os.system("cat "+hard_clip_file+" > "+double_points_file+" && cat "+double_points+" >> "+double_points_file+" && rm "+hard_clip_file+" && rm "+double_points)
  94. single_points_file=outdir+"/single_breakpoint.txt"
  95. os.system("cp "+single_points+" "+single_points_file+" && rm "+single_points)
  96. t2=time.time()
  97. print("Times: "+str(int(t2-t1))+"s")
  98. print("BreakPoints Search Done!")
  99. if __name__ == '__main__':
  100. parser = argparse.ArgumentParser(description='get reads mapping location')
  101. parser.add_argument('-i', required=True, type=str, help="inbam")
  102. parser.add_argument('-o', required=True, type=str, help="outdir")
  103. args = parser.parse_args()
  104. main(args.i, args.o)