breakpoint_anno_v6.3.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
  1. #!/usr/bin/python
  2. # -*- coding:utf-8 -*-
  3. import argparse,time
  4. import pandas as pd
  5. def GetLocation(targetLocation):
  6. gene_location_pd=pd.DataFrame(columns=['Chr','Start','End','Strand','Gene','Transcript','ExonID'])
  7. index = 0
  8. with open(targetLocation,'r') as tr_nm:
  9. for line in tr_nm:
  10. info = line.strip("\n").split("\t")
  11. #chr10 59788748 59793111 NM_005436 1 CCDC6 NM_005436 -
  12. gene_location_pd.loc[index] = info
  13. index +=1
  14. return gene_location_pd
  15. def GetAnnoGeneInfo(chr,pos,gene_pd):
  16. select_pd=gene_pd[gene_pd['Chr']==chr]
  17. total_num = len(select_pd)
  18. pos=int(pos)
  19. #挨个遍历
  20. for index in range(total_num):
  21. temp_pd = select_pd.iloc[index]
  22. if int(temp_pd[1]) > pos:
  23. if index ==0 :
  24. return False,"NO","NO",0,"NO"
  25. else:
  26. temp_pre_pd= select_pd.iloc[index-1]
  27. #同一个基因
  28. if temp_pre_pd[4] == temp_pd[4]:
  29. exonID = temp_pd[6] if temp_pd[3] == '-' else temp_pre_pd[6]
  30. return True,temp_pd[4],temp_pd[3],exonID,temp_pd[5]
  31. #基因间区
  32. else:
  33. return False,"NO","NO",0,"NO"
  34. #断点在exon上
  35. elif int(temp_pd[1]) <= pos and int(temp_pd[2]) >= pos:
  36. return True,temp_pd[4],temp_pd[3],temp_pd[6],temp_pd[5]
  37. return False,"NO","NO",0,"NO"
  38. def GetPrimerInfo(points_s,points_e,strand):
  39. primer=""
  40. if strand == '+':
  41. if points_e > points_s:
  42. primer='primer3'
  43. else:
  44. primer='primer5'
  45. else:
  46. if points_e > points_s:
  47. primer='primer5'
  48. else:
  49. primer='primer3'
  50. return primer
  51. def GetAnno(site,point1_e,point2_e,gene_pd):
  52. out_anno=[]
  53. site_temp=site.split("-")
  54. point1_s_temp=site_temp[0].split(":")
  55. point1_s_chr=point1_s_temp[0]
  56. point1_s_pos=point1_s_temp[1]
  57. point2_s_temp=site_temp[1].split(":")
  58. point2_s_chr=point2_s_temp[0]
  59. point2_s_pos=point2_s_temp[1]
  60. point1_e_temp=point1_e.split(":")
  61. point1_e_chr=point1_e_temp[0]
  62. point1_e_pos=point1_e_temp[1]
  63. point2_e_temp=point2_e.split(":")
  64. point2_e_chr=point2_e_temp[0]
  65. point2_e_pos=point2_e_temp[1]
  66. point1_gene_anno=GetAnnoGeneInfo(point1_s_chr,point1_s_pos,gene_pd)
  67. point2_gene_anno=GetAnnoGeneInfo(point2_s_chr,point2_s_pos,gene_pd)
  68. if point1_gene_anno[0] and point2_gene_anno[0]:
  69. point1_gene=point1_gene_anno[1]
  70. point1_exonID=point1_gene_anno[3]
  71. point1_strand=point1_gene_anno[2]
  72. point1_transcript=point1_gene_anno[4]
  73. point2_gene=point2_gene_anno[1]
  74. point2_exonID=point2_gene_anno[3]
  75. point2_strand=point2_gene_anno[2]
  76. point2_transcript=point2_gene_anno[4]
  77. point1_primer=''
  78. if point1_s_chr==point1_e_chr and abs(int(point1_s_pos)-int(point1_e_pos)) <50:
  79. point1_primer=GetPrimerInfo(point1_s_pos,point1_e_pos,point1_strand)
  80. elif point1_s_chr==point2_e_chr and abs(int(point1_s_pos)-int(point2_e_pos)) <50:
  81. point1_primer=GetPrimerInfo(point1_s_pos,point2_e_pos,point1_strand)
  82. point2_primer=''
  83. if point2_s_chr==point2_e_chr and abs(int(point2_s_pos)-int(point2_e_pos)) <50:
  84. point2_primer=GetPrimerInfo(point2_s_pos,point2_e_pos,point2_strand)
  85. elif point2_s_chr==point1_e_chr and abs(int(point2_s_pos)-int(point1_e_pos)) <50:
  86. point2_primer=GetPrimerInfo(point2_s_pos,point1_e_pos,point2_strand)
  87. if point1_primer == 'primer3' and point2_primer == 'primer5':
  88. site_temp=site.split("-")
  89. site_rev=site_temp[1]+'-'+site_temp[0]
  90. out_anno=[point2_gene+'('+point2_strand+')'+'-'+point1_gene+'('+point1_strand+')',point2_gene[0]+point2_exonID+'-'+point1_gene[0]+point1_exonID,point2_transcript+'-'+point1_transcript,site_rev]
  91. elif point1_primer == 'primer5' and point2_primer== 'primer3':
  92. out_anno=[point1_gene+'('+point1_strand+')'+'-'+point2_gene+'('+point2_strand+')',point1_gene[0]+point1_exonID+'-'+point2_gene[0]+point2_exonID,point1_transcript+'-'+point2_transcript,site]
  93. return out_anno
  94. def main(infile,targetLocation,outfile):
  95. t1=time.time()
  96. gene_location_pd=GetLocation(targetLocation)
  97. outf=open(outfile,'w')
  98. outf.write("\t".join(["FusionID","Exon","Transcript","BreakPoint","Overlap","DoubleNum","SingleNum","UMIkind","PrimerPair","FusionSeq"])+"\n")
  99. with open(infile,'r') as inputFile:
  100. header=inputFile.readline().strip("\n").split("\t")
  101. Points_index=header.index("Point")
  102. Points1_End_index=header.index("Point1_End")
  103. Points2_End_index=header.index("Point2_End")
  104. DoubleReads_index=header.index("DoubleRead")
  105. SingleReads_index=header.index("SingleRead")
  106. UMIkinds_index=header.index("UMIkind")
  107. Overlap_index=header.index("Overlap")
  108. Primer_index=header.index("PrimerPair")
  109. FusionSeq_index=header.index("FusionSeq")
  110. for line in inputFile:
  111. info = line.strip("\n").split("\t")
  112. Stat_list=[info[index] for index in [Overlap_index,DoubleReads_index,SingleReads_index,UMIkinds_index,Primer_index,FusionSeq_index]]
  113. out_anno=GetAnno(info[Points_index],info[Points1_End_index],info[Points2_End_index],gene_location_pd)
  114. if len(out_anno)>0:
  115. out_anno.extend(Stat_list)
  116. outf.write("\t".join(out_anno)+"\n")
  117. outf.close()
  118. t2=time.time()
  119. print("Times: "+str(int(t2-t1))+"s")
  120. print("BreakPoints Annotation Done!")
  121. if __name__ == '__main__':
  122. parser = argparse.ArgumentParser(description='get reads mapping location')
  123. parser.add_argument('-i', required=True, type=str, help="infile")
  124. parser.add_argument('-t', required=True, type=str, help="target gene location")
  125. parser.add_argument('-o', required=True, type=str, help="outfile")
  126. args = parser.parse_args()
  127. main(args.i,args.t,args.o)