breakpoint_anno_single_v6.3.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. #!/usr/bin/python
  2. # -*- coding:utf-8 -*-
  3. import argparse,time
  4. import pandas as pd
  5. def GetLocation(targetLocation):
  6. gene_location_pd=pd.DataFrame(columns=['Chr','Start','End','Strand','Gene','Transcript','ExonID'])
  7. index = 0
  8. with open(targetLocation,'r') as tr_nm:
  9. for line in tr_nm:
  10. info = line.strip("\n").split("\t")
  11. #chr10 59788748 59793111 NM_005436 1 CCDC6 NM_005436 -
  12. gene_location_pd.loc[index] = info
  13. index +=1
  14. return gene_location_pd
  15. def GetAnnoGeneInfo(chr,pos,gene_pd):
  16. select_pd=gene_pd[gene_pd['Chr']==chr]
  17. total_num = len(select_pd)
  18. pos=int(pos)
  19. #挨个遍历
  20. for index in range(total_num):
  21. temp_pd = select_pd.iloc[index]
  22. if int(temp_pd[1]) > pos:
  23. if index ==0 :
  24. return False,"NO","NO",0,"NO"
  25. else:
  26. temp_pre_pd= select_pd.iloc[index-1]
  27. #同一个基因
  28. if temp_pre_pd[4] == temp_pd[4]:
  29. exonID = temp_pd[6] if temp_pd[3] == '-' else temp_pre_pd[6]
  30. return True,temp_pd[4],temp_pd[3],exonID,temp_pd[5]
  31. #基因间区
  32. else:
  33. return False,"NO","NO",0,"NO"
  34. #断点在exon上
  35. elif int(temp_pd[1]) <= pos and int(temp_pd[2]) >= pos:
  36. return True,temp_pd[4],temp_pd[3],temp_pd[6],temp_pd[5]
  37. return False,"NO","NO",0,"NO"
  38. def GetPrimerInfo(points_s,points_e,strand):
  39. primer=""
  40. if strand == '+':
  41. if points_e > points_s:
  42. primer='primer3'
  43. else:
  44. primer='primer5'
  45. else:
  46. if points_e > points_s:
  47. primer='primer5'
  48. else:
  49. primer='primer3'
  50. return primer
  51. def GetAnno(site,point1_e,gene_pd):
  52. out_anno=[]
  53. site_temp=site.split("-")
  54. point1_s_temp=site_temp[0].split(":")
  55. point1_s_chr=point1_s_temp[0]
  56. point1_s_pos=point1_s_temp[1]
  57. point1_e_temp=point1_e.split(":")
  58. point1_e_chr=point1_e_temp[0]
  59. point1_e_pos=point1_e_temp[1]
  60. point1_gene_anno=GetAnnoGeneInfo(point1_s_chr,point1_s_pos,gene_pd)
  61. if point1_gene_anno[0]:
  62. point1_strand=point1_gene_anno[2]
  63. point1_primer=GetPrimerInfo(point1_s_pos,point1_e_pos,point1_strand)
  64. out_anno=[point1_gene_anno[1]+"("+point1_gene_anno[2]+")",point1_gene_anno[3],point1_gene_anno[4]]
  65. out_anno.append(point1_primer)
  66. return out_anno
  67. def main(infile,targetLocation,outfile):
  68. t1=time.time()
  69. gene_location_pd=GetLocation(targetLocation)
  70. outf=open(outfile,'w')
  71. outf.write("\t".join(["GeneID","ExonID","Transcript","PrimerInfo","BreakPoints","SingleNum","UMIkinds","PrimerPair"])+"\n")
  72. with open(infile,'r') as inputFile:
  73. header=inputFile.readline().strip("\n").split("\t")
  74. Point_index=header.index("Point")
  75. PointDown_index=header.index("PointDown")
  76. SingleNum_index=header.index("SingleNum")
  77. UMIs_index=header.index("UMI")
  78. PrimerPairs_index=header.index("PrimerPair")
  79. for line in inputFile:
  80. info = line.strip("\n").split("\t")
  81. Stat_list=[info[index] for index in [Point_index,SingleNum_index,UMIs_index,PrimerPairs_index]]
  82. out_anno=GetAnno(info[Point_index],info[PointDown_index],gene_location_pd)
  83. if len(out_anno)>0:
  84. out_anno.extend(Stat_list)
  85. outf.write("\t".join(out_anno)+"\n")
  86. outf.close()
  87. t2=time.time()
  88. print("Times: "+str(int(t2-t1))+"s")
  89. print("BreakPoints Annotation Single Done!")
  90. if __name__ == '__main__':
  91. parser = argparse.ArgumentParser(description='get reads mapping location')
  92. parser.add_argument('-i', required=True, type=str, help="infile")
  93. parser.add_argument('-t', required=True, type=str, help="target gene location")
  94. parser.add_argument('-o', required=True, type=str, help="outfile")
  95. args = parser.parse_args()
  96. main(args.i,args.t,args.o)