123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156 |
- #!/usr/bin/python
- # -*- coding:utf-8 -*-
- import argparse,time
- import pandas as pd
- def GetLocation(targetLocation):
-
- gene_location_pd=pd.DataFrame(columns=['Chr','Start','End','Strand','Gene','Transcript','ExonID'])
- index = 0
- with open(targetLocation,'r') as tr_nm:
- for line in tr_nm:
- info = line.strip("\n").split("\t")
- #chr10 59788748 59793111 NM_005436 1 CCDC6 NM_005436 -
- gene_location_pd.loc[index] = info
- index +=1
- return gene_location_pd
- def GetAnnoGeneInfo(chr,pos,gene_pd):
- select_pd=gene_pd[gene_pd['Chr']==chr]
- total_num = len(select_pd)
- pos=int(pos)
- #挨个遍历
- for index in range(total_num):
- temp_pd = select_pd.iloc[index]
- if int(temp_pd[1]) > pos:
- if index ==0 :
- return False,"NO","NO",0,"NO"
- else:
- temp_pre_pd= select_pd.iloc[index-1]
- #同一个基因
- if temp_pre_pd[4] == temp_pd[4]:
- exonID = temp_pd[6] if temp_pd[3] == '-' else temp_pre_pd[6]
- return True,temp_pd[4],temp_pd[3],exonID,temp_pd[5]
- #基因间区
- else:
- return False,"NO","NO",0,"NO"
- #断点在exon上
- elif int(temp_pd[1]) <= pos and int(temp_pd[2]) >= pos:
- return True,temp_pd[4],temp_pd[3],temp_pd[6],temp_pd[5]
-
- return False,"NO","NO",0,"NO"
- def GetPrimerInfo(points_s,points_e,strand):
-
- primer=""
- if strand == '+':
- if points_e > points_s:
- primer='primer3'
- else:
- primer='primer5'
- else:
- if points_e > points_s:
- primer='primer5'
- else:
- primer='primer3'
- return primer
- def GetAnno(site,point1_e,point2_e,gene_pd):
- out_anno=[]
- site_temp=site.split("-")
- point1_s_temp=site_temp[0].split(":")
- point1_s_chr=point1_s_temp[0]
- point1_s_pos=point1_s_temp[1]
- point2_s_temp=site_temp[1].split(":")
- point2_s_chr=point2_s_temp[0]
- point2_s_pos=point2_s_temp[1]
- point1_e_temp=point1_e.split(":")
- point1_e_chr=point1_e_temp[0]
- point1_e_pos=point1_e_temp[1]
- point2_e_temp=point2_e.split(":")
- point2_e_chr=point2_e_temp[0]
- point2_e_pos=point2_e_temp[1]
- point1_gene_anno=GetAnnoGeneInfo(point1_s_chr,point1_s_pos,gene_pd)
- point2_gene_anno=GetAnnoGeneInfo(point2_s_chr,point2_s_pos,gene_pd)
- if point1_gene_anno[0] and point2_gene_anno[0]:
- point1_gene=point1_gene_anno[1]
- point1_exonID=point1_gene_anno[3]
- point1_strand=point1_gene_anno[2]
- point1_transcript=point1_gene_anno[4]
- point2_gene=point2_gene_anno[1]
- point2_exonID=point2_gene_anno[3]
- point2_strand=point2_gene_anno[2]
- point2_transcript=point2_gene_anno[4]
- point1_primer=''
- if point1_s_chr==point1_e_chr and abs(int(point1_s_pos)-int(point1_e_pos)) <50:
- point1_primer=GetPrimerInfo(point1_s_pos,point1_e_pos,point1_strand)
- elif point1_s_chr==point2_e_chr and abs(int(point1_s_pos)-int(point2_e_pos)) <50:
- point1_primer=GetPrimerInfo(point1_s_pos,point2_e_pos,point1_strand)
-
- point2_primer=''
- if point2_s_chr==point2_e_chr and abs(int(point2_s_pos)-int(point2_e_pos)) <50:
- point2_primer=GetPrimerInfo(point2_s_pos,point2_e_pos,point2_strand)
- elif point2_s_chr==point1_e_chr and abs(int(point2_s_pos)-int(point1_e_pos)) <50:
- point2_primer=GetPrimerInfo(point2_s_pos,point1_e_pos,point2_strand)
- if point1_primer == 'primer3' and point2_primer == 'primer5':
- site_temp=site.split("-")
- site_rev=site_temp[1]+'-'+site_temp[0]
- out_anno=[point2_gene+'('+point2_strand+')'+'-'+point1_gene+'('+point1_strand+')',point2_gene[0]+point2_exonID+'-'+point1_gene[0]+point1_exonID,point2_transcript+'-'+point1_transcript,site_rev]
- elif point1_primer == 'primer5' and point2_primer== 'primer3':
- out_anno=[point1_gene+'('+point1_strand+')'+'-'+point2_gene+'('+point2_strand+')',point1_gene[0]+point1_exonID+'-'+point2_gene[0]+point2_exonID,point1_transcript+'-'+point2_transcript,site]
-
- return out_anno
- def main(infile,targetLocation,outfile):
- t1=time.time()
- gene_location_pd=GetLocation(targetLocation)
- outf=open(outfile,'w')
- outf.write("\t".join(["FusionID","Exon","Transcript","BreakPoint","Overlap","DoubleNum","SingleNum","UMIkind","PrimerPair","FusionSeq"])+"\n")
- with open(infile,'r') as inputFile:
- header=inputFile.readline().strip("\n").split("\t")
- Points_index=header.index("Point")
- Points1_End_index=header.index("Point1_End")
- Points2_End_index=header.index("Point2_End")
- DoubleReads_index=header.index("DoubleRead")
- SingleReads_index=header.index("SingleRead")
- UMIkinds_index=header.index("UMIkind")
- Overlap_index=header.index("Overlap")
- Primer_index=header.index("PrimerPair")
- FusionSeq_index=header.index("FusionSeq")
- for line in inputFile:
- info = line.strip("\n").split("\t")
- Stat_list=[info[index] for index in [Overlap_index,DoubleReads_index,SingleReads_index,UMIkinds_index,Primer_index,FusionSeq_index]]
- out_anno=GetAnno(info[Points_index],info[Points1_End_index],info[Points2_End_index],gene_location_pd)
- if len(out_anno)>0:
- out_anno.extend(Stat_list)
- outf.write("\t".join(out_anno)+"\n")
- outf.close()
-
- t2=time.time()
- print("Times: "+str(int(t2-t1))+"s")
- print("BreakPoints Annotation Done!")
- if __name__ == '__main__':
- parser = argparse.ArgumentParser(description='get reads mapping location')
- parser.add_argument('-i', required=True, type=str, help="infile")
- parser.add_argument('-t', required=True, type=str, help="target gene location")
- parser.add_argument('-o', required=True, type=str, help="outfile")
- args = parser.parse_args()
- main(args.i,args.t,args.o)
|