123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117 |
- #!/usr/bin/python
- # -*- coding:utf-8 -*-
- import argparse,time
- import pandas as pd
- def GetLocation(targetLocation):
-
- gene_location_pd=pd.DataFrame(columns=['Chr','Start','End','Strand','Gene','Transcript','ExonID'])
- index = 0
- with open(targetLocation,'r') as tr_nm:
- for line in tr_nm:
- info = line.strip("\n").split("\t")
- #chr10 59788748 59793111 NM_005436 1 CCDC6 NM_005436 -
- gene_location_pd.loc[index] = info
- index +=1
- return gene_location_pd
- def GetAnnoGeneInfo(chr,pos,gene_pd):
- select_pd=gene_pd[gene_pd['Chr']==chr]
- total_num = len(select_pd)
- pos=int(pos)
- #挨个遍历
- for index in range(total_num):
- temp_pd = select_pd.iloc[index]
- if int(temp_pd[1]) > pos:
- if index ==0 :
- return False,"NO","NO",0,"NO"
- else:
- temp_pre_pd= select_pd.iloc[index-1]
- #同一个基因
- if temp_pre_pd[4] == temp_pd[4]:
- exonID = temp_pd[6] if temp_pd[3] == '-' else temp_pre_pd[6]
- return True,temp_pd[4],temp_pd[3],exonID,temp_pd[5]
- #基因间区
- else:
- return False,"NO","NO",0,"NO"
- #断点在exon上
- elif int(temp_pd[1]) <= pos and int(temp_pd[2]) >= pos:
- return True,temp_pd[4],temp_pd[3],temp_pd[6],temp_pd[5]
-
- return False,"NO","NO",0,"NO"
- def GetPrimerInfo(points_s,points_e,strand):
-
- primer=""
- if strand == '+':
- if points_e > points_s:
- primer='primer3'
- else:
- primer='primer5'
- else:
- if points_e > points_s:
- primer='primer5'
- else:
- primer='primer3'
- return primer
- def GetAnno(site,point1_e,gene_pd):
- out_anno=[]
- site_temp=site.split("-")
- point1_s_temp=site_temp[0].split(":")
- point1_s_chr=point1_s_temp[0]
- point1_s_pos=point1_s_temp[1]
- point1_e_temp=point1_e.split(":")
- point1_e_chr=point1_e_temp[0]
- point1_e_pos=point1_e_temp[1]
- point1_gene_anno=GetAnnoGeneInfo(point1_s_chr,point1_s_pos,gene_pd)
- if point1_gene_anno[0]:
- point1_strand=point1_gene_anno[2]
- point1_primer=GetPrimerInfo(point1_s_pos,point1_e_pos,point1_strand)
- out_anno=[point1_gene_anno[1]+"("+point1_gene_anno[2]+")",point1_gene_anno[3],point1_gene_anno[4]]
- out_anno.append(point1_primer)
- return out_anno
- def main(infile,targetLocation,outfile):
- t1=time.time()
- gene_location_pd=GetLocation(targetLocation)
- outf=open(outfile,'w')
- outf.write("\t".join(["GeneID","ExonID","Transcript","PrimerInfo","BreakPoints","SingleNum","UMIkinds","PrimerPair"])+"\n")
- with open(infile,'r') as inputFile:
- header=inputFile.readline().strip("\n").split("\t")
- Point_index=header.index("Point")
- PointDown_index=header.index("PointDown")
- SingleNum_index=header.index("SingleNum")
- UMIs_index=header.index("UMI")
- PrimerPairs_index=header.index("PrimerPair")
- for line in inputFile:
- info = line.strip("\n").split("\t")
- Stat_list=[info[index] for index in [Point_index,SingleNum_index,UMIs_index,PrimerPairs_index]]
- out_anno=GetAnno(info[Point_index],info[PointDown_index],gene_location_pd)
- if len(out_anno)>0:
- out_anno.extend(Stat_list)
- outf.write("\t".join(out_anno)+"\n")
- outf.close()
-
- t2=time.time()
- print("Times: "+str(int(t2-t1))+"s")
- print("BreakPoints Annotation Single Done!")
- if __name__ == '__main__':
- parser = argparse.ArgumentParser(description='get reads mapping location')
- parser.add_argument('-i', required=True, type=str, help="infile")
- parser.add_argument('-t', required=True, type=str, help="target gene location")
- parser.add_argument('-o', required=True, type=str, help="outfile")
- args = parser.parse_args()
- main(args.i,args.t,args.o)
|