#!/usr/bin/python # -*- coding:utf-8 -*- import argparse,time import pandas as pd def GetLocation(targetLocation): gene_location_pd=pd.DataFrame(columns=['Chr','Start','End','Strand','Gene','Transcript','ExonID']) index = 0 with open(targetLocation,'r') as tr_nm: for line in tr_nm: info = line.strip("\n").split("\t") #chr10 59788748 59793111 NM_005436 1 CCDC6 NM_005436 - gene_location_pd.loc[index] = info index +=1 return gene_location_pd def GetAnnoGeneInfo(chr,pos,gene_pd): select_pd=gene_pd[gene_pd['Chr']==chr] total_num = len(select_pd) pos=int(pos) #挨个遍历 for index in range(total_num): temp_pd = select_pd.iloc[index] if int(temp_pd[1]) > pos: if index ==0 : return False,"NO","NO",0,"NO" else: temp_pre_pd= select_pd.iloc[index-1] #同一个基因 if temp_pre_pd[4] == temp_pd[4]: exonID = temp_pd[6] if temp_pd[3] == '-' else temp_pre_pd[6] return True,temp_pd[4],temp_pd[3],exonID,temp_pd[5] #基因间区 else: return False,"NO","NO",0,"NO" #断点在exon上 elif int(temp_pd[1]) <= pos and int(temp_pd[2]) >= pos: return True,temp_pd[4],temp_pd[3],temp_pd[6],temp_pd[5] return False,"NO","NO",0,"NO" def GetPrimerInfo(points_s,points_e,strand): primer="" if strand == '+': if points_e > points_s: primer='primer3' else: primer='primer5' else: if points_e > points_s: primer='primer5' else: primer='primer3' return primer def GetAnno(site,point1_e,point2_e,gene_pd): out_anno=[] site_temp=site.split("-") point1_s_temp=site_temp[0].split(":") point1_s_chr=point1_s_temp[0] point1_s_pos=point1_s_temp[1] point2_s_temp=site_temp[1].split(":") point2_s_chr=point2_s_temp[0] point2_s_pos=point2_s_temp[1] point1_e_temp=point1_e.split(":") point1_e_chr=point1_e_temp[0] point1_e_pos=point1_e_temp[1] point2_e_temp=point2_e.split(":") point2_e_chr=point2_e_temp[0] point2_e_pos=point2_e_temp[1] point1_gene_anno=GetAnnoGeneInfo(point1_s_chr,point1_s_pos,gene_pd) point2_gene_anno=GetAnnoGeneInfo(point2_s_chr,point2_s_pos,gene_pd) if point1_gene_anno[0] and point2_gene_anno[0]: point1_gene=point1_gene_anno[1] point1_exonID=point1_gene_anno[3] point1_strand=point1_gene_anno[2] point1_transcript=point1_gene_anno[4] point2_gene=point2_gene_anno[1] point2_exonID=point2_gene_anno[3] point2_strand=point2_gene_anno[2] point2_transcript=point2_gene_anno[4] point1_primer='' if point1_s_chr==point1_e_chr and abs(int(point1_s_pos)-int(point1_e_pos)) <50: point1_primer=GetPrimerInfo(point1_s_pos,point1_e_pos,point1_strand) elif point1_s_chr==point2_e_chr and abs(int(point1_s_pos)-int(point2_e_pos)) <50: point1_primer=GetPrimerInfo(point1_s_pos,point2_e_pos,point1_strand) point2_primer='' if point2_s_chr==point2_e_chr and abs(int(point2_s_pos)-int(point2_e_pos)) <50: point2_primer=GetPrimerInfo(point2_s_pos,point2_e_pos,point2_strand) elif point2_s_chr==point1_e_chr and abs(int(point2_s_pos)-int(point1_e_pos)) <50: point2_primer=GetPrimerInfo(point2_s_pos,point1_e_pos,point2_strand) if point1_primer=='primer3' and point2_primer=='primer5': site_temp=site.split("-") site_rev=site_temp[1]+'-'+site_temp[0] out_anno=[point2_gene+'('+point2_strand+')'+'-'+point1_gene+'('+point1_strand+')',point2_gene[0]+point2_exonID+'-'+point1_gene[0]+point1_exonID,point2_transcript+'-'+point1_transcript,site_rev] elif point1_primer=='primer5' and point2_primer=='primer3': out_anno=[point1_gene+'('+point1_strand+')'+'-'+point2_gene+'('+point2_strand+')',point1_gene[0]+point1_exonID+'-'+point2_gene[0]+point2_exonID,point1_transcript+'-'+point2_transcript,site] return out_anno def main(infile,targetLocation,outfile): t1=time.time() gene_location_pd=GetLocation(targetLocation) outf=open(outfile,'w') outf.write("\t".join(["FusionID","Exon","Transcript","BreakPoints","Overlap","DoubleNum","SingleNum","UMIkinds","PrimerPairs","FusionSeq"])+"\n") with open(infile,'r') as inputFile: header=inputFile.readline().strip("\n").split("\t") Points_index=header.index("Points") Points1_End_index=header.index("Point1_End") Points2_End_index=header.index("Point2_End") DoubleReads_index=header.index("DoubleReads") SingleReads_index=header.index("SingleReads") UMIkinds_index=header.index("UMIkinds") Overlap_index=header.index("Overlap") Primer_index=header.index("PrimerPairs") FusionSeq_index=header.index("FusionSeq") for line in inputFile: info = line.strip("\n").split("\t") Stat_list=[info[index] for index in [Overlap_index,DoubleReads_index,SingleReads_index,UMIkinds_index,Primer_index,FusionSeq_index]] out_anno=GetAnno(info[Points_index],info[Points1_End_index],info[Points2_End_index],gene_location_pd) if len(out_anno)>0: out_anno.extend(Stat_list) outf.write("\t".join(out_anno)+"\n") outf.close() t2=time.time() print("Times: "+str(int(t2-t1))+"s") print("BreakPoints Annotation Done!") if __name__ == '__main__': parser = argparse.ArgumentParser(description='get reads mapping location') parser.add_argument('-i', required=True, type=str, help="infile") parser.add_argument('-t', required=True, type=str, help="target gene location") parser.add_argument('-o', required=True, type=str, help="outfile") args = parser.parse_args() main(args.i,args.t,args.o)