#!/usr/bin/python # -*- coding:utf-8 -*- import argparse,time import pandas as pd def GetLocation(targetLocation): gene_location_pd=pd.DataFrame(columns=['Chr','Start','End','Strand','Gene','Transcript','ExonID']) index = 0 with open(targetLocation,'r') as tr_nm: for line in tr_nm: info = line.strip("\n").split("\t") #chr10 59788748 59793111 NM_005436 1 CCDC6 NM_005436 - gene_location_pd.loc[index] = info index +=1 return gene_location_pd def GetAnnoGeneInfo(chr,pos,gene_pd): select_pd=gene_pd[gene_pd['Chr']==chr] total_num = len(select_pd) pos=int(pos) #挨个遍历 for index in range(total_num): temp_pd = select_pd.iloc[index] if int(temp_pd[1]) > pos: if index ==0 : return False,"NO","NO",0,"NO" else: temp_pre_pd= select_pd.iloc[index-1] #同一个基因 if temp_pre_pd[4] == temp_pd[4]: exonID = temp_pd[6] if temp_pd[3] == '-' else temp_pre_pd[6] return True,temp_pd[4],temp_pd[3],exonID,temp_pd[5] #基因间区 else: return False,"NO","NO",0,"NO" #断点在exon上 elif int(temp_pd[1]) <= pos and int(temp_pd[2]) >= pos: return True,temp_pd[4],temp_pd[3],temp_pd[6],temp_pd[5] return False,"NO","NO",0,"NO" def GetPrimerInfo(points_s,points_e,strand): primer="" if strand == '+': if points_e > points_s: primer='primer3' else: primer='primer5' else: if points_e > points_s: primer='primer5' else: primer='primer3' return primer def GetAnno(site,point1_e,gene_pd): out_anno=[] site_temp=site.split("-") point1_s_temp=site_temp[0].split(":") point1_s_chr=point1_s_temp[0] point1_s_pos=point1_s_temp[1] point1_e_temp=point1_e.split(":") point1_e_chr=point1_e_temp[0] point1_e_pos=point1_e_temp[1] point1_gene_anno=GetAnnoGeneInfo(point1_s_chr,point1_s_pos,gene_pd) if point1_gene_anno[0]: point1_strand=point1_gene_anno[2] point1_primer=GetPrimerInfo(point1_s_pos,point1_e_pos,point1_strand) out_anno=[point1_gene_anno[1]+"("+point1_gene_anno[2]+")",point1_gene_anno[3],point1_gene_anno[4]] out_anno.append(point1_primer) return out_anno def main(infile,targetLocation,outfile): t1=time.time() gene_location_pd=GetLocation(targetLocation) outf=open(outfile,'w') outf.write("\t".join(["GeneID","ExonID","Transcript","PrimerInfo","BreakPoints","SingleNum","UMIkinds","PrimerPair"])+"\n") with open(infile,'r') as inputFile: header=inputFile.readline().strip("\n").split("\t") Point_index=header.index("Point") PointDown_index=header.index("PointDown") SingleNum_index=header.index("SingleNum") UMIs_index=header.index("UMI") PrimerPairs_index=header.index("PrimerPair") for line in inputFile: info = line.strip("\n").split("\t") Stat_list=[info[index] for index in [Point_index,SingleNum_index,UMIs_index,PrimerPairs_index]] out_anno=GetAnno(info[Point_index],info[PointDown_index],gene_location_pd) if len(out_anno)>0: out_anno.extend(Stat_list) outf.write("\t".join(out_anno)+"\n") outf.close() t2=time.time() print("Times: "+str(int(t2-t1))+"s") print("BreakPoints Annotation Single Done!") if __name__ == '__main__': parser = argparse.ArgumentParser(description='get reads mapping location') parser.add_argument('-i', required=True, type=str, help="infile") parser.add_argument('-t', required=True, type=str, help="target gene location") parser.add_argument('-o', required=True, type=str, help="outfile") args = parser.parse_args() main(args.i,args.t,args.o)