#for the fusion summary by the Genefusion and Factera method #date:20220217 import json import pandas as pd import sys,collections,math,os,os.path,re import xlrd import gzip from bs4 import BeautifulSoup import re import argparse #######acquire the transcrip strand form html #->mean the original strand;<-mean the recov pd.set_option('display.max_columns',6) def infro_html(htmlPath): ''' extract the transcript strand form html → means original read, ← means reverse complement :param htmlPath: the genefuse html :return: the trans strand ''' with open(htmlPath,"r",encoding="utf-8") as file: file1 = file.read() f1 = re.findall(".+, Fusion: (.+?) .+?<", file1) f2 = re.findall("(.*?)",file1) f3 = re.findall("(.*?)", file1) f3_str = ".+" +\ "|.+" +\ "|.+" f4 = re.findall(f3_str, file1) f5_str = ".+Supporting" +\ "|.+Supporting" +\ "|.+Supporting" f5 = re.findall(f5_str, file1) print(len(f1),len(f2),len(f3),len(f4),len(f5)) for i in f4: pass f4_1 = [] f5_1 = [] for i in range(len(f4)): if "←" in f4[i]: f4_1.append("3_5") elif "→" in f4[i]: f4_1.append("5_3") else: f4_1.append("no") if "←" in f5[i]: f5_1.append("3_5") elif "→" in f5[i]: f5_1.append("5_3") else: f5_1.append("no") data = pd.DataFrame([f1,f2,f4_1,f3,f5_1]) data = data.T data.columns = ["Fusion","Gene_left","Trans_left","Gene_right","Trans_right"] return data #### def infro_json(filename): ''' extract the information form json :param filename: the genefuse json :return: all the fusion information ''' with open(filename) as f: fusionlist = json.load(f) result = pd.DataFrame() fusionlist_data = fusionlist['fusions'] if fusionlist_data: for key, value in fusionlist_data.items(): # for left df_left = pd.DataFrame.from_dict(value['left'], orient='index', columns=['values']) # change row index df_left_renamed = df_left.rename( index={'gene_name': 'Gene_left', 'gene_chr': 'chr_left', 'position': 'position_left', 'reference': 'reference_left', 'ref_ext': 'ref_ext_left', 'pos_str': 'pos_str_left', 'exon_or_intron': 'exon_or_intron_left', 'exon_or_intron_id': 'exon_or_intron_id_left', 'strand': 'strand_left'}) # for right df_right = pd.DataFrame.from_dict(value['right'], orient='index', columns=['values']) # change the row index df_right_renamed = df_right.rename( index={'gene_name': 'Gene_right', 'gene_chr': 'chr_right', 'position': 'position_right', 'reference': 'reference_right', 'ref_ext': 'ref_ext_right', 'pos_str': 'pos_str_right', 'exon_or_intron': 'exon_or_intron_right', 'exon_or_intron_id': 'exon_or_intron_id_right', 'strand': 'strand_right'}) # extract unique reads uniquereads = pd.DataFrame(columns=['values']) uniquereads.loc['Unique_reads'] = value['unique'] # merge fusioninfor1 = df_left_renamed.append(df_right_renamed) fusioninfor2 = fusioninfor1.append(uniquereads) # trans the result fusioninfor3 = pd.DataFrame(fusioninfor2.values.T, index=fusioninfor2.columns, columns=fusioninfor2.index) fusioninfor3.insert(0, 'Fusion1', key) result = result.append(fusioninfor3) result = result.reset_index() del result['index'] result['Total_reads'] = ( (result['Fusion1'].str.split(' ', expand=True)[4]).str.split(',', expand=True)[0]).astype("int") result.insert(0, 'Fusion', result['Fusion1'].str.split(' ', expand=True)[1]) del result['Fusion1'] else: namelist=['Fusion', 'Gene_left', 'chr_left', 'position_left', 'reference_left','ref_ext_left', 'pos_str_left', 'exon_or_intron_left', 'exon_or_intron_id_left', 'strand_left', 'Gene_right', 'chr_right','position_right', 'reference_right', 'ref_ext_right', 'pos_str_right','exon_or_intron_right', 'exon_or_intron_id_right', 'strand_right', 'Unique_reads', 'Total_reads'] result=pd.DataFrame(columns=namelist) return result ###t提取genefuse的序列 def infro_txt(Path): ''' extract the transcript strand form html → means original read, ← means reverse complement :param htmlPath: the genefuse html :return: the trans strand ''' with open(Path,"r",encoding="utf-8") as file: file1 = file.read() t1 = re.findall("#Fusion: (.+?) .+?total: (.+?),.+", file1) num=0 for i in t1: num = num+int(i[1]) if num!=0: t2 = re.findall("name: (.+)", file1) if num==len(t2): t3 = re.findall("\n[ATCG].+", file1) t1_1 = [] for i in t1: for i1 in range(int(i[1])): t1_1.append(i[0]) data = pd.DataFrame([t1_1,t2,t3]) data = data.T data.columns = ["Fusion","Name",""] return data else: data = pd.DataFrame(columns=["Fusion", "Name", "Seq"]) print(os.path.basename(Path)+" error:The number of fusion-names is inconsistent") return data else: data = pd.DataFrame(columns=["Fusion", "Name", "Seq"]) print(os.path.basename(Path)+":File no result") return data ######对VAF的计算 #1.获得genefuse的readid #2.我们将断点区域上下各扩展50bp ##3.提取genefuse与bam中断点位置overlap的reads def VAF_fusion(laneid,sampleid,fusion_chr,fusion_position,inputpath): GeneFusiondir = os.path.join(inputpath, '6Fusion_genefusion_unpair') # acqure the read in fusion region fusion_loc_ex50=str(fusion_chr) + ':' + str(fusion_position-50) + '-' + str(fusion_position+ 50) pandir = os.path.join('/cgdata/pancancer/analyse',laneid+'/bamfile') bamdir =os.path.join(pandir, sampleid + '_clean.bam') bed_file = GeneFusiondir + '/' + sampleid + '.' + fusion_loc_ex50 + '.bed' read_bedcmd = 'samtools view -h ' + bamdir + ' ' + fusion_loc_ex50 + ' | ' + 'samtools view -bS - | bedtools bamtobed -i - ' + '>' + bed_file os.system(read_bedcmd) # readin the result read_region = pd.read_table(bed_file, sep='\t', names=['chr', 'start', 'end', 'readid', 'len', 'strand']) ##3.提取genefuse与bam中断点位置overlap的reads txtfile = GeneFusiondir + '/' + sampleid + '_fusion_GeneFusion.txt' genefuse_result = infro_txt(txtfile) # rename readinfor = genefuse_result['Name'].str.split(' ', expand=True) gfusereadid = readinfor[0] + '/' + (readinfor[1].str.split(':', expand=True))[0] gfusereadid1 = pd.DataFrame(gfusereadid) gfusereadid1.rename(columns={0: 'readid'}, inplace=True) # merge read_overlap = pd.merge(gfusereadid1, read_region, on='readid', how='inner') ###acquire the fusion depths depthoutdir = GeneFusiondir + '/' + sampleid + '_' + fusion_loc_ex50 + '.txt' depthcomd = 'samtools depth -r ' + fusion_loc_ex50+ ' ' + bamdir + '>' + depthoutdir os.system(depthcomd) depthdata = pd.read_table(depthoutdir, sep='\t', names=['chr', 'positon', 'total_reads']) #filter the depth depthdata['chr'] = depthdata['chr'].astype(str) filter1 = depthdata[(depthdata['chr'] == fusion_chr) & (depthdata['positon'] == fusion_position)] filter1.reset_index(drop=True, inplace=True) if filter1.shape[0]!=0: Total_depth = filter1.loc[0, 'total_reads'] fusion_readcounts = len(read_overlap) VAF = round(fusion_readcounts / Total_depth, 4) print('round4VAF '+str(VAF)) else: VAF=0 os.remove(bed_file) os.remove(depthoutdir) return VAF #fusion_chr=6 #fusion_position=117647036 #VAF_fusion(sampleid,fusion_chr,fusion_position,inputpath) #####修改,转录本方向改变的时候为3-5时,对应的left和right也需要同时进行修改 def fusion_extract(inputpath,laneid,jsonfile,htmlfile,tumorname): ''' ####extract the fusion information form the json and html ###and calculate the VAF :param jsonfile: the json file of genefuse :param htmlfile: the html file of genefuse :return: the information of fusion ''' headlist = [ 'Fusion_new', 'VAF', 'Gene_left', 'chr_left', 'position_left', 'exon_or_intron_left', 'exon_or_intron_id_left', 'Gene_right', 'chr_right', 'position_right', 'exon_or_intron_right', 'exon_or_intron_id_right', 'Unique_reads', 'Total_reads'] #extract the json file json_data = infro_json(jsonfile) #extract the html file html_data = pd.DataFrame(infro_html(htmlfile)) if (json_data.shape[0]!=0) & (html_data.shape[0]!=0) : #获得结果均有转录本,且转录本方向一致的结果 #html_data01 = html_data[html_data['Trans_left'] == html_data['Trans_right']] #当其中一个的转录本方向不定,结果为空, #html_data02 = html_data01[html_data01['Trans_left'] != 'no'] #html_data03 = html_data02[html_data01['Trans_right'] != 'no'] #html_data03.reset_index(drop=True, inplace=True) # merge the data fusion_infor1 = pd.merge(json_data, html_data, on=["Fusion", "Gene_left", "Gene_right"], how='right') #fusion_infor1 = pd.merge(json_data, html_data03, on=["Fusion", "Gene_left", "Gene_right"], how='right') fusion_infor1['Gene_right'] = fusion_infor1['Gene_right'].str.split('_', expand=True)[0] fusion_infor1['Gene_left'] = fusion_infor1['Gene_left'].str.split('_', expand=True)[0] for j in range(len(fusion_infor1)): print(j) #print(fusion_infor1.loc[j, :]) unique_reads = fusion_infor1.loc[j, 'Unique_reads'] right_gene = fusion_infor1.loc[j, 'Gene_right'] right_exonintron = fusion_infor1.loc[j, 'exon_or_intron_right'] right_exonintron_id = fusion_infor1.loc[j, 'exon_or_intron_id_right'] right_chr = fusion_infor1.loc[j, 'chr_right'] right_position = fusion_infor1.loc[j, 'position_right'] left_gene = fusion_infor1.loc[j, 'Gene_left'] left_exonintron = fusion_infor1.loc[j, 'exon_or_intron_left'] left_exonintron_id = fusion_infor1.loc[j, 'exon_or_intron_id_left'] left_chr = fusion_infor1.loc[j, 'chr_left'] left_position = fusion_infor1.loc[j, 'position_left'] #cal the VAF print(tumorname) VAF_right = VAF_fusion(laneid,tumorname, right_chr, abs(int(right_position)), inputpath) VAF_left= VAF_fusion(laneid,tumorname, left_chr, abs(int(left_position)), inputpath) VAF=max(VAF_right,VAF_left) print(VAF) if VAF<1: VAFnew=VAF else: VAFnew=0 fusion_infor1.loc[j, 'VAF'] = str(round(VAFnew*100,4)) + '%' if (fusion_infor1.loc[j, 'Trans_left'] == fusion_infor1.loc[j, 'Trans_right']) & ( fusion_infor1.loc[j, 'Trans_left'] == '3_5'): # for new name fusion_new = right_gene + '_' + right_exonintron + str( right_exonintron_id) + '-' + left_gene + '_' + left_exonintron + str(left_exonintron_id) fusion_infor1.loc[j, 'Fusion_new'] = fusion_new #change the left and right information fusion_infor1.loc[j, 'Gene_left'] = right_gene fusion_infor1.loc[j, 'chr_left'] = right_chr fusion_infor1.loc[j, 'position_left'] = right_position fusion_infor1.loc[j, 'exon_or_intron_left'] = right_exonintron fusion_infor1.loc[j, 'exon_or_intron_id_left'] = right_exonintron_id fusion_infor1.loc[j, 'Gene_right'] = left_gene fusion_infor1.loc[j, 'chr_right'] = left_chr fusion_infor1.loc[j, 'position_right'] = left_position fusion_infor1.loc[j, 'exon_or_intron_right'] = left_exonintron fusion_infor1.loc[j, 'exon_or_intron_id_right'] = left_exonintron_id elif (fusion_infor1.loc[j, 'Trans_left'] == fusion_infor1.loc[j, 'Trans_right']) & ( fusion_infor1.loc[j, 'Trans_left'] == '5_3'): # for new name fusion_new = left_gene + '_' + left_exonintron + str( left_exonintron_id) + '-' + right_gene + '_' + right_exonintron + str(right_exonintron_id) fusion_infor1.loc[j, 'Fusion_new'] = fusion_new elif fusion_infor1.loc[j, 'Trans_left'] != fusion_infor1.loc[j, 'Trans_right']: fusion_infor1.loc[j, 'Fusion_new'] = 'Neg' else: fusion_infor1.loc[j, 'Fusion_new'] = 'Neg' else: fusion_infor1=pd.DataFrame(columns=headlist) # output the data fusion_infor2 = fusion_infor1[headlist] fusion_infor2.rename(columns={'Fusion_new': 'Fusion'}, inplace=True) return fusion_infor2 ###过滤假阳性 def filter_black_fusion(fusionresult2): fusion_black_dir='/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/refdata/fusion_blacklist_20230403.txt' fusion_black=pd.read_table(fusion_black_dir,sep='\t',header=0) del fusion_black['Fusion'] del fusion_black['samplecounts'] fusionresult2['loc']=fusionresult2['chr_left'].astype('str')+'_'+fusionresult2['position_left'].astype('str')+'_'+fusionresult2['chr_right'].astype('str')+'_'+fusionresult2['position_right'].astype('str') fusionlabel=pd.merge(fusionresult2,fusion_black,on=['loc'],how='left') fusion_filter=fusionlabel[fusionlabel['label']!='black'] fusion_filter.reset_index(drop=True,inplace=True) del fusion_filter['loc'] del fusion_filter['label'] return fusion_filter ###for sample def fusion_GeneFusion_sample(inputpath,laneid,tumorname,sampleid): #inputpath = '/cgdata/liuxiangqiong/work62pancancer/fusiontest/lane1' tempfile=os.path.join(inputpath,'tempfile') if not os.path.exists(tempfile): os.mkdir(tempfile) fusion_tem_dir=os.path.join(tempfile,'fusion') if not os.path.exists(fusion_tem_dir): os.mkdir(fusion_tem_dir) GeneFusiondir = os.path.join(inputpath, '6Fusion_genefusion_unpair') #sampleid=tumorid[:-2] # make the result dir result_dir = os.path.join(inputpath, 'resultfile') if not os.path.exists(result_dir): os.mkdir(result_dir) sample_dir = os.path.join(result_dir, sampleid) if not os.path.exists(sample_dir): os.mkdir(sample_dir) tumor_jsonfile = GeneFusiondir + '/' + tumorname + '_report.json' tumor_htmlfile = GeneFusiondir + '/' + tumorname + '_report.html' if (os.path.exists(tumor_jsonfile)) & (os.path.exists(tumor_jsonfile)): fusionresult = fusion_extract(inputpath,laneid,tumor_jsonfile, tumor_htmlfile, tumorname) print(fusionresult) fusionresult.insert(0, 'sampleid', sampleid) #输出未过滤结果 outputfile0=os.path.join(fusion_tem_dir,laneid + '-' + sampleid + '.fusion_unfilter.txt') fusionresult.to_csv(outputfile0,sep='\t',index=False,header=True) #设置最后结果 Fusion_final=pd.DataFrame(columns=list(fusionresult.columns)) # 选择uniquer reads>=5的fusion,且有突变丰度的 fusionresult1 = fusionresult[fusionresult['Unique_reads'] >= 5] fusionresult2 = fusionresult1[(fusionresult1['VAF'] != '0%') & (fusionresult1['VAF'] != '0.0%')] fusionresult2.reset_index(drop=True, inplace=True) print(fusionresult2) if len(fusionresult2) == 0: Fusion_final.loc[0, 'sampleid'] = sampleid Fusion_final.loc[0, 'Fusion'] = 'no fusion' else: ###对fusion的表示进行修改CD74-ROS1 (intron6:intron33) #当有intron-1时,将其修改为基因间区 fusionresult2['Fusion']=fusionresult2['Fusion'].str.replace('intron-1', 'intergenic') fusionsplit1 = fusionresult2['Fusion'].str.split('-', expand=True) fusionsplit1_left = fusionsplit1[0].str.split('_', expand=True) fusionsplit1_right = fusionsplit1[1].str.split('_', expand=True) fusion_new = fusionsplit1_left[0] + '-' + fusionsplit1_right[0] + '(' + fusionsplit1_left[1] + ':' + \ fusionsplit1_right[1] + ')' fusionresult2.insert(1, 'Fusion_new', fusion_new) del fusionresult2['Fusion'] fusionresult2.rename(columns={'Fusion_new': 'Fusion'}, inplace=True) #将intron=-1的修改为基因间区 fusionresult2.reset_index(drop=True, inplace=True) for i in range(len(fusionresult2)): if fusionresult2.loc[i, 'exon_or_intron_id_left'] == -1: fusionresult2.loc[i, 'exon_or_intron_id_left'] = 'intergenic' elif fusionresult2.loc[i, 'exon_or_intron_id_right'] == -1: fusionresult2.loc[i, 'exon_or_intron_id_right'] = 'intergenic' #去除假阳性 fusionresult2_1=filter_black_fusion(fusionresult2) if len(fusionresult2_1)!=0: Fusion_final=fusionresult2_1 else: Fusion_final.loc[0, 'sampleid'] = sampleid Fusion_final.loc[0, 'Fusion'] = 'no fusion' outputfile1 = os.path.join(sample_dir, laneid + '-' + sampleid + '.fusion.xlsx') writer = pd.ExcelWriter(outputfile1) Fusion_final.to_excel(writer, sheet_name='fusion', index=False) writer.save() writer.close() else: print(tumorid+' is no data') Fusion_final=pd.DataFrame() Fusion_final.loc[0,'sampleid']=sampleid Fusion_final.loc[0,'Fusion']='No fusion file' # make the temp dir temp_dir = os.path.join(inputpath, 'tempfile') if not os.path.exists(temp_dir): os.mkdir(temp_dir) bugfile_dir = os.path.join(temp_dir, 'bugfile') if not os.path.exists(bugfile_dir): os.mkdir(bugfile_dir) outputfile2 = os.path.join(bugfile_dir, laneid + '-' + sampleid + '.fusion.nofile.log.txt') Fusion_final.to_csv(outputfile2, index=False, header=True, encoding='gbk', sep='\t') def cp_fusionhtml(inputpath,sampleid,tumorname,laneid): fusiondir = os.path.join(inputpath, '6Fusion_genefusion_unpair') rawhtml = os.path.join(fusiondir, tumorname + '_report.html') result_dir = os.path.join(inputpath, 'resultfile') if not os.path.exists(result_dir): os.mkdir(result_dir) sample_dir = os.path.join(result_dir, sampleid) if not os.path.exists(sample_dir): os.mkdir(sample_dir) fusionhtml = os.path.join(sample_dir, laneid + '-' + sampleid + '.fusion.html') copyfile='cp '+rawhtml+' '+fusionhtml os.system(copyfile) def fusion_runmain(inputpath,tumorname): laneid = inputpath.split('/')[-1].split('-')[-1] datasummarydir = os.path.join(inputpath, 'datasummary') isExists = os.path.exists(datasummarydir) if not isExists: os.makedirs(datasummarydir) sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt') samplelist = pd.read_table(sampledir, sep='\t', header=0) sampledata = samplelist[samplelist['tumor'] ==tumorname] sampledata.reset_index(drop=True, inplace=True) sampleid = sampledata.loc[0, 'samplename'] fusion_GeneFusion_sample(inputpath, laneid, tumorname, sampleid) cp_fusionhtml(inputpath, sampleid, tumorname, laneid) if __name__=='__main__': parser = argparse.ArgumentParser(description='filter the chemothereapy_runmain') parser.add_argument('-i', '--inputpath', type=str, help='the path of lane') parser.add_argument('-s', '--tumorname', type=str, help='the tumor name of sample') args = parser.parse_args() Inputpath = args.inputpath Tumorname = args.tumorname fusion_runmain(Inputpath,Tumorname) #for the result of GeneFusion def fusion_GeneFusion_sum(inputpath,laneid): datasummarydir = os.path.join(inputpath, 'datasummary') isExists = os.path.exists(datasummarydir) if not isExists: os.makedirs(datasummarydir) sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt') samplelist = pd.read_table(sampledir, sep='\t', header=0) genefusionsummary_last = pd.DataFrame() for i in range(len(samplelist)): sampleid = samplelist.loc[i,'samplename'] tumorname = samplelist.loc[i, 'tumor'] print(sampleid) fusionresult2=fusion_GeneFusion_sample(inputpath,laneid,tumorname,sampleid) genefusionsummary_last = genefusionsummary_last.append(fusionresult2) cp_fusionhtml(inputpath, sampleid, tumorname, laneid) outputname = datasummarydir + '/' + laneid+'_table8_fusion_datasummary.txt' genefusionsummary_last.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')