123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510 |
- #for the fusion summary by the Genefusion and Factera method
- #date:20220217
- import json
- import pandas as pd
- import sys,collections,math,os,os.path,re
- import xlrd
- import gzip
- from bs4 import BeautifulSoup
- import re
- import argparse
- #######acquire the transcrip strand form html
- #->mean the original strand;<-mean the recov
- pd.set_option('display.max_columns',6)
- def infro_html(htmlPath):
- '''
- extract the transcript strand form html
- → means original read, ← means reverse complement
- :param htmlPath: the genefuse html
- :return: the trans strand
- '''
- with open(htmlPath,"r",encoding="utf-8") as file:
- file1 = file.read()
- f1 = re.findall("<a name='fusion_id_.+'>.+, Fusion: (.+?) .+?<", file1)
- f2 = re.findall("<tr><td width='.+?%'>(.*?)</td>",file1)
- f3 = re.findall("</td><td width='.+?%'>(.*?)</td>", file1)
- f3_str = "<tr><td class='exon_left' width='.+?%'>.+" +\
- "|<tr><td class='intron_left' width='.+?%'>.+" +\
- "|<tr></tr></table>.+"
- f4 = re.findall(f3_str, file1)
- f5_str = "<tr><td class='exon_right' width='.+?%'>.+Supporting" +\
- "|<tr><td class='intron_right' width='.+?%'>.+Supporting" +\
- "|<tr></tr></table>.+Supporting"
- f5 = re.findall(f5_str, file1)
- print(len(f1),len(f2),len(f3),len(f4),len(f5))
- for i in f4:
- pass
- f4_1 = []
- f5_1 = []
- for i in range(len(f4)):
- if "←" in f4[i]:
- f4_1.append("3_5")
- elif "→" in f4[i]:
- f4_1.append("5_3")
- else:
- f4_1.append("no")
- if "←" in f5[i]:
- f5_1.append("3_5")
- elif "→" in f5[i]:
- f5_1.append("5_3")
- else:
- f5_1.append("no")
- data = pd.DataFrame([f1,f2,f4_1,f3,f5_1])
- data = data.T
- data.columns = ["Fusion","Gene_left","Trans_left","Gene_right","Trans_right"]
- return data
- ####
- def infro_json(filename):
- '''
- extract the information form json
- :param filename: the genefuse json
- :return: all the fusion information
- '''
- with open(filename) as f:
- fusionlist = json.load(f)
- result = pd.DataFrame()
- fusionlist_data = fusionlist['fusions']
- if fusionlist_data:
- for key, value in fusionlist_data.items():
- # for left
- df_left = pd.DataFrame.from_dict(value['left'], orient='index', columns=['values'])
- # change row index
- df_left_renamed = df_left.rename(
- index={'gene_name': 'Gene_left', 'gene_chr': 'chr_left', 'position': 'position_left',
- 'reference': 'reference_left', 'ref_ext': 'ref_ext_left', 'pos_str': 'pos_str_left',
- 'exon_or_intron': 'exon_or_intron_left', 'exon_or_intron_id': 'exon_or_intron_id_left',
- 'strand': 'strand_left'})
- # for right
- df_right = pd.DataFrame.from_dict(value['right'], orient='index', columns=['values'])
- # change the row index
- df_right_renamed = df_right.rename(
- index={'gene_name': 'Gene_right', 'gene_chr': 'chr_right', 'position': 'position_right',
- 'reference': 'reference_right', 'ref_ext': 'ref_ext_right', 'pos_str': 'pos_str_right',
- 'exon_or_intron': 'exon_or_intron_right', 'exon_or_intron_id': 'exon_or_intron_id_right',
- 'strand': 'strand_right'})
- # extract unique reads
- uniquereads = pd.DataFrame(columns=['values'])
- uniquereads.loc['Unique_reads'] = value['unique']
- # merge
- fusioninfor1 = df_left_renamed.append(df_right_renamed)
- fusioninfor2 = fusioninfor1.append(uniquereads)
- # trans the result
- fusioninfor3 = pd.DataFrame(fusioninfor2.values.T, index=fusioninfor2.columns,
- columns=fusioninfor2.index)
- fusioninfor3.insert(0, 'Fusion1', key)
- result = result.append(fusioninfor3)
- result = result.reset_index()
- del result['index']
- result['Total_reads'] = (
- (result['Fusion1'].str.split(' ', expand=True)[4]).str.split(',', expand=True)[0]).astype("int")
- result.insert(0, 'Fusion', result['Fusion1'].str.split(' ', expand=True)[1])
- del result['Fusion1']
- else:
- namelist=['Fusion', 'Gene_left', 'chr_left', 'position_left', 'reference_left','ref_ext_left', 'pos_str_left', 'exon_or_intron_left',
- 'exon_or_intron_id_left', 'strand_left', 'Gene_right', 'chr_right','position_right', 'reference_right', 'ref_ext_right',
- 'pos_str_right','exon_or_intron_right', 'exon_or_intron_id_right', 'strand_right', 'Unique_reads', 'Total_reads']
- result=pd.DataFrame(columns=namelist)
- return result
- ###t提取genefuse的序列
- def infro_txt(Path):
- '''
- extract the transcript strand form html
- → means original read, ← means reverse complement
- :param htmlPath: the genefuse html
- :return: the trans strand
- '''
- with open(Path,"r",encoding="utf-8") as file:
- file1 = file.read()
- t1 = re.findall("#Fusion: (.+?) .+?total: (.+?),.+", file1)
- num=0
- for i in t1:
- num = num+int(i[1])
- if num!=0:
- t2 = re.findall("name: (.+)", file1)
- if num==len(t2):
- t3 = re.findall("\n[ATCG].+", file1)
- t1_1 = []
- for i in t1:
- for i1 in range(int(i[1])):
- t1_1.append(i[0])
- data = pd.DataFrame([t1_1,t2,t3])
- data = data.T
- data.columns = ["Fusion","Name",""]
- return data
- else:
- data = pd.DataFrame(columns=["Fusion", "Name", "Seq"])
- print(os.path.basename(Path)+" error:The number of fusion-names is inconsistent")
- return data
- else:
- data = pd.DataFrame(columns=["Fusion", "Name", "Seq"])
- print(os.path.basename(Path)+":File no result")
- return data
- ######对VAF的计算
- #1.获得genefuse的readid
- #2.我们将断点区域上下各扩展50bp
- ##3.提取genefuse与bam中断点位置overlap的reads
- def VAF_fusion(laneid,sampleid,fusion_chr,fusion_position,inputpath):
- GeneFusiondir = os.path.join(inputpath, '6Fusion_genefusion_unpair')
- # acqure the read in fusion region
- fusion_loc_ex50=str(fusion_chr) + ':' + str(fusion_position-50) + '-' + str(fusion_position+ 50)
- pandir = os.path.join('/cgdata/pancancer/analyse',laneid+'/bamfile')
- bamdir =os.path.join(pandir, sampleid + '_clean.bam')
- bed_file = GeneFusiondir + '/' + sampleid + '.' + fusion_loc_ex50 + '.bed'
- read_bedcmd = 'samtools view -h ' + bamdir + ' ' + fusion_loc_ex50 + ' | ' + 'samtools view -bS - | bedtools bamtobed -i - ' + '>' + bed_file
- os.system(read_bedcmd)
- # readin the result
- read_region = pd.read_table(bed_file, sep='\t', names=['chr', 'start', 'end', 'readid', 'len', 'strand'])
- ##3.提取genefuse与bam中断点位置overlap的reads
- txtfile = GeneFusiondir + '/' + sampleid + '_fusion_GeneFusion.txt'
- genefuse_result = infro_txt(txtfile)
- # rename
- readinfor = genefuse_result['Name'].str.split(' ', expand=True)
- gfusereadid = readinfor[0] + '/' + (readinfor[1].str.split(':', expand=True))[0]
- gfusereadid1 = pd.DataFrame(gfusereadid)
- gfusereadid1.rename(columns={0: 'readid'}, inplace=True)
- # merge
- read_overlap = pd.merge(gfusereadid1, read_region, on='readid', how='inner')
- ###acquire the fusion depths
- depthoutdir = GeneFusiondir + '/' + sampleid + '_' + fusion_loc_ex50 + '.txt'
- depthcomd = 'samtools depth -r ' + fusion_loc_ex50+ ' ' + bamdir + '>' + depthoutdir
- os.system(depthcomd)
- depthdata = pd.read_table(depthoutdir, sep='\t', names=['chr', 'positon', 'total_reads'])
- #filter the depth
- depthdata['chr'] = depthdata['chr'].astype(str)
- filter1 = depthdata[(depthdata['chr'] == fusion_chr) & (depthdata['positon'] == fusion_position)]
- filter1.reset_index(drop=True, inplace=True)
- if filter1.shape[0]!=0:
- Total_depth = filter1.loc[0, 'total_reads']
- fusion_readcounts = len(read_overlap)
- VAF = round(fusion_readcounts / Total_depth, 4)
- print('round4VAF '+str(VAF))
- else:
- VAF=0
- os.remove(bed_file)
- os.remove(depthoutdir)
- return VAF
- #fusion_chr=6
- #fusion_position=117647036
- #VAF_fusion(sampleid,fusion_chr,fusion_position,inputpath)
- #####修改,转录本方向改变的时候为3-5时,对应的left和right也需要同时进行修改
- def fusion_extract(inputpath,laneid,jsonfile,htmlfile,tumorname):
- '''
- ####extract the fusion information form the json and html
- ###and calculate the VAF
- :param jsonfile: the json file of genefuse
- :param htmlfile: the html file of genefuse
- :return: the information of fusion
- '''
- headlist = [ 'Fusion_new', 'VAF', 'Gene_left', 'chr_left', 'position_left', 'exon_or_intron_left',
- 'exon_or_intron_id_left', 'Gene_right', 'chr_right', 'position_right', 'exon_or_intron_right',
- 'exon_or_intron_id_right', 'Unique_reads', 'Total_reads']
- #extract the json file
- json_data = infro_json(jsonfile)
- #extract the html file
- html_data = pd.DataFrame(infro_html(htmlfile))
- if (json_data.shape[0]!=0) & (html_data.shape[0]!=0) :
- #获得结果均有转录本,且转录本方向一致的结果
- #html_data01 = html_data[html_data['Trans_left'] == html_data['Trans_right']]
- #当其中一个的转录本方向不定,结果为空,
- #html_data02 = html_data01[html_data01['Trans_left'] != 'no']
- #html_data03 = html_data02[html_data01['Trans_right'] != 'no']
- #html_data03.reset_index(drop=True, inplace=True)
- # merge the data
- fusion_infor1 = pd.merge(json_data, html_data, on=["Fusion", "Gene_left", "Gene_right"], how='right')
- #fusion_infor1 = pd.merge(json_data, html_data03, on=["Fusion", "Gene_left", "Gene_right"], how='right')
- fusion_infor1['Gene_right'] = fusion_infor1['Gene_right'].str.split('_', expand=True)[0]
- fusion_infor1['Gene_left'] = fusion_infor1['Gene_left'].str.split('_', expand=True)[0]
- for j in range(len(fusion_infor1)):
- print(j)
- #print(fusion_infor1.loc[j, :])
- unique_reads = fusion_infor1.loc[j, 'Unique_reads']
- right_gene = fusion_infor1.loc[j, 'Gene_right']
- right_exonintron = fusion_infor1.loc[j, 'exon_or_intron_right']
- right_exonintron_id = fusion_infor1.loc[j, 'exon_or_intron_id_right']
- right_chr = fusion_infor1.loc[j, 'chr_right']
- right_position = fusion_infor1.loc[j, 'position_right']
- left_gene = fusion_infor1.loc[j, 'Gene_left']
- left_exonintron = fusion_infor1.loc[j, 'exon_or_intron_left']
- left_exonintron_id = fusion_infor1.loc[j, 'exon_or_intron_id_left']
- left_chr = fusion_infor1.loc[j, 'chr_left']
- left_position = fusion_infor1.loc[j, 'position_left']
- #cal the VAF
- print(tumorname)
- VAF_right = VAF_fusion(laneid,tumorname, right_chr, abs(int(right_position)), inputpath)
- VAF_left= VAF_fusion(laneid,tumorname, left_chr, abs(int(left_position)), inputpath)
- VAF=max(VAF_right,VAF_left)
- print(VAF)
- if VAF<1:
- VAFnew=VAF
- else:
- VAFnew=0
- fusion_infor1.loc[j, 'VAF'] = str(round(VAFnew*100,4)) + '%'
- if (fusion_infor1.loc[j, 'Trans_left'] == fusion_infor1.loc[j, 'Trans_right']) & (
- fusion_infor1.loc[j, 'Trans_left'] == '3_5'):
- # for new name
- fusion_new = right_gene + '_' + right_exonintron + str(
- right_exonintron_id) + '-' + left_gene + '_' + left_exonintron + str(left_exonintron_id)
- fusion_infor1.loc[j, 'Fusion_new'] = fusion_new
- #change the left and right information
- fusion_infor1.loc[j, 'Gene_left'] = right_gene
- fusion_infor1.loc[j, 'chr_left'] = right_chr
- fusion_infor1.loc[j, 'position_left'] = right_position
- fusion_infor1.loc[j, 'exon_or_intron_left'] = right_exonintron
- fusion_infor1.loc[j, 'exon_or_intron_id_left'] = right_exonintron_id
- fusion_infor1.loc[j, 'Gene_right'] = left_gene
- fusion_infor1.loc[j, 'chr_right'] = left_chr
- fusion_infor1.loc[j, 'position_right'] = left_position
- fusion_infor1.loc[j, 'exon_or_intron_right'] = left_exonintron
- fusion_infor1.loc[j, 'exon_or_intron_id_right'] = left_exonintron_id
- elif (fusion_infor1.loc[j, 'Trans_left'] == fusion_infor1.loc[j, 'Trans_right']) & (
- fusion_infor1.loc[j, 'Trans_left'] == '5_3'):
- # for new name
- fusion_new = left_gene + '_' + left_exonintron + str(
- left_exonintron_id) + '-' + right_gene + '_' + right_exonintron + str(right_exonintron_id)
- fusion_infor1.loc[j, 'Fusion_new'] = fusion_new
- elif fusion_infor1.loc[j, 'Trans_left'] != fusion_infor1.loc[j, 'Trans_right']:
- fusion_infor1.loc[j, 'Fusion_new'] = 'Neg'
- else:
- fusion_infor1.loc[j, 'Fusion_new'] = 'Neg'
- else:
- fusion_infor1=pd.DataFrame(columns=headlist)
- # output the data
- fusion_infor2 = fusion_infor1[headlist]
- fusion_infor2.rename(columns={'Fusion_new': 'Fusion'}, inplace=True)
- return fusion_infor2
- def fusion_extract_v0test(inputpath,laneid,jsonfile,htmlfile,tumorname):
- '''
- ####extract the fusion information form the json and html
- ###and calculate the VAF
- :param jsonfile: the json file of genefuse
- :param htmlfile: the html file of genefuse
- :return: the information of fusion
- '''
- headlist = [ 'Fusion_new', 'VAF', 'Gene_left', 'chr_left', 'position_left', 'exon_or_intron_left',
- 'exon_or_intron_id_left', 'Gene_right', 'chr_right', 'position_right', 'exon_or_intron_right',
- 'exon_or_intron_id_right', 'Unique_reads', 'Total_reads']
- #extract the json file
- json_data = infro_json(jsonfile)
- #extract the html file
- html_data = pd.DataFrame(infro_html(htmlfile))
- if (json_data.shape[0]!=0) & (html_data.shape[0]!=0) :
- #获得结果均有转录本,且转录本方向一致的结果
- html_data01 = html_data[html_data['Trans_left'] == html_data['Trans_right']]
- #当其中一个的转录本方向不定,结果为空,
- html_data02 = html_data01[html_data01['Trans_left'] != 'no']
- html_data03 = html_data02[html_data01['Trans_right'] != 'no']
- html_data03.reset_index(drop=True, inplace=True)
- # merge the data
- fusion_infor1 = pd.merge(json_data, html_data03, on=["Fusion", "Gene_left", "Gene_right"], how='right')
- fusion_infor1['Gene_right'] = fusion_infor1['Gene_right'].str.split('_', expand=True)[0]
- fusion_infor1['Gene_left'] = fusion_infor1['Gene_left'].str.split('_', expand=True)[0]
- for j in range(len(fusion_infor1)):
- print(j)
- #print(fusion_infor1.loc[j, :])
- unique_reads = fusion_infor1.loc[j, 'Unique_reads']
- right_gene = fusion_infor1.loc[j, 'Gene_right']
- right_exonintron = fusion_infor1.loc[j, 'exon_or_intron_right']
- right_exonintron_id = fusion_infor1.loc[j, 'exon_or_intron_id_right']
- right_chr = fusion_infor1.loc[j, 'chr_right']
- right_position = fusion_infor1.loc[j, 'position_right']
- left_gene = fusion_infor1.loc[j, 'Gene_left']
- left_exonintron = fusion_infor1.loc[j, 'exon_or_intron_left']
- left_exonintron_id = fusion_infor1.loc[j, 'exon_or_intron_id_left']
- left_chr = fusion_infor1.loc[j, 'chr_left']
- left_position = fusion_infor1.loc[j, 'position_left']
- #cal the VAF
- print(tumorname)
- VAF_right = VAF_fusion(laneid,tumorname, right_chr, abs(int(right_position)), inputpath)
- VAF_left= VAF_fusion(laneid,tumorname, left_chr, abs(int(left_position)), inputpath)
- VAF=max(VAF_right,VAF_left)
- print(VAF)
- if VAF<1:
- VAFnew=VAF
- else:
- VAFnew=0
- fusion_infor1.loc[j, 'VAF'] = str(round(VAFnew*100,4)) + '%'
- if (fusion_infor1.loc[j, 'Trans_left'] == fusion_infor1.loc[j, 'Trans_right']) & (
- fusion_infor1.loc[j, 'Trans_left'] == '3_5'):
- # for new name
- fusion_new = right_gene + '_' + right_exonintron + str(
- right_exonintron_id) + '-' + left_gene + '_' + left_exonintron + str(left_exonintron_id)
- fusion_infor1.loc[j, 'Fusion_new'] = fusion_new
- #change the left and right information
- fusion_infor1.loc[j, 'Gene_left'] = right_gene
- fusion_infor1.loc[j, 'chr_left'] = right_chr
- fusion_infor1.loc[j, 'position_left'] = right_position
- fusion_infor1.loc[j, 'exon_or_intron_left'] = right_exonintron
- fusion_infor1.loc[j, 'exon_or_intron_id_left'] = right_exonintron_id
- fusion_infor1.loc[j, 'Gene_right'] = left_gene
- fusion_infor1.loc[j, 'chr_right'] = left_chr
- fusion_infor1.loc[j, 'position_right'] = left_position
- fusion_infor1.loc[j, 'exon_or_intron_right'] = left_exonintron
- fusion_infor1.loc[j, 'exon_or_intron_id_right'] = left_exonintron_id
- elif (fusion_infor1.loc[j, 'Trans_left'] == fusion_infor1.loc[j, 'Trans_right']) & (
- fusion_infor1.loc[j, 'Trans_left'] == '5_3'):
- # for new name
- fusion_new = left_gene + '_' + left_exonintron + str(
- left_exonintron_id) + '-' + right_gene + '_' + right_exonintron + str(right_exonintron_id)
- fusion_infor1.loc[j, 'Fusion_new'] = fusion_new
- elif fusion_infor1.loc[j, 'Trans_left'] != fusion_infor1.loc[j, 'Trans_right']:
- fusion_infor1.loc[j, 'Fusion_new'] = 'Neg'
- else:
- fusion_infor1.loc[j, 'Fusion_new'] = 'Neg'
- else:
- fusion_infor1=pd.DataFrame(columns=headlist)
- # output the data
- fusion_infor2 = fusion_infor1[headlist]
- fusion_infor2.rename(columns={'Fusion_new': 'Fusion'}, inplace=True)
- return fusion_infor2
- ###for sample
- def fusion_GeneFusion_sample(inputpath,laneid,tumorname,sampleid):
- #inputpath = '/cgdata/liuxiangqiong/work62pancancer/fusiontest/lane1'
- GeneFusiondir = os.path.join(inputpath, '6Fusion_genefusion_unpair')
- #sampleid=tumorid[:-2]
- # make the result dir
- result_dir = os.path.join(inputpath, 'resultfile')
- if not os.path.exists(result_dir):
- os.mkdir(result_dir)
- sample_dir = os.path.join(result_dir, sampleid)
- if not os.path.exists(sample_dir):
- os.mkdir(sample_dir)
- tumor_jsonfile = GeneFusiondir + '/' + tumorname + '_report.json'
- tumor_htmlfile = GeneFusiondir + '/' + tumorname + '_report.html'
- if (os.path.exists(tumor_jsonfile)) & (os.path.exists(tumor_jsonfile)):
- fusionresult = fusion_extract(inputpath,laneid,tumor_jsonfile, tumor_htmlfile, tumorname)
- print(fusionresult)
- fusionresult.insert(0, 'sampleid', sampleid)
- # 选择uniquer reads>=5的fusion,且有突变丰度的
- fusionresult1 = fusionresult[fusionresult['Unique_reads'] >= 5]
- fusionresult2 = fusionresult1[(fusionresult1['VAF'] != '0%') & (fusionresult1['VAF'] != '0.0%')]
- fusionresult2.reset_index(drop=True, inplace=True)
- print(fusionresult2)
- if len(fusionresult2) == 0:
- fusionresult2.loc[0, 'sampleid'] = sampleid
- fusionresult2.loc[0, 'Fusion'] = 'no fusion'
- else:
- ###对fusion的表示进行修改CD74-ROS1 (intron6:intron33)
- #当有intron-1时,将其修改为基因间区
- fusionresult2['Fusion']=fusionresult2['Fusion'].str.replace('intron-1', 'intergenic')
- fusionsplit1 = fusionresult2['Fusion'].str.split('-', expand=True)
- fusionsplit1_left = fusionsplit1[0].str.split('_', expand=True)
- fusionsplit1_right = fusionsplit1[1].str.split('_', expand=True)
- fusion_new = fusionsplit1_left[0] + '-' + fusionsplit1_right[0] + '(' + fusionsplit1_left[1] + ':' + \
- fusionsplit1_right[1] + ')'
- fusionresult2.insert(1, 'Fusion_new', fusion_new)
- del fusionresult2['Fusion']
- fusionresult2.rename(columns={'Fusion_new': 'Fusion'}, inplace=True)
- #将intron=-1的修改为基因间区
- fusionresult2.reset_index(drop=True, inplace=True)
- for i in range(len(fusionresult2)):
- if fusionresult2.loc[i, 'exon_or_intron_id_left'] == -1:
- fusionresult2.loc[i, 'exon_or_intron_id_left'] = 'intergenic'
- elif fusionresult2.loc[i, 'exon_or_intron_id_right'] == -1:
- fusionresult2.loc[i, 'exon_or_intron_id_right'] = 'intergenic'
- outputfile1 = os.path.join(sample_dir, laneid + '-' + sampleid + '.fusion.xlsx')
- writer = pd.ExcelWriter(outputfile1)
- fusionresult2.to_excel(writer, sheet_name='fusion', index=False)
- writer.save()
- writer.close()
- else:
- print(tumorid+' is no data')
- fusionresult2=pd.DataFrame()
- fusionresult2.loc[0,'sampleid']=sampleid
- fusionresult2.loc[0,'Fusion']='No fusion file'
- # make the temp dir
- temp_dir = os.path.join(inputpath, 'tempfile')
- if not os.path.exists(temp_dir):
- os.mkdir(temp_dir)
- bugfile_dir = os.path.join(temp_dir, 'bugfile')
- if not os.path.exists(bugfile_dir):
- os.mkdir(bugfile_dir)
- outputfile2 = os.path.join(bugfile_dir, laneid + '-' + sampleid + '.fusion.nofile.log.txt')
- fusionresult2.to_csv(outputfile2, index=False, header=True, encoding='gbk', sep='\t')
- def cp_fusionhtml(inputpath,sampleid,tumorname,laneid):
- fusiondir = os.path.join(inputpath, '6Fusion_genefusion_unpair')
- rawhtml = os.path.join(fusiondir, tumorname + '_report.html')
- result_dir = os.path.join(inputpath, 'resultfile')
- if not os.path.exists(result_dir):
- os.mkdir(result_dir)
- sample_dir = os.path.join(result_dir, sampleid)
- if not os.path.exists(sample_dir):
- os.mkdir(sample_dir)
- fusionhtml = os.path.join(sample_dir, laneid + '-' + sampleid + '.fusion.html')
- copyfile='cp '+rawhtml+' '+fusionhtml
- os.system(copyfile)
- def fusion_runmain(inputpath,tumorname):
- laneid = inputpath.split('/')[-1].split('-')[-1]
- datasummarydir = os.path.join(inputpath, 'datasummary')
- isExists = os.path.exists(datasummarydir)
- if not isExists:
- os.makedirs(datasummarydir)
- sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
- samplelist = pd.read_table(sampledir, sep='\t', header=0)
- sampledata = samplelist[samplelist['tumor'] ==tumorname]
- sampledata.reset_index(drop=True, inplace=True)
- sampleid = sampledata.loc[0, 'samplename']
- fusion_GeneFusion_sample(inputpath, laneid, tumorname, sampleid)
- cp_fusionhtml(inputpath, sampleid, tumorname, laneid)
- if __name__=='__main__':
- parser = argparse.ArgumentParser(description='filter the chemothereapy_runmain')
- parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
- parser.add_argument('-s', '--tumorname', type=str, help='the tumor name of sample')
- args = parser.parse_args()
- Inputpath = args.inputpath
- Tumorname = args.tumorname
- fusion_runmain(Inputpath,Tumorname)
- #for the result of GeneFusion
- def fusion_GeneFusion_sum(inputpath,laneid):
- datasummarydir = os.path.join(inputpath, 'datasummary')
- isExists = os.path.exists(datasummarydir)
- if not isExists:
- os.makedirs(datasummarydir)
- sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
- samplelist = pd.read_table(sampledir, sep='\t', header=0)
- genefusionsummary_last = pd.DataFrame()
- for i in range(len(samplelist)):
- sampleid = samplelist.loc[i,'samplename']
- tumorname = samplelist.loc[i, 'tumor']
- print(sampleid)
- fusionresult2=fusion_GeneFusion_sample(inputpath,laneid,tumorname,sampleid)
- genefusionsummary_last = genefusionsummary_last.append(fusionresult2)
- cp_fusionhtml(inputpath, sampleid, tumorname, laneid)
- outputname = datasummarydir + '/' + laneid+'_table8_fusion_datasummary.txt'
- genefusionsummary_last.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')
|