datafile_fusion_v0_20230403_finish.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455
  1. #for the fusion summary by the Genefusion and Factera method
  2. #date:20220217
  3. import json
  4. import pandas as pd
  5. import sys,collections,math,os,os.path,re
  6. import xlrd
  7. import gzip
  8. from bs4 import BeautifulSoup
  9. import re
  10. import argparse
# Acquire the transcript strand from the GeneFuse HTML report.
# "->" marks the original strand; "<-" marks the reverse complement.
  13. pd.set_option('display.max_columns',6)
  14. def infro_html(htmlPath):
  15. '''
  16. extract the transcript strand form html
  17. → means original read, ← means reverse complement
  18. :param htmlPath: the genefuse html
  19. :return: the trans strand
  20. '''
  21. with open(htmlPath,"r",encoding="utf-8") as file:
  22. file1 = file.read()
  23. f1 = re.findall("<a name='fusion_id_.+'>.+, Fusion: (.+?) .+?<", file1)
  24. f2 = re.findall("<tr><td width='.+?%'>(.*?)</td>",file1)
  25. f3 = re.findall("</td><td width='.+?%'>(.*?)</td>", file1)
  26. f3_str = "<tr><td class='exon_left' width='.+?%'>.+" +\
  27. "|<tr><td class='intron_left' width='.+?%'>.+" +\
  28. "|<tr></tr></table>.+"
  29. f4 = re.findall(f3_str, file1)
  30. f5_str = "<tr><td class='exon_right' width='.+?%'>.+Supporting" +\
  31. "|<tr><td class='intron_right' width='.+?%'>.+Supporting" +\
  32. "|<tr></tr></table>.+Supporting"
  33. f5 = re.findall(f5_str, file1)
  34. print(len(f1),len(f2),len(f3),len(f4),len(f5))
  35. for i in f4:
  36. pass
  37. f4_1 = []
  38. f5_1 = []
  39. for i in range(len(f4)):
  40. if "←" in f4[i]:
  41. f4_1.append("3_5")
  42. elif "→" in f4[i]:
  43. f4_1.append("5_3")
  44. else:
  45. f4_1.append("no")
  46. if "←" in f5[i]:
  47. f5_1.append("3_5")
  48. elif "→" in f5[i]:
  49. f5_1.append("5_3")
  50. else:
  51. f5_1.append("no")
  52. data = pd.DataFrame([f1,f2,f4_1,f3,f5_1])
  53. data = data.T
  54. data.columns = ["Fusion","Gene_left","Trans_left","Gene_right","Trans_right"]
  55. return data
  56. ####
  57. def infro_json(filename):
  58. '''
  59. extract the information form json
  60. :param filename: the genefuse json
  61. :return: all the fusion information
  62. '''
  63. with open(filename) as f:
  64. fusionlist = json.load(f)
  65. result = pd.DataFrame()
  66. fusionlist_data = fusionlist['fusions']
  67. if fusionlist_data:
  68. for key, value in fusionlist_data.items():
  69. # for left
  70. df_left = pd.DataFrame.from_dict(value['left'], orient='index', columns=['values'])
  71. # change row index
  72. df_left_renamed = df_left.rename(
  73. index={'gene_name': 'Gene_left', 'gene_chr': 'chr_left', 'position': 'position_left',
  74. 'reference': 'reference_left', 'ref_ext': 'ref_ext_left', 'pos_str': 'pos_str_left',
  75. 'exon_or_intron': 'exon_or_intron_left', 'exon_or_intron_id': 'exon_or_intron_id_left',
  76. 'strand': 'strand_left'})
  77. # for right
  78. df_right = pd.DataFrame.from_dict(value['right'], orient='index', columns=['values'])
  79. # change the row index
  80. df_right_renamed = df_right.rename(
  81. index={'gene_name': 'Gene_right', 'gene_chr': 'chr_right', 'position': 'position_right',
  82. 'reference': 'reference_right', 'ref_ext': 'ref_ext_right', 'pos_str': 'pos_str_right',
  83. 'exon_or_intron': 'exon_or_intron_right', 'exon_or_intron_id': 'exon_or_intron_id_right',
  84. 'strand': 'strand_right'})
  85. # extract unique reads
  86. uniquereads = pd.DataFrame(columns=['values'])
  87. uniquereads.loc['Unique_reads'] = value['unique']
  88. # merge
  89. fusioninfor1 = df_left_renamed.append(df_right_renamed)
  90. fusioninfor2 = fusioninfor1.append(uniquereads)
  91. # trans the result
  92. fusioninfor3 = pd.DataFrame(fusioninfor2.values.T, index=fusioninfor2.columns,
  93. columns=fusioninfor2.index)
  94. fusioninfor3.insert(0, 'Fusion1', key)
  95. result = result.append(fusioninfor3)
  96. result = result.reset_index()
  97. del result['index']
  98. result['Total_reads'] = (
  99. (result['Fusion1'].str.split(' ', expand=True)[4]).str.split(',', expand=True)[0]).astype("int")
  100. result.insert(0, 'Fusion', result['Fusion1'].str.split(' ', expand=True)[1])
  101. del result['Fusion1']
  102. else:
  103. namelist=['Fusion', 'Gene_left', 'chr_left', 'position_left', 'reference_left','ref_ext_left', 'pos_str_left', 'exon_or_intron_left',
  104. 'exon_or_intron_id_left', 'strand_left', 'Gene_right', 'chr_right','position_right', 'reference_right', 'ref_ext_right',
  105. 'pos_str_right','exon_or_intron_right', 'exon_or_intron_id_right', 'strand_right', 'Unique_reads', 'Total_reads']
  106. result=pd.DataFrame(columns=namelist)
  107. return result
# Extract the read sequences reported by GeneFuse.
  109. def infro_txt(Path):
  110. '''
  111. extract the transcript strand form html
  112. → means original read, ← means reverse complement
  113. :param htmlPath: the genefuse html
  114. :return: the trans strand
  115. '''
  116. with open(Path,"r",encoding="utf-8") as file:
  117. file1 = file.read()
  118. t1 = re.findall("#Fusion: (.+?) .+?total: (.+?),.+", file1)
  119. num=0
  120. for i in t1:
  121. num = num+int(i[1])
  122. if num!=0:
  123. t2 = re.findall("name: (.+)", file1)
  124. if num==len(t2):
  125. t3 = re.findall("\n[ATCG].+", file1)
  126. t1_1 = []
  127. for i in t1:
  128. for i1 in range(int(i[1])):
  129. t1_1.append(i[0])
  130. data = pd.DataFrame([t1_1,t2,t3])
  131. data = data.T
  132. data.columns = ["Fusion","Name",""]
  133. return data
  134. else:
  135. data = pd.DataFrame(columns=["Fusion", "Name", "Seq"])
  136. print(os.path.basename(Path)+" error:The number of fusion-names is inconsistent")
  137. return data
  138. else:
  139. data = pd.DataFrame(columns=["Fusion", "Name", "Seq"])
  140. print(os.path.basename(Path)+":File no result")
  141. return data
# VAF calculation:
# 1. Get the read ids reported by GeneFuse.
# 2. Extend the breakpoint region by 50 bp upstream and downstream.
# 3. Extract the GeneFuse reads that overlap the breakpoint region in the BAM.
  146. def VAF_fusion(laneid,sampleid,fusion_chr,fusion_position,inputpath):
  147. GeneFusiondir = os.path.join(inputpath, '6Fusion_genefusion_unpair')
  148. # acqure the read in fusion region
  149. fusion_loc_ex50=str(fusion_chr) + ':' + str(fusion_position-50) + '-' + str(fusion_position+ 50)
  150. pandir = os.path.join('/cgdata/pancancer/analyse',laneid+'/bamfile')
  151. bamdir =os.path.join(pandir, sampleid + '_clean.bam')
  152. bed_file = GeneFusiondir + '/' + sampleid + '.' + fusion_loc_ex50 + '.bed'
  153. read_bedcmd = 'samtools view -h ' + bamdir + ' ' + fusion_loc_ex50 + ' | ' + 'samtools view -bS - | bedtools bamtobed -i - ' + '>' + bed_file
  154. os.system(read_bedcmd)
  155. # readin the result
  156. read_region = pd.read_table(bed_file, sep='\t', names=['chr', 'start', 'end', 'readid', 'len', 'strand'])
  157. ##3.提取genefuse与bam中断点位置overlap的reads
  158. txtfile = GeneFusiondir + '/' + sampleid + '_fusion_GeneFusion.txt'
  159. genefuse_result = infro_txt(txtfile)
  160. # rename
  161. readinfor = genefuse_result['Name'].str.split(' ', expand=True)
  162. gfusereadid = readinfor[0] + '/' + (readinfor[1].str.split(':', expand=True))[0]
  163. gfusereadid1 = pd.DataFrame(gfusereadid)
  164. gfusereadid1.rename(columns={0: 'readid'}, inplace=True)
  165. # merge
  166. read_overlap = pd.merge(gfusereadid1, read_region, on='readid', how='inner')
  167. ###acquire the fusion depths
  168. depthoutdir = GeneFusiondir + '/' + sampleid + '_' + fusion_loc_ex50 + '.txt'
  169. depthcomd = 'samtools depth -r ' + fusion_loc_ex50+ ' ' + bamdir + '>' + depthoutdir
  170. os.system(depthcomd)
  171. depthdata = pd.read_table(depthoutdir, sep='\t', names=['chr', 'positon', 'total_reads'])
  172. #filter the depth
  173. depthdata['chr'] = depthdata['chr'].astype(str)
  174. filter1 = depthdata[(depthdata['chr'] == fusion_chr) & (depthdata['positon'] == fusion_position)]
  175. filter1.reset_index(drop=True, inplace=True)
  176. if filter1.shape[0]!=0:
  177. Total_depth = filter1.loc[0, 'total_reads']
  178. fusion_readcounts = len(read_overlap)
  179. VAF = round(fusion_readcounts / Total_depth, 4)
  180. print('round4VAF '+str(VAF))
  181. else:
  182. VAF=0
  183. os.remove(bed_file)
  184. os.remove(depthoutdir)
  185. return VAF
  186. #fusion_chr=6
  187. #fusion_position=117647036
  188. #VAF_fusion(sampleid,fusion_chr,fusion_position,inputpath)
# Change: when the transcript direction is 3_5, the left and right information must be swapped as well.
  190. def fusion_extract(inputpath,laneid,jsonfile,htmlfile,tumorname):
  191. '''
  192. ####extract the fusion information form the json and html
  193. ###and calculate the VAF
  194. :param jsonfile: the json file of genefuse
  195. :param htmlfile: the html file of genefuse
  196. :return: the information of fusion
  197. '''
  198. headlist = [ 'Fusion_new', 'VAF', 'Gene_left', 'chr_left', 'position_left', 'exon_or_intron_left',
  199. 'exon_or_intron_id_left', 'Gene_right', 'chr_right', 'position_right', 'exon_or_intron_right',
  200. 'exon_or_intron_id_right', 'Unique_reads', 'Total_reads']
  201. #extract the json file
  202. json_data = infro_json(jsonfile)
  203. #extract the html file
  204. html_data = pd.DataFrame(infro_html(htmlfile))
  205. if (json_data.shape[0]!=0) & (html_data.shape[0]!=0) :
  206. #获得结果均有转录本,且转录本方向一致的结果
  207. #html_data01 = html_data[html_data['Trans_left'] == html_data['Trans_right']]
  208. #当其中一个的转录本方向不定,结果为空,
  209. #html_data02 = html_data01[html_data01['Trans_left'] != 'no']
  210. #html_data03 = html_data02[html_data01['Trans_right'] != 'no']
  211. #html_data03.reset_index(drop=True, inplace=True)
  212. # merge the data
  213. fusion_infor1 = pd.merge(json_data, html_data, on=["Fusion", "Gene_left", "Gene_right"], how='right')
  214. #fusion_infor1 = pd.merge(json_data, html_data03, on=["Fusion", "Gene_left", "Gene_right"], how='right')
  215. fusion_infor1['Gene_right'] = fusion_infor1['Gene_right'].str.split('_', expand=True)[0]
  216. fusion_infor1['Gene_left'] = fusion_infor1['Gene_left'].str.split('_', expand=True)[0]
  217. for j in range(len(fusion_infor1)):
  218. print(j)
  219. #print(fusion_infor1.loc[j, :])
  220. unique_reads = fusion_infor1.loc[j, 'Unique_reads']
  221. right_gene = fusion_infor1.loc[j, 'Gene_right']
  222. right_exonintron = fusion_infor1.loc[j, 'exon_or_intron_right']
  223. right_exonintron_id = fusion_infor1.loc[j, 'exon_or_intron_id_right']
  224. right_chr = fusion_infor1.loc[j, 'chr_right']
  225. right_position = fusion_infor1.loc[j, 'position_right']
  226. left_gene = fusion_infor1.loc[j, 'Gene_left']
  227. left_exonintron = fusion_infor1.loc[j, 'exon_or_intron_left']
  228. left_exonintron_id = fusion_infor1.loc[j, 'exon_or_intron_id_left']
  229. left_chr = fusion_infor1.loc[j, 'chr_left']
  230. left_position = fusion_infor1.loc[j, 'position_left']
  231. #cal the VAF
  232. print(tumorname)
  233. VAF_right = VAF_fusion(laneid,tumorname, right_chr, abs(int(right_position)), inputpath)
  234. VAF_left= VAF_fusion(laneid,tumorname, left_chr, abs(int(left_position)), inputpath)
  235. VAF=max(VAF_right,VAF_left)
  236. print(VAF)
  237. if VAF<1:
  238. VAFnew=VAF
  239. else:
  240. VAFnew=0
  241. fusion_infor1.loc[j, 'VAF'] = str(round(VAFnew*100,4)) + '%'
  242. if (fusion_infor1.loc[j, 'Trans_left'] == fusion_infor1.loc[j, 'Trans_right']) & (
  243. fusion_infor1.loc[j, 'Trans_left'] == '3_5'):
  244. # for new name
  245. fusion_new = right_gene + '_' + right_exonintron + str(
  246. right_exonintron_id) + '-' + left_gene + '_' + left_exonintron + str(left_exonintron_id)
  247. fusion_infor1.loc[j, 'Fusion_new'] = fusion_new
  248. #change the left and right information
  249. fusion_infor1.loc[j, 'Gene_left'] = right_gene
  250. fusion_infor1.loc[j, 'chr_left'] = right_chr
  251. fusion_infor1.loc[j, 'position_left'] = right_position
  252. fusion_infor1.loc[j, 'exon_or_intron_left'] = right_exonintron
  253. fusion_infor1.loc[j, 'exon_or_intron_id_left'] = right_exonintron_id
  254. fusion_infor1.loc[j, 'Gene_right'] = left_gene
  255. fusion_infor1.loc[j, 'chr_right'] = left_chr
  256. fusion_infor1.loc[j, 'position_right'] = left_position
  257. fusion_infor1.loc[j, 'exon_or_intron_right'] = left_exonintron
  258. fusion_infor1.loc[j, 'exon_or_intron_id_right'] = left_exonintron_id
  259. elif (fusion_infor1.loc[j, 'Trans_left'] == fusion_infor1.loc[j, 'Trans_right']) & (
  260. fusion_infor1.loc[j, 'Trans_left'] == '5_3'):
  261. # for new name
  262. fusion_new = left_gene + '_' + left_exonintron + str(
  263. left_exonintron_id) + '-' + right_gene + '_' + right_exonintron + str(right_exonintron_id)
  264. fusion_infor1.loc[j, 'Fusion_new'] = fusion_new
  265. elif fusion_infor1.loc[j, 'Trans_left'] != fusion_infor1.loc[j, 'Trans_right']:
  266. fusion_infor1.loc[j, 'Fusion_new'] = 'Neg'
  267. else:
  268. fusion_infor1.loc[j, 'Fusion_new'] = 'Neg'
  269. else:
  270. fusion_infor1=pd.DataFrame(columns=headlist)
  271. # output the data
  272. fusion_infor2 = fusion_infor1[headlist]
  273. fusion_infor2.rename(columns={'Fusion_new': 'Fusion'}, inplace=True)
  274. return fusion_infor2
# Filter out false positives.
  276. def filter_black_fusion(fusionresult2):
  277. fusion_black_dir='/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/refdata/fusion_blacklist_20230403.txt'
  278. fusion_black=pd.read_table(fusion_black_dir,sep='\t',header=0)
  279. del fusion_black['Fusion']
  280. del fusion_black['samplecounts']
  281. fusionresult2['loc']=fusionresult2['chr_left'].astype('str')+'_'+fusionresult2['position_left'].astype('str')+'_'+fusionresult2['chr_right'].astype('str')+'_'+fusionresult2['position_right'].astype('str')
  282. fusionlabel=pd.merge(fusionresult2,fusion_black,on=['loc'],how='left')
  283. fusion_filter=fusionlabel[fusionlabel['label']!='black']
  284. fusion_filter.reset_index(drop=True,inplace=True)
  285. del fusion_filter['loc']
  286. del fusion_filter['label']
  287. return fusion_filter
  288. ###for sample
  289. def fusion_GeneFusion_sample(inputpath,laneid,tumorname,sampleid):
  290. #inputpath = '/cgdata/liuxiangqiong/work62pancancer/fusiontest/lane1'
  291. tempfile=os.path.join(inputpath,'tempfile')
  292. if not os.path.exists(tempfile):
  293. os.mkdir(tempfile)
  294. fusion_tem_dir=os.path.join(tempfile,'fusion')
  295. if not os.path.exists(fusion_tem_dir):
  296. os.mkdir(fusion_tem_dir)
  297. GeneFusiondir = os.path.join(inputpath, '6Fusion_genefusion_unpair')
  298. #sampleid=tumorid[:-2]
  299. # make the result dir
  300. result_dir = os.path.join(inputpath, 'resultfile')
  301. if not os.path.exists(result_dir):
  302. os.mkdir(result_dir)
  303. sample_dir = os.path.join(result_dir, sampleid)
  304. if not os.path.exists(sample_dir):
  305. os.mkdir(sample_dir)
  306. tumor_jsonfile = GeneFusiondir + '/' + tumorname + '_report.json'
  307. tumor_htmlfile = GeneFusiondir + '/' + tumorname + '_report.html'
  308. if (os.path.exists(tumor_jsonfile)) & (os.path.exists(tumor_jsonfile)):
  309. fusionresult = fusion_extract(inputpath,laneid,tumor_jsonfile, tumor_htmlfile, tumorname)
  310. print(fusionresult)
  311. fusionresult.insert(0, 'sampleid', sampleid)
  312. #输出未过滤结果
  313. outputfile0=os.path.join(fusion_tem_dir,laneid + '-' + sampleid + '.fusion_unfilter.txt')
  314. fusionresult.to_csv(outputfile0,sep='\t',index=False,header=True)
  315. #设置最后结果
  316. Fusion_final=pd.DataFrame(columns=list(fusionresult.columns))
  317. # 选择uniquer reads>=5的fusion,且有突变丰度的
  318. fusionresult1 = fusionresult[fusionresult['Unique_reads'] >= 5]
  319. fusionresult2 = fusionresult1[(fusionresult1['VAF'] != '0%') & (fusionresult1['VAF'] != '0.0%')]
  320. fusionresult2.reset_index(drop=True, inplace=True)
  321. print(fusionresult2)
  322. if len(fusionresult2) == 0:
  323. Fusion_final.loc[0, 'sampleid'] = sampleid
  324. Fusion_final.loc[0, 'Fusion'] = 'no fusion'
  325. else:
  326. ###对fusion的表示进行修改CD74-ROS1 (intron6:intron33)
  327. #当有intron-1时,将其修改为基因间区
  328. fusionresult2['Fusion']=fusionresult2['Fusion'].str.replace('intron-1', 'intergenic')
  329. fusionsplit1 = fusionresult2['Fusion'].str.split('-', expand=True)
  330. fusionsplit1_left = fusionsplit1[0].str.split('_', expand=True)
  331. fusionsplit1_right = fusionsplit1[1].str.split('_', expand=True)
  332. fusion_new = fusionsplit1_left[0] + '-' + fusionsplit1_right[0] + '(' + fusionsplit1_left[1] + ':' + \
  333. fusionsplit1_right[1] + ')'
  334. fusionresult2.insert(1, 'Fusion_new', fusion_new)
  335. del fusionresult2['Fusion']
  336. fusionresult2.rename(columns={'Fusion_new': 'Fusion'}, inplace=True)
  337. #将intron=-1的修改为基因间区
  338. fusionresult2.reset_index(drop=True, inplace=True)
  339. for i in range(len(fusionresult2)):
  340. if fusionresult2.loc[i, 'exon_or_intron_id_left'] == -1:
  341. fusionresult2.loc[i, 'exon_or_intron_id_left'] = 'intergenic'
  342. elif fusionresult2.loc[i, 'exon_or_intron_id_right'] == -1:
  343. fusionresult2.loc[i, 'exon_or_intron_id_right'] = 'intergenic'
  344. #去除假阳性
  345. fusionresult2_1=filter_black_fusion(fusionresult2)
  346. if len(fusionresult2_1)!=0:
  347. Fusion_final=fusionresult2_1
  348. else:
  349. Fusion_final.loc[0, 'sampleid'] = sampleid
  350. Fusion_final.loc[0, 'Fusion'] = 'no fusion'
  351. outputfile1 = os.path.join(sample_dir, laneid + '-' + sampleid + '.fusion.xlsx')
  352. writer = pd.ExcelWriter(outputfile1)
  353. Fusion_final.to_excel(writer, sheet_name='fusion', index=False)
  354. writer.save()
  355. writer.close()
  356. else:
  357. print(tumorid+' is no data')
  358. Fusion_final=pd.DataFrame()
  359. Fusion_final.loc[0,'sampleid']=sampleid
  360. Fusion_final.loc[0,'Fusion']='No fusion file'
  361. # make the temp dir
  362. temp_dir = os.path.join(inputpath, 'tempfile')
  363. if not os.path.exists(temp_dir):
  364. os.mkdir(temp_dir)
  365. bugfile_dir = os.path.join(temp_dir, 'bugfile')
  366. if not os.path.exists(bugfile_dir):
  367. os.mkdir(bugfile_dir)
  368. outputfile2 = os.path.join(bugfile_dir, laneid + '-' + sampleid + '.fusion.nofile.log.txt')
  369. Fusion_final.to_csv(outputfile2, index=False, header=True, encoding='gbk', sep='\t')
  370. def cp_fusionhtml(inputpath,sampleid,tumorname,laneid):
  371. fusiondir = os.path.join(inputpath, '6Fusion_genefusion_unpair')
  372. rawhtml = os.path.join(fusiondir, tumorname + '_report.html')
  373. result_dir = os.path.join(inputpath, 'resultfile')
  374. if not os.path.exists(result_dir):
  375. os.mkdir(result_dir)
  376. sample_dir = os.path.join(result_dir, sampleid)
  377. if not os.path.exists(sample_dir):
  378. os.mkdir(sample_dir)
  379. fusionhtml = os.path.join(sample_dir, laneid + '-' + sampleid + '.fusion.html')
  380. copyfile='cp '+rawhtml+' '+fusionhtml
  381. os.system(copyfile)
  382. def fusion_runmain(inputpath,tumorname):
  383. laneid = inputpath.split('/')[-1].split('-')[-1]
  384. datasummarydir = os.path.join(inputpath, 'datasummary')
  385. isExists = os.path.exists(datasummarydir)
  386. if not isExists:
  387. os.makedirs(datasummarydir)
  388. sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
  389. samplelist = pd.read_table(sampledir, sep='\t', header=0)
  390. sampledata = samplelist[samplelist['tumor'] ==tumorname]
  391. sampledata.reset_index(drop=True, inplace=True)
  392. sampleid = sampledata.loc[0, 'samplename']
  393. fusion_GeneFusion_sample(inputpath, laneid, tumorname, sampleid)
  394. cp_fusionhtml(inputpath, sampleid, tumorname, laneid)
  395. if __name__=='__main__':
  396. parser = argparse.ArgumentParser(description='filter the chemothereapy_runmain')
  397. parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
  398. parser.add_argument('-s', '--tumorname', type=str, help='the tumor name of sample')
  399. args = parser.parse_args()
  400. Inputpath = args.inputpath
  401. Tumorname = args.tumorname
  402. fusion_runmain(Inputpath,Tumorname)
  403. #for the result of GeneFusion
  404. def fusion_GeneFusion_sum(inputpath,laneid):
  405. datasummarydir = os.path.join(inputpath, 'datasummary')
  406. isExists = os.path.exists(datasummarydir)
  407. if not isExists:
  408. os.makedirs(datasummarydir)
  409. sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
  410. samplelist = pd.read_table(sampledir, sep='\t', header=0)
  411. genefusionsummary_last = pd.DataFrame()
  412. for i in range(len(samplelist)):
  413. sampleid = samplelist.loc[i,'samplename']
  414. tumorname = samplelist.loc[i, 'tumor']
  415. print(sampleid)
  416. fusionresult2=fusion_GeneFusion_sample(inputpath,laneid,tumorname,sampleid)
  417. genefusionsummary_last = genefusionsummary_last.append(fusionresult2)
  418. cp_fusionhtml(inputpath, sampleid, tumorname, laneid)
  419. outputname = datasummarydir + '/' + laneid+'_table8_fusion_datasummary.txt'
  420. genefusionsummary_last.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')