1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078 |
- import pandas as pd
- import os,os.path
- import xlrd
- import re
- import argparse
- ########germline针对的为白细胞对照
- #保留条件如:频率>=30%、Func.refGene中intronic和ncRNA_exonic去掉、无突变注释信息的去掉、人群频率>=1%的去掉;
- # 胚系注释中终止密码止为X表示,请跟体系保持一致,用*号。
- # 胚系表头请按附近中示例调整一下顺序,VAF改为百分比的数据形式。
- #去掉HLA相关基因突变
- #输出按照基因名称进行排序
- #for the end change
- def rep(a):
- # print(a.group(0))
- st = re.sub("x","*",a.group(0),flags=re.I)
- st = re.sub("#","",st,flags=re.I)
- return st
- #frameshift replace
- def rep1(a):
- st = re.sub("fs\*.*", "", a.group(0), flags=re.I)
- st1 = re.split("(\d+)", st)
- if "," in a.group(0):
- st2 = st1[0] + st1[1] + "fs,"
- else:
- st2 = st1[0] + st1[1] + "fs"
- return st2
- ###当由多个基因构成时进行拆分
- ###当Gene.refGene由多个基因注释构成,且这几个基因是在我们的genelist中,我们需要的是将基因拆分,然后对应的AAchange信息保留
- def gene_split(svdata):
- df_name = svdata['Gene.refGene'].str.split(';', expand=True)
- # 三、把行转列成列
- df_name = df_name.stack()
- # 四、重置索引,并删除多于的索引
- df_name = df_name.reset_index(level=1, drop=True)
- # 五、与原始数据合并
- df_name.name = 'df_name1'
- df_new = svdata.drop(['Gene.refGene'], axis=1).join(df_name)
- df_new.rename(columns={'df_name1': 'Gene.refGene'}, inplace=True)
- df_new.reset_index(drop=True, inplace=True)
- return df_new
- ###1.对主转录本数据的筛选。
- #对'AAChange.refGene'的数据提取
- #当'AAChange_select'为空时,
- #1)当AAChange.refGene只有一个转录本的突变信息时,我们将这个转录本信息直接赋给AAChange_select
- #2)当AAChange.refGene有多个转录本的突变信息时,选择这些转录本中最长的那个
- #输入的是基因时
- def transid_select_gene(gene):
- refseq_transdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/refdata/hg19_refGene_transcript.bed'
- refseq = pd.read_table(refseq_transdir, sep='\t', usecols=[0, 1, 2, 6, 7], names=['chr', 'start', 'end', 'transid', 'gene'])
- refseq['trans_length'] = refseq['end'] - refseq['start']
- genetand = refseq[refseq['gene'] == gene]
- maxlength = genetand['trans_length'].max()
- trans_unique_infor = genetand[genetand['trans_length'] == maxlength]
- trans_unique_infor.reset_index(drop=True, inplace=True)
- trans_unique_id = trans_unique_infor.loc[0, 'transid']
- return trans_unique_id
- #输入的时AAchange信息时.有一个或者多个转录本注释
- def transid_select_AAchange(AAchange,AAchange_gene):
- AAchangedata0=AAchange.split(',')
- AAchangedata0_1=pd.DataFrame()
- for i in range(len(AAchangedata0)):
- AAchangedata0_1.loc[i,'AAchange']=AAchangedata0[i]
- AAchangedata1 =AAchangedata0_1['AAchange'].str.split(':', expand=True)
- AAchangedata1['transid'] = AAchangedata1[1].str.split('.', expand=True)[0]
- AAchangedata1.rename(columns={0: 'gene'}, inplace=True)
- AAchangedata2=AAchangedata1[AAchangedata1['gene']==AAchange_gene]
- refseq_transdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/refdata/hg19_refGene_transcript.bed'
- refseq = pd.read_table(refseq_transdir, sep='\t', usecols=[0, 1, 2, 6, 7], names=['chr', 'start', 'end', 'transid', 'gene'])
- refseq['trans_length'] = refseq['end'] - refseq['start']
- ##获得具有转录本编号的信息
- AAgene = pd.merge(AAchangedata2, refseq, on=['gene', 'transid'], how='inner')
- print(AAgene)
- #如果结果获得不为空则执行
- if len(AAgene) !=0:
- maxlength = AAgene['trans_length'].max()
- trans_unique_infor = AAgene[AAgene['trans_length'] == maxlength]
- trans_unique_infor.reset_index(drop=True, inplace=True)
- trans_unique_id = trans_unique_infor.loc[0, 'transid']
- geneinfor = AAchange_gene + ':' + trans_unique_id
- AAchange_select0 = [s for s in AAchangedata0 if geneinfor in s]
- AAchange_select=AAchange_select0[0]
- elif (len(AAgene) ==0) & (len(AAchangedata2)==1):
- AAchange_select =AAchange[0]
- elif (len(AAgene) ==0) & (len(AAchangedata2)>1):
- AAchange_select = AAchangedata0[0]
- return AAchange_select
- ###对于冷门基因的转录本信息
- def transid_select_AAchange(AAchange,AAchange_gene):
- AAchangedata0=AAchange.split(';')
- AAchangedata0_1=pd.DataFrame()
- for i in range(len(AAchangedata0)):
- AAchangedata0_1.loc[i,'AAchange']=AAchangedata0[i]
- AAchangedata1 =AAchangedata0_1['AAchange'].str.split(':', expand=True)
- AAchangedata1['transid'] = AAchangedata1[1].str.split('.', expand=True)[0]
- AAchangedata1.rename(columns={0: 'gene'}, inplace=True)
- AAchangedata1_0 = AAchangedata1[AAchangedata1['gene'] == AAchange_gene]
- # 当注释信息和基因匹配时才进行如下,如果结果不匹配,表明的时该基因和注释的基因是同意义,需要将注释的基因转为原来的
- if len(AAchangedata1_0) == 0:
- AAchangedata1['gene'] = AAchange_gene
- AAchangedata2 = AAchangedata1[AAchangedata1['gene'] == AAchange_gene]
- refseq_transdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/refdata/hg19_refGene_transcript.bed'
- refseq = pd.read_table(refseq_transdir, sep='\t', usecols=[0, 1, 2, 6, 7], names=['chr', 'start', 'end', 'transid', 'gene'])
- refseq['trans_length'] = refseq['end'] - refseq['start']
- ##获得具有转录本编号的信息
- AAgene = pd.merge(AAchangedata2, refseq, on=['gene', 'transid'], how='inner')
- print(AAgene)
- #如果结果获得不为空则执行
- if len(AAgene) !=0:
- maxlength = AAgene['trans_length'].max()
- trans_unique_infor = AAgene[AAgene['trans_length'] == maxlength]
- trans_unique_infor.reset_index(drop=True, inplace=True)
- trans_unique_id = trans_unique_infor.loc[0, 'transid']
- geneinfor = AAchange_gene + ':' + trans_unique_id
- AAchange_select0 = [s for s in AAchangedata0 if geneinfor in s]
- AAchange_select=AAchange_select0[0]
- elif (len(AAgene) ==0) & (len(AAchangedata2)==1):
- AAchange_select =AAchange[0]
- elif (len(AAgene) ==0) & (len(AAchangedata2)>1):
- AAchange_select = AAchangedata0[0]
- return AAchange_select
- ####1.对AAchange_select的补充
- def AAchangeselect_plus(germdata):
- for j in range(len(germdata)):
- #print(j)
- gene = germdata.loc[j, 'Gene.refGene']
- if pd.isnull(germdata.loc[j, 'AAChange_select']):
- germdata.loc[j, 'AAChange_select']=''
- if ( ',' in germdata.loc[j, 'AAChange_select']) and (pd.notnull(germdata.loc[j, 'AAChange_select'])):
- AAChange_selectinfor = germdata.loc[j, 'AAChange_select'].split(',')
- AAChange_selectinfor1 = [s for s in AAChange_selectinfor if gene+':' in s]
- germdata.loc[j, 'AAChange_select']=AAChange_selectinfor1
- AAchangedata = germdata.loc[j, 'AAChange.refGene']
- #print(AAchangedata)
- if (AAchangedata.count(',') == 0) & (AAchangedata != '.') & (germdata.loc[j, 'AAChange_select']==''):
- germdata.loc[j, 'AAChange_select'] = germdata.loc[j, 'AAChange.refGene']
- elif (AAchangedata.count(',') != 0) & (AAchangedata != '.') & (germdata.loc[j, 'AAChange_select']==''):
- gene_transid_unque = transid_select_AAchange(AAchangedata, gene)
- AAchange_infor = AAchangedata.split(',')
- #geneinfor = gene + ':' + gene_transid_unque
- geneinfor = gene_transid_unque
- AAchange_select = [s for s in AAchange_infor if geneinfor in s]
- if len(AAchange_select) != 0:
- germdata.loc[j, 'AAChange_select'] = AAchange_select[0]
- else:
- germdata.loc[j, 'AAChange_select'] = ''
- return germdata
- #germdata1=AAchangeselect_plus(germdata)
- # 2.将c.注释的格式例如c.A8077T改为c.8077A>T
- #germdata2=cdsmut_trans(germdata1)
- #####2.将c.注释的格式例如c.A8077T改为c.8077A>T
- #3.终止密码止原来用X表示,转换后用*表示
- ###4.对移码突变的转换,如由p.P1353Qfs*89改成p.P1353fs
- def cdsmut_trans(germdata):
- newchange = germdata['AAChange_select'].str.split(":", expand=True)
- newchange['c1'] = newchange[3].str.split('.', expand=True)[1]
- for i in range(len(newchange)):
- if pd.notnull(newchange.loc[i, 'c1']):
- if ('ins' not in newchange.loc[i, 'c1']) & ('del' not in newchange.loc[i, 'c1']):
- new = 'c.' + re.findall("\d+\.?\d*", newchange.loc[i, 'c1'])[0] + re.sub("[0-9]+", '>',
- newchange.loc[i, 'c1'])
- newchange.loc[i, 'new_AA'] = new
- else:
- new = 'c.' + newchange.loc[i, 'c1']
- newchange.loc[i, 'new_AA'] = new
- else:
- newchange.loc[i, 'new_AA'] = ''
- germdata['new_select'] = newchange[0] + ':' + newchange[1] + ':' + newchange[2] + ':' + newchange['new_AA'] + ':' + newchange[4]
- # 终止密码止原来用X表示,转换后用*表示
- germdata['AAChange.refGene'] = (germdata['AAChange.refGene'] + "#").str.replace('p\..*?[,#]', rep)
- germdata['AAChange.refGene'] = germdata['AAChange.refGene'].str.replace('#$', "")
- germdata['new_select'] = (germdata['new_select'] + "#").str.replace('p\..*?[,#]', rep)
- germdata['new_select'] = (germdata['new_select']).str.replace('#$', "")
- # print(germline_filter4['AAChange_select'])
- ###对移码突变的转换,如由p.P1353Qfs*89改成p.P1353fs
- germdata['AAChange.refGene'] = (germdata['AAChange.refGene'] + "#").str.replace('p\..*fs\*.*?[,#]', rep1)
- germdata['AAChange.refGene'] = germdata['AAChange.refGene'].str.replace('#$', "")
- germdata['new_select'] = (germdata['new_select'] + "#").str.replace('p\..*fs\*.*?[,#]', rep1)
- germdata['new_select'] = (germdata['new_select']).str.replace('#$', "")
- return germdata
- ####对AAchange列进一步,提取经过vep和annovar注释后的Otherinfo11这列信息
- #并将氨基酸改变缩写改为单字母的简写,终止突变Ter改为*
- def AAchange_new_infor(svdata):
- data_INFO_1 = svdata.loc[:, 'Otherinfo11'].str.split("|", expand=True)
- data_INFO_1.fillna("", inplace=True)
- result_num = 0
- for i in [i1 for i1 in range(1, data_INFO_1.shape[1], 41)]:
- data_INFO_2 = pd.DataFrame()
- data_INFO_2["AAChange.refGene_1"] = data_INFO_1.iloc[:, i + 2] + ":" + \
- data_INFO_1.iloc[:, i + 5] + ":" + \
- "exon" + data_INFO_1.iloc[:, i + 7].str.split("/", expand=True)[0] + ":" + \
- "intron" + data_INFO_1.iloc[:, i + 8].str.split("/", expand=True)[0] + ":"
- data_INFO_3 = data_INFO_1.iloc[:, i + 9].str.split(":", expand=True)
- data_INFO_4 = data_INFO_1.iloc[:, i + 10].str.split(":", expand=True)
- if data_INFO_3.shape[1] == 1:
- data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"] + data_INFO_3[0] + ":"
- else:
- data_INFO_3.fillna("", inplace=True)
- data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"] + data_INFO_3[1] + ":"
- if data_INFO_4.shape[1] == 1:
- data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"] + data_INFO_4[0]
- else:
- data_INFO_4.fillna("", inplace=True)
- data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"] + data_INFO_4[1]
- data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"].str.replace("exon:", "")
- data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"].str.replace("intron:", "")
- data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"].str.replace("::", ":")
- data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"].str.replace(":$", "")
- if result_num == 0:
- svdata["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"] + ";"
- result_num = 1
- else:
- svdata["AAChange.refGene_1"] = svdata["AAChange.refGene_1"] + data_INFO_2["AAChange.refGene_1"] + ";"
- svdata["AAChange.refGene_1"] = svdata["AAChange.refGene_1"].str.replace(":;", "")
- svdata["AAChange.refGene_1"] = svdata["AAChange.refGene_1"].str.replace(";$", "")
- # 需要将氨基酸改变缩写改为单字母的简写,终止突变Ter改为*
- AA_path = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/AA_dict.txt'
- AAdata = pd.read_table(AA_path, sep='\t')
- for j in range(len(AAdata)):
- AAdata_abbre = AAdata.loc[j, 'Abbre_name']
- AAdata_short = AAdata.loc[j, 'Short_name']
- svdata['AAChange.refGene_1'] = svdata['AAChange.refGene_1'].str.replace(AAdata_abbre, AAdata_short)
- return svdata
- ##将vep和annovar注释结果进行整合
- def vepannovar_merge(AAchange_raw,AAchange_new,AAchange_gene):
- '''
- vep and vardict anno merge
- 输入数据均为字符串
- :param AAchange_raw: the AAchange.refgene from annovar anno of the data,one str。
- :param AAchange_new: the AAchange.refgene_1 from the vep anno of the data,one str
- :return: the modified AAchange,str
- '''
- ###1.当AAchange_new中没有c.和p.信息时候,需要从AAchange_-raw中提取出来补全
- ###1.1对AAchange_raw按照符合分列。
- AAchangeraw1 = AAchange_raw.split(',')
- ###1.2对AAchane_new按照符号分列
- AAchangenew0 = AAchange_new.split(';')
- ##从AAchane_new选择Gene.refGene在内的
- AAchangenew1 = [s for s in AAchangenew0 if AAchange_gene in s]
- if len(AAchangenew1) != 0:
- # 1.2.1循环分列后的AAchangenew1
- # 首先从割裂后的结果中,找到与AAchange_gene一致的突变
- # 然后从raw中查找对应的c.和p.信息。
- # 如果c.信息都不能找到,那么这个转录本对应的突变信息就设置为空。
- AAchangenew2 = ''
- for j in range(len(AAchangenew1)):
- #print(j)
- AAchangenew1_0 = AAchangenew1[j]
- #print(AAchangenew1_0)
- #如果信息中跨外显子的,那只提取外显子的
- if ('exon' in AAchangenew1_0) & ('intron' in AAchangenew1_0):
- AAchangenew1_1=re.sub('intron\d.*?:', "",AAchangenew1_0)
- else:
- AAchangenew1_1 =AAchangenew1_0
- #从AAchangeraw中把AAchangenew基因转录本对应的信息抓取出来
- id0 = AAchangenew1_1.split(':')
- id1 = id0[0] + ':' + id0[1].split('.')[0]
- select0 = [s for s in AAchangeraw1 if id1 in s]
- #print(select0)
- #3.当提取的信息不为空时,进行格式转换。如果抓取信息为空时,保留原来注释信息
- #c.注释的格式的转换:如c.A8077T改为c.8077A>T。由于AAchangeraw是annovar注释的,而AAchangenew时VEP注释
- if len(select0) != 0:
- # 如果信息中跨外显子的,那只提取外显子的
- if ('exon' in select0[0]) & ('intron' in select0[0]):
- select = re.sub('intron\d.*?:', "", select0[0])
- else:
- select = select0[0]
- select1 = select.split(':')
- #从分割后的list中体取出c.对应的信息
- cdsinfor0=[s for s in select1 if 'c.' in s]
- #如果能够找到c.信息,那么进行替换。如果找不到c.信息。那么新赋值的注释为空。我们只提取包含c.的,对于注释为n.的舍弃
- if len(cdsinfor0)!=0:
- changeinfro_select1=cdsinfor0[0].split(".")[1]
- if ('ins' not in changeinfro_select1) & ('del' not in changeinfro_select1)& ('dup' not in changeinfro_select1) & ('inv' not in changeinfro_select1):
- cdsinfor1 = 'c.' + re.findall("\d+\.?\d*", changeinfro_select1)[0] + re.sub("[0-9]+", '>',changeinfro_select1)
- else:
- cdsinfor1='c.'+changeinfro_select1
- # 从分隔后的list中提取楚p.对应的信息
- ppinfor0 = [s for s in select1 if 'p.' in s]
- if len(ppinfor0) != 0:
- ppinfor1 = ppinfor0[0]
- else:
- ppinfor1 = ''
- #获得新的注释
- annonew=id0[0] + ':' + id0[1]+':'+select1[2]+':'+cdsinfor1+':'+ppinfor1
- else:
- annonew =''
- # 4.对AAchangenew进行信息的填补
- # 如果AAchangenew没有c.信息时,那么新的赋值就用AAchangeraw里面
- if 'c.' not in AAchangenew1_1:
- AAchangenew2 = AAchangenew2 + annonew + ';'
- elif ('c.' in AAchangenew1_1) & ('p.' not in AAchangenew1_1):
- AAchangenew2 = AAchangenew2 + annonew + ';'
- else:
- AAchangenew2 = AAchangenew2 + AAchangenew1_1 + ';'
- else:
- AAchangenew2=AAchangenew2+AAchangenew1_1+';'
- AAchangenew3 = re.sub(r"(;)\1+", '', AAchangenew2)
- if AAchangenew3[-1] == ';':
- AAchangenew3_0 = AAchangenew3[:-1]
- if AAchangenew3_0[-1] ==':':
- AAchangenew4 = AAchangenew3_0[:-1]
- else:
- AAchangenew4 = AAchangenew3_0
- else:
- AAchangenew4 = AAchangenew3
- else:
- AAchangenew4 = AAchange_raw
- return AAchangenew4
- ###改,先对annovar注释的AAchange列进行格式转换
- ####对annovar注释的信息进行格式修改
- def AAchange_annovar_transdata(AAchange_annovar):
- '''
- :param AAchange_annovar: 输入为annovar注释的AAchange.refgene列的一行
- :return: 经过校正后的
- '''
- if (AAchange_annovar!='.') & (AAchange_annovar!='UNKNOWN'):
- annovarsplit = AAchange_annovar.split(',')
- annovar_new1 = ''
- for j in range(len(annovarsplit)):
- label_cpoint = annovarsplit[j].find('c.')
- label_ppoint = annovarsplit[j].find('p.')
- if (label_cpoint != -1) & (label_ppoint != -1):
- changeinfro_select1 = annovarsplit[j][(label_cpoint + 2):(label_ppoint - 1)]
- aainfor = annovarsplit[j][label_ppoint:]
- if ('ins' not in changeinfro_select1) & ('del' not in changeinfro_select1) & (
- 'dup' not in changeinfro_select1) & ('inv' not in changeinfro_select1):
- cdsinfor1 = 'c.' + re.findall("\d+\.?\d*", changeinfro_select1)[0] + re.sub("[0-9]+", '>',
- changeinfro_select1)
- else:
- cdsinfor1 = 'c.' + changeinfro_select1
- # annovar_new00 = annovarsplit[j][:label_cpoint] + cdsinfor1 + ':' + aainfor
- # 对终止密码子的替换
- # annovar_new00 = annovar_new00 + "@"
- # annovar_new0 = re.sub("p\..+?[,@]", rep2, annovar_new00)
- if 'X' in aainfor:
- aainfor1 = aainfor.replace('X', '*')
- else:
- aainfor1 = aainfor
- annovar_new0 = annovarsplit[j][:label_cpoint] + cdsinfor1 + ':' + aainfor1
- elif (label_cpoint != -1) & (label_ppoint == -1):
- changeinfro_select1 = annovarsplit[j][(label_cpoint + 2):(label_ppoint - 1)]
- if ('ins' not in changeinfro_select1) & ('del' not in changeinfro_select1):
- cdsinfor1 = 'c.' + re.findall("\d+\.?\d*", changeinfro_select1)[0] + re.sub("[0-9]+", '>',
- changeinfro_select1)
- else:
- cdsinfor1 = 'c.' + changeinfro_select1
- annovar_new0 = annovarsplit[j][:label_cpoint] + cdsinfor1
- else:
- annovar_new0 = annovarsplit[j]
- annovar_new1 = annovar_new0 + ',' + annovar_new1
- annovar_new_last = annovar_new1[:-1]
- else:
- annovar_new_last=AAchange_annovar
- return annovar_new_last
- def AAchange_merge(svdata):
- svdata.reset_index(drop=True, inplace=True)
- for i in range(len(svdata)):
- #print(i)
- svdata.loc[i, 'AAChange.refGene'] = AAchange_annovar_transdata(svdata.loc[i, 'AAChange.refGene'])
- AAchange_raw = svdata.loc[i, 'AAChange.refGene']
- AAchange_new = svdata.loc[i, 'AAChange.refGene_1']
- AAchange_gene = svdata.loc[i, 'Gene.refGene']
- exonfunc = svdata.loc[i, 'ExonicFunc.refGene']
- svdata.loc[i, 'AAChange.refGene_2'] = vepannovar_merge(AAchange_raw, AAchange_new,AAchange_gene)
- # 对AAChange.refGene_2进一步修改
- new2 = svdata.loc[i, 'AAChange.refGene_2']
- new21 = new2.split(';')
- if exonfunc == 'stopgain':
- # for stopgain,extract the mut including *
- new2_extract = [s for s in new21 if '*' in s]
- newchange = ';'.join(new2_extract)
- elif exonfunc == 'startloss':
- # for the startloss,extract the mut including ?
- new2_extract = [s for s in new21 if '?' in s]
- newchange = ';'.join(new2_extract)
- else:
- newchange = new2
- # 将%3D替换为=
- newchange_re = newchange.replace('%3D', '=')
- svdata.loc[i, 'AAChange.refGene_2'] = newchange_re
- return svdata
- #######对转录本和突变表示形式的修改
- ####修改:去除化疗相关位点
- ###读入vep注释信息,方便对同义突变进一步筛选
- def vepanno(inputpath,normalid):
- germdir = os.path.join(inputpath, '4Germline_unpair')
- # 读入vep注释
- vepdir = os.path.join(germdir, normalid + '.hg19_multianno.txt')
- if os.path.exists(vepdir) and os.path.getsize(vepdir) != 0:
- vepdata = pd.read_table(vepdir, sep='\t', header=0, low_memory=False)
- vepdata1 = AAchange_new_infor(vepdata)
- vepdata2 = AAchange_merge(vepdata1)
- #提取信息
- vepdata2['muttype'] = vepdata2['Otherinfo11'].str.split('|', expand=True)[1]
- vepdata3 = vepdata2[['Chr', 'Start', 'End', 'Ref', 'Alt', 'muttype', 'AAChange.refGene_1', 'AAChange.refGene_2']]
- vepdata3['Chr'] = vepdata3['Chr'].astype('str')
- vepdata3['Start'] = vepdata3['Start'].astype('str')
- vepdata3['End'] = vepdata3['End'].astype('str')
- return vepdata3
- ####改对unique的结果,根据vep的结果进一步校正。
- ####2.对于ExonicFunc.refGene为startloss选择结尾带?的突变信息;当为stopgain,选择下划线后面的信息。
- #对AAChange_select进一步核对
- def AAchange_select_new_test(germdata1):
- for i in range(len(germdata1)):
- aaannovar = germdata1.loc[i, 'AAChange_select'].split(':')
- aavep = germdata1.loc[i, 'AAChange.refGene_2'].split(';')
- if germdata1.loc[i,'ExonicFunc.refGene']=='startloss':
- result_aa = [s for s in aavep if '?' in s]
- germdata1.loc[i, 'AAChange_select'] = result_aa[0]
- elif germdata1.loc[i,'ExonicFunc.refGene']=='stopgain':
- result_aa = [s for s in aavep if '*' in s]
- germdata1.loc[i, 'AAChange_select'] = result_aa[0]
- else:
- seachgene = aaannovar[0] + ':' + aaannovar[1]
- result_aa = [s for s in aavep if seachgene in s]
- if len(result_aa) != 0:
- germdata1.loc[i, 'AAChange_select'] = result_aa[0]
- return germdata1
- ###对stopgain进一步修改
- def AAchange_select_new_raw(germdata1):
- for i in range(len(germdata1)):
- print(i)
- aaannovar = germdata1.loc[i, 'AAChange_select'].split(':')
- if len(germdata1.loc[i, 'AAChange.refGene_2'])!=0:
- germdata1.loc[i, 'AAChange.refGene_2'] = germdata1.loc[i, 'AAChange.refGene_2'].replace(',', ';')
- aavep = germdata1.loc[i, 'AAChange.refGene_2'].split(';')
- if germdata1.loc[i, 'ExonicFunc.refGene'] == 'startloss':
- result_aa = [s for s in aavep if '?' in s]
- germdata1.loc[i, 'AAChange_select'] = result_aa[0]
- elif germdata1.loc[i, 'ExonicFunc.refGene'] == 'stopgain':
- result_aa = [s for s in aavep if '*' in s]
- # 对于特殊的修改:p.K173_Y174ins* -> Y174ins*
- if 'ins*' in result_aa[0]:
- splitinfor = result_aa.split(':')
- newaa0 = splitinfor[4]
- if '_' in newaa0:
- newaa1 = newaa0.split('_')
- numbefore = int(re.findall("\d+", newaa1[0])[0])
- numafter = int(re.findall("\d+", newaa1[1])[0])
- if numafter - numbefore == 1:
- Pnew = "p." + newaa1[1]
- unique_new = splitinfor[0] + ':' + splitinfor[1] + ':' + splitinfor[2] + ':' + splitinfor[
- 3] + ':' + Pnew
- germdata1.loc[i, 'AAChange_select'] = unique_new
- else:
- germdata1.loc[i, 'AAChange_select'] = result_aa[0]
- else:
- germdata1.loc[i, 'AAChange_select'] = result_aa[0]
- else:
- germdata1.loc[i, 'AAChange_select'] = result_aa[0]
- else:
- seachgene = aaannovar[0] + ':' + aaannovar[1]
- result_aa = [s for s in aavep if seachgene in s]
- if len(result_aa) != 0:
- germdata1.loc[i, 'AAChange_select'] = result_aa[0]
- return germdata1
- ####改,当有多个的时候用唯一转录本
- def AAchange_select_new_test1(germdata1):
- for i in range(len(germdata1)):
- print(i)
- if len(germdata1.loc[i, 'AAChange.refGene_2'])!=0:
- germdata1.loc[i, 'AAChange.refGene_2'] = germdata1.loc[i, 'AAChange.refGene_2'].replace(',', ';')
- aavep = germdata1.loc[i, 'AAChange.refGene_2'].split(';')
- if germdata1.loc[i, 'ExonicFunc.refGene'] == 'startloss':
- result_aa = [s for s in aavep if '?' in s]
- germdata1.loc[i, 'AAChange_select'] = result_aa[0]
- elif germdata1.loc[i, 'ExonicFunc.refGene'] == 'stopgain':
- result_aa = [s for s in aavep if '*' in s]
- # 对于特殊的修改:p.K173_Y174ins* -> Y174ins*
- if 'ins*' in result_aa[0]:
- splitinfor = result_aa.split(':')
- newaa0 = splitinfor[4]
- if '_' in newaa0:
- newaa1 = newaa0.split('_')
- numbefore = int(re.findall("\d+", newaa1[0])[0])
- numafter = int(re.findall("\d+", newaa1[1])[0])
- if numafter - numbefore == 1:
- Pnew = "p." + newaa1[1]
- unique_new = splitinfor[0] + ':' + splitinfor[1] + ':' + splitinfor[2] + ':' + splitinfor[
- 3] + ':' + Pnew
- germdata1.loc[i, 'AAChange_select'] = unique_new
- else:
- germdata1.loc[i, 'AAChange_select'] =(Transid_unique_search(','.join(result_aa))).replace(';','')
- else:
- germdata1.loc[i, 'AAChange_select'] = (Transid_unique_search(','.join(result_aa))).replace(';','')
- else:
- # 当不是p.K173_Y174ins*,从中选择唯一转录本所在的
- unqiue_trans_select = (Transid_unique_search(','.join(result_aa))).replace(';','')
- if len(unqiue_trans_select)!=0:
- germdata1.loc[i, 'AAChange_select'] =unqiue_trans_select
- else:
- germdata1.loc[i, 'AAChange_select'] =result_aa[0]
- else:
- #首先选择唯一转录本
- unqiue_trans_select = (Transid_unique_search(','.join(aavep))).replace(';','')
- #如果没有获得唯一转录本的信息,采用AAchange_select原始的
- if len(unqiue_trans_select)!=0:
- germdata1.loc[i, 'AAChange_select'] =unqiue_trans_select
- elif (len(unqiue_trans_select)==0) & (pd.notna(germdata1.loc[i,'AAChange_select'])):
- #当原始注释有信息时
- aaannovar = germdata1.loc[i, 'AAChange_select'].split(':')
- seachgene = aaannovar[0] + ':' + aaannovar[1]
- result_aa = [s for s in aavep if seachgene in s]
- if len(result_aa) != 0:
- germdata1.loc[i, 'AAChange_select'] = result_aa[0]
- elif (len(unqiue_trans_select)==0) & (pd.isna(germdata1.loc[i,'AAChange_select'])):
- #当唯一转录本和原始注释都没信息时候,采用冷门转录本选择
- AAchange_gene=germdata1.loc[i,'Gene.refGene']
- AAchange=germdata1.loc[i,'AAChange.refGene_2']
- germdata1.loc[i, 'AAChange_select'] =transid_select_AAchange(AAchange,AAchange_gene)
- print(germdata1.loc[i, 'AAChange_select'] )
- #判断唯一转录本的区域是否和Func.refGene一致,如果不一致按照唯一转录本进行修改
- functype=germdata1.loc[i, 'Func.refGene']
- if 'intron' in germdata1.loc[i, 'AAChange_select']:
- func_unique='intronic'
- elif 'exon' in germdata1.loc[i, 'AAChange_select']:
- func_unique='exonic'
- else:
- func_unique='other'
- if (func_unique not in functype) & (func_unique!='other'):
- germdata1.loc[i, 'Func.refGene'] =func_unique
- return germdata1
- ##改当有多个stopgain这种p.K173_Y174ins* 突变的时候
- def AAchange_select_new(germdata1):
- for i in range(len(germdata1)):
- print(i)
- if len(germdata1.loc[i, 'AAChange.refGene_2'])!=0:
- germdata1.loc[i, 'AAChange.refGene_2'] = germdata1.loc[i, 'AAChange.refGene_2'].replace(',', ';')
- aavep = germdata1.loc[i, 'AAChange.refGene_2'].split(';')
- if germdata1.loc[i, 'ExonicFunc.refGene'] == 'startloss':
- result_aa = [s for s in aavep if '?' in s]
- germdata1.loc[i, 'AAChange_select'] = result_aa[0]
- elif germdata1.loc[i, 'ExonicFunc.refGene'] == 'stopgain':
- result_aa = [s for s in aavep if '*' in s]
- # 对于特殊的修改:p.K173_Y174ins* -> Y174ins*
- if 'ins*' in result_aa[0]:
- #首选主要转录本
- maintrans=(Transid_unique_search(','.join(result_aa))).replace(';','')
- splitinfor = maintrans.split(':')
- newaa0 = splitinfor[4]
- if '_' in newaa0:
- newaa1 = newaa0.split('_')
- numbefore = int(re.findall("\d+", newaa1[0])[0])
- numafter = int(re.findall("\d+", newaa1[1])[0])
- if numafter - numbefore == 1:
- Pnew = "p." + newaa1[1]
- unique_new = splitinfor[0] + ':' + splitinfor[1] + ':' + splitinfor[2] + ':' + splitinfor[3] + ':' + Pnew
- germdata1.loc[i, 'AAChange_select'] = unique_new
- else:
- germdata1.loc[i, 'AAChange_select'] =maintrans
- else:
- germdata1.loc[i, 'AAChange_select'] = maintrans
- else:
- # 当不是p.K173_Y174ins*,从中选择唯一转录本所在的
- unqiue_trans_select = (Transid_unique_search(','.join(result_aa))).replace(';','')
- if len(unqiue_trans_select)!=0:
- germdata1.loc[i, 'AAChange_select'] =unqiue_trans_select
- else:
- germdata1.loc[i, 'AAChange_select'] =result_aa[0]
- else:
- #首先选择唯一转录本
- unqiue_trans_select = (Transid_unique_search(','.join(aavep))).replace(';','')
- #如果没有获得唯一转录本的信息,采用AAchange_select原始的
- if len(unqiue_trans_select)!=0:
- germdata1.loc[i, 'AAChange_select'] =unqiue_trans_select
- elif (len(unqiue_trans_select)==0) & (pd.notna(germdata1.loc[i,'AAChange_select'])):
- #当原始注释有信息时
- aaannovar = germdata1.loc[i, 'AAChange_select'].split(':')
- seachgene = aaannovar[0] + ':' + aaannovar[1]
- result_aa = [s for s in aavep if seachgene in s]
- if len(result_aa) != 0:
- germdata1.loc[i, 'AAChange_select'] = result_aa[0]
- elif (len(unqiue_trans_select)==0) & (pd.isna(germdata1.loc[i,'AAChange_select'])):
- #当唯一转录本和原始注释都没信息时候,采用冷门转录本选择
- AAchange_gene=germdata1.loc[i,'Gene.refGene']
- AAchange=germdata1.loc[i,'AAChange.refGene_2']
- germdata1.loc[i, 'AAChange_select'] =transid_select_AAchange(AAchange,AAchange_gene)
- print(germdata1.loc[i, 'AAChange_select'] )
- #判断唯一转录本的区域是否和Func.refGene一致,如果不一致按照唯一转录本进行修改
- functype=germdata1.loc[i, 'Func.refGene']
- if 'intron' in germdata1.loc[i, 'AAChange_select']:
- func_unique='intronic'
- elif 'exon' in germdata1.loc[i, 'AAChange_select']:
- func_unique='exonic'
- else:
- func_unique='other'
- if (func_unique not in functype) & (func_unique!='other'):
- germdata1.loc[i, 'Func.refGene'] =func_unique
- return germdata1
- ###对唯一转录本的再次选择。
- #首选在参考表中的转录本
- ###对于热门基因的转录本,采用主转录本
- #采用核对后的pan602的转录本信息
- def Transid_unique_search(AAchange):
- transid_path = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/Select_RefSeq_HGNC_MANE_pan602_20230522.txt'
- transiddata = pd.read_table(transid_path, sep='\t')
- unique_trans_last = ''
- if AAchange != ' ':
- AAchange1 = AAchange.split(',')
- for j in range(len(AAchange1)):
- if AAchange1[j].count(":") == 4:
- AAchange1_0 = AAchange1[j].split(':')[1]
- AAchange1_exon = AAchange1[j].split(':')[2]
- AAchange1_cc = AAchange1[j].split(':')[3]
- AAchange1_pp = AAchange1[j].split(':')[4]
- #transinfor = transiddata[(transiddata['RefSeq'] == AAchange1_0) | (transiddata['HGNC'] == AAchange1_0) | (transiddata['MANE'] == AAchange1_0)].reset_index(drop=True)
- #采用遗传核对后的唯一转录本
- transinfor = transiddata[transiddata['Nuprobe_GC_vep'] == AAchange1_0].reset_index(drop=True)
- if not transinfor.empty:
- unique_trans = transinfor.loc[0, 'Symbol'] + ':' + AAchange1_0 + ':' + AAchange1_exon + ':' + AAchange1_cc + ':' + AAchange1_pp+';'
- else:
- unique_trans = ''
- unique_trans_last = unique_trans_last + unique_trans
- elif AAchange1[j].count(":") == 3:
- AAchange1_0 = AAchange1[j].split(':')[1]
- AAchange1_exon = AAchange1[j].split(':')[2]
- AAchange1_cc = AAchange1[j].split(':')[3]
- transinfor = transiddata[transiddata['Nuprobe_GC_vep'] == AAchange1_0].reset_index(drop=True)
- if not transinfor.empty:
- unique_trans = transinfor.loc[ 0, 'Symbol'] + ':' + AAchange1_0 + ':' + AAchange1_exon + ':' + AAchange1_cc+';'
- else:
- unique_trans = ''
- unique_trans_last = unique_trans_last + unique_trans
- elif AAchange1[j].count(":") == 2:
- AAchange1_0 = AAchange1[j].split(':')[1]
- AAchange1_exon = AAchange1[j].split(':')[2]
- transinfor = transiddata[transiddata['Nuprobe_GC_vep'] == AAchange1_0].reset_index(drop=True)
- if not transinfor.empty:
- unique_trans = transinfor.loc[0, 'Symbol'] + ':' + AAchange1_0 + ':' + AAchange1_exon+';'
- else:
- unique_trans = ''
- unique_trans_last = unique_trans_last + unique_trans
- else:
- unique_trans = ''
- unique_trans_last = unique_trans_last + unique_trans
- else:
- unique_trans_last = ''
- return unique_trans_last
- ###进行gnomAD_exome_EAS和STR区域的筛选
- #改去掉多态性位点mut_ratio大于0.15
- def snvfilter(inputpath,germline_filter1,sampleid):
- germdir = os.path.join(inputpath, '4Germline_unpair')
- STRdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/refdata/UCSC_STR_LCR_simpleRepeat_genomicSuperDups.bed'
- germline_filter1.rename(columns={'AAChange_select': 'raw_AAChange_select'}, inplace=True)
- germline_filter1.rename(columns={'new_select': 'AAChange_select'}, inplace=True)
- germline_filter1.sort_values(by='Gene.refGene', inplace=True)
- # for the germline summary data
- germdata = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/refdata/germline_counts_stastic20230331.txt'
- germref1 = pd.read_table(germdata, sep='\t', header=0)
- #germref1 = germref[['mut_new', 'sample_counts']].drop_duplicates(keep='first')
- germref1.reset_index(drop=True, inplace=True)
- titlelist = germline_filter1.columns
- if len(germline_filter1) != 0:
- # select Func.refGene !='intronic'和'ncRNA_exonic'
- germline_filter2 = germline_filter1[
- (germline_filter1['Func.refGene'] != 'intronic') & (germline_filter1['Func.refGene'] != 'ncRNA_exonic')]
- if len(germline_filter2) != 0:
- # selct AAChange.refGene!='.'
- germline_filter3 = germline_filter2[germline_filter2['AAChange.refGene'] != '.']
- if len(germline_filter3) != 0:
- # select gnomAD_exome_EAS<=1%
- print('step6---select the vaf of genomAD')
- filterdata4_0 = germline_filter3[(germline_filter3['gnomAD_exome_EAS'] != '.')]
- # 6.1提取频率小于1%
- filterdata4_1 = filterdata4_0[(filterdata4_0['gnomAD_exome_EAS'].astype("float") <= 0.01)]
- # 6.2提取没有突变频率的
- filterdata4_2 = germline_filter3[(germline_filter3['gnomAD_exome_EAS'] == '.')]
- # 6.3将小于等于1%和没有突变频率的合并
- filterdata4_3 = filterdata4_2.append(filterdata4_1)
- germline_filter4 = filterdata4_3.reset_index(drop=True)
- germline_filter4.reset_index(drop=True, inplace=True)
- if len(germline_filter4) != 0:
- ##remove the mutation in STR region
- germbed = germline_filter4[['Chr', 'Start', 'End', 'Gene.refGene']]
- outputname1 = os.path.join(germdir, sampleid + '_germline.bed')
- germbed.to_csv(outputname1, index=False, header=None, encoding='gbk', sep='\t')
- outputname2 = os.path.join(germdir, sampleid + '_germline_unSTR.bed')
- STRcmd = 'bedtools subtract -a ' + outputname1 + ' -b ' + STRdir + '>' + outputname2
- os.system(STRcmd)
- germline_unSTR = pd.read_table(outputname2, sep='\t', names=['Chr', 'Start', 'End', 'Gene.refGene'])
- if len(germline_unSTR) != 0:
- germline_unSTR[['Chr', 'Start', 'End']] = germline_unSTR[['Chr', 'Start', 'End']].astype('str')
- germline_filter4[['Chr', 'Start', 'End']] = germline_filter4[['Chr', 'Start', 'End']].astype(
- 'str')
- germline_filter5 = pd.merge(germline_filter4, germline_unSTR,
- on=['Chr', 'Start', 'End', 'Gene.refGene'], how='inner')
- germline_filter5_1 = germline_filter5[titlelist]
- germline_filter5_1['mut_new'] = germline_filter5_1['Chr'] + '_' + germline_filter5_1[
- 'Start'] + '_' + germline_filter5_1['End'] + '_' + germline_filter5_1['Ref'] + '_' + \
- germline_filter5_1['Alt']
- germline_filter_result0 = pd.merge(germline_filter5_1, germref1, on=['mut_new'], how='left')
- germline_filter_result0['mut_ratio']=germline_filter_result0['mut_ratio'].fillna(0)
- #删除假阳性位点
- print('filter the false germline data')
- germline_filter_result =germline_filter_result0[germline_filter_result0['mut_ratio']<=0.15]
- del germline_filter_result['mut_ratio']
- del germline_filter_result['mut_new']
- germline_filter_result.reset_index(drop=True,inplace=True)
- print(len(germline_filter_result))
- if len(germline_filter_result)!=0:
- for i in range(len(germline_filter_result)):
- Func_refGene = germline_filter_result.loc[i, 'Func.refGene']
- ExonicFunc_refGene = germline_filter_result.loc[i, 'ExonicFunc.refGene']
- if (Func_refGene == 'splicing') & (ExonicFunc_refGene == '.'):
- germline_filter_result.loc[i, 'ExonicFunc.refGene'] = 'splicing'
- germline_filter_result.sort_values(by='Gene.refGene', inplace=True)
- else:
- print(sampleid + ' has no result when filter the false germline snv')
- germline_filter_result = pd.DataFrame(columns=titlelist)
- germline_filter_result.loc[0, 'sampleid'] = sampleid
- germline_filter_result.loc[0, 'label'] = 'No result after filtering the false germline snv'
- else:
- print(sampleid + ' has no result when filter the STR')
- germline_filter_result = pd.DataFrame(columns=titlelist)
- germline_filter_result.loc[0, 'sampleid'] = sampleid
- germline_filter_result.loc[0, 'label'] = 'No result after filtering the STR'
- os.remove(outputname1)
- os.remove(outputname2)
- # print(germline_filter_result['AAChange_select'])
- else:
- print(sampleid + ' has no result when filter the genomeAD_EAS')
- germline_filter_result = pd.DataFrame(columns=titlelist)
- germline_filter_result.loc[0, 'sampleid'] = sampleid
- germline_filter_result.loc[0, 'label'] = 'No result after filtered by genomeAD_EAS'
- else:
- print(sampleid + ' has no result when filtering the intronic and ncRNA_exonic')
- germline_filter_result = pd.DataFrame(columns=titlelist)
- germline_filter_result.loc[0, 'sampleid'] = sampleid
- germline_filter_result.loc[0, 'label'] = 'No result after filtering the intronic and ncRNA_exonic'
- else:
- print(sampleid + ' has no result aftere vaf filtering')
- germline_filter_result = pd.DataFrame(columns=titlelist)
- germline_filter_result.loc[0, 'sampleid'] = sampleid
- germline_filter_result.loc[0, 'label'] = 'No result after filtering VAF>=0.3'
- return germline_filter_result
- ####获得vep注释的突变类型信息,包括突变类型以及其他突变信息
- def AAchange_new_infor_vep(svdata):
- data_INFO_1 = svdata.loc[:, 'Otherinfo11'].str.split("|", expand=True)
- data_INFO_1.fillna("", inplace=True)
- result_num = 0
- for i in [i1 for i1 in range(1, data_INFO_1.shape[1], 41)]:
- data_INFO_2 = pd.DataFrame()
- data_INFO_2["AAChange.refGene_vepraw"] =data_INFO_1.iloc[:, i + 0] + ":" + data_INFO_1.iloc[:, i + 2] + ":" + \
- data_INFO_1.iloc[:, i + 5] + ":" + \
- "exon" + data_INFO_1.iloc[:, i + 7].str.split("/", expand=True)[0] + ":" + \
- "intron" + data_INFO_1.iloc[:, i + 8].str.split("/", expand=True)[0] + ":"
- data_INFO_3 = data_INFO_1.iloc[:, i + 9].str.split(":", expand=True)
- data_INFO_4 = data_INFO_1.iloc[:, i + 10].str.split(":", expand=True)
- if data_INFO_3.shape[1] == 1:
- data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"] + data_INFO_3[0] + ":"
- else:
- data_INFO_3.fillna("", inplace=True)
- data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"] + data_INFO_3[1] + ":"
- if data_INFO_4.shape[1] == 1:
- data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"] + data_INFO_4[0]
- else:
- data_INFO_4.fillna("", inplace=True)
- data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"] + data_INFO_4[1]
- data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"].str.replace("exon:", "")
- data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"].str.replace("intron:", "")
- data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"].str.replace("::", ":")
- data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"].str.replace(":$", "")
- if result_num == 0:
- svdata["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"] + ";"
- result_num = 1
- else:
- svdata["AAChange.refGene_vepraw"] = svdata["AAChange.refGene_vepraw"] + data_INFO_2["AAChange.refGene_vepraw"] + ";"
- svdata["AAChange.refGene_vepraw"] = svdata["AAChange.refGene_vepraw"].str.replace(":;", "")
- svdata["AAChange.refGene_vepraw"] = svdata["AAChange.refGene_vepraw"].str.replace(";$", "")
- # 需要将氨基酸改变缩写改为单字母的简写,终止突变Ter改为*
- AA_path = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/AA_dict.txt'
- AAdata = pd.read_table(AA_path, sep='\t')
- for j in range(len(AAdata)):
- AAdata_abbre = AAdata.loc[j, 'Abbre_name']
- AAdata_short = AAdata.loc[j, 'Short_name']
- svdata['AAChange.refGene_vepraw'] = svdata['AAChange.refGene_vepraw'].str.replace(AAdata_abbre, AAdata_short)
- return svdata
- #对获得的结果进一步校正
- def AAselect_modify(inputpath,normalid,germdata_result):
- germdir = os.path.join(inputpath, '4Germline_unpair')
- # 读入vep注释
- vepdir = os.path.join(germdir, normalid + '.hg19_multianno.txt')
- if os.path.exists(vepdir) and os.path.getsize(vepdir) != 0:
- vepdata_raw = pd.read_table(vepdir, sep='\t', header=0, low_memory=False)
- vepdata1_raw = AAchange_new_infor_vep(vepdata_raw)
- vepcolumns = ['Chr', 'Start', 'End', 'Ref', 'Alt', 'AAChange.refGene_vepraw']
- vepdata2_raw = vepdata1_raw[vepcolumns]
- vepdata2_raw['Chr'] = vepdata2_raw['Chr'].astype(str)
- vepdata2_raw['Start'] = vepdata2_raw['Start'].astype(str)
- vepdata2_raw['End'] = vepdata2_raw['End'].astype(str)
- # 和过滤后结果合并
- germlinedata = pd.merge(germdata_result, vepdata2_raw, on=['Chr', 'Start', 'End', 'Ref', 'Alt'], how='left')
- #唯一转录本信息校正
- for i in range(len(germlinedata)):
- print(i)
- veprawanno = germlinedata.loc[i, 'AAChange.refGene_vepraw']
- gene = germlinedata.loc[i, 'Gene.refGene']
- # 首先对唯一转录本进行校正
- transid_path = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/Select_RefSeq_HGNC_MANE_pan602_20230522.txt'
- transiddata = pd.read_table(transid_path, sep='\t')
- veprawanno1 = veprawanno.split(';')
- gene_trans_infor = transiddata[transiddata['Symbol'] == gene]
- gene_trans_infor.reset_index(drop=True, inplace=True)
- #以RefSeq的为主
- gene_trans_unique = gene_trans_infor.loc[0, 'RefSeq']
- # 获得唯一转录本对应的结果
- tranid_vepinfor = [s for s in veprawanno1 if gene_trans_unique in s]
- print(tranid_vepinfor)
- #如果没有获得唯一转录本的结果,从其他转录本中获得
- if len(tranid_vepinfor)==0:
- AAchange=germlinedata.loc[i, 'AAChange.refGene_1']
- newchange=transid_select_AAchange(AAchange,gene)
- gene_trans_unique=newchange.split(':')[1]
- tranid_vepinfor = [s for s in veprawanno1 if gene_trans_unique in s]
- #当注释的是intronic时候,需要把exonicFunc替换为vepanno里面的注释情况
- # 对Func.refGene的替换,如果vep注释信息有intron,那么写做intronic
- if len(tranid_vepinfor[0].split(':'))>3:
- Func_label = tranid_vepinfor[0].split(':')[3]
- if 'intron' in Func_label:
- germlinedata.loc[i, 'Func.refGene'] = 'intronic'
- Func_vep = tranid_vepinfor[0].split(':')[0]
- # 将annovar注释的功能替换为vep注释的功能
- germlinedata.loc[i, 'ExonicFunc.refGene'] = Func_vep
- else:
- germlinedata.loc[i, 'Func.refGene'] = 'intronic'
- Func_vep = tranid_vepinfor[0].split(':')[0]
- germlinedata.loc[i, 'ExonicFunc.refGene'] = Func_vep
- ##将AAChange_select进行替换
- vepselect = tranid_vepinfor[0].split(':')
- del (vepselect[0])
- germlinedata.loc[i, 'AAChange_select'] = ':'.join(vepselect)
- #对移码突变进行转换,如由p.P1353Qfs*89改成p.P1353fs
- # 基因也进行替换
- germlinedata.loc[i, 'Gene.refGene'] = vepselect[0]
- return germlinedata
- def germline_summary_control(inputpath,laneid,normalid,sampleid):
- germtitle_dir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/germline_title_20230522.txt'
- #panel genelist
- panel_genelistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/602gene_select.txt'
- panel_genelist = pd.read_table(panel_genelistdir, sep='\t', header=0, names=['Gene.refGene'])
- #druglist
- druglistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_genelist_20220829.txt'
- drug_genelist = pd.read_table(druglistdir, sep='\t', header=None, names=['Gene.refGene'])
- drug_genelist['drug_lable'] = 'Drug_gene'
- germtile = pd.read_table(germtitle_dir, sep='\t', header=0)
- titlelist_final = list(germtile.columns)
- titlelist_final.append('sample_counts')
- #对输出总表加上AAchange_1和AAchange_2
- #titlelist_summary=list(titlelist_final)
- #titlelist_summary.append('AAChange.refGene_1')
- #titlelist_summary.append('AAChange.refGene_2')
- germdir = os.path.join(inputpath, '4Germline_unpair')
- germdatafile=germdir+'/'+normalid+'.germ.xls'
- #读入vep注释
- vepdata1 = vepanno(inputpath,normalid)
- if os.path.exists(germdatafile) and os.path.getsize(germdatafile) !=0:
- #sampleid = normalid[:-2]
- # make the result dir
- result_dir = os.path.join(inputpath, 'resultfile')
- if not os.path.exists(result_dir):
- os.mkdir(result_dir)
- sample_dir = os.path.join(result_dir, sampleid)
- if not os.path.exists(sample_dir):
- os.mkdir(sample_dir)
- tempfile=os.path.join(inputpath,'tempfile')
- if not os.path.exists(tempfile):
- os.mkdir(tempfile)
- germ_tem_dir=os.path.join(tempfile,'germline')
- if not os.path.exists(germ_tem_dir):
- os.mkdir(germ_tem_dir)
- germdata_raw = pd.read_table(germdatafile, sep='\t')
- #先按照基因名排序
- germdata_raw.sort_values(by='Gene.refGene', inplace=True)
- germdata_raw.reset_index(drop=True, inplace=True)
- #修改列名
- germdata_raw.rename(columns={'CLNACC':'CLNALLELEID','CLNDBN':'CLNDN','CLNDSDB':'CLNDISDB','CLINVARREVSTATS':'CLNREVSTAT'},inplace=True)
- germdata_raw['Chr'] = germdata_raw['Chr'].astype('str')
- germdata_raw['Start'] = germdata_raw['Start'].astype('str')
- germdata_raw['End'] = germdata_raw['End'].astype('str')
- ##对基因进行分列
- germdata01 = gene_split(germdata_raw)
- # 提取在gene列表中的基因
- germdata02 = pd.merge(germdata01, panel_genelist, on=['Gene.refGene'], how='inner')
- germdata02.insert(0, 'sampleid', sampleid)
- #去除化疗基因
- germdata03 = pd.merge(germdata02, drug_genelist, on=['Gene.refGene'], how='left')
- germdata = germdata03[germdata03['drug_lable'] != 'Drug_gene']
- germdata.reset_index(drop=True, inplace=True)
- ###对CLINSIG进行分列
- if len(germdata[germdata['CLINSIG'].str.contains('\\[')]) != 0:
- germdata05 = germdata['CLINSIG'].str.split("[", expand=True)
- germdata05[1].fillna('.', inplace=True)
- germdata['CLNSIG'] = germdata05[0]
- germdata['CLNSIGCONF'] = germdata05[1].str.replace(']', '')
- else:
- germdata['CLNSIG'] = germdata['CLINSIG']
- germdata['CLNSIGCONF'] = '.'
- del germdata['CLINSIG']
- del germdata['drug_lable']
- print('clinsig')
- germdata.reset_index(drop=True, inplace=True)
- #1.add the AAchangedata
- germdata1=AAchangeselect_plus(germdata)
- # 2.将c.注释的格式例如c.A8077T改为c.8077A>T
- germdata2=cdsmut_trans(germdata1)
- # remove the HLA related gene
- germdata_result1 = germdata2[~germdata2['Gene.refGene'].str.contains('HLA')]
- germdata_result1.reset_index(drop=True, inplace=True)
- # select VAF>=0.3
- germline_filter1 = germdata_result1[germdata_result1['VAF'] >= 0.3]
- # VAF转为%
- germline_filter1['VAF'] = germline_filter1['VAF'].apply(lambda x: format(x, '.2%'))
- #进行gnomAD_exome_EAS和STR区域的筛选
- germline_filter_result0=snvfilter(inputpath,germline_filter1,sampleid)
- #对AAchange_select进行annovar和vep注释的合并进一步修改
- if len(vepdata1)!=0:
- germdata_vepanno0 = pd.merge(germline_filter_result0, vepdata1, on=['Chr', 'Start', 'End', 'Ref', 'Alt'], how='left')
- germdata_vepanno1 = germdata_vepanno0[~germdata_vepanno0['muttype'].str.contains('synonymous')]
- del germdata_vepanno1['muttype']
- germdata_vepanno1.reset_index(drop=True, inplace=True)
- #进行unique的筛选
- germdata_result=AAchange_select_new(germdata_vepanno1)
- #对移码突变的再次确认修改
- germdata_result['AAChange_select'] = (germdata_result['AAChange_select'] + "#").str.replace(
- 'p\..*fs\*.*?[,#]', rep1)
- germdata_result['AAChange_select'] = germdata_result['AAChange_select'].str.replace('#$', "")
- else:
- germdata_result=germline_filter_result0
- #对获得的结果进一步进行信息校正
- germdata_result_allinfor = AAselect_modify(inputpath, normalid, germdata_result)
- ###对移码突变的转换,如由p.P1353Qfs*89改成p.P1353fs
- germdata_result_allinfor['AAChange_select'] = (germdata_result_allinfor['AAChange_select'] + "#").str.replace('p\..*fs\*.*?[,#]', rep1)
- germdata_result_allinfor['AAChange_select'] = germdata_result_allinfor['AAChange_select'].str.replace('#$', "")
- germdata_result_allinfor.rename(columns={'AAChange.refGene': 'AAChange.refGene_annovar', 'AAChange.refGene_1': 'AAChange.refGene'},inplace=True)
- #输出两种注释结果
- outputfile0=os.path.join(germ_tem_dir,laneid + '-' + sampleid + '.germline_raw_infor.txt')
- germdata_result_allinfor.to_csv(outputfile0,sep='\t',index=False,header=True)
- #输出规定的列
- germline_filter_result=germdata_result_allinfor[titlelist_final]
- #再次去除包含包含“%3D”的突变
- germline_filter_result=germline_filter_result[germline_filter_result['AAChange_select'].str.contains("%3D")==False]
- germline_filter_result.reset_index(drop=True,inplace=True)
- #输出结果
- outputfile1 = os.path.join(sample_dir, laneid + '-' + sampleid + '.germline.xlsx')
- writer = pd.ExcelWriter(outputfile1)
- germline_filter_result.to_excel(writer, sheet_name='germline', index=False)
- writer.save()
- writer.close()
- else:
- print(normalid+' has no data or the result is null')
- germline_filter_result=pd.DataFrame()
- germline_filter_result.loc[0, 'sampleid'] = sampleid
- germline_filter_result.loc[0, 'label'] = 'No file'
- # make the temp dir
- temp_dir = os.path.join(inputpath, 'tempfile')
- if not os.path.exists(temp_dir):
- os.mkdir(temp_dir)
- bugfile_dir = os.path.join(temp_dir, 'bugfile')
- if not os.path.exists(bugfile_dir):
- os.mkdir(bugfile_dir)
- outputfile2 = os.path.join(bugfile_dir, laneid + '-' + sampleid + '.germline.nofile.log.txt')
- germline_filter_result.to_csv(outputfile2, index=False, header=True, encoding='gbk', sep='\t')
- def germline_runmain(inputpath,normalid):
- laneid = inputpath.split('/')[-1].split('-')[-1]
- datasummarydir = os.path.join(inputpath, 'datasummary')
- isExists = os.path.exists(datasummarydir)
- if not isExists:
- os.makedirs(datasummarydir)
- sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
- samplelist = pd.read_table(sampledir, sep='\t', header=0)
- sampledata = samplelist[samplelist['normal'] == normalid]
- sampledata.reset_index(drop=True, inplace=True)
- sampleid = sampledata.loc[0, 'samplename']
- germdir = os.path.join(inputpath, '4Germline_unpair')
- germdatafile = germdir + '/' + normalid + '.germ.xls'
- if os.path.exists(germdatafile) and os.path.getsize(germdatafile) != 0:
- germline_summary_control(inputpath, laneid, normalid, sampleid)
- else:
- print(normalid + ' is null')
- if __name__=='__main__':
- parser = argparse.ArgumentParser(description='filter the germline')
- parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
- parser.add_argument('-s', '--normalid', type=str, help='the normal name of sample')
- args = parser.parse_args()
- Inputpath = args.inputpath
- Normalid = args.normalid
- germline_runmain(Inputpath,Normalid)
- ###for all sample in lane
- def germlinesummary(inputpath,laneid):
- datasummarydir = os.path.join(inputpath, 'datasummary')
- isExists = os.path.exists(datasummarydir)
- if not isExists:
- os.makedirs(datasummarydir)
- sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
- samplelist = pd.read_table(sampledir, sep='\t', header=0)
- germdatasummary = pd.DataFrame()
- for i in range(len(samplelist)):
- sampleid = samplelist.loc[i, 'samplename']
- normalid = samplelist.loc[i, 'normal']
- print(normalid)
- germdir = os.path.join(inputpath, '4Germline_unpair')
- germdatafile = germdir + '/' + normalid + '.germ.xls'
- if os.path.exists(germdatafile) and os.path.getsize(germdatafile) != 0:
- germre=germline_summary_control(inputpath, laneid, normalid,sampleid)
- germdatasummary =germdatasummary.append(germre)
- else:
- print(normalid+' is null')
- continue
- outputname = datasummarydir + '/' + laneid + '_table6_germlinne_datasummary.txt'
- germdatasummary.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')
|