import pandas as pd import os,os.path import xlrd import re import argparse ########germline针对的为白细胞对照 #保留条件如:频率>=30%、Func.refGene中intronic和ncRNA_exonic去掉、无突变注释信息的去掉、人群频率>=1%的去掉; # 胚系注释中终止密码止为X表示,请跟体系保持一致,用*号。 # 胚系表头请按附近中示例调整一下顺序,VAF改为百分比的数据形式。 #去掉HLA相关基因突变 #输出按照基因名称进行排序 #for the end change def rep(a): # print(a.group(0)) st = re.sub("x","*",a.group(0),flags=re.I) st = re.sub("#","",st,flags=re.I) return st #frameshift replace def rep1(a): st = re.sub("fs\*.*", "", a.group(0), flags=re.I) st1 = re.split("(\d+)", st) if "," in a.group(0): st2 = st1[0] + st1[1] + "fs," else: st2 = st1[0] + st1[1] + "fs" return st2 ###当由多个基因构成时进行拆分 ###当Gene.refGene由多个基因注释构成,且这几个基因是在我们的genelist中,我们需要的是将基因拆分,然后对应的AAchange信息保留 def gene_split(svdata): df_name = svdata['Gene.refGene'].str.split(';', expand=True) # 三、把行转列成列 df_name = df_name.stack() # 四、重置索引,并删除多于的索引 df_name = df_name.reset_index(level=1, drop=True) # 五、与原始数据合并 df_name.name = 'df_name1' df_new = svdata.drop(['Gene.refGene'], axis=1).join(df_name) df_new.rename(columns={'df_name1': 'Gene.refGene'}, inplace=True) df_new.reset_index(drop=True, inplace=True) return df_new ###1.对主转录本数据的筛选。 #对'AAChange.refGene'的数据提取 #当'AAChange_select'为空时, #1)当AAChange.refGene只有一个转录本的突变信息时,我们将这个转录本信息直接赋给AAChange_select #2)当AAChange.refGene有多个转录本的突变信息时,选择这些转录本中最长的那个 #输入的是基因时 def transid_select_gene(gene): refseq_transdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/refdata/hg19_refGene_transcript.bed' refseq = pd.read_table(refseq_transdir, sep='\t', usecols=[0, 1, 2, 6, 7], names=['chr', 'start', 'end', 'transid', 'gene']) refseq['trans_length'] = refseq['end'] - refseq['start'] genetand = refseq[refseq['gene'] == gene] maxlength = genetand['trans_length'].max() trans_unique_infor = genetand[genetand['trans_length'] == maxlength] trans_unique_infor.reset_index(drop=True, inplace=True) trans_unique_id = trans_unique_infor.loc[0, 'transid'] return trans_unique_id #输入的时AAchange信息时.有一个或者多个转录本注释 def transid_select_AAchange(AAchange,AAchange_gene): AAchangedata0=AAchange.split(',') AAchangedata0_1=pd.DataFrame() for i in range(len(AAchangedata0)): AAchangedata0_1.loc[i,'AAchange']=AAchangedata0[i] AAchangedata1 =AAchangedata0_1['AAchange'].str.split(':', expand=True) AAchangedata1['transid'] = AAchangedata1[1].str.split('.', expand=True)[0] AAchangedata1.rename(columns={0: 'gene'}, inplace=True) AAchangedata2=AAchangedata1[AAchangedata1['gene']==AAchange_gene] refseq_transdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/refdata/hg19_refGene_transcript.bed' refseq = pd.read_table(refseq_transdir, sep='\t', usecols=[0, 1, 2, 6, 7], names=['chr', 'start', 'end', 'transid', 'gene']) refseq['trans_length'] = refseq['end'] - refseq['start'] ##获得具有转录本编号的信息 AAgene = pd.merge(AAchangedata2, refseq, on=['gene', 'transid'], how='inner') print(AAgene) #如果结果获得不为空则执行 if len(AAgene) !=0: maxlength = AAgene['trans_length'].max() trans_unique_infor = AAgene[AAgene['trans_length'] == maxlength] trans_unique_infor.reset_index(drop=True, inplace=True) trans_unique_id = trans_unique_infor.loc[0, 'transid'] geneinfor = AAchange_gene + ':' + trans_unique_id AAchange_select0 = [s for s in AAchangedata0 if geneinfor in s] AAchange_select=AAchange_select0[0] elif (len(AAgene) ==0) & (len(AAchangedata2)==1): AAchange_select =AAchange[0] elif (len(AAgene) ==0) & (len(AAchangedata2)>1): AAchange_select = AAchangedata0[0] return AAchange_select ###对于冷门基因的转录本信息 def transid_select_AAchange(AAchange,AAchange_gene): AAchangedata0=AAchange.split(';') AAchangedata0_1=pd.DataFrame() for i in range(len(AAchangedata0)): AAchangedata0_1.loc[i,'AAchange']=AAchangedata0[i] AAchangedata1 =AAchangedata0_1['AAchange'].str.split(':', expand=True) AAchangedata1['transid'] = AAchangedata1[1].str.split('.', expand=True)[0] AAchangedata1.rename(columns={0: 'gene'}, inplace=True) AAchangedata1_0 = AAchangedata1[AAchangedata1['gene'] == AAchange_gene] # 当注释信息和基因匹配时才进行如下,如果结果不匹配,表明的时该基因和注释的基因是同意义,需要将注释的基因转为原来的 if len(AAchangedata1_0) == 0: AAchangedata1['gene'] = AAchange_gene AAchangedata2 = AAchangedata1[AAchangedata1['gene'] == AAchange_gene] refseq_transdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/refdata/hg19_refGene_transcript.bed' refseq = pd.read_table(refseq_transdir, sep='\t', usecols=[0, 1, 2, 6, 7], names=['chr', 'start', 'end', 'transid', 'gene']) refseq['trans_length'] = refseq['end'] - refseq['start'] ##获得具有转录本编号的信息 AAgene = pd.merge(AAchangedata2, refseq, on=['gene', 'transid'], how='inner') print(AAgene) #如果结果获得不为空则执行 if len(AAgene) !=0: maxlength = AAgene['trans_length'].max() trans_unique_infor = AAgene[AAgene['trans_length'] == maxlength] trans_unique_infor.reset_index(drop=True, inplace=True) trans_unique_id = trans_unique_infor.loc[0, 'transid'] geneinfor = AAchange_gene + ':' + trans_unique_id AAchange_select0 = [s for s in AAchangedata0 if geneinfor in s] AAchange_select=AAchange_select0[0] elif (len(AAgene) ==0) & (len(AAchangedata2)==1): AAchange_select =AAchange[0] elif (len(AAgene) ==0) & (len(AAchangedata2)>1): AAchange_select = AAchangedata0[0] return AAchange_select ####1.对AAchange_select的补充 def AAchangeselect_plus(germdata): for j in range(len(germdata)): #print(j) gene = germdata.loc[j, 'Gene.refGene'] if pd.isnull(germdata.loc[j, 'AAChange_select']): germdata.loc[j, 'AAChange_select']='' if ( ',' in germdata.loc[j, 'AAChange_select']) and (pd.notnull(germdata.loc[j, 'AAChange_select'])): AAChange_selectinfor = germdata.loc[j, 'AAChange_select'].split(',') AAChange_selectinfor1 = [s for s in AAChange_selectinfor if gene+':' in s] germdata.loc[j, 'AAChange_select']=AAChange_selectinfor1 AAchangedata = germdata.loc[j, 'AAChange.refGene'] #print(AAchangedata) if (AAchangedata.count(',') == 0) & (AAchangedata != '.') & (germdata.loc[j, 'AAChange_select']==''): germdata.loc[j, 'AAChange_select'] = germdata.loc[j, 'AAChange.refGene'] elif (AAchangedata.count(',') != 0) & (AAchangedata != '.') & (germdata.loc[j, 'AAChange_select']==''): gene_transid_unque = transid_select_AAchange(AAchangedata, gene) AAchange_infor = AAchangedata.split(',') #geneinfor = gene + ':' + gene_transid_unque geneinfor = gene_transid_unque AAchange_select = [s for s in AAchange_infor if geneinfor in s] if len(AAchange_select) != 0: germdata.loc[j, 'AAChange_select'] = AAchange_select[0] else: germdata.loc[j, 'AAChange_select'] = '' return germdata #germdata1=AAchangeselect_plus(germdata) # 2.将c.注释的格式例如c.A8077T改为c.8077A>T #germdata2=cdsmut_trans(germdata1) #####2.将c.注释的格式例如c.A8077T改为c.8077A>T #3.终止密码止原来用X表示,转换后用*表示 ###4.对移码突变的转换,如由p.P1353Qfs*89改成p.P1353fs def cdsmut_trans(germdata): newchange = germdata['AAChange_select'].str.split(":", expand=True) newchange['c1'] = newchange[3].str.split('.', expand=True)[1] for i in range(len(newchange)): if pd.notnull(newchange.loc[i, 'c1']): if ('ins' not in newchange.loc[i, 'c1']) & ('del' not in newchange.loc[i, 'c1']): new = 'c.' + re.findall("\d+\.?\d*", newchange.loc[i, 'c1'])[0] + re.sub("[0-9]+", '>', newchange.loc[i, 'c1']) newchange.loc[i, 'new_AA'] = new else: new = 'c.' + newchange.loc[i, 'c1'] newchange.loc[i, 'new_AA'] = new else: newchange.loc[i, 'new_AA'] = '' germdata['new_select'] = newchange[0] + ':' + newchange[1] + ':' + newchange[2] + ':' + newchange['new_AA'] + ':' + newchange[4] # 终止密码止原来用X表示,转换后用*表示 germdata['AAChange.refGene'] = (germdata['AAChange.refGene'] + "#").str.replace('p\..*?[,#]', rep) germdata['AAChange.refGene'] = germdata['AAChange.refGene'].str.replace('#$', "") germdata['new_select'] = (germdata['new_select'] + "#").str.replace('p\..*?[,#]', rep) germdata['new_select'] = (germdata['new_select']).str.replace('#$', "") # print(germline_filter4['AAChange_select']) ###对移码突变的转换,如由p.P1353Qfs*89改成p.P1353fs germdata['AAChange.refGene'] = (germdata['AAChange.refGene'] + "#").str.replace('p\..*fs\*.*?[,#]', rep1) germdata['AAChange.refGene'] = germdata['AAChange.refGene'].str.replace('#$', "") germdata['new_select'] = (germdata['new_select'] + "#").str.replace('p\..*fs\*.*?[,#]', rep1) germdata['new_select'] = (germdata['new_select']).str.replace('#$', "") return germdata ####对AAchange列进一步,提取经过vep和annovar注释后的Otherinfo11这列信息 #并将氨基酸改变缩写改为单字母的简写,终止突变Ter改为* def AAchange_new_infor(svdata): data_INFO_1 = svdata.loc[:, 'Otherinfo11'].str.split("|", expand=True) data_INFO_1.fillna("", inplace=True) result_num = 0 for i in [i1 for i1 in range(1, data_INFO_1.shape[1], 41)]: data_INFO_2 = pd.DataFrame() data_INFO_2["AAChange.refGene_1"] = data_INFO_1.iloc[:, i + 2] + ":" + \ data_INFO_1.iloc[:, i + 5] + ":" + \ "exon" + data_INFO_1.iloc[:, i + 7].str.split("/", expand=True)[0] + ":" + \ "intron" + data_INFO_1.iloc[:, i + 8].str.split("/", expand=True)[0] + ":" data_INFO_3 = data_INFO_1.iloc[:, i + 9].str.split(":", expand=True) data_INFO_4 = data_INFO_1.iloc[:, i + 10].str.split(":", expand=True) if data_INFO_3.shape[1] == 1: data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"] + data_INFO_3[0] + ":" else: data_INFO_3.fillna("", inplace=True) data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"] + data_INFO_3[1] + ":" if data_INFO_4.shape[1] == 1: data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"] + data_INFO_4[0] else: data_INFO_4.fillna("", inplace=True) data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"] + data_INFO_4[1] data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"].str.replace("exon:", "") data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"].str.replace("intron:", "") data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"].str.replace("::", ":") data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"].str.replace(":$", "") if result_num == 0: svdata["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"] + ";" result_num = 1 else: svdata["AAChange.refGene_1"] = svdata["AAChange.refGene_1"] + data_INFO_2["AAChange.refGene_1"] + ";" svdata["AAChange.refGene_1"] = svdata["AAChange.refGene_1"].str.replace(":;", "") svdata["AAChange.refGene_1"] = svdata["AAChange.refGene_1"].str.replace(";$", "") # 需要将氨基酸改变缩写改为单字母的简写,终止突变Ter改为* AA_path = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/AA_dict.txt' AAdata = pd.read_table(AA_path, sep='\t') for j in range(len(AAdata)): AAdata_abbre = AAdata.loc[j, 'Abbre_name'] AAdata_short = AAdata.loc[j, 'Short_name'] svdata['AAChange.refGene_1'] = svdata['AAChange.refGene_1'].str.replace(AAdata_abbre, AAdata_short) return svdata ##将vep和annovar注释结果进行整合 def vepannovar_merge(AAchange_raw,AAchange_new,AAchange_gene): ''' vep and vardict anno merge 输入数据均为字符串 :param AAchange_raw: the AAchange.refgene from annovar anno of the data,one str。 :param AAchange_new: the AAchange.refgene_1 from the vep anno of the data,one str :return: the modified AAchange,str ''' ###1.当AAchange_new中没有c.和p.信息时候,需要从AAchange_-raw中提取出来补全 ###1.1对AAchange_raw按照符合分列。 AAchangeraw1 = AAchange_raw.split(',') ###1.2对AAchane_new按照符号分列 AAchangenew0 = AAchange_new.split(';') ##从AAchane_new选择Gene.refGene在内的 AAchangenew1 = [s for s in AAchangenew0 if AAchange_gene in s] if len(AAchangenew1) != 0: # 1.2.1循环分列后的AAchangenew1 # 首先从割裂后的结果中,找到与AAchange_gene一致的突变 # 然后从raw中查找对应的c.和p.信息。 # 如果c.信息都不能找到,那么这个转录本对应的突变信息就设置为空。 AAchangenew2 = '' for j in range(len(AAchangenew1)): #print(j) AAchangenew1_0 = AAchangenew1[j] #print(AAchangenew1_0) #如果信息中跨外显子的,那只提取外显子的 if ('exon' in AAchangenew1_0) & ('intron' in AAchangenew1_0): AAchangenew1_1=re.sub('intron\d.*?:', "",AAchangenew1_0) else: AAchangenew1_1 =AAchangenew1_0 #从AAchangeraw中把AAchangenew基因转录本对应的信息抓取出来 id0 = AAchangenew1_1.split(':') id1 = id0[0] + ':' + id0[1].split('.')[0] select0 = [s for s in AAchangeraw1 if id1 in s] #print(select0) #3.当提取的信息不为空时,进行格式转换。如果抓取信息为空时,保留原来注释信息 #c.注释的格式的转换:如c.A8077T改为c.8077A>T。由于AAchangeraw是annovar注释的,而AAchangenew时VEP注释 if len(select0) != 0: # 如果信息中跨外显子的,那只提取外显子的 if ('exon' in select0[0]) & ('intron' in select0[0]): select = re.sub('intron\d.*?:', "", select0[0]) else: select = select0[0] select1 = select.split(':') #从分割后的list中体取出c.对应的信息 cdsinfor0=[s for s in select1 if 'c.' in s] #如果能够找到c.信息,那么进行替换。如果找不到c.信息。那么新赋值的注释为空。我们只提取包含c.的,对于注释为n.的舍弃 if len(cdsinfor0)!=0: changeinfro_select1=cdsinfor0[0].split(".")[1] if ('ins' not in changeinfro_select1) & ('del' not in changeinfro_select1)& ('dup' not in changeinfro_select1) & ('inv' not in changeinfro_select1): cdsinfor1 = 'c.' + re.findall("\d+\.?\d*", changeinfro_select1)[0] + re.sub("[0-9]+", '>',changeinfro_select1) else: cdsinfor1='c.'+changeinfro_select1 # 从分隔后的list中提取楚p.对应的信息 ppinfor0 = [s for s in select1 if 'p.' in s] if len(ppinfor0) != 0: ppinfor1 = ppinfor0[0] else: ppinfor1 = '' #获得新的注释 annonew=id0[0] + ':' + id0[1]+':'+select1[2]+':'+cdsinfor1+':'+ppinfor1 else: annonew ='' # 4.对AAchangenew进行信息的填补 # 如果AAchangenew没有c.信息时,那么新的赋值就用AAchangeraw里面 if 'c.' not in AAchangenew1_1: AAchangenew2 = AAchangenew2 + annonew + ';' elif ('c.' in AAchangenew1_1) & ('p.' not in AAchangenew1_1): AAchangenew2 = AAchangenew2 + annonew + ';' else: AAchangenew2 = AAchangenew2 + AAchangenew1_1 + ';' else: AAchangenew2=AAchangenew2+AAchangenew1_1+';' AAchangenew3 = re.sub(r"(;)\1+", '', AAchangenew2) if AAchangenew3[-1] == ';': AAchangenew3_0 = AAchangenew3[:-1] if AAchangenew3_0[-1] ==':': AAchangenew4 = AAchangenew3_0[:-1] else: AAchangenew4 = AAchangenew3_0 else: AAchangenew4 = AAchangenew3 else: AAchangenew4 = AAchange_raw return AAchangenew4 ###改,先对annovar注释的AAchange列进行格式转换 ####对annovar注释的信息进行格式修改 def AAchange_annovar_transdata(AAchange_annovar): ''' :param AAchange_annovar: 输入为annovar注释的AAchange.refgene列的一行 :return: 经过校正后的 ''' if (AAchange_annovar!='.') & (AAchange_annovar!='UNKNOWN'): annovarsplit = AAchange_annovar.split(',') annovar_new1 = '' for j in range(len(annovarsplit)): label_cpoint = annovarsplit[j].find('c.') label_ppoint = annovarsplit[j].find('p.') if (label_cpoint != -1) & (label_ppoint != -1): changeinfro_select1 = annovarsplit[j][(label_cpoint + 2):(label_ppoint - 1)] aainfor = annovarsplit[j][label_ppoint:] if ('ins' not in changeinfro_select1) & ('del' not in changeinfro_select1) & ( 'dup' not in changeinfro_select1) & ('inv' not in changeinfro_select1): cdsinfor1 = 'c.' + re.findall("\d+\.?\d*", changeinfro_select1)[0] + re.sub("[0-9]+", '>', changeinfro_select1) else: cdsinfor1 = 'c.' + changeinfro_select1 # annovar_new00 = annovarsplit[j][:label_cpoint] + cdsinfor1 + ':' + aainfor # 对终止密码子的替换 # annovar_new00 = annovar_new00 + "@" # annovar_new0 = re.sub("p\..+?[,@]", rep2, annovar_new00) if 'X' in aainfor: aainfor1 = aainfor.replace('X', '*') else: aainfor1 = aainfor annovar_new0 = annovarsplit[j][:label_cpoint] + cdsinfor1 + ':' + aainfor1 elif (label_cpoint != -1) & (label_ppoint == -1): changeinfro_select1 = annovarsplit[j][(label_cpoint + 2):(label_ppoint - 1)] if ('ins' not in changeinfro_select1) & ('del' not in changeinfro_select1): cdsinfor1 = 'c.' + re.findall("\d+\.?\d*", changeinfro_select1)[0] + re.sub("[0-9]+", '>', changeinfro_select1) else: cdsinfor1 = 'c.' + changeinfro_select1 annovar_new0 = annovarsplit[j][:label_cpoint] + cdsinfor1 else: annovar_new0 = annovarsplit[j] annovar_new1 = annovar_new0 + ',' + annovar_new1 annovar_new_last = annovar_new1[:-1] else: annovar_new_last=AAchange_annovar return annovar_new_last def AAchange_merge(svdata): svdata.reset_index(drop=True, inplace=True) for i in range(len(svdata)): #print(i) svdata.loc[i, 'AAChange.refGene'] = AAchange_annovar_transdata(svdata.loc[i, 'AAChange.refGene']) AAchange_raw = svdata.loc[i, 'AAChange.refGene'] AAchange_new = svdata.loc[i, 'AAChange.refGene_1'] AAchange_gene = svdata.loc[i, 'Gene.refGene'] exonfunc = svdata.loc[i, 'ExonicFunc.refGene'] svdata.loc[i, 'AAChange.refGene_2'] = vepannovar_merge(AAchange_raw, AAchange_new,AAchange_gene) # 对AAChange.refGene_2进一步修改 new2 = svdata.loc[i, 'AAChange.refGene_2'] new21 = new2.split(';') if exonfunc == 'stopgain': # for stopgain,extract the mut including * new2_extract = [s for s in new21 if '*' in s] newchange = ';'.join(new2_extract) elif exonfunc == 'startloss': # for the startloss,extract the mut including ? new2_extract = [s for s in new21 if '?' in s] newchange = ';'.join(new2_extract) else: newchange = new2 # 将%3D替换为= newchange_re = newchange.replace('%3D', '=') svdata.loc[i, 'AAChange.refGene_2'] = newchange_re return svdata #######对转录本和突变表示形式的修改 ####修改:去除化疗相关位点 ###读入vep注释信息,方便对同义突变进一步筛选 def vepanno(inputpath,normalid): germdir = os.path.join(inputpath, '4Germline_unpair') # 读入vep注释 vepdir = os.path.join(germdir, normalid + '.hg19_multianno.txt') if os.path.exists(vepdir) and os.path.getsize(vepdir) != 0: vepdata = pd.read_table(vepdir, sep='\t', header=0, low_memory=False) vepdata1 = AAchange_new_infor(vepdata) vepdata2 = AAchange_merge(vepdata1) #提取信息 vepdata2['muttype'] = vepdata2['Otherinfo11'].str.split('|', expand=True)[1] vepdata3 = vepdata2[['Chr', 'Start', 'End', 'Ref', 'Alt', 'muttype', 'AAChange.refGene_1', 'AAChange.refGene_2']] vepdata3['Chr'] = vepdata3['Chr'].astype('str') vepdata3['Start'] = vepdata3['Start'].astype('str') vepdata3['End'] = vepdata3['End'].astype('str') return vepdata3 ####改对unique的结果,根据vep的结果进一步校正。 ####2.对于ExonicFunc.refGene为startloss选择结尾带?的突变信息;当为stopgain,选择下划线后面的信息。 #对AAChange_select进一步核对 def AAchange_select_new_test(germdata1): for i in range(len(germdata1)): aaannovar = germdata1.loc[i, 'AAChange_select'].split(':') aavep = germdata1.loc[i, 'AAChange.refGene_2'].split(';') if germdata1.loc[i,'ExonicFunc.refGene']=='startloss': result_aa = [s for s in aavep if '?' in s] germdata1.loc[i, 'AAChange_select'] = result_aa[0] elif germdata1.loc[i,'ExonicFunc.refGene']=='stopgain': result_aa = [s for s in aavep if '*' in s] germdata1.loc[i, 'AAChange_select'] = result_aa[0] else: seachgene = aaannovar[0] + ':' + aaannovar[1] result_aa = [s for s in aavep if seachgene in s] if len(result_aa) != 0: germdata1.loc[i, 'AAChange_select'] = result_aa[0] return germdata1 ###对stopgain进一步修改 def AAchange_select_new_raw(germdata1): for i in range(len(germdata1)): print(i) aaannovar = germdata1.loc[i, 'AAChange_select'].split(':') if len(germdata1.loc[i, 'AAChange.refGene_2'])!=0: germdata1.loc[i, 'AAChange.refGene_2'] = germdata1.loc[i, 'AAChange.refGene_2'].replace(',', ';') aavep = germdata1.loc[i, 'AAChange.refGene_2'].split(';') if germdata1.loc[i, 'ExonicFunc.refGene'] == 'startloss': result_aa = [s for s in aavep if '?' in s] germdata1.loc[i, 'AAChange_select'] = result_aa[0] elif germdata1.loc[i, 'ExonicFunc.refGene'] == 'stopgain': result_aa = [s for s in aavep if '*' in s] # 对于特殊的修改:p.K173_Y174ins* -> Y174ins* if 'ins*' in result_aa[0]: splitinfor = result_aa.split(':') newaa0 = splitinfor[4] if '_' in newaa0: newaa1 = newaa0.split('_') numbefore = int(re.findall("\d+", newaa1[0])[0]) numafter = int(re.findall("\d+", newaa1[1])[0]) if numafter - numbefore == 1: Pnew = "p." + newaa1[1] unique_new = splitinfor[0] + ':' + splitinfor[1] + ':' + splitinfor[2] + ':' + splitinfor[ 3] + ':' + Pnew germdata1.loc[i, 'AAChange_select'] = unique_new else: germdata1.loc[i, 'AAChange_select'] = result_aa[0] else: germdata1.loc[i, 'AAChange_select'] = result_aa[0] else: germdata1.loc[i, 'AAChange_select'] = result_aa[0] else: seachgene = aaannovar[0] + ':' + aaannovar[1] result_aa = [s for s in aavep if seachgene in s] if len(result_aa) != 0: germdata1.loc[i, 'AAChange_select'] = result_aa[0] return germdata1 ####改,当有多个的时候用唯一转录本 ###对stopgain进一步修改 def AAchange_select_new(germdata1): for i in range(len(germdata1)): print(i) aaannovar = germdata1.loc[i, 'AAChange_select'].split(':') if len(germdata1.loc[i, 'AAChange.refGene_2'])!=0: germdata1.loc[i, 'AAChange.refGene_2'] = germdata1.loc[i, 'AAChange.refGene_2'].replace(',', ';') aavep = germdata1.loc[i, 'AAChange.refGene_2'].split(';') if germdata1.loc[i, 'ExonicFunc.refGene'] == 'startloss': result_aa = [s for s in aavep if '?' in s] germdata1.loc[i, 'AAChange_select'] = result_aa[0] elif germdata1.loc[i, 'ExonicFunc.refGene'] == 'stopgain': result_aa = [s for s in aavep if '*' in s] # 对于特殊的修改:p.K173_Y174ins* -> Y174ins* if 'ins*' in result_aa[0]: splitinfor = result_aa.split(':') newaa0 = splitinfor[4] if '_' in newaa0: newaa1 = newaa0.split('_') numbefore = int(re.findall("\d+", newaa1[0])[0]) numafter = int(re.findall("\d+", newaa1[1])[0]) if numafter - numbefore == 1: Pnew = "p." + newaa1[1] unique_new = splitinfor[0] + ':' + splitinfor[1] + ':' + splitinfor[2] + ':' + splitinfor[ 3] + ':' + Pnew germdata1.loc[i, 'AAChange_select'] = unique_new else: germdata1.loc[i, 'AAChange_select'] =(Transid_unique_search(','.join(result_aa))).replace(';','') else: germdata1.loc[i, 'AAChange_select'] = (Transid_unique_search(','.join(result_aa))).replace(';','') else: # 当不是p.K173_Y174ins*,从中选择唯一转录本所在的 unqiue_trans_select = (Transid_unique_search(','.join(result_aa))).replace(';','') if len(unqiue_trans_select)!=0: germdata1.loc[i, 'AAChange_select'] =unqiue_trans_select else: germdata1.loc[i, 'AAChange_select'] =result_aa[0] else: #首先选择唯一转录本 unqiue_trans_select = (Transid_unique_search(','.join(aavep))).replace(';','') #如果没有获得唯一转录本的信息,采用AAchange_select原始的 if len(unqiue_trans_select)!=0: germdata1.loc[i, 'AAChange_select'] =unqiue_trans_select else: seachgene = aaannovar[0] + ':' + aaannovar[1] result_aa = [s for s in aavep if seachgene in s] if len(result_aa) != 0: germdata1.loc[i, 'AAChange_select'] = result_aa[0] print(germdata1.loc[i, 'AAChange_select'] ) #判断唯一转录本的区域是否和Func.refGene一致,如果不一致按照唯一转录本进行修改 functype=germdata1.loc[i, 'Func.refGene'] if 'intron' in germdata1.loc[i, 'AAChange_select']: func_unique='intronic' elif 'exon' in germdata1.loc[i, 'AAChange_select']: func_unique='exonic' else: func_unique='other' if (func_unique not in functype) & (func_unique!='other'): germdata1.loc[i, 'Func.refGene'] =func_unique return germdata1 ###对唯一转录本的再次选择。 #首选在参考表中的转录本 ###对于热门基因的转录本,采用主转录本 def Transid_unique_search(AAchange): transid_path = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/Select_RefSeq_HGNC_MANE.txt' transiddata = pd.read_table(transid_path, sep='\t') unique_trans_last = '' if AAchange != ' ': AAchange1 = AAchange.split(',') for j in range(len(AAchange1)): if AAchange1[j].count(":") == 4: AAchange1_0 = AAchange1[j].split(':')[1] AAchange1_exon = AAchange1[j].split(':')[2] AAchange1_cc = AAchange1[j].split(':')[3] AAchange1_pp = AAchange1[j].split(':')[4] transinfor = transiddata[(transiddata['RefSeq'] == AAchange1_0) | (transiddata['HGNC'] == AAchange1_0) | (transiddata['MANE'] == AAchange1_0)].reset_index(drop=True) if not transinfor.empty: unique_trans = transinfor.loc[0, 'Symbol'] + ':' + AAchange1_0 + ':' + AAchange1_exon + ':' + AAchange1_cc + ':' + AAchange1_pp+';' else: unique_trans = '' unique_trans_last = unique_trans_last + unique_trans elif AAchange1[j].count(":") == 3: AAchange1_0 = AAchange1[j].split(':')[1] AAchange1_exon = AAchange1[j].split(':')[2] AAchange1_cc = AAchange1[j].split(':')[3] transinfor = transiddata[ (transiddata['RefSeq'] == AAchange1_0) | (transiddata['HGNC'] == AAchange1_0) | ( transiddata['MANE'] == AAchange1_0)].reset_index(drop=True) if not transinfor.empty: unique_trans = transinfor.loc[ 0, 'Symbol'] + ':' + AAchange1_0 + ':' + AAchange1_exon + ':' + AAchange1_cc+';' else: unique_trans = '' unique_trans_last = unique_trans_last + unique_trans elif AAchange1[j].count(":") == 2: AAchange1_0 = AAchange1[j].split(':')[1] AAchange1_exon = AAchange1[j].split(':')[2] transinfor = transiddata[ (transiddata['RefSeq'] == AAchange1_0) | (transiddata['HGNC'] == AAchange1_0) | ( transiddata['MANE'] == AAchange1_0)].reset_index(drop=True) if not transinfor.empty: unique_trans = transinfor.loc[0, 'Symbol'] + ':' + AAchange1_0 + ':' + AAchange1_exon+';' else: unique_trans = '' unique_trans_last = unique_trans_last + unique_trans else: unique_trans = '' unique_trans_last = unique_trans_last + unique_trans else: unique_trans_last = '' return unique_trans_last ###进行gnomAD_exome_EAS和STR区域的筛选 def snvfilter(inputpath,germline_filter1,sampleid): germdir = os.path.join(inputpath, '4Germline_unpair') STRdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/refdata/UCSC_STR_LCR_simpleRepeat_genomicSuperDups.bed' germline_filter1.rename(columns={'AAChange_select': 'raw_AAChange_select'}, inplace=True) germline_filter1.rename(columns={'new_select': 'AAChange_select'}, inplace=True) germline_filter1.sort_values(by='Gene.refGene', inplace=True) # for the germline summary data germdata = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/refdata/germline_counts_stastic20220815.txt' germref = pd.read_table(germdata, sep='\t', header=0) germref1 = germref[['mut_new', 'sample_counts']].drop_duplicates(keep='first') germref1.reset_index(drop=True, inplace=True) titlelist = germline_filter1.columns if len(germline_filter1) != 0: # select Func.refGene !='intronic'和'ncRNA_exonic' germline_filter2 = germline_filter1[ (germline_filter1['Func.refGene'] != 'intronic') & (germline_filter1['Func.refGene'] != 'ncRNA_exonic')] if len(germline_filter2) != 0: # selct AAChange.refGene!='.' germline_filter3 = germline_filter2[germline_filter2['AAChange.refGene'] != '.'] if len(germline_filter3) != 0: # select gnomAD_exome_EAS<=1% print('step6---select the vaf of genomAD') filterdata4_0 = germline_filter3[(germline_filter3['gnomAD_exome_EAS'] != '.')] # 6.1提取频率小于1% filterdata4_1 = filterdata4_0[(filterdata4_0['gnomAD_exome_EAS'].astype("float") <= 0.01)] # 6.2提取没有突变频率的 filterdata4_2 = germline_filter3[(germline_filter3['gnomAD_exome_EAS'] == '.')] # 6.3将小于等于1%和没有突变频率的合并 filterdata4_3 = filterdata4_2.append(filterdata4_1) germline_filter4 = filterdata4_3.reset_index(drop=True) germline_filter4.reset_index(drop=True, inplace=True) if len(germline_filter4) != 0: ##remove the mutation in STR region germbed = germline_filter4[['Chr', 'Start', 'End', 'Gene.refGene']] outputname1 = os.path.join(germdir, sampleid + '_germline.bed') germbed.to_csv(outputname1, index=False, header=None, encoding='gbk', sep='\t') outputname2 = os.path.join(germdir, sampleid + '_germline_unSTR.bed') STRcmd = 'bedtools subtract -a ' + outputname1 + ' -b ' + STRdir + '>' + outputname2 os.system(STRcmd) germline_unSTR = pd.read_table(outputname2, sep='\t', names=['Chr', 'Start', 'End', 'Gene.refGene']) if len(germline_unSTR) != 0: germline_unSTR[['Chr', 'Start', 'End']] = germline_unSTR[['Chr', 'Start', 'End']].astype('str') germline_filter4[['Chr', 'Start', 'End']] = germline_filter4[['Chr', 'Start', 'End']].astype( 'str') germline_filter5 = pd.merge(germline_filter4, germline_unSTR, on=['Chr', 'Start', 'End', 'Gene.refGene'], how='inner') germline_filter5_1 = germline_filter5[titlelist] germline_filter5_1['mut_new'] = germline_filter5_1['Chr'] + '_' + germline_filter5_1[ 'Start'] + '_' + germline_filter5_1['End'] + '_' + germline_filter5_1['Ref'] + '_' + \ germline_filter5_1['Alt'] germline_filter_result = pd.merge(germline_filter5_1, germref1, on=['mut_new'], how='left') del germline_filter_result['mut_new'] for i in range(len(germline_filter_result)): Func_refGene = germline_filter_result.loc[i, 'Func.refGene'] ExonicFunc_refGene = germline_filter_result.loc[i, 'ExonicFunc.refGene'] if (Func_refGene == 'splicing') & (ExonicFunc_refGene == '.'): germline_filter_result.loc[i, 'ExonicFunc.refGene'] = 'splicing' germline_filter_result.sort_values(by='Gene.refGene', inplace=True) else: print(sampleid + ' has no result when filter the STR') germline_filter_result = pd.DataFrame(columns=titlelist) germline_filter_result.loc[0, 'sampleid'] = sampleid germline_filter_result.loc[0, 'label'] = 'No result after filtering the STR' os.remove(outputname1) os.remove(outputname2) # print(germline_filter_result['AAChange_select']) else: print(sampleid + ' has no result when filter the genomeAD_EAS') germline_filter_result = pd.DataFrame(columns=titlelist) germline_filter_result.loc[0, 'sampleid'] = sampleid germline_filter_result.loc[0, 'label'] = 'No result after filtered by genomeAD_EAS' else: print(sampleid + ' has no result when filtering the intronic and ncRNA_exonic') germline_filter_result = pd.DataFrame(columns=titlelist) germline_filter_result.loc[0, 'sampleid'] = sampleid germline_filter_result.loc[0, 'label'] = 'No result after filtering the intronic and ncRNA_exonic' else: print(sampleid + ' has no result aftere vaf filtering') germline_filter_result = pd.DataFrame(columns=titlelist) germline_filter_result.loc[0, 'sampleid'] = sampleid germline_filter_result.loc[0, 'label'] = 'No result after filtering VAF>=0.3' return germline_filter_result ####获得vep注释的突变类型信息,包括突变类型以及其他突变信息 def AAchange_new_infor_vep(svdata): data_INFO_1 = svdata.loc[:, 'Otherinfo11'].str.split("|", expand=True) data_INFO_1.fillna("", inplace=True) result_num = 0 for i in [i1 for i1 in range(1, data_INFO_1.shape[1], 41)]: data_INFO_2 = pd.DataFrame() data_INFO_2["AAChange.refGene_vepraw"] =data_INFO_1.iloc[:, i + 0] + ":" + data_INFO_1.iloc[:, i + 2] + ":" + \ data_INFO_1.iloc[:, i + 5] + ":" + \ "exon" + data_INFO_1.iloc[:, i + 7].str.split("/", expand=True)[0] + ":" + \ "intron" + data_INFO_1.iloc[:, i + 8].str.split("/", expand=True)[0] + ":" data_INFO_3 = data_INFO_1.iloc[:, i + 9].str.split(":", expand=True) data_INFO_4 = data_INFO_1.iloc[:, i + 10].str.split(":", expand=True) if data_INFO_3.shape[1] == 1: data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"] + data_INFO_3[0] + ":" else: data_INFO_3.fillna("", inplace=True) data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"] + data_INFO_3[1] + ":" if data_INFO_4.shape[1] == 1: data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"] + data_INFO_4[0] else: data_INFO_4.fillna("", inplace=True) data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"] + data_INFO_4[1] data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"].str.replace("exon:", "") data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"].str.replace("intron:", "") data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"].str.replace("::", ":") data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"].str.replace(":$", "") if result_num == 0: svdata["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"] + ";" result_num = 1 else: svdata["AAChange.refGene_vepraw"] = svdata["AAChange.refGene_vepraw"] + data_INFO_2["AAChange.refGene_vepraw"] + ";" svdata["AAChange.refGene_vepraw"] = svdata["AAChange.refGene_vepraw"].str.replace(":;", "") svdata["AAChange.refGene_vepraw"] = svdata["AAChange.refGene_vepraw"].str.replace(";$", "") # 需要将氨基酸改变缩写改为单字母的简写,终止突变Ter改为* AA_path = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/AA_dict.txt' AAdata = pd.read_table(AA_path, sep='\t') for j in range(len(AAdata)): AAdata_abbre = AAdata.loc[j, 'Abbre_name'] AAdata_short = AAdata.loc[j, 'Short_name'] svdata['AAChange.refGene_vepraw'] = svdata['AAChange.refGene_vepraw'].str.replace(AAdata_abbre, AAdata_short) return svdata #对获得的结果进一步校正 def AAselect_modify(inputpath,normalid,germdata_result): germdir = os.path.join(inputpath, '4Germline_unpair') # 读入vep注释 vepdir = os.path.join(germdir, normalid + '.hg19_multianno.txt') if os.path.exists(vepdir) and os.path.getsize(vepdir) != 0: vepdata_raw = pd.read_table(vepdir, sep='\t', header=0, low_memory=False) vepdata1_raw = AAchange_new_infor_vep(vepdata_raw) vepcolumns = ['Chr', 'Start', 'End', 'Ref', 'Alt', 'AAChange.refGene_vepraw'] vepdata2_raw = vepdata1_raw[vepcolumns] vepdata2_raw['Chr'] = vepdata2_raw['Chr'].astype(str) vepdata2_raw['Start'] = vepdata2_raw['Start'].astype(str) vepdata2_raw['End'] = vepdata2_raw['End'].astype(str) # 和过滤后结果合并 germlinedata = pd.merge(germdata_result, vepdata2_raw, on=['Chr', 'Start', 'End', 'Ref', 'Alt'], how='left') #唯一转录本信息校正 for i in range(len(germlinedata)): veprawanno = germlinedata.loc[i, 'AAChange.refGene_vepraw'] gene = germlinedata.loc[i, 'Gene.refGene'] # 首先对唯一转录本进行校正 transid_path = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/Select_RefSeq_HGNC_MANE.txt' transiddata = pd.read_table(transid_path, sep='\t') veprawanno1 = veprawanno.split(';') gene_trans_infor = transiddata[transiddata['Symbol'] == gene] gene_trans_infor.reset_index(drop=True, inplace=True) #以RefSeq的为主 gene_trans_unique = gene_trans_infor.loc[0, 'RefSeq'] # 获得唯一转录本对应的结果 tranid_vepinfor = [s for s in veprawanno1 if gene_trans_unique in s] #如果没有获得唯一转录本的结果,从其他转录本中获得 if len(tranid_vepinfor)==0: AAchange=germlinedata.loc[i, 'AAChange.refGene_1'] newchange=transid_select_AAchange(AAchange,gene) gene_trans_unique=newchange.split(':')[1] tranid_vepinfor = [s for s in veprawanno1 if gene_trans_unique in s] #当注释的是intronic时候,需要把exonicFunc替换为vepanno里面的注释情况 # 对Func.refGene的替换,如果vep注释信息有intron,那么写做intronic Func_label = tranid_vepinfor[0].split(':')[3] if 'intron' in Func_label: germlinedata.loc[i, 'Func.refGene'] = 'intronic' Func_vep = tranid_vepinfor[0].split(':')[0] # 将annovar注释的功能替换为vep注释的功能 germlinedata.loc[i, 'ExonicFunc.refGene'] = Func_vep ##将AAChange_select进行替换 vepselect = tranid_vepinfor[0].split(':') del (vepselect[0]) germlinedata.loc[i, 'AAChange_select'] = ':'.join(vepselect) #对移码突变进行转换,如由p.P1353Qfs*89改成p.P1353fs # 基因也进行替换 germlinedata.loc[i, 'Gene.refGene'] = vepselect[0] return germlinedata def germline_summary_control(inputpath,laneid,normalid,sampleid): germtitle_dir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/germline_title.txt' #panel genelist panel_genelistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/602gene_select.txt' panel_genelist = pd.read_table(panel_genelistdir, sep='\t', header=0, names=['Gene.refGene']) #druglist druglistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_genelist_20220829.txt' drug_genelist = pd.read_table(druglistdir, sep='\t', header=None, names=['Gene.refGene']) drug_genelist['drug_lable'] = 'Drug_gene' germtile = pd.read_table(germtitle_dir, sep='\t', header=0) titlelist_final = list(germtile.columns) titlelist_final.append('sample_counts') #对输出总表加上AAchange_1和AAchange_2 #titlelist_summary=list(titlelist_final) #titlelist_summary.append('AAChange.refGene_1') #titlelist_summary.append('AAChange.refGene_2') germdir = os.path.join(inputpath, '4Germline_unpair') germdatafile=germdir+'/'+normalid+'.germ.xls' #读入vep注释 vepdata1 = vepanno(inputpath,normalid) if os.path.exists(germdatafile) and os.path.getsize(germdatafile) !=0: #sampleid = normalid[:-2] # make the result dir result_dir = os.path.join(inputpath, 'resultfile') if not os.path.exists(result_dir): os.mkdir(result_dir) sample_dir = os.path.join(result_dir, sampleid) if not os.path.exists(sample_dir): os.mkdir(sample_dir) tempfile=os.path.join(inputpath,'tempfile') if not os.path.exists(tempfile): os.mkdir(tempfile) germ_tem_dir=os.path.join(tempfile,'germline') if not os.path.exists(germ_tem_dir): os.mkdir(germ_tem_dir) germdata_raw = pd.read_table(germdatafile, sep='\t') #先按照基因名排序 germdata_raw.sort_values(by='Gene.refGene', inplace=True) germdata_raw.reset_index(drop=True, inplace=True) #修改列名 germdata_raw.rename(columns={'CLNACC':'CLNALLELEID','CLNDBN':'CLNDN','CLNDSDB':'CLNDISDB','CLINVARREVSTATS':'CLNREVSTAT'},inplace=True) germdata_raw['Chr'] = germdata_raw['Chr'].astype('str') germdata_raw['Start'] = germdata_raw['Start'].astype('str') germdata_raw['End'] = germdata_raw['End'].astype('str') ##对基因进行分列 germdata01 = gene_split(germdata_raw) # 提取在gene列表中的基因 germdata02 = pd.merge(germdata01, panel_genelist, on=['Gene.refGene'], how='inner') germdata02.insert(0, 'sampleid', sampleid) #去除化疗基因 germdata03 = pd.merge(germdata02, drug_genelist, on=['Gene.refGene'], how='left') germdata = germdata03[germdata03['drug_lable'] != 'Drug_gene'] germdata.reset_index(drop=True, inplace=True) ###对CLINSIG进行分列 if len(germdata[germdata['CLINSIG'].str.contains('\\[')]) != 0: germdata05 = germdata['CLINSIG'].str.split("[", expand=True) germdata05[1].fillna('.', inplace=True) germdata['CLNSIG'] = germdata05[0] germdata['CLNSIGCONF'] = germdata05[1].str.replace(']', '') else: germdata['CLNSIG'] = germdata['CLINSIG'] germdata['CLNSIGCONF'] = '.' del germdata['CLINSIG'] del germdata['drug_lable'] print('clinsig') germdata.reset_index(drop=True, inplace=True) #1.add the AAchangedata germdata1=AAchangeselect_plus(germdata) # 2.将c.注释的格式例如c.A8077T改为c.8077A>T germdata2=cdsmut_trans(germdata1) # remove the HLA related gene germdata_result1 = germdata2[~germdata2['Gene.refGene'].str.contains('HLA')] germdata_result1.reset_index(drop=True, inplace=True) # select VAF>=0.3 germline_filter1 = germdata_result1[germdata_result1['VAF'] >= 0.3] # VAF转为% germline_filter1['VAF'] = germline_filter1['VAF'].apply(lambda x: format(x, '.2%')) #进行gnomAD_exome_EAS和STR区域的筛选 germline_filter_result0=snvfilter(inputpath,germline_filter1,sampleid) #对AAchange_select进行annovar和vep注释的合并进一步修改 if len(vepdata1)!=0: germdata_vepanno0 = pd.merge(germline_filter_result0, vepdata1, on=['Chr', 'Start', 'End', 'Ref', 'Alt'], how='left') germdata_vepanno1 = germdata_vepanno0[~germdata_vepanno0['muttype'].str.contains('synonymous')] del germdata_vepanno1['muttype'] germdata_vepanno1.reset_index(drop=True, inplace=True) #进行unique的筛选 germdata_result=AAchange_select_new(germdata_vepanno1) #对移码突变的再次确认修改 germdata_result['AAChange_select'] = (germdata_result['AAChange_select'] + "#").str.replace( 'p\..*fs\*.*?[,#]', rep1) germdata_result['AAChange_select'] = germdata_result['AAChange_select'].str.replace('#$', "") else: germdata_result=germline_filter_result0 #对获得的结果进一步进行信息校正 germdata_result_allinfor = AAselect_modify(inputpath, normalid, germdata_result) ###对移码突变的转换,如由p.P1353Qfs*89改成p.P1353fs germdata_result_allinfor['AAChange_select'] = (germdata_result_allinfor['AAChange_select'] + "#").str.replace('p\..*fs\*.*?[,#]', rep1) germdata_result_allinfor['AAChange_select'] = germdata_result_allinfor['AAChange_select'].str.replace('#$', "") germdata_result_allinfor.rename(columns={'AAChange.refGene': 'AAChange.refGene_annovar', 'AAChange.refGene_1': 'AAChange.refGene'},inplace=True) #输出两种注释结果 outputfile0=os.path.join(germ_tem_dir,laneid + '-' + sampleid + '.germline_raw_infor.txt') germdata_result_allinfor.to_csv(outputfile0,sep='\t',index=False,header=True) #输出规定的列 germline_filter_result=germdata_result_allinfor[titlelist_final] #输出结果 outputfile1 = os.path.join(sample_dir, laneid + '-' + sampleid + '.germline.xlsx') writer = pd.ExcelWriter(outputfile1) germline_filter_result.to_excel(writer, sheet_name='germline', index=False) writer.save() writer.close() else: print(normalid+' has no data or the result is null') germline_filter_result=pd.DataFrame() germline_filter_result.loc[0, 'sampleid'] = sampleid germline_filter_result.loc[0, 'label'] = 'No file' # make the temp dir temp_dir = os.path.join(inputpath, 'tempfile') if not os.path.exists(temp_dir): os.mkdir(temp_dir) bugfile_dir = os.path.join(temp_dir, 'bugfile') if not os.path.exists(bugfile_dir): os.mkdir(bugfile_dir) outputfile2 = os.path.join(bugfile_dir, laneid + '-' + sampleid + '.germline.nofile.log.txt') germline_filter_result.to_csv(outputfile2, index=False, header=True, encoding='gbk', sep='\t') def germline_runmain(inputpath,normalid): laneid = inputpath.split('/')[-1].split('-')[-1] datasummarydir = os.path.join(inputpath, 'datasummary') isExists = os.path.exists(datasummarydir) if not isExists: os.makedirs(datasummarydir) sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt') samplelist = pd.read_table(sampledir, sep='\t', header=0) sampledata = samplelist[samplelist['normal'] == normalid] sampledata.reset_index(drop=True, inplace=True) sampleid = sampledata.loc[0, 'samplename'] germdir = os.path.join(inputpath, '4Germline_unpair') germdatafile = germdir + '/' + normalid + '.germ.xls' if os.path.exists(germdatafile) and os.path.getsize(germdatafile) != 0: germline_summary_control(inputpath, laneid, normalid, sampleid) else: print(normalid + ' is null') if __name__=='__main__': parser = argparse.ArgumentParser(description='filter the germline') parser.add_argument('-i', '--inputpath', type=str, help='the path of lane') parser.add_argument('-s', '--normalid', type=str, help='the normal name of sample') args = parser.parse_args() Inputpath = args.inputpath Normalid = args.normalid germline_runmain(Inputpath,Normalid) ###for all sample in lane def germlinesummary(inputpath,laneid): datasummarydir = os.path.join(inputpath, 'datasummary') isExists = os.path.exists(datasummarydir) if not isExists: os.makedirs(datasummarydir) sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt') samplelist = pd.read_table(sampledir, sep='\t', header=0) germdatasummary = pd.DataFrame() for i in range(len(samplelist)): sampleid = samplelist.loc[i, 'samplename'] normalid = samplelist.loc[i, 'normal'] print(normalid) germdir = os.path.join(inputpath, '4Germline_unpair') germdatafile = germdir + '/' + normalid + '.germ.xls' if os.path.exists(germdatafile) and os.path.getsize(germdatafile) != 0: germre=germline_summary_control(inputpath, laneid, normalid,sampleid) germdatasummary =germdatasummary.append(germre) else: print(normalid+' is null') continue outputname = datasummarydir + '/' + laneid + '_table6_germlinne_datasummary.txt' germdatasummary.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')