datafile_germline_v0_20230522_finish.py 51 KB


  1. import pandas as pd
  2. import os,os.path
  3. import xlrd
  4. import re
  5. import argparse
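######## germline: the input is the white-blood-cell (normal control) sample
# Retention rules: keep VAF >= 30%; drop Func.refGene values intronic and ncRNA_exonic;
# drop variants without mutation annotation; drop variants with population frequency >= 1%.
# Stop codons are written as X in the germline annotation; use * to stay consistent with the existing system.
# Reorder the germline header columns following the example in the attachment, and report VAF as a percentage.
# Remove HLA-related gene mutations.
# Sort the output by gene name.
# for the end change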
  13. def rep(a):
  14. # print(a.group(0))
  15. st = re.sub("x","*",a.group(0),flags=re.I)
  16. st = re.sub("#","",st,flags=re.I)
  17. return st
  18. #frameshift replace
  19. def rep1(a):
  20. st = re.sub("fs\*.*", "", a.group(0), flags=re.I)
  21. st1 = re.split("(\d+)", st)
  22. if "," in a.group(0):
  23. st2 = st1[0] + st1[1] + "fs,"
  24. else:
  25. st2 = st1[0] + st1[1] + "fs"
  26. return st2
  27. ###当由多个基因构成时进行拆分
  28. ###当Gene.refGene由多个基因注释构成,且这几个基因是在我们的genelist中,我们需要的是将基因拆分,然后对应的AAchange信息保留
  29. def gene_split(svdata):
  30. df_name = svdata['Gene.refGene'].str.split(';', expand=True)
  31. # 三、把行转列成列
  32. df_name = df_name.stack()
  33. # 四、重置索引,并删除多于的索引
  34. df_name = df_name.reset_index(level=1, drop=True)
  35. # 五、与原始数据合并
  36. df_name.name = 'df_name1'
  37. df_new = svdata.drop(['Gene.refGene'], axis=1).join(df_name)
  38. df_new.rename(columns={'df_name1': 'Gene.refGene'}, inplace=True)
  39. df_new.reset_index(drop=True, inplace=True)
  40. return df_new
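### 1. Selecting the primary transcript.
# Extract the data from 'AAChange.refGene'.
# When 'AAChange_select' is empty:
# 1) if AAChange.refGene holds the mutation for a single transcript, assign it directly to AAChange_select;
# 2) if AAChange.refGene holds mutations for several transcripts, choose the longest of those transcripts.
# Variant that takes a gene name as input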
  47. def transid_select_gene(gene):
  48. refseq_transdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/refdata/hg19_refGene_transcript.bed'
  49. refseq = pd.read_table(refseq_transdir, sep='\t', usecols=[0, 1, 2, 6, 7], names=['chr', 'start', 'end', 'transid', 'gene'])
  50. refseq['trans_length'] = refseq['end'] - refseq['start']
  51. genetand = refseq[refseq['gene'] == gene]
  52. maxlength = genetand['trans_length'].max()
  53. trans_unique_infor = genetand[genetand['trans_length'] == maxlength]
  54. trans_unique_infor.reset_index(drop=True, inplace=True)
  55. trans_unique_id = trans_unique_infor.loc[0, 'transid']
  56. return trans_unique_id
  57. #输入的时AAchange信息时.有一个或者多个转录本注释
  58. def transid_select_AAchange(AAchange,AAchange_gene):
  59. AAchangedata0=AAchange.split(',')
  60. AAchangedata0_1=pd.DataFrame()
  61. for i in range(len(AAchangedata0)):
  62. AAchangedata0_1.loc[i,'AAchange']=AAchangedata0[i]
  63. AAchangedata1 =AAchangedata0_1['AAchange'].str.split(':', expand=True)
  64. AAchangedata1['transid'] = AAchangedata1[1].str.split('.', expand=True)[0]
  65. AAchangedata1.rename(columns={0: 'gene'}, inplace=True)
  66. AAchangedata2=AAchangedata1[AAchangedata1['gene']==AAchange_gene]
  67. refseq_transdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/refdata/hg19_refGene_transcript.bed'
  68. refseq = pd.read_table(refseq_transdir, sep='\t', usecols=[0, 1, 2, 6, 7], names=['chr', 'start', 'end', 'transid', 'gene'])
  69. refseq['trans_length'] = refseq['end'] - refseq['start']
  70. ##获得具有转录本编号的信息
  71. AAgene = pd.merge(AAchangedata2, refseq, on=['gene', 'transid'], how='inner')
  72. print(AAgene)
  73. #如果结果获得不为空则执行
  74. if len(AAgene) !=0:
  75. maxlength = AAgene['trans_length'].max()
  76. trans_unique_infor = AAgene[AAgene['trans_length'] == maxlength]
  77. trans_unique_infor.reset_index(drop=True, inplace=True)
  78. trans_unique_id = trans_unique_infor.loc[0, 'transid']
  79. geneinfor = AAchange_gene + ':' + trans_unique_id
  80. AAchange_select0 = [s for s in AAchangedata0 if geneinfor in s]
  81. AAchange_select=AAchange_select0[0]
  82. elif (len(AAgene) ==0) & (len(AAchangedata2)==1):
  83. AAchange_select =AAchange[0]
  84. elif (len(AAgene) ==0) & (len(AAchangedata2)>1):
  85. AAchange_select = AAchangedata0[0]
  86. return AAchange_select
  87. ###对于冷门基因的转录本信息
  88. def transid_select_AAchange(AAchange,AAchange_gene):
  89. AAchangedata0=AAchange.split(';')
  90. AAchangedata0_1=pd.DataFrame()
  91. for i in range(len(AAchangedata0)):
  92. AAchangedata0_1.loc[i,'AAchange']=AAchangedata0[i]
  93. AAchangedata1 =AAchangedata0_1['AAchange'].str.split(':', expand=True)
  94. AAchangedata1['transid'] = AAchangedata1[1].str.split('.', expand=True)[0]
  95. AAchangedata1.rename(columns={0: 'gene'}, inplace=True)
  96. AAchangedata1_0 = AAchangedata1[AAchangedata1['gene'] == AAchange_gene]
  97. # 当注释信息和基因匹配时才进行如下,如果结果不匹配,表明的时该基因和注释的基因是同意义,需要将注释的基因转为原来的
  98. if len(AAchangedata1_0) == 0:
  99. AAchangedata1['gene'] = AAchange_gene
  100. AAchangedata2 = AAchangedata1[AAchangedata1['gene'] == AAchange_gene]
  101. refseq_transdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/refdata/hg19_refGene_transcript.bed'
  102. refseq = pd.read_table(refseq_transdir, sep='\t', usecols=[0, 1, 2, 6, 7], names=['chr', 'start', 'end', 'transid', 'gene'])
  103. refseq['trans_length'] = refseq['end'] - refseq['start']
  104. ##获得具有转录本编号的信息
  105. AAgene = pd.merge(AAchangedata2, refseq, on=['gene', 'transid'], how='inner')
  106. print(AAgene)
  107. #如果结果获得不为空则执行
  108. if len(AAgene) !=0:
  109. maxlength = AAgene['trans_length'].max()
  110. trans_unique_infor = AAgene[AAgene['trans_length'] == maxlength]
  111. trans_unique_infor.reset_index(drop=True, inplace=True)
  112. trans_unique_id = trans_unique_infor.loc[0, 'transid']
  113. geneinfor = AAchange_gene + ':' + trans_unique_id
  114. AAchange_select0 = [s for s in AAchangedata0 if geneinfor in s]
  115. AAchange_select=AAchange_select0[0]
  116. elif (len(AAgene) ==0) & (len(AAchangedata2)==1):
  117. AAchange_select =AAchange[0]
  118. elif (len(AAgene) ==0) & (len(AAchangedata2)>1):
  119. AAchange_select = AAchangedata0[0]
  120. return AAchange_select
  121. ####1.对AAchange_select的补充
  122. def AAchangeselect_plus(germdata):
  123. for j in range(len(germdata)):
  124. #print(j)
  125. gene = germdata.loc[j, 'Gene.refGene']
  126. if pd.isnull(germdata.loc[j, 'AAChange_select']):
  127. germdata.loc[j, 'AAChange_select']=''
  128. if ( ',' in germdata.loc[j, 'AAChange_select']) and (pd.notnull(germdata.loc[j, 'AAChange_select'])):
  129. AAChange_selectinfor = germdata.loc[j, 'AAChange_select'].split(',')
  130. AAChange_selectinfor1 = [s for s in AAChange_selectinfor if gene+':' in s]
  131. germdata.loc[j, 'AAChange_select']=AAChange_selectinfor1
  132. AAchangedata = germdata.loc[j, 'AAChange.refGene']
  133. #print(AAchangedata)
  134. if (AAchangedata.count(',') == 0) & (AAchangedata != '.') & (germdata.loc[j, 'AAChange_select']==''):
  135. germdata.loc[j, 'AAChange_select'] = germdata.loc[j, 'AAChange.refGene']
  136. elif (AAchangedata.count(',') != 0) & (AAchangedata != '.') & (germdata.loc[j, 'AAChange_select']==''):
  137. gene_transid_unque = transid_select_AAchange(AAchangedata, gene)
  138. AAchange_infor = AAchangedata.split(',')
  139. #geneinfor = gene + ':' + gene_transid_unque
  140. geneinfor = gene_transid_unque
  141. AAchange_select = [s for s in AAchange_infor if geneinfor in s]
  142. if len(AAchange_select) != 0:
  143. germdata.loc[j, 'AAChange_select'] = AAchange_select[0]
  144. else:
  145. germdata.loc[j, 'AAChange_select'] = ''
  146. return germdata
  147. #germdata1=AAchangeselect_plus(germdata)
  148. # 2.将c.注释的格式例如c.A8077T改为c.8077A>T
  149. #germdata2=cdsmut_trans(germdata1)
  150. #####2.将c.注释的格式例如c.A8077T改为c.8077A>T
  151. #3.终止密码止原来用X表示,转换后用*表示
  152. ###4.对移码突变的转换,如由p.P1353Qfs*89改成p.P1353fs
  153. def cdsmut_trans(germdata):
  154. newchange = germdata['AAChange_select'].str.split(":", expand=True)
  155. newchange['c1'] = newchange[3].str.split('.', expand=True)[1]
  156. for i in range(len(newchange)):
  157. if pd.notnull(newchange.loc[i, 'c1']):
  158. if ('ins' not in newchange.loc[i, 'c1']) & ('del' not in newchange.loc[i, 'c1']):
  159. new = 'c.' + re.findall("\d+\.?\d*", newchange.loc[i, 'c1'])[0] + re.sub("[0-9]+", '>',
  160. newchange.loc[i, 'c1'])
  161. newchange.loc[i, 'new_AA'] = new
  162. else:
  163. new = 'c.' + newchange.loc[i, 'c1']
  164. newchange.loc[i, 'new_AA'] = new
  165. else:
  166. newchange.loc[i, 'new_AA'] = ''
  167. germdata['new_select'] = newchange[0] + ':' + newchange[1] + ':' + newchange[2] + ':' + newchange['new_AA'] + ':' + newchange[4]
  168. # 终止密码止原来用X表示,转换后用*表示
  169. germdata['AAChange.refGene'] = (germdata['AAChange.refGene'] + "#").str.replace('p\..*?[,#]', rep)
  170. germdata['AAChange.refGene'] = germdata['AAChange.refGene'].str.replace('#$', "")
  171. germdata['new_select'] = (germdata['new_select'] + "#").str.replace('p\..*?[,#]', rep)
  172. germdata['new_select'] = (germdata['new_select']).str.replace('#$', "")
  173. # print(germline_filter4['AAChange_select'])
  174. ###对移码突变的转换,如由p.P1353Qfs*89改成p.P1353fs
  175. germdata['AAChange.refGene'] = (germdata['AAChange.refGene'] + "#").str.replace('p\..*fs\*.*?[,#]', rep1)
  176. germdata['AAChange.refGene'] = germdata['AAChange.refGene'].str.replace('#$', "")
  177. germdata['new_select'] = (germdata['new_select'] + "#").str.replace('p\..*fs\*.*?[,#]', rep1)
  178. germdata['new_select'] = (germdata['new_select']).str.replace('#$', "")
  179. return germdata
  180. ####对AAchange列进一步,提取经过vep和annovar注释后的Otherinfo11这列信息
  181. #并将氨基酸改变缩写改为单字母的简写,终止突变Ter改为*
  182. def AAchange_new_infor(svdata):
  183. data_INFO_1 = svdata.loc[:, 'Otherinfo11'].str.split("|", expand=True)
  184. data_INFO_1.fillna("", inplace=True)
  185. result_num = 0
  186. for i in [i1 for i1 in range(1, data_INFO_1.shape[1], 41)]:
  187. data_INFO_2 = pd.DataFrame()
  188. data_INFO_2["AAChange.refGene_1"] = data_INFO_1.iloc[:, i + 2] + ":" + \
  189. data_INFO_1.iloc[:, i + 5] + ":" + \
  190. "exon" + data_INFO_1.iloc[:, i + 7].str.split("/", expand=True)[0] + ":" + \
  191. "intron" + data_INFO_1.iloc[:, i + 8].str.split("/", expand=True)[0] + ":"
  192. data_INFO_3 = data_INFO_1.iloc[:, i + 9].str.split(":", expand=True)
  193. data_INFO_4 = data_INFO_1.iloc[:, i + 10].str.split(":", expand=True)
  194. if data_INFO_3.shape[1] == 1:
  195. data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"] + data_INFO_3[0] + ":"
  196. else:
  197. data_INFO_3.fillna("", inplace=True)
  198. data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"] + data_INFO_3[1] + ":"
  199. if data_INFO_4.shape[1] == 1:
  200. data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"] + data_INFO_4[0]
  201. else:
  202. data_INFO_4.fillna("", inplace=True)
  203. data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"] + data_INFO_4[1]
  204. data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"].str.replace("exon:", "")
  205. data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"].str.replace("intron:", "")
  206. data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"].str.replace("::", ":")
  207. data_INFO_2["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"].str.replace(":$", "")
  208. if result_num == 0:
  209. svdata["AAChange.refGene_1"] = data_INFO_2["AAChange.refGene_1"] + ";"
  210. result_num = 1
  211. else:
  212. svdata["AAChange.refGene_1"] = svdata["AAChange.refGene_1"] + data_INFO_2["AAChange.refGene_1"] + ";"
  213. svdata["AAChange.refGene_1"] = svdata["AAChange.refGene_1"].str.replace(":;", "")
  214. svdata["AAChange.refGene_1"] = svdata["AAChange.refGene_1"].str.replace(";$", "")
  215. # 需要将氨基酸改变缩写改为单字母的简写,终止突变Ter改为*
  216. AA_path = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/AA_dict.txt'
  217. AAdata = pd.read_table(AA_path, sep='\t')
  218. for j in range(len(AAdata)):
  219. AAdata_abbre = AAdata.loc[j, 'Abbre_name']
  220. AAdata_short = AAdata.loc[j, 'Short_name']
  221. svdata['AAChange.refGene_1'] = svdata['AAChange.refGene_1'].str.replace(AAdata_abbre, AAdata_short)
  222. return svdata
  223. ##将vep和annovar注释结果进行整合
  224. def vepannovar_merge(AAchange_raw,AAchange_new,AAchange_gene):
  225. '''
  226. vep and vardict anno merge
  227. 输入数据均为字符串
  228. :param AAchange_raw: the AAchange.refgene from annovar anno of the data,one str。
  229. :param AAchange_new: the AAchange.refgene_1 from the vep anno of the data,one str
  230. :return: the modified AAchange,str
  231. '''
  232. ###1.当AAchange_new中没有c.和p.信息时候,需要从AAchange_-raw中提取出来补全
  233. ###1.1对AAchange_raw按照符合分列。
  234. AAchangeraw1 = AAchange_raw.split(',')
  235. ###1.2对AAchane_new按照符号分列
  236. AAchangenew0 = AAchange_new.split(';')
  237. ##从AAchane_new选择Gene.refGene在内的
  238. AAchangenew1 = [s for s in AAchangenew0 if AAchange_gene in s]
  239. if len(AAchangenew1) != 0:
  240. # 1.2.1循环分列后的AAchangenew1
  241. # 首先从割裂后的结果中,找到与AAchange_gene一致的突变
  242. # 然后从raw中查找对应的c.和p.信息。
  243. # 如果c.信息都不能找到,那么这个转录本对应的突变信息就设置为空。
  244. AAchangenew2 = ''
  245. for j in range(len(AAchangenew1)):
  246. #print(j)
  247. AAchangenew1_0 = AAchangenew1[j]
  248. #print(AAchangenew1_0)
  249. #如果信息中跨外显子的,那只提取外显子的
  250. if ('exon' in AAchangenew1_0) & ('intron' in AAchangenew1_0):
  251. AAchangenew1_1=re.sub('intron\d.*?:', "",AAchangenew1_0)
  252. else:
  253. AAchangenew1_1 =AAchangenew1_0
  254. #从AAchangeraw中把AAchangenew基因转录本对应的信息抓取出来
  255. id0 = AAchangenew1_1.split(':')
  256. id1 = id0[0] + ':' + id0[1].split('.')[0]
  257. select0 = [s for s in AAchangeraw1 if id1 in s]
  258. #print(select0)
  259. #3.当提取的信息不为空时,进行格式转换。如果抓取信息为空时,保留原来注释信息
  260. #c.注释的格式的转换:如c.A8077T改为c.8077A>T。由于AAchangeraw是annovar注释的,而AAchangenew时VEP注释
  261. if len(select0) != 0:
  262. # 如果信息中跨外显子的,那只提取外显子的
  263. if ('exon' in select0[0]) & ('intron' in select0[0]):
  264. select = re.sub('intron\d.*?:', "", select0[0])
  265. else:
  266. select = select0[0]
  267. select1 = select.split(':')
  268. #从分割后的list中体取出c.对应的信息
  269. cdsinfor0=[s for s in select1 if 'c.' in s]
  270. #如果能够找到c.信息,那么进行替换。如果找不到c.信息。那么新赋值的注释为空。我们只提取包含c.的,对于注释为n.的舍弃
  271. if len(cdsinfor0)!=0:
  272. changeinfro_select1=cdsinfor0[0].split(".")[1]
  273. if ('ins' not in changeinfro_select1) & ('del' not in changeinfro_select1)& ('dup' not in changeinfro_select1) & ('inv' not in changeinfro_select1):
  274. cdsinfor1 = 'c.' + re.findall("\d+\.?\d*", changeinfro_select1)[0] + re.sub("[0-9]+", '>',changeinfro_select1)
  275. else:
  276. cdsinfor1='c.'+changeinfro_select1
  277. # 从分隔后的list中提取楚p.对应的信息
  278. ppinfor0 = [s for s in select1 if 'p.' in s]
  279. if len(ppinfor0) != 0:
  280. ppinfor1 = ppinfor0[0]
  281. else:
  282. ppinfor1 = ''
  283. #获得新的注释
  284. annonew=id0[0] + ':' + id0[1]+':'+select1[2]+':'+cdsinfor1+':'+ppinfor1
  285. else:
  286. annonew =''
  287. # 4.对AAchangenew进行信息的填补
  288. # 如果AAchangenew没有c.信息时,那么新的赋值就用AAchangeraw里面
  289. if 'c.' not in AAchangenew1_1:
  290. AAchangenew2 = AAchangenew2 + annonew + ';'
  291. elif ('c.' in AAchangenew1_1) & ('p.' not in AAchangenew1_1):
  292. AAchangenew2 = AAchangenew2 + annonew + ';'
  293. else:
  294. AAchangenew2 = AAchangenew2 + AAchangenew1_1 + ';'
  295. else:
  296. AAchangenew2=AAchangenew2+AAchangenew1_1+';'
  297. AAchangenew3 = re.sub(r"(;)\1+", '', AAchangenew2)
  298. if AAchangenew3[-1] == ';':
  299. AAchangenew3_0 = AAchangenew3[:-1]
  300. if AAchangenew3_0[-1] ==':':
  301. AAchangenew4 = AAchangenew3_0[:-1]
  302. else:
  303. AAchangenew4 = AAchangenew3_0
  304. else:
  305. AAchangenew4 = AAchangenew3
  306. else:
  307. AAchangenew4 = AAchange_raw
  308. return AAchangenew4
  309. ###改,先对annovar注释的AAchange列进行格式转换
  310. ####对annovar注释的信息进行格式修改
  311. def AAchange_annovar_transdata(AAchange_annovar):
  312. '''
  313. :param AAchange_annovar: 输入为annovar注释的AAchange.refgene列的一行
  314. :return: 经过校正后的
  315. '''
  316. if (AAchange_annovar!='.') & (AAchange_annovar!='UNKNOWN'):
  317. annovarsplit = AAchange_annovar.split(',')
  318. annovar_new1 = ''
  319. for j in range(len(annovarsplit)):
  320. label_cpoint = annovarsplit[j].find('c.')
  321. label_ppoint = annovarsplit[j].find('p.')
  322. if (label_cpoint != -1) & (label_ppoint != -1):
  323. changeinfro_select1 = annovarsplit[j][(label_cpoint + 2):(label_ppoint - 1)]
  324. aainfor = annovarsplit[j][label_ppoint:]
  325. if ('ins' not in changeinfro_select1) & ('del' not in changeinfro_select1) & (
  326. 'dup' not in changeinfro_select1) & ('inv' not in changeinfro_select1):
  327. cdsinfor1 = 'c.' + re.findall("\d+\.?\d*", changeinfro_select1)[0] + re.sub("[0-9]+", '>',
  328. changeinfro_select1)
  329. else:
  330. cdsinfor1 = 'c.' + changeinfro_select1
  331. # annovar_new00 = annovarsplit[j][:label_cpoint] + cdsinfor1 + ':' + aainfor
  332. # 对终止密码子的替换
  333. # annovar_new00 = annovar_new00 + "@"
  334. # annovar_new0 = re.sub("p\..+?[,@]", rep2, annovar_new00)
  335. if 'X' in aainfor:
  336. aainfor1 = aainfor.replace('X', '*')
  337. else:
  338. aainfor1 = aainfor
  339. annovar_new0 = annovarsplit[j][:label_cpoint] + cdsinfor1 + ':' + aainfor1
  340. elif (label_cpoint != -1) & (label_ppoint == -1):
  341. changeinfro_select1 = annovarsplit[j][(label_cpoint + 2):(label_ppoint - 1)]
  342. if ('ins' not in changeinfro_select1) & ('del' not in changeinfro_select1):
  343. cdsinfor1 = 'c.' + re.findall("\d+\.?\d*", changeinfro_select1)[0] + re.sub("[0-9]+", '>',
  344. changeinfro_select1)
  345. else:
  346. cdsinfor1 = 'c.' + changeinfro_select1
  347. annovar_new0 = annovarsplit[j][:label_cpoint] + cdsinfor1
  348. else:
  349. annovar_new0 = annovarsplit[j]
  350. annovar_new1 = annovar_new0 + ',' + annovar_new1
  351. annovar_new_last = annovar_new1[:-1]
  352. else:
  353. annovar_new_last=AAchange_annovar
  354. return annovar_new_last
  355. def AAchange_merge(svdata):
  356. svdata.reset_index(drop=True, inplace=True)
  357. for i in range(len(svdata)):
  358. #print(i)
  359. svdata.loc[i, 'AAChange.refGene'] = AAchange_annovar_transdata(svdata.loc[i, 'AAChange.refGene'])
  360. AAchange_raw = svdata.loc[i, 'AAChange.refGene']
  361. AAchange_new = svdata.loc[i, 'AAChange.refGene_1']
  362. AAchange_gene = svdata.loc[i, 'Gene.refGene']
  363. exonfunc = svdata.loc[i, 'ExonicFunc.refGene']
  364. svdata.loc[i, 'AAChange.refGene_2'] = vepannovar_merge(AAchange_raw, AAchange_new,AAchange_gene)
  365. # 对AAChange.refGene_2进一步修改
  366. new2 = svdata.loc[i, 'AAChange.refGene_2']
  367. new21 = new2.split(';')
  368. if exonfunc == 'stopgain':
  369. # for stopgain,extract the mut including *
  370. new2_extract = [s for s in new21 if '*' in s]
  371. newchange = ';'.join(new2_extract)
  372. elif exonfunc == 'startloss':
  373. # for the startloss,extract the mut including ?
  374. new2_extract = [s for s in new21 if '?' in s]
  375. newchange = ';'.join(new2_extract)
  376. else:
  377. newchange = new2
  378. # 将%3D替换为=
  379. newchange_re = newchange.replace('%3D', '=')
  380. svdata.loc[i, 'AAChange.refGene_2'] = newchange_re
  381. return svdata
  382. #######对转录本和突变表示形式的修改
  383. ####修改:去除化疗相关位点
  384. ###读入vep注释信息,方便对同义突变进一步筛选
  385. def vepanno(inputpath,normalid):
  386. germdir = os.path.join(inputpath, '4Germline_unpair')
  387. # 读入vep注释
  388. vepdir = os.path.join(germdir, normalid + '.hg19_multianno.txt')
  389. if os.path.exists(vepdir) and os.path.getsize(vepdir) != 0:
  390. vepdata = pd.read_table(vepdir, sep='\t', header=0, low_memory=False)
  391. vepdata1 = AAchange_new_infor(vepdata)
  392. vepdata2 = AAchange_merge(vepdata1)
  393. #提取信息
  394. vepdata2['muttype'] = vepdata2['Otherinfo11'].str.split('|', expand=True)[1]
  395. vepdata3 = vepdata2[['Chr', 'Start', 'End', 'Ref', 'Alt', 'muttype', 'AAChange.refGene_1', 'AAChange.refGene_2']]
  396. vepdata3['Chr'] = vepdata3['Chr'].astype('str')
  397. vepdata3['Start'] = vepdata3['Start'].astype('str')
  398. vepdata3['End'] = vepdata3['End'].astype('str')
  399. return vepdata3
  400. ####改对unique的结果,根据vep的结果进一步校正。
  401. ####2.对于ExonicFunc.refGene为startloss选择结尾带?的突变信息;当为stopgain,选择下划线后面的信息。
  402. #对AAChange_select进一步核对
  403. def AAchange_select_new_test(germdata1):
  404. for i in range(len(germdata1)):
  405. aaannovar = germdata1.loc[i, 'AAChange_select'].split(':')
  406. aavep = germdata1.loc[i, 'AAChange.refGene_2'].split(';')
  407. if germdata1.loc[i,'ExonicFunc.refGene']=='startloss':
  408. result_aa = [s for s in aavep if '?' in s]
  409. germdata1.loc[i, 'AAChange_select'] = result_aa[0]
  410. elif germdata1.loc[i,'ExonicFunc.refGene']=='stopgain':
  411. result_aa = [s for s in aavep if '*' in s]
  412. germdata1.loc[i, 'AAChange_select'] = result_aa[0]
  413. else:
  414. seachgene = aaannovar[0] + ':' + aaannovar[1]
  415. result_aa = [s for s in aavep if seachgene in s]
  416. if len(result_aa) != 0:
  417. germdata1.loc[i, 'AAChange_select'] = result_aa[0]
  418. return germdata1
  419. ###对stopgain进一步修改
  420. def AAchange_select_new_raw(germdata1):
  421. for i in range(len(germdata1)):
  422. print(i)
  423. aaannovar = germdata1.loc[i, 'AAChange_select'].split(':')
  424. if len(germdata1.loc[i, 'AAChange.refGene_2'])!=0:
  425. germdata1.loc[i, 'AAChange.refGene_2'] = germdata1.loc[i, 'AAChange.refGene_2'].replace(',', ';')
  426. aavep = germdata1.loc[i, 'AAChange.refGene_2'].split(';')
  427. if germdata1.loc[i, 'ExonicFunc.refGene'] == 'startloss':
  428. result_aa = [s for s in aavep if '?' in s]
  429. germdata1.loc[i, 'AAChange_select'] = result_aa[0]
  430. elif germdata1.loc[i, 'ExonicFunc.refGene'] == 'stopgain':
  431. result_aa = [s for s in aavep if '*' in s]
  432. # 对于特殊的修改:p.K173_Y174ins* -> Y174ins*
  433. if 'ins*' in result_aa[0]:
  434. splitinfor = result_aa.split(':')
  435. newaa0 = splitinfor[4]
  436. if '_' in newaa0:
  437. newaa1 = newaa0.split('_')
  438. numbefore = int(re.findall("\d+", newaa1[0])[0])
  439. numafter = int(re.findall("\d+", newaa1[1])[0])
  440. if numafter - numbefore == 1:
  441. Pnew = "p." + newaa1[1]
  442. unique_new = splitinfor[0] + ':' + splitinfor[1] + ':' + splitinfor[2] + ':' + splitinfor[
  443. 3] + ':' + Pnew
  444. germdata1.loc[i, 'AAChange_select'] = unique_new
  445. else:
  446. germdata1.loc[i, 'AAChange_select'] = result_aa[0]
  447. else:
  448. germdata1.loc[i, 'AAChange_select'] = result_aa[0]
  449. else:
  450. germdata1.loc[i, 'AAChange_select'] = result_aa[0]
  451. else:
  452. seachgene = aaannovar[0] + ':' + aaannovar[1]
  453. result_aa = [s for s in aavep if seachgene in s]
  454. if len(result_aa) != 0:
  455. germdata1.loc[i, 'AAChange_select'] = result_aa[0]
  456. return germdata1
  457. ####改,当有多个的时候用唯一转录本
  458. def AAchange_select_new_test1(germdata1):
  459. for i in range(len(germdata1)):
  460. print(i)
  461. if len(germdata1.loc[i, 'AAChange.refGene_2'])!=0:
  462. germdata1.loc[i, 'AAChange.refGene_2'] = germdata1.loc[i, 'AAChange.refGene_2'].replace(',', ';')
  463. aavep = germdata1.loc[i, 'AAChange.refGene_2'].split(';')
  464. if germdata1.loc[i, 'ExonicFunc.refGene'] == 'startloss':
  465. result_aa = [s for s in aavep if '?' in s]
  466. germdata1.loc[i, 'AAChange_select'] = result_aa[0]
  467. elif germdata1.loc[i, 'ExonicFunc.refGene'] == 'stopgain':
  468. result_aa = [s for s in aavep if '*' in s]
  469. # 对于特殊的修改:p.K173_Y174ins* -> Y174ins*
  470. if 'ins*' in result_aa[0]:
  471. splitinfor = result_aa.split(':')
  472. newaa0 = splitinfor[4]
  473. if '_' in newaa0:
  474. newaa1 = newaa0.split('_')
  475. numbefore = int(re.findall("\d+", newaa1[0])[0])
  476. numafter = int(re.findall("\d+", newaa1[1])[0])
  477. if numafter - numbefore == 1:
  478. Pnew = "p." + newaa1[1]
  479. unique_new = splitinfor[0] + ':' + splitinfor[1] + ':' + splitinfor[2] + ':' + splitinfor[
  480. 3] + ':' + Pnew
  481. germdata1.loc[i, 'AAChange_select'] = unique_new
  482. else:
  483. germdata1.loc[i, 'AAChange_select'] =(Transid_unique_search(','.join(result_aa))).replace(';','')
  484. else:
  485. germdata1.loc[i, 'AAChange_select'] = (Transid_unique_search(','.join(result_aa))).replace(';','')
  486. else:
  487. # 当不是p.K173_Y174ins*,从中选择唯一转录本所在的
  488. unqiue_trans_select = (Transid_unique_search(','.join(result_aa))).replace(';','')
  489. if len(unqiue_trans_select)!=0:
  490. germdata1.loc[i, 'AAChange_select'] =unqiue_trans_select
  491. else:
  492. germdata1.loc[i, 'AAChange_select'] =result_aa[0]
  493. else:
  494. #首先选择唯一转录本
  495. unqiue_trans_select = (Transid_unique_search(','.join(aavep))).replace(';','')
  496. #如果没有获得唯一转录本的信息,采用AAchange_select原始的
  497. if len(unqiue_trans_select)!=0:
  498. germdata1.loc[i, 'AAChange_select'] =unqiue_trans_select
  499. elif (len(unqiue_trans_select)==0) & (pd.notna(germdata1.loc[i,'AAChange_select'])):
  500. #当原始注释有信息时
  501. aaannovar = germdata1.loc[i, 'AAChange_select'].split(':')
  502. seachgene = aaannovar[0] + ':' + aaannovar[1]
  503. result_aa = [s for s in aavep if seachgene in s]
  504. if len(result_aa) != 0:
  505. germdata1.loc[i, 'AAChange_select'] = result_aa[0]
  506. elif (len(unqiue_trans_select)==0) & (pd.isna(germdata1.loc[i,'AAChange_select'])):
  507. #当唯一转录本和原始注释都没信息时候,采用冷门转录本选择
  508. AAchange_gene=germdata1.loc[i,'Gene.refGene']
  509. AAchange=germdata1.loc[i,'AAChange.refGene_2']
  510. germdata1.loc[i, 'AAChange_select'] =transid_select_AAchange(AAchange,AAchange_gene)
  511. print(germdata1.loc[i, 'AAChange_select'] )
  512. #判断唯一转录本的区域是否和Func.refGene一致,如果不一致按照唯一转录本进行修改
  513. functype=germdata1.loc[i, 'Func.refGene']
  514. if 'intron' in germdata1.loc[i, 'AAChange_select']:
  515. func_unique='intronic'
  516. elif 'exon' in germdata1.loc[i, 'AAChange_select']:
  517. func_unique='exonic'
  518. else:
  519. func_unique='other'
  520. if (func_unique not in functype) & (func_unique!='other'):
  521. germdata1.loc[i, 'Func.refGene'] =func_unique
  522. return germdata1
  523. ##改当有多个stopgain这种p.K173_Y174ins* 突变的时候
  524. def AAchange_select_new(germdata1):
  525. for i in range(len(germdata1)):
  526. print(i)
  527. if len(germdata1.loc[i, 'AAChange.refGene_2'])!=0:
  528. germdata1.loc[i, 'AAChange.refGene_2'] = germdata1.loc[i, 'AAChange.refGene_2'].replace(',', ';')
  529. aavep = germdata1.loc[i, 'AAChange.refGene_2'].split(';')
  530. if germdata1.loc[i, 'ExonicFunc.refGene'] == 'startloss':
  531. result_aa = [s for s in aavep if '?' in s]
  532. germdata1.loc[i, 'AAChange_select'] = result_aa[0]
  533. elif germdata1.loc[i, 'ExonicFunc.refGene'] == 'stopgain':
  534. result_aa = [s for s in aavep if '*' in s]
  535. # 对于特殊的修改:p.K173_Y174ins* -> Y174ins*
  536. if 'ins*' in result_aa[0]:
  537. #首选主要转录本
  538. maintrans=(Transid_unique_search(','.join(result_aa))).replace(';','')
  539. splitinfor = maintrans.split(':')
  540. newaa0 = splitinfor[4]
  541. if '_' in newaa0:
  542. newaa1 = newaa0.split('_')
  543. numbefore = int(re.findall("\d+", newaa1[0])[0])
  544. numafter = int(re.findall("\d+", newaa1[1])[0])
  545. if numafter - numbefore == 1:
  546. Pnew = "p." + newaa1[1]
  547. unique_new = splitinfor[0] + ':' + splitinfor[1] + ':' + splitinfor[2] + ':' + splitinfor[3] + ':' + Pnew
  548. germdata1.loc[i, 'AAChange_select'] = unique_new
  549. else:
  550. germdata1.loc[i, 'AAChange_select'] =maintrans
  551. else:
  552. germdata1.loc[i, 'AAChange_select'] = maintrans
  553. else:
  554. # 当不是p.K173_Y174ins*,从中选择唯一转录本所在的
  555. unqiue_trans_select = (Transid_unique_search(','.join(result_aa))).replace(';','')
  556. if len(unqiue_trans_select)!=0:
  557. germdata1.loc[i, 'AAChange_select'] =unqiue_trans_select
  558. else:
  559. germdata1.loc[i, 'AAChange_select'] =result_aa[0]
  560. else:
  561. #首先选择唯一转录本
  562. unqiue_trans_select = (Transid_unique_search(','.join(aavep))).replace(';','')
  563. #如果没有获得唯一转录本的信息,采用AAchange_select原始的
  564. if len(unqiue_trans_select)!=0:
  565. germdata1.loc[i, 'AAChange_select'] =unqiue_trans_select
  566. elif (len(unqiue_trans_select)==0) & (pd.notna(germdata1.loc[i,'AAChange_select'])):
  567. #当原始注释有信息时
  568. aaannovar = germdata1.loc[i, 'AAChange_select'].split(':')
  569. seachgene = aaannovar[0] + ':' + aaannovar[1]
  570. result_aa = [s for s in aavep if seachgene in s]
  571. if len(result_aa) != 0:
  572. germdata1.loc[i, 'AAChange_select'] = result_aa[0]
  573. elif (len(unqiue_trans_select)==0) & (pd.isna(germdata1.loc[i,'AAChange_select'])):
  574. #当唯一转录本和原始注释都没信息时候,采用冷门转录本选择
  575. AAchange_gene=germdata1.loc[i,'Gene.refGene']
  576. AAchange=germdata1.loc[i,'AAChange.refGene_2']
  577. germdata1.loc[i, 'AAChange_select'] =transid_select_AAchange(AAchange,AAchange_gene)
  578. print(germdata1.loc[i, 'AAChange_select'] )
  579. #判断唯一转录本的区域是否和Func.refGene一致,如果不一致按照唯一转录本进行修改
  580. functype=germdata1.loc[i, 'Func.refGene']
  581. if 'intron' in germdata1.loc[i, 'AAChange_select']:
  582. func_unique='intronic'
  583. elif 'exon' in germdata1.loc[i, 'AAChange_select']:
  584. func_unique='exonic'
  585. else:
  586. func_unique='other'
  587. if (func_unique not in functype) & (func_unique!='other'):
  588. germdata1.loc[i, 'Func.refGene'] =func_unique
  589. return germdata1
  590. ###对唯一转录本的再次选择。
  591. #首选在参考表中的转录本
  592. ###对于热门基因的转录本,采用主转录本
  593. #采用核对后的pan602的转录本信息
  594. def Transid_unique_search(AAchange):
  595. transid_path = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/Select_RefSeq_HGNC_MANE_pan602_20230522.txt'
  596. transiddata = pd.read_table(transid_path, sep='\t')
  597. unique_trans_last = ''
  598. if AAchange != ' ':
  599. AAchange1 = AAchange.split(',')
  600. for j in range(len(AAchange1)):
  601. if AAchange1[j].count(":") == 4:
  602. AAchange1_0 = AAchange1[j].split(':')[1]
  603. AAchange1_exon = AAchange1[j].split(':')[2]
  604. AAchange1_cc = AAchange1[j].split(':')[3]
  605. AAchange1_pp = AAchange1[j].split(':')[4]
  606. #transinfor = transiddata[(transiddata['RefSeq'] == AAchange1_0) | (transiddata['HGNC'] == AAchange1_0) | (transiddata['MANE'] == AAchange1_0)].reset_index(drop=True)
  607. #采用遗传核对后的唯一转录本
  608. transinfor = transiddata[transiddata['Nuprobe_GC_vep'] == AAchange1_0].reset_index(drop=True)
  609. if not transinfor.empty:
  610. unique_trans = transinfor.loc[0, 'Symbol'] + ':' + AAchange1_0 + ':' + AAchange1_exon + ':' + AAchange1_cc + ':' + AAchange1_pp+';'
  611. else:
  612. unique_trans = ''
  613. unique_trans_last = unique_trans_last + unique_trans
  614. elif AAchange1[j].count(":") == 3:
  615. AAchange1_0 = AAchange1[j].split(':')[1]
  616. AAchange1_exon = AAchange1[j].split(':')[2]
  617. AAchange1_cc = AAchange1[j].split(':')[3]
  618. transinfor = transiddata[transiddata['Nuprobe_GC_vep'] == AAchange1_0].reset_index(drop=True)
  619. if not transinfor.empty:
  620. unique_trans = transinfor.loc[ 0, 'Symbol'] + ':' + AAchange1_0 + ':' + AAchange1_exon + ':' + AAchange1_cc+';'
  621. else:
  622. unique_trans = ''
  623. unique_trans_last = unique_trans_last + unique_trans
  624. elif AAchange1[j].count(":") == 2:
  625. AAchange1_0 = AAchange1[j].split(':')[1]
  626. AAchange1_exon = AAchange1[j].split(':')[2]
  627. transinfor = transiddata[transiddata['Nuprobe_GC_vep'] == AAchange1_0].reset_index(drop=True)
  628. if not transinfor.empty:
  629. unique_trans = transinfor.loc[0, 'Symbol'] + ':' + AAchange1_0 + ':' + AAchange1_exon+';'
  630. else:
  631. unique_trans = ''
  632. unique_trans_last = unique_trans_last + unique_trans
  633. else:
  634. unique_trans = ''
  635. unique_trans_last = unique_trans_last + unique_trans
  636. else:
  637. unique_trans_last = ''
  638. return unique_trans_last
  639. ###进行gnomAD_exome_EAS和STR区域的筛选
  640. #改去掉多态性位点mut_ratio大于0.15
  641. def snvfilter(inputpath,germline_filter1,sampleid):
  642. germdir = os.path.join(inputpath, '4Germline_unpair')
  643. STRdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/refdata/UCSC_STR_LCR_simpleRepeat_genomicSuperDups.bed'
  644. germline_filter1.rename(columns={'AAChange_select': 'raw_AAChange_select'}, inplace=True)
  645. germline_filter1.rename(columns={'new_select': 'AAChange_select'}, inplace=True)
  646. germline_filter1.sort_values(by='Gene.refGene', inplace=True)
  647. # for the germline summary data
  648. germdata = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/refdata/germline_counts_stastic20230331.txt'
  649. germref1 = pd.read_table(germdata, sep='\t', header=0)
  650. #germref1 = germref[['mut_new', 'sample_counts']].drop_duplicates(keep='first')
  651. germref1.reset_index(drop=True, inplace=True)
  652. titlelist = germline_filter1.columns
  653. if len(germline_filter1) != 0:
  654. # select Func.refGene !='intronic'和'ncRNA_exonic'
  655. germline_filter2 = germline_filter1[
  656. (germline_filter1['Func.refGene'] != 'intronic') & (germline_filter1['Func.refGene'] != 'ncRNA_exonic')]
  657. if len(germline_filter2) != 0:
  658. # selct AAChange.refGene!='.'
  659. germline_filter3 = germline_filter2[germline_filter2['AAChange.refGene'] != '.']
  660. if len(germline_filter3) != 0:
  661. # select gnomAD_exome_EAS<=1%
  662. print('step6---select the vaf of genomAD')
  663. filterdata4_0 = germline_filter3[(germline_filter3['gnomAD_exome_EAS'] != '.')]
  664. # 6.1提取频率小于1%
  665. filterdata4_1 = filterdata4_0[(filterdata4_0['gnomAD_exome_EAS'].astype("float") <= 0.01)]
  666. # 6.2提取没有突变频率的
  667. filterdata4_2 = germline_filter3[(germline_filter3['gnomAD_exome_EAS'] == '.')]
  668. # 6.3将小于等于1%和没有突变频率的合并
  669. filterdata4_3 = filterdata4_2.append(filterdata4_1)
  670. germline_filter4 = filterdata4_3.reset_index(drop=True)
  671. germline_filter4.reset_index(drop=True, inplace=True)
  672. if len(germline_filter4) != 0:
  673. ##remove the mutation in STR region
  674. germbed = germline_filter4[['Chr', 'Start', 'End', 'Gene.refGene']]
  675. outputname1 = os.path.join(germdir, sampleid + '_germline.bed')
  676. germbed.to_csv(outputname1, index=False, header=None, encoding='gbk', sep='\t')
  677. outputname2 = os.path.join(germdir, sampleid + '_germline_unSTR.bed')
  678. STRcmd = 'bedtools subtract -a ' + outputname1 + ' -b ' + STRdir + '>' + outputname2
  679. os.system(STRcmd)
  680. germline_unSTR = pd.read_table(outputname2, sep='\t', names=['Chr', 'Start', 'End', 'Gene.refGene'])
  681. if len(germline_unSTR) != 0:
  682. germline_unSTR[['Chr', 'Start', 'End']] = germline_unSTR[['Chr', 'Start', 'End']].astype('str')
  683. germline_filter4[['Chr', 'Start', 'End']] = germline_filter4[['Chr', 'Start', 'End']].astype(
  684. 'str')
  685. germline_filter5 = pd.merge(germline_filter4, germline_unSTR,
  686. on=['Chr', 'Start', 'End', 'Gene.refGene'], how='inner')
  687. germline_filter5_1 = germline_filter5[titlelist]
  688. germline_filter5_1['mut_new'] = germline_filter5_1['Chr'] + '_' + germline_filter5_1[
  689. 'Start'] + '_' + germline_filter5_1['End'] + '_' + germline_filter5_1['Ref'] + '_' + \
  690. germline_filter5_1['Alt']
  691. germline_filter_result0 = pd.merge(germline_filter5_1, germref1, on=['mut_new'], how='left')
  692. germline_filter_result0['mut_ratio']=germline_filter_result0['mut_ratio'].fillna(0)
  693. #删除假阳性位点
  694. print('filter the false germline data')
  695. germline_filter_result =germline_filter_result0[germline_filter_result0['mut_ratio']<=0.15]
  696. del germline_filter_result['mut_ratio']
  697. del germline_filter_result['mut_new']
  698. germline_filter_result.reset_index(drop=True,inplace=True)
  699. print(len(germline_filter_result))
  700. if len(germline_filter_result)!=0:
  701. for i in range(len(germline_filter_result)):
  702. Func_refGene = germline_filter_result.loc[i, 'Func.refGene']
  703. ExonicFunc_refGene = germline_filter_result.loc[i, 'ExonicFunc.refGene']
  704. if (Func_refGene == 'splicing') & (ExonicFunc_refGene == '.'):
  705. germline_filter_result.loc[i, 'ExonicFunc.refGene'] = 'splicing'
  706. germline_filter_result.sort_values(by='Gene.refGene', inplace=True)
  707. else:
  708. print(sampleid + ' has no result when filter the false germline snv')
  709. germline_filter_result = pd.DataFrame(columns=titlelist)
  710. germline_filter_result.loc[0, 'sampleid'] = sampleid
  711. germline_filter_result.loc[0, 'label'] = 'No result after filtering the false germline snv'
  712. else:
  713. print(sampleid + ' has no result when filter the STR')
  714. germline_filter_result = pd.DataFrame(columns=titlelist)
  715. germline_filter_result.loc[0, 'sampleid'] = sampleid
  716. germline_filter_result.loc[0, 'label'] = 'No result after filtering the STR'
  717. os.remove(outputname1)
  718. os.remove(outputname2)
  719. # print(germline_filter_result['AAChange_select'])
  720. else:
  721. print(sampleid + ' has no result when filter the genomeAD_EAS')
  722. germline_filter_result = pd.DataFrame(columns=titlelist)
  723. germline_filter_result.loc[0, 'sampleid'] = sampleid
  724. germline_filter_result.loc[0, 'label'] = 'No result after filtered by genomeAD_EAS'
  725. else:
  726. print(sampleid + ' has no result when filtering the intronic and ncRNA_exonic')
  727. germline_filter_result = pd.DataFrame(columns=titlelist)
  728. germline_filter_result.loc[0, 'sampleid'] = sampleid
  729. germline_filter_result.loc[0, 'label'] = 'No result after filtering the intronic and ncRNA_exonic'
  730. else:
  731. print(sampleid + ' has no result aftere vaf filtering')
  732. germline_filter_result = pd.DataFrame(columns=titlelist)
  733. germline_filter_result.loc[0, 'sampleid'] = sampleid
  734. germline_filter_result.loc[0, 'label'] = 'No result after filtering VAF>=0.3'
  735. return germline_filter_result
  736. ####获得vep注释的突变类型信息,包括突变类型以及其他突变信息
  737. def AAchange_new_infor_vep(svdata):
  738. data_INFO_1 = svdata.loc[:, 'Otherinfo11'].str.split("|", expand=True)
  739. data_INFO_1.fillna("", inplace=True)
  740. result_num = 0
  741. for i in [i1 for i1 in range(1, data_INFO_1.shape[1], 41)]:
  742. data_INFO_2 = pd.DataFrame()
  743. data_INFO_2["AAChange.refGene_vepraw"] =data_INFO_1.iloc[:, i + 0] + ":" + data_INFO_1.iloc[:, i + 2] + ":" + \
  744. data_INFO_1.iloc[:, i + 5] + ":" + \
  745. "exon" + data_INFO_1.iloc[:, i + 7].str.split("/", expand=True)[0] + ":" + \
  746. "intron" + data_INFO_1.iloc[:, i + 8].str.split("/", expand=True)[0] + ":"
  747. data_INFO_3 = data_INFO_1.iloc[:, i + 9].str.split(":", expand=True)
  748. data_INFO_4 = data_INFO_1.iloc[:, i + 10].str.split(":", expand=True)
  749. if data_INFO_3.shape[1] == 1:
  750. data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"] + data_INFO_3[0] + ":"
  751. else:
  752. data_INFO_3.fillna("", inplace=True)
  753. data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"] + data_INFO_3[1] + ":"
  754. if data_INFO_4.shape[1] == 1:
  755. data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"] + data_INFO_4[0]
  756. else:
  757. data_INFO_4.fillna("", inplace=True)
  758. data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"] + data_INFO_4[1]
  759. data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"].str.replace("exon:", "")
  760. data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"].str.replace("intron:", "")
  761. data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"].str.replace("::", ":")
  762. data_INFO_2["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"].str.replace(":$", "")
  763. if result_num == 0:
  764. svdata["AAChange.refGene_vepraw"] = data_INFO_2["AAChange.refGene_vepraw"] + ";"
  765. result_num = 1
  766. else:
  767. svdata["AAChange.refGene_vepraw"] = svdata["AAChange.refGene_vepraw"] + data_INFO_2["AAChange.refGene_vepraw"] + ";"
  768. svdata["AAChange.refGene_vepraw"] = svdata["AAChange.refGene_vepraw"].str.replace(":;", "")
  769. svdata["AAChange.refGene_vepraw"] = svdata["AAChange.refGene_vepraw"].str.replace(";$", "")
  770. # 需要将氨基酸改变缩写改为单字母的简写,终止突变Ter改为*
  771. AA_path = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/AA_dict.txt'
  772. AAdata = pd.read_table(AA_path, sep='\t')
  773. for j in range(len(AAdata)):
  774. AAdata_abbre = AAdata.loc[j, 'Abbre_name']
  775. AAdata_short = AAdata.loc[j, 'Short_name']
  776. svdata['AAChange.refGene_vepraw'] = svdata['AAChange.refGene_vepraw'].str.replace(AAdata_abbre, AAdata_short)
  777. return svdata
  778. #对获得的结果进一步校正
  779. def AAselect_modify(inputpath,normalid,germdata_result):
  780. germdir = os.path.join(inputpath, '4Germline_unpair')
  781. # 读入vep注释
  782. vepdir = os.path.join(germdir, normalid + '.hg19_multianno.txt')
  783. if os.path.exists(vepdir) and os.path.getsize(vepdir) != 0:
  784. vepdata_raw = pd.read_table(vepdir, sep='\t', header=0, low_memory=False)
  785. vepdata1_raw = AAchange_new_infor_vep(vepdata_raw)
  786. vepcolumns = ['Chr', 'Start', 'End', 'Ref', 'Alt', 'AAChange.refGene_vepraw']
  787. vepdata2_raw = vepdata1_raw[vepcolumns]
  788. vepdata2_raw['Chr'] = vepdata2_raw['Chr'].astype(str)
  789. vepdata2_raw['Start'] = vepdata2_raw['Start'].astype(str)
  790. vepdata2_raw['End'] = vepdata2_raw['End'].astype(str)
  791. # 和过滤后结果合并
  792. germlinedata = pd.merge(germdata_result, vepdata2_raw, on=['Chr', 'Start', 'End', 'Ref', 'Alt'], how='left')
  793. #唯一转录本信息校正
  794. for i in range(len(germlinedata)):
  795. print(i)
  796. veprawanno = germlinedata.loc[i, 'AAChange.refGene_vepraw']
  797. gene = germlinedata.loc[i, 'Gene.refGene']
  798. # 首先对唯一转录本进行校正
  799. transid_path = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/Select_RefSeq_HGNC_MANE_pan602_20230522.txt'
  800. transiddata = pd.read_table(transid_path, sep='\t')
  801. veprawanno1 = veprawanno.split(';')
  802. gene_trans_infor = transiddata[transiddata['Symbol'] == gene]
  803. gene_trans_infor.reset_index(drop=True, inplace=True)
  804. #以RefSeq的为主
  805. gene_trans_unique = gene_trans_infor.loc[0, 'RefSeq']
  806. # 获得唯一转录本对应的结果
  807. tranid_vepinfor = [s for s in veprawanno1 if gene_trans_unique in s]
  808. print(tranid_vepinfor)
  809. #如果没有获得唯一转录本的结果,从其他转录本中获得
  810. if len(tranid_vepinfor)==0:
  811. AAchange=germlinedata.loc[i, 'AAChange.refGene_1']
  812. newchange=transid_select_AAchange(AAchange,gene)
  813. gene_trans_unique=newchange.split(':')[1]
  814. tranid_vepinfor = [s for s in veprawanno1 if gene_trans_unique in s]
  815. #当注释的是intronic时候,需要把exonicFunc替换为vepanno里面的注释情况
  816. # 对Func.refGene的替换,如果vep注释信息有intron,那么写做intronic
  817. if len(tranid_vepinfor[0].split(':'))>3:
  818. Func_label = tranid_vepinfor[0].split(':')[3]
  819. if 'intron' in Func_label:
  820. germlinedata.loc[i, 'Func.refGene'] = 'intronic'
  821. Func_vep = tranid_vepinfor[0].split(':')[0]
  822. # 将annovar注释的功能替换为vep注释的功能
  823. germlinedata.loc[i, 'ExonicFunc.refGene'] = Func_vep
  824. else:
  825. germlinedata.loc[i, 'Func.refGene'] = 'intronic'
  826. Func_vep = tranid_vepinfor[0].split(':')[0]
  827. germlinedata.loc[i, 'ExonicFunc.refGene'] = Func_vep
  828. ##将AAChange_select进行替换
  829. vepselect = tranid_vepinfor[0].split(':')
  830. del (vepselect[0])
  831. germlinedata.loc[i, 'AAChange_select'] = ':'.join(vepselect)
  832. #对移码突变进行转换,如由p.P1353Qfs*89改成p.P1353fs
  833. # 基因也进行替换
  834. germlinedata.loc[i, 'Gene.refGene'] = vepselect[0]
  835. return germlinedata
  836. def germline_summary_control(inputpath,laneid,normalid,sampleid):
  837. germtitle_dir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/germline_title_20230522.txt'
  838. #panel genelist
  839. panel_genelistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/602gene_select.txt'
  840. panel_genelist = pd.read_table(panel_genelistdir, sep='\t', header=0, names=['Gene.refGene'])
  841. #druglist
  842. druglistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_genelist_20220829.txt'
  843. drug_genelist = pd.read_table(druglistdir, sep='\t', header=None, names=['Gene.refGene'])
  844. drug_genelist['drug_lable'] = 'Drug_gene'
  845. germtile = pd.read_table(germtitle_dir, sep='\t', header=0)
  846. titlelist_final = list(germtile.columns)
  847. titlelist_final.append('sample_counts')
  848. #对输出总表加上AAchange_1和AAchange_2
  849. #titlelist_summary=list(titlelist_final)
  850. #titlelist_summary.append('AAChange.refGene_1')
  851. #titlelist_summary.append('AAChange.refGene_2')
  852. germdir = os.path.join(inputpath, '4Germline_unpair')
  853. germdatafile=germdir+'/'+normalid+'.germ.xls'
  854. #读入vep注释
  855. vepdata1 = vepanno(inputpath,normalid)
  856. if os.path.exists(germdatafile) and os.path.getsize(germdatafile) !=0:
  857. #sampleid = normalid[:-2]
  858. # make the result dir
  859. result_dir = os.path.join(inputpath, 'resultfile')
  860. if not os.path.exists(result_dir):
  861. os.mkdir(result_dir)
  862. sample_dir = os.path.join(result_dir, sampleid)
  863. if not os.path.exists(sample_dir):
  864. os.mkdir(sample_dir)
  865. tempfile=os.path.join(inputpath,'tempfile')
  866. if not os.path.exists(tempfile):
  867. os.mkdir(tempfile)
  868. germ_tem_dir=os.path.join(tempfile,'germline')
  869. if not os.path.exists(germ_tem_dir):
  870. os.mkdir(germ_tem_dir)
  871. germdata_raw = pd.read_table(germdatafile, sep='\t')
  872. #先按照基因名排序
  873. germdata_raw.sort_values(by='Gene.refGene', inplace=True)
  874. germdata_raw.reset_index(drop=True, inplace=True)
  875. #修改列名
  876. germdata_raw.rename(columns={'CLNACC':'CLNALLELEID','CLNDBN':'CLNDN','CLNDSDB':'CLNDISDB','CLINVARREVSTATS':'CLNREVSTAT'},inplace=True)
  877. germdata_raw['Chr'] = germdata_raw['Chr'].astype('str')
  878. germdata_raw['Start'] = germdata_raw['Start'].astype('str')
  879. germdata_raw['End'] = germdata_raw['End'].astype('str')
  880. ##对基因进行分列
  881. germdata01 = gene_split(germdata_raw)
  882. # 提取在gene列表中的基因
  883. germdata02 = pd.merge(germdata01, panel_genelist, on=['Gene.refGene'], how='inner')
  884. germdata02.insert(0, 'sampleid', sampleid)
  885. #去除化疗基因
  886. germdata03 = pd.merge(germdata02, drug_genelist, on=['Gene.refGene'], how='left')
  887. germdata = germdata03[germdata03['drug_lable'] != 'Drug_gene']
  888. germdata.reset_index(drop=True, inplace=True)
  889. ###对CLINSIG进行分列
  890. if len(germdata[germdata['CLINSIG'].str.contains('\\[')]) != 0:
  891. germdata05 = germdata['CLINSIG'].str.split("[", expand=True)
  892. germdata05[1].fillna('.', inplace=True)
  893. germdata['CLNSIG'] = germdata05[0]
  894. germdata['CLNSIGCONF'] = germdata05[1].str.replace(']', '')
  895. else:
  896. germdata['CLNSIG'] = germdata['CLINSIG']
  897. germdata['CLNSIGCONF'] = '.'
  898. del germdata['CLINSIG']
  899. del germdata['drug_lable']
  900. print('clinsig')
  901. germdata.reset_index(drop=True, inplace=True)
  902. #1.add the AAchangedata
  903. germdata1=AAchangeselect_plus(germdata)
  904. # 2.将c.注释的格式例如c.A8077T改为c.8077A>T
  905. germdata2=cdsmut_trans(germdata1)
  906. # remove the HLA related gene
  907. germdata_result1 = germdata2[~germdata2['Gene.refGene'].str.contains('HLA')]
  908. germdata_result1.reset_index(drop=True, inplace=True)
  909. # select VAF>=0.3
  910. germline_filter1 = germdata_result1[germdata_result1['VAF'] >= 0.3]
  911. # VAF转为%
  912. germline_filter1['VAF'] = germline_filter1['VAF'].apply(lambda x: format(x, '.2%'))
  913. #进行gnomAD_exome_EAS和STR区域的筛选
  914. germline_filter_result0=snvfilter(inputpath,germline_filter1,sampleid)
  915. #对AAchange_select进行annovar和vep注释的合并进一步修改
  916. if len(vepdata1)!=0:
  917. germdata_vepanno0 = pd.merge(germline_filter_result0, vepdata1, on=['Chr', 'Start', 'End', 'Ref', 'Alt'], how='left')
  918. germdata_vepanno1 = germdata_vepanno0[~germdata_vepanno0['muttype'].str.contains('synonymous')]
  919. del germdata_vepanno1['muttype']
  920. germdata_vepanno1.reset_index(drop=True, inplace=True)
  921. #进行unique的筛选
  922. germdata_result=AAchange_select_new(germdata_vepanno1)
  923. #对移码突变的再次确认修改
  924. germdata_result['AAChange_select'] = (germdata_result['AAChange_select'] + "#").str.replace(
  925. 'p\..*fs\*.*?[,#]', rep1)
  926. germdata_result['AAChange_select'] = germdata_result['AAChange_select'].str.replace('#$', "")
  927. else:
  928. germdata_result=germline_filter_result0
  929. #对获得的结果进一步进行信息校正
  930. germdata_result_allinfor = AAselect_modify(inputpath, normalid, germdata_result)
  931. ###对移码突变的转换,如由p.P1353Qfs*89改成p.P1353fs
  932. germdata_result_allinfor['AAChange_select'] = (germdata_result_allinfor['AAChange_select'] + "#").str.replace('p\..*fs\*.*?[,#]', rep1)
  933. germdata_result_allinfor['AAChange_select'] = germdata_result_allinfor['AAChange_select'].str.replace('#$', "")
  934. germdata_result_allinfor.rename(columns={'AAChange.refGene': 'AAChange.refGene_annovar', 'AAChange.refGene_1': 'AAChange.refGene'},inplace=True)
  935. #输出两种注释结果
  936. outputfile0=os.path.join(germ_tem_dir,laneid + '-' + sampleid + '.germline_raw_infor.txt')
  937. germdata_result_allinfor.to_csv(outputfile0,sep='\t',index=False,header=True)
  938. #输出规定的列
  939. germline_filter_result=germdata_result_allinfor[titlelist_final]
  940. #再次去除包含包含“%3D”的突变
  941. germline_filter_result=germline_filter_result[germline_filter_result['AAChange_select'].str.contains("%3D")==False]
  942. germline_filter_result.reset_index(drop=True,inplace=True)
  943. #输出结果
  944. outputfile1 = os.path.join(sample_dir, laneid + '-' + sampleid + '.germline.xlsx')
  945. writer = pd.ExcelWriter(outputfile1)
  946. germline_filter_result.to_excel(writer, sheet_name='germline', index=False)
  947. writer.save()
  948. writer.close()
  949. else:
  950. print(normalid+' has no data or the result is null')
  951. germline_filter_result=pd.DataFrame()
  952. germline_filter_result.loc[0, 'sampleid'] = sampleid
  953. germline_filter_result.loc[0, 'label'] = 'No file'
  954. # make the temp dir
  955. temp_dir = os.path.join(inputpath, 'tempfile')
  956. if not os.path.exists(temp_dir):
  957. os.mkdir(temp_dir)
  958. bugfile_dir = os.path.join(temp_dir, 'bugfile')
  959. if not os.path.exists(bugfile_dir):
  960. os.mkdir(bugfile_dir)
  961. outputfile2 = os.path.join(bugfile_dir, laneid + '-' + sampleid + '.germline.nofile.log.txt')
  962. germline_filter_result.to_csv(outputfile2, index=False, header=True, encoding='gbk', sep='\t')
  963. def germline_runmain(inputpath,normalid):
  964. laneid = inputpath.split('/')[-1].split('-')[-1]
  965. datasummarydir = os.path.join(inputpath, 'datasummary')
  966. isExists = os.path.exists(datasummarydir)
  967. if not isExists:
  968. os.makedirs(datasummarydir)
  969. sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
  970. samplelist = pd.read_table(sampledir, sep='\t', header=0)
  971. sampledata = samplelist[samplelist['normal'] == normalid]
  972. sampledata.reset_index(drop=True, inplace=True)
  973. sampleid = sampledata.loc[0, 'samplename']
  974. germdir = os.path.join(inputpath, '4Germline_unpair')
  975. germdatafile = germdir + '/' + normalid + '.germ.xls'
  976. if os.path.exists(germdatafile) and os.path.getsize(germdatafile) != 0:
  977. germline_summary_control(inputpath, laneid, normalid, sampleid)
  978. else:
  979. print(normalid + ' is null')
  980. if __name__=='__main__':
  981. parser = argparse.ArgumentParser(description='filter the germline')
  982. parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
  983. parser.add_argument('-s', '--normalid', type=str, help='the normal name of sample')
  984. args = parser.parse_args()
  985. Inputpath = args.inputpath
  986. Normalid = args.normalid
  987. germline_runmain(Inputpath,Normalid)
  988. ###for all sample in lane
  989. def germlinesummary(inputpath,laneid):
  990. datasummarydir = os.path.join(inputpath, 'datasummary')
  991. isExists = os.path.exists(datasummarydir)
  992. if not isExists:
  993. os.makedirs(datasummarydir)
  994. sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
  995. samplelist = pd.read_table(sampledir, sep='\t', header=0)
  996. germdatasummary = pd.DataFrame()
  997. for i in range(len(samplelist)):
  998. sampleid = samplelist.loc[i, 'samplename']
  999. normalid = samplelist.loc[i, 'normal']
  1000. print(normalid)
  1001. germdir = os.path.join(inputpath, '4Germline_unpair')
  1002. germdatafile = germdir + '/' + normalid + '.germ.xls'
  1003. if os.path.exists(germdatafile) and os.path.getsize(germdatafile) != 0:
  1004. germre=germline_summary_control(inputpath, laneid, normalid,sampleid)
  1005. germdatasummary =germdatasummary.append(germre)
  1006. else:
  1007. print(normalid+' is null')
  1008. continue
  1009. outputname = datasummarydir + '/' + laneid + '_table6_germlinne_datasummary.txt'
  1010. germdatasummary.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')