datafile_tmb_v0_20221107_for_IT.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
  1. import pandas as pd
  2. import os
  3. import xlrd
  4. import numpy as np
  5. import math
  6. import re
  7. import argparse
  8. #计算四分位数
  9. # 其中data为数据组,n为第几个四分位数
  10. def quantile_exc(TMBdata, n):
  11. if n<1 or n>3:
  12. return false
  13. TMBdata.sort()
  14. TMBdata=TMBdata[::-1]
  15. position = (len(TMBdata) + 1)*n/4
  16. pos_integer = int(math.modf(position)[1])
  17. pos_decimal = position - pos_integer
  18. quartile = TMBdata[pos_integer - 1] + (TMBdata[pos_integer] - TMBdata[pos_integer - 1])*pos_decimal
  19. return quartile
  20. #####修改2,不过滤indel,首先过滤的是vaf大于等于5%,如果为0,捞回来1个
  21. #修改3,当结果为空或者为 'No_somatic'时,将tmb设置为0
  22. def TMB_filter(sampleid,tumortype,svdata,Size_panel=1.39):
  23. TCGA_TMB_file = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/TCGA_wes_panel_TMB_overlap_20221107.txt'
  24. #Occur_cosmic = 100
  25. if (len(svdata)!=0) & (svdata.loc[0,'Transid_uniq'] != 'No_somatic') & (pd.notna(svdata.loc[0,'Transid_uniq'] )):
  26. print('step7---select the vaf>=5%')
  27. # 提取VAF>=5%
  28. filterdata8_4 = svdata[svdata['VAF_tumor'] >= 0.05]
  29. # 如果结果为空,就默认设置为1
  30. if len(filterdata8_4) == 0:
  31. TMB_counts = 1
  32. else:
  33. TMB_counts = len(filterdata8_4)
  34. TMB = round(TMB_counts / Size_panel,2)
  35. # compare with TCGA
  36. elif (svdata.loc[0,'Transid_uniq'] == 'No_somatic') | (pd.isna(svdata.loc[0,'Transid_uniq'])) :
  37. TMB_counts = 0
  38. TMB = 0
  39. else:
  40. TMB_counts = 1
  41. #修改tmb保留小数点后面两位,四舍五入
  42. TMB = round(TMB_counts / Size_panel,2)
  43. TCGA_TMBdata = pd.read_table(TCGA_TMB_file, sep='\t')
  44. if tumortype!='unknown':
  45. tumordata = TCGA_TMBdata[TCGA_TMBdata['Tumor_type'] == tumortype]
  46. #如果对应的肿瘤没有数据,那么以整个数据集为背景
  47. if len(tumordata)==0:
  48. tumordata = TCGA_TMBdata
  49. tumordata.sort_values(by=["TMB_panel"], ascending=False, inplace=True)
  50. tumordata.reset_index(drop=True, inplace=True)
  51. elif tumortype=='unknown':
  52. tumordata=TCGA_TMBdata
  53. TMB_1quar = quantile_exc(tumordata['TMB_panel'].values, 1)
  54. TMB_2quar = quantile_exc(tumordata['TMB_panel'].values, 2)
  55. TMB_3quar = quantile_exc(tumordata['TMB_panel'].values, 3)
  56. if TMB >= TMB_1quar:
  57. TMB_label = 'TMB-H'
  58. else:
  59. TMB_label = 'TMB-L'
  60. TMB_result = pd.DataFrame(
  61. columns=['sampleid', 'mut_count', 'TMB', 'TMB_label', '25%TCGA', '50%TCGA', '75%TCGA', 'tumor'])
  62. TMB_result.loc[0, 'sampleid'] = sampleid
  63. TMB_result.loc[0, 'mut_count'] = TMB_counts
  64. TMB_result.loc[0, 'TMB'] = round(TMB,2)
  65. TMB_result.loc[0, 'TMB_label'] = TMB_label
  66. TMB_result.loc[0, '25%TCGA'] = round(TMB_1quar,2)
  67. TMB_result.loc[0, '50%TCGA'] = round(TMB_2quar,2)
  68. TMB_result.loc[0, '75%TCGA'] = round(TMB_3quar,2)
  69. TMB_result.loc[0, 'tumor'] = tumortype
  70. return TMB_result
  71. ###TMB
  72. def tmbsample_run_test(inputpath,laneid,sampleid):
  73. '''
  74. caculate the tmb based on the snv data
  75. :param laneid: the lib id
  76. :param sampleid: the sample id
  77. :param tumortype: the tumor type
  78. :return:
  79. '''
  80. #tumortype = 'unknown'
  81. resultfile=os.path.join(inputpath,'resultfile')
  82. sampledir=os.path.join(resultfile,sampleid)
  83. tumortypedata = os.path.join(sampledir,laneid + '-' + sampleid + '.tumortype.xlsx')
  84. tumortype_result = pd.read_excel(tumortypedata)
  85. tumortype = tumortype_result.loc[0, 'tumortype']
  86. snvdata = os.path.join(sampledir,laneid + '-' + sampleid + '.snv.xlsx')
  87. somatic_result = pd.read_excel(snvdata,sheet_name='Somatic_Mut')
  88. #将vaf_tumor为空的填0
  89. somatic_result['VAF_tumor'] =somatic_result['VAF_tumor'].fillna(0).astype('str')+'%'
  90. somatic_result['VAF_tumor'] = somatic_result['VAF_tumor'].str.strip('%').astype('float') / 100
  91. tmb=TMB_filter(sampleid,tumortype,somatic_result,Size_panel=1.39)
  92. # output the TMB
  93. TMB_result = tmb[['sampleid', 'TMB', 'TMB_label']]
  94. print(TMB_result)
  95. #return TMB_result
  96. outputfile =os.path.join(sampledir,laneid + '-' + sampleid + '.tmb1.xlsx')
  97. writer = pd.ExcelWriter(outputfile)
  98. TMB_result.to_excel(writer, sheet_name='tmb', index=False)
  99. writer.save()
  100. writer.close()
  101. ##修,当人工改了vaf常规变为百分比的时候
  102. ###TMB
  103. def tmbsample_run(inputpath,laneid,sampleid):
  104. '''
  105. caculate the tmb based on the snv data
  106. :param laneid: the lib id
  107. :param sampleid: the sample id
  108. :param tumortype: the tumor type
  109. :return:
  110. '''
  111. #tumortype = 'unknown'
  112. resultfile=os.path.join(inputpath,'resultfile')
  113. sampledir=os.path.join(resultfile,sampleid)
  114. tumortypedata = os.path.join(sampledir,laneid + '-' + sampleid + '.tumortype.xlsx')
  115. tumortype_result = pd.read_excel(tumortypedata)
  116. tumortype = tumortype_result.loc[0, 'tumortype']
  117. snvdata = os.path.join(sampledir,laneid + '-' + sampleid + '.snv.xlsx')
  118. somatic_result = pd.read_excel(snvdata,sheet_name='Somatic_Mut')
  119. for i in range(len(somatic_result)):
  120. print(i)
  121. vaf = str(somatic_result.loc[i, 'VAF_tumor'])
  122. print(vaf)
  123. if pd.isna(somatic_result.loc[i, 'VAF_tumor']):
  124. somatic_result.loc[i, 'VAF_tumor'] = 0
  125. elif (pd.notna(somatic_result.loc[i, 'VAF_tumor'])) & ('%' in vaf):
  126. somatic_result.loc[i, 'VAF_tumor'] = float(somatic_result.loc[i, 'VAF_tumor'].strip('%')) / 100
  127. tmb=TMB_filter(sampleid,tumortype,somatic_result,Size_panel=1.39)
  128. # output the TMB
  129. TMB_result = tmb[['sampleid', 'TMB', 'TMB_label']]
  130. print(TMB_result)
  131. #return TMB_result
  132. outputfile =os.path.join(sampledir,laneid + '-' + sampleid + '.tmb1.xlsx')
  133. writer = pd.ExcelWriter(outputfile)
  134. TMB_result.to_excel(writer, sheet_name='tmb', index=False)
  135. writer.save()
  136. writer.close()
  137. #####读入每个样本
  138. def main(inputpath,laneid):
  139. samplelist = os.listdir(inputpath)
  140. for i in range(len(samplelist)):
  141. sampleid = samplelist[i]
  142. tmbsample_run(inputpath, laneid, sampleid)
  143. if __name__=='__main__':
  144. parser = argparse.ArgumentParser(description='TMB')
  145. parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
  146. parser.add_argument('-l', '--laneid', type=str, help='laneid')
  147. parser.add_argument('-s', '--sampleid', type=str, help='sampleid')
  148. args = parser.parse_args()
  149. Inputpath = args.inputpath
  150. Laneid=args.laneid
  151. Sampleid = args.sampleid
  152. tmbsample_run(Inputpath,Laneid,Sampleid)