datafile_MSI_v0_20220929_finish.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249
  1. #对msisensor的结果进行分析
  2. #MSI建议采用美国国家癌症研究院( NCI) 推荐的5 个微卫星( MS) 检测位点( BAT25、BAT26、D5S346、D2S123 和D17S250) 。
  3. # #判断标准为3 级: 所有5 个位点均稳定为微卫星稳定( MSS) ,1个位点不稳定为微卫星低度不稳定( MSI-L) ,
  4. # #2个及2 个以上位点不稳定为微卫星高度不稳定( MSI-H) 。
  5. #本次分析3个及其以上为MSI-H,2个和1个为MSI-L,0个为MSS
  6. #菁良panel的判断标准是
  7. #位点如下NR-21 (A)21;NR-24 (A)24;NR-27 (A)27;BAT-25 (A)25;BAT-26 (A)26和MONO-27 (A)2
  8. #MSS:所有位点均稳定为MSS均检测不出来MSI
  9. #MSI-H:六个位点中有两个及其以上的被检出
  10. #MSI-L:六个位点中只有1个被检测出
  11. #1.对.prefix分析,结果是统计了整个样本中MSI的数目
  12. #2..prefix_dis是具体的MSI信息
  13. #3.prefix_somatic对somatic的MSI进行分析
  14. #####################################################对paired文件进行分析
  15. #####对配对的样本进行MSI统计
  16. #针对prefix_somatic文件每个MS位点,根据该位点的discrimination_value_ML得分(第七列)来判断是否稳定,
  17. #大于0.5认为是不稳定。然后统计每个样本种有多少个不稳定MS位点,>=3个就认为是MSI-H。(3=15*0.2)
  18. #没有结果的标签设置为no result
  19. #样本有问题的不输出,再最后结果check标识需要进一步核对
  20. import pandas as pd
  21. import os,os.path
  22. import numpy as np
  23. import xlrd
  24. import argparse
  25. def MSI_pair_15R_run_v1(inputpath,sampleid):
  26. laneid = inputpath.split('/')[-1].split('-')[-1]
  27. R15data=pd.read_csv('/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/MSI_15.slop50.bed',sep='\t',usecols=[0,1,2,3],names=['chr','start','end','MSI_name'])
  28. MSIdir = os.path.join(inputpath, '3MSI_msisensor2_pair')
  29. # make the result dir
  30. result_dir = os.path.join(inputpath, 'resultfile')
  31. if not os.path.exists(result_dir):
  32. os.mkdir(result_dir)
  33. sample_dir = os.path.join(result_dir, sampleid)
  34. if not os.path.exists(sample_dir):
  35. os.mkdir(sample_dir)
  36. MSIdata=MSIdir + '/' + sampleid + ".15STR"
  37. if os.path.exists(MSIdata):
  38. # read the MSI total count
  39. STRdir=os.path.join(MSIdir,sampleid + ".15STR")
  40. MSI_total = pd.read_csv(STRdir, sep='\t')
  41. MSI_total.insert(0, 'sampleid', sampleid)
  42. # for the somatic data
  43. somaticdir = os.path.join(MSIdir, sampleid + '.15STR_somatic')
  44. somaticdata_input = pd.read_csv(somaticdir, sep='\t',names=['chr', 'pos', 'left', 'repeat_count', 'repeat_base', 'right','discrimination_value_ML', 'score2', 'score3', 'score4'])
  45. somaticdata_input.insert(0, 'sampleid', sampleid)
  46. #discrimination_value_ML filter
  47. somaticdata = somaticdata_input[somaticdata_input['discrimination_value_ML'] > 0.5]
  48. somaticdata.reset_index(drop=True, inplace=True)
  49. MSIcount_NCI = 0
  50. MSIcount_Pantaplex = 0
  51. if len(somaticdata) > 0:
  52. for i in range(len(somaticdata)):
  53. for j in range(len(R15data)):
  54. if (somaticdata.loc[i, 'chr'] == R15data.loc[j, 'chr']) & (
  55. somaticdata.loc[i, 'pos'] >= R15data.loc[j, 'start']) & (
  56. somaticdata.loc[i, 'pos'] <= R15data.loc[j, 'end']):
  57. somaticdata.loc[i, 'MSI_name'] = R15data.loc[j, 'MSI_name']
  58. if '(NCI)' in R15data.loc[j, 'MSI_name']:
  59. MSIcount_NCI = MSIcount_NCI + 1
  60. elif '(Pentaplex)' in R15data.loc[j, 'MSI_name']:
  61. MSIcount_Pantaplex = MSIcount_Pantaplex + 1
  62. elif '(NCI,Pentaplex)' in R15data.loc[j, 'MSI_name']:
  63. MSIcount_NCI = MSIcount_NCI + 1
  64. MSIcount_Pantaplex = MSIcount_Pantaplex + 1
  65. else:
  66. somaticdata.loc[i, 'MSI_name'] = np.nan
  67. MSI_total.loc[0, 'MSIcount_15R'] = (~somaticdata['MSI_name'].isna()).sum()
  68. MSI_total.loc[0, 'MSIcount_NCI'] = MSIcount_NCI
  69. MSI_total.loc[0, 'MSIcount_Pantaplex'] = MSIcount_Pantaplex
  70. else:
  71. MSI_total.loc[0, 'MSIcount_15R'] = 0
  72. MSI_total.loc[0, 'MSIcount_NCI'] = 0
  73. MSI_total.loc[0, 'MSIcount_Pantaplex'] = 0
  74. # MSI type
  75. if (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] >= 3):
  76. MSI_total.loc[0, 'MSI_label'] = 'MSI-H'
  77. elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 2):
  78. MSI_total.loc[0, 'MSI_label'] = 'MSI-L'
  79. elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 1):
  80. MSI_total.loc[0, 'MSI_label'] = 'MSI-L'
  81. elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 0):
  82. MSI_total.loc[0, 'MSI_label'] = 'MSS'
  83. else:
  84. MSI_total.loc[0, 'MSI_label'] = 'No MSI result'
  85. else:
  86. print(sampleid + ' has no MSI data,please check the data!')
  87. MSI_total=pd.DataFrame()
  88. MSI_total.loc[0, 'sampleid'] = sampleid
  89. MSI_total.loc[0, 'Total_Number_of_Sites'] = 0
  90. MSI_total.loc[0, 'Number_of_Somatic_Sites'] = 0
  91. MSI_total.loc[0, '%'] = 0
  92. MSI_total.loc[0, 'MSIcount_15R'] = 0
  93. MSI_total.loc[0, 'MSIcount_NCI'] = 0
  94. MSI_total.loc[0, 'MSIcount_Pantaplex'] = 0
  95. MSI_total.loc[0, 'MSI_label'] = 'No file'
  96. outputfile1 = os.path.join(sample_dir, laneid + '-' + sampleid + '.msi.xlsx')
  97. writer = pd.ExcelWriter(outputfile1)
  98. MSI_total.to_excel(writer, sheet_name='msi', index=False)
  99. writer.save()
  100. writer.close()
  101. ###修改输出结果的Number_of_Somatic_Sites,这个里面的结果为在我们15个MSI范围内的,因为原始分析的MSI文件是在15个MSI的位点左右扩展了
  102. #因此somatic的结果为位点的个数,实际优的不在MSI位点范围内。
  103. ##NCI位点:BAT-25,BAT-26,D2S123,D5S346,D17S250
  104. ###pantaplex位点:NR-21,NR-27,NR-24,BAT-25,BAT-26
  105. #BAT-25和BAT-26同时属于pantaplex和NCI
  106. def MSI_pair_15R_run(inputpath,sampleid):
  107. laneid = inputpath.split('/')[-1].split('-')[-1]
  108. R15data=pd.read_csv('/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/MSI_15.slop50.bed',sep='\t',usecols=[0,1,2,3],names=['chr','start','end','MSI_name'])
  109. MSIdir = os.path.join(inputpath, '3MSI_msisensor2_pair')
  110. # make the result dir
  111. result_dir = os.path.join(inputpath, 'resultfile')
  112. if not os.path.exists(result_dir):
  113. os.mkdir(result_dir)
  114. sample_dir = os.path.join(result_dir, sampleid)
  115. if not os.path.exists(sample_dir):
  116. os.mkdir(sample_dir)
  117. MSIdata=MSIdir + '/' + sampleid + ".15STR"
  118. if os.path.exists(MSIdata):
  119. # read the MSI total count
  120. STRdir=os.path.join(MSIdir,sampleid + ".15STR")
  121. MSI_total0 = pd.read_csv(STRdir, sep='\t')
  122. MSI_total0.insert(0, 'sampleid', sampleid)
  123. #只保留总的位点数据
  124. MSI_total=MSI_total0[['sampleid','Total_Number_of_Sites']]
  125. # for the somatic data
  126. somaticdir = os.path.join(MSIdir, sampleid + '.15STR_somatic')
  127. somaticdata_input = pd.read_csv(somaticdir, sep='\t',names=['chr', 'pos', 'left', 'repeat_count', 'repeat_base', 'right','discrimination_value_ML', 'score2', 'score3', 'score4'])
  128. somaticdata_input.insert(0, 'sampleid', sampleid)
  129. #discrimination_value_ML filter
  130. somaticdata = somaticdata_input[somaticdata_input['discrimination_value_ML'] > 0.5]
  131. somaticdata.reset_index(drop=True, inplace=True)
  132. MSIcount_total=0
  133. MSIcount_NCI = 0
  134. MSIcount_Pantaplex = 0
  135. if len(somaticdata) > 0:
  136. for i in range(len(somaticdata)):
  137. for j in range(len(R15data)):
  138. if (somaticdata.loc[i, 'chr'] == R15data.loc[j, 'chr']) & (
  139. somaticdata.loc[i, 'pos'] >= R15data.loc[j, 'start']) & (
  140. somaticdata.loc[i, 'pos'] <= R15data.loc[j, 'end']):
  141. somaticdata.loc[i, 'MSI_name'] = R15data.loc[j, 'MSI_name']
  142. if '(NCI)' in R15data.loc[j, 'MSI_name']:
  143. MSIcount_NCI = MSIcount_NCI + 1
  144. MSIcount_total=MSIcount_total+1
  145. elif '(Pentaplex)' in R15data.loc[j, 'MSI_name']:
  146. MSIcount_Pantaplex = MSIcount_Pantaplex + 1
  147. MSIcount_total=MSIcount_total+1
  148. elif '(NCI,Pentaplex)' in R15data.loc[j, 'MSI_name']:
  149. MSIcount_NCI = MSIcount_NCI + 1
  150. MSIcount_Pantaplex = MSIcount_Pantaplex + 1
  151. MSIcount_total=MSIcount_total+1
  152. else:
  153. MSIcount_total=MSIcount_total+1
  154. print(somaticdata)
  155. MSI_total.loc[0, 'Number_of_Somatic_Sites'] =MSIcount_total
  156. MSI_total.loc[0, '%'] =round(100*MSIcount_total/MSI_total.loc[0, 'Total_Number_of_Sites'],2)
  157. MSI_total.loc[0, 'MSIcount_NCI'] = MSIcount_NCI
  158. MSI_total.loc[0, 'MSIcount_Pantaplex'] = MSIcount_Pantaplex
  159. else:
  160. MSI_total.loc[0, 'Number_of_Somatic_Sites'] = 0
  161. MSI_total.loc[0, '%'] =0
  162. MSI_total.loc[0, 'MSIcount_NCI'] = 0
  163. MSI_total.loc[0, 'MSIcount_Pantaplex'] = 0
  164. # MSI type
  165. if (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] >= 3):
  166. MSI_total.loc[0, 'MSI_label'] = 'MSI-H'
  167. elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 2):
  168. MSI_total.loc[0, 'MSI_label'] = 'MSI-L'
  169. elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 1):
  170. MSI_total.loc[0, 'MSI_label'] = 'MSI-L'
  171. elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 0):
  172. MSI_total.loc[0, 'MSI_label'] = 'MSS'
  173. else:
  174. MSI_total.loc[0, 'MSI_label'] = 'No MSI result'
  175. #return MSI_total
  176. else:
  177. print(sampleid + ' has no MSI data,please check the data!')
  178. MSI_total=pd.DataFrame()
  179. MSI_total.loc[0, 'sampleid'] = sampleid
  180. MSI_total.loc[0, 'Total_Number_of_Sites'] = 0
  181. MSI_total.loc[0, 'Number_of_Somatic_Sites'] = 0
  182. MSI_total.loc[0, '%'] = 0
  183. MSI_total.loc[0, 'MSIcount_NCI'] = 0
  184. MSI_total.loc[0, 'MSIcount_Pantaplex'] = 0
  185. MSI_total.loc[0, 'MSI_label'] = 'No file'
  186. print(MSI_total)
  187. outputfile1 = os.path.join(sample_dir, laneid + '-' + sampleid + '.msi.xlsx')
  188. writer = pd.ExcelWriter(outputfile1)
  189. MSI_total.to_excel(writer, sheet_name='msi', index=False)
  190. writer.save()
  191. writer.close()
  192. #输出MIS中间文件
  193. # make the result dir
  194. temple_dir = os.path.join(inputpath, 'tempfile')
  195. if not os.path.exists(temple_dir):
  196. os.mkdir(temple_dir)
  197. templeMSI_dir = os.path.join(temple_dir, 'MSI')
  198. if not os.path.exists(templeMSI_dir):
  199. os.mkdir(templeMSI_dir)
  200. outputfile2=os.path.join(templeMSI_dir,laneid+'_'+sampleid+'_MSI_filter.txt')
  201. somaticdata.to_csv(outputfile2,sep='\t',index=False,header=True)
  202. outputfile3=os.path.join(templeMSI_dir,laneid+'_'+sampleid+'_MSI_filter_stastic.txt')
  203. MSI_total.to_csv(outputfile3,sep='\t',index=False,header=True)
  204. if __name__=='__main__':
  205. parser = argparse.ArgumentParser(description='filter the varscan somatic data')
  206. parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
  207. parser.add_argument('-s', '--sample', type=str, help='the sample name')
  208. args = parser.parse_args()
  209. Inputpath = args.inputpath
  210. Sampleid = args.sample
  211. MSI_pair_15R_run(Inputpath,Sampleid)
  212. ###for all
  213. def MSIsummary(inputpath,laneid):
  214. datasummarydir = os.path.join(inputpath, 'datasummary')
  215. isExists = os.path.exists(datasummarydir)
  216. if not isExists:
  217. os.makedirs(datasummarydir)
  218. sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
  219. samplelist = pd.read_table(sampledir, sep='\t', header=0)
  220. MSIdatasummary = pd.DataFrame()
  221. for i in range(len(samplelist)):
  222. sampleid = samplelist.loc[i, 'samplename']
  223. print(sampleid)
  224. MSI = MSI_pair_15R_(inputpath, laneid, sampleid)
  225. MSIdatasummary = MSIdatasummary.append(MSI)
  226. outputname = datasummarydir + '/' + laneid + '_table5_MSI_datasummary.txt'
  227. MSIdatasummary.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')