123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249 |
- #对msisensor的结果进行分析
- #MSI建议采用美国国家癌症研究院( NCI) 推荐的5 个微卫星( MS) 检测位点( BAT25、BAT26、D5S346、D2S123 和D17S250) 。
- # #判断标准为3 级: 所有5 个位点均稳定为微卫星稳定( MSS) ,1个位点不稳定为微卫星低度不稳定( MSI-L) ,
- # #2个及2 个以上位点不稳定为微卫星高度不稳定( MSI-H) 。
- #本次分析3个及其以上为MSI-H,2个和1个为MSI-L,0个为MSS
- #菁良panel的判断标准是
- #位点如下NR-21 (A)21;NR-24 (A)24;NR-27 (A)27;BAT-25 (A)25;BAT-26 (A)26和MONO-27 (A)2
- #MSS:所有位点均稳定为MSS均检测不出来MSI
- #MSI-H:六个位点中有两个及其以上的被检出
- #MSI-L:六个位点中只有1个被检测出
- #1.对.prefix分析,结果是统计了整个样本中MSI的数目
- #2..prefix_dis是具体的MSI信息
- #3.prefix_somatic对somatic的MSI进行分析
- #####################################################对paired文件进行分析
- #####对配对的样本进行MSI统计
- #针对prefix_somatic文件每个MS位点,根据该位点的discrimination_value_ML得分(第七列)来判断是否稳定,
- #大于0.5认为是不稳定。然后统计每个样本种有多少个不稳定MS位点,>=3个就认为是MSI-H。(3=15*0.2)
- #没有结果的标签设置为no result
- #样本有问题的不输出,再最后结果check标识需要进一步核对
- import pandas as pd
- import os,os.path
- import numpy as np
- import xlrd
- import argparse
- def MSI_pair_15R_run_v1(inputpath,sampleid):
- laneid = inputpath.split('/')[-1].split('-')[-1]
- R15data=pd.read_csv('/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/MSI_15.slop50.bed',sep='\t',usecols=[0,1,2,3],names=['chr','start','end','MSI_name'])
- MSIdir = os.path.join(inputpath, '3MSI_msisensor2_pair')
- # make the result dir
- result_dir = os.path.join(inputpath, 'resultfile')
- if not os.path.exists(result_dir):
- os.mkdir(result_dir)
- sample_dir = os.path.join(result_dir, sampleid)
- if not os.path.exists(sample_dir):
- os.mkdir(sample_dir)
- MSIdata=MSIdir + '/' + sampleid + ".15STR"
- if os.path.exists(MSIdata):
- # read the MSI total count
- STRdir=os.path.join(MSIdir,sampleid + ".15STR")
- MSI_total = pd.read_csv(STRdir, sep='\t')
- MSI_total.insert(0, 'sampleid', sampleid)
- # for the somatic data
- somaticdir = os.path.join(MSIdir, sampleid + '.15STR_somatic')
- somaticdata_input = pd.read_csv(somaticdir, sep='\t',names=['chr', 'pos', 'left', 'repeat_count', 'repeat_base', 'right','discrimination_value_ML', 'score2', 'score3', 'score4'])
- somaticdata_input.insert(0, 'sampleid', sampleid)
- #discrimination_value_ML filter
- somaticdata = somaticdata_input[somaticdata_input['discrimination_value_ML'] > 0.5]
- somaticdata.reset_index(drop=True, inplace=True)
- MSIcount_NCI = 0
- MSIcount_Pantaplex = 0
- if len(somaticdata) > 0:
- for i in range(len(somaticdata)):
- for j in range(len(R15data)):
- if (somaticdata.loc[i, 'chr'] == R15data.loc[j, 'chr']) & (
- somaticdata.loc[i, 'pos'] >= R15data.loc[j, 'start']) & (
- somaticdata.loc[i, 'pos'] <= R15data.loc[j, 'end']):
- somaticdata.loc[i, 'MSI_name'] = R15data.loc[j, 'MSI_name']
- if '(NCI)' in R15data.loc[j, 'MSI_name']:
- MSIcount_NCI = MSIcount_NCI + 1
- elif '(Pentaplex)' in R15data.loc[j, 'MSI_name']:
- MSIcount_Pantaplex = MSIcount_Pantaplex + 1
- elif '(NCI,Pentaplex)' in R15data.loc[j, 'MSI_name']:
- MSIcount_NCI = MSIcount_NCI + 1
- MSIcount_Pantaplex = MSIcount_Pantaplex + 1
- else:
- somaticdata.loc[i, 'MSI_name'] = np.nan
- MSI_total.loc[0, 'MSIcount_15R'] = (~somaticdata['MSI_name'].isna()).sum()
- MSI_total.loc[0, 'MSIcount_NCI'] = MSIcount_NCI
- MSI_total.loc[0, 'MSIcount_Pantaplex'] = MSIcount_Pantaplex
- else:
- MSI_total.loc[0, 'MSIcount_15R'] = 0
- MSI_total.loc[0, 'MSIcount_NCI'] = 0
- MSI_total.loc[0, 'MSIcount_Pantaplex'] = 0
- # MSI type
- if (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] >= 3):
- MSI_total.loc[0, 'MSI_label'] = 'MSI-H'
- elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 2):
- MSI_total.loc[0, 'MSI_label'] = 'MSI-L'
- elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 1):
- MSI_total.loc[0, 'MSI_label'] = 'MSI-L'
- elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 0):
- MSI_total.loc[0, 'MSI_label'] = 'MSS'
- else:
- MSI_total.loc[0, 'MSI_label'] = 'No MSI result'
- else:
- print(sampleid + ' has no MSI data,please check the data!')
- MSI_total=pd.DataFrame()
- MSI_total.loc[0, 'sampleid'] = sampleid
- MSI_total.loc[0, 'Total_Number_of_Sites'] = 0
- MSI_total.loc[0, 'Number_of_Somatic_Sites'] = 0
- MSI_total.loc[0, '%'] = 0
- MSI_total.loc[0, 'MSIcount_15R'] = 0
- MSI_total.loc[0, 'MSIcount_NCI'] = 0
- MSI_total.loc[0, 'MSIcount_Pantaplex'] = 0
- MSI_total.loc[0, 'MSI_label'] = 'No file'
- outputfile1 = os.path.join(sample_dir, laneid + '-' + sampleid + '.msi.xlsx')
- writer = pd.ExcelWriter(outputfile1)
- MSI_total.to_excel(writer, sheet_name='msi', index=False)
- writer.save()
- writer.close()
- ###修改输出结果的Number_of_Somatic_Sites,这个里面的结果为在我们15个MSI范围内的,因为原始分析的MSI文件是在15个MSI的位点左右扩展了
- #因此somatic的结果为位点的个数,实际优的不在MSI位点范围内。
- ##NCI位点:BAT-25,BAT-26,D2S123,D5S346,D17S250
- ###pantaplex位点:NR-21,NR-27,NR-24,BAT-25,BAT-26
- #BAT-25和BAT-26同时属于pantaplex和NCI
- def MSI_pair_15R_run(inputpath,sampleid):
- laneid = inputpath.split('/')[-1].split('-')[-1]
- R15data=pd.read_csv('/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/MSI_15.slop50.bed',sep='\t',usecols=[0,1,2,3],names=['chr','start','end','MSI_name'])
- MSIdir = os.path.join(inputpath, '3MSI_msisensor2_pair')
- # make the result dir
- result_dir = os.path.join(inputpath, 'resultfile')
- if not os.path.exists(result_dir):
- os.mkdir(result_dir)
- sample_dir = os.path.join(result_dir, sampleid)
- if not os.path.exists(sample_dir):
- os.mkdir(sample_dir)
- MSIdata=MSIdir + '/' + sampleid + ".15STR"
- if os.path.exists(MSIdata):
- # read the MSI total count
- STRdir=os.path.join(MSIdir,sampleid + ".15STR")
- MSI_total0 = pd.read_csv(STRdir, sep='\t')
- MSI_total0.insert(0, 'sampleid', sampleid)
- #只保留总的位点数据
- MSI_total=MSI_total0[['sampleid','Total_Number_of_Sites']]
- # for the somatic data
- somaticdir = os.path.join(MSIdir, sampleid + '.15STR_somatic')
- somaticdata_input = pd.read_csv(somaticdir, sep='\t',names=['chr', 'pos', 'left', 'repeat_count', 'repeat_base', 'right','discrimination_value_ML', 'score2', 'score3', 'score4'])
- somaticdata_input.insert(0, 'sampleid', sampleid)
- #discrimination_value_ML filter
- somaticdata = somaticdata_input[somaticdata_input['discrimination_value_ML'] > 0.5]
- somaticdata.reset_index(drop=True, inplace=True)
- MSIcount_total=0
- MSIcount_NCI = 0
- MSIcount_Pantaplex = 0
- if len(somaticdata) > 0:
- for i in range(len(somaticdata)):
- for j in range(len(R15data)):
- if (somaticdata.loc[i, 'chr'] == R15data.loc[j, 'chr']) & (
- somaticdata.loc[i, 'pos'] >= R15data.loc[j, 'start']) & (
- somaticdata.loc[i, 'pos'] <= R15data.loc[j, 'end']):
- somaticdata.loc[i, 'MSI_name'] = R15data.loc[j, 'MSI_name']
- if '(NCI)' in R15data.loc[j, 'MSI_name']:
- MSIcount_NCI = MSIcount_NCI + 1
- MSIcount_total=MSIcount_total+1
- elif '(Pentaplex)' in R15data.loc[j, 'MSI_name']:
- MSIcount_Pantaplex = MSIcount_Pantaplex + 1
- MSIcount_total=MSIcount_total+1
- elif '(NCI,Pentaplex)' in R15data.loc[j, 'MSI_name']:
- MSIcount_NCI = MSIcount_NCI + 1
- MSIcount_Pantaplex = MSIcount_Pantaplex + 1
- MSIcount_total=MSIcount_total+1
- else:
- MSIcount_total=MSIcount_total+1
- print(somaticdata)
- MSI_total.loc[0, 'Number_of_Somatic_Sites'] =MSIcount_total
- MSI_total.loc[0, '%'] =round(100*MSIcount_total/MSI_total.loc[0, 'Total_Number_of_Sites'],2)
- MSI_total.loc[0, 'MSIcount_NCI'] = MSIcount_NCI
- MSI_total.loc[0, 'MSIcount_Pantaplex'] = MSIcount_Pantaplex
- else:
- MSI_total.loc[0, 'Number_of_Somatic_Sites'] = 0
- MSI_total.loc[0, '%'] =0
- MSI_total.loc[0, 'MSIcount_NCI'] = 0
- MSI_total.loc[0, 'MSIcount_Pantaplex'] = 0
- # MSI type
- if (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] >= 3):
- MSI_total.loc[0, 'MSI_label'] = 'MSI-H'
- elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 2):
- MSI_total.loc[0, 'MSI_label'] = 'MSI-L'
- elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 1):
- MSI_total.loc[0, 'MSI_label'] = 'MSI-L'
- elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 0):
- MSI_total.loc[0, 'MSI_label'] = 'MSS'
- else:
- MSI_total.loc[0, 'MSI_label'] = 'No MSI result'
- #return MSI_total
- else:
- print(sampleid + ' has no MSI data,please check the data!')
- MSI_total=pd.DataFrame()
- MSI_total.loc[0, 'sampleid'] = sampleid
- MSI_total.loc[0, 'Total_Number_of_Sites'] = 0
- MSI_total.loc[0, 'Number_of_Somatic_Sites'] = 0
- MSI_total.loc[0, '%'] = 0
- MSI_total.loc[0, 'MSIcount_NCI'] = 0
- MSI_total.loc[0, 'MSIcount_Pantaplex'] = 0
- MSI_total.loc[0, 'MSI_label'] = 'No file'
- print(MSI_total)
- outputfile1 = os.path.join(sample_dir, laneid + '-' + sampleid + '.msi.xlsx')
- writer = pd.ExcelWriter(outputfile1)
- MSI_total.to_excel(writer, sheet_name='msi', index=False)
- writer.save()
- writer.close()
- #输出MIS中间文件
- # make the result dir
- temple_dir = os.path.join(inputpath, 'tempfile')
- if not os.path.exists(temple_dir):
- os.mkdir(temple_dir)
- templeMSI_dir = os.path.join(temple_dir, 'MSI')
- if not os.path.exists(templeMSI_dir):
- os.mkdir(templeMSI_dir)
- outputfile2=os.path.join(templeMSI_dir,laneid+'_'+sampleid+'_MSI_filter.txt')
- somaticdata.to_csv(outputfile2,sep='\t',index=False,header=True)
- outputfile3=os.path.join(templeMSI_dir,laneid+'_'+sampleid+'_MSI_filter_stastic.txt')
- MSI_total.to_csv(outputfile3,sep='\t',index=False,header=True)
- if __name__=='__main__':
- parser = argparse.ArgumentParser(description='filter the varscan somatic data')
- parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
- parser.add_argument('-s', '--sample', type=str, help='the sample name')
- args = parser.parse_args()
- Inputpath = args.inputpath
- Sampleid = args.sample
- MSI_pair_15R_run(Inputpath,Sampleid)
- ###for all
- def MSIsummary(inputpath,laneid):
- datasummarydir = os.path.join(inputpath, 'datasummary')
- isExists = os.path.exists(datasummarydir)
- if not isExists:
- os.makedirs(datasummarydir)
- sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
- samplelist = pd.read_table(sampledir, sep='\t', header=0)
- MSIdatasummary = pd.DataFrame()
- for i in range(len(samplelist)):
- sampleid = samplelist.loc[i, 'samplename']
- print(sampleid)
- MSI = MSI_pair_15R_(inputpath, laneid, sampleid)
- MSIdatasummary = MSIdatasummary.append(MSI)
- outputname = datasummarydir + '/' + laneid + '_table5_MSI_datasummary.txt'
- MSIdatasummary.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')
|