#对msisensor的结果进行分析 #MSI建议采用美国国家癌症研究院( NCI) 推荐的5 个微卫星( MS) 检测位点( BAT25、BAT26、D5S346、D2S123 和D17S250) 。 # #判断标准为3 级: 所有5 个位点均稳定为微卫星稳定( MSS) ,1个位点不稳定为微卫星低度不稳定( MSI-L) , # #2个及2 个以上位点不稳定为微卫星高度不稳定( MSI-H) 。 #本次分析3个及其以上为MSI-H,2个和1个为MSI-L,0个为MSS #菁良panel的判断标准是 #位点如下NR-21 (A)21;NR-24 (A)24;NR-27 (A)27;BAT-25 (A)25;BAT-26 (A)26和MONO-27 (A)2 #MSS:所有位点均稳定为MSS均检测不出来MSI #MSI-H:六个位点中有两个及其以上的被检出 #MSI-L:六个位点中只有1个被检测出 #1.对.prefix分析,结果是统计了整个样本中MSI的数目 #2..prefix_dis是具体的MSI信息 #3.prefix_somatic对somatic的MSI进行分析 #####################################################对paired文件进行分析 #####对配对的样本进行MSI统计 #针对prefix_somatic文件每个MS位点,根据该位点的discrimination_value_ML得分(第七列)来判断是否稳定, #大于0.5认为是不稳定。然后统计每个样本种有多少个不稳定MS位点,>=3个就认为是MSI-H。(3=15*0.2) #没有结果的标签设置为no result #样本有问题的不输出,再最后结果check标识需要进一步核对 import pandas as pd import os,os.path import numpy as np import xlrd import argparse def MSI_pair_15R_run_v1(inputpath,sampleid): laneid = inputpath.split('/')[-1].split('-')[-1] R15data=pd.read_csv('/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/MSI_15.slop50.bed',sep='\t',usecols=[0,1,2,3],names=['chr','start','end','MSI_name']) MSIdir = os.path.join(inputpath, '3MSI_msisensor2_pair') # make the result dir result_dir = os.path.join(inputpath, 'resultfile') if not os.path.exists(result_dir): os.mkdir(result_dir) sample_dir = os.path.join(result_dir, sampleid) if not os.path.exists(sample_dir): os.mkdir(sample_dir) MSIdata=MSIdir + '/' + sampleid + ".15STR" if os.path.exists(MSIdata): # read the MSI total count STRdir=os.path.join(MSIdir,sampleid + ".15STR") MSI_total = pd.read_csv(STRdir, sep='\t') MSI_total.insert(0, 'sampleid', sampleid) # for the somatic data somaticdir = os.path.join(MSIdir, sampleid + '.15STR_somatic') somaticdata_input = pd.read_csv(somaticdir, sep='\t',names=['chr', 'pos', 'left', 'repeat_count', 'repeat_base', 'right','discrimination_value_ML', 'score2', 'score3', 'score4']) somaticdata_input.insert(0, 'sampleid', sampleid) #discrimination_value_ML filter somaticdata = somaticdata_input[somaticdata_input['discrimination_value_ML'] > 0.5] somaticdata.reset_index(drop=True, inplace=True) MSIcount_NCI = 0 MSIcount_Pantaplex = 0 if len(somaticdata) > 0: for i in range(len(somaticdata)): for j in range(len(R15data)): if (somaticdata.loc[i, 'chr'] == R15data.loc[j, 'chr']) & ( somaticdata.loc[i, 'pos'] >= R15data.loc[j, 'start']) & ( somaticdata.loc[i, 'pos'] <= R15data.loc[j, 'end']): somaticdata.loc[i, 'MSI_name'] = R15data.loc[j, 'MSI_name'] if '(NCI)' in R15data.loc[j, 'MSI_name']: MSIcount_NCI = MSIcount_NCI + 1 elif '(Pentaplex)' in R15data.loc[j, 'MSI_name']: MSIcount_Pantaplex = MSIcount_Pantaplex + 1 elif '(NCI,Pentaplex)' in R15data.loc[j, 'MSI_name']: MSIcount_NCI = MSIcount_NCI + 1 MSIcount_Pantaplex = MSIcount_Pantaplex + 1 else: somaticdata.loc[i, 'MSI_name'] = np.nan MSI_total.loc[0, 'MSIcount_15R'] = (~somaticdata['MSI_name'].isna()).sum() MSI_total.loc[0, 'MSIcount_NCI'] = MSIcount_NCI MSI_total.loc[0, 'MSIcount_Pantaplex'] = MSIcount_Pantaplex else: MSI_total.loc[0, 'MSIcount_15R'] = 0 MSI_total.loc[0, 'MSIcount_NCI'] = 0 MSI_total.loc[0, 'MSIcount_Pantaplex'] = 0 # MSI type if (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] >= 3): MSI_total.loc[0, 'MSI_label'] = 'MSI-H' elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 2): MSI_total.loc[0, 'MSI_label'] = 'MSI-L' elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 1): MSI_total.loc[0, 'MSI_label'] = 'MSI-L' elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 0): MSI_total.loc[0, 'MSI_label'] = 'MSS' else: MSI_total.loc[0, 'MSI_label'] = 'No MSI result' else: print(sampleid + ' has no MSI data,please check the data!') MSI_total=pd.DataFrame() MSI_total.loc[0, 'sampleid'] = sampleid MSI_total.loc[0, 'Total_Number_of_Sites'] = 0 MSI_total.loc[0, 'Number_of_Somatic_Sites'] = 0 MSI_total.loc[0, '%'] = 0 MSI_total.loc[0, 'MSIcount_15R'] = 0 MSI_total.loc[0, 'MSIcount_NCI'] = 0 MSI_total.loc[0, 'MSIcount_Pantaplex'] = 0 MSI_total.loc[0, 'MSI_label'] = 'No file' outputfile1 = os.path.join(sample_dir, laneid + '-' + sampleid + '.msi.xlsx') writer = pd.ExcelWriter(outputfile1) MSI_total.to_excel(writer, sheet_name='msi', index=False) writer.save() writer.close() ###修改输出结果的Number_of_Somatic_Sites,这个里面的结果为在我们15个MSI范围内的,因为原始分析的MSI文件是在15个MSI的位点左右扩展了 #因此somatic的结果为位点的个数,实际优的不在MSI位点范围内。 ##NCI位点:BAT-25,BAT-26,D2S123,D5S346,D17S250 ###pantaplex位点:NR-21,NR-27,NR-24,BAT-25,BAT-26 #BAT-25和BAT-26同时属于pantaplex和NCI def MSI_pair_15R_run(inputpath,sampleid): laneid = inputpath.split('/')[-1].split('-')[-1] R15data=pd.read_csv('/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/MSI_15.slop50.bed',sep='\t',usecols=[0,1,2,3],names=['chr','start','end','MSI_name']) MSIdir = os.path.join(inputpath, '3MSI_msisensor2_pair') # make the result dir result_dir = os.path.join(inputpath, 'resultfile') if not os.path.exists(result_dir): os.mkdir(result_dir) sample_dir = os.path.join(result_dir, sampleid) if not os.path.exists(sample_dir): os.mkdir(sample_dir) MSIdata=MSIdir + '/' + sampleid + ".15STR" if os.path.exists(MSIdata): # read the MSI total count STRdir=os.path.join(MSIdir,sampleid + ".15STR") MSI_total0 = pd.read_csv(STRdir, sep='\t') MSI_total0.insert(0, 'sampleid', sampleid) #只保留总的位点数据 MSI_total=MSI_total0[['sampleid','Total_Number_of_Sites']] # for the somatic data somaticdir = os.path.join(MSIdir, sampleid + '.15STR_somatic') somaticdata_input = pd.read_csv(somaticdir, sep='\t',names=['chr', 'pos', 'left', 'repeat_count', 'repeat_base', 'right','discrimination_value_ML', 'score2', 'score3', 'score4']) somaticdata_input.insert(0, 'sampleid', sampleid) #discrimination_value_ML filter somaticdata = somaticdata_input[somaticdata_input['discrimination_value_ML'] > 0.5] somaticdata.reset_index(drop=True, inplace=True) MSIcount_total=0 MSIcount_NCI = 0 MSIcount_Pantaplex = 0 if len(somaticdata) > 0: for i in range(len(somaticdata)): for j in range(len(R15data)): if (somaticdata.loc[i, 'chr'] == R15data.loc[j, 'chr']) & ( somaticdata.loc[i, 'pos'] >= R15data.loc[j, 'start']) & ( somaticdata.loc[i, 'pos'] <= R15data.loc[j, 'end']): somaticdata.loc[i, 'MSI_name'] = R15data.loc[j, 'MSI_name'] if '(NCI)' in R15data.loc[j, 'MSI_name']: MSIcount_NCI = MSIcount_NCI + 1 MSIcount_total=MSIcount_total+1 elif '(Pentaplex)' in R15data.loc[j, 'MSI_name']: MSIcount_Pantaplex = MSIcount_Pantaplex + 1 MSIcount_total=MSIcount_total+1 elif '(NCI,Pentaplex)' in R15data.loc[j, 'MSI_name']: MSIcount_NCI = MSIcount_NCI + 1 MSIcount_Pantaplex = MSIcount_Pantaplex + 1 MSIcount_total=MSIcount_total+1 else: MSIcount_total=MSIcount_total+1 print(somaticdata) MSI_total.loc[0, 'Number_of_Somatic_Sites'] =MSIcount_total MSI_total.loc[0, '%'] =round(100*MSIcount_total/MSI_total.loc[0, 'Total_Number_of_Sites'],2) MSI_total.loc[0, 'MSIcount_NCI'] = MSIcount_NCI MSI_total.loc[0, 'MSIcount_Pantaplex'] = MSIcount_Pantaplex else: MSI_total.loc[0, 'Number_of_Somatic_Sites'] = 0 MSI_total.loc[0, '%'] =0 MSI_total.loc[0, 'MSIcount_NCI'] = 0 MSI_total.loc[0, 'MSIcount_Pantaplex'] = 0 # MSI type if (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] >= 3): MSI_total.loc[0, 'MSI_label'] = 'MSI-H' elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 2): MSI_total.loc[0, 'MSI_label'] = 'MSI-L' elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 1): MSI_total.loc[0, 'MSI_label'] = 'MSI-L' elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 0): MSI_total.loc[0, 'MSI_label'] = 'MSS' else: MSI_total.loc[0, 'MSI_label'] = 'No MSI result' #return MSI_total else: print(sampleid + ' has no MSI data,please check the data!') MSI_total=pd.DataFrame() MSI_total.loc[0, 'sampleid'] = sampleid MSI_total.loc[0, 'Total_Number_of_Sites'] = 0 MSI_total.loc[0, 'Number_of_Somatic_Sites'] = 0 MSI_total.loc[0, '%'] = 0 MSI_total.loc[0, 'MSIcount_NCI'] = 0 MSI_total.loc[0, 'MSIcount_Pantaplex'] = 0 MSI_total.loc[0, 'MSI_label'] = 'No file' print(MSI_total) outputfile1 = os.path.join(sample_dir, laneid + '-' + sampleid + '.msi.xlsx') writer = pd.ExcelWriter(outputfile1) MSI_total.to_excel(writer, sheet_name='msi', index=False) writer.save() writer.close() #输出MIS中间文件 # make the result dir temple_dir = os.path.join(inputpath, 'tempfile') if not os.path.exists(temple_dir): os.mkdir(temple_dir) templeMSI_dir = os.path.join(temple_dir, 'MSI') if not os.path.exists(templeMSI_dir): os.mkdir(templeMSI_dir) outputfile2=os.path.join(templeMSI_dir,laneid+'_'+sampleid+'_MSI_filter.txt') somaticdata.to_csv(outputfile2,sep='\t',index=False,header=True) outputfile3=os.path.join(templeMSI_dir,laneid+'_'+sampleid+'_MSI_filter_stastic.txt') MSI_total.to_csv(outputfile3,sep='\t',index=False,header=True) if __name__=='__main__': parser = argparse.ArgumentParser(description='filter the varscan somatic data') parser.add_argument('-i', '--inputpath', type=str, help='the path of lane') parser.add_argument('-s', '--sample', type=str, help='the sample name') args = parser.parse_args() Inputpath = args.inputpath Sampleid = args.sample MSI_pair_15R_run(Inputpath,Sampleid) ###for all def MSIsummary(inputpath,laneid): datasummarydir = os.path.join(inputpath, 'datasummary') isExists = os.path.exists(datasummarydir) if not isExists: os.makedirs(datasummarydir) sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt') samplelist = pd.read_table(sampledir, sep='\t', header=0) MSIdatasummary = pd.DataFrame() for i in range(len(samplelist)): sampleid = samplelist.loc[i, 'samplename'] print(sampleid) MSI = MSI_pair_15R_(inputpath, laneid, sampleid) MSIdatasummary = MSIdatasummary.append(MSI) outputname = datasummarydir + '/' + laneid + '_table5_MSI_datasummary.txt' MSIdatasummary.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')