chaowen.xi
/
602panel_test


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249
							#对msisensor的结果进行分析

#MSI建议采用美国国家癌症研究院( NCI) 推荐的5 个微卫星( MS) 检测位点( BAT25、BAT26、D5S346、D2S123 和D17S250) 。
# #判断标准为3 级: 所有5 个位点均稳定为微卫星稳定( MSS) ，1个位点不稳定为微卫星低度不稳定( MSI-L) ，
# #2个及2 个以上位点不稳定为微卫星高度不稳定( MSI-H) 。
#本次分析3个及其以上为MSI-H,2个和1个为MSI-L，0个为MSS

#菁良panel的判断标准是
#位点如下NR-21 (A)21；NR-24 (A)24；NR-27 (A)27；BAT-25 (A)25；BAT-26 (A)26和MONO-27 (A)2
#MSS:所有位点均稳定为MSS均检测不出来MSI
#MSI-H:六个位点中有两个及其以上的被检出
#MSI-L:六个位点中只有1个被检测出

#1.对.prefix分析，结果是统计了整个样本中MSI的数目
#2..prefix_dis是具体的MSI信息
#3.prefix_somatic对somatic的MSI进行分析
#####################################################对paired文件进行分析
#####对配对的样本进行MSI统计
#针对prefix_somatic文件每个MS位点，根据该位点的discrimination_value_ML得分（第七列）来判断是否稳定，
#大于0.5认为是不稳定。然后统计每个样本种有多少个不稳定MS位点，>=3个就认为是MSI-H。（3=15*0.2）
#没有结果的标签设置为no result
#样本有问题的不输出，再最后结果check标识需要进一步核对

import pandas as pd
import os,os.path
import numpy as np
import xlrd
import argparse


def MSI_pair_15R_run_v1(inputpath,sampleid):
	laneid = inputpath.split('/')[-1].split('-')[-1]
	R15data=pd.read_csv('/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/MSI_15.slop50.bed',sep='\t',usecols=[0,1,2,3],names=['chr','start','end','MSI_name'])
	MSIdir = os.path.join(inputpath, '3MSI_msisensor2_pair')
	# make the result dir
	result_dir = os.path.join(inputpath, 'resultfile')
	if not os.path.exists(result_dir):
		os.mkdir(result_dir)
	sample_dir = os.path.join(result_dir, sampleid)
	if not os.path.exists(sample_dir):
		os.mkdir(sample_dir)
	MSIdata=MSIdir + '/' + sampleid + ".15STR"
	if os.path.exists(MSIdata):
		# read the MSI total count
		STRdir=os.path.join(MSIdir,sampleid + ".15STR")
		MSI_total = pd.read_csv(STRdir, sep='\t')
		MSI_total.insert(0, 'sampleid', sampleid)
		# for the somatic data
		somaticdir = os.path.join(MSIdir, sampleid + '.15STR_somatic')
		somaticdata_input = pd.read_csv(somaticdir, sep='\t',names=['chr', 'pos', 'left', 'repeat_count', 'repeat_base', 'right','discrimination_value_ML', 'score2', 'score3', 'score4'])
		somaticdata_input.insert(0, 'sampleid', sampleid)
		#discrimination_value_ML filter
		somaticdata = somaticdata_input[somaticdata_input['discrimination_value_ML'] > 0.5]
		somaticdata.reset_index(drop=True, inplace=True)
		MSIcount_NCI = 0
		MSIcount_Pantaplex = 0
		if len(somaticdata) > 0:
			for i in range(len(somaticdata)):
				for j in range(len(R15data)):
					if (somaticdata.loc[i, 'chr'] == R15data.loc[j, 'chr']) & (
							somaticdata.loc[i, 'pos'] >= R15data.loc[j, 'start']) & (
							somaticdata.loc[i, 'pos'] <= R15data.loc[j, 'end']):
						somaticdata.loc[i, 'MSI_name'] = R15data.loc[j, 'MSI_name']
						if '(NCI)' in R15data.loc[j, 'MSI_name']:
							MSIcount_NCI = MSIcount_NCI + 1
						elif '(Pentaplex)' in R15data.loc[j, 'MSI_name']:
							MSIcount_Pantaplex = MSIcount_Pantaplex + 1
						elif '(NCI,Pentaplex)' in R15data.loc[j, 'MSI_name']:
							MSIcount_NCI = MSIcount_NCI + 1
							MSIcount_Pantaplex = MSIcount_Pantaplex + 1
					else:
						somaticdata.loc[i, 'MSI_name'] = np.nan
			MSI_total.loc[0, 'MSIcount_15R'] = (~somaticdata['MSI_name'].isna()).sum()
			MSI_total.loc[0, 'MSIcount_NCI'] = MSIcount_NCI
			MSI_total.loc[0, 'MSIcount_Pantaplex'] = MSIcount_Pantaplex
		else:
			MSI_total.loc[0, 'MSIcount_15R'] = 0
			MSI_total.loc[0, 'MSIcount_NCI'] = 0
			MSI_total.loc[0, 'MSIcount_Pantaplex'] = 0
		# MSI type
		if (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] >= 3):
			MSI_total.loc[0, 'MSI_label'] = 'MSI-H'
		elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 2):
			MSI_total.loc[0, 'MSI_label'] = 'MSI-L'
		elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 1):
			MSI_total.loc[0, 'MSI_label'] = 'MSI-L'
		elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 0):
			MSI_total.loc[0, 'MSI_label'] = 'MSS'
		else:
			MSI_total.loc[0, 'MSI_label'] = 'No MSI result'
	else:
		print(sampleid + ' has no MSI data,please check the data!')
		MSI_total=pd.DataFrame()
		MSI_total.loc[0, 'sampleid'] = sampleid
		MSI_total.loc[0, 'Total_Number_of_Sites'] = 0
		MSI_total.loc[0, 'Number_of_Somatic_Sites'] = 0
		MSI_total.loc[0, '%'] = 0
		MSI_total.loc[0, 'MSIcount_15R'] = 0
		MSI_total.loc[0, 'MSIcount_NCI'] = 0
		MSI_total.loc[0, 'MSIcount_Pantaplex'] = 0
		MSI_total.loc[0, 'MSI_label'] = 'No file'
	outputfile1 = os.path.join(sample_dir, laneid + '-' + sampleid + '.msi.xlsx')
	writer = pd.ExcelWriter(outputfile1)
	MSI_total.to_excel(writer, sheet_name='msi', index=False)
	writer.save()
	writer.close()


###修改输出结果的Number_of_Somatic_Sites，这个里面的结果为在我们15个MSI范围内的，因为原始分析的MSI文件是在15个MSI的位点左右扩展了
#因此somatic的结果为位点的个数，实际优的不在MSI位点范围内。
##NCI位点：BAT-25,BAT-26,D2S123,D5S346,D17S250
###pantaplex位点：NR-21，NR-27，NR-24，BAT-25，BAT-26
#BAT-25和BAT-26同时属于pantaplex和NCI
def MSI_pair_15R_run(inputpath,sampleid):
	laneid = inputpath.split('/')[-1].split('-')[-1]
	R15data=pd.read_csv('/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/MSI_15.slop50.bed',sep='\t',usecols=[0,1,2,3],names=['chr','start','end','MSI_name'])
	MSIdir = os.path.join(inputpath, '3MSI_msisensor2_pair')
	# make the result dir
	result_dir = os.path.join(inputpath, 'resultfile')
	if not os.path.exists(result_dir):
		os.mkdir(result_dir)
	sample_dir = os.path.join(result_dir, sampleid)
	if not os.path.exists(sample_dir):
		os.mkdir(sample_dir)
	MSIdata=MSIdir + '/' + sampleid + ".15STR"
	if os.path.exists(MSIdata):
		# read the MSI total count
		STRdir=os.path.join(MSIdir,sampleid + ".15STR")
		MSI_total0 = pd.read_csv(STRdir, sep='\t')
		MSI_total0.insert(0, 'sampleid', sampleid)
		#只保留总的位点数据
		MSI_total=MSI_total0[['sampleid','Total_Number_of_Sites']]
		# for the somatic data
		somaticdir = os.path.join(MSIdir, sampleid + '.15STR_somatic')
		somaticdata_input = pd.read_csv(somaticdir, sep='\t',names=['chr', 'pos', 'left', 'repeat_count', 'repeat_base', 'right','discrimination_value_ML', 'score2', 'score3', 'score4'])
		somaticdata_input.insert(0, 'sampleid', sampleid)
		#discrimination_value_ML filter
		somaticdata = somaticdata_input[somaticdata_input['discrimination_value_ML'] > 0.5]
		somaticdata.reset_index(drop=True, inplace=True)
		MSIcount_total=0
		MSIcount_NCI = 0
		MSIcount_Pantaplex = 0
		if len(somaticdata) > 0:
			for i in range(len(somaticdata)):
				for j in range(len(R15data)):
					if (somaticdata.loc[i, 'chr'] == R15data.loc[j, 'chr']) & (
							somaticdata.loc[i, 'pos'] >= R15data.loc[j, 'start']) & (
							somaticdata.loc[i, 'pos'] <= R15data.loc[j, 'end']):
						somaticdata.loc[i, 'MSI_name'] = R15data.loc[j, 'MSI_name']
						if '(NCI)' in R15data.loc[j, 'MSI_name']:
							MSIcount_NCI = MSIcount_NCI + 1
							MSIcount_total=MSIcount_total+1
						elif '(Pentaplex)' in R15data.loc[j, 'MSI_name']:
							MSIcount_Pantaplex = MSIcount_Pantaplex + 1
							MSIcount_total=MSIcount_total+1
						elif '(NCI,Pentaplex)' in R15data.loc[j, 'MSI_name']:
							MSIcount_NCI = MSIcount_NCI + 1
							MSIcount_Pantaplex = MSIcount_Pantaplex + 1
							MSIcount_total=MSIcount_total+1
						else:
							MSIcount_total=MSIcount_total+1
			print(somaticdata)
			MSI_total.loc[0, 'Number_of_Somatic_Sites'] =MSIcount_total
			MSI_total.loc[0, '%'] =round(100*MSIcount_total/MSI_total.loc[0, 'Total_Number_of_Sites'],2)
			MSI_total.loc[0, 'MSIcount_NCI'] = MSIcount_NCI
			MSI_total.loc[0, 'MSIcount_Pantaplex'] = MSIcount_Pantaplex
		else:
			MSI_total.loc[0, 'Number_of_Somatic_Sites'] = 0
			MSI_total.loc[0, '%'] =0
			MSI_total.loc[0, 'MSIcount_NCI'] = 0
			MSI_total.loc[0, 'MSIcount_Pantaplex'] = 0
		# MSI type
		if (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] >= 3):
			MSI_total.loc[0, 'MSI_label'] = 'MSI-H'
		elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 2):
			MSI_total.loc[0, 'MSI_label'] = 'MSI-L'
		elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 1):
			MSI_total.loc[0, 'MSI_label'] = 'MSI-L'
		elif (MSI_total.loc[0, 'Total_Number_of_Sites'] > 0) and (MSI_total.loc[0, 'MSIcount_NCI'] == 0):
			MSI_total.loc[0, 'MSI_label'] = 'MSS'
		else:
			MSI_total.loc[0, 'MSI_label'] = 'No MSI result'
	#return MSI_total
	else:
		print(sampleid + ' has no MSI data,please check the data!')
		MSI_total=pd.DataFrame()
		MSI_total.loc[0, 'sampleid'] = sampleid
		MSI_total.loc[0, 'Total_Number_of_Sites'] = 0
		MSI_total.loc[0, 'Number_of_Somatic_Sites'] = 0
		MSI_total.loc[0, '%'] = 0
		MSI_total.loc[0, 'MSIcount_NCI'] = 0
		MSI_total.loc[0, 'MSIcount_Pantaplex'] = 0
		MSI_total.loc[0, 'MSI_label'] = 'No file'
	print(MSI_total)
	outputfile1 = os.path.join(sample_dir, laneid + '-' + sampleid + '.msi.xlsx')
	writer = pd.ExcelWriter(outputfile1)
	MSI_total.to_excel(writer, sheet_name='msi', index=False)
	writer.save()
	writer.close()
	#输出MIS中间文件
	# make the result dir
	temple_dir = os.path.join(inputpath, 'tempfile')
	if not os.path.exists(temple_dir):
		os.mkdir(temple_dir)
	templeMSI_dir = os.path.join(temple_dir, 'MSI')
	if not os.path.exists(templeMSI_dir):
		os.mkdir(templeMSI_dir)
	outputfile2=os.path.join(templeMSI_dir,laneid+'_'+sampleid+'_MSI_filter.txt')
	somaticdata.to_csv(outputfile2,sep='\t',index=False,header=True)
	outputfile3=os.path.join(templeMSI_dir,laneid+'_'+sampleid+'_MSI_filter_stastic.txt')
	MSI_total.to_csv(outputfile3,sep='\t',index=False,header=True)


if __name__=='__main__':
	parser = argparse.ArgumentParser(description='filter the varscan somatic data')
	parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
	parser.add_argument('-s', '--sample', type=str, help='the sample name')
	args = parser.parse_args()
	Inputpath = args.inputpath
	Sampleid = args.sample
	MSI_pair_15R_run(Inputpath,Sampleid)


###for all
def MSIsummary(inputpath,laneid):
	datasummarydir = os.path.join(inputpath, 'datasummary')
	isExists = os.path.exists(datasummarydir)
	if not isExists:
		os.makedirs(datasummarydir)
	sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
	samplelist = pd.read_table(sampledir, sep='\t', header=0)
	MSIdatasummary = pd.DataFrame()
	for i in range(len(samplelist)):
		sampleid = samplelist.loc[i, 'samplename']
		print(sampleid)
		MSI = MSI_pair_15R_(inputpath, laneid, sampleid)
		MSIdatasummary = MSIdatasummary.append(MSI)
	outputname = datasummarydir + '/' + laneid + '_table5_MSI_datasummary.txt'
	MSIdatasummary.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')