chaowen.xi
/
602panel_test


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349
							
import pandas as pd
import os
import os.path
import xlrd
import argparse


####修改
#经过统计男性X染色体上Ratio的中值为0.5，女性为1.
#除性染色体外，对其他染色体上所有ratio小于0.6的cnv统计，发下有很多的cnvloss主要集中在Ratio[0.5,0.6]之间
#但仍有很多集中在0.3-0.5之间。为了获得更为严苛的cnvloss.修订以下规则
#1.筛选depth大于等于1000且ratio<=0.3的突变。
##2.对于扩增，组织,筛选ratio>=1.5；对于血液，筛选ratio>=4
#3.去除化疗相关基因


def cnv_sample_run_test(inputpath,sample):
	laneid = inputpath.split('/')[-1].split('-')[-1]
	panel_genelistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/602gene_select.txt'
	panel_genelist = pd.read_table(panel_genelistdir, sep='\t', header=0)
	druglistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_genelist_20220829.txt'
	drug_genelist = pd.read_table(druglistdir, sep='\t', header=None, names=['gene'])
	drug_genelist['drug_lable'] = 'Drug_gene'
	namelist = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2','Ratio', 'Copynum', 'depth', 'CNV_label','Gene_label']
	namelistout = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2',  'Copynum', 'depth', 'CNV_label','Gene_label']
	datasummarydir = os.path.join(inputpath, 'datasummary')
	isExists = os.path.exists(datasummarydir)
	if not isExists:
		os.makedirs(datasummarydir)
	cnvdir = os.path.join(inputpath, '2CNV_cnvkit_pair')
	sampledir=os.path.join(inputpath,laneid+'_sample_infor_label.txt')
	samplelist = pd.read_table(sampledir, sep='\t')
	sampledata = samplelist[samplelist['samplename'] == sample]
	sampledata.reset_index(drop=True, inplace=True)
	sample = sampledata.loc[0, 'samplename']
	sampletype = sampledata.loc[0, 'sampletype']
	# make the result dir
	result_dir = os.path.join(inputpath, 'resultfile')
	if not os.path.exists(result_dir):
		os.mkdir(result_dir)
	sample_dir = os.path.join(result_dir, sample)
	if not os.path.exists(sample_dir):
		os.mkdir(sample_dir)
	cnvdatadir = cnvdir + '/' + sample + '.cnv.unfilter.txt'
	if os.path.exists(cnvdatadir):
		cnvdata = pd.read_table(cnvdatadir, sep='\t', header=0)
		cnvfilter0 = cnvdata.dropna(subset=['CNV_label'])
		cnvfilter1 = cnvfilter0[cnvfilter0['CNV_label'] != '']
		# extract the panel gene
		cnvfilter = pd.merge(cnvfilter1, panel_genelist, on=['gene'], how='inner')
		cnvfilter.reset_index(drop=True, inplace=True)
		del cnvfilter['druggene_label']
		if len(cnvfilter) == 0:
			cnvfilter_final = pd.DataFrame(columns=namelist)
			cnvfilter_final.loc[0, 'sampleid'] = sample
			cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
		else:
			##remove the drug gene
			CNVfilter_druglabel = pd.merge(cnvfilter, drug_genelist, on=['gene'], how='left')
			CNVfilter1 = CNVfilter_druglabel[CNVfilter_druglabel['drug_lable'] != 'Drug_gene']
			del CNVfilter1['drug_lable']
			if len(CNVfilter1) != 0:
				# 筛选CNV，对于组织，Ratio>=1.5或者Ratio<=0.3；对于血液Ratio>=4或者Ratio<=0.3
				if sampletype == 'FFPE':
					CNVfilter2 = CNVfilter1[(CNVfilter1['Ratio'] >= 1.5) | (CNVfilter1['Ratio'] <= 0.3)]
				elif sampletype == 'blood':
					CNVfilter2 = CNVfilter1[(CNVfilter1['Ratio'] >= 4) | (CNVfilter1['Ratio'] <= 0.3)]
				if len(CNVfilter2) != 0:
					##对cnvloss进一步过滤,筛选depth>=1000
					cnvlossdata = CNVfilter2[CNVfilter2['CNV_label'] == 'CopyNumberLoss']
					cnvloss_filter = cnvlossdata[cnvlossdata['depth'] >= 1000]
					cnvfilter_final = CNVfilter2[CNVfilter2['CNV_label'] != 'CopyNumberLoss'].append(cnvloss_filter)
					cnvfilter_final.reset_index(drop=True, inplace=True)
					if len(cnvfilter_final) == 0:
						cnvfilter_final = pd.DataFrame(columns=namelist)
						cnvfilter_final.loc[0, 'sampleid'] = sample
						cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
				else:
					cnvfilter_final = pd.DataFrame(columns=namelist)
					cnvfilter_final.loc[0, 'sampleid'] = sample
					cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
			else:
				cnvfilter_final = pd.DataFrame(columns=namelist)
				cnvfilter_final.loc[0, 'sampleid'] = sample
				cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
		cnvfilter_output = cnvfilter_final[namelistout]
		outputfile1 = os.path.join(sample_dir, laneid + '-' + sample + '.cnv.xlsx')
		writer = pd.ExcelWriter(outputfile1)
		cnvfilter_output.to_excel(writer, sheet_name='CNV', index=False)
		writer.save()
		writer.close()
	else:
		cnvfilter_final = pd.DataFrame(columns=namelist)
		cnvfilter_final.loc[0, 'sampleid'] = sample
		cnvfilter_final.loc[0, 'CNV_label'] = 'No file'
		# make the temp dir
		temp_dir = os.path.join(inputpath, 'tempfile')
		if not os.path.exists(temp_dir):
			os.mkdir(temp_dir)
		bugfile_dir = os.path.join(temp_dir, 'bugfile')
		if not os.path.exists(bugfile_dir):
			os.mkdir(bugfile_dir)
		outputfile2 = os.path.join(bugfile_dir, laneid + '-' + sample + '.cnv.nofile.log.txt')
		cnvfilter_final.to_csv(outputfile2, index=False, header=True, encoding='gbk', sep='\t')


####加入在样本中出现的次数


def CNVcounts(cnvfilter,inputpath,sample):
	# add the cnv counts in 117 samples
	cnvbgdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/CNV_117sample_stastic_risk_20220929.txt'
	cnvbgdata = pd.read_table(cnvbgdir, sep='\t', header=0)
	cnvfilter['CNV_gene'] = cnvfilter['gene'] + '_' + cnvfilter['chr'].astype('str') + '_' + cnvfilter['start'].astype('str') + '_' + cnvfilter['end'].astype('str')
	cnvfilter_counts = pd.merge(cnvfilter, cnvbgdata, on=['CNV_gene'], how='left')
	del cnvfilter_counts['CNV_gene']
	#add the tier infor
	cnvtierdir1 = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/CNV_tier_infor.txt'
	cnvtierdata= pd.read_table(cnvtierdir1, sep='\t', header=0)
	cnvfilter_counts_tier=pd.merge(cnvfilter_counts,cnvtierdata,on=['gene','CNV_label'],how='left')
	# make the temp dir
	temp_dir = os.path.join(inputpath, 'tempfile')
	if not os.path.exists(temp_dir):
		os.mkdir(temp_dir)
	tempcnv_dir = os.path.join(temp_dir, 'CNV')
	if not os.path.exists(tempcnv_dir):
		os.mkdir(tempcnv_dir)
	outputname = os.path.join(tempcnv_dir, sample + '_CNVunfilter_samplecounts.txt')
	cnvfilter_counts_tier.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')
	return cnvfilter_counts_tier


####修改
#经过统计男性X染色体上Ratio的中值为0.5，女性为1.
#除性染色体外，对其他染色体上所有ratio小于0.6的cnv统计，发下有很多的cnvloss主要集中在Ratio[0.5,0.6]之间
#但仍有很多集中在0.3-0.5之间。为了获得更为严苛的cnvloss.修订以下规则
#1.筛选depth大于等于1000且ratio<=0.3的突变。
##2.对于扩增，组织,筛选ratio>=2；对于血液，筛选ratio>=4
#3.去除化疗相关基因

def cnv_sample_run(inputpath,sample):
	laneid = inputpath.split('/')[-1].split('-')[-1]
	panel_genelistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/602gene_select.txt'
	panel_genelist = pd.read_table(panel_genelistdir, sep='\t', header=0)
	druglistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_genelist_20220829.txt'
	drug_genelist = pd.read_table(druglistdir, sep='\t', header=None, names=['gene'])
	drug_genelist['drug_lable'] = 'Drug_gene'
	namelist = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2','Ratio', 'Copynum', 'depth', 'CNV_label','Gene_label','Tier_label']
	namelistout = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2',  'Copynum', 'depth', 'CNV_label','Gene_label','Tier_label']
	datasummarydir = os.path.join(inputpath, 'datasummary')
	isExists = os.path.exists(datasummarydir)
	if not isExists:
		os.makedirs(datasummarydir)
	cnvdir = os.path.join(inputpath, '2CNV_cnvkit_pair')
	sampledir=os.path.join(inputpath,laneid+'_sample_infor_label.txt')
	samplelist = pd.read_table(sampledir, sep='\t')
	sampledata = samplelist[samplelist['samplename'] == sample]
	sampledata.reset_index(drop=True, inplace=True)
	sampletype = sampledata.loc[0, 'sampletype']
	# make the result dir
	result_dir = os.path.join(inputpath, 'resultfile')
	if not os.path.exists(result_dir):
		os.mkdir(result_dir)
	sample_dir = os.path.join(result_dir, sample)
	if not os.path.exists(sample_dir):
		os.mkdir(sample_dir)
	cnvdatadir = cnvdir + '/' + sample + '.cnv.unfilter.txt'
	if os.path.exists(cnvdatadir):
		cnvdata = pd.read_table(cnvdatadir, sep='\t', header=0)
		cnvfilter0 = cnvdata.dropna(subset=['CNV_label'])
		cnvfilter1 = cnvfilter0[cnvfilter0['CNV_label'] != '']
		# extract the panel gene
		cnvfilter1_0 = pd.merge(cnvfilter1, panel_genelist, on=['gene'], how='inner')
		cnvfilter1_0 .reset_index(drop=True, inplace=True)
		del cnvfilter1_0['druggene_label']
		# add the cnv counts in 117 samples
		cnvfilter = CNVcounts(cnvfilter1_0, inputpath, sample)
		if len(cnvfilter) == 0:
			cnvfilter_final = pd.DataFrame(columns=namelist)
			cnvfilter_final.loc[0, 'sampleid'] = sample
			cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
		else:
			##remove the drug gene
			CNVfilter_druglabel = pd.merge(cnvfilter, drug_genelist, on=['gene'], how='left')
			CNVfilter1 = CNVfilter_druglabel[CNVfilter_druglabel['drug_lable'] != 'Drug_gene']
			del CNVfilter1['drug_lable']
			if len(CNVfilter1) != 0:
				# 筛选CNV，对于组织，Ratio>=2或者Ratio<=0.3；对于血液Ratio>=4或者Ratio<=0.3
				if sampletype == 'FFPE':
					CNVfilter2 = CNVfilter1[(CNVfilter1['Ratio'] >= 2) | (CNVfilter1['Ratio'] <= 0.3)]
				elif sampletype == 'blood':
					CNVfilter2 = CNVfilter1[(CNVfilter1['Ratio'] >= 4) | (CNVfilter1['Ratio'] <= 0.3)]
				if len(CNVfilter2) != 0:
					##对cnvloss进一步过滤,筛选depth>=1000
					cnvlossdata = CNVfilter2[CNVfilter2['CNV_label'] == 'CopyNumberLoss']
					cnvloss_filter = cnvlossdata[cnvlossdata['depth'] >= 1000]
					cnvfilter_cnvloss = CNVfilter2[CNVfilter2['CNV_label'] != 'CopyNumberLoss'].append(cnvloss_filter)
					cnvfilter_cnvloss.reset_index(drop=True, inplace=True)
					#筛选在117个样本中出现频率小于50%的CNV
					if len(cnvfilter_cnvloss) == 0:
						cnvfilter_final = pd.DataFrame(columns=namelist)
						cnvfilter_final.loc[0, 'sampleid'] = sample
						cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
					else:
						cnvfilter_final0 = cnvfilter_cnvloss[cnvfilter_cnvloss['Sample_ratio'] < 0.5]
						if len(cnvfilter_final0) != 0:
							cnvfilter_final = cnvfilter_final0.drop(labels=['Sample_ratio', 'Sample_counts'], axis=1)
							cnvfilter_final.reset_index(drop=True, inplace=True)
						else:
							cnvfilter_final = pd.DataFrame(columns=namelist)
							cnvfilter_final.loc[0, 'sampleid'] = sample
							cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
				else:
					cnvfilter_final = pd.DataFrame(columns=namelist)
					cnvfilter_final.loc[0, 'sampleid'] = sample
					cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
			else:
				cnvfilter_final = pd.DataFrame(columns=namelist)
				cnvfilter_final.loc[0, 'sampleid'] = sample
				cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
		cnvfilter_output = cnvfilter_final[namelistout]
		outputfile1 = os.path.join(sample_dir, laneid + '-' + sample + '.cnv.xlsx')
		writer = pd.ExcelWriter(outputfile1)
		cnvfilter_output.to_excel(writer, sheet_name='CNV', index=False)
		writer.save()
		writer.close()
	else:
		cnvfilter_final = pd.DataFrame(columns=namelist)
		cnvfilter_final.loc[0, 'sampleid'] = sample
		cnvfilter_final.loc[0, 'CNV_label'] = 'No file'
		# make the temp dir
		temp_dir = os.path.join(inputpath, 'tempfile')
		if not os.path.exists(temp_dir):
			os.mkdir(temp_dir)
		bugfile_dir = os.path.join(temp_dir, 'bugfile')
		if not os.path.exists(bugfile_dir):
			os.mkdir(bugfile_dir)
		outputfile2 = os.path.join(bugfile_dir, laneid + '-' + sample + '.cnv.nofile.log.txt')
		cnvfilter_final.to_csv(outputfile2, index=False, header=True, encoding='gbk', sep='\t')


if __name__=='__main__':
	parser = argparse.ArgumentParser(description='filter the CNV')
	parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
	parser.add_argument('-s', '--sample', type=str, help='the sample name')
	args = parser.parse_args()
	Inputpath = args.inputpath
	Sample = args.sample
	cnv_sample_run(Inputpath,Sample)


####所有样本汇总不执行
def cnv_datasummary(inputpath,laneid):
	panel_genelistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/602gene_select.txt'
	panel_genelist = pd.read_table(panel_genelistdir, sep='\t', header=0)
	druglistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_genelist_20220829.txt'
	drug_genelist = pd.read_table(druglistdir, sep='\t', header=None, names=['gene'])
	drug_genelist['drug_lable'] = 'Drug_gene'
	namelist = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2','Ratio', 'Copynum', 'depth', 'CNV_label','Gene_label']
	namelistout = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2',  'Copynum', 'depth', 'CNV_label','Gene_label']
	datasummarydir = os.path.join(inputpath, 'datasummary')
	isExists = os.path.exists(datasummarydir)
	if not isExists:
		os.makedirs(datasummarydir)
	cnvdir = os.path.join(inputpath, '2CNV_cnvkit_pair')
	cnvsummary = pd.DataFrame()
	sampledir=os.path.join(inputpath,laneid+'_sample_infor_label.txt')
	samplelist = pd.read_table(sampledir, sep='\t')
	for i in range(len(samplelist)):
		sample = samplelist.loc[i, 'samplename']
		sampletype=samplelist.loc[i, 'sampletype']
		# make the result dir
		result_dir = os.path.join(inputpath, 'resultfile')
		if not os.path.exists(result_dir):
			os.mkdir(result_dir)
		sample_dir = os.path.join(result_dir, sample)
		if not os.path.exists(sample_dir):
			os.mkdir(sample_dir)
		cnvdatadir = cnvdir + '/' + sample + '.cnv.unfilter.txt'
		if os.path.exists(cnvdatadir):
			cnvdata = pd.read_table(cnvdatadir, sep='\t', header=0)
			cnvfilter0=cnvdata.dropna(subset=['CNV_label'])
			cnvfilter1 = cnvfilter0[cnvfilter0['CNV_label']!='']
			# extract the panel gene
			cnvfilter = pd.merge(cnvfilter1, panel_genelist, on=['gene'], how='inner')
			cnvfilter.reset_index(drop=True,inplace=True)
			del cnvfilter['druggene_label']
			if len(cnvfilter)==0:
				cnvfilter_final = pd.DataFrame(columns=namelist)
				cnvfilter_final.loc[0, 'sampleid'] = sample
				cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
			else:
				##remove the drug gene
				CNVfilter_druglabel = pd.merge(cnvfilter, drug_genelist, on=['gene'], how='left')
				CNVfilter1 = CNVfilter_druglabel[CNVfilter_druglabel['drug_lable'] != 'Drug_gene']
				del CNVfilter1['drug_lable']
				if len(CNVfilter1)!=0:
					# 筛选CNV，对于组织，Ratio>=1.5或者Ratio<=0.3；对于血液Ratio>=4或者Ratio<=0.3
					if sampletype=='FFPE':
						CNVfilter2 = CNVfilter1[(CNVfilter1['Ratio'] >= 1.5) | (CNVfilter1['Ratio'] <= 0.3)]
					elif sampletype=='blood':
						CNVfilter2 = CNVfilter1[(CNVfilter1['Ratio'] >= 4) | (CNVfilter1['Ratio'] <= 0.3)]
					if len(CNVfilter2)!=0:
						##对cnvloss进一步过滤,筛选depth>=1000
						cnvlossdata = CNVfilter2[CNVfilter2['CNV_label'] == 'CopyNumberLoss']
						cnvloss_filter = cnvlossdata[cnvlossdata['depth'] >= 1000]
						cnvfilter_final = CNVfilter2[CNVfilter2['CNV_label'] != 'CopyNumberLoss'].append(cnvloss_filter)
						cnvfilter_final.reset_index(drop=True, inplace=True)
						if len(cnvfilter_final) == 0:
							cnvfilter_final = pd.DataFrame(columns=namelist)
							cnvfilter_final.loc[0, 'sampleid'] = sample
							cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
					else:
						cnvfilter_final = pd.DataFrame(columns=namelist)
						cnvfilter_final.loc[0, 'sampleid'] = sample
						cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
				else:
					cnvfilter_final = pd.DataFrame(columns=namelist)
					cnvfilter_final.loc[0, 'sampleid'] = sample
					cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
			cnvfilter_output=cnvfilter_final[namelistout]
			outputfile1 = os.path.join(sample_dir, laneid + '-' + sample + '.cnv.xlsx')
			writer = pd.ExcelWriter(outputfile1)
			cnvfilter_output.to_excel(writer, sheet_name='CNV', index=False)
			writer.save()
			writer.close()
		else:
			cnvfilter_final=pd.DataFrame(columns=namelist)
			cnvfilter_final.loc[0,'sampleid']=sample
			cnvfilter_final.loc[0,'CNV_label']='No file'
		cnvsummary = cnvsummary.append(cnvfilter_final)
	outputname = datasummarydir + '/' + laneid + '_table4_cnv_datasummary.txt'
	cnvsummary.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')