chaowen.xi
/
602panel_test


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301
							import pandas as pd
import os,os.path
import numpy as np
import math
import xlrd
import argparse
###选择ratio大于1.5的为amplication,小于0.6为缺失
CNVamplicon=1.5
CNVloss=0.6
def cnv_result_sample_raw(inputpath,sampleid,tumor):
	druggene_dir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_gene_infor20220525.txt'
	drugene_infor = pd.read_table(druggene_dir, sep='\t', header=0)
	drugene_infor.rename(columns={'chr': 'chromosome', 'start': 'exon_start', 'end': 'exon_end'}, inplace=True)
	druggene = pd.DataFrame(drugene_infor['gene'].drop_duplicates(keep='first'))
	druggene.reset_index(drop=True, inplace=True)
	# 获得基因标签数据
	genetype = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/oncokb_cancerGeneList_20220517_final.txt'
	genetypedata = pd.read_table(genetype, sep='\t', header=0)
	genetypedata.rename(columns={'label': 'gene_label'}, inplace=True)
	###读入数据
	cnvinputpath = os.path.join(inputpath, '2CNV_cnvkit_pair')
	cnvdir = os.path.join(cnvinputpath, sampleid + '_cnvkit')
	# extrac the unseg cns
	#####提取unseg cns信息
	unseg_genelist = pd.read_table(cnvdir + '/' + tumor + '_unseggene.list', sep='\t', names=['gene'])
	cns_all = pd.read_table(cnvdir + '/' + tumor + '_clean.cns', sep='\t', header=0)
	cns_all.rename(columns={'chromosome': 'chr'}, inplace=True)
	unseg_cns_all = pd.DataFrame()
	for i in range(len(unseg_genelist)):
		gene = unseg_genelist.loc[i, 'gene']
		seg_genecns = cns_all[cns_all['gene'].str.contains(gene)]
		seg_genecns.reset_index(drop=True, inplace=True)
		seg_genecns.loc[0, 'gene'] = gene
		unseg_cns_all = unseg_cns_all.append(seg_genecns)
	unseg_cns_all.reset_index(drop=True, inplace=True)
	unseg_cns_all['Ratio'] = round(unseg_cns_all['log2'].apply(np.exp2), 2)
	####读取segment的基因信息
	seg_cns_all = pd.DataFrame()
	seg_genelist = pd.read_table(cnvdir + '/' + tumor + '_seggene.list', names=['gene'])
	for i in range(len(seg_genelist)):
		genename = seg_genelist.loc[i, 'gene']
		segcnrdir = cnvdir + '/' + tumor + '_' + genename + '_gene.seg.cns'
		try:
			seg_cns = pd.read_table(segcnrdir, sep='\t', header=0)
			seg_cns.rename(columns={'chromosome': 'chr'}, inplace=True)
			seg_cns_all = seg_cns.append(seg_cns_all)
		except:
			print('there is no gene')
			continue
	seg_cns_all = seg_cns_all[seg_cns_all['gene'] != '-']
	seg_cns_all.reset_index(drop=True, inplace=True)
	seg_cns_all['Ratio'] = round(seg_cns_all['log2'].apply(np.exp2), 2)
	seg_cns_all['Copynum']=round(seg_cns_all['Ratio']*2,2)
	###将segementh与unsegment进行合并
	all_cns = unseg_cns_all.append(seg_cns_all)
	all_cns.reset_index(drop=True, inplace=True)
	# 对拷贝数，筛选大于3的为扩增，小于1为缺失
	all_cns_filter = all_cns[(all_cns['Ratio'] > CNVamplicon) | (all_cns['Ratio'] < CNVloss)]
	all_cns_filter.reset_index(drop=True, inplace=True)
	##添加标签
	titlelist = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2', 'Ratio','Copynum', 'depth', 'CNV_label', 'Gene_label']
	if len(all_cns_filter) != 0:
		for m in range(len(all_cns_filter)):
			if all_cns_filter.loc[m, 'Ratio'] > CNVamplicon:
				all_cns_filter.loc[m, 'CNV_label'] = 'Amplification'
			elif all_cns_filter.loc[m, 'Ratio'] < CNVloss:
				all_cns_filter.loc[m, 'CNV_label'] = 'CopyNumberLoss'
		# select the CNV of drug related gene
		cnv_druggene = pd.merge(all_cns_filter, druggene, on=['gene'], how='inner')
		if len(cnv_druggene) != 0:
			# add the gene label oncogene or tumor suppersor
			drug_CNV_filter_genelabel = pd.merge(cnv_druggene, genetypedata, on=['gene'], how='left')
			drug_CNV_filter_genelabel.insert(0, 'sampleid', sampleid)
			drug_CNV_filter_genelabel.rename(columns={'gene_label':'Gene_label'},inplace=True)
			cnv_result = drug_CNV_filter_genelabel[titlelist]
		else:
			cnv_result = pd.DataFrame(columns=titlelist)
			cnv_result.loc[0, 'sampleid'] = sampleid
			cnv_result.loc[0, 'CNV_label'] = 'No result'
	else:
		cnv_result = pd.DataFrame(columns=titlelist)
		cnv_result.loc[0, 'sampleid'] = sampleid
		cnv_result.loc[0, 'CNV_label'] = 'No result'
	outputfile1 = os.path.join(cnvinputpath, sampleid + '.cnv.txt')
	cnv_result.to_csv(outputfile1, index=False, header=True, encoding='gbk', sep='\t')

###修改，当一个基因出现多个拷贝数结果式，需要重新进行分析，
#gene=EGFR
#cnvdir=${inputpath}/2CNV_cnvkit_pair/${sample}_cnvkit
#cnvkit.py segment ${cnvdir}/${tumor}_${gene}_gene.cnr -o ${cnvdir}/${tumor}_${gene}_gene.seg.cns -m none --smooth-cbs
def cnv_gene_cns(inputpath,sampleid,tumor,gene):
	cnvdir = os.path.join(inputpath, '2CNV_cnvkit_pair')
	cnvdir_sample = os.path.join(cnvdir, sampleid + '_cnvkit')
	sample_gene_cnrinput = os.path.join(cnvdir_sample, tumor + '_' + gene + '_gene.cnr')
	# 将原始的基因cns文件拷贝
	rawgenecns = os.path.join(cnvdir_sample, tumor + '_' + gene + '_gene.seg.cns')
	rawgenecns_cpname = os.path.join(cnvdir_sample, tumor + '_' + gene + '_gene.seg.multiexon.cns')
	cpfile = 'cp -r ' + rawgenecns + ' ' + rawgenecns_cpname
	os.system(cpfile)
	#进行新的基因cns计算，代码会覆盖原来的结果
	newcns_outputdir = os.path.join(cnvdir_sample, tumor + '_' + gene + '_gene.seg.cns')
	cnvsegcmd = 'cnvkit.py segment ' + sample_gene_cnrinput + ' -o ' + newcns_outputdir  + '  -m none --smooth-cbs'
	os.system(cnvsegcmd)
	#为了方便核对以后的结果，我们也需要将新的进行拷贝
	newgenecns_cpname=os.path.join(cnvdir_sample, tumor + '_' + gene + '_gene.seg.unique.cns')
	cpfilenew = 'cp -r ' + newcns_outputdir + ' ' + newgenecns_cpname
	os.system(cpfilenew)

def gene_cnv_sample(inputpath,sampleid,tumor):
	###读入数据
	cnvinputpath = os.path.join(inputpath, '2CNV_cnvkit_pair')
	cnvdir = os.path.join(cnvinputpath, sampleid + '_cnvkit')
	####读取segment的基因信息
	seg_genelist = pd.read_table(cnvdir + '/' + tumor + '_seggene.list', names=['gene'])
	for i in range(len(seg_genelist)):
		genename = seg_genelist.loc[i, 'gene']
		segcnrdir = cnvdir + '/' + tumor + '_' + genename + '_gene.seg.cns'
		if os.path.exists(segcnrdir):
			count = len(open(segcnrdir, 'r').readlines())
			# 判断文件有几行，如果大于2行(2行包括了title)表示文件一个基因有多个exon扩增，那么需要重新执行cnv
			if count > 2:
				print(genename)
				cnv_gene_cns(inputpath, sampleid, tumor, genename)


def cnv_result_sample(inputpath,sampleid,tumor):
	druggene_dir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_gene_infor20220525.txt'
	drugene_infor = pd.read_table(druggene_dir, sep='\t', header=0)
	drugene_infor.rename(columns={'chr': 'chromosome', 'start': 'exon_start', 'end': 'exon_end'}, inplace=True)
	druggene = pd.DataFrame(drugene_infor['gene'].drop_duplicates(keep='first'))
	druggene.reset_index(drop=True, inplace=True)
	# 获得基因标签数据
	genetype = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/oncokb_cancerGeneList_20220517_final.txt'
	genetypedata = pd.read_table(genetype, sep='\t', header=0)
	genetypedata.rename(columns={'label': 'gene_label'}, inplace=True)
	###读入数据
	cnvinputpath = os.path.join(inputpath, '2CNV_cnvkit_pair')
	cnvdir = os.path.join(cnvinputpath, sampleid + '_cnvkit')
	# extrac the unseg cns
	#####提取unseg cns信息
	unseg_genelist = pd.read_table(cnvdir + '/' + tumor + '_unseggene.list', sep='\t', names=['gene'])
	cns_all = pd.read_table(cnvdir + '/' + tumor + '_clean.cns', sep='\t', header=0)
	cns_all.rename(columns={'chromosome': 'chr'}, inplace=True)
	unseg_cns_all = pd.DataFrame()
	for i in range(len(unseg_genelist)):
		gene = unseg_genelist.loc[i, 'gene']
		seg_genecns = cns_all[cns_all['gene'].str.contains(gene)]
		seg_genecns.reset_index(drop=True, inplace=True)
		seg_genecns.loc[0, 'gene'] = gene
		unseg_cns_all = unseg_cns_all.append(seg_genecns)
	unseg_cns_all.reset_index(drop=True, inplace=True)
	unseg_cns_all['Ratio'] = round(unseg_cns_all['log2'].apply(np.exp2), 2)
	####读取segment的基因信息
	seg_cns_all = pd.DataFrame()
	seg_genelist = pd.read_table(cnvdir + '/' + tumor + '_seggene.list', names=['gene'])
	for i in range(len(seg_genelist)):
		genename = seg_genelist.loc[i, 'gene']
		segcnrdir = cnvdir + '/' + tumor + '_' + genename + '_gene.seg.cns'
		if os.path.exists(segcnrdir):
			seg_cns = pd.read_table(segcnrdir, sep='\t', header=0)
			seg_cns.rename(columns={'chromosome': 'chr'}, inplace=True)
			seg_cns_all = seg_cns.append(seg_cns_all)
		else:
			print('there is no gene')
			continue
	seg_cns_all = seg_cns_all[seg_cns_all['gene'] != '-']
	seg_cns_all.reset_index(drop=True, inplace=True)
	seg_cns_all['Ratio'] = round(seg_cns_all['log2'].apply(np.exp2), 2)
	seg_cns_all['Copynum']=round(seg_cns_all['Ratio']*2,2)
	###将segementh与unsegment进行合并
	all_cns = unseg_cns_all.append(seg_cns_all)
	all_cns.reset_index(drop=True, inplace=True)
	# 对拷贝数，筛选大于3的为扩增，小于1为缺失
	all_cns_filter = all_cns[(all_cns['Ratio'] > CNVamplicon) | (all_cns['Ratio'] < CNVloss)]
	all_cns_filter.reset_index(drop=True, inplace=True)
	##添加标签
	titlelist = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2', 'Ratio','Copynum', 'depth', 'CNV_label', 'Gene_label']
	if len(all_cns_filter) != 0:
		for m in range(len(all_cns_filter)):
			if all_cns_filter.loc[m, 'Ratio'] > CNVamplicon:
				all_cns_filter.loc[m, 'CNV_label'] = 'Amplification'
			elif all_cns_filter.loc[m, 'Ratio'] < CNVloss:
				all_cns_filter.loc[m, 'CNV_label'] = 'CopyNumberLoss'
		# select the CNV of drug related gene
		cnv_druggene = pd.merge(all_cns_filter, druggene, on=['gene'], how='inner')
		if len(cnv_druggene) != 0:
			# add the gene label oncogene or tumor suppersor
			drug_CNV_filter_genelabel = pd.merge(cnv_druggene, genetypedata, on=['gene'], how='left')
			drug_CNV_filter_genelabel.insert(0, 'sampleid', sampleid)
			drug_CNV_filter_genelabel.rename(columns={'gene_label':'Gene_label'},inplace=True)
			cnv_result = drug_CNV_filter_genelabel[titlelist]
		else:
			cnv_result = pd.DataFrame(columns=titlelist)
			cnv_result.loc[0, 'sampleid'] = sampleid
			cnv_result.loc[0, 'CNV_label'] = 'No result'
	else:
		cnv_result = pd.DataFrame(columns=titlelist)
		cnv_result.loc[0, 'sampleid'] = sampleid
		cnv_result.loc[0, 'CNV_label'] = 'No result'
	outputfile1 = os.path.join(cnvinputpath, sampleid + '.cnv.txt')
	cnv_result.to_csv(outputfile1, index=False, header=True, encoding='gbk', sep='\t')


###输出unfilte的结果

def cnv_result_sample_unfilter(inputpath,sampleid,tumor):
	druggene_dir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_gene_infor20220525.txt'
	drugene_infor = pd.read_table(druggene_dir, sep='\t', header=0)
	drugene_infor.rename(columns={'chr': 'chromosome', 'start': 'exon_start', 'end': 'exon_end'}, inplace=True)
	druggene = pd.DataFrame(drugene_infor['gene'].drop_duplicates(keep='first'))
	druggene['druggene_label']='Drug_gene'
	druggene.reset_index(drop=True, inplace=True)
	# 获得基因标签数据
	genetype = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/oncokb_cancerGeneList_20220517_final.txt'
	genetypedata = pd.read_table(genetype, sep='\t', header=0)
	genetypedata.rename(columns={'label': 'gene_label'}, inplace=True)
	###读入数据
	cnvinputpath = os.path.join(inputpath, '2CNV_cnvkit_pair')
	cnvdir = os.path.join(cnvinputpath, sampleid + '_cnvkit')
	# extrac the unseg cns
	#####提取unseg cns信息
	unseg_genelist = pd.read_table(cnvdir + '/' + tumor + '_unseggene.list', sep='\t', names=['gene'])
	cns_all = pd.read_table(cnvdir + '/' + tumor + '_clean.cns', sep='\t', header=0)
	cns_all.rename(columns={'chromosome': 'chr'}, inplace=True)
	unseg_cns_all = pd.DataFrame()
	for i in range(len(unseg_genelist)):
		gene = unseg_genelist.loc[i, 'gene']
		seg_genecns = cns_all[cns_all['gene'].str.contains(gene)]
		seg_genecns.reset_index(drop=True, inplace=True)
		seg_genecns.loc[0, 'gene'] = gene
		unseg_cns_all = unseg_cns_all.append(seg_genecns)
	unseg_cns_all.reset_index(drop=True, inplace=True)
	unseg_cns_all['Ratio'] = round(unseg_cns_all['log2'].apply(np.exp2), 2)
	unseg_cns_all['Copynum'] = round(unseg_cns_all['Ratio'] * 2, 2)
	####读取segment的基因信息
	seg_cns_all = pd.DataFrame()
	seg_genelist = pd.read_table(cnvdir + '/' + tumor + '_seggene.list', names=['gene'])
	for i in range(len(seg_genelist)):
		genename = seg_genelist.loc[i, 'gene']
		segcnrdir = cnvdir + '/' + tumor + '_' + genename + '_gene.seg.cns'
		try:
			seg_cns = pd.read_table(segcnrdir, sep='\t', header=0)
			seg_cns.rename(columns={'chromosome': 'chr'}, inplace=True)
			seg_cns_all = seg_cns.append(seg_cns_all)
		except:
			print('there is no gene')
			continue
	seg_cns_all = seg_cns_all[seg_cns_all['gene'] != '-']
	seg_cns_all.reset_index(drop=True, inplace=True)
	seg_cns_all['Ratio'] = round(seg_cns_all['log2'].apply(np.exp2), 2)
	seg_cns_all['Copynum'] = round(seg_cns_all['Ratio'] * 2, 2)
	###将segementh与unsegment进行合并
	all_cns = unseg_cns_all.append(seg_cns_all)
	all_cns.reset_index(drop=True, inplace=True)
	##添加标签
	titlelist = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2', 'Ratio','Copynum', 'depth', 'CNV_label','druggene_label', 'Gene_label']
	if len(all_cns) != 0:
		for m in range(len(all_cns)):
			if all_cns.loc[m, 'Ratio'] > CNVamplicon:
				all_cns.loc[m, 'CNV_label'] = 'Amplification'
			elif all_cns.loc[m, 'Ratio'] < CNVloss:
				all_cns.loc[m, 'CNV_label'] = 'CopyNumberLoss'
			else:
				all_cns.loc[m, 'CNV_label'] =''
		# select the CNV of drug related gene
		cnv_druggene = pd.merge(all_cns, druggene, on=['gene'], how='left')
		if len(cnv_druggene) != 0:
			# add the gene label oncogene or tumor suppersor
			drug_CNV_filter_genelabel = pd.merge(cnv_druggene, genetypedata, on=['gene'], how='left')
			drug_CNV_filter_genelabel.insert(0, 'sampleid', sampleid)
			drug_CNV_filter_genelabel.rename(columns={'gene_label':'Gene_label'},inplace=True)
			cnv_result = drug_CNV_filter_genelabel[titlelist]
		else:
			cnv_result = pd.DataFrame(columns=titlelist)
			cnv_result.loc[0, 'sampleid'] = sampleid
			cnv_result.loc[0, 'CNV_label'] = 'No result'
	else:
		cnv_result = pd.DataFrame(columns=titlelist)
		cnv_result.loc[0, 'sampleid'] = sampleid
		cnv_result.loc[0, 'CNV_label'] = 'No result'
	outputfile1 = os.path.join(cnvinputpath, sampleid + '.cnv.unfilter.txt')
	cnv_result.to_csv(outputfile1, index=False, header=True, encoding='gbk', sep='\t')


if __name__=='__main__':
	parser = argparse.ArgumentParser(description='cnv data for cnvdata')
	parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
	parser.add_argument('-s', '--sampleid', type=str, help='sampleid')
	parser.add_argument('-t', '--tumor', type=str, help='tumorid')
	args = parser.parse_args()
	Inputpath = args.inputpath
	Sampleid = args.sampleid
	Tumor = args.tumor
	gene_cnv_sample(Inputpath,Sampleid,Tumor)
	cnv_result_sample(Inputpath,Sampleid,Tumor)
	cnv_result_sample_unfilter(Inputpath,Sampleid,Tumor)