123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301 |
- import pandas as pd
- import os,os.path
- import numpy as np
- import math
- import xlrd
- import argparse
- ###选择ratio大于1.5的为amplication,小于0.6为缺失
- CNVamplicon=1.5
- CNVloss=0.6
- def cnv_result_sample_raw(inputpath,sampleid,tumor):
- druggene_dir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_gene_infor20220525.txt'
- drugene_infor = pd.read_table(druggene_dir, sep='\t', header=0)
- drugene_infor.rename(columns={'chr': 'chromosome', 'start': 'exon_start', 'end': 'exon_end'}, inplace=True)
- druggene = pd.DataFrame(drugene_infor['gene'].drop_duplicates(keep='first'))
- druggene.reset_index(drop=True, inplace=True)
- # 获得基因标签数据
- genetype = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/oncokb_cancerGeneList_20220517_final.txt'
- genetypedata = pd.read_table(genetype, sep='\t', header=0)
- genetypedata.rename(columns={'label': 'gene_label'}, inplace=True)
- ###读入数据
- cnvinputpath = os.path.join(inputpath, '2CNV_cnvkit_pair')
- cnvdir = os.path.join(cnvinputpath, sampleid + '_cnvkit')
- # extrac the unseg cns
- #####提取unseg cns信息
- unseg_genelist = pd.read_table(cnvdir + '/' + tumor + '_unseggene.list', sep='\t', names=['gene'])
- cns_all = pd.read_table(cnvdir + '/' + tumor + '_clean.cns', sep='\t', header=0)
- cns_all.rename(columns={'chromosome': 'chr'}, inplace=True)
- unseg_cns_all = pd.DataFrame()
- for i in range(len(unseg_genelist)):
- gene = unseg_genelist.loc[i, 'gene']
- seg_genecns = cns_all[cns_all['gene'].str.contains(gene)]
- seg_genecns.reset_index(drop=True, inplace=True)
- seg_genecns.loc[0, 'gene'] = gene
- unseg_cns_all = unseg_cns_all.append(seg_genecns)
- unseg_cns_all.reset_index(drop=True, inplace=True)
- unseg_cns_all['Ratio'] = round(unseg_cns_all['log2'].apply(np.exp2), 2)
- ####读取segment的基因信息
- seg_cns_all = pd.DataFrame()
- seg_genelist = pd.read_table(cnvdir + '/' + tumor + '_seggene.list', names=['gene'])
- for i in range(len(seg_genelist)):
- genename = seg_genelist.loc[i, 'gene']
- segcnrdir = cnvdir + '/' + tumor + '_' + genename + '_gene.seg.cns'
- try:
- seg_cns = pd.read_table(segcnrdir, sep='\t', header=0)
- seg_cns.rename(columns={'chromosome': 'chr'}, inplace=True)
- seg_cns_all = seg_cns.append(seg_cns_all)
- except:
- print('there is no gene')
- continue
- seg_cns_all = seg_cns_all[seg_cns_all['gene'] != '-']
- seg_cns_all.reset_index(drop=True, inplace=True)
- seg_cns_all['Ratio'] = round(seg_cns_all['log2'].apply(np.exp2), 2)
- seg_cns_all['Copynum']=round(seg_cns_all['Ratio']*2,2)
- ###将segementh与unsegment进行合并
- all_cns = unseg_cns_all.append(seg_cns_all)
- all_cns.reset_index(drop=True, inplace=True)
- # 对拷贝数,筛选大于3的为扩增,小于1为缺失
- all_cns_filter = all_cns[(all_cns['Ratio'] > CNVamplicon) | (all_cns['Ratio'] < CNVloss)]
- all_cns_filter.reset_index(drop=True, inplace=True)
- ##添加标签
- titlelist = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2', 'Ratio','Copynum', 'depth', 'CNV_label', 'Gene_label']
- if len(all_cns_filter) != 0:
- for m in range(len(all_cns_filter)):
- if all_cns_filter.loc[m, 'Ratio'] > CNVamplicon:
- all_cns_filter.loc[m, 'CNV_label'] = 'Amplification'
- elif all_cns_filter.loc[m, 'Ratio'] < CNVloss:
- all_cns_filter.loc[m, 'CNV_label'] = 'CopyNumberLoss'
- # select the CNV of drug related gene
- cnv_druggene = pd.merge(all_cns_filter, druggene, on=['gene'], how='inner')
- if len(cnv_druggene) != 0:
- # add the gene label oncogene or tumor suppersor
- drug_CNV_filter_genelabel = pd.merge(cnv_druggene, genetypedata, on=['gene'], how='left')
- drug_CNV_filter_genelabel.insert(0, 'sampleid', sampleid)
- drug_CNV_filter_genelabel.rename(columns={'gene_label':'Gene_label'},inplace=True)
- cnv_result = drug_CNV_filter_genelabel[titlelist]
- else:
- cnv_result = pd.DataFrame(columns=titlelist)
- cnv_result.loc[0, 'sampleid'] = sampleid
- cnv_result.loc[0, 'CNV_label'] = 'No result'
- else:
- cnv_result = pd.DataFrame(columns=titlelist)
- cnv_result.loc[0, 'sampleid'] = sampleid
- cnv_result.loc[0, 'CNV_label'] = 'No result'
- outputfile1 = os.path.join(cnvinputpath, sampleid + '.cnv.txt')
- cnv_result.to_csv(outputfile1, index=False, header=True, encoding='gbk', sep='\t')
- ###修改,当一个基因出现多个拷贝数结果式,需要重新进行分析,
- #gene=EGFR
- #cnvdir=${inputpath}/2CNV_cnvkit_pair/${sample}_cnvkit
- #cnvkit.py segment ${cnvdir}/${tumor}_${gene}_gene.cnr -o ${cnvdir}/${tumor}_${gene}_gene.seg.cns -m none --smooth-cbs
- def cnv_gene_cns(inputpath,sampleid,tumor,gene):
- cnvdir = os.path.join(inputpath, '2CNV_cnvkit_pair')
- cnvdir_sample = os.path.join(cnvdir, sampleid + '_cnvkit')
- sample_gene_cnrinput = os.path.join(cnvdir_sample, tumor + '_' + gene + '_gene.cnr')
- # 将原始的基因cns文件拷贝
- rawgenecns = os.path.join(cnvdir_sample, tumor + '_' + gene + '_gene.seg.cns')
- rawgenecns_cpname = os.path.join(cnvdir_sample, tumor + '_' + gene + '_gene.seg.multiexon.cns')
- cpfile = 'cp -r ' + rawgenecns + ' ' + rawgenecns_cpname
- os.system(cpfile)
- #进行新的基因cns计算,代码会覆盖原来的结果
- newcns_outputdir = os.path.join(cnvdir_sample, tumor + '_' + gene + '_gene.seg.cns')
- cnvsegcmd = 'cnvkit.py segment ' + sample_gene_cnrinput + ' -o ' + newcns_outputdir + ' -m none --smooth-cbs'
- os.system(cnvsegcmd)
- #为了方便核对以后的结果,我们也需要将新的进行拷贝
- newgenecns_cpname=os.path.join(cnvdir_sample, tumor + '_' + gene + '_gene.seg.unique.cns')
- cpfilenew = 'cp -r ' + newcns_outputdir + ' ' + newgenecns_cpname
- os.system(cpfilenew)
- def gene_cnv_sample(inputpath,sampleid,tumor):
- ###读入数据
- cnvinputpath = os.path.join(inputpath, '2CNV_cnvkit_pair')
- cnvdir = os.path.join(cnvinputpath, sampleid + '_cnvkit')
- ####读取segment的基因信息
- seg_genelist = pd.read_table(cnvdir + '/' + tumor + '_seggene.list', names=['gene'])
- for i in range(len(seg_genelist)):
- genename = seg_genelist.loc[i, 'gene']
- segcnrdir = cnvdir + '/' + tumor + '_' + genename + '_gene.seg.cns'
- if os.path.exists(segcnrdir):
- count = len(open(segcnrdir, 'r').readlines())
- # 判断文件有几行,如果大于2行(2行包括了title)表示文件一个基因有多个exon扩增,那么需要重新执行cnv
- if count > 2:
- print(genename)
- cnv_gene_cns(inputpath, sampleid, tumor, genename)
- def cnv_result_sample(inputpath,sampleid,tumor):
- druggene_dir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_gene_infor20220525.txt'
- drugene_infor = pd.read_table(druggene_dir, sep='\t', header=0)
- drugene_infor.rename(columns={'chr': 'chromosome', 'start': 'exon_start', 'end': 'exon_end'}, inplace=True)
- druggene = pd.DataFrame(drugene_infor['gene'].drop_duplicates(keep='first'))
- druggene.reset_index(drop=True, inplace=True)
- # 获得基因标签数据
- genetype = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/oncokb_cancerGeneList_20220517_final.txt'
- genetypedata = pd.read_table(genetype, sep='\t', header=0)
- genetypedata.rename(columns={'label': 'gene_label'}, inplace=True)
- ###读入数据
- cnvinputpath = os.path.join(inputpath, '2CNV_cnvkit_pair')
- cnvdir = os.path.join(cnvinputpath, sampleid + '_cnvkit')
- # extrac the unseg cns
- #####提取unseg cns信息
- unseg_genelist = pd.read_table(cnvdir + '/' + tumor + '_unseggene.list', sep='\t', names=['gene'])
- cns_all = pd.read_table(cnvdir + '/' + tumor + '_clean.cns', sep='\t', header=0)
- cns_all.rename(columns={'chromosome': 'chr'}, inplace=True)
- unseg_cns_all = pd.DataFrame()
- for i in range(len(unseg_genelist)):
- gene = unseg_genelist.loc[i, 'gene']
- seg_genecns = cns_all[cns_all['gene'].str.contains(gene)]
- seg_genecns.reset_index(drop=True, inplace=True)
- seg_genecns.loc[0, 'gene'] = gene
- unseg_cns_all = unseg_cns_all.append(seg_genecns)
- unseg_cns_all.reset_index(drop=True, inplace=True)
- unseg_cns_all['Ratio'] = round(unseg_cns_all['log2'].apply(np.exp2), 2)
- ####读取segment的基因信息
- seg_cns_all = pd.DataFrame()
- seg_genelist = pd.read_table(cnvdir + '/' + tumor + '_seggene.list', names=['gene'])
- for i in range(len(seg_genelist)):
- genename = seg_genelist.loc[i, 'gene']
- segcnrdir = cnvdir + '/' + tumor + '_' + genename + '_gene.seg.cns'
- if os.path.exists(segcnrdir):
- seg_cns = pd.read_table(segcnrdir, sep='\t', header=0)
- seg_cns.rename(columns={'chromosome': 'chr'}, inplace=True)
- seg_cns_all = seg_cns.append(seg_cns_all)
- else:
- print('there is no gene')
- continue
- seg_cns_all = seg_cns_all[seg_cns_all['gene'] != '-']
- seg_cns_all.reset_index(drop=True, inplace=True)
- seg_cns_all['Ratio'] = round(seg_cns_all['log2'].apply(np.exp2), 2)
- seg_cns_all['Copynum']=round(seg_cns_all['Ratio']*2,2)
- ###将segementh与unsegment进行合并
- all_cns = unseg_cns_all.append(seg_cns_all)
- all_cns.reset_index(drop=True, inplace=True)
- # 对拷贝数,筛选大于3的为扩增,小于1为缺失
- all_cns_filter = all_cns[(all_cns['Ratio'] > CNVamplicon) | (all_cns['Ratio'] < CNVloss)]
- all_cns_filter.reset_index(drop=True, inplace=True)
- ##添加标签
- titlelist = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2', 'Ratio','Copynum', 'depth', 'CNV_label', 'Gene_label']
- if len(all_cns_filter) != 0:
- for m in range(len(all_cns_filter)):
- if all_cns_filter.loc[m, 'Ratio'] > CNVamplicon:
- all_cns_filter.loc[m, 'CNV_label'] = 'Amplification'
- elif all_cns_filter.loc[m, 'Ratio'] < CNVloss:
- all_cns_filter.loc[m, 'CNV_label'] = 'CopyNumberLoss'
- # select the CNV of drug related gene
- cnv_druggene = pd.merge(all_cns_filter, druggene, on=['gene'], how='inner')
- if len(cnv_druggene) != 0:
- # add the gene label oncogene or tumor suppersor
- drug_CNV_filter_genelabel = pd.merge(cnv_druggene, genetypedata, on=['gene'], how='left')
- drug_CNV_filter_genelabel.insert(0, 'sampleid', sampleid)
- drug_CNV_filter_genelabel.rename(columns={'gene_label':'Gene_label'},inplace=True)
- cnv_result = drug_CNV_filter_genelabel[titlelist]
- else:
- cnv_result = pd.DataFrame(columns=titlelist)
- cnv_result.loc[0, 'sampleid'] = sampleid
- cnv_result.loc[0, 'CNV_label'] = 'No result'
- else:
- cnv_result = pd.DataFrame(columns=titlelist)
- cnv_result.loc[0, 'sampleid'] = sampleid
- cnv_result.loc[0, 'CNV_label'] = 'No result'
- outputfile1 = os.path.join(cnvinputpath, sampleid + '.cnv.txt')
- cnv_result.to_csv(outputfile1, index=False, header=True, encoding='gbk', sep='\t')
- ###输出unfilte的结果
- def cnv_result_sample_unfilter(inputpath,sampleid,tumor):
- druggene_dir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_gene_infor20220525.txt'
- drugene_infor = pd.read_table(druggene_dir, sep='\t', header=0)
- drugene_infor.rename(columns={'chr': 'chromosome', 'start': 'exon_start', 'end': 'exon_end'}, inplace=True)
- druggene = pd.DataFrame(drugene_infor['gene'].drop_duplicates(keep='first'))
- druggene['druggene_label']='Drug_gene'
- druggene.reset_index(drop=True, inplace=True)
- # 获得基因标签数据
- genetype = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/oncokb_cancerGeneList_20220517_final.txt'
- genetypedata = pd.read_table(genetype, sep='\t', header=0)
- genetypedata.rename(columns={'label': 'gene_label'}, inplace=True)
- ###读入数据
- cnvinputpath = os.path.join(inputpath, '2CNV_cnvkit_pair')
- cnvdir = os.path.join(cnvinputpath, sampleid + '_cnvkit')
- # extrac the unseg cns
- #####提取unseg cns信息
- unseg_genelist = pd.read_table(cnvdir + '/' + tumor + '_unseggene.list', sep='\t', names=['gene'])
- cns_all = pd.read_table(cnvdir + '/' + tumor + '_clean.cns', sep='\t', header=0)
- cns_all.rename(columns={'chromosome': 'chr'}, inplace=True)
- unseg_cns_all = pd.DataFrame()
- for i in range(len(unseg_genelist)):
- gene = unseg_genelist.loc[i, 'gene']
- seg_genecns = cns_all[cns_all['gene'].str.contains(gene)]
- seg_genecns.reset_index(drop=True, inplace=True)
- seg_genecns.loc[0, 'gene'] = gene
- unseg_cns_all = unseg_cns_all.append(seg_genecns)
- unseg_cns_all.reset_index(drop=True, inplace=True)
- unseg_cns_all['Ratio'] = round(unseg_cns_all['log2'].apply(np.exp2), 2)
- unseg_cns_all['Copynum'] = round(unseg_cns_all['Ratio'] * 2, 2)
- ####读取segment的基因信息
- seg_cns_all = pd.DataFrame()
- seg_genelist = pd.read_table(cnvdir + '/' + tumor + '_seggene.list', names=['gene'])
- for i in range(len(seg_genelist)):
- genename = seg_genelist.loc[i, 'gene']
- segcnrdir = cnvdir + '/' + tumor + '_' + genename + '_gene.seg.cns'
- try:
- seg_cns = pd.read_table(segcnrdir, sep='\t', header=0)
- seg_cns.rename(columns={'chromosome': 'chr'}, inplace=True)
- seg_cns_all = seg_cns.append(seg_cns_all)
- except:
- print('there is no gene')
- continue
- seg_cns_all = seg_cns_all[seg_cns_all['gene'] != '-']
- seg_cns_all.reset_index(drop=True, inplace=True)
- seg_cns_all['Ratio'] = round(seg_cns_all['log2'].apply(np.exp2), 2)
- seg_cns_all['Copynum'] = round(seg_cns_all['Ratio'] * 2, 2)
- ###将segementh与unsegment进行合并
- all_cns = unseg_cns_all.append(seg_cns_all)
- all_cns.reset_index(drop=True, inplace=True)
- ##添加标签
- titlelist = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2', 'Ratio','Copynum', 'depth', 'CNV_label','druggene_label', 'Gene_label']
- if len(all_cns) != 0:
- for m in range(len(all_cns)):
- if all_cns.loc[m, 'Ratio'] > CNVamplicon:
- all_cns.loc[m, 'CNV_label'] = 'Amplification'
- elif all_cns.loc[m, 'Ratio'] < CNVloss:
- all_cns.loc[m, 'CNV_label'] = 'CopyNumberLoss'
- else:
- all_cns.loc[m, 'CNV_label'] =''
- # select the CNV of drug related gene
- cnv_druggene = pd.merge(all_cns, druggene, on=['gene'], how='left')
- if len(cnv_druggene) != 0:
- # add the gene label oncogene or tumor suppersor
- drug_CNV_filter_genelabel = pd.merge(cnv_druggene, genetypedata, on=['gene'], how='left')
- drug_CNV_filter_genelabel.insert(0, 'sampleid', sampleid)
- drug_CNV_filter_genelabel.rename(columns={'gene_label':'Gene_label'},inplace=True)
- cnv_result = drug_CNV_filter_genelabel[titlelist]
- else:
- cnv_result = pd.DataFrame(columns=titlelist)
- cnv_result.loc[0, 'sampleid'] = sampleid
- cnv_result.loc[0, 'CNV_label'] = 'No result'
- else:
- cnv_result = pd.DataFrame(columns=titlelist)
- cnv_result.loc[0, 'sampleid'] = sampleid
- cnv_result.loc[0, 'CNV_label'] = 'No result'
- outputfile1 = os.path.join(cnvinputpath, sampleid + '.cnv.unfilter.txt')
- cnv_result.to_csv(outputfile1, index=False, header=True, encoding='gbk', sep='\t')
- if __name__=='__main__':
- parser = argparse.ArgumentParser(description='cnv data for cnvdata')
- parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
- parser.add_argument('-s', '--sampleid', type=str, help='sampleid')
- parser.add_argument('-t', '--tumor', type=str, help='tumorid')
- args = parser.parse_args()
- Inputpath = args.inputpath
- Sampleid = args.sampleid
- Tumor = args.tumor
- gene_cnv_sample(Inputpath,Sampleid,Tumor)
- cnv_result_sample(Inputpath,Sampleid,Tumor)
- cnv_result_sample_unfilter(Inputpath,Sampleid,Tumor)
|