import pandas as pd import os,os.path import numpy as np import math import xlrd import argparse ###选择ratio大于1.5的为amplication,小于0.6为缺失 CNVamplicon=1.5 CNVloss=0.6 def cnv_result_sample_raw(inputpath,sampleid,tumor): druggene_dir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_gene_infor20220525.txt' drugene_infor = pd.read_table(druggene_dir, sep='\t', header=0) drugene_infor.rename(columns={'chr': 'chromosome', 'start': 'exon_start', 'end': 'exon_end'}, inplace=True) druggene = pd.DataFrame(drugene_infor['gene'].drop_duplicates(keep='first')) druggene.reset_index(drop=True, inplace=True) # 获得基因标签数据 genetype = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/oncokb_cancerGeneList_20220517_final.txt' genetypedata = pd.read_table(genetype, sep='\t', header=0) genetypedata.rename(columns={'label': 'gene_label'}, inplace=True) ###读入数据 cnvinputpath = os.path.join(inputpath, '2CNV_cnvkit_pair') cnvdir = os.path.join(cnvinputpath, sampleid + '_cnvkit') # extrac the unseg cns #####提取unseg cns信息 unseg_genelist = pd.read_table(cnvdir + '/' + tumor + '_unseggene.list', sep='\t', names=['gene']) cns_all = pd.read_table(cnvdir + '/' + tumor + '_clean.cns', sep='\t', header=0) cns_all.rename(columns={'chromosome': 'chr'}, inplace=True) unseg_cns_all = pd.DataFrame() for i in range(len(unseg_genelist)): gene = unseg_genelist.loc[i, 'gene'] seg_genecns = cns_all[cns_all['gene'].str.contains(gene)] seg_genecns.reset_index(drop=True, inplace=True) seg_genecns.loc[0, 'gene'] = gene unseg_cns_all = unseg_cns_all.append(seg_genecns) unseg_cns_all.reset_index(drop=True, inplace=True) unseg_cns_all['Ratio'] = round(unseg_cns_all['log2'].apply(np.exp2), 2) ####读取segment的基因信息 seg_cns_all = pd.DataFrame() seg_genelist = pd.read_table(cnvdir + '/' + tumor + '_seggene.list', names=['gene']) for i in range(len(seg_genelist)): genename = seg_genelist.loc[i, 'gene'] segcnrdir = cnvdir + '/' + tumor + '_' + genename + '_gene.seg.cns' try: seg_cns = pd.read_table(segcnrdir, sep='\t', header=0) seg_cns.rename(columns={'chromosome': 'chr'}, inplace=True) seg_cns_all = seg_cns.append(seg_cns_all) except: print('there is no gene') continue seg_cns_all = seg_cns_all[seg_cns_all['gene'] != '-'] seg_cns_all.reset_index(drop=True, inplace=True) seg_cns_all['Ratio'] = round(seg_cns_all['log2'].apply(np.exp2), 2) seg_cns_all['Copynum']=round(seg_cns_all['Ratio']*2,2) ###将segementh与unsegment进行合并 all_cns = unseg_cns_all.append(seg_cns_all) all_cns.reset_index(drop=True, inplace=True) # 对拷贝数,筛选大于3的为扩增,小于1为缺失 all_cns_filter = all_cns[(all_cns['Ratio'] > CNVamplicon) | (all_cns['Ratio'] < CNVloss)] all_cns_filter.reset_index(drop=True, inplace=True) ##添加标签 titlelist = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2', 'Ratio','Copynum', 'depth', 'CNV_label', 'Gene_label'] if len(all_cns_filter) != 0: for m in range(len(all_cns_filter)): if all_cns_filter.loc[m, 'Ratio'] > CNVamplicon: all_cns_filter.loc[m, 'CNV_label'] = 'Amplification' elif all_cns_filter.loc[m, 'Ratio'] < CNVloss: all_cns_filter.loc[m, 'CNV_label'] = 'CopyNumberLoss' # select the CNV of drug related gene cnv_druggene = pd.merge(all_cns_filter, druggene, on=['gene'], how='inner') if len(cnv_druggene) != 0: # add the gene label oncogene or tumor suppersor drug_CNV_filter_genelabel = pd.merge(cnv_druggene, genetypedata, on=['gene'], how='left') drug_CNV_filter_genelabel.insert(0, 'sampleid', sampleid) drug_CNV_filter_genelabel.rename(columns={'gene_label':'Gene_label'},inplace=True) cnv_result = drug_CNV_filter_genelabel[titlelist] else: cnv_result = pd.DataFrame(columns=titlelist) cnv_result.loc[0, 'sampleid'] = sampleid cnv_result.loc[0, 'CNV_label'] = 'No result' else: cnv_result = pd.DataFrame(columns=titlelist) cnv_result.loc[0, 'sampleid'] = sampleid cnv_result.loc[0, 'CNV_label'] = 'No result' outputfile1 = os.path.join(cnvinputpath, sampleid + '.cnv.txt') cnv_result.to_csv(outputfile1, index=False, header=True, encoding='gbk', sep='\t') ###修改,当一个基因出现多个拷贝数结果式,需要重新进行分析, #gene=EGFR #cnvdir=${inputpath}/2CNV_cnvkit_pair/${sample}_cnvkit #cnvkit.py segment ${cnvdir}/${tumor}_${gene}_gene.cnr -o ${cnvdir}/${tumor}_${gene}_gene.seg.cns -m none --smooth-cbs def cnv_gene_cns(inputpath,sampleid,tumor,gene): cnvdir = os.path.join(inputpath, '2CNV_cnvkit_pair') cnvdir_sample = os.path.join(cnvdir, sampleid + '_cnvkit') sample_gene_cnrinput = os.path.join(cnvdir_sample, tumor + '_' + gene + '_gene.cnr') # 将原始的基因cns文件拷贝 rawgenecns = os.path.join(cnvdir_sample, tumor + '_' + gene + '_gene.seg.cns') rawgenecns_cpname = os.path.join(cnvdir_sample, tumor + '_' + gene + '_gene.seg.multiexon.cns') cpfile = 'cp -r ' + rawgenecns + ' ' + rawgenecns_cpname os.system(cpfile) #进行新的基因cns计算,代码会覆盖原来的结果 newcns_outputdir = os.path.join(cnvdir_sample, tumor + '_' + gene + '_gene.seg.cns') cnvsegcmd = 'cnvkit.py segment ' + sample_gene_cnrinput + ' -o ' + newcns_outputdir + ' -m none --smooth-cbs' os.system(cnvsegcmd) #为了方便核对以后的结果,我们也需要将新的进行拷贝 newgenecns_cpname=os.path.join(cnvdir_sample, tumor + '_' + gene + '_gene.seg.unique.cns') cpfilenew = 'cp -r ' + newcns_outputdir + ' ' + newgenecns_cpname os.system(cpfilenew) def gene_cnv_sample(inputpath,sampleid,tumor): ###读入数据 cnvinputpath = os.path.join(inputpath, '2CNV_cnvkit_pair') cnvdir = os.path.join(cnvinputpath, sampleid + '_cnvkit') ####读取segment的基因信息 seg_genelist = pd.read_table(cnvdir + '/' + tumor + '_seggene.list', names=['gene']) for i in range(len(seg_genelist)): genename = seg_genelist.loc[i, 'gene'] segcnrdir = cnvdir + '/' + tumor + '_' + genename + '_gene.seg.cns' if os.path.exists(segcnrdir): count = len(open(segcnrdir, 'r').readlines()) # 判断文件有几行,如果大于2行(2行包括了title)表示文件一个基因有多个exon扩增,那么需要重新执行cnv if count > 2: print(genename) cnv_gene_cns(inputpath, sampleid, tumor, genename) def cnv_result_sample(inputpath,sampleid,tumor): druggene_dir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_gene_infor20220525.txt' drugene_infor = pd.read_table(druggene_dir, sep='\t', header=0) drugene_infor.rename(columns={'chr': 'chromosome', 'start': 'exon_start', 'end': 'exon_end'}, inplace=True) druggene = pd.DataFrame(drugene_infor['gene'].drop_duplicates(keep='first')) druggene.reset_index(drop=True, inplace=True) # 获得基因标签数据 genetype = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/oncokb_cancerGeneList_20220517_final.txt' genetypedata = pd.read_table(genetype, sep='\t', header=0) genetypedata.rename(columns={'label': 'gene_label'}, inplace=True) ###读入数据 cnvinputpath = os.path.join(inputpath, '2CNV_cnvkit_pair') cnvdir = os.path.join(cnvinputpath, sampleid + '_cnvkit') # extrac the unseg cns #####提取unseg cns信息 unseg_genelist = pd.read_table(cnvdir + '/' + tumor + '_unseggene.list', sep='\t', names=['gene']) cns_all = pd.read_table(cnvdir + '/' + tumor + '_clean.cns', sep='\t', header=0) cns_all.rename(columns={'chromosome': 'chr'}, inplace=True) unseg_cns_all = pd.DataFrame() for i in range(len(unseg_genelist)): gene = unseg_genelist.loc[i, 'gene'] seg_genecns = cns_all[cns_all['gene'].str.contains(gene)] seg_genecns.reset_index(drop=True, inplace=True) seg_genecns.loc[0, 'gene'] = gene unseg_cns_all = unseg_cns_all.append(seg_genecns) unseg_cns_all.reset_index(drop=True, inplace=True) unseg_cns_all['Ratio'] = round(unseg_cns_all['log2'].apply(np.exp2), 2) ####读取segment的基因信息 seg_cns_all = pd.DataFrame() seg_genelist = pd.read_table(cnvdir + '/' + tumor + '_seggene.list', names=['gene']) for i in range(len(seg_genelist)): genename = seg_genelist.loc[i, 'gene'] segcnrdir = cnvdir + '/' + tumor + '_' + genename + '_gene.seg.cns' if os.path.exists(segcnrdir): seg_cns = pd.read_table(segcnrdir, sep='\t', header=0) seg_cns.rename(columns={'chromosome': 'chr'}, inplace=True) seg_cns_all = seg_cns.append(seg_cns_all) else: print('there is no gene') continue seg_cns_all = seg_cns_all[seg_cns_all['gene'] != '-'] seg_cns_all.reset_index(drop=True, inplace=True) seg_cns_all['Ratio'] = round(seg_cns_all['log2'].apply(np.exp2), 2) seg_cns_all['Copynum']=round(seg_cns_all['Ratio']*2,2) ###将segementh与unsegment进行合并 all_cns = unseg_cns_all.append(seg_cns_all) all_cns.reset_index(drop=True, inplace=True) # 对拷贝数,筛选大于3的为扩增,小于1为缺失 all_cns_filter = all_cns[(all_cns['Ratio'] > CNVamplicon) | (all_cns['Ratio'] < CNVloss)] all_cns_filter.reset_index(drop=True, inplace=True) ##添加标签 titlelist = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2', 'Ratio','Copynum', 'depth', 'CNV_label', 'Gene_label'] if len(all_cns_filter) != 0: for m in range(len(all_cns_filter)): if all_cns_filter.loc[m, 'Ratio'] > CNVamplicon: all_cns_filter.loc[m, 'CNV_label'] = 'Amplification' elif all_cns_filter.loc[m, 'Ratio'] < CNVloss: all_cns_filter.loc[m, 'CNV_label'] = 'CopyNumberLoss' # select the CNV of drug related gene cnv_druggene = pd.merge(all_cns_filter, druggene, on=['gene'], how='inner') if len(cnv_druggene) != 0: # add the gene label oncogene or tumor suppersor drug_CNV_filter_genelabel = pd.merge(cnv_druggene, genetypedata, on=['gene'], how='left') drug_CNV_filter_genelabel.insert(0, 'sampleid', sampleid) drug_CNV_filter_genelabel.rename(columns={'gene_label':'Gene_label'},inplace=True) cnv_result = drug_CNV_filter_genelabel[titlelist] else: cnv_result = pd.DataFrame(columns=titlelist) cnv_result.loc[0, 'sampleid'] = sampleid cnv_result.loc[0, 'CNV_label'] = 'No result' else: cnv_result = pd.DataFrame(columns=titlelist) cnv_result.loc[0, 'sampleid'] = sampleid cnv_result.loc[0, 'CNV_label'] = 'No result' outputfile1 = os.path.join(cnvinputpath, sampleid + '.cnv.txt') cnv_result.to_csv(outputfile1, index=False, header=True, encoding='gbk', sep='\t') ###输出unfilte的结果 def cnv_result_sample_unfilter(inputpath,sampleid,tumor): druggene_dir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_gene_infor20220525.txt' drugene_infor = pd.read_table(druggene_dir, sep='\t', header=0) drugene_infor.rename(columns={'chr': 'chromosome', 'start': 'exon_start', 'end': 'exon_end'}, inplace=True) druggene = pd.DataFrame(drugene_infor['gene'].drop_duplicates(keep='first')) druggene['druggene_label']='Drug_gene' druggene.reset_index(drop=True, inplace=True) # 获得基因标签数据 genetype = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/oncokb_cancerGeneList_20220517_final.txt' genetypedata = pd.read_table(genetype, sep='\t', header=0) genetypedata.rename(columns={'label': 'gene_label'}, inplace=True) ###读入数据 cnvinputpath = os.path.join(inputpath, '2CNV_cnvkit_pair') cnvdir = os.path.join(cnvinputpath, sampleid + '_cnvkit') # extrac the unseg cns #####提取unseg cns信息 unseg_genelist = pd.read_table(cnvdir + '/' + tumor + '_unseggene.list', sep='\t', names=['gene']) cns_all = pd.read_table(cnvdir + '/' + tumor + '_clean.cns', sep='\t', header=0) cns_all.rename(columns={'chromosome': 'chr'}, inplace=True) unseg_cns_all = pd.DataFrame() for i in range(len(unseg_genelist)): gene = unseg_genelist.loc[i, 'gene'] seg_genecns = cns_all[cns_all['gene'].str.contains(gene)] seg_genecns.reset_index(drop=True, inplace=True) seg_genecns.loc[0, 'gene'] = gene unseg_cns_all = unseg_cns_all.append(seg_genecns) unseg_cns_all.reset_index(drop=True, inplace=True) unseg_cns_all['Ratio'] = round(unseg_cns_all['log2'].apply(np.exp2), 2) unseg_cns_all['Copynum'] = round(unseg_cns_all['Ratio'] * 2, 2) ####读取segment的基因信息 seg_cns_all = pd.DataFrame() seg_genelist = pd.read_table(cnvdir + '/' + tumor + '_seggene.list', names=['gene']) for i in range(len(seg_genelist)): genename = seg_genelist.loc[i, 'gene'] segcnrdir = cnvdir + '/' + tumor + '_' + genename + '_gene.seg.cns' try: seg_cns = pd.read_table(segcnrdir, sep='\t', header=0) seg_cns.rename(columns={'chromosome': 'chr'}, inplace=True) seg_cns_all = seg_cns.append(seg_cns_all) except: print('there is no gene') continue seg_cns_all = seg_cns_all[seg_cns_all['gene'] != '-'] seg_cns_all.reset_index(drop=True, inplace=True) seg_cns_all['Ratio'] = round(seg_cns_all['log2'].apply(np.exp2), 2) seg_cns_all['Copynum'] = round(seg_cns_all['Ratio'] * 2, 2) ###将segementh与unsegment进行合并 all_cns = unseg_cns_all.append(seg_cns_all) all_cns.reset_index(drop=True, inplace=True) ##添加标签 titlelist = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2', 'Ratio','Copynum', 'depth', 'CNV_label','druggene_label', 'Gene_label'] if len(all_cns) != 0: for m in range(len(all_cns)): if all_cns.loc[m, 'Ratio'] > CNVamplicon: all_cns.loc[m, 'CNV_label'] = 'Amplification' elif all_cns.loc[m, 'Ratio'] < CNVloss: all_cns.loc[m, 'CNV_label'] = 'CopyNumberLoss' else: all_cns.loc[m, 'CNV_label'] ='' # select the CNV of drug related gene cnv_druggene = pd.merge(all_cns, druggene, on=['gene'], how='left') if len(cnv_druggene) != 0: # add the gene label oncogene or tumor suppersor drug_CNV_filter_genelabel = pd.merge(cnv_druggene, genetypedata, on=['gene'], how='left') drug_CNV_filter_genelabel.insert(0, 'sampleid', sampleid) drug_CNV_filter_genelabel.rename(columns={'gene_label':'Gene_label'},inplace=True) cnv_result = drug_CNV_filter_genelabel[titlelist] else: cnv_result = pd.DataFrame(columns=titlelist) cnv_result.loc[0, 'sampleid'] = sampleid cnv_result.loc[0, 'CNV_label'] = 'No result' else: cnv_result = pd.DataFrame(columns=titlelist) cnv_result.loc[0, 'sampleid'] = sampleid cnv_result.loc[0, 'CNV_label'] = 'No result' outputfile1 = os.path.join(cnvinputpath, sampleid + '.cnv.unfilter.txt') cnv_result.to_csv(outputfile1, index=False, header=True, encoding='gbk', sep='\t') if __name__=='__main__': parser = argparse.ArgumentParser(description='cnv data for cnvdata') parser.add_argument('-i', '--inputpath', type=str, help='the path of lane') parser.add_argument('-s', '--sampleid', type=str, help='sampleid') parser.add_argument('-t', '--tumor', type=str, help='tumorid') args = parser.parse_args() Inputpath = args.inputpath Sampleid = args.sampleid Tumor = args.tumor gene_cnv_sample(Inputpath,Sampleid,Tumor) cnv_result_sample(Inputpath,Sampleid,Tumor) cnv_result_sample_unfilter(Inputpath,Sampleid,Tumor)