123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349 |
- import pandas as pd
- import os
- import os.path
- import xlrd
- import argparse
- ####修改
- #经过统计男性X染色体上Ratio的中值为0.5,女性为1.
- #除性染色体外,对其他染色体上所有ratio小于0.6的cnv统计,发下有很多的cnvloss主要集中在Ratio[0.5,0.6]之间
- #但仍有很多集中在0.3-0.5之间。为了获得更为严苛的cnvloss.修订以下规则
- #1.筛选depth大于等于1000且ratio<=0.3的突变。
- ##2.对于扩增,组织,筛选ratio>=1.5;对于血液,筛选ratio>=4
- #3.去除化疗相关基因
- def cnv_sample_run_test(inputpath,sample):
- laneid = inputpath.split('/')[-1].split('-')[-1]
- panel_genelistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/602gene_select.txt'
- panel_genelist = pd.read_table(panel_genelistdir, sep='\t', header=0)
- druglistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_genelist_20220829.txt'
- drug_genelist = pd.read_table(druglistdir, sep='\t', header=None, names=['gene'])
- drug_genelist['drug_lable'] = 'Drug_gene'
- namelist = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2','Ratio', 'Copynum', 'depth', 'CNV_label','Gene_label']
- namelistout = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2', 'Copynum', 'depth', 'CNV_label','Gene_label']
- datasummarydir = os.path.join(inputpath, 'datasummary')
- isExists = os.path.exists(datasummarydir)
- if not isExists:
- os.makedirs(datasummarydir)
- cnvdir = os.path.join(inputpath, '2CNV_cnvkit_pair')
- sampledir=os.path.join(inputpath,laneid+'_sample_infor_label.txt')
- samplelist = pd.read_table(sampledir, sep='\t')
- sampledata = samplelist[samplelist['samplename'] == sample]
- sampledata.reset_index(drop=True, inplace=True)
- sample = sampledata.loc[0, 'samplename']
- sampletype = sampledata.loc[0, 'sampletype']
- # make the result dir
- result_dir = os.path.join(inputpath, 'resultfile')
- if not os.path.exists(result_dir):
- os.mkdir(result_dir)
- sample_dir = os.path.join(result_dir, sample)
- if not os.path.exists(sample_dir):
- os.mkdir(sample_dir)
- cnvdatadir = cnvdir + '/' + sample + '.cnv.unfilter.txt'
- if os.path.exists(cnvdatadir):
- cnvdata = pd.read_table(cnvdatadir, sep='\t', header=0)
- cnvfilter0 = cnvdata.dropna(subset=['CNV_label'])
- cnvfilter1 = cnvfilter0[cnvfilter0['CNV_label'] != '']
- # extract the panel gene
- cnvfilter = pd.merge(cnvfilter1, panel_genelist, on=['gene'], how='inner')
- cnvfilter.reset_index(drop=True, inplace=True)
- del cnvfilter['druggene_label']
- if len(cnvfilter) == 0:
- cnvfilter_final = pd.DataFrame(columns=namelist)
- cnvfilter_final.loc[0, 'sampleid'] = sample
- cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
- else:
- ##remove the drug gene
- CNVfilter_druglabel = pd.merge(cnvfilter, drug_genelist, on=['gene'], how='left')
- CNVfilter1 = CNVfilter_druglabel[CNVfilter_druglabel['drug_lable'] != 'Drug_gene']
- del CNVfilter1['drug_lable']
- if len(CNVfilter1) != 0:
- # 筛选CNV,对于组织,Ratio>=1.5或者Ratio<=0.3;对于血液Ratio>=4或者Ratio<=0.3
- if sampletype == 'FFPE':
- CNVfilter2 = CNVfilter1[(CNVfilter1['Ratio'] >= 1.5) | (CNVfilter1['Ratio'] <= 0.3)]
- elif sampletype == 'blood':
- CNVfilter2 = CNVfilter1[(CNVfilter1['Ratio'] >= 4) | (CNVfilter1['Ratio'] <= 0.3)]
- if len(CNVfilter2) != 0:
- ##对cnvloss进一步过滤,筛选depth>=1000
- cnvlossdata = CNVfilter2[CNVfilter2['CNV_label'] == 'CopyNumberLoss']
- cnvloss_filter = cnvlossdata[cnvlossdata['depth'] >= 1000]
- cnvfilter_final = CNVfilter2[CNVfilter2['CNV_label'] != 'CopyNumberLoss'].append(cnvloss_filter)
- cnvfilter_final.reset_index(drop=True, inplace=True)
- if len(cnvfilter_final) == 0:
- cnvfilter_final = pd.DataFrame(columns=namelist)
- cnvfilter_final.loc[0, 'sampleid'] = sample
- cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
- else:
- cnvfilter_final = pd.DataFrame(columns=namelist)
- cnvfilter_final.loc[0, 'sampleid'] = sample
- cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
- else:
- cnvfilter_final = pd.DataFrame(columns=namelist)
- cnvfilter_final.loc[0, 'sampleid'] = sample
- cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
- cnvfilter_output = cnvfilter_final[namelistout]
- outputfile1 = os.path.join(sample_dir, laneid + '-' + sample + '.cnv.xlsx')
- writer = pd.ExcelWriter(outputfile1)
- cnvfilter_output.to_excel(writer, sheet_name='CNV', index=False)
- writer.save()
- writer.close()
- else:
- cnvfilter_final = pd.DataFrame(columns=namelist)
- cnvfilter_final.loc[0, 'sampleid'] = sample
- cnvfilter_final.loc[0, 'CNV_label'] = 'No file'
- # make the temp dir
- temp_dir = os.path.join(inputpath, 'tempfile')
- if not os.path.exists(temp_dir):
- os.mkdir(temp_dir)
- bugfile_dir = os.path.join(temp_dir, 'bugfile')
- if not os.path.exists(bugfile_dir):
- os.mkdir(bugfile_dir)
- outputfile2 = os.path.join(bugfile_dir, laneid + '-' + sample + '.cnv.nofile.log.txt')
- cnvfilter_final.to_csv(outputfile2, index=False, header=True, encoding='gbk', sep='\t')
- ####加入在样本中出现的次数
- def CNVcounts(cnvfilter,inputpath,sample):
- # add the cnv counts in 117 samples
- cnvbgdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/CNV_117sample_stastic_risk_20220929.txt'
- cnvbgdata = pd.read_table(cnvbgdir, sep='\t', header=0)
- cnvfilter['CNV_gene'] = cnvfilter['gene'] + '_' + cnvfilter['chr'].astype('str') + '_' + cnvfilter['start'].astype('str') + '_' + cnvfilter['end'].astype('str')
- cnvfilter_counts = pd.merge(cnvfilter, cnvbgdata, on=['CNV_gene'], how='left')
- del cnvfilter_counts['CNV_gene']
- #add the tier infor
- cnvtierdir1 = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/CNV_tier_infor.txt'
- cnvtierdata= pd.read_table(cnvtierdir1, sep='\t', header=0)
- cnvfilter_counts_tier=pd.merge(cnvfilter_counts,cnvtierdata,on=['gene','CNV_label'],how='left')
- # make the temp dir
- temp_dir = os.path.join(inputpath, 'tempfile')
- if not os.path.exists(temp_dir):
- os.mkdir(temp_dir)
- tempcnv_dir = os.path.join(temp_dir, 'CNV')
- if not os.path.exists(tempcnv_dir):
- os.mkdir(tempcnv_dir)
- outputname = os.path.join(tempcnv_dir, sample + '_CNVunfilter_samplecounts.txt')
- cnvfilter_counts_tier.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')
- return cnvfilter_counts_tier
- ####修改
- #经过统计男性X染色体上Ratio的中值为0.5,女性为1.
- #除性染色体外,对其他染色体上所有ratio小于0.6的cnv统计,发下有很多的cnvloss主要集中在Ratio[0.5,0.6]之间
- #但仍有很多集中在0.3-0.5之间。为了获得更为严苛的cnvloss.修订以下规则
- #1.筛选depth大于等于1000且ratio<=0.3的突变。
- ##2.对于扩增,组织,筛选ratio>=2;对于血液,筛选ratio>=4
- #3.去除化疗相关基因
- def cnv_sample_run(inputpath,sample):
- laneid = inputpath.split('/')[-1].split('-')[-1]
- panel_genelistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/602gene_select.txt'
- panel_genelist = pd.read_table(panel_genelistdir, sep='\t', header=0)
- druglistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_genelist_20220829.txt'
- drug_genelist = pd.read_table(druglistdir, sep='\t', header=None, names=['gene'])
- drug_genelist['drug_lable'] = 'Drug_gene'
- namelist = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2','Ratio', 'Copynum', 'depth', 'CNV_label','Gene_label','Tier_label']
- namelistout = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2', 'Copynum', 'depth', 'CNV_label','Gene_label','Tier_label']
- datasummarydir = os.path.join(inputpath, 'datasummary')
- isExists = os.path.exists(datasummarydir)
- if not isExists:
- os.makedirs(datasummarydir)
- cnvdir = os.path.join(inputpath, '2CNV_cnvkit_pair')
- sampledir=os.path.join(inputpath,laneid+'_sample_infor_label.txt')
- samplelist = pd.read_table(sampledir, sep='\t')
- sampledata = samplelist[samplelist['samplename'] == sample]
- sampledata.reset_index(drop=True, inplace=True)
- sampletype = sampledata.loc[0, 'sampletype']
- # make the result dir
- result_dir = os.path.join(inputpath, 'resultfile')
- if not os.path.exists(result_dir):
- os.mkdir(result_dir)
- sample_dir = os.path.join(result_dir, sample)
- if not os.path.exists(sample_dir):
- os.mkdir(sample_dir)
- cnvdatadir = cnvdir + '/' + sample + '.cnv.unfilter.txt'
- if os.path.exists(cnvdatadir):
- cnvdata = pd.read_table(cnvdatadir, sep='\t', header=0)
- cnvfilter0 = cnvdata.dropna(subset=['CNV_label'])
- cnvfilter1 = cnvfilter0[cnvfilter0['CNV_label'] != '']
- # extract the panel gene
- cnvfilter1_0 = pd.merge(cnvfilter1, panel_genelist, on=['gene'], how='inner')
- cnvfilter1_0 .reset_index(drop=True, inplace=True)
- del cnvfilter1_0['druggene_label']
- # add the cnv counts in 117 samples
- cnvfilter = CNVcounts(cnvfilter1_0, inputpath, sample)
- if len(cnvfilter) == 0:
- cnvfilter_final = pd.DataFrame(columns=namelist)
- cnvfilter_final.loc[0, 'sampleid'] = sample
- cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
- else:
- ##remove the drug gene
- CNVfilter_druglabel = pd.merge(cnvfilter, drug_genelist, on=['gene'], how='left')
- CNVfilter1 = CNVfilter_druglabel[CNVfilter_druglabel['drug_lable'] != 'Drug_gene']
- del CNVfilter1['drug_lable']
- if len(CNVfilter1) != 0:
- # 筛选CNV,对于组织,Ratio>=2或者Ratio<=0.3;对于血液Ratio>=4或者Ratio<=0.3
- if sampletype == 'FFPE':
- CNVfilter2 = CNVfilter1[(CNVfilter1['Ratio'] >= 2) | (CNVfilter1['Ratio'] <= 0.3)]
- elif sampletype == 'blood':
- CNVfilter2 = CNVfilter1[(CNVfilter1['Ratio'] >= 4) | (CNVfilter1['Ratio'] <= 0.3)]
- if len(CNVfilter2) != 0:
- ##对cnvloss进一步过滤,筛选depth>=1000
- cnvlossdata = CNVfilter2[CNVfilter2['CNV_label'] == 'CopyNumberLoss']
- cnvloss_filter = cnvlossdata[cnvlossdata['depth'] >= 1000]
- cnvfilter_cnvloss = CNVfilter2[CNVfilter2['CNV_label'] != 'CopyNumberLoss'].append(cnvloss_filter)
- cnvfilter_cnvloss.reset_index(drop=True, inplace=True)
- #筛选在117个样本中出现频率小于50%的CNV
- if len(cnvfilter_cnvloss) == 0:
- cnvfilter_final = pd.DataFrame(columns=namelist)
- cnvfilter_final.loc[0, 'sampleid'] = sample
- cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
- else:
- cnvfilter_final0 = cnvfilter_cnvloss[cnvfilter_cnvloss['Sample_ratio'] < 0.5]
- if len(cnvfilter_final0) != 0:
- cnvfilter_final = cnvfilter_final0.drop(labels=['Sample_ratio', 'Sample_counts'], axis=1)
- cnvfilter_final.reset_index(drop=True, inplace=True)
- else:
- cnvfilter_final = pd.DataFrame(columns=namelist)
- cnvfilter_final.loc[0, 'sampleid'] = sample
- cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
- else:
- cnvfilter_final = pd.DataFrame(columns=namelist)
- cnvfilter_final.loc[0, 'sampleid'] = sample
- cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
- else:
- cnvfilter_final = pd.DataFrame(columns=namelist)
- cnvfilter_final.loc[0, 'sampleid'] = sample
- cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
- cnvfilter_output = cnvfilter_final[namelistout]
- outputfile1 = os.path.join(sample_dir, laneid + '-' + sample + '.cnv.xlsx')
- writer = pd.ExcelWriter(outputfile1)
- cnvfilter_output.to_excel(writer, sheet_name='CNV', index=False)
- writer.save()
- writer.close()
- else:
- cnvfilter_final = pd.DataFrame(columns=namelist)
- cnvfilter_final.loc[0, 'sampleid'] = sample
- cnvfilter_final.loc[0, 'CNV_label'] = 'No file'
- # make the temp dir
- temp_dir = os.path.join(inputpath, 'tempfile')
- if not os.path.exists(temp_dir):
- os.mkdir(temp_dir)
- bugfile_dir = os.path.join(temp_dir, 'bugfile')
- if not os.path.exists(bugfile_dir):
- os.mkdir(bugfile_dir)
- outputfile2 = os.path.join(bugfile_dir, laneid + '-' + sample + '.cnv.nofile.log.txt')
- cnvfilter_final.to_csv(outputfile2, index=False, header=True, encoding='gbk', sep='\t')
- if __name__=='__main__':
- parser = argparse.ArgumentParser(description='filter the CNV')
- parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
- parser.add_argument('-s', '--sample', type=str, help='the sample name')
- args = parser.parse_args()
- Inputpath = args.inputpath
- Sample = args.sample
- cnv_sample_run(Inputpath,Sample)
- ####所有样本汇总不执行
- def cnv_datasummary(inputpath,laneid):
- panel_genelistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/602gene_select.txt'
- panel_genelist = pd.read_table(panel_genelistdir, sep='\t', header=0)
- druglistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_genelist_20220829.txt'
- drug_genelist = pd.read_table(druglistdir, sep='\t', header=None, names=['gene'])
- drug_genelist['drug_lable'] = 'Drug_gene'
- namelist = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2','Ratio', 'Copynum', 'depth', 'CNV_label','Gene_label']
- namelistout = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2', 'Copynum', 'depth', 'CNV_label','Gene_label']
- datasummarydir = os.path.join(inputpath, 'datasummary')
- isExists = os.path.exists(datasummarydir)
- if not isExists:
- os.makedirs(datasummarydir)
- cnvdir = os.path.join(inputpath, '2CNV_cnvkit_pair')
- cnvsummary = pd.DataFrame()
- sampledir=os.path.join(inputpath,laneid+'_sample_infor_label.txt')
- samplelist = pd.read_table(sampledir, sep='\t')
- for i in range(len(samplelist)):
- sample = samplelist.loc[i, 'samplename']
- sampletype=samplelist.loc[i, 'sampletype']
- # make the result dir
- result_dir = os.path.join(inputpath, 'resultfile')
- if not os.path.exists(result_dir):
- os.mkdir(result_dir)
- sample_dir = os.path.join(result_dir, sample)
- if not os.path.exists(sample_dir):
- os.mkdir(sample_dir)
- cnvdatadir = cnvdir + '/' + sample + '.cnv.unfilter.txt'
- if os.path.exists(cnvdatadir):
- cnvdata = pd.read_table(cnvdatadir, sep='\t', header=0)
- cnvfilter0=cnvdata.dropna(subset=['CNV_label'])
- cnvfilter1 = cnvfilter0[cnvfilter0['CNV_label']!='']
- # extract the panel gene
- cnvfilter = pd.merge(cnvfilter1, panel_genelist, on=['gene'], how='inner')
- cnvfilter.reset_index(drop=True,inplace=True)
- del cnvfilter['druggene_label']
- if len(cnvfilter)==0:
- cnvfilter_final = pd.DataFrame(columns=namelist)
- cnvfilter_final.loc[0, 'sampleid'] = sample
- cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
- else:
- ##remove the drug gene
- CNVfilter_druglabel = pd.merge(cnvfilter, drug_genelist, on=['gene'], how='left')
- CNVfilter1 = CNVfilter_druglabel[CNVfilter_druglabel['drug_lable'] != 'Drug_gene']
- del CNVfilter1['drug_lable']
- if len(CNVfilter1)!=0:
- # 筛选CNV,对于组织,Ratio>=1.5或者Ratio<=0.3;对于血液Ratio>=4或者Ratio<=0.3
- if sampletype=='FFPE':
- CNVfilter2 = CNVfilter1[(CNVfilter1['Ratio'] >= 1.5) | (CNVfilter1['Ratio'] <= 0.3)]
- elif sampletype=='blood':
- CNVfilter2 = CNVfilter1[(CNVfilter1['Ratio'] >= 4) | (CNVfilter1['Ratio'] <= 0.3)]
- if len(CNVfilter2)!=0:
- ##对cnvloss进一步过滤,筛选depth>=1000
- cnvlossdata = CNVfilter2[CNVfilter2['CNV_label'] == 'CopyNumberLoss']
- cnvloss_filter = cnvlossdata[cnvlossdata['depth'] >= 1000]
- cnvfilter_final = CNVfilter2[CNVfilter2['CNV_label'] != 'CopyNumberLoss'].append(cnvloss_filter)
- cnvfilter_final.reset_index(drop=True, inplace=True)
- if len(cnvfilter_final) == 0:
- cnvfilter_final = pd.DataFrame(columns=namelist)
- cnvfilter_final.loc[0, 'sampleid'] = sample
- cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
- else:
- cnvfilter_final = pd.DataFrame(columns=namelist)
- cnvfilter_final.loc[0, 'sampleid'] = sample
- cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
- else:
- cnvfilter_final = pd.DataFrame(columns=namelist)
- cnvfilter_final.loc[0, 'sampleid'] = sample
- cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
- cnvfilter_output=cnvfilter_final[namelistout]
- outputfile1 = os.path.join(sample_dir, laneid + '-' + sample + '.cnv.xlsx')
- writer = pd.ExcelWriter(outputfile1)
- cnvfilter_output.to_excel(writer, sheet_name='CNV', index=False)
- writer.save()
- writer.close()
- else:
- cnvfilter_final=pd.DataFrame(columns=namelist)
- cnvfilter_final.loc[0,'sampleid']=sample
- cnvfilter_final.loc[0,'CNV_label']='No file'
- cnvsummary = cnvsummary.append(cnvfilter_final)
- outputname = datasummarydir + '/' + laneid + '_table4_cnv_datasummary.txt'
- cnvsummary.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')
|