import pandas as pd import os,os.path import math import xlrd import numpy as np import argparse ###############改 #####1.将有overlape的基因拆分自各自的gene中,结果以gene为单位,获得对应的cnr文件 #####2.将每个基因的cnr文件进行cnvkt segment分析获得cns文件。 #####3.对所有基因的cns文件合并,并进行cnv过滤 # ###4.当多个基因的具有相同的数据时,panel中的基因优先 def genesegment_cnvkit(inputpath,sampleid,tumor): cnvinputpath = os.path.join(inputpath, '2CNV_cnvkit_pair') samplelistdir=os.path.join(cnvinputpath, sampleid+'_cnvkit') cnrdir = samplelistdir + '/' +tumor + '_clean.cnr' cnrdata = pd.read_table(cnrdir, sep='\t', header=0) cnrdata_2 = cnrdata['gene'].str.split(',', expand=True) cnrdata_2 = cnrdata_2.stack() cnrdata_2 = cnrdata_2.reset_index(level=1, drop=True) cnrdata_2 = pd.DataFrame(cnrdata_2) cnrdata_2.columns = ["gene"] cnrdata_new = cnrdata.drop("gene", axis=1).join(cnrdata_2) cnrdata_new1 = cnrdata_new[['chromosome', 'start', 'end', 'gene', 'depth', 'log2', 'weight']] ###gene list gene = cnrdata_new1['gene'].drop_duplicates(keep='first') gene = pd.DataFrame(gene) gene1 = gene[gene['gene'] != '-'] gene1.reset_index(drop=True, inplace=True) #gene1.to_csv(cnvdir + '/' + sampleid + '_gene.list', index=False, header=None, encoding='gbk', sep='\t') ##output the cnr of gene seg_genelist = pd.DataFrame() unseg_genelist = pd.DataFrame() k = 0 m = 0 for j in range(len(gene1)): genename = gene1.loc[j, 'gene'] genecnr = cnrdata_new1[cnrdata_new1['gene'] == genename] genecnr.reset_index(drop=True, inplace=True) # for multi region segment if len(genecnr) > 1: outname = samplelistdir + '/' + tumor + '_' + genename + '_gene.cnr' genecnr.to_csv(outname, index=False, header=True, encoding='gbk', sep='\t') seg_genelist.loc[k, 'gene'] = genename k = k + 1 else: unseg_genelist.loc[m, 'gene'] = genename m = m + 1 ### seg_genelist.to_csv(samplelistdir + '/' + tumor + '_seggene.list', index=False, header=None, encoding='gbk', sep='\t') unseg_genelist.to_csv(samplelistdir + '/' + tumor + '_unseggene.list', index=False, header=None, encoding='gbk',sep='\t') if __name__=='__main__': parser = argparse.ArgumentParser(description='segment cnr for cnvdata') parser.add_argument('-i', '--inputpath', type=str, help='the path of lane') parser.add_argument('-s', '--sampleid', type=str, help='sampleid') parser.add_argument('-t', '--tumor', type=str, help='tumorid') args = parser.parse_args() Inputpath = args.inputpath Sampleid = args.sampleid Tumor = args.tumor genesegment_cnvkit(Inputpath,Sampleid,Tumor)