123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566 |
- import pandas as pd
- import os,os.path
- import math
- import xlrd
- import numpy as np
- import argparse
- ###############改
- #####1.将有overlape的基因拆分自各自的gene中,结果以gene为单位,获得对应的cnr文件
- #####2.将每个基因的cnr文件进行cnvkt segment分析获得cns文件。
- #####3.对所有基因的cns文件合并,并进行cnv过滤
- # ###4.当多个基因的具有相同的数据时,panel中的基因优先
- def genesegment_cnvkit(inputpath,sampleid,tumor):
- cnvinputpath = os.path.join(inputpath, '2CNV_cnvkit_pair')
- samplelistdir=os.path.join(cnvinputpath, sampleid+'_cnvkit')
- cnrdir = samplelistdir + '/' +tumor + '_clean.cnr'
- cnrdata = pd.read_table(cnrdir, sep='\t', header=0)
- cnrdata_2 = cnrdata['gene'].str.split(',', expand=True)
- cnrdata_2 = cnrdata_2.stack()
- cnrdata_2 = cnrdata_2.reset_index(level=1, drop=True)
- cnrdata_2 = pd.DataFrame(cnrdata_2)
- cnrdata_2.columns = ["gene"]
- cnrdata_new = cnrdata.drop("gene", axis=1).join(cnrdata_2)
- cnrdata_new1 = cnrdata_new[['chromosome', 'start', 'end', 'gene', 'depth', 'log2', 'weight']]
- ###gene list
- gene = cnrdata_new1['gene'].drop_duplicates(keep='first')
- gene = pd.DataFrame(gene)
- gene1 = gene[gene['gene'] != '-']
- gene1.reset_index(drop=True, inplace=True)
- #gene1.to_csv(cnvdir + '/' + sampleid + '_gene.list', index=False, header=None, encoding='gbk', sep='\t')
- ##output the cnr of gene
- seg_genelist = pd.DataFrame()
- unseg_genelist = pd.DataFrame()
- k = 0
- m = 0
- for j in range(len(gene1)):
- genename = gene1.loc[j, 'gene']
- genecnr = cnrdata_new1[cnrdata_new1['gene'] == genename]
- genecnr.reset_index(drop=True, inplace=True)
- # for multi region segment
- if len(genecnr) > 1:
- outname = samplelistdir + '/' + tumor + '_' + genename + '_gene.cnr'
- genecnr.to_csv(outname, index=False, header=True, encoding='gbk', sep='\t')
- seg_genelist.loc[k, 'gene'] = genename
- k = k + 1
- else:
- unseg_genelist.loc[m, 'gene'] = genename
- m = m + 1
- ###
- seg_genelist.to_csv(samplelistdir + '/' + tumor + '_seggene.list', index=False, header=None, encoding='gbk', sep='\t')
- unseg_genelist.to_csv(samplelistdir + '/' + tumor + '_unseggene.list', index=False, header=None, encoding='gbk',sep='\t')
- if __name__=='__main__':
- parser = argparse.ArgumentParser(description='segment cnr for cnvdata')
- parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
- parser.add_argument('-s', '--sampleid', type=str, help='sampleid')
- parser.add_argument('-t', '--tumor', type=str, help='tumorid')
- args = parser.parse_args()
- Inputpath = args.inputpath
- Sampleid = args.sampleid
- Tumor = args.tumor
- genesegment_cnvkit(Inputpath,Sampleid,Tumor)
|