cnvkit_cnr_gene20220525.py 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. import pandas as pd
  2. import os,os.path
  3. import math
  4. import xlrd
  5. import numpy as np
  6. import argparse
  7. ###############改
  8. #####1.将有overlape的基因拆分自各自的gene中,结果以gene为单位,获得对应的cnr文件
  9. #####2.将每个基因的cnr文件进行cnvkt segment分析获得cns文件。
  10. #####3.对所有基因的cns文件合并,并进行cnv过滤
  11. # ###4.当多个基因的具有相同的数据时,panel中的基因优先
  12. def genesegment_cnvkit(inputpath,sampleid,tumor):
  13. cnvinputpath = os.path.join(inputpath, '2CNV_cnvkit_pair')
  14. samplelistdir=os.path.join(cnvinputpath, sampleid+'_cnvkit')
  15. cnrdir = samplelistdir + '/' +tumor + '_clean.cnr'
  16. cnrdata = pd.read_table(cnrdir, sep='\t', header=0)
  17. cnrdata_2 = cnrdata['gene'].str.split(',', expand=True)
  18. cnrdata_2 = cnrdata_2.stack()
  19. cnrdata_2 = cnrdata_2.reset_index(level=1, drop=True)
  20. cnrdata_2 = pd.DataFrame(cnrdata_2)
  21. cnrdata_2.columns = ["gene"]
  22. cnrdata_new = cnrdata.drop("gene", axis=1).join(cnrdata_2)
  23. cnrdata_new1 = cnrdata_new[['chromosome', 'start', 'end', 'gene', 'depth', 'log2', 'weight']]
  24. ###gene list
  25. gene = cnrdata_new1['gene'].drop_duplicates(keep='first')
  26. gene = pd.DataFrame(gene)
  27. gene1 = gene[gene['gene'] != '-']
  28. gene1.reset_index(drop=True, inplace=True)
  29. #gene1.to_csv(cnvdir + '/' + sampleid + '_gene.list', index=False, header=None, encoding='gbk', sep='\t')
  30. ##output the cnr of gene
  31. seg_genelist = pd.DataFrame()
  32. unseg_genelist = pd.DataFrame()
  33. k = 0
  34. m = 0
  35. for j in range(len(gene1)):
  36. genename = gene1.loc[j, 'gene']
  37. genecnr = cnrdata_new1[cnrdata_new1['gene'] == genename]
  38. genecnr.reset_index(drop=True, inplace=True)
  39. # for multi region segment
  40. if len(genecnr) > 1:
  41. outname = samplelistdir + '/' + tumor + '_' + genename + '_gene.cnr'
  42. genecnr.to_csv(outname, index=False, header=True, encoding='gbk', sep='\t')
  43. seg_genelist.loc[k, 'gene'] = genename
  44. k = k + 1
  45. else:
  46. unseg_genelist.loc[m, 'gene'] = genename
  47. m = m + 1
  48. ###
  49. seg_genelist.to_csv(samplelistdir + '/' + tumor + '_seggene.list', index=False, header=None, encoding='gbk', sep='\t')
  50. unseg_genelist.to_csv(samplelistdir + '/' + tumor + '_unseggene.list', index=False, header=None, encoding='gbk',sep='\t')
  51. if __name__=='__main__':
  52. parser = argparse.ArgumentParser(description='segment cnr for cnvdata')
  53. parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
  54. parser.add_argument('-s', '--sampleid', type=str, help='sampleid')
  55. parser.add_argument('-t', '--tumor', type=str, help='tumorid')
  56. args = parser.parse_args()
  57. Inputpath = args.inputpath
  58. Sampleid = args.sampleid
  59. Tumor = args.tumor
  60. genesegment_cnvkit(Inputpath,Sampleid,Tumor)