cnvkit_cns_merge_20220823.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301
  1. import pandas as pd
  2. import os,os.path
  3. import numpy as np
  4. import math
  5. import xlrd
  6. import argparse
  7. ###选择ratio大于1.5的为amplication,小于0.6为缺失
  8. CNVamplicon=1.5
  9. CNVloss=0.6
  10. def cnv_result_sample_raw(inputpath,sampleid,tumor):
  11. druggene_dir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_gene_infor20220525.txt'
  12. drugene_infor = pd.read_table(druggene_dir, sep='\t', header=0)
  13. drugene_infor.rename(columns={'chr': 'chromosome', 'start': 'exon_start', 'end': 'exon_end'}, inplace=True)
  14. druggene = pd.DataFrame(drugene_infor['gene'].drop_duplicates(keep='first'))
  15. druggene.reset_index(drop=True, inplace=True)
  16. # 获得基因标签数据
  17. genetype = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/oncokb_cancerGeneList_20220517_final.txt'
  18. genetypedata = pd.read_table(genetype, sep='\t', header=0)
  19. genetypedata.rename(columns={'label': 'gene_label'}, inplace=True)
  20. ###读入数据
  21. cnvinputpath = os.path.join(inputpath, '2CNV_cnvkit_pair')
  22. cnvdir = os.path.join(cnvinputpath, sampleid + '_cnvkit')
  23. # extrac the unseg cns
  24. #####提取unseg cns信息
  25. unseg_genelist = pd.read_table(cnvdir + '/' + tumor + '_unseggene.list', sep='\t', names=['gene'])
  26. cns_all = pd.read_table(cnvdir + '/' + tumor + '_clean.cns', sep='\t', header=0)
  27. cns_all.rename(columns={'chromosome': 'chr'}, inplace=True)
  28. unseg_cns_all = pd.DataFrame()
  29. for i in range(len(unseg_genelist)):
  30. gene = unseg_genelist.loc[i, 'gene']
  31. seg_genecns = cns_all[cns_all['gene'].str.contains(gene)]
  32. seg_genecns.reset_index(drop=True, inplace=True)
  33. seg_genecns.loc[0, 'gene'] = gene
  34. unseg_cns_all = unseg_cns_all.append(seg_genecns)
  35. unseg_cns_all.reset_index(drop=True, inplace=True)
  36. unseg_cns_all['Ratio'] = round(unseg_cns_all['log2'].apply(np.exp2), 2)
  37. ####读取segment的基因信息
  38. seg_cns_all = pd.DataFrame()
  39. seg_genelist = pd.read_table(cnvdir + '/' + tumor + '_seggene.list', names=['gene'])
  40. for i in range(len(seg_genelist)):
  41. genename = seg_genelist.loc[i, 'gene']
  42. segcnrdir = cnvdir + '/' + tumor + '_' + genename + '_gene.seg.cns'
  43. try:
  44. seg_cns = pd.read_table(segcnrdir, sep='\t', header=0)
  45. seg_cns.rename(columns={'chromosome': 'chr'}, inplace=True)
  46. seg_cns_all = seg_cns.append(seg_cns_all)
  47. except:
  48. print('there is no gene')
  49. continue
  50. seg_cns_all = seg_cns_all[seg_cns_all['gene'] != '-']
  51. seg_cns_all.reset_index(drop=True, inplace=True)
  52. seg_cns_all['Ratio'] = round(seg_cns_all['log2'].apply(np.exp2), 2)
  53. seg_cns_all['Copynum']=round(seg_cns_all['Ratio']*2,2)
  54. ###将segementh与unsegment进行合并
  55. all_cns = unseg_cns_all.append(seg_cns_all)
  56. all_cns.reset_index(drop=True, inplace=True)
  57. # 对拷贝数,筛选大于3的为扩增,小于1为缺失
  58. all_cns_filter = all_cns[(all_cns['Ratio'] > CNVamplicon) | (all_cns['Ratio'] < CNVloss)]
  59. all_cns_filter.reset_index(drop=True, inplace=True)
  60. ##添加标签
  61. titlelist = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2', 'Ratio','Copynum', 'depth', 'CNV_label', 'Gene_label']
  62. if len(all_cns_filter) != 0:
  63. for m in range(len(all_cns_filter)):
  64. if all_cns_filter.loc[m, 'Ratio'] > CNVamplicon:
  65. all_cns_filter.loc[m, 'CNV_label'] = 'Amplification'
  66. elif all_cns_filter.loc[m, 'Ratio'] < CNVloss:
  67. all_cns_filter.loc[m, 'CNV_label'] = 'CopyNumberLoss'
  68. # select the CNV of drug related gene
  69. cnv_druggene = pd.merge(all_cns_filter, druggene, on=['gene'], how='inner')
  70. if len(cnv_druggene) != 0:
  71. # add the gene label oncogene or tumor suppersor
  72. drug_CNV_filter_genelabel = pd.merge(cnv_druggene, genetypedata, on=['gene'], how='left')
  73. drug_CNV_filter_genelabel.insert(0, 'sampleid', sampleid)
  74. drug_CNV_filter_genelabel.rename(columns={'gene_label':'Gene_label'},inplace=True)
  75. cnv_result = drug_CNV_filter_genelabel[titlelist]
  76. else:
  77. cnv_result = pd.DataFrame(columns=titlelist)
  78. cnv_result.loc[0, 'sampleid'] = sampleid
  79. cnv_result.loc[0, 'CNV_label'] = 'No result'
  80. else:
  81. cnv_result = pd.DataFrame(columns=titlelist)
  82. cnv_result.loc[0, 'sampleid'] = sampleid
  83. cnv_result.loc[0, 'CNV_label'] = 'No result'
  84. outputfile1 = os.path.join(cnvinputpath, sampleid + '.cnv.txt')
  85. cnv_result.to_csv(outputfile1, index=False, header=True, encoding='gbk', sep='\t')
  86. ###修改,当一个基因出现多个拷贝数结果式,需要重新进行分析,
  87. #gene=EGFR
  88. #cnvdir=${inputpath}/2CNV_cnvkit_pair/${sample}_cnvkit
  89. #cnvkit.py segment ${cnvdir}/${tumor}_${gene}_gene.cnr -o ${cnvdir}/${tumor}_${gene}_gene.seg.cns -m none --smooth-cbs
  90. def cnv_gene_cns(inputpath,sampleid,tumor,gene):
  91. cnvdir = os.path.join(inputpath, '2CNV_cnvkit_pair')
  92. cnvdir_sample = os.path.join(cnvdir, sampleid + '_cnvkit')
  93. sample_gene_cnrinput = os.path.join(cnvdir_sample, tumor + '_' + gene + '_gene.cnr')
  94. # 将原始的基因cns文件拷贝
  95. rawgenecns = os.path.join(cnvdir_sample, tumor + '_' + gene + '_gene.seg.cns')
  96. rawgenecns_cpname = os.path.join(cnvdir_sample, tumor + '_' + gene + '_gene.seg.multiexon.cns')
  97. cpfile = 'cp -r ' + rawgenecns + ' ' + rawgenecns_cpname
  98. os.system(cpfile)
  99. #进行新的基因cns计算,代码会覆盖原来的结果
  100. newcns_outputdir = os.path.join(cnvdir_sample, tumor + '_' + gene + '_gene.seg.cns')
  101. cnvsegcmd = 'cnvkit.py segment ' + sample_gene_cnrinput + ' -o ' + newcns_outputdir + ' -m none --smooth-cbs'
  102. os.system(cnvsegcmd)
  103. #为了方便核对以后的结果,我们也需要将新的进行拷贝
  104. newgenecns_cpname=os.path.join(cnvdir_sample, tumor + '_' + gene + '_gene.seg.unique.cns')
  105. cpfilenew = 'cp -r ' + newcns_outputdir + ' ' + newgenecns_cpname
  106. os.system(cpfilenew)
  107. def gene_cnv_sample(inputpath,sampleid,tumor):
  108. ###读入数据
  109. cnvinputpath = os.path.join(inputpath, '2CNV_cnvkit_pair')
  110. cnvdir = os.path.join(cnvinputpath, sampleid + '_cnvkit')
  111. ####读取segment的基因信息
  112. seg_genelist = pd.read_table(cnvdir + '/' + tumor + '_seggene.list', names=['gene'])
  113. for i in range(len(seg_genelist)):
  114. genename = seg_genelist.loc[i, 'gene']
  115. segcnrdir = cnvdir + '/' + tumor + '_' + genename + '_gene.seg.cns'
  116. if os.path.exists(segcnrdir):
  117. count = len(open(segcnrdir, 'r').readlines())
  118. # 判断文件有几行,如果大于2行(2行包括了title)表示文件一个基因有多个exon扩增,那么需要重新执行cnv
  119. if count > 2:
  120. print(genename)
  121. cnv_gene_cns(inputpath, sampleid, tumor, genename)
  122. def cnv_result_sample(inputpath,sampleid,tumor):
  123. druggene_dir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_gene_infor20220525.txt'
  124. drugene_infor = pd.read_table(druggene_dir, sep='\t', header=0)
  125. drugene_infor.rename(columns={'chr': 'chromosome', 'start': 'exon_start', 'end': 'exon_end'}, inplace=True)
  126. druggene = pd.DataFrame(drugene_infor['gene'].drop_duplicates(keep='first'))
  127. druggene.reset_index(drop=True, inplace=True)
  128. # 获得基因标签数据
  129. genetype = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/oncokb_cancerGeneList_20220517_final.txt'
  130. genetypedata = pd.read_table(genetype, sep='\t', header=0)
  131. genetypedata.rename(columns={'label': 'gene_label'}, inplace=True)
  132. ###读入数据
  133. cnvinputpath = os.path.join(inputpath, '2CNV_cnvkit_pair')
  134. cnvdir = os.path.join(cnvinputpath, sampleid + '_cnvkit')
  135. # extrac the unseg cns
  136. #####提取unseg cns信息
  137. unseg_genelist = pd.read_table(cnvdir + '/' + tumor + '_unseggene.list', sep='\t', names=['gene'])
  138. cns_all = pd.read_table(cnvdir + '/' + tumor + '_clean.cns', sep='\t', header=0)
  139. cns_all.rename(columns={'chromosome': 'chr'}, inplace=True)
  140. unseg_cns_all = pd.DataFrame()
  141. for i in range(len(unseg_genelist)):
  142. gene = unseg_genelist.loc[i, 'gene']
  143. seg_genecns = cns_all[cns_all['gene'].str.contains(gene)]
  144. seg_genecns.reset_index(drop=True, inplace=True)
  145. seg_genecns.loc[0, 'gene'] = gene
  146. unseg_cns_all = unseg_cns_all.append(seg_genecns)
  147. unseg_cns_all.reset_index(drop=True, inplace=True)
  148. unseg_cns_all['Ratio'] = round(unseg_cns_all['log2'].apply(np.exp2), 2)
  149. ####读取segment的基因信息
  150. seg_cns_all = pd.DataFrame()
  151. seg_genelist = pd.read_table(cnvdir + '/' + tumor + '_seggene.list', names=['gene'])
  152. for i in range(len(seg_genelist)):
  153. genename = seg_genelist.loc[i, 'gene']
  154. segcnrdir = cnvdir + '/' + tumor + '_' + genename + '_gene.seg.cns'
  155. if os.path.exists(segcnrdir):
  156. seg_cns = pd.read_table(segcnrdir, sep='\t', header=0)
  157. seg_cns.rename(columns={'chromosome': 'chr'}, inplace=True)
  158. seg_cns_all = seg_cns.append(seg_cns_all)
  159. else:
  160. print('there is no gene')
  161. continue
  162. seg_cns_all = seg_cns_all[seg_cns_all['gene'] != '-']
  163. seg_cns_all.reset_index(drop=True, inplace=True)
  164. seg_cns_all['Ratio'] = round(seg_cns_all['log2'].apply(np.exp2), 2)
  165. seg_cns_all['Copynum']=round(seg_cns_all['Ratio']*2,2)
  166. ###将segementh与unsegment进行合并
  167. all_cns = unseg_cns_all.append(seg_cns_all)
  168. all_cns.reset_index(drop=True, inplace=True)
  169. # 对拷贝数,筛选大于3的为扩增,小于1为缺失
  170. all_cns_filter = all_cns[(all_cns['Ratio'] > CNVamplicon) | (all_cns['Ratio'] < CNVloss)]
  171. all_cns_filter.reset_index(drop=True, inplace=True)
  172. ##添加标签
  173. titlelist = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2', 'Ratio','Copynum', 'depth', 'CNV_label', 'Gene_label']
  174. if len(all_cns_filter) != 0:
  175. for m in range(len(all_cns_filter)):
  176. if all_cns_filter.loc[m, 'Ratio'] > CNVamplicon:
  177. all_cns_filter.loc[m, 'CNV_label'] = 'Amplification'
  178. elif all_cns_filter.loc[m, 'Ratio'] < CNVloss:
  179. all_cns_filter.loc[m, 'CNV_label'] = 'CopyNumberLoss'
  180. # select the CNV of drug related gene
  181. cnv_druggene = pd.merge(all_cns_filter, druggene, on=['gene'], how='inner')
  182. if len(cnv_druggene) != 0:
  183. # add the gene label oncogene or tumor suppersor
  184. drug_CNV_filter_genelabel = pd.merge(cnv_druggene, genetypedata, on=['gene'], how='left')
  185. drug_CNV_filter_genelabel.insert(0, 'sampleid', sampleid)
  186. drug_CNV_filter_genelabel.rename(columns={'gene_label':'Gene_label'},inplace=True)
  187. cnv_result = drug_CNV_filter_genelabel[titlelist]
  188. else:
  189. cnv_result = pd.DataFrame(columns=titlelist)
  190. cnv_result.loc[0, 'sampleid'] = sampleid
  191. cnv_result.loc[0, 'CNV_label'] = 'No result'
  192. else:
  193. cnv_result = pd.DataFrame(columns=titlelist)
  194. cnv_result.loc[0, 'sampleid'] = sampleid
  195. cnv_result.loc[0, 'CNV_label'] = 'No result'
  196. outputfile1 = os.path.join(cnvinputpath, sampleid + '.cnv.txt')
  197. cnv_result.to_csv(outputfile1, index=False, header=True, encoding='gbk', sep='\t')
  198. ###输出unfilte的结果
  199. def cnv_result_sample_unfilter(inputpath,sampleid,tumor):
  200. druggene_dir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_gene_infor20220525.txt'
  201. drugene_infor = pd.read_table(druggene_dir, sep='\t', header=0)
  202. drugene_infor.rename(columns={'chr': 'chromosome', 'start': 'exon_start', 'end': 'exon_end'}, inplace=True)
  203. druggene = pd.DataFrame(drugene_infor['gene'].drop_duplicates(keep='first'))
  204. druggene['druggene_label']='Drug_gene'
  205. druggene.reset_index(drop=True, inplace=True)
  206. # 获得基因标签数据
  207. genetype = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/oncokb_cancerGeneList_20220517_final.txt'
  208. genetypedata = pd.read_table(genetype, sep='\t', header=0)
  209. genetypedata.rename(columns={'label': 'gene_label'}, inplace=True)
  210. ###读入数据
  211. cnvinputpath = os.path.join(inputpath, '2CNV_cnvkit_pair')
  212. cnvdir = os.path.join(cnvinputpath, sampleid + '_cnvkit')
  213. # extrac the unseg cns
  214. #####提取unseg cns信息
  215. unseg_genelist = pd.read_table(cnvdir + '/' + tumor + '_unseggene.list', sep='\t', names=['gene'])
  216. cns_all = pd.read_table(cnvdir + '/' + tumor + '_clean.cns', sep='\t', header=0)
  217. cns_all.rename(columns={'chromosome': 'chr'}, inplace=True)
  218. unseg_cns_all = pd.DataFrame()
  219. for i in range(len(unseg_genelist)):
  220. gene = unseg_genelist.loc[i, 'gene']
  221. seg_genecns = cns_all[cns_all['gene'].str.contains(gene)]
  222. seg_genecns.reset_index(drop=True, inplace=True)
  223. seg_genecns.loc[0, 'gene'] = gene
  224. unseg_cns_all = unseg_cns_all.append(seg_genecns)
  225. unseg_cns_all.reset_index(drop=True, inplace=True)
  226. unseg_cns_all['Ratio'] = round(unseg_cns_all['log2'].apply(np.exp2), 2)
  227. unseg_cns_all['Copynum'] = round(unseg_cns_all['Ratio'] * 2, 2)
  228. ####读取segment的基因信息
  229. seg_cns_all = pd.DataFrame()
  230. seg_genelist = pd.read_table(cnvdir + '/' + tumor + '_seggene.list', names=['gene'])
  231. for i in range(len(seg_genelist)):
  232. genename = seg_genelist.loc[i, 'gene']
  233. segcnrdir = cnvdir + '/' + tumor + '_' + genename + '_gene.seg.cns'
  234. try:
  235. seg_cns = pd.read_table(segcnrdir, sep='\t', header=0)
  236. seg_cns.rename(columns={'chromosome': 'chr'}, inplace=True)
  237. seg_cns_all = seg_cns.append(seg_cns_all)
  238. except:
  239. print('there is no gene')
  240. continue
  241. seg_cns_all = seg_cns_all[seg_cns_all['gene'] != '-']
  242. seg_cns_all.reset_index(drop=True, inplace=True)
  243. seg_cns_all['Ratio'] = round(seg_cns_all['log2'].apply(np.exp2), 2)
  244. seg_cns_all['Copynum'] = round(seg_cns_all['Ratio'] * 2, 2)
  245. ###将segementh与unsegment进行合并
  246. all_cns = unseg_cns_all.append(seg_cns_all)
  247. all_cns.reset_index(drop=True, inplace=True)
  248. ##添加标签
  249. titlelist = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2', 'Ratio','Copynum', 'depth', 'CNV_label','druggene_label', 'Gene_label']
  250. if len(all_cns) != 0:
  251. for m in range(len(all_cns)):
  252. if all_cns.loc[m, 'Ratio'] > CNVamplicon:
  253. all_cns.loc[m, 'CNV_label'] = 'Amplification'
  254. elif all_cns.loc[m, 'Ratio'] < CNVloss:
  255. all_cns.loc[m, 'CNV_label'] = 'CopyNumberLoss'
  256. else:
  257. all_cns.loc[m, 'CNV_label'] =''
  258. # select the CNV of drug related gene
  259. cnv_druggene = pd.merge(all_cns, druggene, on=['gene'], how='left')
  260. if len(cnv_druggene) != 0:
  261. # add the gene label oncogene or tumor suppersor
  262. drug_CNV_filter_genelabel = pd.merge(cnv_druggene, genetypedata, on=['gene'], how='left')
  263. drug_CNV_filter_genelabel.insert(0, 'sampleid', sampleid)
  264. drug_CNV_filter_genelabel.rename(columns={'gene_label':'Gene_label'},inplace=True)
  265. cnv_result = drug_CNV_filter_genelabel[titlelist]
  266. else:
  267. cnv_result = pd.DataFrame(columns=titlelist)
  268. cnv_result.loc[0, 'sampleid'] = sampleid
  269. cnv_result.loc[0, 'CNV_label'] = 'No result'
  270. else:
  271. cnv_result = pd.DataFrame(columns=titlelist)
  272. cnv_result.loc[0, 'sampleid'] = sampleid
  273. cnv_result.loc[0, 'CNV_label'] = 'No result'
  274. outputfile1 = os.path.join(cnvinputpath, sampleid + '.cnv.unfilter.txt')
  275. cnv_result.to_csv(outputfile1, index=False, header=True, encoding='gbk', sep='\t')
  276. if __name__=='__main__':
  277. parser = argparse.ArgumentParser(description='cnv data for cnvdata')
  278. parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
  279. parser.add_argument('-s', '--sampleid', type=str, help='sampleid')
  280. parser.add_argument('-t', '--tumor', type=str, help='tumorid')
  281. args = parser.parse_args()
  282. Inputpath = args.inputpath
  283. Sampleid = args.sampleid
  284. Tumor = args.tumor
  285. gene_cnv_sample(Inputpath,Sampleid,Tumor)
  286. cnv_result_sample(Inputpath,Sampleid,Tumor)
  287. cnv_result_sample_unfilter(Inputpath,Sampleid,Tumor)