datafile_cnv_v0_20220919_finish.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349
  1. import pandas as pd
  2. import os
  3. import os.path
  4. import xlrd
  5. import argparse
  6. ####修改
  7. #经过统计男性X染色体上Ratio的中值为0.5,女性为1.
  8. #除性染色体外,对其他染色体上所有ratio小于0.6的cnv统计,发下有很多的cnvloss主要集中在Ratio[0.5,0.6]之间
  9. #但仍有很多集中在0.3-0.5之间。为了获得更为严苛的cnvloss.修订以下规则
  10. #1.筛选depth大于等于1000且ratio<=0.3的突变。
  11. ##2.对于扩增,组织,筛选ratio>=1.5;对于血液,筛选ratio>=4
  12. #3.去除化疗相关基因
  13. def cnv_sample_run_test(inputpath,sample):
  14. laneid = inputpath.split('/')[-1].split('-')[-1]
  15. panel_genelistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/602gene_select.txt'
  16. panel_genelist = pd.read_table(panel_genelistdir, sep='\t', header=0)
  17. druglistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_genelist_20220829.txt'
  18. drug_genelist = pd.read_table(druglistdir, sep='\t', header=None, names=['gene'])
  19. drug_genelist['drug_lable'] = 'Drug_gene'
  20. namelist = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2','Ratio', 'Copynum', 'depth', 'CNV_label','Gene_label']
  21. namelistout = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2', 'Copynum', 'depth', 'CNV_label','Gene_label']
  22. datasummarydir = os.path.join(inputpath, 'datasummary')
  23. isExists = os.path.exists(datasummarydir)
  24. if not isExists:
  25. os.makedirs(datasummarydir)
  26. cnvdir = os.path.join(inputpath, '2CNV_cnvkit_pair')
  27. sampledir=os.path.join(inputpath,laneid+'_sample_infor_label.txt')
  28. samplelist = pd.read_table(sampledir, sep='\t')
  29. sampledata = samplelist[samplelist['samplename'] == sample]
  30. sampledata.reset_index(drop=True, inplace=True)
  31. sample = sampledata.loc[0, 'samplename']
  32. sampletype = sampledata.loc[0, 'sampletype']
  33. # make the result dir
  34. result_dir = os.path.join(inputpath, 'resultfile')
  35. if not os.path.exists(result_dir):
  36. os.mkdir(result_dir)
  37. sample_dir = os.path.join(result_dir, sample)
  38. if not os.path.exists(sample_dir):
  39. os.mkdir(sample_dir)
  40. cnvdatadir = cnvdir + '/' + sample + '.cnv.unfilter.txt'
  41. if os.path.exists(cnvdatadir):
  42. cnvdata = pd.read_table(cnvdatadir, sep='\t', header=0)
  43. cnvfilter0 = cnvdata.dropna(subset=['CNV_label'])
  44. cnvfilter1 = cnvfilter0[cnvfilter0['CNV_label'] != '']
  45. # extract the panel gene
  46. cnvfilter = pd.merge(cnvfilter1, panel_genelist, on=['gene'], how='inner')
  47. cnvfilter.reset_index(drop=True, inplace=True)
  48. del cnvfilter['druggene_label']
  49. if len(cnvfilter) == 0:
  50. cnvfilter_final = pd.DataFrame(columns=namelist)
  51. cnvfilter_final.loc[0, 'sampleid'] = sample
  52. cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
  53. else:
  54. ##remove the drug gene
  55. CNVfilter_druglabel = pd.merge(cnvfilter, drug_genelist, on=['gene'], how='left')
  56. CNVfilter1 = CNVfilter_druglabel[CNVfilter_druglabel['drug_lable'] != 'Drug_gene']
  57. del CNVfilter1['drug_lable']
  58. if len(CNVfilter1) != 0:
  59. # 筛选CNV,对于组织,Ratio>=1.5或者Ratio<=0.3;对于血液Ratio>=4或者Ratio<=0.3
  60. if sampletype == 'FFPE':
  61. CNVfilter2 = CNVfilter1[(CNVfilter1['Ratio'] >= 1.5) | (CNVfilter1['Ratio'] <= 0.3)]
  62. elif sampletype == 'blood':
  63. CNVfilter2 = CNVfilter1[(CNVfilter1['Ratio'] >= 4) | (CNVfilter1['Ratio'] <= 0.3)]
  64. if len(CNVfilter2) != 0:
  65. ##对cnvloss进一步过滤,筛选depth>=1000
  66. cnvlossdata = CNVfilter2[CNVfilter2['CNV_label'] == 'CopyNumberLoss']
  67. cnvloss_filter = cnvlossdata[cnvlossdata['depth'] >= 1000]
  68. cnvfilter_final = CNVfilter2[CNVfilter2['CNV_label'] != 'CopyNumberLoss'].append(cnvloss_filter)
  69. cnvfilter_final.reset_index(drop=True, inplace=True)
  70. if len(cnvfilter_final) == 0:
  71. cnvfilter_final = pd.DataFrame(columns=namelist)
  72. cnvfilter_final.loc[0, 'sampleid'] = sample
  73. cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
  74. else:
  75. cnvfilter_final = pd.DataFrame(columns=namelist)
  76. cnvfilter_final.loc[0, 'sampleid'] = sample
  77. cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
  78. else:
  79. cnvfilter_final = pd.DataFrame(columns=namelist)
  80. cnvfilter_final.loc[0, 'sampleid'] = sample
  81. cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
  82. cnvfilter_output = cnvfilter_final[namelistout]
  83. outputfile1 = os.path.join(sample_dir, laneid + '-' + sample + '.cnv.xlsx')
  84. writer = pd.ExcelWriter(outputfile1)
  85. cnvfilter_output.to_excel(writer, sheet_name='CNV', index=False)
  86. writer.save()
  87. writer.close()
  88. else:
  89. cnvfilter_final = pd.DataFrame(columns=namelist)
  90. cnvfilter_final.loc[0, 'sampleid'] = sample
  91. cnvfilter_final.loc[0, 'CNV_label'] = 'No file'
  92. # make the temp dir
  93. temp_dir = os.path.join(inputpath, 'tempfile')
  94. if not os.path.exists(temp_dir):
  95. os.mkdir(temp_dir)
  96. bugfile_dir = os.path.join(temp_dir, 'bugfile')
  97. if not os.path.exists(bugfile_dir):
  98. os.mkdir(bugfile_dir)
  99. outputfile2 = os.path.join(bugfile_dir, laneid + '-' + sample + '.cnv.nofile.log.txt')
  100. cnvfilter_final.to_csv(outputfile2, index=False, header=True, encoding='gbk', sep='\t')
  101. ####加入在样本中出现的次数
  102. def CNVcounts(cnvfilter,inputpath,sample):
  103. # add the cnv counts in 117 samples
  104. cnvbgdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/CNV_117sample_stastic_risk_20220929.txt'
  105. cnvbgdata = pd.read_table(cnvbgdir, sep='\t', header=0)
  106. cnvfilter['CNV_gene'] = cnvfilter['gene'] + '_' + cnvfilter['chr'].astype('str') + '_' + cnvfilter['start'].astype('str') + '_' + cnvfilter['end'].astype('str')
  107. cnvfilter_counts = pd.merge(cnvfilter, cnvbgdata, on=['CNV_gene'], how='left')
  108. del cnvfilter_counts['CNV_gene']
  109. #add the tier infor
  110. cnvtierdir1 = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/CNV_tier_infor.txt'
  111. cnvtierdata= pd.read_table(cnvtierdir1, sep='\t', header=0)
  112. cnvfilter_counts_tier=pd.merge(cnvfilter_counts,cnvtierdata,on=['gene','CNV_label'],how='left')
  113. # make the temp dir
  114. temp_dir = os.path.join(inputpath, 'tempfile')
  115. if not os.path.exists(temp_dir):
  116. os.mkdir(temp_dir)
  117. tempcnv_dir = os.path.join(temp_dir, 'CNV')
  118. if not os.path.exists(tempcnv_dir):
  119. os.mkdir(tempcnv_dir)
  120. outputname = os.path.join(tempcnv_dir, sample + '_CNVunfilter_samplecounts.txt')
  121. cnvfilter_counts_tier.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')
  122. return cnvfilter_counts_tier
  123. ####修改
  124. #经过统计男性X染色体上Ratio的中值为0.5,女性为1.
  125. #除性染色体外,对其他染色体上所有ratio小于0.6的cnv统计,发下有很多的cnvloss主要集中在Ratio[0.5,0.6]之间
  126. #但仍有很多集中在0.3-0.5之间。为了获得更为严苛的cnvloss.修订以下规则
  127. #1.筛选depth大于等于1000且ratio<=0.3的突变。
  128. ##2.对于扩增,组织,筛选ratio>=2;对于血液,筛选ratio>=4
  129. #3.去除化疗相关基因
  130. def cnv_sample_run(inputpath,sample):
  131. laneid = inputpath.split('/')[-1].split('-')[-1]
  132. panel_genelistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/602gene_select.txt'
  133. panel_genelist = pd.read_table(panel_genelistdir, sep='\t', header=0)
  134. druglistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_genelist_20220829.txt'
  135. drug_genelist = pd.read_table(druglistdir, sep='\t', header=None, names=['gene'])
  136. drug_genelist['drug_lable'] = 'Drug_gene'
  137. namelist = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2','Ratio', 'Copynum', 'depth', 'CNV_label','Gene_label','Tier_label']
  138. namelistout = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2', 'Copynum', 'depth', 'CNV_label','Gene_label','Tier_label']
  139. datasummarydir = os.path.join(inputpath, 'datasummary')
  140. isExists = os.path.exists(datasummarydir)
  141. if not isExists:
  142. os.makedirs(datasummarydir)
  143. cnvdir = os.path.join(inputpath, '2CNV_cnvkit_pair')
  144. sampledir=os.path.join(inputpath,laneid+'_sample_infor_label.txt')
  145. samplelist = pd.read_table(sampledir, sep='\t')
  146. sampledata = samplelist[samplelist['samplename'] == sample]
  147. sampledata.reset_index(drop=True, inplace=True)
  148. sampletype = sampledata.loc[0, 'sampletype']
  149. # make the result dir
  150. result_dir = os.path.join(inputpath, 'resultfile')
  151. if not os.path.exists(result_dir):
  152. os.mkdir(result_dir)
  153. sample_dir = os.path.join(result_dir, sample)
  154. if not os.path.exists(sample_dir):
  155. os.mkdir(sample_dir)
  156. cnvdatadir = cnvdir + '/' + sample + '.cnv.unfilter.txt'
  157. if os.path.exists(cnvdatadir):
  158. cnvdata = pd.read_table(cnvdatadir, sep='\t', header=0)
  159. cnvfilter0 = cnvdata.dropna(subset=['CNV_label'])
  160. cnvfilter1 = cnvfilter0[cnvfilter0['CNV_label'] != '']
  161. # extract the panel gene
  162. cnvfilter1_0 = pd.merge(cnvfilter1, panel_genelist, on=['gene'], how='inner')
  163. cnvfilter1_0 .reset_index(drop=True, inplace=True)
  164. del cnvfilter1_0['druggene_label']
  165. # add the cnv counts in 117 samples
  166. cnvfilter = CNVcounts(cnvfilter1_0, inputpath, sample)
  167. if len(cnvfilter) == 0:
  168. cnvfilter_final = pd.DataFrame(columns=namelist)
  169. cnvfilter_final.loc[0, 'sampleid'] = sample
  170. cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
  171. else:
  172. ##remove the drug gene
  173. CNVfilter_druglabel = pd.merge(cnvfilter, drug_genelist, on=['gene'], how='left')
  174. CNVfilter1 = CNVfilter_druglabel[CNVfilter_druglabel['drug_lable'] != 'Drug_gene']
  175. del CNVfilter1['drug_lable']
  176. if len(CNVfilter1) != 0:
  177. # 筛选CNV,对于组织,Ratio>=2或者Ratio<=0.3;对于血液Ratio>=4或者Ratio<=0.3
  178. if sampletype == 'FFPE':
  179. CNVfilter2 = CNVfilter1[(CNVfilter1['Ratio'] >= 2) | (CNVfilter1['Ratio'] <= 0.3)]
  180. elif sampletype == 'blood':
  181. CNVfilter2 = CNVfilter1[(CNVfilter1['Ratio'] >= 4) | (CNVfilter1['Ratio'] <= 0.3)]
  182. if len(CNVfilter2) != 0:
  183. ##对cnvloss进一步过滤,筛选depth>=1000
  184. cnvlossdata = CNVfilter2[CNVfilter2['CNV_label'] == 'CopyNumberLoss']
  185. cnvloss_filter = cnvlossdata[cnvlossdata['depth'] >= 1000]
  186. cnvfilter_cnvloss = CNVfilter2[CNVfilter2['CNV_label'] != 'CopyNumberLoss'].append(cnvloss_filter)
  187. cnvfilter_cnvloss.reset_index(drop=True, inplace=True)
  188. #筛选在117个样本中出现频率小于50%的CNV
  189. if len(cnvfilter_cnvloss) == 0:
  190. cnvfilter_final = pd.DataFrame(columns=namelist)
  191. cnvfilter_final.loc[0, 'sampleid'] = sample
  192. cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
  193. else:
  194. cnvfilter_final0 = cnvfilter_cnvloss[cnvfilter_cnvloss['Sample_ratio'] < 0.5]
  195. if len(cnvfilter_final0) != 0:
  196. cnvfilter_final = cnvfilter_final0.drop(labels=['Sample_ratio', 'Sample_counts'], axis=1)
  197. cnvfilter_final.reset_index(drop=True, inplace=True)
  198. else:
  199. cnvfilter_final = pd.DataFrame(columns=namelist)
  200. cnvfilter_final.loc[0, 'sampleid'] = sample
  201. cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
  202. else:
  203. cnvfilter_final = pd.DataFrame(columns=namelist)
  204. cnvfilter_final.loc[0, 'sampleid'] = sample
  205. cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
  206. else:
  207. cnvfilter_final = pd.DataFrame(columns=namelist)
  208. cnvfilter_final.loc[0, 'sampleid'] = sample
  209. cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
  210. cnvfilter_output = cnvfilter_final[namelistout]
  211. outputfile1 = os.path.join(sample_dir, laneid + '-' + sample + '.cnv.xlsx')
  212. writer = pd.ExcelWriter(outputfile1)
  213. cnvfilter_output.to_excel(writer, sheet_name='CNV', index=False)
  214. writer.save()
  215. writer.close()
  216. else:
  217. cnvfilter_final = pd.DataFrame(columns=namelist)
  218. cnvfilter_final.loc[0, 'sampleid'] = sample
  219. cnvfilter_final.loc[0, 'CNV_label'] = 'No file'
  220. # make the temp dir
  221. temp_dir = os.path.join(inputpath, 'tempfile')
  222. if not os.path.exists(temp_dir):
  223. os.mkdir(temp_dir)
  224. bugfile_dir = os.path.join(temp_dir, 'bugfile')
  225. if not os.path.exists(bugfile_dir):
  226. os.mkdir(bugfile_dir)
  227. outputfile2 = os.path.join(bugfile_dir, laneid + '-' + sample + '.cnv.nofile.log.txt')
  228. cnvfilter_final.to_csv(outputfile2, index=False, header=True, encoding='gbk', sep='\t')
  229. if __name__=='__main__':
  230. parser = argparse.ArgumentParser(description='filter the CNV')
  231. parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
  232. parser.add_argument('-s', '--sample', type=str, help='the sample name')
  233. args = parser.parse_args()
  234. Inputpath = args.inputpath
  235. Sample = args.sample
  236. cnv_sample_run(Inputpath,Sample)
  237. ####所有样本汇总不执行
  238. def cnv_datasummary(inputpath,laneid):
  239. panel_genelistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/602gene_select.txt'
  240. panel_genelist = pd.read_table(panel_genelistdir, sep='\t', header=0)
  241. druglistdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_genelist_20220829.txt'
  242. drug_genelist = pd.read_table(druglistdir, sep='\t', header=None, names=['gene'])
  243. drug_genelist['drug_lable'] = 'Drug_gene'
  244. namelist = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2','Ratio', 'Copynum', 'depth', 'CNV_label','Gene_label']
  245. namelistout = ['sampleid', 'gene', 'chr', 'start', 'end', 'log2', 'Copynum', 'depth', 'CNV_label','Gene_label']
  246. datasummarydir = os.path.join(inputpath, 'datasummary')
  247. isExists = os.path.exists(datasummarydir)
  248. if not isExists:
  249. os.makedirs(datasummarydir)
  250. cnvdir = os.path.join(inputpath, '2CNV_cnvkit_pair')
  251. cnvsummary = pd.DataFrame()
  252. sampledir=os.path.join(inputpath,laneid+'_sample_infor_label.txt')
  253. samplelist = pd.read_table(sampledir, sep='\t')
  254. for i in range(len(samplelist)):
  255. sample = samplelist.loc[i, 'samplename']
  256. sampletype=samplelist.loc[i, 'sampletype']
  257. # make the result dir
  258. result_dir = os.path.join(inputpath, 'resultfile')
  259. if not os.path.exists(result_dir):
  260. os.mkdir(result_dir)
  261. sample_dir = os.path.join(result_dir, sample)
  262. if not os.path.exists(sample_dir):
  263. os.mkdir(sample_dir)
  264. cnvdatadir = cnvdir + '/' + sample + '.cnv.unfilter.txt'
  265. if os.path.exists(cnvdatadir):
  266. cnvdata = pd.read_table(cnvdatadir, sep='\t', header=0)
  267. cnvfilter0=cnvdata.dropna(subset=['CNV_label'])
  268. cnvfilter1 = cnvfilter0[cnvfilter0['CNV_label']!='']
  269. # extract the panel gene
  270. cnvfilter = pd.merge(cnvfilter1, panel_genelist, on=['gene'], how='inner')
  271. cnvfilter.reset_index(drop=True,inplace=True)
  272. del cnvfilter['druggene_label']
  273. if len(cnvfilter)==0:
  274. cnvfilter_final = pd.DataFrame(columns=namelist)
  275. cnvfilter_final.loc[0, 'sampleid'] = sample
  276. cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
  277. else:
  278. ##remove the drug gene
  279. CNVfilter_druglabel = pd.merge(cnvfilter, drug_genelist, on=['gene'], how='left')
  280. CNVfilter1 = CNVfilter_druglabel[CNVfilter_druglabel['drug_lable'] != 'Drug_gene']
  281. del CNVfilter1['drug_lable']
  282. if len(CNVfilter1)!=0:
  283. # 筛选CNV,对于组织,Ratio>=1.5或者Ratio<=0.3;对于血液Ratio>=4或者Ratio<=0.3
  284. if sampletype=='FFPE':
  285. CNVfilter2 = CNVfilter1[(CNVfilter1['Ratio'] >= 1.5) | (CNVfilter1['Ratio'] <= 0.3)]
  286. elif sampletype=='blood':
  287. CNVfilter2 = CNVfilter1[(CNVfilter1['Ratio'] >= 4) | (CNVfilter1['Ratio'] <= 0.3)]
  288. if len(CNVfilter2)!=0:
  289. ##对cnvloss进一步过滤,筛选depth>=1000
  290. cnvlossdata = CNVfilter2[CNVfilter2['CNV_label'] == 'CopyNumberLoss']
  291. cnvloss_filter = cnvlossdata[cnvlossdata['depth'] >= 1000]
  292. cnvfilter_final = CNVfilter2[CNVfilter2['CNV_label'] != 'CopyNumberLoss'].append(cnvloss_filter)
  293. cnvfilter_final.reset_index(drop=True, inplace=True)
  294. if len(cnvfilter_final) == 0:
  295. cnvfilter_final = pd.DataFrame(columns=namelist)
  296. cnvfilter_final.loc[0, 'sampleid'] = sample
  297. cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
  298. else:
  299. cnvfilter_final = pd.DataFrame(columns=namelist)
  300. cnvfilter_final.loc[0, 'sampleid'] = sample
  301. cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
  302. else:
  303. cnvfilter_final = pd.DataFrame(columns=namelist)
  304. cnvfilter_final.loc[0, 'sampleid'] = sample
  305. cnvfilter_final.loc[0, 'CNV_label'] = 'No_cnv'
  306. cnvfilter_output=cnvfilter_final[namelistout]
  307. outputfile1 = os.path.join(sample_dir, laneid + '-' + sample + '.cnv.xlsx')
  308. writer = pd.ExcelWriter(outputfile1)
  309. cnvfilter_output.to_excel(writer, sheet_name='CNV', index=False)
  310. writer.save()
  311. writer.close()
  312. else:
  313. cnvfilter_final=pd.DataFrame(columns=namelist)
  314. cnvfilter_final.loc[0,'sampleid']=sample
  315. cnvfilter_final.loc[0,'CNV_label']='No file'
  316. cnvsummary = cnvsummary.append(cnvfilter_final)
  317. outputname = datasummarydir + '/' + laneid + '_table4_cnv_datasummary.txt'
  318. cnvsummary.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')