s01_samplesheet_v0_20230420_finish.py 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153
  1. # -*- encoding: utf-8 -*-
  2. '''
  3. @File : s01_samplesheet_v0_20230420_finish.py
  4. @Time : 2023/04/20 11:35:55
  5. @Author : liuxiangqiong
  6. @Version : 0.1
  7. '''
  8. # here put the import lib
  9. import pandas as pd
  10. import os
  11. import argparse
  12. import re
  13. ## The project workbook has two sheets: Shanghai (上海) and Qidong (启东)
  14. ##修改,提取检测项目的编号,204或者602.
  15. def pancancer_project(inputpath,laneid):
  16. #inputpath = '/cgdata/liuxiangqiong/work62pancancer/pipelinetest-CGB0158'
  17. #laneid = 'CGB0158'
  18. ###0.拷贝CNV分级表
  19. rawcnvdir = '/cgdata/pancancer_report/refdata/CNV.xlsx'
  20. cnvtable = pd.read_excel(rawcnvdir)
  21. cnvtable['gene'] = cnvtable['gene'].str.strip()
  22. cnvtable['CNV_label'] = cnvtable['CNV_label'].str.strip()
  23. refcnvdir1 = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/CNV_tier_infor.txt'
  24. biocnvdir2 = '/cgdata/bioproject/pancancer602gene/Project_table/CNV_tier_infor.txt'
  25. cnvtable.to_csv(refcnvdir1, index=False, header=True, sep='\t')
  26. cnvtable.to_csv(biocnvdir2, index=False, header=True, sep='\t')
  27. ###1.从项目表提取出当前大panel的样本表
  28. project = pd.read_excel('/cgdata/bioproject/pancancer602gene/Project_table/阅尔基因项目信息总表-0606.xlsx',engine='openpyxl',sheet_name=None)
  29. #project = pd.read_excel('/cgdata/liuxiangqiong/work62pancancer/probe204/阅尔基因项目信息总表-0606.xlsx',engine='openpyxl',sheet_name=None)
  30. #提取出两个sheet的结果并追加合并
  31. project_SH=project['项目信息表-上海']
  32. project_QD=project['项目信息表-启东']
  33. project_all=project_QD.append(project_SH)
  34. project_all.reset_index(drop=True,inplace=True)
  35. ###2.1获得602的项目
  36. projectsample1 = project_all[project_all['检测项目1'] == '阅全-泛癌种602基因检测']
  37. projectsample2 = project_all[project_all['检测项目1'].str.contains('CPS', na=False)]
  38. projectsample_602=projectsample1.append(projectsample2)
  39. ##2.2获得204的项目
  40. projectsample_204 = project_all[project_all['检测项目1'].str.contains('CMS', na=False)]
  41. ##2.3将204和602项目合并
  42. projectsample=projectsample_602.append(projectsample_204)
  43. projectsample.reset_index(drop=True,inplace=True)
  44. outputfile1 = os.path.join('/cgdata/bioproject/pancancer602gene/Project_table/pancancer_allsamples.xlsx')
  45. #outputfile1 = os.path.join('/cgdata/bioproject/pancancer602gene/Project_table/pancancer_602gene_samples.xlsx')
  46. writer = pd.ExcelWriter(outputfile1)
  47. projectsample.to_excel(writer, sheet_name='sample', index=False)
  48. writer.save()
  49. writer.close()
  50. ####3.提取出当前我们panel分析样本的信息
  51. selectlist = ['样本编号', '样本类型', '临床诊断','检测项目1']
  52. selectsample = projectsample[selectlist]
  53. selectsample.reset_index(drop=True, inplace=True)
  54. selectsample['样本编号'] = selectsample['样本编号'].str.replace("^A", "Lib")
  55. ###3.1 我们从底层表中获取疾病的名称
  56. tumordbdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/TCGA_tumor_list_20220915.txt'
  57. tumordblist = pd.read_table(tumordbdir, sep='\t', header=0, encoding='gbk')
  58. # 添加TCGA的名称
  59. selectsample.rename(columns={'临床诊断': 'tumor_name', '样本编号': 'sampleid', '样本类型': 'sampletype','检测项目1':'project'}, inplace=True)
  60. selectsample_TCGA = pd.merge(selectsample, tumordblist, on=['tumor_name'], how='left')
  61. # 将不在TCGA的癌症名称标为unknown
  62. selectsample_TCGA['tumor_name'] = selectsample_TCGA['tumor_name'].fillna('unknown')
  63. # 3.2 获得当前的样本对应的信息
  64. ####提取出当前lane的样本对应的信息
  65. lane_sampelelist = pd.read_table(os.path.join(inputpath, laneid + '_sample.txt'), sep='\t', header=None, names=['sampleid'])
  66. lane_sampleinfor = pd.merge(lane_sampelelist, selectsample_TCGA, on=['sampleid'], how='left')
  67. # 输出原始信息表
  68. outputfile2 = os.path.join(inputpath, laneid + '_sampleinfor_raw.xlsx')
  69. writer = pd.ExcelWriter(outputfile2)
  70. lane_sampleinfor.to_excel(writer, sheet_name='sampleinfor', index=False)
  71. writer.save()
  72. writer.close()
  73. return lane_sampleinfor
  74. #inputpath='/cgdata/liuxiangqiong/work62pancancer/probe204'
  75. #laneid='CGB0434_2'
  76. #rawsheet=pancancer_project(inputpath,laneid)
  77. ##修改,加上检测项目编号
  78. #且给出下机数据的样本编号。如果对应的需要修改,那么人工再改
  79. def project_pair(rawsheet):
  80. #inputfile = os.path.join(inputpath, laneid + '_sampleinfor_raw.xlsx')
  81. #rawsheet = pd.read_excel(inputfile)
  82. ##首先获得所有的肿瘤样本
  83. rawsheet_tumorid = rawsheet[rawsheet['sampleid'].str.contains('T')]
  84. rawsheet_tumorid.reset_index(drop=True, inplace=True)
  85. ###获得肿瘤样本名,构建配对表,因为不管正常如何命名,最后的normal也是根据肿瘤样本名,所以只需要看肿瘤样本名就可
  86. sample_pair = pd.DataFrame()
  87. for i in range(len(rawsheet_tumorid)):
  88. sample = rawsheet_tumorid.loc[i, 'sampleid']
  89. sampletype1 = rawsheet_tumorid.loc[i, 'sampletype']
  90. tumortype = rawsheet_tumorid.loc[i, 'Abbr']
  91. projectid=rawsheet_tumorid.loc[i, 'project']
  92. sample_pair.loc[i, 'samplename'] = sample
  93. #判断样本项目
  94. if 'CPS' in projectid:
  95. sample_pair.loc[i, 'projectid'] = 'Pan602'
  96. elif 'CMS' in projectid:
  97. sample_pair.loc[i,'projectid']='Pan204'
  98. else:
  99. sample_pair.loc[i,'projectid']='other'
  100. #获得normal新命名
  101. sample_pair.loc[i, 'normal'] = sample + 'CN'
  102. #判断样本类型和tumor新命名
  103. if sampletype1 == '血浆':
  104. sample_pair.loc[i, 'tumor'] = sample + 'CT'
  105. sample_pair.loc[i, 'sampletype'] = 'blood'
  106. sample_pair.loc[i, 'tumortype'] = tumortype
  107. sample_pair.loc[i, 'fastq_tumor'] = sample
  108. sample_pair.loc[i, 'fastq_normal'] = sample[:-2]+'N'
  109. elif sampletype1=='外周血':
  110. sample_pair.loc[i, 'tumor'] = sample + 'CT'
  111. sample_pair.loc[i, 'sampletype'] = 'blood'
  112. sample_pair.loc[i, 'tumortype'] = tumortype
  113. sample_pair.loc[i, 'fastq_tumor'] = sample
  114. sample_pair.loc[i, 'fastq_normal'] = sample[:-2]+'N'
  115. else:
  116. sample_pair.loc[i, 'tumor'] = sample + 'TT'
  117. sample_pair.loc[i, 'sampletype'] = 'FFPE'
  118. sample_pair.loc[i, 'tumortype'] = tumortype
  119. sample_pair.loc[i, 'fastq_tumor'] = sample
  120. sample_pair.loc[i, 'fastq_normal'] = sample[:-1]+'N'
  121. return sample_pair
  122. def sheetrunmain(inputpath,laneid):
  123. lane_sampleinfor=pancancer_project(inputpath,laneid)
  124. project_sample=project_pair(lane_sampleinfor)
  125. print('project_sample')
  126. print(project_sample)
  127. outputfile = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
  128. project_sample.to_csv(outputfile,sep='\t',header=True, index=False)
  129. print('finish')
  130. #inputpath='/cgdata/liuxiangqiong/work62pancancer/pipelinetest-CGB0158'
  131. #laneid='CGB0158'
  132. #sheetrunmain(inputpath,laneid)
  133. if __name__=='__main__':
  134. parser = argparse.ArgumentParser(description='sample pair')
  135. parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
  136. parser.add_argument('-l', '--laneid', type=str, help='laneid')
  137. args = parser.parse_args()
  138. Inputpath = args.inputpath
  139. Laneid = args.laneid
  140. sheetrunmain(Inputpath,Laneid)