s0_samplesheet_v0_20220915_finish.py 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
  1. #####
  2. import pandas as pd
  3. import xlrd
  4. import os,sys
  5. import argparse
#### Create the analysis directory layout
  7. def analis_dir(inputpath,laneid):
  8. #新建出报告的目录
  9. pancancer_dir = os.path.join('/cgdata/pancancer_report', laneid)
  10. if not os.path.exists(pancancer_dir):
  11. os.mkdir(pancancer_dir)
  12. svdir1 = os.path.join(inputpath, '1SV_varscan_pair')
  13. if not os.path.exists(svdir1):
  14. os.mkdir(svdir1)
  15. svdir2= os.path.join(inputpath, '1SV_vardict_pair')
  16. if not os.path.exists(svdir2):
  17. os.mkdir(svdir2)
  18. # for CNV
  19. CNVdir = os.path.join(inputpath, '2CNV_cnvkit_pair')
  20. if not os.path.exists(CNVdir):
  21. os.mkdir(CNVdir)
  22. # for MSI
  23. MSI_dir = os.path.join(inputpath, '3MSI_msisensor2_pair')
  24. if not os.path.exists(MSI_dir):
  25. os.mkdir(MSI_dir)
  26. # for germline result
  27. gemerlinedir = os.path.join(inputpath, '4Germline_unpair')
  28. if not os.path.exists(gemerlinedir):
  29. os.mkdir(gemerlinedir)
  30. # for HL result
  31. HLdir = os.path.join(inputpath, '5HL_gatk_unpair')
  32. if not os.path.exists(HLdir):
  33. os.mkdir(HLdir)
  34. # for fusion
  35. fusion_method1_dir = os.path.join(inputpath, '6Fusion_manta_pair')
  36. if not os.path.exists(fusion_method1_dir):
  37. os.mkdir(fusion_method1_dir)
  38. fusion_method2_dir = os.path.join(inputpath, '6Fusion_genefusion_unpair')
  39. if not os.path.exists(fusion_method2_dir):
  40. os.mkdir(fusion_method2_dir)
  41. #for HLA
  42. HLA_dir = os.path.join(inputpath, '7HLA-HD_unpair')
  43. if not os.path.exists(HLA_dir):
  44. os.mkdir(HLA_dir)
  45. # for qc
  46. qc_dir = os.path.join(inputpath, '8Fastqc')
  47. if not os.path.exists(qc_dir):
  48. os.mkdir(qc_dir)
  49. # for ontarget
  50. ontarget_dir = os.path.join(inputpath, '9Ontarget')
  51. if not os.path.exists(ontarget_dir):
  52. os.mkdir(ontarget_dir)
  53. # for coverage
  54. coverage_dir = os.path.join(inputpath, '10Coverage')
  55. if not os.path.exists(coverage_dir):
  56. os.mkdir(coverage_dir)
  57. # datasummary
  58. datasummary_dir = os.path.join(inputpath, 'datasummary')
  59. if not os.path.exists(datasummary_dir):
  60. os.mkdir(datasummary_dir)
  61. def pancancer_project(inputpath,laneid):
  62. #inputpath = '/cgdata/liuxiangqiong/work62pancancer/pipelinetest-CGB0158'
  63. #laneid = 'CGB0158'
  64. ###1.从项目表提取出当前大panel的样本表
  65. project = pd.read_excel('/cgdata/liuxiangqiong/work62pancancer/Project_table/阅尔基因项目信息总表-0606.xlsx')
  66. ###2.获得当前的项目,更新当前的样本表
  67. projectsample = project[project['检测项目1'] == '阅全-泛癌种602基因检测']
  68. outputfile1 = os.path.join('/cgdata/pancancer/project/pancancer_602gene_samples.xlsx')
  69. writer = pd.ExcelWriter(outputfile1)
  70. projectsample.to_excel(writer, sheet_name='QC', index=False)
  71. writer.save()
  72. writer.close()
  73. ####3.提取出当前我们panel分析样本的信息
  74. selectlist = ['样本编号', '样本类型', '临床诊断']
  75. selectsample = projectsample[selectlist]
  76. selectsample.reset_index(drop=True, inplace=True)
  77. selectsample['样本编号'] = selectsample['样本编号'].str.replace("^A", "Lib")
  78. ###3.1 我们从底层表中获取疾病的名称
  79. tumordbdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/TCGA_tumor_list_20220915.txt'
  80. tumordblist = pd.read_table(tumordbdir, sep='\t', header=0, encoding='gbk')
  81. # 添加TCGA的名称
  82. selectsample.rename(columns={'临床诊断': 'tumor_name', '样本编号': 'sampleid', '样本类型': 'sampletype'}, inplace=True)
  83. selectsample_TCGA = pd.merge(selectsample, tumordblist, on=['tumor_name'], how='left')
  84. # 将不在TCGA的癌症名称标为unknown
  85. selectsample_TCGA['tumor_name'] = selectsample_TCGA['tumor_name'].fillna('unknown')
  86. # 3.2 获得当前的样本对应的信息
  87. ####提取出当前lane的样本对应的信息
  88. lane_sampelelist = pd.read_table(os.path.join(inputpath, laneid + '_sample.txt'), sep='\t', header=None, names=['sampleid'])
  89. lane_sampleinfor = pd.merge(lane_sampelelist, selectsample_TCGA, on=['sampleid'], how='left')
  90. # 输出原始信息表
  91. outputfile2 = os.path.join(inputpath, laneid + '_sampleinfor_raw.xlsx')
  92. writer = pd.ExcelWriter(outputfile2)
  93. lane_sampleinfor.to_excel(writer, sheet_name='sampleinfor', index=False)
  94. writer.save()
  95. writer.close()
  96. return lane_sampleinfor
##################### Build the paired-sample table
  98. def samplepair(inputpath,laneid,lane_sampleinfor):
  99. ###4.1获得样本名
  100. samplenamelist = pd.DataFrame()
  101. for i in range(len(lane_sampleinfor)):
  102. sample = lane_sampleinfor.loc[i, 'sampleid'][:-1]
  103. samplenamelist.loc[i, 'samplename'] = sample
  104. samplenamelist_uq = samplenamelist.drop_duplicates().reset_index(drop=True)
  105. ###4.2制作配对样本表
  106. sample_pair = pd.DataFrame()
  107. for j in range(len(samplenamelist_uq)):
  108. sample = samplenamelist_uq.loc[j, 'samplename']
  109. # sample_pair.loc[j, 'samplename'] = sample
  110. for k in range(len(lane_sampleinfor)):
  111. sampleid = lane_sampleinfor.loc[k, 'sampleid']
  112. sampletype1 = lane_sampleinfor.loc[k, 'sampletype']
  113. tumortype = lane_sampleinfor.loc[k, 'Abbr']
  114. print(sampleid)
  115. label = sampleid.find(sample)
  116. # print(label)
  117. if label != -1:
  118. # 修改样本名
  119. sample_pair.loc[j, 'samplename'] = sample + 'T'
  120. if (sampletype1 == '外周血') and (sampleid[-1] == 'N'):
  121. sample_pair.loc[j, 'normal'] = sample + 'T' + 'CN'
  122. elif sampletype1 == '血浆':
  123. sample_pair.loc[j, 'tumor'] = sample + 'T' + 'CT'
  124. sample_pair.loc[j, 'sampletype'] = 'blood'
  125. sample_pair.loc[j, 'tumortype'] = tumortype
  126. else:
  127. sample_pair.loc[j, 'tumor'] = sample + 'T' + 'TT'
  128. sample_pair.loc[j, 'sampletype'] = 'FFPE'
  129. sample_pair.loc[j, 'tumortype'] = tumortype
  130. # output
  131. outputfile3 = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
  132. sample_pair.to_csv(outputfile3, index=False, header=True, encoding='gbk', sep='\t')
  133. def sheetrunmain(inputpath,laneid):
  134. analis_dir(inputpath,laneid)
  135. lane_sampleinfor=pancancer_project(inputpath,laneid)
  136. samplepair(inputpath, laneid, lane_sampleinfor)
# Example of a manual run:
# inputpath = '/cgdata/liuxiangqiong/work62pancancer/pipelinetest-CGB0158'
# laneid = 'CGB0158'
# sheetrunmain(inputpath, laneid)
  140. if __name__=='__main__':
  141. parser = argparse.ArgumentParser(description='sample infor')
  142. parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
  143. parser.add_argument('-l', '--laneid', type=str, help='laneid')
  144. args = parser.parse_args()
  145. Inputpath = args.inputpath
  146. Laneid = args.laneid
  147. sheetrunmain(Inputpath,Laneid)