"""Prepare the analysis workspace and sample sheets for one pan-cancer lane.

The script
  1. creates the per-lane report directory and the analysis sub-directories,
  2. extracts the panel samples from the master project workbook and joins
     them with the tumor-name reference table and the lane's sample list,
  3. writes a tumor/normal pairing table for the lane.
"""
import argparse
import os
import sys

import pandas as pd

try:
    # Not referenced directly anywhere in this file; kept for environments
    # that expect it to be imported, but its absence is not fatal here.
    import xlrd  # noqa: F401
except ImportError:
    xlrd = None

# Root directory under which one report directory per lane is created.
PANCANCER_REPORT_ROOT = '/cgdata/pancancer_report'
# Master project workbook and the value of column '检测项目1' selecting the panel.
PROJECT_TABLE_PATH = '/cgdata/liuxiangqiong/work62pancancer/Project_table/阅尔基因项目信息总表-0606.xlsx'
PANEL_PROJECT_NAME = '阅全-泛癌种602基因检测'
# Workbook that receives the refreshed list of all samples of the panel.
PANEL_SAMPLES_PATH = '/cgdata/pancancer/project/pancancer_602gene_samples.xlsx'
# Tab-separated, GBK-encoded tumor reference table (merged on 'tumor_name').
TUMOR_LIST_PATH = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/TCGA_tumor_list_20220915.txt'

# Sub-directories created inside the lane input path, in creation order.
ANALYSIS_SUBDIRS = (
    '1SV_varscan_pair',
    '1SV_vardict_pair',
    '2CNV_cnvkit_pair',
    '3MSI_msisensor2_pair',
    '4Germline_unpair',
    '5HL_gatk_unpair',
    '6Fusion_manta_pair',
    '6Fusion_genefusion_unpair',
    '7HLA-HD_unpair',
    '8Fastqc',
    '9Ontarget',
    '10Coverage',
    'datasummary',
)


def analis_dir(inputpath, laneid):
    """Create the report directory for *laneid* and all analysis sub-directories.

    Args:
        inputpath: Lane working directory that receives the sub-directories
            listed in ``ANALYSIS_SUBDIRS``.
        laneid: Lane identifier; names the directory created under
            ``PANCANCER_REPORT_ROOT``.

    Existing directories are left untouched. Missing parent directories are
    created as well.
    """
    os.makedirs(os.path.join(PANCANCER_REPORT_ROOT, laneid), exist_ok=True)
    for subdir in ANALYSIS_SUBDIRS:
        os.makedirs(os.path.join(inputpath, subdir), exist_ok=True)


def _write_excel(frame, path, sheet_name):
    """Write *frame* to a single-sheet workbook at *path* without the index."""
    # The context manager saves and closes the workbook; ExcelWriter.save()
    # no longer exists in pandas >= 2.0.
    with pd.ExcelWriter(path) as writer:
        frame.to_excel(writer, sheet_name=sheet_name, index=False)


def pancancer_project(inputpath, laneid):
    """Build the sample-information table for the samples of one lane.

    Reads the master project workbook, keeps the rows of the panel project,
    refreshes the panel sample workbook, joins the selected columns with the
    tumor reference table and finally with the lane's sample list
    (``<inputpath>/<laneid>_sample.txt``, one sample id per line).

    Args:
        inputpath: Lane working directory containing ``<laneid>_sample.txt``.
        laneid: Lane identifier used as file-name prefix.

    Returns:
        DataFrame with one row per sample id of the lane (left join, so
        samples missing from the project table have NaN in the other
        columns). Also written to ``<inputpath>/<laneid>_sampleinfor_raw.xlsx``.
    """
    # 1. Extract the samples of the panel from the project table and refresh
    #    the panel-wide sample workbook.
    project = pd.read_excel(PROJECT_TABLE_PATH)
    projectsample = project[project['检测项目1'] == PANEL_PROJECT_NAME]
    _write_excel(projectsample, PANEL_SAMPLES_PATH, 'QC')

    # 2. Keep only the columns needed downstream. Work on an explicit copy so
    #    the column assignment below does not target a view of `project`.
    selectsample = projectsample[['样本编号', '样本类型', '临床诊断']].copy()
    selectsample = selectsample.reset_index(drop=True)
    # '^A' is a regular expression (leading "A" -> "Lib"); regex=True must be
    # explicit because pandas >= 2.0 treats the pattern literally by default.
    selectsample['样本编号'] = selectsample['样本编号'].str.replace(
        r'^A', 'Lib', regex=True)
    selectsample = selectsample.rename(columns={
        '临床诊断': 'tumor_name',
        '样本编号': 'sampleid',
        '样本类型': 'sampletype',
    })

    # 3. Attach the reference columns of the tumor table by tumor name.
    tumordblist = pd.read_table(TUMOR_LIST_PATH, sep='\t', header=0,
                                encoding='gbk')
    selectsample_TCGA = pd.merge(selectsample, tumordblist,
                                 on=['tumor_name'], how='left')
    # Missing diagnoses become 'unknown'.
    # NOTE(review): this fills 'tumor_name' only; reference columns such as
    # 'Abbr' stay NaN for names absent from the tumor table - confirm that
    # this is the intended behaviour.
    selectsample_TCGA['tumor_name'] = (
        selectsample_TCGA['tumor_name'].fillna('unknown'))

    # 4. Restrict to the samples of the current lane.
    lane_sampelelist = pd.read_table(
        os.path.join(inputpath, laneid + '_sample.txt'),
        sep='\t', header=None, names=['sampleid'])
    lane_sampleinfor = pd.merge(lane_sampelelist, selectsample_TCGA,
                                on=['sampleid'], how='left')

    # 5. Write the raw information table.
    _write_excel(lane_sampleinfor,
                 os.path.join(inputpath, laneid + '_sampleinfor_raw.xlsx'),
                 'sampleinfor')
    return lane_sampleinfor


def samplepair(inputpath, laneid, lane_sampleinfor):
    """Write the tumor/normal pairing table of a lane.

    The sample name is the sample id without its last character; all ids
    sharing the same name are collapsed into one output row.

    Args:
        inputpath: Directory receiving ``<laneid>_sample_infor_label.txt``.
        laneid: Lane identifier used as file-name prefix.
        lane_sampleinfor: DataFrame with the columns ``sampleid``,
            ``sampletype`` and ``Abbr``.

    The output is tab-separated and GBK-encoded. Per row it contains
    ``samplename`` (name + 'T') and, depending on the members of the group:
      * ``normal``  = name + 'TCN' for type '外周血' with an id ending in 'N';
      * ``tumor``   = name + 'TCT', ``sampletype`` 'blood' for type '血浆';
      * ``tumor``   = name + 'TTT', ``sampletype`` 'FFPE' for everything else;
    plus ``tumortype`` taken from ``Abbr`` of the tumor sample.
    """
    # One record per sample name, in order of first appearance. Dict insertion
    # order also fixes the column order of the resulting table.
    records = {}
    if not lane_sampleinfor.empty:
        rows = zip(lane_sampleinfor['sampleid'],
                   lane_sampleinfor['sampletype'],
                   lane_sampleinfor['Abbr'])
        for sampleid, sampletype1, tumortype in rows:
            # Group by the exact name (id minus last character). A substring
            # test would let e.g. 'Lib1' also capture 'Lib12T'.
            sample = sampleid[:-1]
            record = records.setdefault(sample, {'samplename': sample + 'T'})
            if sampletype1 == '外周血' and sampleid[-1] == 'N':
                record['normal'] = sample + 'T' + 'CN'
            elif sampletype1 == '血浆':
                record['tumor'] = sample + 'T' + 'CT'
                record['sampletype'] = 'blood'
                record['tumortype'] = tumortype
            else:
                record['tumor'] = sample + 'T' + 'TT'
                record['sampletype'] = 'FFPE'
                record['tumortype'] = tumortype

    sample_pair = pd.DataFrame(list(records.values()))
    outputfile3 = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
    sample_pair.to_csv(outputfile3, index=False, header=True,
                       encoding='gbk', sep='\t')


def sheetrunmain(inputpath, laneid):
    """Run the three steps for one lane: directories, sample info, pairing."""
    analis_dir(inputpath, laneid)
    lane_sampleinfor = pancancer_project(inputpath, laneid)
    samplepair(inputpath, laneid, lane_sampleinfor)


def main():
    """Parse the command line and process the requested lane."""
    parser = argparse.ArgumentParser(description='sample infor')
    # Both options are mandatory: without them the path joins would fail
    # later with an unhelpful TypeError.
    parser.add_argument('-i', '--inputpath', type=str, required=True,
                        help='the path of lane')
    parser.add_argument('-l', '--laneid', type=str, required=True,
                        help='laneid')
    args = parser.parse_args()
    sheetrunmain(args.inputpath, args.laneid)


if __name__ == '__main__':
    main()