"""Build the per-lane sample pairing table for the pan-cancer 602-gene panel.

The script combines two sources of information:

* the project Excel workbook, from which the panel samples of the current
  lane and their tumor annotation are taken, and
* the per-sample sub-directories of the sequencer output, from which the
  tumor/normal FASTQ names are taken.

The merged table is written to ``<inputpath>/<laneid>_sample_infor_label.txt``.

Example:
    python <this_script> -i /path/to/lane -l CGB0158 -f /path/to/fastq_dirs
"""
import argparse
import os
import re
import sys

import pandas as pd

try:
    # Not referenced directly in this file; kept so the name stays in scope,
    # but no longer fatal when the package is absent.
    import xlrd  # noqa: F401
except ImportError:
    xlrd = None

# Source workbook of the CNV tier table and the two places it is exported to.
_CNV_SOURCE_XLSX = '/cgdata/pancancer_report/refdata/CNV.xlsx'
_CNV_TIER_OUTPUTS = (
    '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/CNV_tier_infor.txt',
    '/cgdata/bioproject/pancancer602gene/Project_table/CNV_tier_infor.txt',
)

# Project workbook and the names used inside it.
_PROJECT_XLSX = '/cgdata/bioproject/pancancer602gene/Project_table/阅尔基因项目信息总表-0606.xlsx'
_SHEET_SHANGHAI = '项目信息表-上海'
_SHEET_QIDONG = '项目信息表-启东'
_PROJECT_COLUMN = '检测项目1'
_PANEL_NAME = '阅全-泛癌种602基因检测'

_PANEL_SAMPLES_XLSX = '/cgdata/bioproject/pancancer602gene/Project_table/pancancer_602gene_samples.xlsx'
_TUMOR_DB_TXT = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/TCGA_tumor_list_20220915.txt'

# Sample types that are treated as blood; everything else is treated as FFPE.
_BLOOD_SAMPLE_TYPES = ('血浆', '外周血')

_PROJECT_PAIR_COLUMNS = ['samplename', 'normal', 'tumor', 'sampletype', 'tumortype']
_FASTQ_PAIR_COLUMNS = ['samplename', 'fastq_tumor', 'fastq_normal']

# NOTE(review): "[Lib]" is a character class (any run of 'L', 'i', 'b'), not
# the literal prefix "Lib". It is kept unchanged because tightening it could
# drop directories that are matched today - confirm the intended naming rule.
_CORE_ID_PATTERN = re.compile(r"[Lib]+[0-9]+")


def _write_excel(frame, path, sheet_name):
    """Write *frame* to a single-sheet workbook at *path* without the index."""
    # The context manager saves and closes the workbook; ExcelWriter.save()
    # no longer exists in pandas >= 2.0.
    with pd.ExcelWriter(path) as writer:
        frame.to_excel(writer, sheet_name=sheet_name, index=False)


def _refresh_cnv_tier_table():
    """Export the CNV tier workbook as tab-separated text to both target paths."""
    cnvtable = pd.read_excel(_CNV_SOURCE_XLSX)
    for column in ('gene', 'CNV_label'):
        cnvtable[column] = cnvtable[column].str.strip()
    for path in _CNV_TIER_OUTPUTS:
        cnvtable.to_csv(path, index=False, header=True, sep='\t')


def _lane_sample_info(project, inputpath, laneid):
    """Select the panel samples from *project* and annotate the lane's samples.

    Side effects: rewrites the panel sample workbook and writes
    ``<inputpath>/<laneid>_sampleinfor_raw.xlsx``.

    @param project: DataFrame  rows of the project table
    @param inputpath: str  lane directory containing ``<laneid>_sample.txt``
    @param laneid: str  lane identifier used as file-name prefix
    @return: DataFrame  one row per lane sample (left join, so samples that are
             missing from the project table keep empty annotation columns)
    """
    # Exact panel matches first, then every project whose name contains "CPS".
    is_panel = project[_PROJECT_COLUMN] == _PANEL_NAME
    is_cps = project[_PROJECT_COLUMN].str.contains('CPS', na=False)
    projectsample = pd.concat([project[is_panel], project[is_cps]], ignore_index=True)
    _write_excel(projectsample, _PANEL_SAMPLES_XLSX, 'QC')

    # Work on a copy so the column assignment below never targets a view.
    selectsample = projectsample[['样本编号', '样本类型', '临床诊断']].copy()
    # regex=True is required: since pandas 2.0 the default treats "^A" as a
    # literal string, which would silently replace nothing.
    selectsample['样本编号'] = selectsample['样本编号'].str.replace('^A', 'Lib', regex=True)
    selectsample = selectsample.rename(
        columns={'临床诊断': 'tumor_name', '样本编号': 'sampleid', '样本类型': 'sampletype'})

    # Attach the tumor reference columns by tumor name.
    tumordblist = pd.read_table(_TUMOR_DB_TXT, sep='\t', header=0, encoding='gbk')
    selectsample_tcga = pd.merge(selectsample, tumordblist, on=['tumor_name'], how='left')
    # NOTE(review): this fills the join key itself, i.e. only samples without a
    # clinical diagnosis become 'unknown'; diagnoses absent from the reference
    # table keep empty reference columns. Confirm whether those columns were
    # meant to be filled instead.
    selectsample_tcga['tumor_name'] = selectsample_tcga['tumor_name'].fillna('unknown')

    # Restrict to the samples listed for the current lane.
    lane_samples = pd.read_table(os.path.join(inputpath, laneid + '_sample.txt'),
                                 sep='\t', header=None, names=['sampleid'])
    lane_sampleinfor = pd.merge(lane_samples, selectsample_tcga, on=['sampleid'], how='left')

    _write_excel(lane_sampleinfor,
                 os.path.join(inputpath, laneid + '_sampleinfor_raw.xlsx'),
                 'sampleinfor')
    return lane_sampleinfor


def pancancer_project_v0(inputpath, laneid):
    """Collect lane sample information from a single-sheet project workbook.

    Legacy variant for the workbook layout with only one sheet.

    @param inputpath: str  lane directory containing ``<laneid>_sample.txt``
    @param laneid: str  lane identifier, e.g. 'CGB0158'
    @return: DataFrame  annotated samples of the lane
    """
    _refresh_cnv_tier_table()
    project = pd.read_excel(_PROJECT_XLSX, engine='openpyxl')
    return _lane_sample_info(project, inputpath, laneid)


def pancancer_project(inputpath, laneid):
    """Collect lane sample information from the two-sheet project workbook.

    The Qidong sheet is placed before the Shanghai sheet in the combined table.

    @param inputpath: str  lane directory containing ``<laneid>_sample.txt``
    @param laneid: str  lane identifier, e.g. 'CGB0158'
    @return: DataFrame  annotated samples of the lane
    """
    _refresh_cnv_tier_table()
    sheets = pd.read_excel(_PROJECT_XLSX, engine='openpyxl', sheet_name=None)
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    project_all = pd.concat([sheets[_SHEET_QIDONG], sheets[_SHEET_SHANGHAI]],
                            ignore_index=True)
    return _lane_sample_info(project_all, inputpath, laneid)


def project_pair(rawsheet):
    """Build the expected tumor/normal names for every tumor sample.

    Only rows whose ``sampleid`` contains 'T' are used; the normal name is
    derived from the tumor sample id, so normal rows are not needed.

    @param rawsheet: DataFrame  needs the columns 'sampleid', 'sampletype', 'Abbr'
    @return: DataFrame  columns samplename, normal, tumor, sampletype, tumortype
             (always present, also when no tumor sample is found)
    """
    # na=False: lane samples without a sample id must not break the mask.
    is_tumor = rawsheet['sampleid'].str.contains('T', na=False)
    tumors = rawsheet[is_tumor]

    rows = []
    for sample, source, tumortype in zip(tumors['sampleid'],
                                         tumors['sampletype'],
                                         tumors['Abbr']):
        is_blood = source in _BLOOD_SAMPLE_TYPES
        rows.append({
            'samplename': sample,
            'normal': sample + 'CN',
            'tumor': sample + ('CT' if is_blood else 'TT'),
            'sampletype': 'blood' if is_blood else 'FFPE',
            'tumortype': tumortype,
        })
    return pd.DataFrame(rows, columns=_PROJECT_PAIR_COLUMNS)


def subdir_list(dirname):
    """Return all immediate sub-directories of a directory.

    @param dirname: str  full path of the directory
    @return: list(str)  full paths of all sub-directories
    """
    candidates = (os.path.join(dirname, entry) for entry in os.listdir(dirname))
    return [path for path in candidates if os.path.isdir(path)]


def file_list(dirname, ext='.gz'):
    """Return all entries of a directory that have the given extension.

    @param dirname: str  full path of the directory
    @param ext: str  extension, starting with a dot
    @return: list(str)  matching entry names (without the directory path)
    """
    return [entry for entry in os.listdir(dirname)
            if os.path.splitext(entry)[1] == ext]


def fastq_pair(bclpath):
    """Pair tumor and normal FASTQ directories that share a core sample id.

    The FASTQ name of a sub-directory is the text after its last '-'. Names
    containing 'T' count as tumor, names containing 'N' as normal. Every tumor
    yields one row per normal of the same core id, or a single row with an
    empty ``fastq_normal`` when there is no normal. Sub-directories without a
    recognisable core id are reported on stderr and skipped.

    @param bclpath: str  directory holding one sub-directory per FASTQ sample
    @return: DataFrame  columns samplename, fastq_tumor, fastq_normal
             (always present, also when nothing could be paired)
    """
    # Group FASTQ names by core id; sorting makes the row order reproducible.
    names_by_core = {}
    for path in sorted(subdir_list(bclpath)):
        fastq_name = os.path.basename(path).split('-')[-1]
        match = _CORE_ID_PATTERN.search(fastq_name)
        if match is None:
            print('skip directory without sample id: ' + path, file=sys.stderr)
            continue
        names_by_core.setdefault(match.group(0), []).append(fastq_name)

    rows = []
    for names in names_by_core.values():
        tumors = [name for name in names if 'T' in name]
        # A lone placeholder keeps tumor-only samples in the table.
        normals = [name for name in names if 'N' in name] or [None]
        for tumor in tumors:
            for normal in normals:
                rows.append({
                    'samplename': tumor,
                    'fastq_tumor': tumor,
                    'fastq_normal': normal,
                })
    return pd.DataFrame(rows, columns=_FASTQ_PAIR_COLUMNS)


def sheetrunmain(inputpath, laneid, bclpath):
    """Merge project and FASTQ pairing tables and write the lane label file.

    @param inputpath: str  lane directory (input list and outputs live here)
    @param laneid: str  lane identifier used as file-name prefix
    @param bclpath: str  directory holding one sub-directory per FASTQ sample
    """
    lane_sampleinfor = pancancer_project(inputpath, laneid)
    project_sample = project_pair(lane_sampleinfor)
    fastq_sample = fastq_pair(bclpath)

    print('project_sample')
    print(project_sample)
    print('fastq_sample')
    print(fastq_sample)

    # Outer join keeps samples that appear on only one side visible.
    sample_infor_pair = pd.merge(project_sample, fastq_sample, on=['samplename'], how='outer')
    print('mergedata')
    print(sample_infor_pair)

    outputfile = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
    sample_infor_pair.to_csv(outputfile, sep='\t', header=True, index=False)
    print('finish')


def _main():
    """Parse the command line and run the pairing for one lane."""
    parser = argparse.ArgumentParser(description='sample pair')
    # All three values are needed to build paths, so fail early if one is missing.
    parser.add_argument('-i', '--inputpath', type=str, required=True, help='the path of lane')
    parser.add_argument('-l', '--laneid', type=str, required=True, help='laneid')
    parser.add_argument('-f', '--bclpath', type=str, required=True, help='the fastq file')
    args = parser.parse_args()
    sheetrunmain(args.inputpath, args.laneid, args.bclpath)


if __name__ == '__main__':
    _main()