123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228 |
- #####
- import pandas as pd
- import xlrd
- import os,sys
- import argparse
- import re
- #原项目表只有一个sheet的代码
def pancancer_project_v0(inputpath, laneid):
    """Build the sample information table of one lane (single-sheet workbook).

    Legacy variant for a project workbook that has only one sheet.

    Steps performed:
      0. Re-export the CNV tier workbook as two tab-separated text files.
      1. Read the project workbook and keep the rows of the 602-gene
         pan-cancer panel plus every row whose test item contains 'CPS';
         write that subset to ``pancancer_602gene_samples.xlsx``.
      2. Keep sample id / sample type / clinical diagnosis, rewrite a leading
         'A' of the sample id to 'Lib', and left-join the tumor reference
         table on ``tumor_name``.
      3. Left-join the lane's sample list (``<laneid>_sample.txt`` inside
         *inputpath*) with that table and write the result to
         ``<laneid>_sampleinfor_raw.xlsx`` inside *inputpath*.

    @param inputpath: str directory of the lane (input list and output file)
    @param laneid: str lane identifier used as the file-name prefix
    @return: pandas.DataFrame one row per lane sample (more if the project
             table holds duplicated sample ids) with the joined columns
    """
    # 0. Copy the CNV tier table, stripping stray whitespace from its keys.
    rawcnvdir = '/cgdata/pancancer_report/refdata/CNV.xlsx'
    cnvtable = pd.read_excel(rawcnvdir)
    for column in ('gene', 'CNV_label'):
        cnvtable[column] = cnvtable[column].str.strip()
    refcnvdir1 = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/CNV_tier_infor.txt'
    biocnvdir2 = '/cgdata/bioproject/pancancer602gene/Project_table/CNV_tier_infor.txt'
    for cnv_target in (refcnvdir1, biocnvdir2):
        cnvtable.to_csv(cnv_target, index=False, header=True, sep='\t')

    # 1. Load the project workbook (first sheet only).
    project = pd.read_excel('/cgdata/bioproject/pancancer602gene/Project_table/阅尔基因项目信息总表-0606.xlsx', engine='openpyxl')

    # 2. Select the panel rows. pd.concat replaces DataFrame.append, which
    #    no longer exists in pandas 2.x; the row order (exact matches first,
    #    then the 'CPS' matches) is unchanged.
    is_panel = project['检测项目1'] == '阅全-泛癌种602基因检测'
    is_cps = project['检测项目1'].str.contains('CPS', na=False)
    projectsample = pd.concat([project[is_panel], project[is_cps]], ignore_index=True)
    outputfile1 = '/cgdata/bioproject/pancancer602gene/Project_table/pancancer_602gene_samples.xlsx'
    # The context manager saves and closes the workbook even on errors.
    with pd.ExcelWriter(outputfile1) as writer:
        projectsample.to_excel(writer, sheet_name='QC', index=False)

    # 3. Keep only the columns needed downstream; reset_index returns a new
    #    frame, so the assignment below does not write into a slice.
    selectlist = ['样本编号', '样本类型', '临床诊断']
    selectsample = projectsample[selectlist].reset_index(drop=True)
    # regex=True is explicit: "^A" is an anchored pattern, and pandas 2.x
    # treats the pattern as a literal string by default.
    selectsample['样本编号'] = selectsample['样本编号'].str.replace("^A", "Lib", regex=True)

    # 3.1 Attach the tumor reference table (joined on 'tumor_name').
    tumordbdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/TCGA_tumor_list_20220915.txt'
    tumordblist = pd.read_table(tumordbdir, sep='\t', header=0, encoding='gbk')
    selectsample = selectsample.rename(
        columns={'临床诊断': 'tumor_name', '样本编号': 'sampleid', '样本类型': 'sampletype'})
    selectsample_TCGA = pd.merge(selectsample, tumordblist, on=['tumor_name'], how='left')
    # Missing tumor_name values are replaced with 'unknown'.
    selectsample_TCGA['tumor_name'] = selectsample_TCGA['tumor_name'].fillna('unknown')

    # 3.2 Restrict to the samples listed for this lane (one id per line).
    lane_sampelelist = pd.read_table(os.path.join(inputpath, laneid + '_sample.txt'),
                                     sep='\t', header=None, names=['sampleid'])
    lane_sampleinfor = pd.merge(lane_sampelelist, selectsample_TCGA, on=['sampleid'], how='left')

    # Write the raw information table next to the lane data.
    outputfile2 = os.path.join(inputpath, laneid + '_sampleinfor_raw.xlsx')
    with pd.ExcelWriter(outputfile2) as writer:
        lane_sampleinfor.to_excel(writer, sheet_name='sampleinfor', index=False)
    return lane_sampleinfor
- ##项目表有上海和启动两个sheet
def pancancer_project(inputpath, laneid):
    """Build the sample information table of one lane (two-sheet workbook).

    Variant for a project workbook that holds the two sheets
    '项目信息表-上海' and '项目信息表-启东'; both are read and stacked
    before filtering.

    Steps performed:
      0. Re-export the CNV tier workbook as two tab-separated text files.
      1. Read both sheets, stack them, and keep the rows of the 602-gene
         pan-cancer panel plus every row whose test item contains 'CPS';
         write that subset to ``pancancer_602gene_samples.xlsx``.
      2. Keep sample id / sample type / clinical diagnosis, rewrite a leading
         'A' of the sample id to 'Lib', and left-join the tumor reference
         table on ``tumor_name``.
      3. Left-join the lane's sample list (``<laneid>_sample.txt`` inside
         *inputpath*) with that table and write the result to
         ``<laneid>_sampleinfor_raw.xlsx`` inside *inputpath*.

    @param inputpath: str directory of the lane (input list and output file)
    @param laneid: str lane identifier used as the file-name prefix
    @return: pandas.DataFrame one row per lane sample (more if the project
             table holds duplicated sample ids) with the joined columns
    """
    # 0. Copy the CNV tier table, stripping stray whitespace from its keys.
    rawcnvdir = '/cgdata/pancancer_report/refdata/CNV.xlsx'
    cnvtable = pd.read_excel(rawcnvdir)
    for column in ('gene', 'CNV_label'):
        cnvtable[column] = cnvtable[column].str.strip()
    refcnvdir1 = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/CNV_tier_infor.txt'
    biocnvdir2 = '/cgdata/bioproject/pancancer602gene/Project_table/CNV_tier_infor.txt'
    for cnv_target in (refcnvdir1, biocnvdir2):
        cnvtable.to_csv(cnv_target, index=False, header=True, sep='\t')

    # 1. sheet_name=None makes read_excel return a dict of sheet name -> frame.
    project = pd.read_excel('/cgdata/bioproject/pancancer602gene/Project_table/阅尔基因项目信息总表-0606.xlsx', engine='openpyxl', sheet_name=None)
    project_SH = project['项目信息表-上海']
    project_QD = project['项目信息表-启东']
    # pd.concat replaces DataFrame.append, which no longer exists in
    # pandas 2.x; the original order (QD rows first, then SH) is kept.
    project_all = pd.concat([project_QD, project_SH], ignore_index=True)

    # 2. Select the panel rows (exact matches first, then the 'CPS' matches).
    is_panel = project_all['检测项目1'] == '阅全-泛癌种602基因检测'
    is_cps = project_all['检测项目1'].str.contains('CPS', na=False)
    projectsample = pd.concat([project_all[is_panel], project_all[is_cps]], ignore_index=True)
    outputfile1 = '/cgdata/bioproject/pancancer602gene/Project_table/pancancer_602gene_samples.xlsx'
    # The context manager saves and closes the workbook even on errors.
    with pd.ExcelWriter(outputfile1) as writer:
        projectsample.to_excel(writer, sheet_name='QC', index=False)

    # 3. Keep only the columns needed downstream; reset_index returns a new
    #    frame, so the assignment below does not write into a slice.
    selectlist = ['样本编号', '样本类型', '临床诊断']
    selectsample = projectsample[selectlist].reset_index(drop=True)
    # regex=True is explicit: "^A" is an anchored pattern, and pandas 2.x
    # treats the pattern as a literal string by default.
    selectsample['样本编号'] = selectsample['样本编号'].str.replace("^A", "Lib", regex=True)

    # 3.1 Attach the tumor reference table (joined on 'tumor_name').
    tumordbdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/TCGA_tumor_list_20220915.txt'
    tumordblist = pd.read_table(tumordbdir, sep='\t', header=0, encoding='gbk')
    selectsample = selectsample.rename(
        columns={'临床诊断': 'tumor_name', '样本编号': 'sampleid', '样本类型': 'sampletype'})
    selectsample_TCGA = pd.merge(selectsample, tumordblist, on=['tumor_name'], how='left')
    # Missing tumor_name values are replaced with 'unknown'.
    selectsample_TCGA['tumor_name'] = selectsample_TCGA['tumor_name'].fillna('unknown')

    # 3.2 Restrict to the samples listed for this lane (one id per line).
    lane_sampelelist = pd.read_table(os.path.join(inputpath, laneid + '_sample.txt'),
                                     sep='\t', header=None, names=['sampleid'])
    lane_sampleinfor = pd.merge(lane_sampelelist, selectsample_TCGA, on=['sampleid'], how='left')

    # Write the raw information table next to the lane data.
    outputfile2 = os.path.join(inputpath, laneid + '_sampleinfor_raw.xlsx')
    with pd.ExcelWriter(outputfile2) as writer:
        lane_sampleinfor.to_excel(writer, sheet_name='sampleinfor', index=False)
    return lane_sampleinfor
- #####################获得项目表中样本的信息
def project_pair(rawsheet):
    """Derive the tumor/normal pairing table from the lane sample sheet.

    Only rows whose ``sampleid`` contains 'T' are used: the normal name is
    always derived from the tumor sample id, so the tumor rows are enough.

    For each such row the result holds:
      * ``samplename`` - the sample id itself
      * ``normal``     - sample id + 'CN'
      * ``tumor``      - sample id + 'CT' for blood samples
                         (sample type '血浆' or '外周血'), else + 'TT'
      * ``sampletype`` - 'blood' for those two sample types, else 'FFPE'
      * ``tumortype``  - the row's ``Abbr`` value

    @param rawsheet: pandas.DataFrame with the columns 'sampleid',
                     'sampletype' and 'Abbr'
    @return: pandas.DataFrame with the five columns above; the columns are
             present even when no tumor sample is found, so that a later
             merge on 'samplename' still works
    """
    columns = ['samplename', 'normal', 'tumor', 'sampletype', 'tumortype']
    blood_types = ('血浆', '外周血')

    # na=False keeps the mask boolean when a sample id is missing.
    tumor_rows = rawsheet[rawsheet['sampleid'].str.contains('T', na=False)]

    records = []
    for sample, raw_type, abbr in zip(tumor_rows['sampleid'],
                                      tumor_rows['sampletype'],
                                      tumor_rows['Abbr']):
        is_blood = raw_type in blood_types
        records.append({
            'samplename': sample,
            'normal': sample + 'CN',
            'tumor': sample + ('CT' if is_blood else 'TT'),
            'sampletype': 'blood' if is_blood else 'FFPE',
            'tumortype': abbr,
        })
    return pd.DataFrame(records, columns=columns)
- ###########获得下机数据的样本表信息
def subdir_list(dirname):
    """Return every immediate sub-directory of a directory.

    @param dirname: str full path of the directory to scan
    @return: list(str) full paths of the sub-directories, in the order
             reported by os.listdir
    """
    candidates = (os.path.join(dirname, entry) for entry in os.listdir(dirname))
    return [path for path in candidates if os.path.isdir(path)]
def file_list(dirname, ext='.gz'):
    """Return the names of the directory entries that carry a given extension.

    The extension is compared with the last suffix reported by
    os.path.splitext, so 'a.fastq.gz' matches '.gz'.

    @param dirname: str full path of the directory to scan
    @param ext: str extension to keep, starting with a dot
    @return: list(str) matching entry names (without the directory path)
    """
    return [entry for entry in os.listdir(dirname)
            if os.path.splitext(entry)[1] == ext]
def fastq_pair(bclpath):
    """Pair the tumor and normal fastq sample directories found in *bclpath*.

    Every sub-directory name is reduced to the text after its last '-'
    (the fastq sample name). A core sample name is extracted from that text
    and used to group the directories; inside a group, names containing 'T'
    are tumors and names containing 'N' are normals. Each tumor yields one
    output row; if a group has several normals, the last one listed is used
    for all of its tumors.

    Sub-directories without a recognisable core sample name are reported on
    stdout and skipped instead of aborting the run.

    @param bclpath: str directory that holds one sub-directory per sample
    @return: pandas.DataFrame with the columns 'samplename', 'fastq_tumor'
             and 'fastq_normal' (NaN when the group has no normal); the
             columns are present even when no tumor is found, so that a
             later merge on 'samplename' still works
    """
    columns = ['samplename', 'fastq_tumor', 'fastq_normal']
    # NOTE(review): '[Lib]+' is a character class (any run of 'L', 'i', 'b'),
    # not the literal prefix 'Lib'; 'Lib[0-9]+' was probably intended.
    # Left unchanged so that existing directory names keep matching - confirm.
    core_pattern = re.compile(r"[Lib]+[0-9]+")

    # core sample name -> fastq sample names, in first-seen order
    groups = {}
    for subdir in subdir_list(bclpath):
        fastq_name = os.path.basename(subdir).split('-')[-1]
        match = core_pattern.search(fastq_name)
        if match is None:
            print('skip directory without sample id: ' + subdir)
            continue
        groups.setdefault(match.group(0), []).append(fastq_name)

    records = []
    for fastq_names in groups.values():
        tumors = [name for name in fastq_names if 'T' in name]
        normals = [name for name in fastq_names if 'N' in name]
        for tumor in tumors:
            record = {'samplename': tumor, 'fastq_tumor': tumor}
            if normals:
                # With several normals the last one wins, as before.
                record['fastq_normal'] = normals[-1]
            records.append(record)
    return pd.DataFrame(records, columns=columns)
def sheetrunmain(inputpath, laneid, bclpath):
    """Create the labelled sample table of one lane.

    Combines the pairing table derived from the project workbook with the
    pairing table derived from the fastq directories (outer join on
    'samplename'), prints the intermediate tables, and writes the result to
    ``<laneid>_sample_infor_label.txt`` (tab-separated) inside *inputpath*.

    @param inputpath: str directory of the lane
    @param laneid: str lane identifier used as the file-name prefix
    @param bclpath: str directory that holds the per-sample fastq directories
    """
    project_table = project_pair(pancancer_project(inputpath, laneid))
    fastq_table = fastq_pair(bclpath)

    # Show both inputs of the merge, then the merged table.
    for label, table in (('project_sample', project_table),
                         ('fastq_sample', fastq_table)):
        print(label)
        print(table)
    merged = project_table.merge(fastq_table, on=['samplename'], how='outer')
    print('mergedata')
    print(merged)

    target = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
    merged.to_csv(target, sep='\t', header=True, index=False)
    print('finish')
- #inputpath='/cgdata/liuxiangqiong/work62pancancer/pipelinetest-CGB0158'
- #laneid='CGB0158'
- #sheetrun(inputpath,laneid)
if __name__ == '__main__':
    # Command-line entry point. All three options are mandatory: they are
    # used to build file paths, so a missing one would otherwise reach
    # os.path.join as None and fail with a TypeError deep inside the run.
    parser = argparse.ArgumentParser(description='sample pair')
    parser.add_argument('-i', '--inputpath', type=str, required=True, help='the path of lane')
    parser.add_argument('-l', '--laneid', type=str, required=True, help='laneid')
    parser.add_argument('-f', '--bclpath', type=str, required=True, help='the fastq file')
    args = parser.parse_args()
    Inputpath = args.inputpath
    Laneid = args.laneid
    Bclpath = args.bclpath
    sheetrunmain(Inputpath, Laneid, Bclpath)
|