123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157 |
- #####
- import pandas as pd
- import xlrd
- import os,sys
- import argparse
- #### create the analysis directories
def analis_dir(inputpath, laneid, report_root='/cgdata/pancancer_report'):
    """Create the report directory and the analysis sub-directories for a lane.

    Directories that already exist are left untouched; missing parent
    directories are created as needed.

    Args:
        inputpath: Lane working directory that receives the analysis
            sub-directories.
        laneid: Lane identifier, used as the name of the report directory
            created under ``report_root``.
        report_root: Parent directory of the per-lane report directory.
    """
    # Directory that will hold the generated reports of this lane.
    os.makedirs(os.path.join(report_root, laneid), exist_ok=True)

    analysis_subdirs = (
        '1SV_varscan_pair',
        '1SV_vardict_pair',
        '2CNV_cnvkit_pair',           # CNV
        '3MSI_msisensor2_pair',       # MSI
        '4Germline_unpair',           # germline result
        '5HL_gatk_unpair',            # HL result
        '6Fusion_manta_pair',         # fusion
        '6Fusion_genefusion_unpair',  # fusion
        '7HLA-HD_unpair',             # HLA
        '8Fastqc',                    # qc
        '9Ontarget',                  # ontarget
        '10Coverage',                 # coverage
        'datasummary',                # datasummary
    )
    for subdir in analysis_subdirs:
        # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
        os.makedirs(os.path.join(inputpath, subdir), exist_ok=True)
def pancancer_project(inputpath, laneid):
    """Build the sample-information table for the samples of one lane.

    The function reads the master project workbook, keeps the rows whose
    '检测项目1' column equals '阅全-泛癌种602基因检测', rewrites the panel sample
    workbook, annotates the selected samples with the TCGA tumor list and
    finally left-merges that annotation onto the sample IDs listed in
    ``<inputpath>/<laneid>_sample.txt``. The merged table is also written to
    ``<inputpath>/<laneid>_sampleinfor_raw.xlsx``.

    Args:
        inputpath: Lane working directory containing ``<laneid>_sample.txt``.
        laneid: Lane identifier used as the file-name prefix.

    Returns:
        pandas.DataFrame: the lane sample list left-merged with the sample
        annotation (columns 'sampleid', 'sampletype', 'tumor_name' plus the
        columns contributed by the TCGA tumor list).
    """
    # 1. Load the master project table.
    project = pd.read_excel('/cgdata/liuxiangqiong/work62pancancer/Project_table/阅尔基因项目信息总表-0606.xlsx')

    # 2. Keep the samples of the current panel and refresh the panel sample workbook.
    projectsample = project[project['检测项目1'] == '阅全-泛癌种602基因检测']
    outputfile1 = '/cgdata/pancancer/project/pancancer_602gene_samples.xlsx'
    # The context manager saves and closes the workbook; ExcelWriter.save()
    # no longer exists in pandas >= 2.0.
    with pd.ExcelWriter(outputfile1) as writer:
        projectsample.to_excel(writer, sheet_name='QC', index=False)

    # 3. Extract the columns needed for the panel analysis. Work on an explicit
    # copy so the assignments below do not write into a slice of `projectsample`.
    selectlist = ['样本编号', '样本类型', '临床诊断']
    selectsample = projectsample[selectlist].copy()
    selectsample = selectsample.reset_index(drop=True)
    # '^A' is a regular expression (leading 'A' only); regex=True must be
    # explicit because pandas >= 2.0 treats the pattern literally by default.
    selectsample['样本编号'] = selectsample['样本编号'].str.replace(r'^A', 'Lib', regex=True)

    # 3.1 Look up the disease name in the TCGA tumor reference table.
    tumordbdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/TCGA_tumor_list_20220915.txt'
    tumordblist = pd.read_table(tumordbdir, sep='\t', header=0, encoding='gbk')
    # Rename to the column names shared with the TCGA table and the lane list.
    selectsample = selectsample.rename(
        columns={'临床诊断': 'tumor_name', '样本编号': 'sampleid', '样本类型': 'sampletype'}
    )
    selectsample_TCGA = pd.merge(selectsample, tumordblist, on=['tumor_name'], how='left')
    # Missing tumor names are labelled 'unknown'.
    # NOTE(review): only 'tumor_name' is filled; columns contributed by the
    # TCGA table stay NaN for names without a match - confirm this is intended.
    selectsample_TCGA['tumor_name'] = selectsample_TCGA['tumor_name'].fillna('unknown')

    # 3.2 Attach the annotation to the samples sequenced on the current lane.
    lane_sampelelist = pd.read_table(
        os.path.join(inputpath, laneid + '_sample.txt'),
        sep='\t', header=None, names=['sampleid'],
    )
    lane_sampleinfor = pd.merge(lane_sampelelist, selectsample_TCGA, on=['sampleid'], how='left')

    # Write the raw information table.
    outputfile2 = os.path.join(inputpath, laneid + '_sampleinfor_raw.xlsx')
    with pd.ExcelWriter(outputfile2) as writer:
        lane_sampleinfor.to_excel(writer, sheet_name='sampleinfor', index=False)
    return lane_sampleinfor
- #####################制作配对数据样本表
def samplepair(inputpath, laneid, lane_sampleinfor):
    """Write the tumor/normal pairing table for one lane.

    Rows of ``lane_sampleinfor`` are grouped by sample name, which is the
    sample ID without its last character. For every group one output row is
    produced with 'samplename' set to ``<name>T`` and:

    * 'normal' = ``<name>TCN`` for a row whose sample type is '外周血' and
      whose sample ID ends with 'N';
    * 'tumor' = ``<name>TCT``, 'sampletype' = 'blood' for a '血浆' row;
    * 'tumor' = ``<name>TTT``, 'sampletype' = 'FFPE' for any other row.

    'tumortype' is taken from the 'Abbr' column of the tumor row. When several
    rows of a group set the same field, the last one wins. The table is
    written tab-separated (gbk encoded) to
    ``<inputpath>/<laneid>_sample_infor_label.txt``.

    Args:
        inputpath: Directory that receives the output file.
        laneid: Lane identifier used as the file-name prefix.
        lane_sampleinfor: DataFrame with the columns 'sampleid', 'sampletype'
            and 'Abbr'.
    """
    # One record per sample name, kept in order of first appearance.
    records = {}
    rows = zip(
        lane_sampleinfor['sampleid'],
        lane_sampleinfor['sampletype'],
        lane_sampleinfor['Abbr'],
    )
    for sampleid, sampletype, tumortype in rows:
        # Group on the exact name: a substring search would also attach e.g.
        # 'Lib11T' to the sample 'Lib1' and overwrite its tumor fields.
        sample = sampleid[:-1]
        pair_name = sample + 'T'
        record = records.setdefault(sample, {'samplename': pair_name})
        if sampletype == '外周血' and sampleid[-1] == 'N':
            record['normal'] = pair_name + 'CN'
        elif sampletype == '血浆':
            record.update(tumor=pair_name + 'CT', sampletype='blood', tumortype=tumortype)
        else:
            record.update(tumor=pair_name + 'TT', sampletype='FFPE', tumortype=tumortype)

    sample_pair = pd.DataFrame(list(records.values()))
    # output
    outputfile3 = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
    sample_pair.to_csv(outputfile3, index=False, header=True, encoding='gbk', sep='\t')
def sheetrunmain(inputpath, laneid):
    """Run the sample-sheet workflow for one lane.

    Creates the analysis directories, builds the lane sample-information
    table and then writes the paired-sample table derived from it.

    Args:
        inputpath: Lane working directory.
        laneid: Lane identifier.
    """
    analis_dir(inputpath, laneid)
    samplepair(inputpath, laneid, pancancer_project(inputpath, laneid))
- #inputpath='/cgdata/liuxiangqiong/work62pancancer/pipelinetest-CGB0158'
- #laneid='CGB0158'
- #sheetrun(inputpath,laneid)
if __name__ == '__main__':
    # Command-line entry point. Both options are mandatory: without them
    # argparse would hand None to sheetrunmain, which fails later inside
    # os.path.join with an unhelpful TypeError.
    parser = argparse.ArgumentParser(description='sample infor')
    parser.add_argument('-i', '--inputpath', type=str, required=True, help='the path of lane')
    parser.add_argument('-l', '--laneid', type=str, required=True, help='laneid')
    args = parser.parse_args()
    sheetrunmain(args.inputpath, args.laneid)
|