#edit the file #1.clean data #2.align #3.sv callin import sys,collections,math,os,os.path,re import pandas as pd from pandas.core.frame import DataFrame import argparse import numpy as np import time import re sys.path.append('/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/noUMI_v0/') import s11_noUMI_QCsummary as qcsummary import s12_noUMI_somatic_UMI_20220128 as somatic ##0.creat the analysis path def analis_dir(inputpath): svdir1 = os.path.join(inputpath, '1SV_varscan_pair') if not os.path.exists(svdir1): os.mkdir(svdir1) svdir2= os.path.join(inputpath, '1SV_vardict_pair') if not os.path.exists(svdir2): os.mkdir(svdir2) # for CNV CNVdir = os.path.join(inputpath, '2CNV_cnvkit_pair') if not os.path.exists(CNVdir): os.mkdir(CNVdir) # for MSI MSI_dir = os.path.join(inputpath, '3MSI_msisensor2_pair') if not os.path.exists(MSI_dir): os.mkdir(MSI_dir) # for germline result gemerlinedir = os.path.join(inputpath, '4Germline_unpair') if not os.path.exists(gemerlinedir): os.mkdir(gemerlinedir) # for HL result HLdir = os.path.join(inputpath, '5HL_gatk_unpair') if not os.path.exists(HLdir): os.mkdir(HLdir) # for fusion fusion_method1_dir = os.path.join(inputpath, '6Fusion_manta_pair') if not os.path.exists(fusion_method1_dir): os.mkdir(fusion_method1_dir) fusion_method2_dir = os.path.join(inputpath, '6Fusion_genefusion_unpair') if not os.path.exists(fusion_method2_dir): os.mkdir(fusion_method2_dir) #for HLA HLA_dir = os.path.join(inputpath, '7HLA-HD_unpair') if not os.path.exists(HLA_dir): os.mkdir(HLA_dir) # for qc qc_dir = os.path.join(inputpath, '8Fastqc') if not os.path.exists(qc_dir): os.mkdir(qc_dir) # for ontarget ontarget_dir = os.path.join(inputpath, '9Ontarget') if not os.path.exists(ontarget_dir): os.mkdir(ontarget_dir) # for coverage coverage_dir = os.path.join(inputpath, '10Coverage') if not os.path.exists(coverage_dir): os.mkdir(coverage_dir) # datasummary datasummary_dir = os.path.join(inputpath, 'datasummary') if not os.path.exists(datasummary_dir): os.mkdir(datasummary_dir) ####1.执行cleandata,align ###执行输出的bam文件在/cgdata/pancancer/analyse/$laneid/bamfile def clean_align(inputpath,laneid): lane_dir=os.path.join('/cgdata/pancancer/analyse', laneid) if not os.path.exists(lane_dir): os.mkdir(lane_dir) bam_dir = os.path.join(lane_dir, 'bamfile') if not os.path.exists(bam_dir): os.mkdir(bam_dir) sampledir=os.path.join(inputpath,laneid+ '_sample_infor_label.txt') samplelist = pd.read_table(sampledir, sep='\t') align = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/s1_noUMI_clean_align_20220705.pbs' for i in range(len(samplelist)): tumor = samplelist.loc[i, 'tumor'] normal = samplelist.loc[i, 'normal'] if pd.notnull(tumor): print(tumor) align_tumor_cmd = 'qsub -v sample=' + tumor + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + align os.system(align_tumor_cmd) if pd.notnull(normal): print(normal) align_normal_cmd = 'qsub -v sample=' + normal + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + align os.system(align_normal_cmd) ###3.执行pair的操作(sv caling,cnv,MSI) ##需要判断肿瘤样本是来自组织还是血液,如果是血液,那么min_af=0.005 for blood,0.01 for tissue def pair(inputpath,laneid,sample,tumor,normal): pairdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_pair_20220929.pbs' lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid) bam_dir = os.path.join(lane_dir, 'bamfile') tumorlabel = tumor[-2] if tumorlabel == 'T': min_af = 0.01 elif tumorlabel == 'C': min_af = 0.005 svcall_cmd = 'qsub -v sample=' + sample + ',tumor=' + tumor + ',normal=' + normal + ',min_af=' + str(min_af) + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + pairdir os.system(svcall_cmd) ####4.unpair(germline,HL,HLA) for control sample def unpair_control(inputpath,laneid,normal): lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid) bam_dir = os.path.join(lane_dir, 'bamfile') unpair = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_unpair_control_20220929.pbs' unpair_cmd = 'qsub -v sample=' + normal + ',inputpath=' + inputpath + ',bam_dir=' +bam_dir+ ' ' + unpair os.system(unpair_cmd) ####5.unpair(Genefuse) for tumor sample def unpair_case(inputpath,tumor): unpair = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_unpair_case_20220929.pbs' unpair_cmd = 'qsub -v tumor=' + tumor + ',inputpath=' + inputpath + ' ' + unpair os.system(unpair_cmd) ####6.unpair_qc(QC,ontarget,coverage) for normal and tumor def unpair_QC(inputpath,laneid,sample): lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid) bam_dir = os.path.join(lane_dir, 'bamfile') unpair = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_unpair_qc_20220929.pbs' unpair_tumor_cmd = 'qsub -v sample=' + sample + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + unpair os.system(unpair_tumor_cmd) #inputpath='/cgdata/liuxiangqiong/work62pancancer/runtest/laneruntest' #laneid='laneruntest' ##### #1.首先执行mapping #clean_align(inputpath,laneid) #2.判断log文件是否已经生成完毕。 #2.1)如果单个文件的log生成了,那么执行单个样本对应的命令 #2.1.1) 对于tumor,执行unpair_case(inputpath,tumor),unpair_QC(inputpath,laneid,sample) #2.1.2)对于normal,执行unpair_control(inputpath,laneid,normal),unpair_QC(inputpath,laneid,sample) #2.2)如果样本的tumor和normal对应的bam均生成了,以上两个程序均要执行,如果执行了,就不重复执行。完毕后执行配对样本的分析命令 #对于配对的,执行pair(inputpath,laneid,sample,tumor,normal) def classify_1(data1,data_classify): if pd.isna(data1["normal"]): if re.search("CT$",data1["tumor"]): data_classify[data1["samplename"]] = "tumor_CT" else: data_classify[data1["samplename"]] = "tumor_TT" elif pd.isna(data1["tumor"]): data_classify[data1["samplename"]] = "normal" else: if re.search("CT$", data1["tumor"]): data_classify[data1["samplename"] + " "] = "tumor_CT" data_classify[data1["samplename"]] = "normal_tumor_CT" else: data_classify[data1["samplename"] + " "] = "tumor_TT" data_classify[data1["samplename"]] = "normal_tumor_TT" data_classify[data1["samplename"]+" "] = "normal" def get_logfile(path,file_list): dir_list = os.listdir(path) for x in dir_list: new_x = os.path.join(path,x) if os.path.isdir(new_x): get_logfile(new_x,file_list) else: file_tuple = os.path.splitext(new_x) if file_tuple[1] == '.log': file_list.append(x.split("_")[0]) return file_list def sample_1(data_classify, file_list, inputpath, laneid): sample_accomplish = [] for sam in data_classify: if data_classify[sam] == "normal": sample_name_1 = sam.strip() + "CN" if sample_name_1 in file_list: print('deal with the normal data including QC,germline,HLA,chemthrethy') sample = sam.strip() unpair_control(inputpath, laneid, sample_name_1) unpair_QC(inputpath, laneid, sample_name_1) sample_accomplish.append(sam) if data_classify[sam] == "tumor_TT": sample_name_2 = sam.strip() + "TT" if sample_name_2 in file_list: print('deal with the tumor data for the tissue including QC,fusion') unpair_case(inputpath,sample_name_2) unpair_QC(inputpath,laneid,sample_name_2) sample_accomplish.append(sam) if data_classify[sam] == "tumor_CT": sample_name_2 = sam.strip() + "CT" if sample_name_2 in file_list: #print(33) print('deal with the tumor data for the blood including QC,fusion') unpair_case(inputpath, sample_name_2) unpair_QC(inputpath, laneid, sample_name_2) sample_accomplish.append(sam) if data_classify[sam] == "normal_tumor_TT": sample_name_1 = sam + "CN" sample_name_2 = sam + "TT" if (sample_name_1 in file_list) and (sample_name_2 in file_list): #print(44) print('deal with the pair data for the tissue including varscan and vardict mutation') sample=sam.strip() pair(inputpath, laneid, sample, sample_name_2, sample_name_1 ) sample_accomplish.append(sam) if data_classify[sam] == "normal_tumor_CT": sample_name_1 = sam + "CN" sample_name_2 = sam + "CT" if (sample_name_1 in file_list) and (sample_name_2 in file_list): #print(55) print('deal with the pair data for the blood including varscan and vardict mutation') sample = sam.strip() pair(inputpath, laneid, sample, sample_name_2, sample_name_1) sample_accomplish.append(sam) for sam1 in sample_accomplish: data_classify.pop(sam1) return data_classify def run_1(inputpath,time_sleep, laneid): Sample_list = os.path.join(inputpath, laneid + '_' + 'sample_infor_label.txt') data_classify = {} data = pd.read_csv(Sample_list, header=0, sep="\t") data.apply(classify_1,axis=1,args=(data_classify,)) print(data_classify) analis_dir(inputpath) clean_align(inputpath, laneid) while 1: print("未处理样本:") print(data_classify) file_list = [] path = inputpath get_logfile(path, file_list) if len(data_classify) == 0: print("结束") break data_classify = sample_1(data_classify, file_list, inputpath, laneid) print("{}秒后继续".format(time_sleep)) time.sleep(time_sleep) #Sample_list='laneruntest_sample_infor_label.txt' #time_sleep=15 #inputpath='/cgdata/liuxiangqiong/work62pancancer/runtest/laneruntest' #laneid='laneruntest' #Sample_list=os.path.join(inputpath,laneid+'_'+ 'sample_infor_label.txt') #run_1(inputpath,time_sleep, laneid) if __name__=='__main__': parser = argparse.ArgumentParser(description='the main script for pancancer') parser.add_argument('-i', '--inputpath', type=str, help='the path of lane') parser.add_argument('-l', '--laneid', type=str, help='the laneid') args = parser.parse_args() Inputpath = args.inputpath Laneid=args.laneid time_sleep = 1200 run_1(Inputpath, time_sleep, Laneid)