123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277 |
- #edit the file
- #1.clean data
- #2.align
- #3.sv callin
- import sys,collections,math,os,os.path,re
- import pandas as pd
- from pandas.core.frame import DataFrame
- import argparse
- import numpy as np
- import time
- import re
- sys.path.append('/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/noUMI_v0/')
- import s11_noUMI_QCsummary as qcsummary
- import s12_noUMI_somatic_UMI_20220128 as somatic
- ##0.creat the analysis path
- def analis_dir(inputpath):
- svdir1 = os.path.join(inputpath, '1SV_varscan_pair')
- if not os.path.exists(svdir1):
- os.mkdir(svdir1)
- svdir2= os.path.join(inputpath, '1SV_vardict_pair')
- if not os.path.exists(svdir2):
- os.mkdir(svdir2)
- # for CNV
- CNVdir = os.path.join(inputpath, '2CNV_cnvkit_pair')
- if not os.path.exists(CNVdir):
- os.mkdir(CNVdir)
- # for MSI
- MSI_dir = os.path.join(inputpath, '3MSI_msisensor2_pair')
- if not os.path.exists(MSI_dir):
- os.mkdir(MSI_dir)
- # for germline result
- gemerlinedir = os.path.join(inputpath, '4Germline_unpair')
- if not os.path.exists(gemerlinedir):
- os.mkdir(gemerlinedir)
- # for HL result
- HLdir = os.path.join(inputpath, '5HL_gatk_unpair')
- if not os.path.exists(HLdir):
- os.mkdir(HLdir)
- # for fusion
- fusion_method1_dir = os.path.join(inputpath, '6Fusion_manta_pair')
- if not os.path.exists(fusion_method1_dir):
- os.mkdir(fusion_method1_dir)
- fusion_method2_dir = os.path.join(inputpath, '6Fusion_genefusion_unpair')
- if not os.path.exists(fusion_method2_dir):
- os.mkdir(fusion_method2_dir)
- #for HLA
- HLA_dir = os.path.join(inputpath, '7HLA-HD_unpair')
- if not os.path.exists(HLA_dir):
- os.mkdir(HLA_dir)
- # for qc
- qc_dir = os.path.join(inputpath, '8Fastqc')
- if not os.path.exists(qc_dir):
- os.mkdir(qc_dir)
- # for ontarget
- ontarget_dir = os.path.join(inputpath, '9Ontarget')
- if not os.path.exists(ontarget_dir):
- os.mkdir(ontarget_dir)
- # for coverage
- coverage_dir = os.path.join(inputpath, '10Coverage')
- if not os.path.exists(coverage_dir):
- os.mkdir(coverage_dir)
- # datasummary
- datasummary_dir = os.path.join(inputpath, 'datasummary')
- if not os.path.exists(datasummary_dir):
- os.mkdir(datasummary_dir)
- ####1.执行cleandata,align
- ###执行输出的bam文件在/cgdata/pancancer/analyse/$laneid/bamfile
- def clean_align(inputpath,laneid):
- lane_dir=os.path.join('/cgdata/pancancer/analyse', laneid)
- if not os.path.exists(lane_dir):
- os.mkdir(lane_dir)
- bam_dir = os.path.join(lane_dir, 'bamfile')
- if not os.path.exists(bam_dir):
- os.mkdir(bam_dir)
- sampledir=os.path.join(inputpath,laneid+ '_sample_infor_label.txt')
- samplelist = pd.read_table(sampledir, sep='\t')
- align = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/s1_noUMI_clean_align_20220705.pbs'
- for i in range(len(samplelist)):
- tumor = samplelist.loc[i, 'tumor']
- normal = samplelist.loc[i, 'normal']
- if pd.notnull(tumor):
- print(tumor)
- align_tumor_cmd = 'qsub -v sample=' + tumor + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + align
- os.system(align_tumor_cmd)
- if pd.notnull(normal):
- print(normal)
- align_normal_cmd = 'qsub -v sample=' + normal + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + align
- os.system(align_normal_cmd)
- ###3.执行pair的操作(sv caling,cnv,MSI)
- ##需要判断肿瘤样本是来自组织还是血液,如果是血液,那么min_af=0.005 for blood,0.01 for tissue
- def pair(inputpath,laneid,sample,tumor,normal):
- pairdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_pair_20220705.pbs'
- lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
- bam_dir = os.path.join(lane_dir, 'bamfile')
- tumorlabel = tumor[-2]
- if tumorlabel == 'T':
- min_af = 0.01
- elif tumorlabel == 'C':
- min_af = 0.005
- svcall_cmd = 'qsub -v sample=' + sample + ',tumor=' + tumor + ',normal=' + normal + ',min_af=' + str(min_af) + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + pairdir
- os.system(svcall_cmd)
- ####4.unpair(germline,HL,HLA) for control sample
- def unpair_control(inputpath,laneid,normal):
- lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
- bam_dir = os.path.join(lane_dir, 'bamfile')
- unpair = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_unpair_control_20220705.pbs'
- unpair_cmd = 'qsub -v sample=' + normal + ',inputpath=' + inputpath + ',bam_dir=' +bam_dir+ ' ' + unpair
- os.system(unpair_cmd)
- ####5.unpair(Genefuse) for tumor sample
- def unpair_case(inputpath,tumor):
- unpair = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_unpair_case_20220705.pbs'
- unpair_cmd = 'qsub -v tumor=' + tumor + ',inputpath=' + inputpath + ' ' + unpair
- os.system(unpair_cmd)
- ####6.unpair_qc(QC,ontarget,coverage) for normal and tumor
- def unpair_QC(inputpath,laneid,sample):
- lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
- bam_dir = os.path.join(lane_dir, 'bamfile')
- unpair = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_unpair_qc_20220705.pbs'
- unpair_tumor_cmd = 'qsub -v sample=' + sample + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + unpair
- os.system(unpair_tumor_cmd)
- #inputpath='/cgdata/liuxiangqiong/work62pancancer/runtest/laneruntest'
- #laneid='laneruntest'
- #####
- #1.首先执行mapping
- #clean_align(inputpath,laneid)
- #2.判断log文件是否已经生成完毕。
- #2.1)如果单个文件的log生成了,那么执行单个样本对应的命令
- #2.1.1) 对于tumor,执行unpair_case(inputpath,tumor),unpair_QC(inputpath,laneid,sample)
- #2.1.2)对于normal,执行unpair_control(inputpath,laneid,normal),unpair_QC(inputpath,laneid,sample)
- #2.2)如果样本的tumor和normal对应的bam均生成了,以上两个程序均要执行,如果执行了,就不重复执行。完毕后执行配对样本的分析命令
- #对于配对的,执行pair(inputpath,laneid,sample,tumor,normal)
- def classify_1(data1,data_classify):
- if pd.isna(data1["normal"]):
- if re.search("CT$",data1["tumor"]):
- data_classify[data1["samplename"]] = "tumor_CT"
- else:
- data_classify[data1["samplename"]] = "tumor_TT"
- elif pd.isna(data1["tumor"]):
- data_classify[data1["samplename"]] = "normal"
- else:
- if re.search("CT$", data1["tumor"]):
- data_classify[data1["samplename"] + " "] = "tumor_CT"
- data_classify[data1["samplename"]] = "normal_tumor_CT"
- else:
- data_classify[data1["samplename"] + " "] = "tumor_TT"
- data_classify[data1["samplename"]] = "normal_tumor_TT"
- data_classify[data1["samplename"]+" "] = "normal"
- def get_logfile(path,file_list):
- dir_list = os.listdir(path)
- for x in dir_list:
- new_x = os.path.join(path,x)
- if os.path.isdir(new_x):
- get_logfile(new_x,file_list)
- else:
- file_tuple = os.path.splitext(new_x)
- if file_tuple[1] == '.log':
- file_list.append(x.split("_")[0])
- return file_list
- def sample_1(data_classify, file_list, inputpath, laneid):
- sample_accomplish = []
- for sam in data_classify:
- if data_classify[sam] == "normal":
- sample_name_1 = sam.strip() + "CN"
- if sample_name_1 in file_list:
- print('deal with the normal data including QC,germline,HLA,chemthrethy')
- sample = sam.strip()
- unpair_control(inputpath, laneid, sample_name_1)
- unpair_QC(inputpath, laneid, sample_name_1)
- sample_accomplish.append(sam)
- if data_classify[sam] == "tumor_TT":
- sample_name_2 = sam.strip() + "TT"
- if sample_name_2 in file_list:
- print('deal with the tumor data for the tissue including QC,fusion')
- unpair_case(inputpath,sample_name_2)
- unpair_QC(inputpath,laneid,sample_name_2)
- sample_accomplish.append(sam)
- if data_classify[sam] == "tumor_CT":
- sample_name_2 = sam.strip() + "CT"
- if sample_name_2 in file_list:
- #print(33)
- print('deal with the tumor data for the blood including QC,fusion')
- unpair_case(inputpath, sample_name_2)
- unpair_QC(inputpath, laneid, sample_name_2)
- sample_accomplish.append(sam)
- if data_classify[sam] == "normal_tumor_TT":
- sample_name_1 = sam + "CN"
- sample_name_2 = sam + "TT"
- if (sample_name_1 in file_list) and (sample_name_2 in file_list):
- #print(44)
- print('deal with the pair data for the tissue including varscan and vardict mutation')
- sample=sam.strip()
- pair(inputpath, laneid, sample, sample_name_2, sample_name_1 )
- sample_accomplish.append(sam)
- if data_classify[sam] == "normal_tumor_CT":
- sample_name_1 = sam + "CN"
- sample_name_2 = sam + "CT"
- if (sample_name_1 in file_list) and (sample_name_2 in file_list):
- #print(55)
- print('deal with the pair data for the blood including varscan and vardict mutation')
- sample = sam.strip()
- pair(inputpath, laneid, sample, sample_name_2, sample_name_1)
- sample_accomplish.append(sam)
- for sam1 in sample_accomplish:
- data_classify.pop(sam1)
- return data_classify
- def run_1(inputpath,time_sleep, laneid):
- Sample_list = os.path.join(inputpath, laneid + '_' + 'sample_infor_label.txt')
- data_classify = {}
- data = pd.read_csv(Sample_list, header=0, sep="\t")
- data.apply(classify_1,axis=1,args=(data_classify,))
- print(data_classify)
- analis_dir(inputpath)
- clean_align(inputpath, laneid)
- while 1:
- print("未处理样本:")
- print(data_classify)
- file_list = []
- path = inputpath
- get_logfile(path, file_list)
- if len(data_classify) == 0:
- print("结束")
- break
- data_classify = sample_1(data_classify, file_list, inputpath, laneid)
- print("{}秒后继续".format(time_sleep))
- time.sleep(time_sleep)
- #Sample_list='laneruntest_sample_infor_label.txt'
- #time_sleep=15
- #inputpath='/cgdata/liuxiangqiong/work62pancancer/runtest/laneruntest'
- #laneid='laneruntest'
- #Sample_list=os.path.join(inputpath,laneid+'_'+ 'sample_infor_label.txt')
- #run_1(inputpath,time_sleep, laneid)
- if __name__=='__main__':
- parser = argparse.ArgumentParser(description='the main script for pancancer')
- parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
- parser.add_argument('-l', '--laneid', type=str, help='the laneid')
- args = parser.parse_args()
- Inputpath = args.inputpath
- Laneid=args.laneid
- time_sleep = 1200
- run_1(Inputpath, time_sleep, Laneid)
|