chaowen.xi
/
602panel_test


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
							
#edit the file
#1.clean data
#2.align
#3.sv callin


import sys,collections,math,os,os.path,re
import pandas as pd
from pandas.core.frame import DataFrame
import argparse
import numpy as np
import time
import re

sys.path.append('/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/noUMI_v0/')
import s11_noUMI_QCsummary as qcsummary
import s12_noUMI_somatic_UMI_20220128 as somatic

##0.creat the analysis path

def analis_dir(inputpath):
	svdir1 = os.path.join(inputpath, '1SV_varscan_pair')
	if not os.path.exists(svdir1):
		os.mkdir(svdir1)
	svdir2= os.path.join(inputpath, '1SV_vardict_pair')
	if not os.path.exists(svdir2):
		os.mkdir(svdir2)
	# for CNV
	CNVdir = os.path.join(inputpath, '2CNV_cnvkit_pair')
	if not os.path.exists(CNVdir):
		os.mkdir(CNVdir)
	# for MSI
	MSI_dir = os.path.join(inputpath, '3MSI_msisensor2_pair')
	if not os.path.exists(MSI_dir):
		os.mkdir(MSI_dir)
	# for germline result
	gemerlinedir = os.path.join(inputpath, '4Germline_unpair')
	if not os.path.exists(gemerlinedir):
		os.mkdir(gemerlinedir)
	# for HL result
	HLdir = os.path.join(inputpath, '5HL_gatk_unpair')
	if not os.path.exists(HLdir):
		os.mkdir(HLdir)
	# for fusion
	fusion_method1_dir = os.path.join(inputpath, '6Fusion_manta_pair')
	if not os.path.exists(fusion_method1_dir):
		os.mkdir(fusion_method1_dir)
	fusion_method2_dir = os.path.join(inputpath, '6Fusion_genefusion_unpair')
	if not os.path.exists(fusion_method2_dir):
		os.mkdir(fusion_method2_dir)
	#for HLA
	HLA_dir = os.path.join(inputpath, '7HLA-HD_unpair')
	if not os.path.exists(HLA_dir):
		os.mkdir(HLA_dir)
	# for qc
	qc_dir = os.path.join(inputpath, '8Fastqc')
	if not os.path.exists(qc_dir):
		os.mkdir(qc_dir)
	# for ontarget
	ontarget_dir = os.path.join(inputpath, '9Ontarget')
	if not os.path.exists(ontarget_dir):
		os.mkdir(ontarget_dir)
	# for coverage
	coverage_dir = os.path.join(inputpath, '10Coverage')
	if not os.path.exists(coverage_dir):
		os.mkdir(coverage_dir)
	# datasummary
	datasummary_dir = os.path.join(inputpath, 'datasummary')
	if not os.path.exists(datasummary_dir):
		os.mkdir(datasummary_dir)


####1.执行cleandata,align
###执行输出的bam文件在/cgdata/pancancer/analyse/$laneid/bamfile
def clean_align_v0(inputpath,laneid):
	lane_dir=os.path.join('/cgdata/pancancer/analyse', laneid)
	if not os.path.exists(lane_dir):
		os.mkdir(lane_dir)
	bam_dir = os.path.join(lane_dir, 'bamfile')
	if not os.path.exists(bam_dir):
		os.mkdir(bam_dir)
	sampledir=os.path.join(inputpath,laneid+ '_sample_infor_label.txt')
	samplelist = pd.read_table(sampledir, sep='\t')
	align = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/s1_noUMI_clean_align_20221027.pbs'
	for i in range(len(samplelist)):
		tumor = samplelist.loc[i, 'tumor']
		normal = samplelist.loc[i, 'normal']
		if pd.notnull(tumor):
			print(tumor)
			align_tumor_cmd = 'qsub -v sample=' + tumor + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + align
			os.system(align_tumor_cmd)
		if pd.notnull(normal):
			print(normal)
			align_normal_cmd = 'qsub -v sample=' + normal + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + align
			os.system(align_normal_cmd)

###修改，对于血液样本切除UMI，对于组织和白细胞对照样本不切除UMI

def clean_align(inputpath,laneid):
	lane_dir=os.path.join('/cgdata/pancancer/analyse', laneid)
	if not os.path.exists(lane_dir):
		os.mkdir(lane_dir)
	bam_dir = os.path.join(lane_dir, 'bamfile')
	if not os.path.exists(bam_dir):
		os.mkdir(bam_dir)
	sampledir=os.path.join(inputpath,laneid+ '_sample_infor_label.txt')
	samplelist = pd.read_table(sampledir, sep='\t')
	#align = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/s1_noUMI_clean_align_20221027.pbs'
	alignBlood = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/s1_noUMI_clean_align_blood_20230529.pbs'
	alignFFPE = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/s1_noUMI_clean_align_FFPE_20230529.pbs'
	for i in range(len(samplelist)):
		tumor = samplelist.loc[i, 'tumor']
		normal = samplelist.loc[i, 'normal']
		sampletype=samplelist.loc[i, 'sampletype']
		if pd.notnull(tumor):
			print(tumor)
			if sampletype=='blood':
				align_tumor_cmd = 'qsub -v sample=' + tumor + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + alignBlood
				os.system(align_tumor_cmd)
			elif sampletype=='FFPE':
				align_tumor_cmd = 'qsub -v sample=' + tumor + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + alignFFPE
				os.system(align_tumor_cmd)
		if pd.notnull(normal):
			print(normal)
			align_normal_cmd = 'qsub -v sample=' + normal + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + alignFFPE
			os.system(align_normal_cmd)


###3.执行pair的操作(sv caling,cnv,MSI)
##需要判断肿瘤样本是来自组织还是血液，如果是血液，那么min_af=0.005 for blood,0.01 for tissue
def pair(inputpath,laneid,sample,tumor,normal):
	pairdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_pair_20220929.pbs'
	lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
	bam_dir = os.path.join(lane_dir, 'bamfile')
	tumorlabel = tumor[-2]
	if tumorlabel == 'T':
		min_af = 0.01
	elif tumorlabel == 'C':
		min_af = 0.005
	svcall_cmd = 'qsub -v sample=' + sample + ',tumor=' + tumor + ',normal=' + normal + ',min_af=' + str(min_af) + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + pairdir
	os.system(svcall_cmd)
##3.1执行SNV配对
def SNVpair(inputpath,laneid,sample,tumor,normal):
	pairdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_SNVpair_20221027.pbs'
	lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
	bam_dir = os.path.join(lane_dir, 'bamfile')
	tumorlabel = tumor[-2]
	if tumorlabel == 'T':
		min_af = 0.01
	elif tumorlabel == 'C':
		min_af = 0.005
	svcall_cmd = 'qsub -v sample=' + sample + ',tumor=' + tumor + ',normal=' + normal + ',min_af=' + str(min_af) + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + pairdir
	os.system(svcall_cmd)
###3.2执行CNV
def CNVpair(inputpath,laneid,sample,tumor,normal):
	pairdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/s3_noUMI_CNV_cnvkit_pair_20220929.sh'
	lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
	bam_dir = os.path.join(lane_dir, 'bamfile')
	cnv_cmd = 'qsub -v sample=' + sample + ',tumor=' + tumor + ',normal=' + normal  + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + pairdir
	os.system(cnv_cmd)
###执行3.3MSI
def MSIpair(inputpath,laneid,sample,tumor,normal):
	pairdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/s4_noUMI_MSI_15STR_msisensor2_pair_20220929.sh'
	lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
	bam_dir = os.path.join(lane_dir, 'bamfile')
	MSI_cmd = 'qsub -v sample=' + sample + ',tumor=' + tumor + ',normal=' + normal  + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + pairdir
	os.system(MSI_cmd)


####4.unpair(germline,HL,HLA) for control sample
def unpair_control(inputpath,laneid,normal):
	lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
	bam_dir = os.path.join(lane_dir, 'bamfile')
	unpair = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_unpair_control_20220929.pbs'
	unpair_cmd = 'qsub -v sample=' + normal + ',inputpath=' + inputpath + ',bam_dir=' +bam_dir+ ' ' + unpair
	os.system(unpair_cmd)


####5.unpair(Genefuse) for tumor sample
def unpair_case(inputpath,tumor):
	unpair = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_unpair_case_20220929.pbs'
	unpair_cmd = 'qsub -v tumor=' + tumor + ',inputpath=' + inputpath + ' ' + unpair
	os.system(unpair_cmd)


####6.unpair_qc(QC,ontarget,coverage) for normal and tumor
def unpair_QC(inputpath,laneid,sample):
	lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
	bam_dir = os.path.join(lane_dir, 'bamfile')
	unpair = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_unpair_qc_20221027.pbs'
	unpair_tumor_cmd = 'qsub -v sample=' + sample + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + unpair
	os.system(unpair_tumor_cmd)


#inputpath='/cgdata/liuxiangqiong/work62pancancer/runtest/laneruntest'
#laneid='laneruntest'
#####
#1.首先执行mapping
#clean_align(inputpath,laneid)
#2.判断log文件是否已经生成完毕。
#2.1)如果单个文件的log生成了，那么执行单个样本对应的命令
#2.1.1) 对于tumor,执行unpair_case(inputpath,tumor)，unpair_QC(inputpath,laneid,sample)
#2.1.2）对于normal,执行unpair_control(inputpath,laneid,normal)，unpair_QC(inputpath,laneid,sample)
#2.2）如果样本的tumor和normal对应的bam均生成了，以上两个程序均要执行，如果执行了，就不重复执行。完毕后执行配对样本的分析命令
#对于配对的，执行pair(inputpath,laneid,sample,tumor,normal)

def classify_1(data1,data_classify):
	if pd.isna(data1["normal"]):
		if re.search("CT$",data1["tumor"]):
			data_classify[data1["samplename"]] = "tumor_CT"
		else:
			data_classify[data1["samplename"]] = "tumor_TT"
	elif pd.isna(data1["tumor"]):
		data_classify[data1["samplename"]] = "normal"
	else:
		if re.search("CT$", data1["tumor"]):
			data_classify[data1["samplename"] + " "] = "tumor_CT"
			data_classify[data1["samplename"]] = "normal_tumor_CT"
		else:
			data_classify[data1["samplename"] + " "] = "tumor_TT"
			data_classify[data1["samplename"]] = "normal_tumor_TT"
		data_classify[data1["samplename"]+"  "] = "normal"

def get_logfile(path,file_list):
	dir_list = os.listdir(path)
	for x in dir_list:
		new_x = os.path.join(path,x)
		if os.path.isdir(new_x):
			get_logfile(new_x,file_list)
		else:
			file_tuple = os.path.splitext(new_x)
			if file_tuple[1] == '.log':
				file_list.append(x.split("_")[0])
	return file_list


def sample_1(data_classify, file_list, inputpath, laneid):
	sample_accomplish = []
	for sam in data_classify:
		if data_classify[sam] == "normal":
			sample_name_1 = sam.strip() + "CN"
			if sample_name_1 in file_list:
				print('deal with the normal data including QC,germline,HLA,chemthrethy')
				sample = sam.strip()
				unpair_control(inputpath, laneid, sample_name_1)
				unpair_QC(inputpath, laneid, sample_name_1)
				sample_accomplish.append(sam)
		if data_classify[sam] == "tumor_TT":
			sample_name_2 = sam.strip() + "TT"
			if sample_name_2 in file_list:
				print('deal with the tumor data for the tissue including QC,fusion')
				unpair_case(inputpath,sample_name_2)
				unpair_QC(inputpath,laneid,sample_name_2)
				sample_accomplish.append(sam)
		if data_classify[sam] == "tumor_CT":
			sample_name_2 = sam.strip() + "CT"
			if sample_name_2 in file_list:
				#print(33)
				print('deal with the tumor data for the blood including QC,fusion')
				unpair_case(inputpath, sample_name_2)
				unpair_QC(inputpath, laneid, sample_name_2)
				sample_accomplish.append(sam)
		if data_classify[sam] == "normal_tumor_TT":
			sample_name_1 = sam + "CN"
			sample_name_2 = sam + "TT"
			if (sample_name_1 in file_list) and (sample_name_2 in file_list):
				#print(44)
				print('deal with the pair data for the tissue including varscan and vardict mutation')
				sample=sam.strip()
				SNVpair(inputpath, laneid, sample, sample_name_2, sample_name_1 )
				CNVpair(inputpath, laneid, sample, sample_name_2, sample_name_1)
				MSIpair(inputpath, laneid, sample, sample_name_2, sample_name_1)
				sample_accomplish.append(sam)
		if data_classify[sam] == "normal_tumor_CT":
			sample_name_1 = sam + "CN"
			sample_name_2 = sam + "CT"
			if (sample_name_1 in file_list) and (sample_name_2 in file_list):
				#print(55)
				print('deal with the pair data for the blood including varscan and vardict mutation')
				sample = sam.strip()
				SNVpair(inputpath, laneid, sample, sample_name_2, sample_name_1)
				CNVpair(inputpath, laneid, sample, sample_name_2, sample_name_1)
				MSIpair(inputpath, laneid, sample, sample_name_2, sample_name_1)
				sample_accomplish.append(sam)
	for sam1 in sample_accomplish:
		data_classify.pop(sam1)
	return data_classify


def run_1(inputpath,time_sleep, laneid):
	Sample_list = os.path.join(inputpath, laneid + '_' + 'sample_infor_label.txt')
	data_classify = {}
	data = pd.read_csv(Sample_list, header=0, sep="\t")
	data.apply(classify_1,axis=1,args=(data_classify,))
	print(data_classify)
	analis_dir(inputpath)
	clean_align(inputpath, laneid)
	while 1:
		print("未处理样本:")
		print(data_classify)
		file_list = []
		path = inputpath
		get_logfile(path, file_list)
		if len(data_classify) == 0:
			print("结束")
			break
		data_classify = sample_1(data_classify, file_list, inputpath, laneid)
		print("{}秒后继续".format(time_sleep))
		time.sleep(time_sleep)


#Sample_list='laneruntest_sample_infor_label.txt'
#time_sleep=15
#inputpath='/cgdata/liuxiangqiong/work62pancancer/runtest/laneruntest'
#laneid='laneruntest'
#Sample_list=os.path.join(inputpath,laneid+'_'+ 'sample_infor_label.txt')
#run_1(inputpath,time_sleep, laneid)


if __name__=='__main__':
	parser = argparse.ArgumentParser(description='the main script for pancancer')
	parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
	parser.add_argument('-l', '--laneid', type=str, help='the laneid')
	args = parser.parse_args()
	Inputpath = args.inputpath
	Laneid=args.laneid
	time_sleep = 1200
	run_1(Inputpath, time_sleep, Laneid)