main_20221027_v0.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. #edit the file
  2. #1.clean data
  3. #2.align
  4. #3.sv callin
  5. import sys,collections,math,os,os.path,re
  6. import pandas as pd
  7. from pandas.core.frame import DataFrame
  8. import argparse
  9. import numpy as np
  10. import time
  11. import re
  12. sys.path.append('/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/noUMI_v0/')
  13. import s11_noUMI_QCsummary as qcsummary
  14. import s12_noUMI_somatic_UMI_20220128 as somatic
  15. ##0.creat the analysis path
  16. def analis_dir(inputpath):
  17. svdir1 = os.path.join(inputpath, '1SV_varscan_pair')
  18. if not os.path.exists(svdir1):
  19. os.mkdir(svdir1)
  20. svdir2= os.path.join(inputpath, '1SV_vardict_pair')
  21. if not os.path.exists(svdir2):
  22. os.mkdir(svdir2)
  23. # for CNV
  24. CNVdir = os.path.join(inputpath, '2CNV_cnvkit_pair')
  25. if not os.path.exists(CNVdir):
  26. os.mkdir(CNVdir)
  27. # for MSI
  28. MSI_dir = os.path.join(inputpath, '3MSI_msisensor2_pair')
  29. if not os.path.exists(MSI_dir):
  30. os.mkdir(MSI_dir)
  31. # for germline result
  32. gemerlinedir = os.path.join(inputpath, '4Germline_unpair')
  33. if not os.path.exists(gemerlinedir):
  34. os.mkdir(gemerlinedir)
  35. # for HL result
  36. HLdir = os.path.join(inputpath, '5HL_gatk_unpair')
  37. if not os.path.exists(HLdir):
  38. os.mkdir(HLdir)
  39. # for fusion
  40. fusion_method1_dir = os.path.join(inputpath, '6Fusion_manta_pair')
  41. if not os.path.exists(fusion_method1_dir):
  42. os.mkdir(fusion_method1_dir)
  43. fusion_method2_dir = os.path.join(inputpath, '6Fusion_genefusion_unpair')
  44. if not os.path.exists(fusion_method2_dir):
  45. os.mkdir(fusion_method2_dir)
  46. #for HLA
  47. HLA_dir = os.path.join(inputpath, '7HLA-HD_unpair')
  48. if not os.path.exists(HLA_dir):
  49. os.mkdir(HLA_dir)
  50. # for qc
  51. qc_dir = os.path.join(inputpath, '8Fastqc')
  52. if not os.path.exists(qc_dir):
  53. os.mkdir(qc_dir)
  54. # for ontarget
  55. ontarget_dir = os.path.join(inputpath, '9Ontarget')
  56. if not os.path.exists(ontarget_dir):
  57. os.mkdir(ontarget_dir)
  58. # for coverage
  59. coverage_dir = os.path.join(inputpath, '10Coverage')
  60. if not os.path.exists(coverage_dir):
  61. os.mkdir(coverage_dir)
  62. # datasummary
  63. datasummary_dir = os.path.join(inputpath, 'datasummary')
  64. if not os.path.exists(datasummary_dir):
  65. os.mkdir(datasummary_dir)
  66. ####1.执行cleandata,align
  67. ###执行输出的bam文件在/cgdata/pancancer/analyse/$laneid/bamfile
  68. def clean_align_v0(inputpath,laneid):
  69. lane_dir=os.path.join('/cgdata/pancancer/analyse', laneid)
  70. if not os.path.exists(lane_dir):
  71. os.mkdir(lane_dir)
  72. bam_dir = os.path.join(lane_dir, 'bamfile')
  73. if not os.path.exists(bam_dir):
  74. os.mkdir(bam_dir)
  75. sampledir=os.path.join(inputpath,laneid+ '_sample_infor_label.txt')
  76. samplelist = pd.read_table(sampledir, sep='\t')
  77. align = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/s1_noUMI_clean_align_20221027.pbs'
  78. for i in range(len(samplelist)):
  79. tumor = samplelist.loc[i, 'tumor']
  80. normal = samplelist.loc[i, 'normal']
  81. if pd.notnull(tumor):
  82. print(tumor)
  83. align_tumor_cmd = 'qsub -v sample=' + tumor + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + align
  84. os.system(align_tumor_cmd)
  85. if pd.notnull(normal):
  86. print(normal)
  87. align_normal_cmd = 'qsub -v sample=' + normal + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + align
  88. os.system(align_normal_cmd)
  89. ###修改,对于血液样本切除UMI,对于组织和白细胞对照样本不切除UMI
  90. def clean_align(inputpath,laneid):
  91. lane_dir=os.path.join('/cgdata/pancancer/analyse', laneid)
  92. if not os.path.exists(lane_dir):
  93. os.mkdir(lane_dir)
  94. bam_dir = os.path.join(lane_dir, 'bamfile')
  95. if not os.path.exists(bam_dir):
  96. os.mkdir(bam_dir)
  97. sampledir=os.path.join(inputpath,laneid+ '_sample_infor_label.txt')
  98. samplelist = pd.read_table(sampledir, sep='\t')
  99. #align = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/s1_noUMI_clean_align_20221027.pbs'
  100. alignBlood = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/s1_noUMI_clean_align_blood_20230529.pbs'
  101. alignFFPE = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/s1_noUMI_clean_align_FFPE_20230529.pbs'
  102. for i in range(len(samplelist)):
  103. tumor = samplelist.loc[i, 'tumor']
  104. normal = samplelist.loc[i, 'normal']
  105. sampletype=samplelist.loc[i, 'sampletype']
  106. if pd.notnull(tumor):
  107. print(tumor)
  108. if sampletype=='blood':
  109. align_tumor_cmd = 'qsub -v sample=' + tumor + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + alignBlood
  110. os.system(align_tumor_cmd)
  111. elif sampletype=='FFPE':
  112. align_tumor_cmd = 'qsub -v sample=' + tumor + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + alignFFPE
  113. os.system(align_tumor_cmd)
  114. if pd.notnull(normal):
  115. print(normal)
  116. align_normal_cmd = 'qsub -v sample=' + normal + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + alignFFPE
  117. os.system(align_normal_cmd)
  118. ###3.执行pair的操作(sv caling,cnv,MSI)
  119. ##需要判断肿瘤样本是来自组织还是血液,如果是血液,那么min_af=0.005 for blood,0.01 for tissue
  120. def pair(inputpath,laneid,sample,tumor,normal):
  121. pairdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_pair_20220929.pbs'
  122. lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
  123. bam_dir = os.path.join(lane_dir, 'bamfile')
  124. tumorlabel = tumor[-2]
  125. if tumorlabel == 'T':
  126. min_af = 0.01
  127. elif tumorlabel == 'C':
  128. min_af = 0.005
  129. svcall_cmd = 'qsub -v sample=' + sample + ',tumor=' + tumor + ',normal=' + normal + ',min_af=' + str(min_af) + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + pairdir
  130. os.system(svcall_cmd)
  131. ##3.1执行SNV配对
  132. def SNVpair(inputpath,laneid,sample,tumor,normal):
  133. pairdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_SNVpair_20221027.pbs'
  134. lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
  135. bam_dir = os.path.join(lane_dir, 'bamfile')
  136. tumorlabel = tumor[-2]
  137. if tumorlabel == 'T':
  138. min_af = 0.01
  139. elif tumorlabel == 'C':
  140. min_af = 0.005
  141. svcall_cmd = 'qsub -v sample=' + sample + ',tumor=' + tumor + ',normal=' + normal + ',min_af=' + str(min_af) + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + pairdir
  142. os.system(svcall_cmd)
  143. ###3.2执行CNV
  144. def CNVpair(inputpath,laneid,sample,tumor,normal):
  145. pairdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/s3_noUMI_CNV_cnvkit_pair_20220929.sh'
  146. lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
  147. bam_dir = os.path.join(lane_dir, 'bamfile')
  148. cnv_cmd = 'qsub -v sample=' + sample + ',tumor=' + tumor + ',normal=' + normal + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + pairdir
  149. os.system(cnv_cmd)
  150. ###执行3.3MSI
  151. def MSIpair(inputpath,laneid,sample,tumor,normal):
  152. pairdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/s4_noUMI_MSI_15STR_msisensor2_pair_20220929.sh'
  153. lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
  154. bam_dir = os.path.join(lane_dir, 'bamfile')
  155. MSI_cmd = 'qsub -v sample=' + sample + ',tumor=' + tumor + ',normal=' + normal + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + pairdir
  156. os.system(MSI_cmd)
  157. ####4.unpair(germline,HL,HLA) for control sample
  158. def unpair_control(inputpath,laneid,normal):
  159. lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
  160. bam_dir = os.path.join(lane_dir, 'bamfile')
  161. unpair = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_unpair_control_20220929.pbs'
  162. unpair_cmd = 'qsub -v sample=' + normal + ',inputpath=' + inputpath + ',bam_dir=' +bam_dir+ ' ' + unpair
  163. os.system(unpair_cmd)
  164. ####5.unpair(Genefuse) for tumor sample
  165. def unpair_case(inputpath,tumor):
  166. unpair = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_unpair_case_20220929.pbs'
  167. unpair_cmd = 'qsub -v tumor=' + tumor + ',inputpath=' + inputpath + ' ' + unpair
  168. os.system(unpair_cmd)
  169. ####6.unpair_qc(QC,ontarget,coverage) for normal and tumor
  170. def unpair_QC(inputpath,laneid,sample):
  171. lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
  172. bam_dir = os.path.join(lane_dir, 'bamfile')
  173. unpair = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_unpair_qc_20221027.pbs'
  174. unpair_tumor_cmd = 'qsub -v sample=' + sample + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + unpair
  175. os.system(unpair_tumor_cmd)
  176. #inputpath='/cgdata/liuxiangqiong/work62pancancer/runtest/laneruntest'
  177. #laneid='laneruntest'
  178. #####
  179. #1.首先执行mapping
  180. #clean_align(inputpath,laneid)
  181. #2.判断log文件是否已经生成完毕。
  182. #2.1)如果单个文件的log生成了,那么执行单个样本对应的命令
  183. #2.1.1) 对于tumor,执行unpair_case(inputpath,tumor),unpair_QC(inputpath,laneid,sample)
  184. #2.1.2)对于normal,执行unpair_control(inputpath,laneid,normal),unpair_QC(inputpath,laneid,sample)
  185. #2.2)如果样本的tumor和normal对应的bam均生成了,以上两个程序均要执行,如果执行了,就不重复执行。完毕后执行配对样本的分析命令
  186. #对于配对的,执行pair(inputpath,laneid,sample,tumor,normal)
  187. def classify_1(data1,data_classify):
  188. if pd.isna(data1["normal"]):
  189. if re.search("CT$",data1["tumor"]):
  190. data_classify[data1["samplename"]] = "tumor_CT"
  191. else:
  192. data_classify[data1["samplename"]] = "tumor_TT"
  193. elif pd.isna(data1["tumor"]):
  194. data_classify[data1["samplename"]] = "normal"
  195. else:
  196. if re.search("CT$", data1["tumor"]):
  197. data_classify[data1["samplename"] + " "] = "tumor_CT"
  198. data_classify[data1["samplename"]] = "normal_tumor_CT"
  199. else:
  200. data_classify[data1["samplename"] + " "] = "tumor_TT"
  201. data_classify[data1["samplename"]] = "normal_tumor_TT"
  202. data_classify[data1["samplename"]+" "] = "normal"
  203. def get_logfile(path,file_list):
  204. dir_list = os.listdir(path)
  205. for x in dir_list:
  206. new_x = os.path.join(path,x)
  207. if os.path.isdir(new_x):
  208. get_logfile(new_x,file_list)
  209. else:
  210. file_tuple = os.path.splitext(new_x)
  211. if file_tuple[1] == '.log':
  212. file_list.append(x.split("_")[0])
  213. return file_list
  214. def sample_1(data_classify, file_list, inputpath, laneid):
  215. sample_accomplish = []
  216. for sam in data_classify:
  217. if data_classify[sam] == "normal":
  218. sample_name_1 = sam.strip() + "CN"
  219. if sample_name_1 in file_list:
  220. print('deal with the normal data including QC,germline,HLA,chemthrethy')
  221. sample = sam.strip()
  222. unpair_control(inputpath, laneid, sample_name_1)
  223. unpair_QC(inputpath, laneid, sample_name_1)
  224. sample_accomplish.append(sam)
  225. if data_classify[sam] == "tumor_TT":
  226. sample_name_2 = sam.strip() + "TT"
  227. if sample_name_2 in file_list:
  228. print('deal with the tumor data for the tissue including QC,fusion')
  229. unpair_case(inputpath,sample_name_2)
  230. unpair_QC(inputpath,laneid,sample_name_2)
  231. sample_accomplish.append(sam)
  232. if data_classify[sam] == "tumor_CT":
  233. sample_name_2 = sam.strip() + "CT"
  234. if sample_name_2 in file_list:
  235. #print(33)
  236. print('deal with the tumor data for the blood including QC,fusion')
  237. unpair_case(inputpath, sample_name_2)
  238. unpair_QC(inputpath, laneid, sample_name_2)
  239. sample_accomplish.append(sam)
  240. if data_classify[sam] == "normal_tumor_TT":
  241. sample_name_1 = sam + "CN"
  242. sample_name_2 = sam + "TT"
  243. if (sample_name_1 in file_list) and (sample_name_2 in file_list):
  244. #print(44)
  245. print('deal with the pair data for the tissue including varscan and vardict mutation')
  246. sample=sam.strip()
  247. SNVpair(inputpath, laneid, sample, sample_name_2, sample_name_1 )
  248. CNVpair(inputpath, laneid, sample, sample_name_2, sample_name_1)
  249. MSIpair(inputpath, laneid, sample, sample_name_2, sample_name_1)
  250. sample_accomplish.append(sam)
  251. if data_classify[sam] == "normal_tumor_CT":
  252. sample_name_1 = sam + "CN"
  253. sample_name_2 = sam + "CT"
  254. if (sample_name_1 in file_list) and (sample_name_2 in file_list):
  255. #print(55)
  256. print('deal with the pair data for the blood including varscan and vardict mutation')
  257. sample = sam.strip()
  258. SNVpair(inputpath, laneid, sample, sample_name_2, sample_name_1)
  259. CNVpair(inputpath, laneid, sample, sample_name_2, sample_name_1)
  260. MSIpair(inputpath, laneid, sample, sample_name_2, sample_name_1)
  261. sample_accomplish.append(sam)
  262. for sam1 in sample_accomplish:
  263. data_classify.pop(sam1)
  264. return data_classify
  265. def run_1(inputpath,time_sleep, laneid):
  266. Sample_list = os.path.join(inputpath, laneid + '_' + 'sample_infor_label.txt')
  267. data_classify = {}
  268. data = pd.read_csv(Sample_list, header=0, sep="\t")
  269. data.apply(classify_1,axis=1,args=(data_classify,))
  270. print(data_classify)
  271. analis_dir(inputpath)
  272. clean_align(inputpath, laneid)
  273. while 1:
  274. print("未处理样本:")
  275. print(data_classify)
  276. file_list = []
  277. path = inputpath
  278. get_logfile(path, file_list)
  279. if len(data_classify) == 0:
  280. print("结束")
  281. break
  282. data_classify = sample_1(data_classify, file_list, inputpath, laneid)
  283. print("{}秒后继续".format(time_sleep))
  284. time.sleep(time_sleep)
  285. #Sample_list='laneruntest_sample_infor_label.txt'
  286. #time_sleep=15
  287. #inputpath='/cgdata/liuxiangqiong/work62pancancer/runtest/laneruntest'
  288. #laneid='laneruntest'
  289. #Sample_list=os.path.join(inputpath,laneid+'_'+ 'sample_infor_label.txt')
  290. #run_1(inputpath,time_sleep, laneid)
  291. if __name__=='__main__':
  292. parser = argparse.ArgumentParser(description='the main script for pancancer')
  293. parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
  294. parser.add_argument('-l', '--laneid', type=str, help='the laneid')
  295. args = parser.parse_args()
  296. Inputpath = args.inputpath
  297. Laneid=args.laneid
  298. time_sleep = 1200
  299. run_1(Inputpath, time_sleep, Laneid)