main_20221026_v0.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308
  1. #edit the file
  2. #1.clean data
  3. #2.align
  4. #3.sv callin
  5. import sys,collections,math,os,os.path,re
  6. import pandas as pd
  7. from pandas.core.frame import DataFrame
  8. import argparse
  9. import numpy as np
  10. import time
  11. import re
  12. sys.path.append('/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/noUMI_v0/')
  13. import s11_noUMI_QCsummary as qcsummary
  14. import s12_noUMI_somatic_UMI_20220128 as somatic
  15. ##0.creat the analysis path
  16. def analis_dir(inputpath):
  17. svdir1 = os.path.join(inputpath, '1SV_varscan_pair')
  18. if not os.path.exists(svdir1):
  19. os.mkdir(svdir1)
  20. svdir2= os.path.join(inputpath, '1SV_vardict_pair')
  21. if not os.path.exists(svdir2):
  22. os.mkdir(svdir2)
  23. # for CNV
  24. CNVdir = os.path.join(inputpath, '2CNV_cnvkit_pair')
  25. if not os.path.exists(CNVdir):
  26. os.mkdir(CNVdir)
  27. # for MSI
  28. MSI_dir = os.path.join(inputpath, '3MSI_msisensor2_pair')
  29. if not os.path.exists(MSI_dir):
  30. os.mkdir(MSI_dir)
  31. # for germline result
  32. gemerlinedir = os.path.join(inputpath, '4Germline_unpair')
  33. if not os.path.exists(gemerlinedir):
  34. os.mkdir(gemerlinedir)
  35. # for HL result
  36. HLdir = os.path.join(inputpath, '5HL_gatk_unpair')
  37. if not os.path.exists(HLdir):
  38. os.mkdir(HLdir)
  39. # for fusion
  40. fusion_method1_dir = os.path.join(inputpath, '6Fusion_manta_pair')
  41. if not os.path.exists(fusion_method1_dir):
  42. os.mkdir(fusion_method1_dir)
  43. fusion_method2_dir = os.path.join(inputpath, '6Fusion_genefusion_unpair')
  44. if not os.path.exists(fusion_method2_dir):
  45. os.mkdir(fusion_method2_dir)
  46. #for HLA
  47. HLA_dir = os.path.join(inputpath, '7HLA-HD_unpair')
  48. if not os.path.exists(HLA_dir):
  49. os.mkdir(HLA_dir)
  50. # for qc
  51. qc_dir = os.path.join(inputpath, '8Fastqc')
  52. if not os.path.exists(qc_dir):
  53. os.mkdir(qc_dir)
  54. # for ontarget
  55. ontarget_dir = os.path.join(inputpath, '9Ontarget')
  56. if not os.path.exists(ontarget_dir):
  57. os.mkdir(ontarget_dir)
  58. # for coverage
  59. coverage_dir = os.path.join(inputpath, '10Coverage')
  60. if not os.path.exists(coverage_dir):
  61. os.mkdir(coverage_dir)
  62. # datasummary
  63. datasummary_dir = os.path.join(inputpath, 'datasummary')
  64. if not os.path.exists(datasummary_dir):
  65. os.mkdir(datasummary_dir)
  66. ####1.执行cleandata,align
  67. ###执行输出的bam文件在/cgdata/pancancer/analyse/$laneid/bamfile
  68. def clean_align(inputpath,laneid):
  69. lane_dir=os.path.join('/cgdata/pancancer/analyse', laneid)
  70. if not os.path.exists(lane_dir):
  71. os.mkdir(lane_dir)
  72. bam_dir = os.path.join(lane_dir, 'bamfile')
  73. if not os.path.exists(bam_dir):
  74. os.mkdir(bam_dir)
  75. sampledir=os.path.join(inputpath,laneid+ '_sample_infor_label.txt')
  76. samplelist = pd.read_table(sampledir, sep='\t')
  77. align = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/s1_noUMI_clean_align_20220705.pbs'
  78. for i in range(len(samplelist)):
  79. tumor = samplelist.loc[i, 'tumor']
  80. normal = samplelist.loc[i, 'normal']
  81. if pd.notnull(tumor):
  82. print(tumor)
  83. align_tumor_cmd = 'qsub -v sample=' + tumor + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + align
  84. os.system(align_tumor_cmd)
  85. if pd.notnull(normal):
  86. print(normal)
  87. align_normal_cmd = 'qsub -v sample=' + normal + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + align
  88. os.system(align_normal_cmd)
  89. ###3.执行pair的操作(sv caling,cnv,MSI)
  90. ##需要判断肿瘤样本是来自组织还是血液,如果是血液,那么min_af=0.005 for blood,0.01 for tissue
  91. def pair(inputpath,laneid,sample,tumor,normal):
  92. pairdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_pair_20220929.pbs'
  93. lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
  94. bam_dir = os.path.join(lane_dir, 'bamfile')
  95. tumorlabel = tumor[-2]
  96. if tumorlabel == 'T':
  97. min_af = 0.01
  98. elif tumorlabel == 'C':
  99. min_af = 0.005
  100. svcall_cmd = 'qsub -v sample=' + sample + ',tumor=' + tumor + ',normal=' + normal + ',min_af=' + str(min_af) + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + pairdir
  101. os.system(svcall_cmd)
  102. ##3.1执行SNV配对
  103. def SNVpair(inputpath,laneid,sample,tumor,normal):
  104. pairdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_SNVpair_20221026.pbs'
  105. lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
  106. bam_dir = os.path.join(lane_dir, 'bamfile')
  107. tumorlabel = tumor[-2]
  108. if tumorlabel == 'T':
  109. min_af = 0.01
  110. elif tumorlabel == 'C':
  111. min_af = 0.005
  112. svcall_cmd = 'qsub -v sample=' + sample + ',tumor=' + tumor + ',normal=' + normal + ',min_af=' + str(min_af) + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + pairdir
  113. os.system(svcall_cmd)
  114. ###3.2执行CNV
  115. def CNVpair(inputpath,laneid,sample,tumor,normal):
  116. pairdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/s3_noUMI_CNV_cnvkit_pair_20220929.sh'
  117. lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
  118. bam_dir = os.path.join(lane_dir, 'bamfile')
  119. cnv_cmd = 'qsub -v sample=' + sample + ',tumor=' + tumor + ',normal=' + normal + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + pairdir
  120. os.system(cnv_cmd)
  121. ###执行3.3MSI
  122. def MSIpair(inputpath,laneid,sample,tumor,normal):
  123. pairdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/s4_noUMI_MSI_15STR_msisensor2_pair_20220929.sh'
  124. lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
  125. bam_dir = os.path.join(lane_dir, 'bamfile')
  126. MSI_cmd = 'qsub -v sample=' + sample + ',tumor=' + tumor + ',normal=' + normal + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + pairdir
  127. os.system(MSI_cmd)
  128. ####4.unpair(germline,HL,HLA) for control sample
  129. def unpair_control(inputpath,laneid,normal):
  130. lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
  131. bam_dir = os.path.join(lane_dir, 'bamfile')
  132. unpair = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_unpair_control_20220929.pbs'
  133. unpair_cmd = 'qsub -v sample=' + normal + ',inputpath=' + inputpath + ',bam_dir=' +bam_dir+ ' ' + unpair
  134. os.system(unpair_cmd)
  135. ####5.unpair(Genefuse) for tumor sample
  136. def unpair_case(inputpath,tumor):
  137. unpair = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_unpair_case_20220929.pbs'
  138. unpair_cmd = 'qsub -v tumor=' + tumor + ',inputpath=' + inputpath + ' ' + unpair
  139. os.system(unpair_cmd)
  140. ####6.unpair_qc(QC,ontarget,coverage) for normal and tumor
  141. def unpair_QC(inputpath,laneid,sample):
  142. lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
  143. bam_dir = os.path.join(lane_dir, 'bamfile')
  144. unpair = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_unpair_qc_20220929.pbs'
  145. unpair_tumor_cmd = 'qsub -v sample=' + sample + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + unpair
  146. os.system(unpair_tumor_cmd)
  147. #inputpath='/cgdata/liuxiangqiong/work62pancancer/runtest/laneruntest'
  148. #laneid='laneruntest'
  149. #####
  150. #1.首先执行mapping
  151. #clean_align(inputpath,laneid)
  152. #2.判断log文件是否已经生成完毕。
  153. #2.1)如果单个文件的log生成了,那么执行单个样本对应的命令
  154. #2.1.1) 对于tumor,执行unpair_case(inputpath,tumor),unpair_QC(inputpath,laneid,sample)
  155. #2.1.2)对于normal,执行unpair_control(inputpath,laneid,normal),unpair_QC(inputpath,laneid,sample)
  156. #2.2)如果样本的tumor和normal对应的bam均生成了,以上两个程序均要执行,如果执行了,就不重复执行。完毕后执行配对样本的分析命令
  157. #对于配对的,执行pair(inputpath,laneid,sample,tumor,normal)
  158. def classify_1(data1,data_classify):
  159. if pd.isna(data1["normal"]):
  160. if re.search("CT$",data1["tumor"]):
  161. data_classify[data1["samplename"]] = "tumor_CT"
  162. else:
  163. data_classify[data1["samplename"]] = "tumor_TT"
  164. elif pd.isna(data1["tumor"]):
  165. data_classify[data1["samplename"]] = "normal"
  166. else:
  167. if re.search("CT$", data1["tumor"]):
  168. data_classify[data1["samplename"] + " "] = "tumor_CT"
  169. data_classify[data1["samplename"]] = "normal_tumor_CT"
  170. else:
  171. data_classify[data1["samplename"] + " "] = "tumor_TT"
  172. data_classify[data1["samplename"]] = "normal_tumor_TT"
  173. data_classify[data1["samplename"]+" "] = "normal"
  174. def get_logfile(path,file_list):
  175. dir_list = os.listdir(path)
  176. for x in dir_list:
  177. new_x = os.path.join(path,x)
  178. if os.path.isdir(new_x):
  179. get_logfile(new_x,file_list)
  180. else:
  181. file_tuple = os.path.splitext(new_x)
  182. if file_tuple[1] == '.log':
  183. file_list.append(x.split("_")[0])
  184. return file_list
  185. def sample_1(data_classify, file_list, inputpath, laneid):
  186. sample_accomplish = []
  187. for sam in data_classify:
  188. if data_classify[sam] == "normal":
  189. sample_name_1 = sam.strip() + "CN"
  190. if sample_name_1 in file_list:
  191. print('deal with the normal data including QC,germline,HLA,chemthrethy')
  192. sample = sam.strip()
  193. unpair_control(inputpath, laneid, sample_name_1)
  194. unpair_QC(inputpath, laneid, sample_name_1)
  195. sample_accomplish.append(sam)
  196. if data_classify[sam] == "tumor_TT":
  197. sample_name_2 = sam.strip() + "TT"
  198. if sample_name_2 in file_list:
  199. print('deal with the tumor data for the tissue including QC,fusion')
  200. unpair_case(inputpath,sample_name_2)
  201. unpair_QC(inputpath,laneid,sample_name_2)
  202. sample_accomplish.append(sam)
  203. if data_classify[sam] == "tumor_CT":
  204. sample_name_2 = sam.strip() + "CT"
  205. if sample_name_2 in file_list:
  206. #print(33)
  207. print('deal with the tumor data for the blood including QC,fusion')
  208. unpair_case(inputpath, sample_name_2)
  209. unpair_QC(inputpath, laneid, sample_name_2)
  210. sample_accomplish.append(sam)
  211. if data_classify[sam] == "normal_tumor_TT":
  212. sample_name_1 = sam + "CN"
  213. sample_name_2 = sam + "TT"
  214. if (sample_name_1 in file_list) and (sample_name_2 in file_list):
  215. #print(44)
  216. print('deal with the pair data for the tissue including varscan and vardict mutation')
  217. sample=sam.strip()
  218. SNVpair(inputpath, laneid, sample, sample_name_2, sample_name_1 )
  219. CNVpair(inputpath, laneid, sample, sample_name_2, sample_name_1)
  220. MSIpair(inputpath, laneid, sample, sample_name_2, sample_name_1)
  221. sample_accomplish.append(sam)
  222. if data_classify[sam] == "normal_tumor_CT":
  223. sample_name_1 = sam + "CN"
  224. sample_name_2 = sam + "CT"
  225. if (sample_name_1 in file_list) and (sample_name_2 in file_list):
  226. #print(55)
  227. print('deal with the pair data for the blood including varscan and vardict mutation')
  228. sample = sam.strip()
  229. SNVpair(inputpath, laneid, sample, sample_name_2, sample_name_1)
  230. CNVpair(inputpath, laneid, sample, sample_name_2, sample_name_1)
  231. MSIpair(inputpath, laneid, sample, sample_name_2, sample_name_1)
  232. sample_accomplish.append(sam)
  233. for sam1 in sample_accomplish:
  234. data_classify.pop(sam1)
  235. return data_classify
  236. def run_1(inputpath,time_sleep, laneid):
  237. Sample_list = os.path.join(inputpath, laneid + '_' + 'sample_infor_label.txt')
  238. data_classify = {}
  239. data = pd.read_csv(Sample_list, header=0, sep="\t")
  240. data.apply(classify_1,axis=1,args=(data_classify,))
  241. print(data_classify)
  242. analis_dir(inputpath)
  243. clean_align(inputpath, laneid)
  244. while 1:
  245. print("未处理样本:")
  246. print(data_classify)
  247. file_list = []
  248. path = inputpath
  249. get_logfile(path, file_list)
  250. if len(data_classify) == 0:
  251. print("结束")
  252. break
  253. data_classify = sample_1(data_classify, file_list, inputpath, laneid)
  254. print("{}秒后继续".format(time_sleep))
  255. time.sleep(time_sleep)
  256. #Sample_list='laneruntest_sample_infor_label.txt'
  257. #time_sleep=15
  258. #inputpath='/cgdata/liuxiangqiong/work62pancancer/runtest/laneruntest'
  259. #laneid='laneruntest'
  260. #Sample_list=os.path.join(inputpath,laneid+'_'+ 'sample_infor_label.txt')
  261. #run_1(inputpath,time_sleep, laneid)
  262. if __name__=='__main__':
  263. parser = argparse.ArgumentParser(description='the main script for pancancer')
  264. parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
  265. parser.add_argument('-l', '--laneid', type=str, help='the laneid')
  266. args = parser.parse_args()
  267. Inputpath = args.inputpath
  268. Laneid=args.laneid
  269. time_sleep = 1200
  270. run_1(Inputpath, time_sleep, Laneid)