main_20220802_v0test.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
  1. #edit the file
  2. #1.clean data
  3. #2.align
  4. #3.sv callin
  5. import sys,collections,math,os,os.path,re
  6. import pandas as pd
  7. from pandas.core.frame import DataFrame
  8. import argparse
  9. import numpy as np
  10. import time
  11. import re
  12. sys.path.append('/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/noUMI_v0/')
  13. import s11_noUMI_QCsummary as qcsummary
  14. import s12_noUMI_somatic_UMI_20220128 as somatic
  15. ##0.creat the analysis path
  16. def analis_dir(inputpath):
  17. svdir1 = os.path.join(inputpath, '1SV_varscan_pair')
  18. if not os.path.exists(svdir1):
  19. os.mkdir(svdir1)
  20. svdir2= os.path.join(inputpath, '1SV_vardict_pair')
  21. if not os.path.exists(svdir2):
  22. os.mkdir(svdir2)
  23. # for CNV
  24. CNVdir = os.path.join(inputpath, '2CNV_cnvkit_pair')
  25. if not os.path.exists(CNVdir):
  26. os.mkdir(CNVdir)
  27. # for MSI
  28. MSI_dir = os.path.join(inputpath, '3MSI_msisensor2_pair')
  29. if not os.path.exists(MSI_dir):
  30. os.mkdir(MSI_dir)
  31. # for germline result
  32. gemerlinedir = os.path.join(inputpath, '4Germline_unpair')
  33. if not os.path.exists(gemerlinedir):
  34. os.mkdir(gemerlinedir)
  35. # for HL result
  36. HLdir = os.path.join(inputpath, '5HL_gatk_unpair')
  37. if not os.path.exists(HLdir):
  38. os.mkdir(HLdir)
  39. # for fusion
  40. fusion_method1_dir = os.path.join(inputpath, '6Fusion_manta_pair')
  41. if not os.path.exists(fusion_method1_dir):
  42. os.mkdir(fusion_method1_dir)
  43. fusion_method2_dir = os.path.join(inputpath, '6Fusion_genefusion_unpair')
  44. if not os.path.exists(fusion_method2_dir):
  45. os.mkdir(fusion_method2_dir)
  46. #for HLA
  47. HLA_dir = os.path.join(inputpath, '7HLA-HD_unpair')
  48. if not os.path.exists(HLA_dir):
  49. os.mkdir(HLA_dir)
  50. # for qc
  51. qc_dir = os.path.join(inputpath, '8Fastqc')
  52. if not os.path.exists(qc_dir):
  53. os.mkdir(qc_dir)
  54. # for ontarget
  55. ontarget_dir = os.path.join(inputpath, '9Ontarget')
  56. if not os.path.exists(ontarget_dir):
  57. os.mkdir(ontarget_dir)
  58. # for coverage
  59. coverage_dir = os.path.join(inputpath, '10Coverage')
  60. if not os.path.exists(coverage_dir):
  61. os.mkdir(coverage_dir)
  62. # datasummary
  63. datasummary_dir = os.path.join(inputpath, 'datasummary')
  64. if not os.path.exists(datasummary_dir):
  65. os.mkdir(datasummary_dir)
  66. ####1.执行cleandata,align
  67. ###执行输出的bam文件在/cgdata/pancancer/analyse/$laneid/bamfile
  68. def clean_align(inputpath,laneid):
  69. lane_dir=os.path.join('/cgdata/pancancer/analyse', laneid)
  70. if not os.path.exists(lane_dir):
  71. os.mkdir(lane_dir)
  72. bam_dir = os.path.join(lane_dir, 'bamfile')
  73. if not os.path.exists(bam_dir):
  74. os.mkdir(bam_dir)
  75. sampledir=os.path.join(inputpath,laneid+ '_sample_infor_label.txt')
  76. samplelist = pd.read_table(sampledir, sep='\t')
  77. align = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/s1_noUMI_clean_align_20220705.pbs'
  78. for i in range(len(samplelist)):
  79. tumor = samplelist.loc[i, 'tumor']
  80. normal = samplelist.loc[i, 'normal']
  81. if pd.notnull(tumor):
  82. print(tumor)
  83. align_tumor_cmd = 'qsub -v sample=' + tumor + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + align
  84. os.system(align_tumor_cmd)
  85. if pd.notnull(normal):
  86. print(normal)
  87. align_normal_cmd = 'qsub -v sample=' + normal + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + align
  88. os.system(align_normal_cmd)
  89. ###3.执行pair的操作(sv caling,cnv,MSI)
  90. ##需要判断肿瘤样本是来自组织还是血液,如果是血液,那么min_af=0.005 for blood,0.01 for tissue
  91. def pair(inputpath,laneid,sample,tumor,normal):
  92. pairdir = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_pair_20220705.pbs'
  93. lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
  94. bam_dir = os.path.join(lane_dir, 'bamfile')
  95. tumorlabel = tumor[-2]
  96. if tumorlabel == 'T':
  97. min_af = 0.01
  98. elif tumorlabel == 'C':
  99. min_af = 0.005
  100. svcall_cmd = 'qsub -v sample=' + sample + ',tumor=' + tumor + ',normal=' + normal + ',min_af=' + str(min_af) + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + pairdir
  101. os.system(svcall_cmd)
  102. ####4.unpair(germline,HL,HLA) for control sample
  103. def unpair_control(inputpath,laneid,normal):
  104. lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
  105. bam_dir = os.path.join(lane_dir, 'bamfile')
  106. unpair = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_unpair_control_20220705.pbs'
  107. unpair_cmd = 'qsub -v sample=' + normal + ',inputpath=' + inputpath + ',bam_dir=' +bam_dir+ ' ' + unpair
  108. os.system(unpair_cmd)
  109. ####5.unpair(Genefuse) for tumor sample
  110. def unpair_case(inputpath,tumor):
  111. unpair = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_unpair_case_20220705.pbs'
  112. unpair_cmd = 'qsub -v tumor=' + tumor + ',inputpath=' + inputpath + ' ' + unpair
  113. os.system(unpair_cmd)
  114. ####6.unpair_qc(QC,ontarget,coverage) for normal and tumor
  115. def unpair_QC(inputpath,laneid,sample):
  116. lane_dir = os.path.join('/cgdata/pancancer/analyse', laneid)
  117. bam_dir = os.path.join(lane_dir, 'bamfile')
  118. unpair = '/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/run_unpair_qc_20220705.pbs'
  119. unpair_tumor_cmd = 'qsub -v sample=' + sample + ',inputpath=' + inputpath + ',bam_dir=' + bam_dir + ' ' + unpair
  120. os.system(unpair_tumor_cmd)
  121. #inputpath='/cgdata/liuxiangqiong/work62pancancer/runtest/laneruntest'
  122. #laneid='laneruntest'
  123. #####
  124. #1.首先执行mapping
  125. #clean_align(inputpath,laneid)
  126. #2.判断log文件是否已经生成完毕。
  127. #2.1)如果单个文件的log生成了,那么执行单个样本对应的命令
  128. #2.1.1) 对于tumor,执行unpair_case(inputpath,tumor),unpair_QC(inputpath,laneid,sample)
  129. #2.1.2)对于normal,执行unpair_control(inputpath,laneid,normal),unpair_QC(inputpath,laneid,sample)
  130. #2.2)如果样本的tumor和normal对应的bam均生成了,以上两个程序均要执行,如果执行了,就不重复执行。完毕后执行配对样本的分析命令
  131. #对于配对的,执行pair(inputpath,laneid,sample,tumor,normal)
  132. def classify_1(data1,data_classify):
  133. if pd.isna(data1["normal"]):
  134. if re.search("CT$",data1["tumor"]):
  135. data_classify[data1["samplename"]] = "tumor_CT"
  136. else:
  137. data_classify[data1["samplename"]] = "tumor_TT"
  138. elif pd.isna(data1["tumor"]):
  139. data_classify[data1["samplename"]] = "normal"
  140. else:
  141. if re.search("CT$", data1["tumor"]):
  142. data_classify[data1["samplename"] + " "] = "tumor_CT"
  143. data_classify[data1["samplename"]] = "normal_tumor_CT"
  144. else:
  145. data_classify[data1["samplename"] + " "] = "tumor_TT"
  146. data_classify[data1["samplename"]] = "normal_tumor_TT"
  147. data_classify[data1["samplename"]+" "] = "normal"
  148. def get_logfile(path,file_list):
  149. dir_list = os.listdir(path)
  150. for x in dir_list:
  151. new_x = os.path.join(path,x)
  152. if os.path.isdir(new_x):
  153. get_logfile(new_x,file_list)
  154. else:
  155. file_tuple = os.path.splitext(new_x)
  156. if file_tuple[1] == '.log':
  157. file_list.append(x.split("_")[0])
  158. return file_list
  159. def sample_1(data_classify, file_list, inputpath, laneid):
  160. sample_accomplish = []
  161. for sam in data_classify:
  162. if data_classify[sam] == "normal":
  163. sample_name_1 = sam.strip() + "CN"
  164. if sample_name_1 in file_list:
  165. print('deal with the normal data including QC,germline,HLA,chemthrethy')
  166. sample = sam.strip()
  167. unpair_control(inputpath, laneid, sample_name_1)
  168. unpair_QC(inputpath, laneid, sample_name_1)
  169. sample_accomplish.append(sam)
  170. if data_classify[sam] == "tumor_TT":
  171. sample_name_2 = sam.strip() + "TT"
  172. if sample_name_2 in file_list:
  173. print('deal with the tumor data for the tissue including QC,fusion')
  174. unpair_case(inputpath,sample_name_2)
  175. unpair_QC(inputpath,laneid,sample_name_2)
  176. sample_accomplish.append(sam)
  177. if data_classify[sam] == "tumor_CT":
  178. sample_name_2 = sam.strip() + "CT"
  179. if sample_name_2 in file_list:
  180. #print(33)
  181. print('deal with the tumor data for the blood including QC,fusion')
  182. unpair_case(inputpath, sample_name_2)
  183. unpair_QC(inputpath, laneid, sample_name_2)
  184. sample_accomplish.append(sam)
  185. if data_classify[sam] == "normal_tumor_TT":
  186. sample_name_1 = sam + "CN"
  187. sample_name_2 = sam + "TT"
  188. if (sample_name_1 in file_list) and (sample_name_2 in file_list):
  189. #print(44)
  190. print('deal with the pair data for the tissue including varscan and vardict mutation')
  191. sample=sam.strip()
  192. pair(inputpath, laneid, sample, sample_name_1, sample_name_2 )
  193. sample_accomplish.append(sam)
  194. if data_classify[sam] == "normal_tumor_CT":
  195. sample_name_1 = sam + "CN"
  196. sample_name_2 = sam + "CT"
  197. if (sample_name_1 in file_list) and (sample_name_2 in file_list):
  198. #print(55)
  199. print('deal with the pair data for the blood including varscan and vardict mutation')
  200. sample = sam.strip()
  201. pair(inputpath, laneid, sample, sample_name_1, sample_name_2)
  202. sample_accomplish.append(sam)
  203. for sam1 in sample_accomplish:
  204. data_classify.pop(sam1)
  205. return data_classify
  206. def run_1(inputpath,time_sleep, laneid):
  207. Sample_list = os.path.join(inputpath, laneid + '_' + 'sample_infor_label.txt')
  208. data_classify = {}
  209. data = pd.read_csv(Sample_list, header=0, sep="\t")
  210. data.apply(classify_1,axis=1,args=(data_classify,))
  211. print(data_classify)
  212. analis_dir(inputpath)
  213. clean_align(inputpath, laneid)
  214. while 1:
  215. print("未处理样本:")
  216. print(data_classify)
  217. file_list = []
  218. path = inputpath
  219. get_logfile(path, file_list)
  220. if len(data_classify) == 0:
  221. print("结束")
  222. break
  223. data_classify = sample_1(data_classify, file_list, inputpath, laneid)
  224. print("{}秒后继续".format(time_sleep))
  225. time.sleep(time_sleep)
  226. #Sample_list='laneruntest_sample_infor_label.txt'
  227. #time_sleep=15
  228. #inputpath='/cgdata/liuxiangqiong/work62pancancer/runtest/laneruntest'
  229. #laneid='laneruntest'
  230. #Sample_list=os.path.join(inputpath,laneid+'_'+ 'sample_infor_label.txt')
  231. #run_1(inputpath,time_sleep, laneid)
  232. if __name__=='__main__':
  233. parser = argparse.ArgumentParser(description='the main script for pancancer')
  234. parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
  235. parser.add_argument('-l', '--laneid', type=str, help='the laneid')
  236. args = parser.parse_args()
  237. Inputpath = args.inputpath
  238. Laneid=args.laneid
  239. time_sleep = 1200
  240. run_1(Inputpath, time_sleep, Laneid)