# datafile_QC_v0_20230406_finish.py

import sys, collections, math, os, os.path, re
from collections import Counter
import pandas as pd
from pandas.core.frame import DataFrame
import argparse
import xlrd
import subprocess
from time import time
from tqdm import tqdm
import threading
from multiprocessing.pool import ThreadPool  # thread pool
from openpyxl import Workbook, load_workbook
from openpyxl.styles import PatternFill, Border, Side, Alignment, Protection, Font
#### Excel font-color tagging
# file: a file path or a Workbook object; the first call returns a Workbook object,
#       which can be passed back in directly for further operations
# column_1: the Excel column to operate on, A-Z (e.g. column 1 is A, column 5 is E)
# type_1: comparison type, 1 is >, 2 is <, 3 is ==
# conditions_1: threshold; e.g. with type_1=1 and conditions_1=10, values greater than 10 are recolored
# color_1: color applied to values that meet the condition; one of red, blue, green, black
# reverse=False: color applied to values that do not meet the condition (default black); one of red, blue, green, black
# head=True: whether the sheet has a header row; default True
# percentage=False: whether the values are percentages; default False, set True for percentage values
def excel_font_color(file, column_1, type_1, conditions_1, color_1, reverse=False, head=True, percentage=False):
    if isinstance(file, Workbook):
        wb2 = file
        ws2 = wb2.active
    else:
        wb2 = load_workbook(file)
        ws2 = wb2.active
    color = {"red": "ff000c", "green": "04ff00", "blue": "0006ff", "black": "000000"}
    bold_itatic_24_font_2 = Font(color=color[color_1])
    bold_itatic_24_font_3 = False
    if reverse:
        bold_itatic_24_font_3 = Font(color=color[reverse])
    num = 0
    if head:
        num = 1
    for i in ws2[column_1]:
        if num == 1:
            num = 0
            continue
        if num == 0:
            if "%" in str(i.value):
                print("Column {} contains '%'; consider setting percentage=True (ignore this warning if not needed)".format(column_1))
            num = 2
        # Percentage cells can be read with different value types, so handle them separately
        if percentage:
            if i.number_format == "General":
                vAlue = float(i.value[:-1]) / 100
            else:
                vAlue = i.value
        else:
            vAlue = i.value
        # Apply the font color to the cell
        if type_1 == 1:
            if vAlue > conditions_1:
                i.font = bold_itatic_24_font_2
            else:
                if bold_itatic_24_font_3:
                    i.font = bold_itatic_24_font_3
        if type_1 == 2:
            if vAlue < conditions_1:
                i.font = bold_itatic_24_font_2
            else:
                if bold_itatic_24_font_3:
                    i.font = bold_itatic_24_font_3
        if type_1 == 3:
            if vAlue == conditions_1:
                i.font = bold_itatic_24_font_2
            else:
                if bold_itatic_24_font_3:
                    i.font = bold_itatic_24_font_3
    return wb2
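# Usage sketch (the workbook filenames below are hypothetical; the calls mirror how the
# function is used in QCreprot2_samtools further down):
#   wb = excel_font_color("report.xlsx", "B", 2, 1000, "red")        # column B values below 1000 turn red
#   wb = excel_font_color(wb, "D", 2, 0.8, "red", percentage=True)   # percentage column, 80% threshold
#   wb.save("report_colored.xlsx")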
############ data summary ######
### rawdata qc summary
# 1. QC summary (working)
def rawdata_Q30(inputpath, samplename):
    # make the temp dir
    temp_dir = os.path.join(inputpath, 'tempfile')
    if not os.path.exists(temp_dir):
        os.mkdir(temp_dir)
    tempqc_dir = os.path.join(temp_dir, 'QC')
    if not os.path.exists(tempqc_dir):
        os.mkdir(tempqc_dir)
    qcdir = os.path.join(inputpath, '8Fastqc')
    samplelistdir = os.path.join(qcdir, samplename + '.stat')
    try:
        qcdata = pd.read_table(samplelistdir, nrows=11, sep=' ', names=['reads', 'infor1', 'infor2', 'infor3'])
        qcsummary_re = pd.DataFrame()
        qcsummary_re.loc[0, 'sampleid'] = samplename
        qcsummary_re.loc[0, 'ReadNum'] = qcdata.loc[1, "infor1"].split('\t')[0]
        qcsummary_re.loc[0, 'BaseNum'] = qcdata.loc[1, "infor2"].split('\t')[0]
        qcsummary_re.loc[0, 'Q10'] = qcdata.loc[8, "infor3"]
        qcsummary_re.loc[0, 'Q20'] = qcdata.loc[9, "infor3"]
        qcsummary_re.loc[0, 'Q30'] = qcdata.loc[10, "infor3"]
        qcsummary_re.loc[0, 'GC'] = qcdata.loc[2, "infor1"].split('\t')[0]
        outputname = os.path.join(tempqc_dir, samplename + '_rawQ30.txt')
        qcsummary_re.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')
    except Exception:
        print(samplename + ' is null!')
# 2. Mapping-rate summary based on the flagstat output (working)
def mapping_rate(inputpath, samplename):
    # make the temp dir
    temp_dir = os.path.join(inputpath, 'tempfile')
    if not os.path.exists(temp_dir):
        os.mkdir(temp_dir)
    tempqc_dir = os.path.join(temp_dir, 'QC')
    if not os.path.exists(tempqc_dir):
        os.mkdir(tempqc_dir)
    ontargetdir = os.path.join(inputpath, '9Ontarget')
    samplelistdir = os.path.join(ontargetdir, samplename + '_clean.abra2.flagstat')
    try:
        # samtools flagstat lines look like "N + M <category> ...", so splitting on '+'
        # puts the read count into 'reads'; row 0 is the total and row 4 is "mapped"
        # (row indices can shift with newer samtools versions)
        samplemapping = pd.read_table(samplelistdir, sep='+', names=['reads', 'infor1', 'infor2'])
        ontarget_re = pd.DataFrame()
        ontarget_re.loc[0, 'sampleid'] = samplename
        ontarget_re.loc[0, 'Read Num'] = samplemapping.loc[0, 'reads']
        ontarget_re.loc[0, 'mapped'] = samplemapping.loc[4, 'reads']
        df = samplemapping["infor1"].str.split('(', expand=True)
        mappingrate = df.loc[4, 1].split(' ')[0]
        ontarget_re.loc[0, 'Mapping Ratio (%)'] = mappingrate
        outputname = os.path.join(tempqc_dir, samplename + '_mappingrate.txt')
        ontarget_re.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')
    except Exception:
        print(samplename + ' is null!')
# 3. On-target rate summary
# perl /cgdata/guocq/script/wf_idtwes/qualimap_extract.pl -d ${inputpath} bamqc.txt
# ontargetcmd='perl /cgdata/guocq/script/wf_idtwes/qualimap_extract.pl '+' -d '+inputpath+' '+summarydir+'/'+'table3_ontarget.txt'
# os.system(ontargetcmd)
# inputpath=/cgdata/liuxiangqiong/work62pancancer/pipelinetest-lane3
# perl qualimap_extract.pl -d ${inputpath} -t p table3_ontarget_rate_1.txt
def ontarget_insertsize(inputpath, samplename):
    laneid = inputpath.split('/')[-1].split('-')[-1]
    # make the temp dir
    temp_dir = os.path.join(inputpath, 'tempfile')
    if not os.path.exists(temp_dir):
        os.mkdir(temp_dir)
    tempqc_dir = os.path.join(temp_dir, 'QC')
    if not os.path.exists(tempqc_dir):
        os.mkdir(tempqc_dir)
    bamdir = os.path.join('/cgdata/pancancer/analyse/', laneid + '/bamfile')
    outputname = os.path.join(tempqc_dir, samplename + '_ontarget1.txt')
    ontargetcmd = 'perl /cgdata/liuxiangqiong/work62pancancer/pipeline/v0/prim_umi_v0/qualimap_extract.pl ' + ' -d ' + bamdir + ' ' + '-t p ' + outputname
    os.system(ontargetcmd)
    ontarget = pd.read_table(outputname, sep='\t', header=0)
    ontarget['sampleid'] = ontarget['Sample'].str.split("_", expand=True)[0]
    del ontarget['Sample']
    ontarget_sample = ontarget[ontarget['sampleid'] == samplename]
    outputname1 = os.path.join(tempqc_dir, samplename + '_ontarget.txt')
    ontarget_sample.to_csv(outputname1, index=False, header=True, encoding='gbk', sep='\t')
    os.remove(outputname)
### 4. Get the sequencing depth of each probe/target interval
def coverage_uniform_samtools(inputpath, samplename):
    # make the temp dir
    temp_dir = os.path.join(inputpath, 'tempfile')
    if not os.path.exists(temp_dir):
        os.mkdir(temp_dir)
    tempqc_dir = os.path.join(temp_dir, 'QC')
    if not os.path.exists(tempqc_dir):
        os.mkdir(tempqc_dir)
    # Read in the target intervals
    targetdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/NanOnco_Plus_Panel_v2.0_Covered_b37_cg.parY2X.sort.bed'
    targetdata = pd.read_table(targetdir, sep='\t', header=None, names=['chr', 'start', 'end', 'gene', 'infor1', 'strand'])
    # Read in the sequencing depth of each target interval
    coveragedir = os.path.join(inputpath, '10Coverage')
    covdir = os.path.join(coveragedir, samplename + '.cov.samtools_coverage.txt')
    if os.path.getsize(covdir) != 1:
        coverdata_tumor = pd.read_table(covdir, header=0, sep='\t', low_memory=False)
        coverdata_tumor['start'] = coverdata_tumor['start'].astype('int')
        coverdata_tumor['end'] = coverdata_tumor['end'].astype('int')
        tumor_covsum = pd.merge(targetdata, coverdata_tumor, on=['chr', 'start', 'end', 'gene', 'strand'], how='left')
        outputname1 = os.path.join(tempqc_dir, samplename + '_target_coverage.txt')
        tumor_covsum.to_csv(outputname1, index=False, header=True, encoding='gbk', sep='\t')
    # Calculate the mean effective sequencing depth of the sample
    # Read in the per-base sequencing depth for depth and uniformity
    coveragedir = os.path.join(inputpath, '10Coverage')
    unfilterreadsdir = os.path.join(coveragedir, samplename + '.cov.samtools.txt')
    if (os.path.exists(unfilterreadsdir)) and (os.path.getsize(unfilterreadsdir) != 0):
        inputdata1 = pd.read_table(unfilterreadsdir, header=None, sep='\t', low_memory=False)
        inputdata1.columns = ['chrom', 'start', 'readcounts']
        ## Mean sequencing depth of the sample
        # T_mean = ontarget[ontarget['sampleid'] == sampleid].reset_index(drop=True).loc[0, 'T_Mean']
        T_mean = int(inputdata1['readcounts'].sum() / len(inputdata1))
        # Coverage uniformity (fraction of target bases with depth above 20% of the mean depth)
        Coverge_uniform = len(inputdata1[inputdata1['readcounts'] > 0.2 * T_mean]) / len(inputdata1)
        averagedata = pd.DataFrame()
        averagedata.loc[0, 'sampleid'] = samplename
        averagedata.loc[0, 'Unique_average_depth'] = round(T_mean, 4)
        averagedata.loc[0, 'coverge_uniform'] = round(Coverge_uniform, 4)
        outputname2 = os.path.join(tempqc_dir, samplename + '_avarageDepth_coverage_uniform.txt')
        averagedata.to_csv(outputname2, index=False, header=True, encoding='gbk', sep='\t')
#### Output the raw sequencing depth
# This panel is 2.6 Mb; the coding region is 1.39 Mb
# Overall QC pass criteria (a worked depth example follows below):
# 1) Mean raw sequencing depth: blood >= 5000X; tissue >= 1000X; white-blood-cell control >= 200X
# 2) Effective sequencing depth: blood >= 1000X; tissue >= 500X
# 3) Insert size (median): blood <= 200; tissue >= 100
# 4) Coverage uniformity >= 80%
# 5) Q30 >= 80%
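# Worked depth example (sketch, hypothetical read count): 20,000,000 read pairs give
# 2 * 150 * 20,000,000 = 6.0e9 raw bases; over the 2.6 Mb panel that is
# 6.0e9 / 2,600,000 ≈ 2308X raw depth, above the 1000X tissue threshold.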
#### Supplementary calculation of coverage uniformity:
# fraction of target bases with depth above 20% of the mean depth
#### Modified to compute the mean sequencing depth with samtools
def QCreprot2_samtools(inputpath, samplename):
    laneid = inputpath.split('/')[-1].split('-')[-1]
    temp_dir = os.path.join(inputpath, 'tempfile')
    tempqc_dir = os.path.join(temp_dir, 'QC')
    # inputpath = '/cgdata/liuxiangqiong/work62pancancer/pipelinetest-lane3'
    # This panel is 2.6 Mb
    panelsize = 2600000
    # Read in the basic QC metrics
    rawdatadir = os.path.join(tempqc_dir, samplename + '_rawQ30.txt')
    QCdata = pd.read_table(rawdatadir, sep='\t', header=0)
    ### Read in the insert size
    insertsizedir = os.path.join(tempqc_dir, samplename + '_ontarget.txt')
    ontarget = pd.read_table(insertsizedir, sep='\t', header=0)
    # Read in the mean effective sequencing depth and coverage uniformity
    coveragedir = os.path.join(tempqc_dir, samplename + '_avarageDepth_coverage_uniform.txt')
    coveragedata = pd.read_table(coveragedir, header=0, sep='\t')
    ## Mean effective sequencing depth of the sample
    T_mean = coveragedata.loc[0, 'Unique_average_depth']
    # Coverage uniformity
    Coverge_uniform = coveragedata.loc[0, 'coverge_uniform']
    # Insert size (median)
    Insert_size_median = int(ontarget.loc[0, 'Insert_Size'].split(' / ')[1])
    ## Q30
    Q30 = QCdata.loc[0, 'Q30']
    ## Raw data yield
    rawdata_base = 2 * 150 * (QCdata.loc[0, 'ReadNum'])
    # Raw sequencing depth
    raw_depth = round(rawdata_base / panelsize, 4)
    # Overall QC pass/fail call
    QC_report = pd.DataFrame()
    sampletype = samplename[-2:]
    if sampletype == 'TT':
        if (raw_depth >= 1000) & (Insert_size_median >= 100) & (Coverge_uniform >= 0.8) & (
                (float(Q30.split('%')[0]) / 100) >= 0.8) & (T_mean >= 500):
            label = 'Success'
        else:
            label = 'Fail'
    elif sampletype == 'CT':
        if (raw_depth >= 5000) & (Insert_size_median <= 200) & (Coverge_uniform >= 0.8) & (
                (float(Q30.split('%')[0]) / 100) >= 0.8) & (T_mean >= 1000):
            label = 'Success'
        else:
            label = 'Fail'
    elif sampletype == 'CN':
        if raw_depth >= 200:
            label = 'Success'
        else:
            label = 'Fail'
    QC_report.loc[0, 'sampleid'] = samplename
    QC_report.loc[0, 'Total_base'] = rawdata_base
    QC_report.loc[0, 'Total_average_depth'] = round(raw_depth)
    QC_report.loc[0, 'Unique_average_depth'] = T_mean
    QC_report.loc[0, 'insert_size'] = Insert_size_median
    QC_report.loc[0, 'coverge_uniform'] = str(round(Coverge_uniform * 100, 2)) + '%'
    QC_report.loc[0, 'Q30'] = Q30
    QC_report.loc[0, 'QC_overall'] = label
    print(QC_report)
    outputname = os.path.join(tempqc_dir, samplename + '_qc_report.txt')
    QC_report.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')
    # Output the QC report for disease samples; the sample name needs to be changed before writing it out
    if (sampletype == 'TT') or (sampletype == 'CT'):
        samplenewid = samplename[:-2]
        # make the result dir
        result_dir = os.path.join(inputpath, 'resultfile')
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)
        sample_dir = os.path.join(result_dir, samplenewid)
        if not os.path.exists(sample_dir):
            os.mkdir(sample_dir)
        QC_report.loc[0, 'sampleid'] = samplenewid
        QC_report1 = QC_report[
            ['sampleid', 'Total_average_depth', 'insert_size', 'coverge_uniform', 'Q30', 'QC_overall']]
        outputfile1 = os.path.join(sample_dir, laneid + '-' + samplenewid + '.qc1.xlsx')
        writer = pd.ExcelWriter(outputfile1)
        QC_report1.to_excel(writer, sheet_name='QC', index=False)
        writer.close()  # close() writes the file; ExcelWriter.save() was removed in newer pandas versions
        if sampletype == 'TT':
            sampleQCdata = excel_font_color(outputfile1, "B", 2, 1000, "red")
            sampleQCdata = excel_font_color(sampleQCdata, "C", 2, 100, "red")
            sampleQCdata = excel_font_color(sampleQCdata, "D", 2, 0.8, "red", percentage=True)
            sampleQCdata = excel_font_color(sampleQCdata, "E", 2, 0.8, "red", percentage=True)
            sampleQCdata = excel_font_color(sampleQCdata, "F", 3, "Fail", "red")
        elif sampletype == 'CT':
            sampleQCdata = excel_font_color(outputfile1, "B", 2, 5000, "red")
            sampleQCdata = excel_font_color(sampleQCdata, "C", 1, 200, "red")
            sampleQCdata = excel_font_color(sampleQCdata, "D", 2, 0.8, "red", percentage=True)
            sampleQCdata = excel_font_color(sampleQCdata, "E", 2, 0.8, "red", percentage=True)
            sampleQCdata = excel_font_color(sampleQCdata, "F", 3, "Fail", "red")
        elif sampletype == 'CN':
            sampleQCdata = excel_font_color(outputfile1, "B", 2, 200, "red")
            sampleQCdata = excel_font_color(sampleQCdata, "C", 1, 180, "red")
            sampleQCdata = excel_font_color(sampleQCdata, "D", 2, 0.8, "red", percentage=True)
            sampleQCdata = excel_font_color(sampleQCdata, "E", 2, 0.8, "red", percentage=True)
            sampleQCdata = excel_font_color(sampleQCdata, "F", 3, "Fail", "red")
        outputfile2 = os.path.join(sample_dir, laneid + '-' + samplenewid + '.qc.xlsx')
        sampleQCdata.save(outputfile2)
        os.remove(outputfile1)
# Main driver
def qcrun(inputpath, samplename):
    print('step1.calculate the Q30')
    rawdata_Q30(inputpath, samplename)
    print('step2.mapping rate')
    mapping_rate(inputpath, samplename)
    print('step3.insertsize')
    ontarget_insertsize(inputpath, samplename)
    print('step4.coverage uniform')
    coverage_uniform_samtools(inputpath, samplename)
    print('step5.QCreport')
    QCreprot2_samtools(inputpath, samplename)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='for the QC')
    parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
    parser.add_argument('-s', '--samplename', type=str, help='all of the sample')
    args = parser.parse_args()
    Inputpath = args.inputpath
    Samplename = args.samplename
    qcrun(Inputpath, Samplename)
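# Example invocation (sketch; the lane path matches the commented example earlier in this file,
# and the sample name is hypothetical but is expected to end in TT/CT/CN as QCreprot2_samtools assumes):
#   python datafile_QC_v0_20230406_finish.py \
#       -i /cgdata/liuxiangqiong/work62pancancer/pipelinetest-lane3 \
#       -s SAMPLE01TT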