123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362 |
- import sys,collections,math,os,os.path,re
- from collections import Counter
- import pandas as pd
- from pandas.core.frame import DataFrame
- import argparse
- import xlrd
- import subprocess
- from time import time
- from tqdm import tqdm
- import threading
- from multiprocessing.pool import ThreadPool #线程
- from openpyxl import *
- from openpyxl.styles import PatternFill, Border, Side, Alignment, Protection, Font
- ####excel颜色标签
- # file:文件路径或workbook对象,第一次运行函数后会返回一个workbook对象,后续可直接调用这个返回的对象进行其他操作
- # column_1:要操作的excel列,A-Z,比如第1列在excel中为A,第5列是E
- # type_1:对比类型,1是>,2是<,3是==
- # conditions_1:筛选条件,如type_1=1,conditions_1=10,则对大于等于10的值进行操作
- # color_1:设置符合条件的值得颜色,可填red,blue,green,black四种
- # reverse=False:设置不符合条件的值得颜色,默认是黑色,可填red,blue,green,black四种
- # head=True:表中是否包含列名,默认为True
- # percentage=False:是否是百分比数值,默认为False,当数值是百分比时设置为True
- def excel_font_color(file, column_1, type_1, conditions_1,color_1, reverse=False, head=True, percentage=False):
- if type(file) == workbook.workbook.Workbook:
- wb2 = file
- ws2 = wb2.active
- else:
- wb2 = load_workbook(file)
- ws2 = wb2.active
- color = {"red": "ff000c", "green": "04ff00", "blue": "0006ff", "black":"000000"}
- bold_itatic_24_font_2 = Font(color=color[color_1])
- bold_itatic_24_font_3 = False
- if reverse:
- bold_itatic_24_font_3 = Font(color=color[reverse])
- num = 0
- if head:
- num = 1
- for i in ws2[column_1]:
- if num == 1:
- num = 0
- continue
- if num == 0:
- if "%" in str(i.value):
- print("{}列含有%,注意设置percentage=True,如不需设置则忽略此警告".format(column_1))
- num = 2
- # 带有百分比的值读取时值类型可能存在不同,需要分开设置
- if percentage:
- if i.number_format == "General":
- vAlue = float(i.value[:-1])/100
- else:
- vAlue = i.value
- else:
- vAlue = i.value
- # 数据加颜色
- if type_1 == 1:
- if vAlue > conditions_1:
- i.font = bold_itatic_24_font_2
- else:
- if bold_itatic_24_font_3:
- i.font = bold_itatic_24_font_3
- if type_1 == 2:
- if vAlue < conditions_1:
- i.font = bold_itatic_24_font_2
- else:
- if bold_itatic_24_font_3:
- i.font = bold_itatic_24_font_3
- if type_1 == 3:
- if vAlue == conditions_1:
- i.font = bold_itatic_24_font_2
- else:
- if bold_itatic_24_font_3:
- i.font = bold_itatic_24_font_3
- return wb2
- ############data summary######
- ###rawdata qc summary
- #1.qcsummary(成功)
- def rawdata_Q30(inputpath,samplename):
- # make the temp dir
- temp_dir = os.path.join(inputpath, 'tempfile')
- if not os.path.exists(temp_dir):
- os.mkdir(temp_dir)
- tempqc_dir = os.path.join(temp_dir, 'QC')
- if not os.path.exists(tempqc_dir):
- os.mkdir(tempqc_dir)
- qcdir = os.path.join(inputpath, '8Fastqc')
- samplelistdir = os.path.join(qcdir,samplename+'.stat')
- try:
- qcdata = pd.read_table(samplelistdir, nrows=11, sep=' ', names=['reads', 'infor1', 'infor2', 'infor3'])
- qcsummary_re = pd.DataFrame()
- qcsummary_re.loc[0, 'sampleid'] = samplename
- qcsummary_re.loc[0, 'ReadNum'] = qcdata.loc[1, "infor1"].split('\t')[0]
- qcsummary_re.loc[0, 'BaseNum'] = qcdata.loc[1, "infor2"].split('\t')[0]
- qcsummary_re.loc[0, 'Q10'] = qcdata.loc[8, "infor3"]
- qcsummary_re.loc[0, 'Q20'] = qcdata.loc[9, "infor3"]
- qcsummary_re.loc[0, 'Q30'] = qcdata.loc[10, "infor3"]
- qcsummary_re.loc[0, 'GC'] = qcdata.loc[2, "infor1"].split('\t')[0]
- outputname = os.path.join(tempqc_dir,samplename+'_rawQ30.txt')
- qcsummary_re.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')
- except:
- print(sampleid + ' is null!')
- #2.获得mapping rate summary针对flagstat的结果(成功)
- def mapping_rate(inputpath,samplename):
- # make the temp dir
- temp_dir = os.path.join(inputpath, 'tempfile')
- if not os.path.exists(temp_dir):
- os.mkdir(temp_dir)
- tempqc_dir = os.path.join(temp_dir, 'QC')
- if not os.path.exists(tempqc_dir):
- os.mkdir(tempqc_dir)
- ontargetdir = os.path.join(inputpath, '9Ontarget')
- samplelistdir=os.path.join(ontargetdir,samplename+'_clean.abra2.flagstat')
- try:
- samplemapping = pd.read_table(samplelistdir, sep='+', names=['reads', 'infor1', 'infor2'])
- ontarget_re = pd.DataFrame()
- ontarget_re.loc[0, 'sampleid'] = samplename
- ontarget_re.loc[0, 'Read Num'] = samplemapping.loc[0, 'reads']
- ontarget_re.loc[0, 'mapped'] = samplemapping.loc[4, 'reads']
- df = samplemapping["infor1"].str.split('(', expand=True)
- mappingrate = df.loc[4, 1].split(' ')[0]
- ontarget_re.loc[0, 'Mapping Ratio (%)'] = mappingrate
- outputname = os.path.join(tempqc_dir, samplename + '_mappingrate.txt')
- ontarget_re.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')
- except:
- print(samplename + ' is null!')
- #3.对ontarget rate summary,
- #perl /cgdata/guocq/script/wf_idtwes/qualimap_extract.pl -d ${inputpath} bamqc.txt
- #ontargetcmd='perl /cgdata/guocq/script/wf_idtwes/qualimap_extract.pl '+' -d '+inputpath+' '+summarydir+'/'+'table3_ontarget.txt'
- #os.system(ontargetcmd)
- #inputpath=/cgdata/liuxiangqiong/work62pancancer/pipelinetest-lane3
- #perl qualimap_extract.pl -d ${inputpath} -t p table3_ontarget_rate_1.txt
- def ontarget_insertsize(inputpath,samplename):
- laneid = inputpath.split('/')[-1].split('-')[-1]
- # make the temp dir
- temp_dir = os.path.join(inputpath, 'tempfile')
- if not os.path.exists(temp_dir):
- os.mkdir(temp_dir)
- tempqc_dir = os.path.join(temp_dir, 'QC')
- if not os.path.exists(tempqc_dir):
- os.mkdir(tempqc_dir)
- bamdir=os.path.join('/cgdata/pancancer/analyse/',laneid+'/bamfile')
- outputname=os.path.join(tempqc_dir,samplename+'_ontarget1.txt')
- ontargetcmd='perl /cgdata/liuxiangqiong/work62pancancer/pipeline/v0/prim_umi_v0/qualimap_extract.pl '+' -d '+bamdir+' '+'-t p '+outputname
- os.system(ontargetcmd)
- ontarget = pd.read_table(outputname, sep='\t', header=0)
- ontarget['sampleid'] = ontarget['Sample'].str.split("_", expand=True)[0]
- del ontarget['Sample']
- ontarget_sample=ontarget[ontarget['sampleid']==samplename]
- outputname1 = os.path.join(tempqc_dir, samplename + '_ontarget.txt')
- ontarget_sample.to_csv(outputname1, index=False, header=True, encoding='gbk', sep='\t')
- os.remove(outputname)
- ###4.获得每个探针的测序深度
- def coverage_uniform_samtools(inputpath,samplename):
- # make the temp dir
- temp_dir = os.path.join(inputpath, 'tempfile')
- if not os.path.exists(temp_dir):
- os.mkdir(temp_dir)
- tempqc_dir = os.path.join(temp_dir, 'QC')
- if not os.path.exists(tempqc_dir):
- os.mkdir(tempqc_dir)
- #读入靶区间
- targetdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/NanOnco_Plus_Panel_v2.0_Covered_b37_cg.parY2X.sort.bed'
- targetdata = pd.read_table(targetdir, sep='\t', header=None, names=['chr', 'start', 'end', 'gene', 'infor1', 'strand'])
- #读入每个靶标区间的测序深度
- coveragedir = os.path.join(inputpath, '10Coverage')
- covdir = os.path.join(coveragedir, samplename + '.cov.samtools_coverage.txt')
- if os.path.getsize(covdir) != 1:
- coverdata_tumor = pd.read_table(covdir, header=0, sep='\t', low_memory=False)
- coverdata_tumor['start'] = coverdata_tumor['start'].astype('int')
- coverdata_tumor['end'] = coverdata_tumor['end'].astype('int')
- tumor_covsum = pd.merge(targetdata, coverdata_tumor, on=['chr', 'start', 'end', 'gene', 'strand'],how='left')
- outputname1 = os.path.join(tempqc_dir,samplename+'_target_coverage.txt')
- tumor_covsum.to_csv(outputname1, index=False, header=True, encoding='gbk', sep='\t')
- # 计算每个样本的平均有效测序深度
- # 读入每个碱基位点测序深度和均一性
- coveragedir = os.path.join(inputpath, '10Coverage')
- unfilterreadsdir = os.path.join(coveragedir, samplename + '.cov.samtools.txt')
- if (os.path.exists(unfilterreadsdir)) and (os.path.getsize(unfilterreadsdir) != 0):
- inputdata1 = pd.read_table(unfilterreadsdir, header=None, sep='\t', low_memory=False)
- inputdata1.columns = ['chrom', 'start', 'readcounts']
- ##获得样本的平均测序深度
- # T_mean = ontarget[ontarget['sampleid'] == sampleid].reset_index(drop=True).loc[0, 'T_Mean']
- T_mean = int(inputdata1['readcounts'].sum() / len(inputdata1))
- # 计算覆盖均一性(大于平均深度20%的碱基位点占目标区域碱基位点总数的比例)
- Coverge_uniform = len(inputdata1[inputdata1['readcounts'] > 0.2 * T_mean]) / len(inputdata1)
- averagedata = pd.DataFrame()
- averagedata.loc[0, 'sampleid'] = samplename
- averagedata.loc[0, 'Unique_average_depth'] = round(T_mean,4)
- averagedata.loc[0, 'coverge_uniform']=round(Coverge_uniform,4)
- outputname2 = os.path.join(tempqc_dir, samplename + '_avarageDepth_coverage_uniform.txt')
- averagedata.to_csv(outputname2, index=False, header=True, encoding='gbk', sep='\t')
- ####输出原始的测序深度
- #本panel大小2.6M
- #总体质控合格的判断
- #1)总的平均原始测序深度血液>=5000X;组织>=1000X;白细胞对照>=200X
- #2)血液insert size (median)<=180;组织insert size(median)>=100
- #3)覆盖均一性>=80%,修改,我们设定为80%
- #4)Q30>=80%
- ####补充计算覆盖均一性
- #大于平均深度20%的碱基位点占目标区域碱基位点总数的比例
- ####修改采用samtools计算平均测序深度时候
- def QCreprot2_samtools(inputpath,samplename):
- laneid = inputpath.split('/')[-1].split('-')[-1]
- temp_dir = os.path.join(inputpath, 'tempfile')
- tempqc_dir = os.path.join(temp_dir, 'QC')
- #inputpath = '/cgdata/liuxiangqiong/work62pancancer/pipelinetest-lane3'
- # 本panel大小2.6M
- panelsize = 2600000
- # 读入基础质控
- rawdatadir=os.path.join(tempqc_dir,samplename+'_rawQ30.txt')
- QCdata = pd.read_table(rawdatadir, sep='\t', header=0)
- ###读入insert size
- insertsizedir = os.path.join(tempqc_dir, samplename + '_ontarget.txt')
- ontarget = pd.read_table(insertsizedir, sep='\t', header=0)
- # 读入平均有效测序深度和覆盖均一性
- coveragedir = os.path.join(tempqc_dir, samplename + '_avarageDepth_coverage_uniform.txt')
- coveragedata = pd.read_table(coveragedir, header=0, sep='\t')
- ##获得样本的有效平均测序深度
- T_mean = coveragedata.loc[0, 'Unique_average_depth']
- # 获得均一性
- Coverge_uniform = coveragedata.loc[0, 'coverge_uniform']
- # 获得insert size(median)
- Insert_size_median = int(ontarget.loc[0, 'Insert_Size'].split(' / ')[1])
- ##获得Q30
- Q30 = QCdata.loc[0, 'Q30']
- ##获得原始数据量
- rawdata_base = 2 * 150 * (QCdata.loc[0, 'ReadNum'])
- # 获得原始测序深度
- raw_depth = round(rawdata_base / panelsize, 4)
- # 总体质控合格的判断
- QC_report=pd.DataFrame()
- sampletype = samplename[-2:]
- if sampletype == 'TT':
- if (raw_depth >= 1000) & (Insert_size_median >= 100) & (Coverge_uniform >= 0.8) & (
- (float(Q30.split('%')[0]) / 100) >= 0.8):
- label = 'Success'
- else:
- label = 'Fail'
- elif sampletype == 'CT':
- if (raw_depth >= 5000) & (Insert_size_median <= 180) & (Coverge_uniform >= 0.8) & (
- (float(Q30.split('%')[0]) / 100) >= 0.8):
- label = 'Success'
- else:
- label = 'Fail'
- elif sampletype == 'CN':
- if raw_depth >= 200:
- label = 'Success'
- else:
- label = 'Fail'
- QC_report.loc[0, 'sampleid'] = samplename
- QC_report.loc[0, 'Total_base'] = rawdata_base
- QC_report.loc[0, 'Total_average_depth'] = round(raw_depth)
- QC_report.loc[0, 'Unique_average_depth'] = T_mean
- QC_report.loc[0, 'insert_size'] = Insert_size_median
- QC_report.loc[0, 'coverge_uniform'] = str(round(Coverge_uniform * 100, 2)) + '%'
- QC_report.loc[0, 'Q30'] = Q30
- QC_report.loc[0, 'QC_overall'] = label
- print(QC_report)
- outputname = os.path.join(tempqc_dir, samplename + '_qc_report.txt')
- QC_report.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')
- # 输出疾病样本的qc报告。那么需要改变一下样本名,然后再输出
- if (sampletype == 'TT') or (sampletype == 'CT'):
- samplenewid = samplename[:-2]
- # make the result dir
- result_dir = os.path.join(inputpath, 'resultfile')
- if not os.path.exists(result_dir):
- os.mkdir(result_dir)
- sample_dir = os.path.join(result_dir, samplenewid)
- if not os.path.exists(sample_dir):
- os.mkdir(sample_dir)
- QC_report.loc[0, 'sampleid'] = samplenewid
- QC_report1 = QC_report[
- ['sampleid', 'Total_average_depth', 'insert_size', 'coverge_uniform', 'Q30', 'QC_overall']]
- outputfile1 = os.path.join(sample_dir, laneid + '-' + samplenewid + '.qc1.xlsx')
- writer = pd.ExcelWriter(outputfile1)
- QC_report1.to_excel(writer, sheet_name='QC', index=False)
- writer.save()
- writer.close()
- if sampletype == 'TT':
- sampleQCdata = excel_font_color(outputfile1, "B", 2, 1000, "red")
- sampleQCdata = excel_font_color(sampleQCdata, "C", 2, 100, "red")
- sampleQCdata = excel_font_color(sampleQCdata, "D", 2, 0.8, "red", percentage=True)
- sampleQCdata = excel_font_color(sampleQCdata, "E", 2, 0.8, "red", percentage=True)
- sampleQCdata = excel_font_color(sampleQCdata, "F", 3, "Fail", "red")
- elif sampletype == 'CT':
- sampleQCdata = excel_font_color(outputfile1, "B", 2, 5000, "red")
- sampleQCdata = excel_font_color(sampleQCdata, "C", 1, 180, "red")
- sampleQCdata = excel_font_color(sampleQCdata, "D", 2, 0.8, "red", percentage=True)
- sampleQCdata = excel_font_color(sampleQCdata, "E", 2, 0.8, "red", percentage=True)
- sampleQCdata = excel_font_color(sampleQCdata, "F", 3, "Fail", "red")
- elif sampletype == 'CN':
- sampleQCdata = excel_font_color(outputfile1, "B", 2, 200, "red")
- sampleQCdata = excel_font_color(sampleQCdata, "C", 1, 180, "red")
- sampleQCdata = excel_font_color(sampleQCdata, "D", 2, 0.8, "red", percentage=True)
- sampleQCdata = excel_font_color(sampleQCdata, "E", 2, 0.8, "red", percentage=True)
- sampleQCdata = excel_font_color(sampleQCdata, "F", 3, "Fail", "red")
- outputfile2 = os.path.join(sample_dir, laneid + '-' + samplenewid + '.qc.xlsx')
- sampleQCdata.save(outputfile2)
- os.remove(outputfile1)
- #主要的执行命令
- def qcrun(inputpath,samplename):
- print('step1.calculate the Q30')
- rawdata_Q30(inputpath,samplename)
- print('step2.mapping rate')
- mapping_rate(inputpath,samplename)
- print('step3.insertsize')
- ontarget_insertsize(inputpath,samplename)
- print('step4.coverage uniform')
- coverage_uniform_samtools(inputpath,samplename)
- print('step5.QCreport')
- QCreprot2_samtools(inputpath,samplename)
- if __name__=='__main__':
- parser = argparse.ArgumentParser(description='for the QC')
- parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
- parser.add_argument('-s', '--samplename', type=str, help='all of the sample')
- args = parser.parse_args()
- Inputpath = args.inputpath
- Samplename = args.samplename
- qcrun(Inputpath,Samplename)
|