import sys,collections,math,os,os.path,re from collections import Counter import pandas as pd from pandas.core.frame import DataFrame import argparse import xlrd import subprocess from time import time from tqdm import tqdm import threading from multiprocessing.pool import ThreadPool #线程 from openpyxl import * from openpyxl.styles import PatternFill, Border, Side, Alignment, Protection, Font ####excel颜色标签 # file:文件路径或workbook对象,第一次运行函数后会返回一个workbook对象,后续可直接调用这个返回的对象进行其他操作 # column_1:要操作的excel列,A-Z,比如第1列在excel中为A,第5列是E # type_1:对比类型,1是>,2是<,3是== # conditions_1:筛选条件,如type_1=1,conditions_1=10,则对大于等于10的值进行操作 # color_1:设置符合条件的值得颜色,可填red,blue,green,black四种 # reverse=False:设置不符合条件的值得颜色,默认是黑色,可填red,blue,green,black四种 # head=True:表中是否包含列名,默认为True # percentage=False:是否是百分比数值,默认为False,当数值是百分比时设置为True def excel_font_color(file, column_1, type_1, conditions_1,color_1, reverse=False, head=True, percentage=False): if type(file) == workbook.workbook.Workbook: wb2 = file ws2 = wb2.active else: wb2 = load_workbook(file) ws2 = wb2.active color = {"red": "ff000c", "green": "04ff00", "blue": "0006ff", "black":"000000"} bold_itatic_24_font_2 = Font(color=color[color_1]) bold_itatic_24_font_3 = False if reverse: bold_itatic_24_font_3 = Font(color=color[reverse]) num = 0 if head: num = 1 for i in ws2[column_1]: if num == 1: num = 0 continue if num == 0: if "%" in str(i.value): print("{}列含有%,注意设置percentage=True,如不需设置则忽略此警告".format(column_1)) num = 2 # 带有百分比的值读取时值类型可能存在不同,需要分开设置 if percentage: if i.number_format == "General": vAlue = float(i.value[:-1])/100 else: vAlue = i.value else: vAlue = i.value # 数据加颜色 if type_1 == 1: if vAlue > conditions_1: i.font = bold_itatic_24_font_2 else: if bold_itatic_24_font_3: i.font = bold_itatic_24_font_3 if type_1 == 2: if vAlue < conditions_1: i.font = bold_itatic_24_font_2 else: if bold_itatic_24_font_3: i.font = bold_itatic_24_font_3 if type_1 == 3: if vAlue == conditions_1: i.font = bold_itatic_24_font_2 else: if bold_itatic_24_font_3: i.font = bold_itatic_24_font_3 return wb2 ############data summary###### ###rawdata qc summary #1.qcsummary(成功) def rawdata_Q30(inputpath,samplename): # make the temp dir temp_dir = os.path.join(inputpath, 'tempfile') if not os.path.exists(temp_dir): os.mkdir(temp_dir) tempqc_dir = os.path.join(temp_dir, 'QC') if not os.path.exists(tempqc_dir): os.mkdir(tempqc_dir) qcdir = os.path.join(inputpath, '8Fastqc') samplelistdir = os.path.join(qcdir,samplename+'.stat') try: qcdata = pd.read_table(samplelistdir, nrows=11, sep=' ', names=['reads', 'infor1', 'infor2', 'infor3']) qcsummary_re = pd.DataFrame() qcsummary_re.loc[0, 'sampleid'] = samplename qcsummary_re.loc[0, 'ReadNum'] = qcdata.loc[1, "infor1"].split('\t')[0] qcsummary_re.loc[0, 'BaseNum'] = qcdata.loc[1, "infor2"].split('\t')[0] qcsummary_re.loc[0, 'Q10'] = qcdata.loc[8, "infor3"] qcsummary_re.loc[0, 'Q20'] = qcdata.loc[9, "infor3"] qcsummary_re.loc[0, 'Q30'] = qcdata.loc[10, "infor3"] qcsummary_re.loc[0, 'GC'] = qcdata.loc[2, "infor1"].split('\t')[0] outputname = os.path.join(tempqc_dir,samplename+'_rawQ30.txt') qcsummary_re.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t') except: print(sampleid + ' is null!') #2.获得mapping rate summary针对flagstat的结果(成功) def mapping_rate(inputpath,samplename): # make the temp dir temp_dir = os.path.join(inputpath, 'tempfile') if not os.path.exists(temp_dir): os.mkdir(temp_dir) tempqc_dir = os.path.join(temp_dir, 'QC') if not os.path.exists(tempqc_dir): os.mkdir(tempqc_dir) ontargetdir = os.path.join(inputpath, '9Ontarget') samplelistdir=os.path.join(ontargetdir,samplename+'_clean.abra2.flagstat') try: samplemapping = pd.read_table(samplelistdir, sep='+', names=['reads', 'infor1', 'infor2']) ontarget_re = pd.DataFrame() ontarget_re.loc[0, 'sampleid'] = samplename ontarget_re.loc[0, 'Read Num'] = samplemapping.loc[0, 'reads'] ontarget_re.loc[0, 'mapped'] = samplemapping.loc[4, 'reads'] df = samplemapping["infor1"].str.split('(', expand=True) mappingrate = df.loc[4, 1].split(' ')[0] ontarget_re.loc[0, 'Mapping Ratio (%)'] = mappingrate outputname = os.path.join(tempqc_dir, samplename + '_mappingrate.txt') ontarget_re.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t') except: print(samplename + ' is null!') #3.对ontarget rate summary, #perl /cgdata/guocq/script/wf_idtwes/qualimap_extract.pl -d ${inputpath} bamqc.txt #ontargetcmd='perl /cgdata/guocq/script/wf_idtwes/qualimap_extract.pl '+' -d '+inputpath+' '+summarydir+'/'+'table3_ontarget.txt' #os.system(ontargetcmd) #inputpath=/cgdata/liuxiangqiong/work62pancancer/pipelinetest-lane3 #perl qualimap_extract.pl -d ${inputpath} -t p table3_ontarget_rate_1.txt def ontarget_insertsize(inputpath,samplename): laneid = inputpath.split('/')[-1].split('-')[-1] # make the temp dir temp_dir = os.path.join(inputpath, 'tempfile') if not os.path.exists(temp_dir): os.mkdir(temp_dir) tempqc_dir = os.path.join(temp_dir, 'QC') if not os.path.exists(tempqc_dir): os.mkdir(tempqc_dir) bamdir=os.path.join('/cgdata/pancancer/analyse/',laneid+'/bamfile') outputname=os.path.join(tempqc_dir,samplename+'_ontarget1.txt') ontargetcmd='perl /cgdata/liuxiangqiong/work62pancancer/pipeline/v0/prim_umi_v0/qualimap_extract.pl '+' -d '+bamdir+' '+'-t p '+outputname os.system(ontargetcmd) ontarget = pd.read_table(outputname, sep='\t', header=0) ontarget['sampleid'] = ontarget['Sample'].str.split("_", expand=True)[0] del ontarget['Sample'] ontarget_sample=ontarget[ontarget['sampleid']==samplename] outputname1 = os.path.join(tempqc_dir, samplename + '_ontarget.txt') ontarget_sample.to_csv(outputname1, index=False, header=True, encoding='gbk', sep='\t') os.remove(outputname) ###4.获得每个探针的测序深度 def coverage_uniform_samtools(inputpath,samplename): # make the temp dir temp_dir = os.path.join(inputpath, 'tempfile') if not os.path.exists(temp_dir): os.mkdir(temp_dir) tempqc_dir = os.path.join(temp_dir, 'QC') if not os.path.exists(tempqc_dir): os.mkdir(tempqc_dir) #读入靶区间 targetdir = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/NanOnco_Plus_Panel_v2.0_Covered_b37_cg.parY2X.sort.bed' targetdata = pd.read_table(targetdir, sep='\t', header=None, names=['chr', 'start', 'end', 'gene', 'infor1', 'strand']) #读入每个靶标区间的测序深度 coveragedir = os.path.join(inputpath, '10Coverage') covdir = os.path.join(coveragedir, samplename + '.cov.samtools_coverage.txt') if os.path.getsize(covdir) != 1: coverdata_tumor = pd.read_table(covdir, header=0, sep='\t', low_memory=False) coverdata_tumor['start'] = coverdata_tumor['start'].astype('int') coverdata_tumor['end'] = coverdata_tumor['end'].astype('int') tumor_covsum = pd.merge(targetdata, coverdata_tumor, on=['chr', 'start', 'end', 'gene', 'strand'],how='left') outputname1 = os.path.join(tempqc_dir,samplename+'_target_coverage.txt') tumor_covsum.to_csv(outputname1, index=False, header=True, encoding='gbk', sep='\t') # 计算每个样本的平均有效测序深度 # 读入每个碱基位点测序深度和均一性 coveragedir = os.path.join(inputpath, '10Coverage') unfilterreadsdir = os.path.join(coveragedir, samplename + '.cov.samtools.txt') if (os.path.exists(unfilterreadsdir)) and (os.path.getsize(unfilterreadsdir) != 0): inputdata1 = pd.read_table(unfilterreadsdir, header=None, sep='\t', low_memory=False) inputdata1.columns = ['chrom', 'start', 'readcounts'] ##获得样本的平均测序深度 # T_mean = ontarget[ontarget['sampleid'] == sampleid].reset_index(drop=True).loc[0, 'T_Mean'] T_mean = int(inputdata1['readcounts'].sum() / len(inputdata1)) # 计算覆盖均一性(大于平均深度20%的碱基位点占目标区域碱基位点总数的比例) Coverge_uniform = len(inputdata1[inputdata1['readcounts'] > 0.2 * T_mean]) / len(inputdata1) averagedata = pd.DataFrame() averagedata.loc[0, 'sampleid'] = samplename averagedata.loc[0, 'Unique_average_depth'] = round(T_mean,4) averagedata.loc[0, 'coverge_uniform']=round(Coverge_uniform,4) outputname2 = os.path.join(tempqc_dir, samplename + '_avarageDepth_coverage_uniform.txt') averagedata.to_csv(outputname2, index=False, header=True, encoding='gbk', sep='\t') ####输出原始的测序深度 #本panel大小2.6M,编码区大小1.39M #总体质控合格的判断 #1)总的平均原始测序深度血液>=5000X;组织>=1000X;白细胞对照>=200X #2)有效测序深度,血液>=1000X;组织>=500X #2)血液insert size (median)<=200;组织insert size(median)>=100 #3)覆盖均一性>=80% #4)Q30>=80% ####补充计算覆盖均一性 #大于平均深度20%的碱基位点占目标区域碱基位点总数的比例 ####修改采用samtools计算平均测序深度时候 def QCreprot2_samtools(inputpath,samplename): laneid = inputpath.split('/')[-1].split('-')[-1] temp_dir = os.path.join(inputpath, 'tempfile') tempqc_dir = os.path.join(temp_dir, 'QC') #inputpath = '/cgdata/liuxiangqiong/work62pancancer/pipelinetest-lane3' # 本panel大小2.6M panelsize = 2600000 # 读入基础质控 rawdatadir=os.path.join(tempqc_dir,samplename+'_rawQ30.txt') QCdata = pd.read_table(rawdatadir, sep='\t', header=0) ###读入insert size insertsizedir = os.path.join(tempqc_dir, samplename + '_ontarget.txt') ontarget = pd.read_table(insertsizedir, sep='\t', header=0) # 读入平均有效测序深度和覆盖均一性 coveragedir = os.path.join(tempqc_dir, samplename + '_avarageDepth_coverage_uniform.txt') coveragedata = pd.read_table(coveragedir, header=0, sep='\t') ##获得样本的有效平均测序深度 T_mean = coveragedata.loc[0, 'Unique_average_depth'] # 获得均一性 Coverge_uniform = coveragedata.loc[0, 'coverge_uniform'] # 获得insert size(median) Insert_size_median = int(ontarget.loc[0, 'Insert_Size'].split(' / ')[1]) ##获得Q30 Q30 = QCdata.loc[0, 'Q30'] ##获得原始数据量 rawdata_base = 2 * 150 * (QCdata.loc[0, 'ReadNum']) # 获得原始测序深度 raw_depth = round(rawdata_base / panelsize, 4) # 总体质控合格的判断 QC_report=pd.DataFrame() sampletype = samplename[-2:] if sampletype == 'TT': if (raw_depth >= 1000) & (Insert_size_median >= 100) & (Coverge_uniform >= 0.8) & ( (float(Q30.split('%')[0]) / 100) >= 0.8) & (T_mean>=500): label = 'Success' else: label = 'Fail' elif sampletype == 'CT': if (raw_depth >= 5000) & (Insert_size_median <= 200) & (Coverge_uniform >= 0.8) & ( (float(Q30.split('%')[0]) / 100) >= 0.8)& (T_mean>=1000): label = 'Success' else: label = 'Fail' elif sampletype == 'CN': if raw_depth >= 200: label = 'Success' else: label = 'Fail' QC_report.loc[0, 'sampleid'] = samplename QC_report.loc[0, 'Total_base'] = rawdata_base QC_report.loc[0, 'Total_average_depth'] = round(raw_depth) QC_report.loc[0, 'Unique_average_depth'] = T_mean QC_report.loc[0, 'insert_size'] = Insert_size_median QC_report.loc[0, 'coverge_uniform'] = str(round(Coverge_uniform * 100, 2)) + '%' QC_report.loc[0, 'Q30'] = Q30 QC_report.loc[0, 'QC_overall'] = label print(QC_report) outputname = os.path.join(tempqc_dir, samplename + '_qc_report.txt') QC_report.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t') # 输出疾病样本的qc报告。那么需要改变一下样本名,然后再输出 if (sampletype == 'TT') or (sampletype == 'CT'): samplenewid = samplename[:-2] # make the result dir result_dir = os.path.join(inputpath, 'resultfile') if not os.path.exists(result_dir): os.mkdir(result_dir) sample_dir = os.path.join(result_dir, samplenewid) if not os.path.exists(sample_dir): os.mkdir(sample_dir) QC_report.loc[0, 'sampleid'] = samplenewid QC_report1 = QC_report[ ['sampleid', 'Total_average_depth', 'insert_size', 'coverge_uniform', 'Q30', 'QC_overall']] outputfile1 = os.path.join(sample_dir, laneid + '-' + samplenewid + '.qc1.xlsx') writer = pd.ExcelWriter(outputfile1) QC_report1.to_excel(writer, sheet_name='QC', index=False) writer.save() writer.close() if sampletype == 'TT': sampleQCdata = excel_font_color(outputfile1, "B", 2, 1000, "red") sampleQCdata = excel_font_color(sampleQCdata, "C", 2, 100, "red") sampleQCdata = excel_font_color(sampleQCdata, "D", 2, 0.8, "red", percentage=True) sampleQCdata = excel_font_color(sampleQCdata, "E", 2, 0.8, "red", percentage=True) sampleQCdata = excel_font_color(sampleQCdata, "F", 3, "Fail", "red") elif sampletype == 'CT': sampleQCdata = excel_font_color(outputfile1, "B", 2, 5000, "red") sampleQCdata = excel_font_color(sampleQCdata, "C", 1, 200, "red") sampleQCdata = excel_font_color(sampleQCdata, "D", 2, 0.8, "red", percentage=True) sampleQCdata = excel_font_color(sampleQCdata, "E", 2, 0.8, "red", percentage=True) sampleQCdata = excel_font_color(sampleQCdata, "F", 3, "Fail", "red") elif sampletype == 'CN': sampleQCdata = excel_font_color(outputfile1, "B", 2, 200, "red") sampleQCdata = excel_font_color(sampleQCdata, "C", 1, 180, "red") sampleQCdata = excel_font_color(sampleQCdata, "D", 2, 0.8, "red", percentage=True) sampleQCdata = excel_font_color(sampleQCdata, "E", 2, 0.8, "red", percentage=True) sampleQCdata = excel_font_color(sampleQCdata, "F", 3, "Fail", "red") outputfile2 = os.path.join(sample_dir, laneid + '-' + samplenewid + '.qc.xlsx') sampleQCdata.save(outputfile2) os.remove(outputfile1) #主要的执行命令 def qcrun(inputpath,samplename): print('step1.calculate the Q30') rawdata_Q30(inputpath,samplename) print('step2.mapping rate') mapping_rate(inputpath,samplename) print('step3.insertsize') ontarget_insertsize(inputpath,samplename) print('step4.coverage uniform') coverage_uniform_samtools(inputpath,samplename) print('step5.QCreport') QCreprot2_samtools(inputpath,samplename) if __name__=='__main__': parser = argparse.ArgumentParser(description='for the QC') parser.add_argument('-i', '--inputpath', type=str, help='the path of lane') parser.add_argument('-s', '--samplename', type=str, help='all of the sample') args = parser.parse_args() Inputpath = args.inputpath Samplename = args.samplename qcrun(Inputpath,Samplename)