123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869 |
- import pandas as pd
- import os,sys
- import argparse
def ontarget_sum(inputpath,
                 qc_sum_dir='/cgdata/bioproject/pancancer602gene/NGS_QC_602pancancer'):
    """Merge all per-sample ``*ontarget.txt`` tables of one lane into one file.

    Every file in ``<inputpath>/tempfile/QC`` whose name ends with
    ``ontarget.txt`` is read as a tab-separated table with a header row.
    The tables are stacked, reduced to a fixed list of QC columns, and
    written to ``<qc_sum_dir>/<laneid>/<laneid>_target_sum.txt``.

    Args:
        inputpath: Lane directory, e.g.
            ``/cgdata/bioproject/pancancer602gene/CGB0329``. Its last path
            component is used as the lane id; a trailing slash is tolerated.
        qc_sum_dir: Root directory that receives one sub-directory per lane.
            Defaults to the location that used to be hard-coded.

    Raises:
        OSError: If the QC directory does not exist or cannot be listed.
        KeyError: If the merged table lacks one of the expected columns.
    """
    # normpath strips a trailing slash, so '/x/CGB0329/' still gives
    # 'CGB0329' instead of the empty string a plain split('/') would return.
    laneid = os.path.basename(os.path.normpath(inputpath))
    outputdir = os.path.join(qc_sum_dir, laneid)
    # makedirs also creates missing parents and tolerates an existing directory.
    os.makedirs(outputdir, exist_ok=True)

    qc_dir = os.path.join(inputpath, 'tempfile', 'QC')

    # Summarise every ontarget.txt in the directory. Names are sorted so the
    # row order does not depend on the arbitrary order of os.listdir.
    ontarget_files = sorted(
        name for name in os.listdir(qc_dir) if name.endswith('ontarget.txt')
    )

    cols = ['sampleid', 'Total_Read(M)', 'Map(%)', 'T_size', 'On_Target(%)',
            'T_Dup(%)', 'T_Mean', 'Insert_Size', 'SD', 'Fold80', '>1X', '>20X',
            '>30X', '>50X', '>100X', '>300X', 'Adjust_30X']

    frames = [
        pd.read_table(os.path.join(qc_dir, name), sep='\t', header=0)
        for name in ontarget_files
    ]
    if frames:
        # DataFrame.append was removed in pandas 2.0; one concat call also
        # avoids re-copying the accumulated table on every iteration.
        targetsum = pd.concat(frames, ignore_index=True, sort=False)[cols]
    else:
        # No samples found: still emit a header-only table.
        targetsum = pd.DataFrame(columns=cols)

    # Write the result.
    targetoutput = os.path.join(outputdir, laneid + '_' + 'target_sum.txt')
    targetsum.to_csv(targetoutput, sep='\t', header=True, index=False)
def report_sum(inputpath,
               qc_sum_dir='/cgdata/bioproject/pancancer602gene/NGS_QC_602pancancer'):
    """Merge all per-sample ``*_qc_report.txt`` tables of one lane into one file.

    Every file in ``<inputpath>/tempfile/QC`` whose name ends with
    ``_qc_report.txt`` is read as a tab-separated table with a header row.
    The tables are stacked and written, with all their columns, to
    ``<qc_sum_dir>/<laneid>/<laneid>_report_sum.txt``.

    Args:
        inputpath: Lane directory, e.g.
            ``/cgdata/bioproject/pancancer602gene/CGB0329``. Its last path
            component is used as the lane id; a trailing slash is tolerated.
        qc_sum_dir: Root directory that receives one sub-directory per lane.
            Defaults to the location that used to be hard-coded.

    Raises:
        OSError: If the QC directory does not exist or cannot be listed.
    """
    # normpath strips a trailing slash, so '/x/CGB0329/' still gives
    # 'CGB0329' instead of the empty string a plain split('/') would return.
    laneid = os.path.basename(os.path.normpath(inputpath))
    outputdir = os.path.join(qc_sum_dir, laneid)
    # makedirs also creates missing parents and tolerates an existing directory.
    os.makedirs(outputdir, exist_ok=True)

    qc_dir = os.path.join(inputpath, 'tempfile', 'QC')

    # Summarise every QC report in the directory. Names are sorted so the
    # row order does not depend on the arbitrary order of os.listdir.
    report_files = sorted(
        name for name in os.listdir(qc_dir) if name.endswith('_qc_report.txt')
    )

    frames = [
        pd.read_table(os.path.join(qc_dir, name), sep='\t', header=0)
        for name in report_files
    ]
    if frames:
        # DataFrame.append was removed in pandas 2.0; one concat call also
        # avoids re-copying the accumulated table on every iteration.
        reportsum = pd.concat(frames, ignore_index=True, sort=False)
    else:
        # No reports found: keep writing an empty table, as before.
        reportsum = pd.DataFrame()

    # Write the result.
    reportoutput = os.path.join(outputdir, laneid + '_report_sum.txt')
    reportsum.to_csv(reportoutput, sep='\t', header=True, index=False)
def run(inputpath):
    """Produce both lane-level QC summaries for the lane at ``inputpath``.

    Calls ``ontarget_sum`` first, then ``report_sum``, each with the same
    lane directory.
    """
    for summarise in (ontarget_sum, report_sum):
        summarise(inputpath)
if __name__ == '__main__':
    # Command-line entry point: summarise the QC tables of a single lane.
    parser = argparse.ArgumentParser(description='QC sum')
    # required=True: without a lane path, None would be passed to run() and
    # fail later with an opaque AttributeError instead of a usage message.
    parser.add_argument('-i', '--inputpath', type=str, required=True,
                        help='the path of lane')
    args = parser.parse_args()
    run(args.inputpath)
|