# d1_datasummary_QC.py
import argparse
import os
import sys

import pandas as pd
  4. def ontarget_sum(inputpath):
  5. #inputpath = '/cgdata/bioproject/pancancer602gene/CGB0329'
  6. laneid = inputpath.split('/')[-1]
  7. QCsumdir = '/cgdata/bioproject/pancancer602gene/NGS_QC_602pancancer'
  8. outputdir = os.path.join(QCsumdir, laneid)
  9. if not os.path.exists(outputdir):
  10. os.mkdir(outputdir)
  11. tempfile = os.path.join(inputpath, 'tempfile')
  12. QCdir = os.path.join(tempfile, 'QC')
  13. # 对目录下所有的ontarget.txt进行汇总
  14. files = os.listdir(QCdir)
  15. ontargetfile = [s for s in files if s.endswith('ontarget.txt')]
  16. targetsum = pd.DataFrame()
  17. for i in range(len(ontargetfile)):
  18. sampledir = os.path.join(QCdir, ontargetfile[i])
  19. sampleQC = pd.read_table(sampledir, sep='\t', header=0)
  20. targetsum = sampleQC.append(targetsum)
  21. cols = ['sampleid', 'Total_Read(M)', 'Map(%)', 'T_size', 'On_Target(%)', 'T_Dup(%)', 'T_Mean', 'Insert_Size', 'SD',
  22. 'Fold80', '>1X', '>20X', '>30X', '>50X', '>100X', '>300X', 'Adjust_30X']
  23. targetsum = targetsum[cols]
  24. #输出结果
  25. targetoutput = os.path.join(outputdir, laneid + '_' + 'target_sum.txt')
  26. targetsum.to_csv(targetoutput, sep='\t', header=True, index=False)
  27. def report_sum(inputpath):
  28. # inputpath = '/cgdata/bioproject/pancancer602gene/CGB0329'
  29. laneid = inputpath.split('/')[-1]
  30. QCsumdir = '/cgdata/bioproject/pancancer602gene/NGS_QC_602pancancer'
  31. outputdir = os.path.join(QCsumdir, laneid)
  32. if not os.path.exists(outputdir):
  33. os.mkdir(outputdir)
  34. tempfile = os.path.join(inputpath, 'tempfile')
  35. QCdir = os.path.join(tempfile, 'QC')
  36. ###对目录下所有的report QC进行汇总
  37. files = os.listdir(QCdir)
  38. reportfile = [s for s in files if s.endswith('_qc_report.txt')]
  39. reportsum = pd.DataFrame()
  40. for i in range(len(reportfile)):
  41. sampledir = os.path.join(QCdir, reportfile[i])
  42. sample_report = pd.read_table(sampledir, sep='\t', header=0)
  43. reportsum = sample_report.append(reportsum)
  44. #output the result
  45. reportoutput = os.path.join(outputdir, laneid + '_report_sum.txt')
  46. reportsum.to_csv(reportoutput, sep='\t', header=True, index=False)
  47. def run(inputpath):
  48. ontarget_sum(inputpath)
  49. report_sum(inputpath)
  50. if __name__=='__main__':
  51. parser = argparse.ArgumentParser(description='QC sum')
  52. parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
  53. args = parser.parse_args()
  54. Inputpath = args.inputpath
  55. run(Inputpath)