# datafile_QCcheck_summary_v0_20220929.py (4.1 KB)
  1. import sys,os,os.path
  2. import pandas as pd
  3. from pandas.core.frame import DataFrame
  4. import argparse
  5. import re
  6. sys.path.append('/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/')
  7. import datafile_QC_v0_20220906_finish as qcsummary
  8. #inputpath='/cgdata/bioproject/pancancer602gene/CGB0158_1'
  9. #laneid='CGB0158_1'
  10. def sampleQCcheck(inputpath,laneid):
  11. ###读入样本表
  12. sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
  13. sampletable = pd.read_table(sampledir, sep='\t', header=0)
  14. for i in range(len(sampletable)):
  15. tumor = sampletable.loc[i, 'tumor']
  16. try:
  17. qcsummary.qcrun(inputpath, tumor)
  18. except:
  19. print(tumor + ' qc data is wrong,please check!')
  20. def qcsummary_tumor(inputpath,laneid):
  21. QCdir = os.path.join(inputpath, 'tempfile/QC')
  22. ###读入样本表
  23. sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
  24. sampletable = pd.read_table(sampledir, sep='\t', header=0)
  25. QCsummary_tumor = pd.DataFrame()
  26. for i in range(len(sampletable)):
  27. tumor = sampletable.loc[i, 'tumor']
  28. samplename = sampletable.loc[i, 'samplename']
  29. tumorQCdir = os.path.join(QCdir, tumor + '_qc_report.txt')
  30. if os.path.exists(tumorQCdir):
  31. tumorQCdata = pd.read_table(tumorQCdir, sep='\t', header=0)
  32. tumorQCdata.loc[0, 'samplename'] = samplename
  33. tumorQCdata.loc[0, 'laneid'] = laneid
  34. else:
  35. tumorQCdata = pd.DataFrame()
  36. tumorQCdata.loc[0, 'samplename'] = samplename
  37. tumorQCdata.loc[0, 'sampleid'] = tumor
  38. tumorQCdata.loc[0, 'laneid'] = laneid
  39. QCsummary_tumor = tumorQCdata.append(QCsummary_tumor)
  40. QCsummary_tumor['Total_base(G)'] = pd.DataFrame(round(QCsummary_tumor['Total_base'] / 1000000000, 2))
  41. del QCsummary_tumor['Total_base']
  42. titlelist = ['samplename', 'sampleid', 'Total_base(G)', 'Total_average_depth', 'Unique_average_depth',
  43. 'insert_size', 'coverge_uniform', 'Q30', 'QC_overall', 'laneid']
  44. QCsummary_tumor = QCsummary_tumor[titlelist]
  45. return QCsummary_tumor
  46. def qcsummary_normal(inputpath,laneid):
  47. QCdir = os.path.join(inputpath, 'tempfile/QC')
  48. ###读入样本表
  49. sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
  50. sampletable = pd.read_table(sampledir, sep='\t', header=0)
  51. QCsummary_normal = pd.DataFrame()
  52. for i in range(len(sampletable)):
  53. normal = sampletable.loc[i, 'normal']
  54. samplename = sampletable.loc[i, 'samplename']
  55. tumorQCdir = os.path.join(QCdir, normal + '_qc_report.txt')
  56. if os.path.exists(tumorQCdir):
  57. tumorQCdata = pd.read_table(tumorQCdir, sep='\t', header=0)
  58. tumorQCdata.loc[0, 'samplename'] = samplename
  59. tumorQCdata.loc[0, 'laneid'] = laneid
  60. else:
  61. tumorQCdata = pd.DataFrame()
  62. tumorQCdata.loc[0, 'samplename'] = samplename
  63. tumorQCdata.loc[0, 'sampleid'] = tumor
  64. tumorQCdata.loc[0, 'laneid'] = laneid
  65. QCsummary_normal = tumorQCdata.append(QCsummary_normal)
  66. QCsummary_normal['Total_base(G)'] = pd.DataFrame(round(QCsummary_normal['Total_base'] / 1000000000, 2))
  67. del QCsummary_normal['Total_base']
  68. titlelist=['samplename', 'sampleid','Total_base(G)', 'Total_average_depth', 'Unique_average_depth', 'insert_size', 'coverge_uniform', 'Q30', 'QC_overall', 'laneid']
  69. QCsummary_normal=QCsummary_normal[titlelist]
  70. return QCsummary_normal
  71. def QCSum_allsum_main(inputpath,laneid):
  72. #先对运行后的结果进行检验
  73. sampleQCcheck(inputpath, laneid)
  74. #进行qcsummary
  75. qctumor = qcsummary_tumor(inputpath, laneid)
  76. qcnormal = qcsummary_normal(inputpath, laneid)
  77. qcsummarydata = qctumor.append(qcnormal)
  78. outputdir = os.path.join(inputpath, 'datasummary')
  79. if not os.path.exists(outputdir):
  80. os.mkdir(outputdir)
  81. outputname = os.path.join(outputdir, laneid + '_' + 'table1_QCsummary.txt')
  82. qcsummarydata.to_csv(outputname, sep='\t', index=False, header=True)
  83. if __name__=='__main__':
  84. parser = argparse.ArgumentParser(description='for the QCsum')
  85. parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
  86. parser.add_argument('-l', '--laneid', type=str, help='the laneid')
  87. args = parser.parse_args()
  88. Inputpath = args.inputpath
  89. Laneid = args.laneid
  90. QCSum_allsum_main(Inputpath,Laneid)