s0_sampleinfor_v0_20220705.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
# edit the file
# 1. clean data
# 2. align
# 3. SV calling
  5. import sys,collections,math,os,os.path,re
  6. import pandas as pd
  7. from pandas.core.frame import DataFrame
  8. import argparse
  9. import numpy as np
  10. def analis_dir(inputpath):
  11. svdir1 = os.path.join(inputpath, '1SV_varscan_pair')
  12. if not os.path.exists(svdir1):
  13. os.mkdir(svdir1)
  14. svdir2= os.path.join(inputpath, '1SV_vardict_pair')
  15. if not os.path.exists(svdir2):
  16. os.mkdir(svdir2)
  17. # for CNV
  18. CNVdir = os.path.join(inputpath, '2CNV_cnvkit_pair')
  19. if not os.path.exists(CNVdir):
  20. os.mkdir(CNVdir)
  21. # for MSI
  22. MSI_dir = os.path.join(inputpath, '3MSI_msisensor2_pair')
  23. if not os.path.exists(MSI_dir):
  24. os.mkdir(MSI_dir)
  25. # for germline result
  26. gemerlinedir = os.path.join(inputpath, '4Germline_unpair')
  27. if not os.path.exists(gemerlinedir):
  28. os.mkdir(gemerlinedir)
  29. # for HL result
  30. HLdir = os.path.join(inputpath, '5HL_gatk_unpair')
  31. if not os.path.exists(HLdir):
  32. os.mkdir(HLdir)
  33. # for fusion
  34. fusion_method1_dir = os.path.join(inputpath, '6Fusion_manta_pair')
  35. if not os.path.exists(fusion_method1_dir):
  36. os.mkdir(fusion_method1_dir)
  37. fusion_method2_dir = os.path.join(inputpath, '6Fusion_genefusion_unpair')
  38. if not os.path.exists(fusion_method2_dir):
  39. os.mkdir(fusion_method2_dir)
  40. #for HLA
  41. HLA_dir = os.path.join(inputpath, '7HLA-HD_unpair')
  42. if not os.path.exists(HLA_dir):
  43. os.mkdir(HLA_dir)
  44. # for qc
  45. qc_dir = os.path.join(inputpath, '8Fastqc')
  46. if not os.path.exists(qc_dir):
  47. os.mkdir(qc_dir)
  48. # for ontarget
  49. ontarget_dir = os.path.join(inputpath, '9Ontarget')
  50. if not os.path.exists(ontarget_dir):
  51. os.mkdir(ontarget_dir)
  52. # for coverage
  53. coverage_dir = os.path.join(inputpath, '10Coverage')
  54. if not os.path.exists(coverage_dir):
  55. os.mkdir(coverage_dir)
  56. # datasummary
  57. datasummary_dir = os.path.join(inputpath, 'datasummary')
  58. if not os.path.exists(datasummary_dir):
  59. os.mkdir(datasummary_dir)
##### First pair up the samples: find the tumor and normal libraries that belong to the same sample
  61. def sampleinfor(inputpath,laneid):
  62. files = os.listdir(inputpath)
  63. files1 = [inputpath + '/' + f for f in files if f.endswith(('sample.txt'))]
  64. samplelist = pd.read_table(files1[0], names=['sampleid'], sep='\t')
  65. samplenamelist = pd.DataFrame()
  66. for i in range(len(samplelist)):
  67. sample = samplelist.loc[i, 'sampleid'][:-2]
  68. samplenamelist.loc[i, 'samplename'] = sample
  69. samplenamelist_uq = samplenamelist.drop_duplicates().reset_index(drop=True)
  70. sample_pair = pd.DataFrame()
  71. for j in range(len(samplenamelist_uq)):
  72. sample = samplenamelist_uq.loc[j, 'samplename']
  73. sample_pair.loc[j, 'samplename'] = sample
  74. for k in range(len(samplelist)):
  75. sampleid = samplelist.loc[k, 'sampleid']
  76. print(sampleid)
  77. label = sampleid.find(sample)
  78. if label != -1:
  79. lib_lable = sampleid[-1]
  80. print(lib_lable)
  81. if lib_lable == 'T':
  82. sample_pair.loc[j, 'tumor'] = sampleid
  83. elif lib_lable == 'N':
  84. sample_pair.loc[j, 'normal'] = sampleid
  85. outputname = inputpath + '/' +laneid+'_'+ 'sample_infor_label.txt'
  86. sample_pair.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')
  87. if __name__=='__main__':
  88. parser = argparse.ArgumentParser(description='sample infor')
  89. parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
  90. parser.add_argument('-l', '--laneid', type=str, help='laneid')
  91. args = parser.parse_args()
  92. Inputpath = args.inputpath
  93. Laneid = args.laneid
  94. sampleinfor(Inputpath,Laneid)