datafile_chemotherapy_v0_20220705_finish.py 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. import pandas as pd
  2. import os,os.path
  3. import xlrd
  4. import argparse
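# Revision 2
# Open question: can the rs3064744 genotype simply be reported as TA(6)? Also, should this site report both alleles, e.g. TA(6)/TA(6), TA(6)/TA(7), TA(7)/TA(7)?
# Only the normal control sample is analysed.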
  8. def HL_summary_v1_normal(inputpath,laneid,normalid,sampleid):
  9. druginfor=pd.read_table('/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_snp_list_loc.txt',sep='\t',header=0)
  10. druginfor.rename(columns={'rs_id':'rsID'},inplace=True)
  11. HLdir = os.path.join(inputpath, '5HL_gatk_unpair')
  12. HLdatadir = HLdir + '/' + normalid + '.HL.xls'
  13. if os.path.exists(HLdatadir) and os.path.getsize(HLdatadir) != 0:
  14. #sampleid = normalid[:-2]
  15. # make the result dir
  16. result_dir = os.path.join(inputpath, 'resultfile')
  17. if not os.path.exists(result_dir):
  18. os.mkdir(result_dir)
  19. sample_dir = os.path.join(result_dir, sampleid)
  20. if not os.path.exists(sample_dir):
  21. os.mkdir(sample_dir)
  22. HLdata = pd.read_table(HLdatadir, sep='\t')
  23. if len(HLdata) !=0:
  24. HLdata = pd.read_table(HLdatadir, sep='\t')
  25. HLdata.insert(0, 'sampleid', sampleid)
  26. # 对位点rs3064744进行分析
  27. # 如果该位点对应的allele列中有CATAT,那么result的基因为TA(6)TAAGTAGG;如果有CATATAT,那么result的基因型为TA(7)TAAGTAGG
  28. rs3064744_alt = HLdata[HLdata['rsID'] == 'rs3064744']['allele']
  29. rs3064744_index = HLdata[HLdata['rsID'] == 'rs3064744'].index
  30. alt1 = rs3064744_alt.str.contains("CATAT/CATAT")
  31. alt2 = rs3064744_alt.str.contains("CATATAT/CATATAT")
  32. alt3 = rs3064744_alt.str.contains("CATAT/CATATAT")
  33. alt4 = rs3064744_alt.str.contains("CATATAT/CATAT")
  34. if len(rs3064744_alt[alt1]) != 0:
  35. HLdata.loc[rs3064744_index, 'Result'] = 'TA(6)/TA(6)'
  36. elif len(rs3064744_alt[alt2]) != 0:
  37. HLdata.loc[rs3064744_index, 'Result'] = 'TA(7)/TA(7)'
  38. elif len(rs3064744_alt[alt3]) != 0:
  39. HLdata.loc[rs3064744_index, 'Result'] = 'TA(6)/TA(7)'
  40. elif len(rs3064744_alt[alt4]) != 0:
  41. HLdata.loc[rs3064744_index, 'Result'] = 'TA(7)/TA(6)'
  42. HLdata1 = HLdata[['sampleid', '#Gene', 'rsID', 'Result']]
  43. HLdata_screen = pd.merge(HLdata1, druginfor[['rsID']], on=['rsID'], how='inner')
  44. outputfile1 = os.path.join(sample_dir, laneid + '-' + sampleid + '.chemical.xlsx')
  45. writer = pd.ExcelWriter(outputfile1)
  46. HLdata_screen.to_excel(writer, sheet_name='chemical', index=False)
  47. writer.save()
  48. writer.close()
  49. else:
  50. print(sampleid + ' is null in HL')
  51. HLdata_screen = pd.DataFrame()
  52. HLdata_screen.loc[0, 'sampleid']= sampleid
  53. HLdata_screen.loc[0, 'label'] ='NO result for chemotherapy'
  54. outputfile1 = os.path.join(sample_dir, laneid + '-' + sampleid + '.chemical.xlsx')
  55. writer = pd.ExcelWriter(outputfile1)
  56. HLdata_screen.to_excel(writer, sheet_name='chemical', index=False)
  57. writer.save()
  58. writer.close()
  59. else:
  60. print(normalid+' is null,please check the data')
  61. HLdata_screen.loc[0, 'sampleid'] = sampleid
  62. HLdata_screen.loc[0, 'label'] = 'No file'
  63. # make the temp dir
  64. temp_dir = os.path.join(inputpath, 'tempfile')
  65. if not os.path.exists(temp_dir):
  66. os.mkdir(temp_dir)
  67. bugfile_dir = os.path.join(temp_dir, 'bugfile')
  68. if not os.path.exists(bugfile_dir):
  69. os.mkdir(bugfile_dir)
  70. outputfile2 = os.path.join(bugfile_dir, laneid + '-' + sampleid + '.chemotherapy.nofile.log.txt')
  71. HLdata_screen.to_csv(outputfile2, index=False, header=True, encoding='gbk', sep='\t')
  72. def chemothereapy_runmain(inputpath,normalid):
  73. laneid = inputpath.split('/')[-1].split('-')[-1]
  74. datasummarydir = os.path.join(inputpath, 'datasummary')
  75. isExists = os.path.exists(datasummarydir)
  76. if not isExists:
  77. os.makedirs(datasummarydir)
  78. sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
  79. samplelist = pd.read_table(sampledir, sep='\t', header=0)
  80. sampledata = samplelist[samplelist['normal'] == normalid]
  81. sampledata.reset_index(drop=True, inplace=True)
  82. sampleid = sampledata.loc[0, 'samplename']
  83. HL_summary_v1_normal(inputpath, laneid, normalid, sampleid)
  84. if __name__=='__main__':
  85. parser = argparse.ArgumentParser(description='filter the chemothereapy_runmain')
  86. parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
  87. parser.add_argument('-s', '--normalid', type=str, help='the normal name of sample')
  88. args = parser.parse_args()
  89. Inputpath = args.inputpath
  90. Normalid = args.normalid
  91. chemothereapy_runmain(Inputpath,Normalid)
# Summary across all samples listed in the lane's sample sheet
  93. def chemothera_summary(inputpath,laneid):
  94. datasummarydir = os.path.join(inputpath, 'datasummary')
  95. isExists = os.path.exists(datasummarydir)
  96. if not isExists:
  97. os.makedirs(datasummarydir)
  98. sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
  99. samplelist = pd.read_table(sampledir, sep='\t', header=0)
  100. chemosummary = pd.DataFrame()
  101. for i in range(len(samplelist)):
  102. sampleid = samplelist.loc[i, 'samplename']
  103. normalid = samplelist.loc[i, 'normal']
  104. print(normalid)
  105. chemo_re=HL_summary_v1_normal(inputpath,laneid,normalid,sampleid)
  106. chemosummary =chemosummary.append(chemo_re)
  107. outputname = datasummarydir + '/' + laneid + '_table7_chemotherapy_datasummary.txt'
  108. chemosummary.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')