datafile_HLA_v0_20220705_finish.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
  1. #
  2. import pandas as pd
  3. import os,os.path
  4. import numpy as np
  5. import xlrd
  6. import argparse
  7. ###对HLA的结果
  8. #只关注HLA 1类分子分型A,B,C
  9. #HLA 1类分子分型结果
  10. #1.A,B,C三个基因座如果均为杂合型,则检测结果定义位杂合;
  11. #2.A,B,C三个基因组如果有一个或两个位纯合型,则检测结果定义为部分纯合
  12. #3.A,B,C三个基因座如果均为纯合型,则检测结果定义为纯合
  13. #输出结果中,如果有一个出现“-”,则该基因座为纯合子。
  14. #最后输出结果为两行,五列(Sampleid,HLA-class,HLA-A,HLA-B,HLA-C)
  15. ##判断某个基因的二倍体信息,纯合子还是杂合子
  16. def gene_type_infor(type1,type2):
  17. '''
  18. ##输入数据包括某个基因对应的二倍体的信息。
  19. :param type1: diploid information of HLA gene for type1
  20. :param type2:diploid information of HLA gene for type1
  21. :return:
  22. '''
  23. #gene = 'A'
  24. #type1 = 'HLA-A*30:01:01'
  25. #type2 = 'HLA-A*02:07:01'
  26. # 1)首先判断是杂合还是纯合
  27. # 当出现一个‘-’则为纯合
  28. if type1 == '-' or type2 == '-':
  29. genetype = 'homo'
  30. else:
  31. genetype = 'hete'
  32. # 对结果的修改,保留第一冒号前后的数值,去除基因名称和*号
  33. # for type1
  34. if type1 == '-':
  35. type1_infor = '-'
  36. elif type1 == 'Not typed':
  37. type1_infor = 'Not typed'
  38. elif ':' in type1:
  39. type1_infor = (type1.split('*')[1]).split(':')[0] + ':' + (type1.split('*')[1]).split(':')[1]
  40. # for type2
  41. if type2 == '-':
  42. type2_infor = '-'
  43. elif type2 == 'Not typed':
  44. type2_infor = 'Not typed'
  45. elif ':' in type2:
  46. type2_infor = (type2.split('*')[1]).split(':')[0] + ':' + (type2.split('*')[1]).split(':')[1]
  47. # 返回基因纯杂合情况,二倍体信息
  48. return genetype,type1_infor,type2_infor
  49. ###HLA分型针对的是正常对照。
  50. ####改进
  51. #结果中出现'-'表示是为纯合子,需要把对应的也改为一样
  52. ###单样本分析
  53. def HLA_control(inputpath,laneid,normalid,sampleid):
  54. datasummarydir = os.path.join(inputpath, 'datasummary')
  55. isExists = os.path.exists(datasummarydir)
  56. if not isExists:
  57. os.makedirs(datasummarydir)
  58. HLAdir = os.path.join(inputpath, '7HLA-HD_unpair')
  59. samplelistdir=os.path.join(HLAdir,normalid+'.HLA.result.txt')
  60. #sampleid = normalid[:-2]
  61. result_dir = os.path.join(inputpath, 'resultfile')
  62. if not os.path.exists(result_dir):
  63. os.mkdir(result_dir)
  64. sample_dir = os.path.join(result_dir, sampleid)
  65. if not os.path.exists(sample_dir):
  66. os.mkdir(sample_dir)
  67. if os.path.exists(samplelistdir):
  68. HLAdata = pd.read_table(samplelistdir, sep='\t', header=None, names=['gene', 'type1', 'type2'])
  69. # 提取HLA-A的信息
  70. HLA_A = HLAdata[HLAdata['gene'] == 'A']
  71. HLA_A.reset_index(drop=True, inplace=True)
  72. A_type1_label = HLA_A.loc[0, 'type1']
  73. A_type2_label = HLA_A.loc[0, 'type2']
  74. HLA_A_infor = gene_type_infor(A_type1_label, A_type2_label)
  75. # 提取HLA-B的信息
  76. HLA_B = HLAdata[HLAdata['gene'] == 'B']
  77. HLA_B.reset_index(drop=True, inplace=True)
  78. B_type1_label = HLA_B.loc[0, 'type1']
  79. B_type2_label = HLA_B.loc[0, 'type2']
  80. HLA_B_infor = gene_type_infor(B_type1_label, B_type2_label)
  81. # 提取HLA-C的信息
  82. HLA_C = HLAdata[HLAdata['gene'] == 'C']
  83. HLA_C.reset_index(drop=True, inplace=True)
  84. C_type1_label = HLA_C.loc[0, 'type1']
  85. C_type2_label = HLA_C.loc[0, 'type2']
  86. HLA_C_infor = gene_type_infor(C_type1_label, C_type2_label)
  87. # 判断样本的杂合与纯合情况
  88. sample_gene_infor = [HLA_A_infor[0], HLA_B_infor[0], HLA_C_infor[0]]
  89. sample_hete_count = sample_gene_infor.count('hete')
  90. sample_homo_count = sample_gene_infor.count('homo')
  91. # 对整个样本的HLA判断
  92. if sample_homo_count == 3:
  93. HLA_class = 'homozygous'
  94. elif sample_hete_count == 3:
  95. HLA_class = 'heterzygous'
  96. else:
  97. HLA_class = 'partial homozygous'
  98. ##输出到结果
  99. HLAdata_sample = pd.DataFrame()
  100. HLAdata_sample.loc[0, 'sampleid'] = sampleid
  101. HLAdata_sample.loc[0, 'HLA-class'] = HLA_class
  102. HLAdata_sample.loc[0, 'HLA-A'] = HLA_A_infor[1]
  103. HLAdata_sample.loc[0, 'HLA-B'] = HLA_B_infor[1]
  104. HLAdata_sample.loc[0, 'HLA-C'] = HLA_C_infor[1]
  105. if HLA_A_infor[2] == '-':
  106. HLAdata_sample.loc[1, 'HLA-A'] = HLA_A_infor[1]
  107. else:
  108. HLAdata_sample.loc[1, 'HLA-A'] = HLA_A_infor[2]
  109. if HLA_B_infor[2] == '-':
  110. HLAdata_sample.loc[1, 'HLA-B'] = HLA_B_infor[1]
  111. else:
  112. HLAdata_sample.loc[1, 'HLA-B'] = HLA_B_infor[2]
  113. if HLA_C_infor[2] == '-':
  114. HLAdata_sample.loc[1, 'HLA-C'] = HLA_C_infor[1]
  115. else:
  116. HLAdata_sample.loc[1, 'HLA-C'] = HLA_C_infor[2]
  117. # 输出到excel
  118. outputfile1 = os.path.join(sample_dir, laneid + '-' + sampleid + '.hla.xlsx')
  119. writer = pd.ExcelWriter(outputfile1)
  120. HLAdata_sample.to_excel(writer, sheet_name='hla', index=False)
  121. writer.save()
  122. writer.close()
  123. else:
  124. print(normalid+' is null,please check the data')
  125. HLAdata_sample.loc[0, 'sampleid'] = sampleid
  126. HLAdata_sample.loc[0, 'HLA-class'] = 'NO file'
  127. def HLA_runmain(inputpath,normalid):
  128. laneid = inputpath.split('/')[-1].split('-')[-1]
  129. sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
  130. samplelist = pd.read_table(sampledir, sep='\t', header=0)
  131. sampledata = samplelist[samplelist['normal'] == normalid]
  132. sampledata.reset_index(drop=True, inplace=True)
  133. sampleid = sampledata.loc[0, 'samplename']
  134. HLA_control(inputpath,laneid,normalid,sampleid)
  135. if __name__=='__main__':
  136. parser = argparse.ArgumentParser(description='filter the HLA')
  137. parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
  138. parser.add_argument('-s', '--normalid', type=str, help='the normal name of sample')
  139. args = parser.parse_args()
  140. Inputpath = args.inputpath
  141. Normalid = args.normalid
  142. HLA_runmain(Inputpath,Normalid)
  143. #########
  144. def HLA_sum(inputpath,laneid):
  145. datasummarydir = os.path.join(inputpath, 'datasummary')
  146. isExists = os.path.exists(datasummarydir)
  147. if not isExists:
  148. os.makedirs(datasummarydir)
  149. sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
  150. samplelist = pd.read_table(sampledir, sep='\t', header=0)
  151. HLAsummary_last = pd.DataFrame()
  152. for i in range(len(samplelist)):
  153. sampleid = samplelist.loc[i, 'samplename']
  154. normalid = samplelist.loc[i, 'normal']
  155. print(normalid)
  156. HLA_re=HLA_control(inputpath, laneid, normalid,sampleid)
  157. HLAsummary_last =HLAsummary_last.append(HLA_re)
  158. outputname = datasummarydir + '/' + laneid + '_table9_HLA_datasummary.txt'
  159. HLAsummary_last.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')