123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173 |
- #
- import pandas as pd
- import os,os.path
- import numpy as np
- import xlrd
- import argparse
- ###对HLA的结果
- #只关注HLA 1类分子分型A,B,C
- #HLA 1类分子分型结果
- #1.A,B,C三个基因座如果均为杂合型,则检测结果定义位杂合;
- #2.A,B,C三个基因组如果有一个或两个位纯合型,则检测结果定义为部分纯合
- #3.A,B,C三个基因座如果均为纯合型,则检测结果定义为纯合
- #输出结果中,如果有一个出现“-”,则该基因座为纯合子。
- #最后输出结果为两行,五列(Sampleid,HLA-class,HLA-A,HLA-B,HLA-C)
- ##判断某个基因的二倍体信息,纯合子还是杂合子
- def gene_type_infor(type1,type2):
- '''
- ##输入数据包括某个基因对应的二倍体的信息。
- :param type1: diploid information of HLA gene for type1
- :param type2:diploid information of HLA gene for type1
- :return:
- '''
- #gene = 'A'
- #type1 = 'HLA-A*30:01:01'
- #type2 = 'HLA-A*02:07:01'
- # 1)首先判断是杂合还是纯合
- # 当出现一个‘-’则为纯合
- if type1 == '-' or type2 == '-':
- genetype = 'homo'
- else:
- genetype = 'hete'
- # 对结果的修改,保留第一冒号前后的数值,去除基因名称和*号
- # for type1
- if type1 == '-':
- type1_infor = '-'
- elif type1 == 'Not typed':
- type1_infor = 'Not typed'
- elif ':' in type1:
- type1_infor = (type1.split('*')[1]).split(':')[0] + ':' + (type1.split('*')[1]).split(':')[1]
- # for type2
- if type2 == '-':
- type2_infor = '-'
- elif type2 == 'Not typed':
- type2_infor = 'Not typed'
- elif ':' in type2:
- type2_infor = (type2.split('*')[1]).split(':')[0] + ':' + (type2.split('*')[1]).split(':')[1]
- # 返回基因纯杂合情况,二倍体信息
- return genetype,type1_infor,type2_infor
- ###HLA分型针对的是正常对照。
- ####改进
- #结果中出现'-'表示是为纯合子,需要把对应的也改为一样
- ###单样本分析
- def HLA_control(inputpath,laneid,normalid,sampleid):
- datasummarydir = os.path.join(inputpath, 'datasummary')
- isExists = os.path.exists(datasummarydir)
- if not isExists:
- os.makedirs(datasummarydir)
- HLAdir = os.path.join(inputpath, '7HLA-HD_unpair')
- samplelistdir=os.path.join(HLAdir,normalid+'.HLA.result.txt')
- #sampleid = normalid[:-2]
- result_dir = os.path.join(inputpath, 'resultfile')
- if not os.path.exists(result_dir):
- os.mkdir(result_dir)
- sample_dir = os.path.join(result_dir, sampleid)
- if not os.path.exists(sample_dir):
- os.mkdir(sample_dir)
- if os.path.exists(samplelistdir):
- HLAdata = pd.read_table(samplelistdir, sep='\t', header=None, names=['gene', 'type1', 'type2'])
- # 提取HLA-A的信息
- HLA_A = HLAdata[HLAdata['gene'] == 'A']
- HLA_A.reset_index(drop=True, inplace=True)
- A_type1_label = HLA_A.loc[0, 'type1']
- A_type2_label = HLA_A.loc[0, 'type2']
- HLA_A_infor = gene_type_infor(A_type1_label, A_type2_label)
- # 提取HLA-B的信息
- HLA_B = HLAdata[HLAdata['gene'] == 'B']
- HLA_B.reset_index(drop=True, inplace=True)
- B_type1_label = HLA_B.loc[0, 'type1']
- B_type2_label = HLA_B.loc[0, 'type2']
- HLA_B_infor = gene_type_infor(B_type1_label, B_type2_label)
- # 提取HLA-C的信息
- HLA_C = HLAdata[HLAdata['gene'] == 'C']
- HLA_C.reset_index(drop=True, inplace=True)
- C_type1_label = HLA_C.loc[0, 'type1']
- C_type2_label = HLA_C.loc[0, 'type2']
- HLA_C_infor = gene_type_infor(C_type1_label, C_type2_label)
- # 判断样本的杂合与纯合情况
- sample_gene_infor = [HLA_A_infor[0], HLA_B_infor[0], HLA_C_infor[0]]
- sample_hete_count = sample_gene_infor.count('hete')
- sample_homo_count = sample_gene_infor.count('homo')
- # 对整个样本的HLA判断
- if sample_homo_count == 3:
- HLA_class = 'homozygous'
- elif sample_hete_count == 3:
- HLA_class = 'heterzygous'
- else:
- HLA_class = 'partial homozygous'
- ##输出到结果
- HLAdata_sample = pd.DataFrame()
- HLAdata_sample.loc[0, 'sampleid'] = sampleid
- HLAdata_sample.loc[0, 'HLA-class'] = HLA_class
- HLAdata_sample.loc[0, 'HLA-A'] = HLA_A_infor[1]
- HLAdata_sample.loc[0, 'HLA-B'] = HLA_B_infor[1]
- HLAdata_sample.loc[0, 'HLA-C'] = HLA_C_infor[1]
- if HLA_A_infor[2] == '-':
- HLAdata_sample.loc[1, 'HLA-A'] = HLA_A_infor[1]
- else:
- HLAdata_sample.loc[1, 'HLA-A'] = HLA_A_infor[2]
- if HLA_B_infor[2] == '-':
- HLAdata_sample.loc[1, 'HLA-B'] = HLA_B_infor[1]
- else:
- HLAdata_sample.loc[1, 'HLA-B'] = HLA_B_infor[2]
- if HLA_C_infor[2] == '-':
- HLAdata_sample.loc[1, 'HLA-C'] = HLA_C_infor[1]
- else:
- HLAdata_sample.loc[1, 'HLA-C'] = HLA_C_infor[2]
- # 输出到excel
- outputfile1 = os.path.join(sample_dir, laneid + '-' + sampleid + '.hla.xlsx')
- writer = pd.ExcelWriter(outputfile1)
- HLAdata_sample.to_excel(writer, sheet_name='hla', index=False)
- writer.save()
- writer.close()
- else:
- print(normalid+' is null,please check the data')
- HLAdata_sample.loc[0, 'sampleid'] = sampleid
- HLAdata_sample.loc[0, 'HLA-class'] = 'NO file'
- def HLA_runmain(inputpath,normalid):
- laneid = inputpath.split('/')[-1].split('-')[-1]
- sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
- samplelist = pd.read_table(sampledir, sep='\t', header=0)
- sampledata = samplelist[samplelist['normal'] == normalid]
- sampledata.reset_index(drop=True, inplace=True)
- sampleid = sampledata.loc[0, 'samplename']
- HLA_control(inputpath,laneid,normalid,sampleid)
- if __name__=='__main__':
- parser = argparse.ArgumentParser(description='filter the HLA')
- parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
- parser.add_argument('-s', '--normalid', type=str, help='the normal name of sample')
- args = parser.parse_args()
- Inputpath = args.inputpath
- Normalid = args.normalid
- HLA_runmain(Inputpath,Normalid)
- #########
- def HLA_sum(inputpath,laneid):
- datasummarydir = os.path.join(inputpath, 'datasummary')
- isExists = os.path.exists(datasummarydir)
- if not isExists:
- os.makedirs(datasummarydir)
- sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
- samplelist = pd.read_table(sampledir, sep='\t', header=0)
- HLAsummary_last = pd.DataFrame()
- for i in range(len(samplelist)):
- sampleid = samplelist.loc[i, 'samplename']
- normalid = samplelist.loc[i, 'normal']
- print(normalid)
- HLA_re=HLA_control(inputpath, laneid, normalid,sampleid)
- HLAsummary_last =HLAsummary_last.append(HLA_re)
- outputname = datasummarydir + '/' + laneid + '_table9_HLA_datasummary.txt'
- HLAsummary_last.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')
|