# import pandas as pd import os,os.path import numpy as np import xlrd import argparse ###对HLA的结果 #只关注HLA 1类分子分型A,B,C #HLA 1类分子分型结果 #1.A,B,C三个基因座如果均为杂合型,则检测结果定义位杂合; #2.A,B,C三个基因组如果有一个或两个位纯合型,则检测结果定义为部分纯合 #3.A,B,C三个基因座如果均为纯合型,则检测结果定义为纯合 #输出结果中,如果有一个出现“-”,则该基因座为纯合子。 #最后输出结果为两行,五列(Sampleid,HLA-class,HLA-A,HLA-B,HLA-C) ##判断某个基因的二倍体信息,纯合子还是杂合子 def gene_type_infor(type1,type2): ''' ##输入数据包括某个基因对应的二倍体的信息。 :param type1: diploid information of HLA gene for type1 :param type2:diploid information of HLA gene for type1 :return: ''' #gene = 'A' #type1 = 'HLA-A*30:01:01' #type2 = 'HLA-A*02:07:01' # 1)首先判断是杂合还是纯合 # 当出现一个‘-’则为纯合 if type1 == '-' or type2 == '-': genetype = 'homo' else: genetype = 'hete' # 对结果的修改,保留第一冒号前后的数值,去除基因名称和*号 # for type1 if type1 == '-': type1_infor = '-' elif type1 == 'Not typed': type1_infor = 'Not typed' elif ':' in type1: type1_infor = (type1.split('*')[1]).split(':')[0] + ':' + (type1.split('*')[1]).split(':')[1] # for type2 if type2 == '-': type2_infor = '-' elif type2 == 'Not typed': type2_infor = 'Not typed' elif ':' in type2: type2_infor = (type2.split('*')[1]).split(':')[0] + ':' + (type2.split('*')[1]).split(':')[1] # 返回基因纯杂合情况,二倍体信息 return genetype,type1_infor,type2_infor ###HLA分型针对的是正常对照。 ####改进 #结果中出现'-'表示是为纯合子,需要把对应的也改为一样 ###单样本分析 def HLA_control(inputpath,laneid,normalid,sampleid): datasummarydir = os.path.join(inputpath, 'datasummary') isExists = os.path.exists(datasummarydir) if not isExists: os.makedirs(datasummarydir) HLAdir = os.path.join(inputpath, '7HLA-HD_unpair') samplelistdir=os.path.join(HLAdir,normalid+'.HLA.result.txt') #sampleid = normalid[:-2] result_dir = os.path.join(inputpath, 'resultfile') if not os.path.exists(result_dir): os.mkdir(result_dir) sample_dir = os.path.join(result_dir, sampleid) if not os.path.exists(sample_dir): os.mkdir(sample_dir) if os.path.exists(samplelistdir): HLAdata = pd.read_table(samplelistdir, sep='\t', header=None, names=['gene', 'type1', 'type2']) # 提取HLA-A的信息 HLA_A = HLAdata[HLAdata['gene'] == 'A'] HLA_A.reset_index(drop=True, inplace=True) A_type1_label = HLA_A.loc[0, 'type1'] A_type2_label = HLA_A.loc[0, 'type2'] HLA_A_infor = gene_type_infor(A_type1_label, A_type2_label) # 提取HLA-B的信息 HLA_B = HLAdata[HLAdata['gene'] == 'B'] HLA_B.reset_index(drop=True, inplace=True) B_type1_label = HLA_B.loc[0, 'type1'] B_type2_label = HLA_B.loc[0, 'type2'] HLA_B_infor = gene_type_infor(B_type1_label, B_type2_label) # 提取HLA-C的信息 HLA_C = HLAdata[HLAdata['gene'] == 'C'] HLA_C.reset_index(drop=True, inplace=True) C_type1_label = HLA_C.loc[0, 'type1'] C_type2_label = HLA_C.loc[0, 'type2'] HLA_C_infor = gene_type_infor(C_type1_label, C_type2_label) # 判断样本的杂合与纯合情况 sample_gene_infor = [HLA_A_infor[0], HLA_B_infor[0], HLA_C_infor[0]] sample_hete_count = sample_gene_infor.count('hete') sample_homo_count = sample_gene_infor.count('homo') # 对整个样本的HLA判断 if sample_homo_count == 3: HLA_class = 'homozygous' elif sample_hete_count == 3: HLA_class = 'heterzygous' else: HLA_class = 'partial homozygous' ##输出到结果 HLAdata_sample = pd.DataFrame() HLAdata_sample.loc[0, 'sampleid'] = sampleid HLAdata_sample.loc[0, 'HLA-class'] = HLA_class HLAdata_sample.loc[0, 'HLA-A'] = HLA_A_infor[1] HLAdata_sample.loc[0, 'HLA-B'] = HLA_B_infor[1] HLAdata_sample.loc[0, 'HLA-C'] = HLA_C_infor[1] if HLA_A_infor[2] == '-': HLAdata_sample.loc[1, 'HLA-A'] = HLA_A_infor[1] else: HLAdata_sample.loc[1, 'HLA-A'] = HLA_A_infor[2] if HLA_B_infor[2] == '-': HLAdata_sample.loc[1, 'HLA-B'] = HLA_B_infor[1] else: HLAdata_sample.loc[1, 'HLA-B'] = HLA_B_infor[2] if HLA_C_infor[2] == '-': HLAdata_sample.loc[1, 'HLA-C'] = HLA_C_infor[1] else: HLAdata_sample.loc[1, 'HLA-C'] = HLA_C_infor[2] # 输出到excel outputfile1 = os.path.join(sample_dir, laneid + '-' + sampleid + '.hla.xlsx') writer = pd.ExcelWriter(outputfile1) HLAdata_sample.to_excel(writer, sheet_name='hla', index=False) writer.save() writer.close() else: print(normalid+' is null,please check the data') HLAdata_sample.loc[0, 'sampleid'] = sampleid HLAdata_sample.loc[0, 'HLA-class'] = 'NO file' def HLA_runmain(inputpath,normalid): laneid = inputpath.split('/')[-1].split('-')[-1] sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt') samplelist = pd.read_table(sampledir, sep='\t', header=0) sampledata = samplelist[samplelist['normal'] == normalid] sampledata.reset_index(drop=True, inplace=True) sampleid = sampledata.loc[0, 'samplename'] HLA_control(inputpath,laneid,normalid,sampleid) if __name__=='__main__': parser = argparse.ArgumentParser(description='filter the HLA') parser.add_argument('-i', '--inputpath', type=str, help='the path of lane') parser.add_argument('-s', '--normalid', type=str, help='the normal name of sample') args = parser.parse_args() Inputpath = args.inputpath Normalid = args.normalid HLA_runmain(Inputpath,Normalid) ######### def HLA_sum(inputpath,laneid): datasummarydir = os.path.join(inputpath, 'datasummary') isExists = os.path.exists(datasummarydir) if not isExists: os.makedirs(datasummarydir) sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt') samplelist = pd.read_table(sampledir, sep='\t', header=0) HLAsummary_last = pd.DataFrame() for i in range(len(samplelist)): sampleid = samplelist.loc[i, 'samplename'] normalid = samplelist.loc[i, 'normal'] print(normalid) HLA_re=HLA_control(inputpath, laneid, normalid,sampleid) HLAsummary_last =HLAsummary_last.append(HLA_re) outputname = datasummarydir + '/' + laneid + '_table9_HLA_datasummary.txt' HLAsummary_last.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')