chaowen.xi
/
602panel_test


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
							#

import pandas as pd
import os,os.path
import numpy as np
import xlrd
import argparse

###对HLA的结果
#只关注HLA 1类分子分型A，B,C
#HLA 1类分子分型结果
#1.A,B,C三个基因座如果均为杂合型，则检测结果定义位杂合；
#2.A,B,C三个基因组如果有一个或两个位纯合型，则检测结果定义为部分纯合
#3.A,B,C三个基因座如果均为纯合型，则检测结果定义为纯合
#输出结果中，如果有一个出现“-”，则该基因座为纯合子。
#最后输出结果为两行，五列（Sampleid,HLA-class,HLA-A,HLA-B,HLA-C）

##判断某个基因的二倍体信息，纯合子还是杂合子
def gene_type_infor(type1,type2):
	'''
	##输入数据包括某个基因对应的二倍体的信息。
	:param type1: diploid information of HLA gene for type1
	:param type2:diploid information of HLA gene for type1
	:return:
	'''
	#gene = 'A'
	#type1 = 'HLA-A*30:01:01'
	#type2 = 'HLA-A*02:07:01'
	# 1)首先判断是杂合还是纯合
	# 当出现一个‘-’则为纯合
	if type1 == '-' or type2 == '-':
		genetype = 'homo'
	else:
		genetype = 'hete'
	# 对结果的修改，保留第一冒号前后的数值，去除基因名称和*号
	# for type1
	if type1 == '-':
		type1_infor = '-'
	elif type1 == 'Not typed':
		type1_infor = 'Not typed'
	elif ':' in type1:
		type1_infor = (type1.split('*')[1]).split(':')[0] + ':' + (type1.split('*')[1]).split(':')[1]
	# for type2
	if type2 == '-':
		type2_infor = '-'
	elif type2 == 'Not typed':
		type2_infor = 'Not typed'
	elif ':' in type2:
		type2_infor = (type2.split('*')[1]).split(':')[0] + ':' + (type2.split('*')[1]).split(':')[1]
	# 返回基因纯杂合情况，二倍体信息
	return genetype,type1_infor,type2_infor


###HLA分型针对的是正常对照。
####改进
#结果中出现'-'表示是为纯合子，需要把对应的也改为一样
###单样本分析

def HLA_control(inputpath,laneid,normalid,sampleid):
	datasummarydir = os.path.join(inputpath, 'datasummary')
	isExists = os.path.exists(datasummarydir)
	if not isExists:
		os.makedirs(datasummarydir)
	HLAdir = os.path.join(inputpath, '7HLA-HD_unpair')
	samplelistdir=os.path.join(HLAdir,normalid+'.HLA.result.txt')
	#sampleid = normalid[:-2]
	result_dir = os.path.join(inputpath, 'resultfile')
	if not os.path.exists(result_dir):
		os.mkdir(result_dir)
	sample_dir = os.path.join(result_dir, sampleid)
	if not os.path.exists(sample_dir):
		os.mkdir(sample_dir)
	if os.path.exists(samplelistdir):
		HLAdata = pd.read_table(samplelistdir, sep='\t', header=None, names=['gene', 'type1', 'type2'])
		# 提取HLA-A的信息
		HLA_A = HLAdata[HLAdata['gene'] == 'A']
		HLA_A.reset_index(drop=True, inplace=True)
		A_type1_label = HLA_A.loc[0, 'type1']
		A_type2_label = HLA_A.loc[0, 'type2']
		HLA_A_infor = gene_type_infor(A_type1_label, A_type2_label)
		# 提取HLA-B的信息
		HLA_B = HLAdata[HLAdata['gene'] == 'B']
		HLA_B.reset_index(drop=True, inplace=True)
		B_type1_label = HLA_B.loc[0, 'type1']
		B_type2_label = HLA_B.loc[0, 'type2']
		HLA_B_infor = gene_type_infor(B_type1_label, B_type2_label)
		# 提取HLA-C的信息
		HLA_C = HLAdata[HLAdata['gene'] == 'C']
		HLA_C.reset_index(drop=True, inplace=True)
		C_type1_label = HLA_C.loc[0, 'type1']
		C_type2_label = HLA_C.loc[0, 'type2']
		HLA_C_infor = gene_type_infor(C_type1_label, C_type2_label)
		# 判断样本的杂合与纯合情况
		sample_gene_infor = [HLA_A_infor[0], HLA_B_infor[0], HLA_C_infor[0]]
		sample_hete_count = sample_gene_infor.count('hete')
		sample_homo_count = sample_gene_infor.count('homo')
		# 对整个样本的HLA判断
		if sample_homo_count == 3:
			HLA_class = 'homozygous'
		elif sample_hete_count == 3:
			HLA_class = 'heterzygous'
		else:
			HLA_class = 'partial homozygous'
		##输出到结果
		HLAdata_sample = pd.DataFrame()
		HLAdata_sample.loc[0, 'sampleid'] = sampleid
		HLAdata_sample.loc[0, 'HLA-class'] = HLA_class
		HLAdata_sample.loc[0, 'HLA-A'] = HLA_A_infor[1]
		HLAdata_sample.loc[0, 'HLA-B'] = HLA_B_infor[1]
		HLAdata_sample.loc[0, 'HLA-C'] = HLA_C_infor[1]
		if HLA_A_infor[2] == '-':
			HLAdata_sample.loc[1, 'HLA-A'] = HLA_A_infor[1]
		else:
			HLAdata_sample.loc[1, 'HLA-A'] = HLA_A_infor[2]
		if HLA_B_infor[2] == '-':
			HLAdata_sample.loc[1, 'HLA-B'] = HLA_B_infor[1]
		else:
			HLAdata_sample.loc[1, 'HLA-B'] = HLA_B_infor[2]
		if HLA_C_infor[2] == '-':
			HLAdata_sample.loc[1, 'HLA-C'] = HLA_C_infor[1]
		else:
			HLAdata_sample.loc[1, 'HLA-C'] = HLA_C_infor[2]
		# 输出到excel
		outputfile1 = os.path.join(sample_dir, laneid + '-' + sampleid + '.hla.xlsx')
		writer = pd.ExcelWriter(outputfile1)
		HLAdata_sample.to_excel(writer, sheet_name='hla', index=False)
		writer.save()
		writer.close()
	else:
		print(normalid+' is null,please check the data')
		HLAdata_sample.loc[0, 'sampleid'] = sampleid
		HLAdata_sample.loc[0, 'HLA-class'] = 'NO file'


def HLA_runmain(inputpath,normalid):
	laneid = inputpath.split('/')[-1].split('-')[-1]
	sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
	samplelist = pd.read_table(sampledir, sep='\t', header=0)
	sampledata = samplelist[samplelist['normal'] == normalid]
	sampledata.reset_index(drop=True, inplace=True)
	sampleid = sampledata.loc[0, 'samplename']
	HLA_control(inputpath,laneid,normalid,sampleid)


if __name__=='__main__':
	parser = argparse.ArgumentParser(description='filter the HLA')
	parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
	parser.add_argument('-s', '--normalid', type=str, help='the normal name of sample')
	args = parser.parse_args()
	Inputpath = args.inputpath
	Normalid = args.normalid
	HLA_runmain(Inputpath,Normalid)


#########
def HLA_sum(inputpath,laneid):
	datasummarydir = os.path.join(inputpath, 'datasummary')
	isExists = os.path.exists(datasummarydir)
	if not isExists:
		os.makedirs(datasummarydir)
	sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
	samplelist = pd.read_table(sampledir, sep='\t', header=0)
	HLAsummary_last = pd.DataFrame()
	for i in range(len(samplelist)):
		sampleid = samplelist.loc[i, 'samplename']
		normalid = samplelist.loc[i, 'normal']
		print(normalid)
		HLA_re=HLA_control(inputpath, laneid, normalid,sampleid)
		HLAsummary_last =HLAsummary_last.append(HLA_re)
	outputname = datasummarydir + '/' + laneid + '_table9_HLA_datasummary.txt'
	HLAsummary_last.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')