# chaowen.xi / 602panel_test

import argparse
import os
import os.path

import pandas as pd
import xlrd

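# Revision 2
# Could the rs3064744 genotype be reported simply as TA(6)? Also, should this locus report both alleles, e.g. TA(6)/TA(6)  TA(6)/TA(7)  TA(7)/TA(7)?
# Only the normal control is analysed.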

def _rs3064744_genotype(allele):
	"""Map one rs3064744 allele string to its TA-repeat genotype label.

	Returns 'TA(6)/TA(6)', 'TA(6)/TA(7)', 'TA(7)/TA(6)' or 'TA(7)/TA(7)', or
	None when the value is not a string or contains none of the known patterns.
	"""
	if not isinstance(allele, str):
		return None
	# Longest patterns first: 'CATAT/CATAT' is a substring of 'CATAT/CATATAT'
	# (and 'CATATAT/CATAT' of 'CATATAT/CATATAT'), so testing the short
	# homozygous pattern first would mislabel a heterozygous call.
	for pattern, label in (
		('CATATAT/CATATAT', 'TA(7)/TA(7)'),
		('CATAT/CATATAT', 'TA(6)/TA(7)'),
		('CATATAT/CATAT', 'TA(7)/TA(6)'),
		('CATAT/CATAT', 'TA(6)/TA(6)'),
	):
		if pattern in allele:
			return label
	return None


def HL_summary_v1_normal(inputpath,laneid,normalid,sampleid,
		druglist_path='/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_snp_list_loc.txt'):
	"""Summarise the chemotherapy-related SNP calls of one normal sample.

	Reads ``<inputpath>/5HL_gatk_unpair/<normalid>.HL.xls`` (tab-separated),
	rewrites the ``Result`` of rs3064744 rows as a TA-repeat genotype, keeps
	only the rsIDs listed in the drug SNP reference table and writes the table
	to ``<inputpath>/resultfile/<sampleid>/<laneid>-<sampleid>.chemical.xlsx``.

	Args:
		inputpath: lane directory holding the pipeline outputs.
		laneid: lane identifier used as output file-name prefix.
		normalid: name of the normal sample; selects the HL file to read.
		sampleid: sample name written into the table and used for output paths.
		druglist_path: tab-separated reference table with an ``rs_id`` column.
			Only read when the HL table actually contains rows.

	Returns:
		The DataFrame that was written out:
		- the screened table (sampleid, #Gene, rsID, Result), or
		- a one-row placeholder with columns sampleid/label when the HL table
		  has no rows ('NO result for chemotherapy'), or
		- a one-row placeholder with label 'No file' when the HL file is
		  missing or zero bytes; in that case nothing is written to
		  ``resultfile`` and the placeholder is logged under
		  ``<inputpath>/tempfile/bugfile`` instead.
	"""
	HLdatadir = os.path.join(inputpath, '5HL_gatk_unpair', normalid + '.HL.xls')

	if not (os.path.exists(HLdatadir) and os.path.getsize(HLdatadir) != 0):
		print(normalid+' is null,please check the data')
		# Build the placeholder here; it does not exist yet on this path.
		HLdata_screen = pd.DataFrame({'sampleid': [sampleid], 'label': ['No file']})
		bugfile_dir = os.path.join(inputpath, 'tempfile', 'bugfile')
		os.makedirs(bugfile_dir, exist_ok=True)
		outputfile2 = os.path.join(bugfile_dir, laneid + '-' + sampleid + '.chemotherapy.nofile.log.txt')
		HLdata_screen.to_csv(outputfile2, index=False, header=True, encoding='gbk', sep='\t')
		return HLdata_screen

	sample_dir = os.path.join(inputpath, 'resultfile', sampleid)
	os.makedirs(sample_dir, exist_ok=True)
	outputfile1 = os.path.join(sample_dir, laneid + '-' + sampleid + '.chemical.xlsx')

	HLdata = pd.read_table(HLdatadir, sep='\t')
	if len(HLdata) != 0:
		druginfor = pd.read_table(druglist_path, sep='\t', header=0)
		druginfor.rename(columns={'rs_id':'rsID'}, inplace=True)
		HLdata.insert(0, 'sampleid', sampleid)
		# Genotype each rs3064744 row from its own allele string; rows whose
		# allele matches no known pattern keep their original Result.
		is_rs3064744 = HLdata['rsID'] == 'rs3064744'
		genotypes = HLdata.loc[is_rs3064744, 'allele'].map(_rs3064744_genotype).dropna()
		if len(genotypes) != 0:
			HLdata.loc[genotypes.index, 'Result'] = genotypes
		HLdata1 = HLdata[['sampleid', '#Gene', 'rsID', 'Result']]
		HLdata_screen = pd.merge(HLdata1, druginfor[['rsID']], on=['rsID'], how='inner')
	else:
		print(sampleid + ' is null in HL')
		HLdata_screen = pd.DataFrame({'sampleid': [sampleid], 'label': ['NO result for chemotherapy']})

	# to_excel with a path opens, saves and closes the workbook itself.
	HLdata_screen.to_excel(outputfile1, sheet_name='chemical', index=False)
	return HLdata_screen


def chemothereapy_runmain(inputpath,normalid):
	"""Run the chemotherapy SNP summary for one normal sample of a lane.

	The lane id is the text after the last '-' in the final component of
	``inputpath``. The sample name is looked up in
	``<inputpath>/<laneid>_sample_infor_label.txt`` (tab-separated, columns
	``samplename`` and ``normal``); the first row whose ``normal`` equals
	``normalid`` is used. ``<inputpath>/datasummary`` is created if missing.

	Args:
		inputpath: lane directory; a trailing '/' is tolerated.
		normalid: normal sample name to look up.

	Raises:
		KeyError: if no row of the sample sheet has ``normal == normalid``.
	"""
	# Strip a trailing '/' first, otherwise the last path component (and so
	# the lane id) would be empty.
	laneid = os.path.basename(inputpath.rstrip('/')).split('-')[-1]
	datasummarydir = os.path.join(inputpath, 'datasummary')
	os.makedirs(datasummarydir, exist_ok=True)
	sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
	samplelist = pd.read_table(sampledir, sep='\t', header=0)
	matches = samplelist.loc[samplelist['normal'] == normalid, 'samplename']
	if matches.empty:
		raise KeyError('normal id %r not found in %s' % (normalid, sampledir))
	sampleid = matches.iloc[0]
	HL_summary_v1_normal(inputpath, laneid, normalid, sampleid)

if __name__=='__main__':
	# Command-line entry point: run the chemotherapy summary for one normal
	# sample. Both options are required; without them argparse would hand
	# None to chemothereapy_runmain, which fails on the first string operation.
	parser = argparse.ArgumentParser(description='filter the chemothereapy_runmain')
	parser.add_argument('-i', '--inputpath', type=str, required=True, help='the path of lane')
	parser.add_argument('-s', '--normalid', type=str, required=True, help='the normal name of sample')
	args = parser.parse_args()
	Inputpath = args.inputpath
	Normalid = args.normalid
	chemothereapy_runmain(Inputpath,Normalid)


####for all
def chemothera_summary(inputpath,laneid):
	"""Run the chemotherapy SNP summary for every sample of a lane.

	Reads ``<inputpath>/<laneid>_sample_infor_label.txt`` (tab-separated,
	columns ``samplename`` and ``normal``), calls ``HL_summary_v1_normal`` for
	each row and writes the concatenated per-sample tables to
	``<inputpath>/datasummary/<laneid>_table7_chemotherapy_datasummary.txt``
	(tab-separated, gbk-encoded). ``datasummary`` is created if missing.

	Args:
		inputpath: lane directory holding the sample sheet and pipeline outputs.
		laneid: lane identifier used in the sample-sheet and output file names.
	"""
	datasummarydir = os.path.join(inputpath, 'datasummary')
	os.makedirs(datasummarydir, exist_ok=True)
	sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
	samplelist = pd.read_table(sampledir, sep='\t', header=0)

	per_sample = []
	# Iterate the columns directly so the loop does not depend on the frame
	# carrying a default 0..n-1 index.
	for sampleid, normalid in zip(samplelist['samplename'], samplelist['normal']):
		print(normalid)
		chemo_re = HL_summary_v1_normal(inputpath, laneid, normalid, sampleid)
		# Skip samples for which the per-sample step hands back no table.
		if chemo_re is not None:
			per_sample.append(chemo_re)

	# DataFrame.append was removed in pandas 2.0; concat once instead.
	# pd.concat rejects an empty list, so fall back to an empty frame.
	if per_sample:
		chemosummary = pd.concat(per_sample, ignore_index=True)
	else:
		chemosummary = pd.DataFrame()
	outputname = os.path.join(datasummarydir, laneid + '_table7_chemotherapy_datasummary.txt')
	chemosummary.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')