"""Summarise chemotherapy-related SNP genotypes ("HL" results) per sample.

For one normal-control sample the module reads ``<normalid>.HL.xls`` (a
tab-separated table) from ``<inputpath>/5HL_gatk_unpair``, rewrites the
``Result`` of locus rs3064744 as a TA-repeat genotype, keeps only the rsIDs
listed in the drug SNP reference table and writes the outcome to
``<inputpath>/resultfile/<sampleid>/<laneid>-<sampleid>.chemical.xlsx``.

Revision 2.  Open question carried over from the original author: could the
rs3064744 genotype simply be reported as ``TA(6)``, or should both alleles
always be reported, e.g. ``TA(6)/TA(6)``, ``TA(6)/TA(7)``, ``TA(7)/TA(7)``?
The current code reports both alleles.  Only the normal control is analysed.
"""
import argparse
import os
import os.path
import re
from typing import Optional

import pandas as pd

try:
    # Not referenced anywhere in this module; kept from the original import
    # list, but made optional so that a missing xlrd does not stop the script.
    import xlrd  # noqa: F401
except ImportError:
    xlrd = None

# Reference table listing the drug-related SNPs to report (column ``rs_id``).
DRUG_SNP_LIST_PATH = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_snp_list_loc.txt'

RS3064744 = 'rs3064744'

# Allele string as found in the ``allele`` column -> reported repeat label.
_TA_REPEAT_LABEL = {'CATAT': 'TA(6)', 'CATATAT': 'TA(7)'}

# Two alleles separated by "/".  The letter guards on both sides make sure a
# longer allele is never mistaken for a shorter one: a plain substring test
# for "CATAT/CATAT" also matches "CATAT/CATATAT".
_RS3064744_PATTERN = re.compile(
    r'(?<![A-Za-z])(CATAT(?:AT)?)/(CATAT(?:AT)?)(?![A-Za-z])'
)


def rs3064744_genotype(allele) -> Optional[str]:
    """Translate an rs3064744 ``allele`` value into a TA-repeat genotype.

    ``CATAT`` is reported as ``TA(6)`` and ``CATATAT`` as ``TA(7)``; the
    allele order of the input is preserved (``CATATAT/CATAT`` becomes
    ``TA(7)/TA(6)``).

    Args:
        allele: Cell value from the ``allele`` column.

    Returns:
        The genotype string, or ``None`` when ``allele`` is not a string or
        does not contain a recognised allele pair.
    """
    if not isinstance(allele, str):
        return None
    found = _RS3064744_PATTERN.search(allele)
    if found is None:
        return None
    return '/'.join(_TA_REPEAT_LABEL[part] for part in found.groups())


def _write_chemical_sheet(table, sample_dir, laneid, sampleid):
    """Write ``table`` to ``<laneid>-<sampleid>.chemical.xlsx`` in ``sample_dir``."""
    outputfile = os.path.join(sample_dir, laneid + '-' + sampleid + '.chemical.xlsx')
    # Passing the path lets pandas open, save and close the workbook itself;
    # ExcelWriter.save() no longer exists in pandas 2.x.
    table.to_excel(outputfile, sheet_name='chemical', index=False)


def HL_summary_v1_normal(inputpath, laneid, normalid, sampleid,
                         drug_snp_path=DRUG_SNP_LIST_PATH):
    """Build and write the chemotherapy SNP table for one normal sample.

    Args:
        inputpath: Lane directory; must contain ``5HL_gatk_unpair``.
        laneid: Lane identifier, used as prefix of the output file names.
        normalid: Name of the normal sample; ``<normalid>.HL.xls`` is read.
        sampleid: Sample name written to the ``sampleid`` column and used for
            the per-sample result directory.
        drug_snp_path: Tab-separated reference table with an ``rs_id``
            column.  Defaults to the pipeline reference file.

    Returns:
        The table that was written:
        * the screened genotypes (``sampleid``, ``#Gene``, ``rsID``,
          ``Result``) when the HL table has rows;
        * a one-row ``sampleid``/``label`` frame when the HL table is empty
          (also written to the xlsx file) or when the HL file is missing or
          zero-sized (written to ``tempfile/bugfile`` as a log instead).
    """
    druginfor = pd.read_table(drug_snp_path, sep='\t', header=0)
    druginfor = druginfor.rename(columns={'rs_id': 'rsID'})

    hl_path = os.path.join(inputpath, '5HL_gatk_unpair', normalid + '.HL.xls')

    if not (os.path.exists(hl_path) and os.path.getsize(hl_path) != 0):
        print(normalid+' is null,please check the data')
        no_file = pd.DataFrame({'sampleid': [sampleid], 'label': ['No file']})
        bugfile_dir = os.path.join(inputpath, 'tempfile', 'bugfile')
        os.makedirs(bugfile_dir, exist_ok=True)
        outputfile2 = os.path.join(
            bugfile_dir, laneid + '-' + sampleid + '.chemotherapy.nofile.log.txt')
        no_file.to_csv(outputfile2, index=False, header=True, encoding='gbk', sep='\t')
        return no_file

    sample_dir = os.path.join(inputpath, 'resultfile', sampleid)
    os.makedirs(sample_dir, exist_ok=True)

    HLdata = pd.read_table(hl_path, sep='\t')
    if len(HLdata) == 0:
        print(sampleid + ' is null in HL')
        no_result = pd.DataFrame(
            {'sampleid': [sampleid], 'label': ['NO result for chemotherapy']})
        _write_chemical_sheet(no_result, sample_dir, laneid, sampleid)
        return no_result

    HLdata.insert(0, 'sampleid', sampleid)

    # Locus rs3064744: the Result is derived from the allele column
    # (CATAT -> TA(6), CATATAT -> TA(7)).  Rows whose allele is not
    # recognised keep whatever Result they already had.
    is_target = HLdata['rsID'] == RS3064744
    genotypes = HLdata.loc[is_target, 'allele'].map(rs3064744_genotype).dropna()
    if not genotypes.empty:
        HLdata.loc[genotypes.index, 'Result'] = genotypes

    reported = HLdata[['sampleid', '#Gene', 'rsID', 'Result']]
    HLdata_screen = pd.merge(reported, druginfor[['rsID']], on=['rsID'], how='inner')
    _write_chemical_sheet(HLdata_screen, sample_dir, laneid, sampleid)
    return HLdata_screen


def chemothereapy_runmain(inputpath, normalid):
    """Run the chemotherapy summary for a single normal sample of a lane.

    The lane id is the part of the lane directory name after its last ``-``.
    The sample name is looked up in ``<laneid>_sample_infor_label.txt``
    (columns ``normal`` and ``samplename``) inside ``inputpath``.

    Args:
        inputpath: Lane directory.
        normalid: Value of the ``normal`` column identifying the sample.

    Raises:
        KeyError: If ``normalid`` is not listed in the sample sheet.
    """
    # normpath drops a trailing "/" so that "run-L01/" still yields "L01".
    laneid = os.path.basename(os.path.normpath(inputpath)).split('-')[-1]

    datasummarydir = os.path.join(inputpath, 'datasummary')
    os.makedirs(datasummarydir, exist_ok=True)

    sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
    samplelist = pd.read_table(sampledir, sep='\t', header=0)
    sampledata = samplelist[samplelist['normal'] == normalid].reset_index(drop=True)
    if sampledata.empty:
        raise KeyError('normal sample ' + repr(normalid) + ' not found in ' + sampledir)
    sampleid = sampledata.loc[0, 'samplename']

    HL_summary_v1_normal(inputpath, laneid, normalid, sampleid)


# ---- all samples of a lane ----
def chemothera_summary(inputpath, laneid):
    """Run the chemotherapy summary for every sample of a lane.

    Each row of ``<laneid>_sample_infor_label.txt`` is processed with
    ``HL_summary_v1_normal`` and the returned tables are stacked into
    ``datasummary/<laneid>_table7_chemotherapy_datasummary.txt``.

    Args:
        inputpath: Lane directory.
        laneid: Lane identifier used in the input and output file names.
    """
    datasummarydir = os.path.join(inputpath, 'datasummary')
    os.makedirs(datasummarydir, exist_ok=True)

    sampledir = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
    samplelist = pd.read_table(sampledir, sep='\t', header=0)

    per_sample = []
    for sampleid, normalid in zip(samplelist['samplename'], samplelist['normal']):
        print(normalid)
        per_sample.append(HL_summary_v1_normal(inputpath, laneid, normalid, sampleid))

    # DataFrame.append was removed in pandas 2.0; concat needs >= 1 frame.
    chemosummary = pd.concat(per_sample) if per_sample else pd.DataFrame()

    outputname = datasummarydir + '/' + laneid + '_table7_chemotherapy_datasummary.txt'
    chemosummary.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='filter the chemothereapy_runmain')
    parser.add_argument('-i', '--inputpath', type=str, help='the path of lane')
    parser.add_argument('-s', '--normalid', type=str, help='the normal name of sample')
    args = parser.parse_args()
    chemothereapy_runmain(args.inputpath, args.normalid)