123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121 |
- import pandas as pd
- import os,os.path
- import xlrd
- import argparse
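# Revision 2
# Open question: can the rs3064744 genotype simply be reported as TA(6)? Also, should this
# locus report both alleles, e.g. TA(6)/TA(6), TA(6)/TA(7), TA(7)/TA(7)?
# Only the normal control sample is analysed.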
# Default location of the drug-SNP reference table (tab-separated, has an
# `rs_id` column that is renamed to `rsID` before merging).
_DRUG_SNP_LIST = '/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/drug_snp_list_loc.txt'

# rs3064744 allele pair -> TA-repeat genotype label written to `Result`.
_RS3064744_GENOTYPES = (
    ('CATAT/CATAT', 'TA(6)/TA(6)'),
    ('CATATAT/CATATAT', 'TA(7)/TA(7)'),
    ('CATAT/CATATAT', 'TA(6)/TA(7)'),
    ('CATATAT/CATAT', 'TA(7)/TA(6)'),
)


def _placeholder_frame(sampleid, label):
    """Build the one-row frame reported when a sample has no usable HL result."""
    return pd.DataFrame({'sampleid': [sampleid], 'label': [label]})


def _label_rs3064744(hl_data):
    """Rewrite `Result` of the rs3064744 rows as a TA-repeat genotype, in place."""
    is_target = hl_data['rsID'] == 'rs3064744'
    alleles = hl_data.loc[is_target, 'allele'].astype(str)
    if alleles.empty:
        return
    for allele_pair, genotype in _RS3064744_GENOTYPES:
        # The look-arounds stop a shorter allele from matching inside a longer
        # one: a plain substring test for "CATAT/CATAT" also hits
        # "CATAT/CATATAT" and would mislabel a TA(6)/TA(7) heterozygote.
        pattern = '(?<![ACGT])' + allele_pair + '(?![ACGT])'
        matched = alleles.str.contains(pattern, regex=True).astype(bool)
        if matched.any():
            hl_data.loc[alleles.index[matched], 'Result'] = genotype


def HL_summary_v1_normal(inputpath, laneid, normalid, sampleid, drug_list_path=None):
    """Write the chemotherapy SNP report for one sample from its normal-control HL table.

    Reads ``<inputpath>/5HL_gatk_unpair/<normalid>.HL.xls`` (tab-separated).

    * File missing or zero bytes: a one-row ``No file`` placeholder is written to
      ``<inputpath>/tempfile/bugfile/<laneid>-<sampleid>.chemotherapy.nofile.log.txt``.
    * File present but without data rows: a one-row ``NO result for chemotherapy``
      placeholder is written to the sample's ``.chemical.xlsx`` report.
    * Otherwise: the rs3064744 ``Result`` is rewritten as a TA-repeat genotype,
      the table is restricted to rsIDs listed in the drug-SNP reference table,
      and the result is written to
      ``<inputpath>/resultfile/<sampleid>/<laneid>-<sampleid>.chemical.xlsx``.

    Args:
        inputpath: Lane directory that holds the pipeline output.
        laneid: Lane identifier used as the output file-name prefix.
        normalid: Name of the normal-control sample (HL file stem).
        sampleid: Sample name used for the result directory and ``sampleid`` column.
        drug_list_path: Optional path of the drug-SNP reference table; defaults
            to the pipeline's standard location. Only read when HL data exist.

    Returns:
        The DataFrame that was written (screened table or placeholder).
    """
    hl_path = os.path.join(inputpath, '5HL_gatk_unpair', normalid + '.HL.xls')

    if not (os.path.exists(hl_path) and os.path.getsize(hl_path) != 0):
        print(normalid+' is null,please check the data')
        no_file = _placeholder_frame(sampleid, 'No file')
        bugfile_dir = os.path.join(inputpath, 'tempfile', 'bugfile')
        os.makedirs(bugfile_dir, exist_ok=True)
        log_path = os.path.join(bugfile_dir, laneid + '-' + sampleid + '.chemotherapy.nofile.log.txt')
        no_file.to_csv(log_path, index=False, header=True, encoding='gbk', sep='\t')
        return no_file

    sample_dir = os.path.join(inputpath, 'resultfile', sampleid)
    os.makedirs(sample_dir, exist_ok=True)
    report_path = os.path.join(sample_dir, laneid + '-' + sampleid + '.chemical.xlsx')

    hl_data = pd.read_table(hl_path, sep='\t')
    if len(hl_data) == 0:
        print(sampleid + ' is null in HL')
        screened = _placeholder_frame(sampleid, 'NO result for chemotherapy')
    else:
        hl_data.insert(0, 'sampleid', sampleid)
        _label_rs3064744(hl_data)
        # The reference table is only needed for this merge, so load it lazily.
        drug_info = pd.read_table(drug_list_path or _DRUG_SNP_LIST, sep='\t', header=0)
        drug_info = drug_info.rename(columns={'rs_id': 'rsID'})
        screened = pd.merge(
            hl_data[['sampleid', '#Gene', 'rsID', 'Result']],
            drug_info[['rsID']],
            on=['rsID'],
            how='inner',
        )

    # DataFrame.to_excel opens and closes the workbook itself; the removed
    # ExcelWriter.save() call is no longer needed.
    screened.to_excel(report_path, sheet_name='chemical', index=False)
    return screened
def chemothereapy_runmain(inputpath,normalid):
    """Run the chemotherapy SNP summary for the sample paired with one normal control.

    The lane id is the text after the last ``-`` in the final component of
    ``inputpath``. The sample name is looked up in
    ``<inputpath>/<laneid>_sample_infor_label.txt`` (tab-separated, columns
    ``samplename`` and ``normal``); the first row whose ``normal`` equals
    ``normalid`` is used.

    Args:
        inputpath: Lane directory; a trailing path separator is tolerated.
        normalid: Name of the normal-control sample to process.

    Raises:
        KeyError: If ``normalid`` is not listed in the sample table.
    """
    # normpath drops a trailing separator, which would otherwise make the
    # last path component (and therefore the lane id) an empty string.
    lane_dirname = os.path.basename(os.path.normpath(inputpath))
    laneid = lane_dirname.split('-')[-1]

    os.makedirs(os.path.join(inputpath, 'datasummary'), exist_ok=True)

    sample_table_path = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
    sample_table = pd.read_table(sample_table_path, sep='\t', header=0)
    matching_names = sample_table.loc[sample_table['normal'] == normalid, 'samplename']
    if matching_names.empty:
        raise KeyError('normal id ' + str(normalid) + ' not found in ' + sample_table_path)

    sampleid = matching_names.iloc[0]
    HL_summary_v1_normal(inputpath, laneid, normalid, sampleid)
if __name__ == '__main__':
    # Command-line entry point: process a single normal control of one lane.
    parser = argparse.ArgumentParser(description='filter the chemothereapy_runmain')
    # Both options are mandatory; without them the run would fail later with
    # an AttributeError on None instead of a usage message.
    parser.add_argument('-i', '--inputpath', type=str, required=True, help='the path of lane')
    parser.add_argument('-s', '--normalid', type=str, required=True, help='the normal name of sample')
    args = parser.parse_args()
    chemothereapy_runmain(args.inputpath, args.normalid)
# ---- Batch mode: summarise every sample listed for the lane ----
def chemothera_summary(inputpath,laneid):
    """Run the chemotherapy SNP summary for every sample of a lane and combine the results.

    Reads ``<inputpath>/<laneid>_sample_infor_label.txt`` (tab-separated,
    columns ``samplename`` and ``normal``), calls ``HL_summary_v1_normal`` for
    each row, and writes the concatenated per-sample tables to
    ``<inputpath>/datasummary/<laneid>_table7_chemotherapy_datasummary.txt``
    (tab-separated, GBK-encoded).

    Args:
        inputpath: Lane directory that holds the pipeline output.
        laneid: Lane identifier used in the input and output file names.
    """
    datasummarydir = os.path.join(inputpath, 'datasummary')
    os.makedirs(datasummarydir, exist_ok=True)

    sample_table_path = os.path.join(inputpath, laneid + '_sample_infor_label.txt')
    samplelist = pd.read_table(sample_table_path, sep='\t', header=0)

    per_sample_tables = []
    for sampleid, normalid in zip(samplelist['samplename'], samplelist['normal']):
        print(normalid)
        chemo_re = HL_summary_v1_normal(inputpath, laneid, normalid, sampleid)
        # Skip samples for which the per-sample step produced no table.
        if chemo_re is not None:
            per_sample_tables.append(chemo_re)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    # pd.concat rejects an empty list, hence the explicit empty frame.
    if per_sample_tables:
        chemosummary = pd.concat(per_sample_tables, ignore_index=True, sort=False)
    else:
        chemosummary = pd.DataFrame()

    outputname = os.path.join(datasummarydir, laneid + '_table7_chemotherapy_datasummary.txt')
    chemosummary.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')
|