# edit the file
# 1. clean data
# 2. align
# 3. SV calling
import sys
import collections
import math
import os
import os.path
import re
import argparse

import numpy as np
import pandas as pd
from pandas.core.frame import DataFrame

# Sub-directories created under a lane directory, one per analysis step.
_ANALYSIS_SUBDIRS = (
    '1SV_varscan_pair',           # SV calling (varscan)
    '1SV_vardict_pair',           # SV calling (vardict)
    '2CNV_cnvkit_pair',           # CNV
    '3MSI_msisensor2_pair',       # MSI
    '4Germline_unpair',           # germline result
    '5HL_gatk_unpair',            # HL result
    '6Fusion_manta_pair',         # fusion, method 1
    '6Fusion_genefusion_unpair',  # fusion, method 2
    '7HLA-HD_unpair',             # HLA
    '8Fastqc',                    # QC
    '9Ontarget',                  # on-target
    '10Coverage',                 # coverage
    'datasummary',                # data summary
)

# Maps the last character of a sample id to the output column it fills.
_LABEL_COLUMNS = {'T': 'tumor', 'N': 'normal'}


def analis_dir(inputpath):
    """Create the per-analysis result directories under *inputpath*.

    Each name in ``_ANALYSIS_SUBDIRS`` is created directly below
    *inputpath* unless something already exists at that path, so calling
    this function repeatedly is harmless.

    Args:
        inputpath: Existing lane directory in which to create the folders.
    """
    for subdir in _ANALYSIS_SUBDIRS:
        target = os.path.join(inputpath, subdir)
        if not os.path.exists(target):
            os.mkdir(target)


# First pair up the samples: find the tumor and the normal library that
# belong to the same sample.
def sampleinfor(inputpath, laneid):
    """Pair tumor/normal sample ids and write the pairing table.

    Reads the first (alphabetically) file in *inputpath* whose name ends
    with ``sample.txt``; it is parsed as a header-less, tab-separated table
    whose first column holds the sample ids.  The sample name is the id
    without its last two characters, and the last character selects the
    column: ``T`` -> ``tumor``, ``N`` -> ``normal``.  Ids with any other
    final character still contribute a sample name row but fill no column.

    The result is written to ``<inputpath>/<laneid>_sample_infor_label.txt``
    (tab-separated, with header, GBK-encoded), one row per sample name in
    order of first appearance.

    Args:
        inputpath: Directory containing the ``*sample.txt`` file.
        laneid: Lane identifier used as prefix of the output file name.

    Raises:
        FileNotFoundError: If no ``*sample.txt`` file exists in *inputpath*.
    """
    # Sorted so the chosen file does not depend on os.listdir() ordering.
    candidates = sorted(
        f for f in os.listdir(inputpath) if f.endswith('sample.txt')
    )
    if not candidates:
        raise FileNotFoundError(
            "no file ending with 'sample.txt' found in " + repr(inputpath)
        )

    # dtype=str keeps purely numeric ids sliceable as text.
    samplelist = pd.read_table(
        os.path.join(inputpath, candidates[0]),
        names=['sampleid'],
        sep='\t',
        dtype=str,
    )
    sample_ids = samplelist['sampleid'].dropna().tolist()

    # Group ids by their exact sample-name prefix.  A substring search would
    # wrongly attach e.g. "S11_T" to sample "S1".
    grouped = collections.OrderedDict()
    for sampleid in sample_ids:
        grouped.setdefault(sampleid[:-2], []).append(sampleid)

    # Columns are appended in the order they are first filled so the output
    # layout matches what incremental DataFrame enlargement would produce.
    columns = ['samplename']
    rows = []
    for samplename, ids in grouped.items():
        row = {'samplename': samplename}
        for sampleid in ids:
            column = _LABEL_COLUMNS.get(sampleid[-1])
            if column is None:
                continue
            row[column] = sampleid  # a later duplicate label wins
            if column not in columns:
                columns.append(column)
        rows.append(row)

    sample_pair = pd.DataFrame(rows, columns=columns)
    outputname = os.path.join(inputpath, laneid + '_' + 'sample_infor_label.txt')
    sample_pair.to_csv(
        outputname, index=False, header=True, encoding='gbk', sep='\t'
    )


def main():
    """Parse the command line and write the sample pairing table."""
    parser = argparse.ArgumentParser(description='sample infor')
    parser.add_argument('-i', '--inputpath', type=str, required=True,
                        help='the path of lane')
    parser.add_argument('-l', '--laneid', type=str, required=True,
                        help='laneid')
    args = parser.parse_args()
    sampleinfor(args.inputpath, args.laneid)


if __name__ == '__main__':
    main()