123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111 |
# edit the file
# 1. clean data
# 2. align
# 3. SV calling
- import sys,collections,math,os,os.path,re
- import pandas as pd
- from pandas.core.frame import DataFrame
- import argparse
- import numpy as np
def analis_dir(inputpath):
    """Create the analysis result sub-directories inside *inputpath*.

    One directory is created per entry of ``subdirs`` below, in that order.
    Entries that already exist are left untouched, so the function can be
    called repeatedly on the same path.

    Args:
        inputpath: Existing directory in which the sub-directories are made.

    Raises:
        FileNotFoundError: If *inputpath* itself does not exist (``os.mkdir``
            does not create missing parents).
    """
    subdirs = (
        # SV
        '1SV_varscan_pair',
        '1SV_vardict_pair',
        # CNV
        '2CNV_cnvkit_pair',
        # MSI
        '3MSI_msisensor2_pair',
        # germline result
        '4Germline_unpair',
        # HL result
        '5HL_gatk_unpair',
        # fusion
        '6Fusion_manta_pair',
        '6Fusion_genefusion_unpair',
        # HLA
        '7HLA-HD_unpair',
        # qc
        '8Fastqc',
        # ontarget
        '9Ontarget',
        # coverage
        '10Coverage',
        # datasummary
        'datasummary',
    )
    for name in subdirs:
        try:
            os.mkdir(os.path.join(inputpath, name))
        except FileExistsError:
            # Already present: attempting the mkdir and tolerating this error
            # avoids the check-then-create race of exists() + mkdir().
            pass
##### First pair up the samples: find the tumor and normal libraries that belong to the same sample
def sampleinfor(inputpath, laneid):
    """Pair tumor and normal library ids per sample and write them to a table.

    The sample list is the first file (in sorted name order) in *inputpath*
    whose name ends with ``sample.txt``; it is read as one sample id per
    line. For every id, the sample name is the id without its last two
    characters and the last character is the library label: ``T`` fills the
    ``tumor`` column, ``N`` fills the ``normal`` column, any other label only
    registers the sample name. If the same sample/label occurs more than
    once, the last id wins.

    The result is written tab-separated, gbk-encoded, with the header
    ``samplename, tumor, normal`` to
    ``<inputpath>/<laneid>_sample_infor_label.txt``. Rows follow the order in
    which sample names first appear in the list; missing libraries are left
    empty.

    Args:
        inputpath: Directory containing the ``*sample.txt`` file; the output
            file is written to the same directory.
        laneid: Prefix used for the output file name.

    Raises:
        FileNotFoundError: If no file ending with ``sample.txt`` is found.
    """
    # Sort so the chosen file does not depend on os.listdir()'s arbitrary order.
    candidates = sorted(f for f in os.listdir(inputpath) if f.endswith('sample.txt'))
    if not candidates:
        raise FileNotFoundError(
            "no file ending with 'sample.txt' found in {}".format(inputpath))

    # Read ids as text so numeric-looking ids can still be sliced.
    samplelist = pd.read_table(os.path.join(inputpath, candidates[0]),
                               names=['sampleid'], sep='\t', dtype=str)

    label_columns = {'T': 'tumor', 'N': 'normal'}
    pairs = {}  # sample name -> output row; insertion order = first appearance
    for sampleid in samplelist['sampleid'].dropna():
        samplename = sampleid[:-2]
        row = pairs.setdefault(
            samplename, {'samplename': samplename, 'tumor': None, 'normal': None})
        # Group by the exact sample name: a substring test would let sample
        # "S1" also capture the libraries of "S10".
        column = label_columns.get(sampleid[-1:])
        if column is not None:
            row[column] = sampleid

    sample_pair = pd.DataFrame(list(pairs.values()),
                               columns=['samplename', 'tumor', 'normal'])
    outputname = os.path.join(inputpath, laneid + '_' + 'sample_infor_label.txt')
    sample_pair.to_csv(outputname, index=False, header=True, encoding='gbk', sep='\t')
if __name__ == '__main__':
    # Command-line entry point: build the tumor/normal pairing table for one
    # lane directory. Both options are required so that a missing argument is
    # reported by argparse instead of failing later on a None path.
    parser = argparse.ArgumentParser(description='sample infor')
    parser.add_argument('-i', '--inputpath', type=str, required=True,
                        help='the path of lane')
    parser.add_argument('-l', '--laneid', type=str, required=True,
                        help='laneid')
    args = parser.parse_args()
    sampleinfor(args.inputpath, args.laneid)
|