import argparse
import os
import sys

import pandas as pd


def vcfsplit(inputdata):
    """Split a headerless, tab-separated VCF into one file per chromosome.

    Rows are grouped by the value in the first column. Each group is written
    next to the input as ``<input without its last extension>_<value>.vcf``,
    e.g. ``sample.nohead.vcf`` -> ``sample.nohead_chr1.vcf``.

    Args:
        inputdata: Path to a tab-separated file with no header row.

    Returns:
        The output paths that were written, in order of first appearance of
        each first-column value in the input.
    """
    # Read every field as text and disable NA detection so values such as
    # "NA" or "0.10" are written back exactly as they appeared in the input
    # instead of being turned into empty fields or re-formatted floats.
    vcfdata = pd.read_table(
        inputdata,
        sep='\t',
        header=None,
        dtype=str,
        na_filter=False,
        low_memory=False,
    )

    # Strip only the final extension of the file name; splitting the whole
    # path on '.' breaks when a directory name contains a dot or when the
    # file has no extension at all.
    prefix = os.path.splitext(inputdata)[0]

    written = []
    # sort=False keeps the groups in the order they first occur in the file.
    for chrid, chr_vcf in vcfdata.groupby(0, sort=False):
        outputname = prefix + '_' + str(chrid) + '.vcf'
        chr_vcf.to_csv(outputname, sep='\t', index=False, header=False)
        written.append(outputname)
    return written


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='split a headerless VCF into one file per chromosome')
    # The input path is mandatory: without it there is nothing to split.
    parser.add_argument('-i', '--inputdata', type=str, required=True,
                        help='the inputdata of vcf')
    # Accepted for command-line compatibility; not used by the splitting.
    parser.add_argument('-s', '--sample', type=str, help='the sample name')
    args = parser.parse_args()
    vcfsplit(args.inputdata)