vcfsplit_v0_20220929_finish.py 1.0 KB

1234567891011121314151617181920212223242526
import argparse
import os
import sys

import pandas as pd
  4. def vcfsplit(inputdata):
  5. #inputdata = '/cgdata/bioproject/pancancer602gene/CGB0183/1SV_varscan_pair/Lib26611T.merge.Somatic.hc.nohead.vcf'
  6. dir = inputdata.split('.')
  7. vcfdata = pd.read_table(inputdata, sep='\t', header=None, low_memory=False)
  8. chrom = pd.DataFrame(vcfdata[0].drop_duplicates(keep='first'))
  9. chrom.reset_index(drop=True, inplace=True)
  10. for i in range(len(chrom)):
  11. chrid = chrom.loc[i, 0]
  12. chr_vcf = vcfdata[vcfdata[0] == chrid]
  13. chr_vcf.reset_index(drop=True, inplace=True)
  14. outputname = ('.'.join(dir[0:(len(dir) - 1)])) + '_' + str(chrid) + '.vcf'
  15. chr_vcf.to_csv(outputname, sep='\t', index=False, header=None)
  16. if __name__=='__main__':
  17. parser = argparse.ArgumentParser(description='filter the MSI')
  18. parser.add_argument('-i', '--inputdata', type=str, help='the inputdata of vcf')
  19. parser.add_argument('-s', '--sample', type=str, help='the sample name')
  20. args = parser.parse_args()
  21. Inputdata = args.inputdata
  22. vcfsplit(Inputdata)