vcf_combine.pbs 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193
  1. #!/bin/sh
  2. #PBS -N noUMI_varscan_pair_combined
  3. #PBS -j oe
  4. #PBS -l ncpus=12
  5. #PBS -l nodes=1
  6. #PBS -l mem=20G
  7. source /home/liuxiangqiong/miniconda3/bin/activate base
  8. #inputpath=/cgdata/bioproject/pancancer602gene/CGB0402_1
  9. #sample=Lib52939T
  10. #tumor=Lib52939TTT
  11. #bam_dir=/cgdata/pancancer/analyse/CGB0402_1/bamfile
  12. #设置相邻位点合并长度
  13. #length=15
  14. #min_af=0.005 for blood,0.01 for tissue
  15. outputdir=${inputpath}/1SV_varscan_pair
  16. hg19=/cgdata/Database/GATK/b37/human_g1k_v37_decoy.fasta
  17. target=/cgdata/liuxiangqiong/work62pancancer/Client/v0/script/refdata/NanOnco_Plus_Panel_v2.0_Covered_b37_cg.parY2X.sort.bed
  18. target1=/cgdata/liuxiangqiong/work62pancancer/pipeline/v0/refdata/NanOnco_Plus_Panel_v2.0_Covered_b37_cg.parY2X.sort
  19. ##将VCF按照染色体进行拆分
  20. python3 /cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/vcfsplit_v0_20220929_finish.py -i ${outputdir}/${sample}.merge.Somatic.hc.nohead.vcf
  21. fun() {
  22. hg19=/cgdata/Database/GATK/b37/human_g1k_v37_decoy.fasta
  23. python3 /cgdata/liuxiangqiong/work62pancancer/Client/v0/script/complex_variant_new.py -i ${file} -b ${bam_dir}/${tumor}_clean.bam -f ${hg19} -mode 2 -n ${length} -p 0.5 -o ${file}_chr.vcf &
  24. wait
  25. echo "fun is end."
  26. }
  27. for file in ${outputdir}/${sample}.merge.Somatic.hc.nohead_*.vcf;
  28. do
  29. fun ${tumor} ${file} ${bam_dir} &
  30. done
  31. wait
  32. echo "the vcfmergeing is finished"
  33. grep "^#" ${outputdir}/${sample}.snp.Somatic.hc.vcf >${outputdir}/${sample}.merge.Somatic.hc.combind1.vcf
  34. cat ${outputdir}/${sample}.merge.Somatic.hc.nohead_*.vcf_chr.vcf>>${outputdir}/${sample}.merge.Somatic.hc.combind1.vcf
  35. rm -rf ${outputdir}/${sample}.merge.Somatic.hc.nohead_*.vcf
  36. rm -rf ${outputdir}/${sample}.merge.Somatic.hc.nohead_*.vcf_chr.vcf
  37. #########################4.filter the false positive########################################################################
  38. grep -v "#" ${outputdir}/${sample}.merge.Somatic.hc.combind1.vcf | awk '{a=$2+10;print $1"\t"$2"\t"a}' > ${outputdir}/${sample}.merge.Somatic.hc.combind1.bed
  39. #将bed分成多个小文件
  40. split -l 3000 ${outputdir}/${sample}.merge.Somatic.hc.combind1.bed -d -a 2 ${outputdir}/${sample}.merge.Somatic.hc.combind1.bed_
  41. conda deactivate
  42. source /home/liuxiangqiong/miniconda3/bin/activate python2
  43. #bam-readcount -q 20 -b 20 -w 100 -f ${hg19} -l ${outputdir}/${sample}.merge.Somatic.hc.combind1.bed ${bam_dir}/${tumor}_clean.bam >${outputdir}/${sample}.merge.Somatic.hc.combind1.readcount
  44. fun() {
  45. hg19=/cgdata/Database/GATK/b37/human_g1k_v37_decoy.fasta
  46. bam-readcount -q 20 -b 20 -w 100 -f ${hg19} -l ${file} ${bam_dir}/${tumor}_clean.bam >${file}.readcount &
  47. wait
  48. echo "fun is end."
  49. }
  50. for file in ${outputdir}/${sample}.merge.Somatic.hc.combind1.bed_*;
  51. do
  52. fun ${inputpath} ${tumor} ${file} ${bam_dir} &
  53. done
  54. wait
  55. echo "the bam-readcount is finished"
  56. cat ${outputdir}/${sample}.merge.Somatic.hc.combind1.bed_*readcount>${outputdir}/${sample}.merge.Somatic.hc.combind1.readcount
  57. rm -rf ${outputdir}/${sample}.merge.Somatic.hc.combind1.bed_*readcount
  58. rm -rf ${outputdir}/${sample}.merge.Somatic.hc.combind1.bed_*
  59. conda deactivate
  60. source /home/liuxiangqiong/miniconda3/bin/activate base
  61. varscan fpfilter \
  62. ${outputdir}/${sample}.merge.Somatic.hc.combind1.vcf \
  63. ${outputdir}/${sample}.merge.Somatic.hc.combind1.readcount \
  64. --min-var-count 5 \
  65. --min-var-basequal 20 \
  66. --min-ref-basequal 20 \
  67. --min-var-freq ${min_af} \
  68. --output-file ${outputdir}/${sample}.merge.Somatic.hc.combind1.pass.vcf
  69. ##########################5.注释########################################################################
  70. #5.1对没有过滤的进行注释对高置信的过滤
  71. #perl /cgdata/CGTools/soft/src/annovar_20200608/table_annovar.pl ${outputdir}/${sample}.merge.Somatic.hc.head.combind1.vcf /cgdata/soft/src/annovar.archive/DB/ -buildver hg19 -out ${outputdir}/${sample}.merge.Somatic.hc.head.combind1.anno -remove -protocol refGene,popfreq_max_20150413,gnomad_genome,avsnp150,snp138NonFlagged,dbnsfp33a,cosmic94,clinvar_20210501,intervar_20170202,icgc28 -operation g,f,f,f,f,f,f,f,f,f -nastring . -vcfinput --thread 40
  72. #perl /cgdata/CGTools/soft/src/annovar_20200608/table_annovar.pl ${outputdir}/${sample}.snp.Somatic.hc.vcf /cgdata/soft/src/annovar.archive/DB/ -buildver hg19 -out ${outputdir}/${sample}.merge.Somatic.hc.anno -remove -protocol refGene,popfreq_max_20150413,gnomad_genome,avsnp150,snp138NonFlagged,dbnsfp33a,cosmic94,clinvar_20210501,intervar_20170202,icgc28 -operation g,f,f,f,f,f,f,f,f,f -nastring . -vcfinput --thread 40
  73. #5.2先拆分再标准化对齐
  74. #vt decompose ${outputdir}/${sample}.merge.Somatic.hc.combind1.pass.vcf |vt normalize - -r ${hg19} -o ${outputdir}/${sample}.merge.Somatic.hc.combind1.pass.norm.vcf
  75. conda deactivate
  76. #5.3对经过标准化的进行vep注释
  77. source /home/liuxiangqiong/miniconda3/bin/activate ensemble-vep
  78. PERL5LIB=/home/liuxiangqiong/miniconda3/envs/ensemble-vep/share/ensembl-vep-105.0-1/Bio/EnsEMBL/Variation/Utils/DB/HTS/Faidx
  79. export PERL5LIB=$PERL5LIB
  80. inputdir0=${outputdir}/${sample}.merge.Somatic.hc.combind1.pass.vcf
  81. outdir1_0=${outputdir}/${sample}.merge.Somatic.hc.combind1.pass.norm.vepanno.vcf
  82. vep --cache --fork 16 \
  83. -i ${inputdir0} \
  84. -o ${outdir1_0} \
  85. --assembly GRCh37 \
  86. --cache_version 104 \
  87. --dir_cache /home/liuxiangqiong/miniconda3/share/ensembl-vep-88.9-0/hg19 \
  88. --refseq \
  89. --canonical \
  90. --offline \
  91. --hgvs \
  92. --fasta /cgdata/Database/GATK/b37/human_g1k_v37_decoy.fasta \
  93. --hgvsg \
  94. --max_af \
  95. --check_existing \
  96. --numbers \
  97. --symbol \
  98. --canonical \
  99. --offline \
  100. -vcf \
  101. --use_given_ref \
  102. --af_gnomad \
  103. --check_existing
  104. #5.4对vep的再进行annovar注释
  105. outdir2_0=${outputdir}/${sample}.merge.Somatic.hc.combind1.pass.norm.vepanno.annovar
  106. perl /cgdata/CGTools/soft/src/annovar_20200608/table_annovar.pl ${outdir1_0} /cgdata/soft/src/annovar.archive/DB/ -buildver hg19 -out ${outdir2_0} -remove -protocol refGene,popfreq_max_20150413,gnomad_genome,avsnp150,snp138NonFlagged,dbnsfp33a,cosmic94,clinvar_20220829,intervar_20170202,icgc28,exac03,esp6500siv2_all -operation g,f,f,f,f,f,f,f,f,f,f,f -nastring . -vcfinput --thread 40
  107. ###############################6.删除中间文件
  108. #rm -rf ${outputdir}/${normal}.pileup
  109. #rm -rf ${outputdir}/${tumor}.pileup
  110. rm -rf ${outputdir}/${sample}.merge.Somatic.hc.readcount
  111. conda deactivate
  112. source /home/liuxiangqiong/miniconda3/bin/activate base
  113. #对varscan的结果进行过滤
  114. python3 /cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/datafile_somatic_filter_varscan_v0_20220929_finish.py -i ${inputpath} -s ${sample}
  115. ##对varscan和vardict的方法结合获得最后的突变
  116. #python3 /cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/datafile_somatic_filter_mergeresult_v0_20220929_finish.py -i ${inputpath} -s ${sample}
  117. #########################################################################原始的结果进行合并并注释#####################
  118. ##对原始的结果进行合并并注释
  119. less ${outputdir}/${sample}.snp.vcf | grep -vE "^#" > ${outputdir}/${sample}.merge.Somatic.vcf
  120. less ${outputdir}/${sample}.indel.vcf | grep -vE "^#" >> ${outputdir}/${sample}.merge.Somatic.vcf
  121. #3.3 将邻近的位点进行合并#3.3 将邻近的位点进行合并
  122. #python3 /cgdata/liuxiangqiong/work62pancancer/Client/v0/script/complex_variant_new.py -i ${outputdir}/${sample}.merge.Somatic.vcf -b ${bam_dir}/${tumor}_clean.bam -f ${hg19} -mode 2 -n 10 -p 0.5 -o ${outputdir}/${sample}.merge.Somatic.combind11.vcf
  123. #grep "^#" ${outputdir}/${sample}.snp.vcf >${outputdir}/${sample}.merge.Somatic.combind1.vcf
  124. #less ${outputdir}/${sample}.merge.Somatic.combind11.vcf >>${outputdir}/${sample}.merge.Somatic.combind1.vcf
  125. #rm -rf ${outputdir}/${sample}.merge.Somatic.combind11.vcf
  126. ##将VCF按照染色体进行拆分
  127. python3 /cgdata/liuxiangqiong/work62pancancer/Client/v0/script/20220705/vcfsplit_v0_20220929_finish.py -i ${outputdir}/${sample}.merge.Somatic.vcf
  128. fun() {
  129. hg19=/cgdata/Database/GATK/b37/human_g1k_v37_decoy.fasta
  130. python3 /cgdata/liuxiangqiong/work62pancancer/Client/v0/script/complex_variant_new.py -i ${file} -b ${bam_dir}/${tumor}_clean.bam -f ${hg19} -mode 2 -n 10 -p 0.5 -o ${file}_chr.vcf &
  131. wait
  132. echo "fun is end."
  133. }
  134. for file in ${outputdir}/${sample}.merge.Somatic_*.vcf;
  135. do
  136. fun ${tumor} ${file} ${bam_dir} &
  137. done
  138. wait
  139. echo "the vcfmergeing is finished"
  140. grep "^#" ${outputdir}/${sample}.snp.vcf >${outputdir}/${sample}.merge.Somatic.combind1.vcf
  141. cat ${outputdir}/${sample}.merge.Somatic_*.vcf_chr.vcf>>${outputdir}/${sample}.merge.Somatic.combind1.vcf
  142. rm -rf ${outputdir}/${sample}.merge.Somatic_*.vcf
  143. rm -rf ${outputdir}/${sample}.merge.Somatic_*.vcf_chr.vcf
  144. #perl /cgdata/CGTools/soft/src/annovar_20200608/table_annovar.pl ${outputdir}/${sample}.merge.Somatic.combind1.vcf /cgdata/soft/src/annovar.archive/DB/ -buildver hg19 -out ${outputdir}/${sample}.merge.Somatic.combind1.anno -remove -protocol refGene,popfreq_max_20150413,gnomad_genome,avsnp150,snp138NonFlagged,dbnsfp33a,cosmic94,clinvar_20210501,intervar_20170202,icgc28 -operation g,f,f,f,f,f,f,f,f,f -nastring . -vcfinput --thread 40
  145. #5.2先拆分再标准化对齐
  146. vt decompose ${outputdir}/${sample}.merge.Somatic.combind1.vcf |vt normalize - -r ${hg19} -o ${outputdir}/${sample}.merge.Somatic.combind1.norm.vcf
  147. conda deactivate
  148. #5.3对经过标准化的进行vep注释
  149. source /home/liuxiangqiong/miniconda3/bin/activate ensemble-vep
  150. PERL5LIB=/home/liuxiangqiong/miniconda3/envs/ensemble-vep/share/ensembl-vep-105.0-1/Bio/EnsEMBL/Variation/Utils/DB/HTS/Faidx
  151. export PERL5LIB=$PERL5LIB
  152. inputdir=${outputdir}/${sample}.merge.Somatic.combind1.norm.vcf
  153. outdir1=${outputdir}/${sample}.merge.Somatic.combind1.norm.vepanno.vcf
  154. vep --cache --fork 16 \
  155. -i ${inputdir} \
  156. -o ${outdir1} \
  157. --assembly GRCh37 \
  158. --cache_version 104 \
  159. --dir_cache /home/liuxiangqiong/miniconda3/share/ensembl-vep-88.9-0/hg19 \
  160. --refseq \
  161. --canonical \
  162. --offline \
  163. --hgvs \
  164. --fasta /cgdata/Database/GATK/b37/human_g1k_v37_decoy.fasta \
  165. --hgvsg \
  166. --max_af \
  167. --check_existing \
  168. --numbers \
  169. --symbol \
  170. --canonical \
  171. --offline \
  172. -vcf \
  173. --use_given_ref \
  174. --af_gnomad \
  175. --check_existing
  176. #5.4对vep的再进行annovar注释
  177. outdir2=${outputdir}/${sample}.merge.Somatic.combind1.norm.vepanno.annovar
  178. perl /cgdata/CGTools/soft/src/annovar_20200608/table_annovar.pl ${outdir1} /cgdata/soft/src/annovar.archive/DB/ -buildver hg19 -out ${outdir2} -remove -protocol refGene,popfreq_max_20150413,gnomad_genome,avsnp150,snp138NonFlagged,dbnsfp33a,cosmic94,clinvar_20220829,intervar_20170202,icgc28,exac03,esp6500siv2_all -operation g,f,f,f,f,f,f,f,f,f,f,f -nastring . -vcfinput --thread 40
  179. conda deactivate