#!/usr/bin/perl use 5.006; use strict; use warnings; use File::Basename; my $usage="\nperl $0 sample.gatk/vardict.hg19_multianno.txt Input ANNOVAR table outputs and get Mutations statistic "; die $usage unless @ARGV; my $in_file=shift;#$sam.".gatk.hg19_multianno.txt"; my $sam=basename $in_file; $sam=~s/\.hg19_multianno.txt//; my $out_file=$sam.".stat.xls"; my (@samples,%data,@column,@sp_column,%indel); open my $in_fh,'<',$in_file or die "Cannot open file:$!"; while(<$in_fh>){ chomp; if(/^Chr\s+/ or $.==1){ my @title=split /\t/; for my $i(0..$#title){ if($title[$i] eq "Otherinfo13" or $title[$i] eq 'Otherinfo12'){ push @column,$i; }elsif($title[$i]=~/avsnp\d+$/ or $title[$i]=~/dbSNP/){ push @column,$i; }elsif($title[$i] eq 'gnomAD_genome_ALL'){ push @column,$i; }elsif($title[$i] eq 'Func.refGene'){ push @column,$i; }elsif($title[$i] eq 'ExonicFunc.refGene'){ push @column,$i; }elsif($title[$i] eq 'ALTS' or $title[$i] eq 'Alt'){ push @column,$i; }elsif($title[$i] eq 'Ref'){ push @column,$i; } } next; } my ($ref,$alt,$func,$efunc,$rs,$tg,$format,$sample)=(split /\t/)[@column]; ## must in order next if $sample =~/^\./; my @fmts=split /:/,$format; my $n_gt; for my $i (0..$#fmts){ $n_gt=$i if $fmts[$i] eq 'GT'; } my $gt=(split /:/,$sample)[$n_gt]; next if $gt =~/^\./ or $gt eq '0/0'; if($ref=~/[ATCG]/ and $alt=~/[ATCG]/){ $data{total} ++ ; if(is_hetero($gt) eq 1){ $data{hom}++; }else{ $data{het}++; } $data{dbSNP} ++ if $rs ne '.'; $data{TG} ++ if $tg ne '.' ; $data{locale}{$func} ++ if $func=~/\w/; $data{result}{$efunc} ++ if $efunc; $data{transfm_CDS}{$ref.'->'.$alt}++ if($func=~/^exonic/);# or $locate=~/[^_]UTR/ # $data{transfm}{$ref.'->'.$alt}++; }elsif($ref=~/-/ or $alt=~/-/){ $indel{total} ++ ; $indel{ins} ++ if $ref=~/-/; $indel{del} ++ if $alt=~/-/; if(is_hetero($gt) eq 1){ $indel{hom}++ }else{ $indel{het}++ } $indel{dbSNP} ++ if $rs ne '.'; $indel{TG} ++ if $tg ne '.' ; $indel{locale}{$func} ++ if $func=~/\w/; $indel{result}{$efunc} ++ if $efunc; } } close($in_fh); my ($total,$het,$hom,$het2hom,$dbsnp,$tg); $data{total}=defined $data{total} ? $data{total} :0 ; my $dbsnp_ratio=defined $data{dbSNP} ? 100*$data{dbSNP}/$data{total}: 0; my $tg_ratio=defined $data{TG} ? 100*$data{TG}/$data{total} :0; $dbsnp_ratio=sprintf("%.2f",$dbsnp_ratio); $tg_ratio=sprintf("%.2f",$tg_ratio); if(defined $total){ $total .="\t".$data{total}; $het .="\t".$data{het}; $hom .="\t".$data{hom}; $dbsnp .="\t".$dbsnp_ratio; $tg .= "\t".$tg_ratio; }else{ $data{hom}= defined $data{hom}? $data{hom} :0; $data{het}= defined $data{het}? $data{het} :0; $total="Total_Num_SNP:\t".$data{total}; $het="Heterozygotes:\t".$data{het}; $hom="Homozygotes:\t".$data{hom}; $het2hom=$data{hom}>0 ? "Het/Hom:\t".sprintf("%.2f",$data{het}/$data{hom}) : "Het/Hom:\tNA" ; $dbsnp="dbSNP(%):\t".$dbsnp_ratio; $tg="gnomAD_genome(%):\t".$tg_ratio; } my ($total_id,$het_id,$hom_id,$het2hom_id,$dbsnp_id,$tg_id,$ins,$del); $indel{total} = defined $indel{total} ? $indel{total} : 0; my $dbsnp_ratio_id=defined $indel{dbSNP} ? 100*$indel{dbSNP}/$indel{total} : 0; my $tg_ratio_id=defined $indel{TG} ? 100*$indel{TG}/$indel{total} :0 ; $dbsnp_ratio_id=sprintf("%.2f",$dbsnp_ratio_id); $tg_ratio_id=sprintf("%.2f",$tg_ratio_id); if(defined $total_id){ $total_id .="\t".$indel{total}; $ins .="\t".$indel{ins}; $del .="\t".$indel{del}; $het_id .="\t".$indel{het}; $hom_id .="\t".$indel{hom}; $dbsnp_id .="\t".$dbsnp_ratio_id; $tg_id .= "\t".$tg_ratio_id; }else{ $total_id="Total_Num_INDEL:\t".$indel{total}; $indel{ins}=defined $indel{ins} ? $indel{ins}:0; $indel{del}=defined $indel{del} ? $indel{del}:0; $indel{het}=defined $indel{het} ? $indel{het}:0; $indel{hom}=defined $indel{hom} ? $indel{hom}:0; $ins="Insertions:\t".$indel{ins}; $del="Deletions:\t".$indel{del}; $het_id="Heterozygotes:\t".$indel{het}; $hom_id="Homozygotes:\t".$indel{hom}; $het2hom_id=$indel{hom} > 0 ? "Het/Hom:\t".sprintf("%.2f",$indel{het}/$indel{hom}) : "Het/Hom:\tNA"; $dbsnp_id="dbSNP(%):\t".$dbsnp_ratio_id; $tg_id="gnomAD_genome(%):\t".$tg_ratio_id; } my($id_locations,$id_results, $locations,$results); $locations=hash2string($data{locale},0); $results=hash2string($data{result},1); $id_locations=hash2string($indel{locale},0); $id_results=hash2string($indel{result},1); for my $transfm(keys %{$data{transfm_CDS}}){ if ($transfm eq 'A->G' or $transfm eq 'G->A' or $transfm eq 'T->C' or $transfm eq 'C->T'){ $data{'Ti(CDS)'} += $data{transfm_CDS}{$transfm}; }else{ $data{'Tv(CDS)'} += $data{transfm_CDS}{$transfm}; } } #######Ti/Tv information collection total ############### #for my $transfm(keys %{$data{transfm}}){ # if ($transfm eq 'A->G' or $transfm eq 'G->A' # or $transfm eq 'T->C' or $transfm eq 'C->T'){ # $data{Ti} += $data{transfm}{$transfm}; # }else{ # $data{Tv} += $data{transfm}{$transfm}; # } #} open my $w_fh,'>',$out_file or die "Cannot write file:$!"; my $title="Sample_Name:\t$sam"; print $w_fh join("\n",($title,$total,$het,$hom,$het2hom,$dbsnp,$tg)),"\n",$locations,$results; ######## print Ti Tv Ti/Tv information ############ #for my $i('Ti','Tv','Ti(CDS)','Tv_CDS'){ if (defined $data{'Ti(CDS)'}){ for my $i('Ti(CDS)','Tv(CDS)'){ print $w_fh $i.':'; print $w_fh "\t",$data{$i}; print $w_fh "\n"; } } #print $w_fh 'Ti/Tv:'; #print $w_fh "\t",sprintf "%.2f",$data{Ti}/$data{Tv}; #print $w_fh "\n"; if(defined $data{'Ti(CDS)'}){ print $w_fh 'Ti/Tv(CDS):'; print $w_fh "\t",sprintf "%.2f",$data{'Ti(CDS)'}/$data{'Tv(CDS)'}; print $w_fh "\n"; } print $w_fh "######INDEL######\n"; print $w_fh join("\n",($total_id,$dbsnp_id,$tg_id,$ins,$del,$het_id,$hom_id,$het2hom_id)),"\n",$id_locations,$id_results; sub is_hetero{ my ($gt)=shift; my ($a,$b)=split /\//,$gt; if($a==$b){ return 1; }else{ return 0; } } sub is_alt{ my ($gt,$alts,$alt)=@_; my ($a,$b)=split /\//,$gt; my @base=split /,/,$alts; @base=(' ',@base); my $flag; for($a,$b){ if($base[$_] eq $alt){ return 1; } } return 0; } sub hash2string{ my ($hash,$gap)=@_; unless (defined $hash){ return ''; } my %hash=%{$hash}; my $string=''; my $pre=$gap? " ":""; for my $result(sort keys %hash){ next if $result eq '.' or $result=~/ncRNA/ or $result=~/intronic/ or $result=~/stream/ or $result=~/intergenic/ or $result=~/UTR/ ; if(defined $string){ $string .= $pre.$result.":"; }else{ $string= $pre.$result.":"; } $hash{$result}=defined $hash{$result}?$hash{$result}:0; $string .="\t".$hash{$result}; $string .="\n"; } return $string; }