123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227 |
- #!/usr/bin/perl
- use 5.006;
- use strict;
- use warnings;
- use File::Basename;
- my $usage="\nperl $0 sample.gatk/vardict.hg19_multianno.txt
- Input ANNOVAR table outputs and get Mutations statistic ";
- die $usage unless @ARGV;
- my $in_file=shift;#$sam.".gatk.hg19_multianno.txt";
- my $sam=basename $in_file;
- $sam=~s/\.hg19_multianno.txt//;
- my $out_file=$sam.".stat.xls";
- my (@samples,%data,@column,@sp_column,%indel);
- open my $in_fh,'<',$in_file or die "Cannot open file:$!";
- while(<$in_fh>){
- chomp;
- if(/^Chr\s+/ or $.==1){
- my @title=split /\t/;
- for my $i(0..$#title){
- if($title[$i] eq "Otherinfo13" or $title[$i] eq 'Otherinfo12'){
- push @column,$i;
- }elsif($title[$i]=~/avsnp\d+$/ or $title[$i]=~/dbSNP/){
- push @column,$i;
- }elsif($title[$i] eq 'gnomAD_genome_ALL'){
- push @column,$i;
- }elsif($title[$i] eq 'Func.refGene'){
- push @column,$i;
- }elsif($title[$i] eq 'ExonicFunc.refGene'){
- push @column,$i;
- }elsif($title[$i] eq 'ALTS' or $title[$i] eq 'Alt'){
- push @column,$i;
- }elsif($title[$i] eq 'Ref'){
- push @column,$i;
- }
- }
- next;
- }
- my ($ref,$alt,$func,$efunc,$rs,$tg,$format,$sample)=(split /\t/)[@column]; ## must in order
- next if $sample =~/^\./;
- my @fmts=split /:/,$format;
- my $n_gt;
- for my $i (0..$#fmts){
- $n_gt=$i if $fmts[$i] eq 'GT';
- }
- my $gt=(split /:/,$sample)[$n_gt];
- next if $gt =~/^\./ or $gt eq '0/0';
- if($ref=~/[ATCG]/ and $alt=~/[ATCG]/){
- $data{total} ++ ;
- if(is_hetero($gt) eq 1){
- $data{hom}++;
- }else{
- $data{het}++;
- }
- $data{dbSNP} ++ if $rs ne '.';
- $data{TG} ++ if $tg ne '.' ;
- $data{locale}{$func} ++ if $func=~/\w/;
- $data{result}{$efunc} ++ if $efunc;
- $data{transfm_CDS}{$ref.'->'.$alt}++ if($func=~/^exonic/);# or $locate=~/[^_]UTR/
- # $data{transfm}{$ref.'->'.$alt}++;
- }elsif($ref=~/-/ or $alt=~/-/){
- $indel{total} ++ ;
- $indel{ins} ++ if $ref=~/-/;
- $indel{del} ++ if $alt=~/-/;
- if(is_hetero($gt) eq 1){
- $indel{hom}++
- }else{
- $indel{het}++
- }
- $indel{dbSNP} ++ if $rs ne '.';
- $indel{TG} ++ if $tg ne '.' ;
- $indel{locale}{$func} ++ if $func=~/\w/;
- $indel{result}{$efunc} ++ if $efunc;
- }
- }
- close($in_fh);
- my ($total,$het,$hom,$het2hom,$dbsnp,$tg);
- $data{total}=defined $data{total} ? $data{total} :0 ;
- my $dbsnp_ratio=defined $data{dbSNP} ? 100*$data{dbSNP}/$data{total}: 0;
- my $tg_ratio=defined $data{TG} ? 100*$data{TG}/$data{total} :0;
- $dbsnp_ratio=sprintf("%.2f",$dbsnp_ratio);
- $tg_ratio=sprintf("%.2f",$tg_ratio);
- if(defined $total){
- $total .="\t".$data{total};
- $het .="\t".$data{het};
- $hom .="\t".$data{hom};
- $dbsnp .="\t".$dbsnp_ratio;
- $tg .= "\t".$tg_ratio;
- }else{
- $data{hom}= defined $data{hom}? $data{hom} :0;
- $data{het}= defined $data{het}? $data{het} :0;
- $total="Total_Num_SNP:\t".$data{total};
- $het="Heterozygotes:\t".$data{het};
- $hom="Homozygotes:\t".$data{hom};
- $het2hom=$data{hom}>0 ? "Het/Hom:\t".sprintf("%.2f",$data{het}/$data{hom}) : "Het/Hom:\tNA" ;
- $dbsnp="dbSNP(%):\t".$dbsnp_ratio;
- $tg="gnomAD_genome(%):\t".$tg_ratio;
- }
- my ($total_id,$het_id,$hom_id,$het2hom_id,$dbsnp_id,$tg_id,$ins,$del);
- $indel{total} = defined $indel{total} ? $indel{total} : 0;
- my $dbsnp_ratio_id=defined $indel{dbSNP} ? 100*$indel{dbSNP}/$indel{total} : 0;
- my $tg_ratio_id=defined $indel{TG} ? 100*$indel{TG}/$indel{total} :0 ;
- $dbsnp_ratio_id=sprintf("%.2f",$dbsnp_ratio_id);
- $tg_ratio_id=sprintf("%.2f",$tg_ratio_id);
- if(defined $total_id){
- $total_id .="\t".$indel{total};
- $ins .="\t".$indel{ins};
- $del .="\t".$indel{del};
- $het_id .="\t".$indel{het};
- $hom_id .="\t".$indel{hom};
- $dbsnp_id .="\t".$dbsnp_ratio_id;
- $tg_id .= "\t".$tg_ratio_id;
- }else{
- $total_id="Total_Num_INDEL:\t".$indel{total};
- $indel{ins}=defined $indel{ins} ? $indel{ins}:0;
- $indel{del}=defined $indel{del} ? $indel{del}:0;
- $indel{het}=defined $indel{het} ? $indel{het}:0;
- $indel{hom}=defined $indel{hom} ? $indel{hom}:0;
- $ins="Insertions:\t".$indel{ins};
- $del="Deletions:\t".$indel{del};
- $het_id="Heterozygotes:\t".$indel{het};
- $hom_id="Homozygotes:\t".$indel{hom};
- $het2hom_id=$indel{hom} > 0 ? "Het/Hom:\t".sprintf("%.2f",$indel{het}/$indel{hom}) : "Het/Hom:\tNA";
- $dbsnp_id="dbSNP(%):\t".$dbsnp_ratio_id;
- $tg_id="gnomAD_genome(%):\t".$tg_ratio_id;
- }
- my($id_locations,$id_results, $locations,$results);
- $locations=hash2string($data{locale},0);
- $results=hash2string($data{result},1);
- $id_locations=hash2string($indel{locale},0);
- $id_results=hash2string($indel{result},1);
- for my $transfm(keys %{$data{transfm_CDS}}){
- if ($transfm eq 'A->G' or $transfm eq 'G->A'
- or $transfm eq 'T->C' or $transfm eq 'C->T'){
- $data{'Ti(CDS)'} += $data{transfm_CDS}{$transfm};
- }else{
- $data{'Tv(CDS)'} += $data{transfm_CDS}{$transfm};
- }
- }
- #######Ti/Tv information collection total ###############
- #for my $transfm(keys %{$data{transfm}}){
- # if ($transfm eq 'A->G' or $transfm eq 'G->A'
- # or $transfm eq 'T->C' or $transfm eq 'C->T'){
- # $data{Ti} += $data{transfm}{$transfm};
- # }else{
- # $data{Tv} += $data{transfm}{$transfm};
- # }
- #}
-
- open my $w_fh,'>',$out_file or die "Cannot write file:$!";
- my $title="Sample_Name:\t$sam";
- print $w_fh join("\n",($title,$total,$het,$hom,$het2hom,$dbsnp,$tg)),"\n",$locations,$results;
- ######## print Ti Tv Ti/Tv information ############
- #for my $i('Ti','Tv','Ti(CDS)','Tv_CDS'){
- if (defined $data{'Ti(CDS)'}){
- for my $i('Ti(CDS)','Tv(CDS)'){
- print $w_fh $i.':';
- print $w_fh "\t",$data{$i};
- print $w_fh "\n";
- }
- }
- #print $w_fh 'Ti/Tv:';
- #print $w_fh "\t",sprintf "%.2f",$data{Ti}/$data{Tv};
- #print $w_fh "\n";
- if(defined $data{'Ti(CDS)'}){
- print $w_fh 'Ti/Tv(CDS):';
- print $w_fh "\t",sprintf "%.2f",$data{'Ti(CDS)'}/$data{'Tv(CDS)'};
- print $w_fh "\n";
- }
- print $w_fh "######INDEL######\n";
- print $w_fh join("\n",($total_id,$dbsnp_id,$tg_id,$ins,$del,$het_id,$hom_id,$het2hom_id)),"\n",$id_locations,$id_results;
- sub is_hetero{
- my ($gt)=shift;
- my ($a,$b)=split /\//,$gt;
- if($a==$b){
- return 1;
- }else{
- return 0;
- }
- }
- sub is_alt{
- my ($gt,$alts,$alt)=@_;
- my ($a,$b)=split /\//,$gt;
- my @base=split /,/,$alts;
- @base=(' ',@base);
- my $flag;
- for($a,$b){
- if($base[$_] eq $alt){
- return 1;
- }
- }
- return 0;
- }
- sub hash2string{
- my ($hash,$gap)=@_;
- unless (defined $hash){
- return '';
- }
- my %hash=%{$hash};
- my $string='';
- my $pre=$gap? " ":"";
- for my $result(sort keys %hash){
- next if $result eq '.' or $result=~/ncRNA/ or $result=~/intronic/ or
- $result=~/stream/ or $result=~/intergenic/ or $result=~/UTR/ ;
- if(defined $string){
- $string .= $pre.$result.":";
- }else{
- $string= $pre.$result.":";
- }
- $hash{$result}=defined $hash{$result}?$hash{$result}:0;
- $string .="\t".$hash{$result};
- $string .="\n";
- }
- return $string;
- }
|