snp_stat.pl 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227
  1. #!/usr/bin/perl
  2. use 5.006;
  3. use strict;
  4. use warnings;
  5. use File::Basename;
  6. my $usage="\nperl $0 sample.gatk/vardict.hg19_multianno.txt
  7. Input ANNOVAR table outputs and get Mutations statistic ";
  8. die $usage unless @ARGV;
  9. my $in_file=shift;#$sam.".gatk.hg19_multianno.txt";
  10. my $sam=basename $in_file;
  11. $sam=~s/\.hg19_multianno.txt//;
  12. my $out_file=$sam.".stat.xls";
  13. my (@samples,%data,@column,@sp_column,%indel);
  14. open my $in_fh,'<',$in_file or die "Cannot open file:$!";
  15. while(<$in_fh>){
  16. chomp;
  17. if(/^Chr\s+/ or $.==1){
  18. my @title=split /\t/;
  19. for my $i(0..$#title){
  20. if($title[$i] eq "Otherinfo13" or $title[$i] eq 'Otherinfo12'){
  21. push @column,$i;
  22. }elsif($title[$i]=~/avsnp\d+$/ or $title[$i]=~/dbSNP/){
  23. push @column,$i;
  24. }elsif($title[$i] eq 'gnomAD_genome_ALL'){
  25. push @column,$i;
  26. }elsif($title[$i] eq 'Func.refGene'){
  27. push @column,$i;
  28. }elsif($title[$i] eq 'ExonicFunc.refGene'){
  29. push @column,$i;
  30. }elsif($title[$i] eq 'ALTS' or $title[$i] eq 'Alt'){
  31. push @column,$i;
  32. }elsif($title[$i] eq 'Ref'){
  33. push @column,$i;
  34. }
  35. }
  36. next;
  37. }
  38. my ($ref,$alt,$func,$efunc,$rs,$tg,$format,$sample)=(split /\t/)[@column]; ## must in order
  39. next if $sample =~/^\./;
  40. my @fmts=split /:/,$format;
  41. my $n_gt;
  42. for my $i (0..$#fmts){
  43. $n_gt=$i if $fmts[$i] eq 'GT';
  44. }
  45. my $gt=(split /:/,$sample)[$n_gt];
  46. next if $gt =~/^\./ or $gt eq '0/0';
  47. if($ref=~/[ATCG]/ and $alt=~/[ATCG]/){
  48. $data{total} ++ ;
  49. if(is_hetero($gt) eq 1){
  50. $data{hom}++;
  51. }else{
  52. $data{het}++;
  53. }
  54. $data{dbSNP} ++ if $rs ne '.';
  55. $data{TG} ++ if $tg ne '.' ;
  56. $data{locale}{$func} ++ if $func=~/\w/;
  57. $data{result}{$efunc} ++ if $efunc;
  58. $data{transfm_CDS}{$ref.'->'.$alt}++ if($func=~/^exonic/);# or $locate=~/[^_]UTR/
  59. # $data{transfm}{$ref.'->'.$alt}++;
  60. }elsif($ref=~/-/ or $alt=~/-/){
  61. $indel{total} ++ ;
  62. $indel{ins} ++ if $ref=~/-/;
  63. $indel{del} ++ if $alt=~/-/;
  64. if(is_hetero($gt) eq 1){
  65. $indel{hom}++
  66. }else{
  67. $indel{het}++
  68. }
  69. $indel{dbSNP} ++ if $rs ne '.';
  70. $indel{TG} ++ if $tg ne '.' ;
  71. $indel{locale}{$func} ++ if $func=~/\w/;
  72. $indel{result}{$efunc} ++ if $efunc;
  73. }
  74. }
  75. close($in_fh);
  76. my ($total,$het,$hom,$het2hom,$dbsnp,$tg);
  77. $data{total}=defined $data{total} ? $data{total} :0 ;
  78. my $dbsnp_ratio=defined $data{dbSNP} ? 100*$data{dbSNP}/$data{total}: 0;
  79. my $tg_ratio=defined $data{TG} ? 100*$data{TG}/$data{total} :0;
  80. $dbsnp_ratio=sprintf("%.2f",$dbsnp_ratio);
  81. $tg_ratio=sprintf("%.2f",$tg_ratio);
  82. if(defined $total){
  83. $total .="\t".$data{total};
  84. $het .="\t".$data{het};
  85. $hom .="\t".$data{hom};
  86. $dbsnp .="\t".$dbsnp_ratio;
  87. $tg .= "\t".$tg_ratio;
  88. }else{
  89. $data{hom}= defined $data{hom}? $data{hom} :0;
  90. $data{het}= defined $data{het}? $data{het} :0;
  91. $total="Total_Num_SNP:\t".$data{total};
  92. $het="Heterozygotes:\t".$data{het};
  93. $hom="Homozygotes:\t".$data{hom};
  94. $het2hom=$data{hom}>0 ? "Het/Hom:\t".sprintf("%.2f",$data{het}/$data{hom}) : "Het/Hom:\tNA" ;
  95. $dbsnp="dbSNP(%):\t".$dbsnp_ratio;
  96. $tg="gnomAD_genome(%):\t".$tg_ratio;
  97. }
  98. my ($total_id,$het_id,$hom_id,$het2hom_id,$dbsnp_id,$tg_id,$ins,$del);
  99. $indel{total} = defined $indel{total} ? $indel{total} : 0;
  100. my $dbsnp_ratio_id=defined $indel{dbSNP} ? 100*$indel{dbSNP}/$indel{total} : 0;
  101. my $tg_ratio_id=defined $indel{TG} ? 100*$indel{TG}/$indel{total} :0 ;
  102. $dbsnp_ratio_id=sprintf("%.2f",$dbsnp_ratio_id);
  103. $tg_ratio_id=sprintf("%.2f",$tg_ratio_id);
  104. if(defined $total_id){
  105. $total_id .="\t".$indel{total};
  106. $ins .="\t".$indel{ins};
  107. $del .="\t".$indel{del};
  108. $het_id .="\t".$indel{het};
  109. $hom_id .="\t".$indel{hom};
  110. $dbsnp_id .="\t".$dbsnp_ratio_id;
  111. $tg_id .= "\t".$tg_ratio_id;
  112. }else{
  113. $total_id="Total_Num_INDEL:\t".$indel{total};
  114. $indel{ins}=defined $indel{ins} ? $indel{ins}:0;
  115. $indel{del}=defined $indel{del} ? $indel{del}:0;
  116. $indel{het}=defined $indel{het} ? $indel{het}:0;
  117. $indel{hom}=defined $indel{hom} ? $indel{hom}:0;
  118. $ins="Insertions:\t".$indel{ins};
  119. $del="Deletions:\t".$indel{del};
  120. $het_id="Heterozygotes:\t".$indel{het};
  121. $hom_id="Homozygotes:\t".$indel{hom};
  122. $het2hom_id=$indel{hom} > 0 ? "Het/Hom:\t".sprintf("%.2f",$indel{het}/$indel{hom}) : "Het/Hom:\tNA";
  123. $dbsnp_id="dbSNP(%):\t".$dbsnp_ratio_id;
  124. $tg_id="gnomAD_genome(%):\t".$tg_ratio_id;
  125. }
  126. my($id_locations,$id_results, $locations,$results);
  127. $locations=hash2string($data{locale},0);
  128. $results=hash2string($data{result},1);
  129. $id_locations=hash2string($indel{locale},0);
  130. $id_results=hash2string($indel{result},1);
  131. for my $transfm(keys %{$data{transfm_CDS}}){
  132. if ($transfm eq 'A->G' or $transfm eq 'G->A'
  133. or $transfm eq 'T->C' or $transfm eq 'C->T'){
  134. $data{'Ti(CDS)'} += $data{transfm_CDS}{$transfm};
  135. }else{
  136. $data{'Tv(CDS)'} += $data{transfm_CDS}{$transfm};
  137. }
  138. }
  139. #######Ti/Tv information collection total ###############
  140. #for my $transfm(keys %{$data{transfm}}){
  141. # if ($transfm eq 'A->G' or $transfm eq 'G->A'
  142. # or $transfm eq 'T->C' or $transfm eq 'C->T'){
  143. # $data{Ti} += $data{transfm}{$transfm};
  144. # }else{
  145. # $data{Tv} += $data{transfm}{$transfm};
  146. # }
  147. #}
  148. open my $w_fh,'>',$out_file or die "Cannot write file:$!";
  149. my $title="Sample_Name:\t$sam";
  150. print $w_fh join("\n",($title,$total,$het,$hom,$het2hom,$dbsnp,$tg)),"\n",$locations,$results;
  151. ######## print Ti Tv Ti/Tv information ############
  152. #for my $i('Ti','Tv','Ti(CDS)','Tv_CDS'){
  153. if (defined $data{'Ti(CDS)'}){
  154. for my $i('Ti(CDS)','Tv(CDS)'){
  155. print $w_fh $i.':';
  156. print $w_fh "\t",$data{$i};
  157. print $w_fh "\n";
  158. }
  159. }
  160. #print $w_fh 'Ti/Tv:';
  161. #print $w_fh "\t",sprintf "%.2f",$data{Ti}/$data{Tv};
  162. #print $w_fh "\n";
  163. if(defined $data{'Ti(CDS)'}){
  164. print $w_fh 'Ti/Tv(CDS):';
  165. print $w_fh "\t",sprintf "%.2f",$data{'Ti(CDS)'}/$data{'Tv(CDS)'};
  166. print $w_fh "\n";
  167. }
  168. print $w_fh "######INDEL######\n";
  169. print $w_fh join("\n",($total_id,$dbsnp_id,$tg_id,$ins,$del,$het_id,$hom_id,$het2hom_id)),"\n",$id_locations,$id_results;
  170. sub is_hetero{
  171. my ($gt)=shift;
  172. my ($a,$b)=split /\//,$gt;
  173. if($a==$b){
  174. return 1;
  175. }else{
  176. return 0;
  177. }
  178. }
  179. sub is_alt{
  180. my ($gt,$alts,$alt)=@_;
  181. my ($a,$b)=split /\//,$gt;
  182. my @base=split /,/,$alts;
  183. @base=(' ',@base);
  184. my $flag;
  185. for($a,$b){
  186. if($base[$_] eq $alt){
  187. return 1;
  188. }
  189. }
  190. return 0;
  191. }
  192. sub hash2string{
  193. my ($hash,$gap)=@_;
  194. unless (defined $hash){
  195. return '';
  196. }
  197. my %hash=%{$hash};
  198. my $string='';
  199. my $pre=$gap? " ":"";
  200. for my $result(sort keys %hash){
  201. next if $result eq '.' or $result=~/ncRNA/ or $result=~/intronic/ or
  202. $result=~/stream/ or $result=~/intergenic/ or $result=~/UTR/ ;
  203. if(defined $string){
  204. $string .= $pre.$result.":";
  205. }else{
  206. $string= $pre.$result.":";
  207. }
  208. $hash{$result}=defined $hash{$result}?$hash{$result}:0;
  209. $string .="\t".$hash{$result};
  210. $string .="\n";
  211. }
  212. return $string;
  213. }