trim_PileupSum.pl 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138
  1. #!/usr/bin/perl
  2. use strict;
  3. use warnings;
  4. use File::Temp qw/tempfile/;
  5. use Statistics::R;
  6. use Getopt::Std;
  7. use File::Basename;
  8. use File::Find;
  9. my $usage="Usage: trim GATK4 GetPileupSummaries result and plot sample AF distribution.
  10. chr X/Y/M excluded. result file name like \$sample.ps.tab.
  11. \tperl $0 outfile\n
  12. Options:
  13. -d input directory. default '.' .
  14. -r Read counts required. default 20.
  15. -f Minimal pop AF frenquency.default 0.01. (0~1)
  16. -l plot by lane ID instead all samples, 1 or 0;defaut 1(by lane).
  17. lane info in sample ID is required.\n";
  18. my ($in_dir,$dp_lim,$lane,$freq_lim);
  19. my $otheraf_lim=0.04;
  20. my (%data,@samples);
  21. my %opts = (d => '.', r => 20, l=>1, f=> 0.01);
  22. my %opts1;
  23. getopts(':d:r:l:f:', \%opts1);
  24. unless (@ARGV){
  25. print $usage;
  26. exit;
  27. }
  28. my $out=shift;
  29. for my $key (keys %opts1){
  30. my $val=$opts1{$key};
  31. $opts{$key}=$val;
  32. }
  33. for my $key (keys %opts){
  34. my $val=$opts{$key};
  35. if($key eq 'd'){
  36. $in_dir=$val;
  37. print "Input directory: $val .\n";
  38. die "Dir:$val doesNot exists.\n" unless -d $val;
  39. }
  40. if($key eq 'r' ){
  41. $dp_lim= $val;
  42. print "Minimal Read Depth is: ", $val,".\n";
  43. }
  44. if($key eq 'f' ){
  45. $freq_lim= $val;
  46. print "Minimal population frenquence is: ", $val,".\n";
  47. }
  48. if($key eq 'l'){
  49. $lane=$val;
  50. if($lane){
  51. print "Plotting by lane.\n"
  52. }else{
  53. print "Plotting all input samples in one plot.\n"
  54. }
  55. }
  56. }
  57. my @files;
  58. find(sub {
  59. push @files,$File::Find::name if -f and /\.ps.tab$/;
  60. }, $in_dir);
  61. die "No *.ps.tab file under $in_dir,Please check.\n" unless @files;
  62. print "Numbef of Files:" ,scalar @files,"\n";
  63. for my $file (@files){
  64. open my $fh,'<',$file or die "Cannot open file:$!";
  65. my $sample=basename($file);
  66. $sample=~s/.ps.tab//;
  67. push @samples,$sample;
  68. while(<$fh>){
  69. next if /^#/;
  70. next if /^contig/;
  71. chomp;
  72. my ($chr,$pos,$nref,$nalt,$nother,$af)=split /\t/;
  73. next if $af< $freq_lim ;
  74. next if $chr=~/M/i or $chr=~/X/i or $chr=~/Y/i ;
  75. next unless ($nref+$nalt)>$dp_lim;
  76. next if $nother/($nref+$nalt+$nother) > $otheraf_lim ;
  77. my $freq=int(100*$nalt/($nalt+$nref)+0.5);
  78. $data{$sample}{$freq}++;
  79. }
  80. close($fh);
  81. }
  82. my ($wfh, $tfile) = tempfile();
  83. $tfile=~s/\\/\//g;
  84. print $wfh join("\t",'Sample',map {'P'.$_} (0..100) ),"\n";
  85. ;
  86. for my $sample (@samples){
  87. for my $freq(0..100){
  88. $data{$sample}{$freq} =exists $data{$sample}{$freq} ? $data{$sample}{$freq}: 0;
  89. }
  90. print $wfh join("\t",$sample, map {$data{$sample}{$_} } (0..100) ),"\n";
  91. }
  92. close($wfh);
  93. my $R =Statistics::R->new();
  94. $R -> run(qq`pdf(file="$out")`);
  95. $R -> run(qq`mdata <- read.table(file="$tfile",as.is=TRUE,
  96. header=TRUE,row.names = 1,sep="\t")`);
  97. $R -> run(qq`row.total <- rowSums(mdata)`);
  98. $R -> run(qq`af.mdata <- 100*mdata/row.total`);
  99. if($lane){
  100. $R -> run(qq`samples<-rownames(af.mdata) `);
  101. $R -> run(qq`lanes<-unique(sub("-.*\$","",samples)) `);
  102. my $cmds = <<EOF;
  103. for(lane in lanes){
  104. sub.mdata<-af.mdata[grep(lane,samples),]
  105. num<-dim(sub.mdata)[1]
  106. cols<- rainbow(num)
  107. matplot(t(sub.mdata),type='l',pch=2,ylim=c(0,5),lwd=1,col=cols,lty=1:num,main=paste("AF Distribution"),xlab="VAF",ylab="Percent(%)" )
  108. legend("topright", inset=0, legend=rownames(sub.mdata),horiz=FALSE,pch=19,col=cols,lty=1:num,cex=0.6)
  109. abline(v=seq(9,100,by=10),col = "gray80",lty=2)
  110. abline(v=seq(4,95,by=10),h=0,col = "gray80",lty=3)
  111. }
  112. EOF
  113. $R -> run($cmds);
  114. }else{
  115. $R -> run(qq`num<-dim(af.mdata)[1]`);
  116. $R -> run(qq`cols<- rainbow(num)`);
  117. $R -> run(qq`matplot(t(af.mdata),type='l',pch=2,ylim=c(0,5),lwd=1,col=cols,lty=1:num,main=paste("AF Distribution"),xlab="VAF",ylab="Percent(%)")`);
  118. $R -> run(qq`legend("topright", inset=0, legend=rownames(af.mdata),horiz=FALSE,pch=19,lty=1:num,cex=0.6,col=cols)`);
  119. $R -> run(qq`abline(v=seq(9,100,by=10),col = "gray80",lty=2) `);
  120. $R -> run(qq`abline(v=seq(4,95,by=10),h=0,col = "gray80",lty=3) `);
  121. }
  122. $R -> run(qq`dev.off() `);
  123. unlink($tfile);