123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138 |
- #!/usr/bin/perl
- use strict;
- use warnings;
- use File::Temp qw/tempfile/;
- use Statistics::R;
- use Getopt::Std;
- use File::Basename;
- use File::Find;
- my $usage="Usage: trim GATK4 GetPileupSummaries result and plot sample AF distribution.
- chr X/Y/M excluded. result file name like \$sample.ps.tab.
- \tperl $0 outfile\n
- Options:
- -d input directory. default '.' .
- -r Read counts required. default 20.
- -f Minimal pop AF frenquency.default 0.01. (0~1)
- -l plot by lane ID instead all samples, 1 or 0;defaut 1(by lane).
- lane info in sample ID is required.\n";
-
- my ($in_dir,$dp_lim,$lane,$freq_lim);
- my $otheraf_lim=0.04;
- my (%data,@samples);
- my %opts = (d => '.', r => 20, l=>1, f=> 0.01);
- my %opts1;
- getopts(':d:r:l:f:', \%opts1);
- unless (@ARGV){
- print $usage;
- exit;
- }
- my $out=shift;
- for my $key (keys %opts1){
- my $val=$opts1{$key};
- $opts{$key}=$val;
- }
- for my $key (keys %opts){
- my $val=$opts{$key};
- if($key eq 'd'){
- $in_dir=$val;
- print "Input directory: $val .\n";
- die "Dir:$val doesNot exists.\n" unless -d $val;
- }
- if($key eq 'r' ){
- $dp_lim= $val;
- print "Minimal Read Depth is: ", $val,".\n";
- }
- if($key eq 'f' ){
- $freq_lim= $val;
- print "Minimal population frenquence is: ", $val,".\n";
- }
- if($key eq 'l'){
- $lane=$val;
- if($lane){
- print "Plotting by lane.\n"
- }else{
- print "Plotting all input samples in one plot.\n"
- }
- }
- }
- my @files;
- find(sub {
- push @files,$File::Find::name if -f and /\.ps.tab$/;
- }, $in_dir);
- die "No *.ps.tab file under $in_dir,Please check.\n" unless @files;
- print "Numbef of Files:" ,scalar @files,"\n";
- for my $file (@files){
- open my $fh,'<',$file or die "Cannot open file:$!";
- my $sample=basename($file);
- $sample=~s/.ps.tab//;
- push @samples,$sample;
-
- while(<$fh>){
- next if /^#/;
- next if /^contig/;
- chomp;
- my ($chr,$pos,$nref,$nalt,$nother,$af)=split /\t/;
- next if $af< $freq_lim ;
- next if $chr=~/M/i or $chr=~/X/i or $chr=~/Y/i ;
- next unless ($nref+$nalt)>$dp_lim;
- next if $nother/($nref+$nalt+$nother) > $otheraf_lim ;
- my $freq=int(100*$nalt/($nalt+$nref)+0.5);
- $data{$sample}{$freq}++;
- }
- close($fh);
-
- }
- my ($wfh, $tfile) = tempfile();
- $tfile=~s/\\/\//g;
- print $wfh join("\t",'Sample',map {'P'.$_} (0..100) ),"\n";
- ;
- for my $sample (@samples){
- for my $freq(0..100){
- $data{$sample}{$freq} =exists $data{$sample}{$freq} ? $data{$sample}{$freq}: 0;
- }
- print $wfh join("\t",$sample, map {$data{$sample}{$_} } (0..100) ),"\n";
-
- }
- close($wfh);
- my $R =Statistics::R->new();
- $R -> run(qq`pdf(file="$out")`);
- $R -> run(qq`mdata <- read.table(file="$tfile",as.is=TRUE,
- header=TRUE,row.names = 1,sep="\t")`);
- $R -> run(qq`row.total <- rowSums(mdata)`);
- $R -> run(qq`af.mdata <- 100*mdata/row.total`);
- if($lane){
- $R -> run(qq`samples<-rownames(af.mdata) `);
- $R -> run(qq`lanes<-unique(sub("-.*\$","",samples)) `);
- my $cmds = <<EOF;
- for(lane in lanes){
- sub.mdata<-af.mdata[grep(lane,samples),]
- num<-dim(sub.mdata)[1]
- cols<- rainbow(num)
- matplot(t(sub.mdata),type='l',pch=2,ylim=c(0,5),lwd=1,col=cols,lty=1:num,main=paste("AF Distribution"),xlab="VAF",ylab="Percent(%)" )
- legend("topright", inset=0, legend=rownames(sub.mdata),horiz=FALSE,pch=19,col=cols,lty=1:num,cex=0.6)
- abline(v=seq(9,100,by=10),col = "gray80",lty=2)
- abline(v=seq(4,95,by=10),h=0,col = "gray80",lty=3)
- }
- EOF
- $R -> run($cmds);
- }else{
- $R -> run(qq`num<-dim(af.mdata)[1]`);
- $R -> run(qq`cols<- rainbow(num)`);
- $R -> run(qq`matplot(t(af.mdata),type='l',pch=2,ylim=c(0,5),lwd=1,col=cols,lty=1:num,main=paste("AF Distribution"),xlab="VAF",ylab="Percent(%)")`);
- $R -> run(qq`legend("topright", inset=0, legend=rownames(af.mdata),horiz=FALSE,pch=19,lty=1:num,cex=0.6,col=cols)`);
- $R -> run(qq`abline(v=seq(9,100,by=10),col = "gray80",lty=2) `);
- $R -> run(qq`abline(v=seq(4,95,by=10),h=0,col = "gray80",lty=3) `);
- }
- $R -> run(qq`dev.off() `);
- unlink($tfile);
|