#!/usr/bin/perl use strict; use warnings; use File::Temp qw/tempfile/; use Statistics::R; use Getopt::Std; use File::Basename; use File::Find; my $usage="Usage: trim GATK4 GetPileupSummaries result and plot sample AF distribution. chr X/Y/M excluded. result file name like \$sample.ps.tab. \tperl $0 outfile\n Options: -d input directory. default '.' . -r Read counts required. default 20. -f Minimal pop AF frenquency.default 0.01. (0~1) -l plot by lane ID instead all samples, 1 or 0;defaut 1(by lane). lane info in sample ID is required.\n"; my ($in_dir,$dp_lim,$lane,$freq_lim); my $otheraf_lim=0.04; my (%data,@samples); my %opts = (d => '.', r => 20, l=>1, f=> 0.01); my %opts1; getopts(':d:r:l:f:', \%opts1); unless (@ARGV){ print $usage; exit; } my $out=shift; for my $key (keys %opts1){ my $val=$opts1{$key}; $opts{$key}=$val; } for my $key (keys %opts){ my $val=$opts{$key}; if($key eq 'd'){ $in_dir=$val; print "Input directory: $val .\n"; die "Dir:$val doesNot exists.\n" unless -d $val; } if($key eq 'r' ){ $dp_lim= $val; print "Minimal Read Depth is: ", $val,".\n"; } if($key eq 'f' ){ $freq_lim= $val; print "Minimal population frenquence is: ", $val,".\n"; } if($key eq 'l'){ $lane=$val; if($lane){ print "Plotting by lane.\n" }else{ print "Plotting all input samples in one plot.\n" } } } my @files; find(sub { push @files,$File::Find::name if -f and /\.ps.tab$/; }, $in_dir); die "No *.ps.tab file under $in_dir,Please check.\n" unless @files; print "Numbef of Files:" ,scalar @files,"\n"; for my $file (@files){ open my $fh,'<',$file or die "Cannot open file:$!"; my $sample=basename($file); $sample=~s/.ps.tab//; push @samples,$sample; while(<$fh>){ next if /^#/; next if /^contig/; chomp; my ($chr,$pos,$nref,$nalt,$nother,$af)=split /\t/; next if $af< $freq_lim ; next if $chr=~/M/i or $chr=~/X/i or $chr=~/Y/i ; next unless ($nref+$nalt)>$dp_lim; next if $nother/($nref+$nalt+$nother) > $otheraf_lim ; my $freq=int(100*$nalt/($nalt+$nref)+0.5); $data{$sample}{$freq}++; } close($fh); } my ($wfh, $tfile) = tempfile(); $tfile=~s/\\/\//g; print $wfh join("\t",'Sample',map {'P'.$_} (0..100) ),"\n"; ; for my $sample (@samples){ for my $freq(0..100){ $data{$sample}{$freq} =exists $data{$sample}{$freq} ? $data{$sample}{$freq}: 0; } print $wfh join("\t",$sample, map {$data{$sample}{$_} } (0..100) ),"\n"; } close($wfh); my $R =Statistics::R->new(); $R -> run(qq`pdf(file="$out")`); $R -> run(qq`mdata <- read.table(file="$tfile",as.is=TRUE, header=TRUE,row.names = 1,sep="\t")`); $R -> run(qq`row.total <- rowSums(mdata)`); $R -> run(qq`af.mdata <- 100*mdata/row.total`); if($lane){ $R -> run(qq`samples<-rownames(af.mdata) `); $R -> run(qq`lanes<-unique(sub("-.*\$","",samples)) `); my $cmds = < run($cmds); }else{ $R -> run(qq`num<-dim(af.mdata)[1]`); $R -> run(qq`cols<- rainbow(num)`); $R -> run(qq`matplot(t(af.mdata),type='l',pch=2,ylim=c(0,5),lwd=1,col=cols,lty=1:num,main=paste("AF Distribution"),xlab="VAF",ylab="Percent(%)")`); $R -> run(qq`legend("topright", inset=0, legend=rownames(af.mdata),horiz=FALSE,pch=19,lty=1:num,cex=0.6,col=cols)`); $R -> run(qq`abline(v=seq(9,100,by=10),col = "gray80",lty=2) `); $R -> run(qq`abline(v=seq(4,95,by=10),h=0,col = "gray80",lty=3) `); } $R -> run(qq`dev.off() `); unlink($tfile);