123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111 |
#!/usr/bin/perl
# Summarise fastp JSON reports into a single tab-separated QC table.
use strict;
use warnings;
use JSON::Parse 'json_file_to_perl';
use File::Find;
use List::Util qw(sum0);
use Getopt::Std;
use File::Basename;

my $usage = "Usage:\n\tperl $0 outfile\n
Options:\n
-d input directory. default '.' .
-x suffix,iSNP tab file ending with. default .fastp.json.
-s If Scan is required.True for YES,False for NO. default 1 (YES) . \n\n";

# Defaults, overridden by whatever getopts parsed.
my %opts = (d => '.', s => 1, x => '.fastp.json');
my %opts1;
# NOTE(review): the original spec was ':d:q:s:x:' — the leading ':' defined a
# bogus option named ':'.  '-q' is still accepted (and ignored) so old command
# lines keep working.
getopts('d:q:s:x:', \%opts1);
$opts{$_} = $opts1{$_} for keys %opts1;

# Assign in a fixed order.  The original iterated `keys %opts` (random hash
# order), so the "glob $in_dir" message below could interpolate an
# uninitialized $in_dir whenever 's' happened to be visited before 'd'.
my $in_dir = $opts{d};
my $suffix = $opts{x};
my $scan   = $opts{s};
print "Input directory: $in_dir .\n";
print "Input files suffix : $suffix .\n";
print "Directory Scan is required.\n" if $scan;
print "Simple glob $in_dir for VariantsToTable outputs.\n" unless $scan;

# Bail out before doing any (possibly expensive) directory walk when the
# mandatory output-file argument is missing.
unless (@ARGV) {
    print $usage;
    exit;
}

my @files;
if ($scan) {
    # Recursively collect every regular file ending in $suffix.  \Q...\E
    # escapes the dots; the original interpolated the suffix as a raw
    # pattern, so '.fastp.json' matched any character in the dot positions.
    find(sub {
        push @files, $File::Find::name if -f and /\Q$suffix\E$/;
    }, $in_dir . '/');
} else {
    # No scan: treat -d as a single input file path.
    @files = ($in_dir);
}
@files = sort @files;
# Open the report file and emit the header row.
my $out = shift;
open my $wfh, '>', $out or die "Cannot open file:$!\n";

my @key = ('total_reads', 'total_bases', 'q20', 'q30', 'GC');
# Annotate the count columns with their units: millions of reads, gigabases.
my @n_key = map { /total_reads/ ? "$_ (M)" : $_ } @key;
@n_key = map { /total_bases/ ? "$_ (G)" : $_ } @n_key;

# Post-filter columns mirror the pre-filter ones: "total_*" becomes
# "clean_*", anything else is prefixed with "clean_".
my @clean_key = map {
    my $col = $_;
    $col =~ s/total/clean/ ? $col : 'clean_' . $col;
} @n_key;

print $wfh join("\t",
    'Sample', @n_key, @clean_key,
    'F1_AT_Dev', 'F2_AT_Dev', 'Clean_rate_read', 'Clean_rate_base',
), "\n";
# Emit one summary row per fastp JSON report collected above.
for my $file (@files) {
    my $sample = basename $file;
    # Strip the configured suffix to get the sample name.  The original
    # hard-coded s/.fastp.json// — it ignored the -x option, its unescaped
    # dots matched any character, and it was unanchored.
    $sample =~ s/\Q$suffix\E\z//;

    my $decoded_json = json_file_to_perl($file);
    my ($f1_at_dev, $f2_at_dev) = (0, 0);

    # Keys are visited in sorted order, so 'read1_after_filtering' and
    # 'read2_after_filtering' are processed (and the AT deviations computed)
    # before 'summary' prints the row.
    for my $key (sort keys %{$decoded_json}) {
        if ($key eq 'read1_after_filtering') {
            $f1_at_dev = get_dev($decoded_json->{$key});
        }
        if ($key eq 'read2_after_filtering') {
            $f2_at_dev = get_dev($decoded_json->{$key});
        }
        next unless $key eq 'summary';

        my $before = $decoded_json->{$key}{before_filtering};
        my $after  = $decoded_json->{$key}{after_filtering};

        my $b_reads = sprintf "%.2f", $before->{total_reads} / 1000000;
        my $b_bases = sprintf "%.2f", $before->{total_bases} / 1000000000;
        my $b_q20   = sprintf "%.2f", $before->{q20_rate};
        my $b_q30   = sprintf "%.2f", $before->{q30_rate};
        my $b_gc    = sprintf "%.3f", $before->{gc_content};
        my $a_reads = sprintf "%.2f", $after->{total_reads} / 1000000;
        my $a_bases = sprintf "%.2f", $after->{total_bases} / 1000000000;
        my $a_q20   = sprintf "%.2f", $after->{q20_rate};
        my $a_q30   = sprintf "%.2f", $after->{q30_rate};
        my $a_gc    = sprintf "%.3f", $after->{gc_content};

        # Guard against division by zero on reports with no input reads/bases.
        my $clean_r = $before->{total_reads}
            ? sprintf("%.2f", $after->{total_reads} / $before->{total_reads})
            : '0.00';
        my $clean_b = $before->{total_bases}
            ? sprintf("%.2f", $after->{total_bases} / $before->{total_bases})
            : '0.00';

        print $wfh join("\t",
            $sample, $b_reads, $b_bases, $b_q20, $b_q30, $b_gc,
            $a_reads, $a_bases, $a_q20, $a_q30, $a_gc,
            $f1_at_dev, $f2_at_dev, $clean_r, $clean_b,
        ), "\n";
    }
}
# get_dev($section) — compute the A/T content deviation for one read-group
# section of a fastp report (feeds the F1_AT_Dev / F2_AT_Dev columns).
#
# Takes a hashref with a content_curves sub-hash holding per-position base
# content arrays; averages the T and A curves past position 9 (the first 10
# cycles are skipped as noisy) and returns |T_avg - A_avg| / max(T_avg, A_avg)
# formatted with "%.3f".
sub get_dev {
    my $section = shift;

    # NOTE: these really are the T and A curves (not G/C) — the original
    # variable names @G_conts/@C_conts were misleading.
    my @t_conts = @{ $section->{content_curves}{T} };
    my @a_conts = @{ $section->{content_curves}{A} };

    # Need at least one position past the skipped prefix.  The original
    # divided by ($#arr - 10), which is an off-by-one (slice 10..$# holds
    # $#-9 elements) and divides by zero for 11-element curves; the error
    # cancelled in the final ratio only when both curves had equal length.
    return '0.000' if @t_conts <= 10 or @a_conts <= 10;

    my $t_avg = sum0(@t_conts[10 .. $#t_conts]) / (@t_conts - 10);
    my $a_avg = sum0(@a_conts[10 .. $#a_conts]) / (@a_conts - 10);

    my $deno = $t_avg > $a_avg ? $t_avg : $a_avg;
    # Identical (e.g. all-zero) curves: deviation is zero; avoid dividing by 0.
    return '0.000' if $deno == 0;

    return sprintf "%.3f", abs(($t_avg - $a_avg) / $deno);
}
|