fastp2tab.pl 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
  1. #!/usr/bin/perl
  2. use strict;
  3. use warnings;
  4. use JSON::Parse 'json_file_to_perl';
  5. use File::Find;
  6. use List::Util qw(sum0);
  7. use Getopt::Std;
  8. use File::Basename;
  9. my $usage="Usage:\n\tperl $0 outfile\n
  10. Options:\n
  11. -d input directory. default '.' .
  12. -x suffix,iSNP tab file ending with. default .fastp.json.
  13. -s If Scan is required.True for YES,False for NO. default 1 (YES) . \n\n";
  14. my ($in_dir,$scan,$suffix);
  15. my %opts = (d => '.', s=>1, x=> '.fastp.json');
  16. my %opts1;
  17. getopts(':d:q:s:x:', \%opts1);
  18. for my $key (keys %opts1){
  19. my $val=$opts1{$key};
  20. $opts{$key}=$val;
  21. }
  22. for my $key (keys %opts){
  23. my $val=$opts{$key};
  24. if($key eq 'd'){
  25. $in_dir=$val;
  26. print "Input directory: $val .\n";
  27. }
  28. if($key eq 'x'){
  29. $suffix=$val;
  30. print "Input files suffix : $val .\n";
  31. }
  32. if($key eq 's'){
  33. $scan=$val;
  34. print "Directory Scan is required.\n" if $scan;
  35. print "Sinple glob $in_dir for VariantsToTable outputs.\n" unless $scan;
  36. }
  37. }
  38. my @files;
  39. if($scan){
  40. find(sub {
  41. push @files,$File::Find::name if -f and /$suffix$/;
  42. }, $in_dir.'/');
  43. }else{
  44. @files=($in_dir);
  45. }
  46. @files=sort @files;
  47. unless (@ARGV){
  48. print $usage;
  49. exit;
  50. }
  51. my $out=shift;
  52. open my $wfh,'>' ,$out or die "Cannot open file:$!\n";
  53. my @key=('total_reads','total_bases','q20','q30','GC');
  54. my @n_key=map {/total_reads/ ? $_.' (M)': $_ } @key;
  55. @n_key=map {/total_bases/ ? $_.' (G)': $_ } @n_key;
  56. print $wfh join("\t",'Sample',@n_key , (map {if(/total/){
  57. my $k=$_;
  58. $k=~s/total/clean/;
  59. $k;
  60. }else{'clean_'.$_
  61. } } @n_key) ,'F1_AT_Dev','F2_AT_Dev','Clean_rate_read','Clean_rate_base'),"\n";
  62. for my $file(@files){
  63. my $json_f=$file;
  64. my $sample=basename $file;
  65. $sample=~s/.fastp.json//;
  66. my $decoded_json = json_file_to_perl( $json_f );
  67. my ($f1_gc_dev,$f2_gc_dev)=(0,0);
  68. for my $key(sort keys %{$decoded_json}){
  69. if ($key eq 'read1_after_filtering'){
  70. my $sum=${$decoded_json}{$key};
  71. $f1_gc_dev=get_dev($sum);
  72. }
  73. if ($key eq 'read2_after_filtering'){
  74. my $sum=${$decoded_json}{$key};
  75. $f2_gc_dev=get_dev($sum);
  76. }
  77. next unless $key eq 'summary';
  78. my $sum=${$decoded_json}{$key};
  79. my $b_reads=sprintf "%.2f", ${$sum}{'before_filtering'}{'total_reads'}/1000000;
  80. my $b_bases=sprintf "%.2f", ${$sum}{'before_filtering'}{'total_bases'}/1000000000;
  81. my $b_q20=sprintf "%.2f", ${$sum}{'before_filtering'}{'q20_rate'};
  82. my $b_q30=sprintf "%.2f", ${$sum}{'before_filtering'}{'q30_rate'};
  83. my $b_gc=sprintf "%.3f", ${$sum}{'before_filtering'}{'gc_content'};
  84. my $a_reads=sprintf "%.2f", ${$sum}{'after_filtering'}{'total_reads'}/1000000;
  85. my $a_bases=sprintf "%.2f", ${$sum}{'after_filtering'}{'total_bases'}/1000000000;
  86. my $a_q20=sprintf "%.2f", ${$sum}{'after_filtering'}{'q20_rate'};
  87. my $a_q30=sprintf "%.2f", ${$sum}{'after_filtering'}{'q30_rate'};
  88. my $a_gc=sprintf "%.3f", ${$sum}{'after_filtering'}{'gc_content'};
  89. my $clean_r=sprintf "%.2f",${$sum}{'after_filtering'}{'total_reads'}/${$sum}{'before_filtering'}{'total_reads'};
  90. my $clean_b=sprintf "%.2f",${$sum}{'after_filtering'}{'total_bases'}/${$sum}{'before_filtering'}{'total_bases'};
  91. print $wfh join("\t",$sample,$b_reads,$b_bases,$b_q20,$b_q30,$b_gc,
  92. $a_reads,$a_bases,$a_q20,$a_q30,$a_gc,$f1_gc_dev,$f2_gc_dev,$clean_r,$clean_b),"\n";
  93. }
  94. }
  95. sub get_dev{
  96. my $sum=shift;
  97. my @G_conts=@{${$sum}{'content_curves'}{'T'}};
  98. my @C_conts=@{${$sum}{'content_curves'}{'A'}};
  99. my $G_avg=sum0(@G_conts[10..$#G_conts])/($#G_conts-10);
  100. my $C_avg=sum0(@C_conts[10..$#C_conts])/($#C_conts-10);
  101. my $deno=$G_avg>$C_avg ? $G_avg:$C_avg;
  102. my $f1_gc=sprintf("%.3f",abs(($G_avg-$C_avg)/$deno ) );
  103. return $f1_gc;
  104. }