qualimap_extract.pl 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218
  1. #!/usr/bin/perl
  2. use strict;
  3. use warnings;
  4. use HTML::TableExtract;
  5. use Getopt::Std;
  6. use File::Basename;
  7. use File::Find;
  8. my $usage="Usage:\n\tperl $0 outfile\n
  9. Options:\n
  10. -d input directory. default '.' .
  11. -s If Scan is required.True for YES,False for NO. default 1 (YES) .
  12. -t Sequencing Type. default 'p' .
  13. 'g' for WGS,'e' for WES,'p' for Panel sequencing.\n";
  14. my ($in_dir,$type,$scan);
  15. my $GFF=0;
  16. my %type=(p=>'Panel',e=>'Exon',g=>'Genome');
  17. my %opts = (d => '.', t => 'p', s=>1);
  18. my %opts1;
  19. getopts(':d:t:s:', \%opts1);
  20. unless (@ARGV){
  21. print $usage;
  22. exit;
  23. }
  24. for my $key (keys %opts1){
  25. my $val=$opts1{$key};
  26. $opts{$key}=$val;
  27. }
  28. for my $key (keys %opts){
  29. my $val=$opts{$key};
  30. if($key eq 'd'){
  31. $in_dir=$val;
  32. print "Input directory: $val .\n";
  33. }
  34. if($key eq 't' ){
  35. $val=lc $val;
  36. $type= $val;
  37. print "Sequencing Type: ", $type{$val},".\n";
  38. }
  39. if($key eq 's'){
  40. $scan=$val;
  41. print "Directory Scan is required.\n" if $scan;
  42. print "Sinple glob $in_dir for *_stats.\n" unless $scan;
  43. }
  44. }
  45. my @ladder=(1,20,30,50,100,300);
  46. if($type eq 'g'){
  47. @ladder=(1,5,10,15,20,30);
  48. }elsif($type eq 'e'){
  49. @ladder=(1,10,20,30,50,100,150);
  50. }
  51. my $OUT=shift;
  52. my (@samples,%data);
  53. my @dirs;
  54. if($scan){
  55. find(sub {
  56. push @dirs,$File::Find::name if -d and /_stats$/;
  57. }, $in_dir);
  58. # print "sanning...\n";
  59. }else{
  60. @dirs=($in_dir);
  61. # print "No sanning...\n";
  62. }
  63. if(@dirs){
  64. print scalar @dirs," directory at $in_dir .\n";
  65. }else{
  66. die "No qualimap result directory ending with *_stats at $in_dir . \n";
  67. }
  68. for my $dir(@dirs){
  69. #my $sam=$dir;
  70. #$sam=~s/$in_dir\/?//;
  71. my $sam=basename $dir;
  72. $sam=~s/_stats$//;
  73. push @samples,$sam;
  74. my $html=$dir.'/qualimapReport.html';
  75. die "NO file:$html. May Qualimap run error." unless -e $html;
  76. my $te =HTML::TableExtract->new( );
  77. $te->parse_file($html);
  78. my @table=();
  79. for my $ts($te -> tables){
  80. my @cords= $ts->coords;
  81. if($cords[1]==0){
  82. for my $row($ts->rows){
  83. my $cmd=$$row[0];
  84. @table= $cmd =~/-gff/ ? (3,6,8,4):(2,4,6);
  85. $GFF=1 if $cmd =~/-gff/;
  86. $data{$sam}{SD}= $cmd =~/-sd/ ?1 :0 ;
  87. }
  88. }
  89. if($cords[1]==$GFF+2){
  90. my $flag=0;
  91. for my $row($ts->rows){
  92. $flag=1 if $$row[0] =~/^Reference size$/;
  93. }
  94. if($flag==0){
  95. @table=map ++$_, @table;
  96. }
  97. }
  98. for my $row ($ts->rows){
  99. if($cords[1]==$table[0]){
  100. if($$row[0] eq 'Number of reads'){
  101. my $str=$$row[1];
  102. $str=~s/,//g;
  103. $data{$sam}{Total}=sprintf("%.1f",$str/1000000);
  104. }
  105. if( $$row[0] eq 'Mapped reads'){
  106. my $str=$$row[1];
  107. $str=~s/%$//;
  108. my ($num,$pct)=split /\//,$str;
  109. $data{$sam}{Map}=$pct;
  110. }
  111. if($$row[0] eq 'Duplicated reads (flagged)' or $$row[0] eq 'Duplicated reads (estimated)'){
  112. my $str=$$row[1];
  113. $str=~s/%$//;
  114. my ($num,$pct)=split /\//,$str;
  115. $data{$sam}{Dup}=$pct;
  116. }
  117. }
  118. if($#table ==3 and $cords[1]==$table[3]){
  119. if($$row[0] eq 'Regions size/percentage of reference'){
  120. my $str=$$row[1];
  121. $str=~s/\s*\/.+$//;
  122. $data{$sam}{T_size}=$str;
  123. }
  124. if( $$row[0] eq 'Mapped reads'){
  125. my $str=$$row[1];
  126. $str=~s/%$//;
  127. my ($num,$pct)=split /\//,$str;
  128. $data{$sam}{T_Map}=$pct;
  129. }
  130. if($$row[0] eq 'Duplicated reads (flagged)' or $$row[0] eq 'Duplicated reads (estimated)'){
  131. my $str=$$row[1];
  132. $str=~s/%$//;
  133. my ($num,$pct)=split /\//,$str;
  134. $data{$sam}{T_Dup}=$pct;
  135. }
  136. }
  137. if($cords[1]==$table[1]){
  138. $data{$sam}{T_Mean}=$$row[1] if $$row[0] eq 'Mean';
  139. if($$row[0] eq 'Mean'){
  140. my $str= $$row[1];
  141. $str=~s/,//g;
  142. $str=sprintf "%d",$str;
  143. $data{$sam}{T_Mean}=$str;
  144. }
  145. }
  146. if($cords[1]==$table[2]){
  147. $data{$sam}{Insert}=$$row[1] if $$row[0] eq 'P25/Median/P75';
  148. }
  149. }
  150. $data{$sam}{Dup}=exists $data{$sam}{Dup} ?$data{$sam}{Dup} :0 ;
  151. $data{$sam}{T_Dup}=exists $data{$sam}{T_Dup} ?$data{$sam}{T_Dup} :0 ;
  152. }
  153. my $fcov=$dir.'/raw_data_qualimapReport/coverage_histogram.txt';
  154. open my $fh,'<' ,$fcov or die "Cannot open file:$!";
  155. my (%covs,$total,%coverage);
  156. my $site0=0;
  157. my ($n20, $base0);
  158. my $Btarget=$data{$sam}{T_size};
  159. $Btarget=~s/,//g;
  160. while(<$fh>){
  161. chomp;
  162. next if /^#/;
  163. my ($read,$site)=(split /\t/)[0,1];
  164. $covs{$read}=$site;
  165. if($GFF){
  166. $Btarget=$Btarget-$site if $read==0 or $read==0.0;
  167. $n20=$read if $site0/$Btarget<0.2 and ($site0+$site)/$Btarget>=0.2;
  168. $site0+=$site if $read>0;
  169. $base0+=$site*$read;
  170. }
  171. $total+=$site;
  172. }
  173. close($fh);
  174. for my $read(keys %covs){
  175. for my $lad(@ladder){
  176. $coverage{$lad} += $covs{$read} if $lad < $read;
  177. }
  178. }
  179. $data{$sam}{Cover}=join ("\t", map {my $count=exists $coverage{$_} ? $coverage{$_} :0;
  180. sprintf "%.3f", $count/$total} @ladder);
  181. my $cov30=$coverage{1} ? sprintf("%.3f",$coverage{30}/$coverage{1}) : 0;
  182. $data{$sam}{Cover} .= "\t".$cov30 ;
  183. $data{$sam}{Fold80}=sprintf("%.3f",$base0/($n20*$site0) ) if $GFF;
  184. }
  185. my (@keys, @outkeys);
  186. if($GFF){
  187. @keys=qw(Total Map T_size T_Map T_Dup T_Mean Insert SD Fold80 Cover);
  188. @outkeys=qw(Total_Read(M) Map(%) T_size On_Target(%) T_Dup(%) T_Mean Insert_Size SD Fold80);
  189. }else{
  190. @keys=qw(Total Map Dup T_Mean Insert SD Cover);
  191. @outkeys=qw(Total_Read(M) Map(%) Dup(%) T_Mean Insert_Size SD);
  192. }
  193. open my $wfh,'>' ,$OUT or die "Cannot write file:$!";
  194. print $wfh join("\t",'Sample',@outkeys,map { '>'.$_.'X'} @ladder),"\tAdjust_30X\n";
  195. for my $sam (sort {
  196. my ($da,$db);
  197. ($da=$data{$a}{T_size})=~s/,//g;
  198. ($db=$data{$b}{T_size})=~s/,//g;
  199. $da <=> $db or $a cmp $b;
  200. } @samples){
  201. print $wfh join("\t",$sam, map { $data{$sam}{$_} } @keys),"\n";
  202. }