qualimap_extract.pl 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
  1. #!/usr/bin/perl
  2. #2207 ADD ERROR RATE
  3. use strict;
  4. use warnings;
  5. use HTML::TableExtract;
  6. use Getopt::Std;
  7. use File::Basename;
  8. use File::Find;
  9. my $usage="Usage:\n\tperl $0 outfile\n
  10. Options:\n
  11. -d input directory. default '.' .
  12. -s If Scan is required.True for YES,False for NO. default 1 (YES) .
  13. -t Sequencing Type. default 'p' .
  14. 'g' for WGS,'e' for WES,'p' for Panel sequencing.\n";
  15. my ($in_dir,$type,$scan);
  16. my $GFF=0;
  17. my %type=(p=>'Panel',e=>'Exon',g=>'Genome');
  18. my %opts = (d => '.', t => 'p', s=>1);
  19. my %opts1;
  20. getopts(':d:t:s:', \%opts1);
  21. unless (@ARGV){
  22. print $usage;
  23. exit;
  24. }
  25. for my $key (keys %opts1){
  26. my $val=$opts1{$key};
  27. $opts{$key}=$val;
  28. }
  29. for my $key (keys %opts){
  30. my $val=$opts{$key};
  31. if($key eq 'd'){
  32. $in_dir=$val;
  33. print "Input directory: $val .\n";
  34. }
  35. if($key eq 't' ){
  36. $val=lc $val;
  37. $type= $val;
  38. print "Sequencing Type: ", $type{$val},".\n";
  39. }
  40. if($key eq 's'){
  41. $scan=$val;
  42. print "Directory Scan is required.\n" if $scan;
  43. print "Sinple glob $in_dir for *_stats.\n" unless $scan;
  44. }
  45. }
  46. my @ladder=(1,20,30,50,100,300);
  47. if($type eq 'g'){
  48. @ladder=(1,5,10,15,20,30);
  49. }elsif($type eq 'e'){
  50. @ladder=(1,10,20,30,50,100,150);
  51. }
  52. my $OUT=shift;
  53. my (@samples,%data);
  54. my @dirs;
  55. if($scan){
  56. find(sub {
  57. push @dirs,$File::Find::name if -d and /_stats$/;
  58. }, $in_dir);
  59. # print "sanning...\n";
  60. }else{
  61. @dirs=($in_dir);
  62. # print "No sanning...\n";
  63. }
  64. if(@dirs){
  65. print scalar @dirs," directory at $in_dir .\n";
  66. }else{
  67. die "No qualimap result directory ending with *_stats at $in_dir . \n";
  68. }
  69. for my $dir(@dirs){
  70. #my $sam=$dir;
  71. #$sam=~s/$in_dir\/?//;
  72. my $sam=basename $dir;
  73. $sam=~s/_stats$//;
  74. push @samples,$sam;
  75. my $html=$dir.'/qualimapReport.html';
  76. die "NO file:$html. May Qualimap run error." unless -e $html;
  77. my $te =HTML::TableExtract->new( );
  78. $te->parse_file($html);
  79. my @table=();
  80. for my $ts($te -> tables){
  81. my @cords= $ts->coords;
  82. if($cords[1]==0){
  83. for my $row($ts->rows){
  84. my $cmd=$$row[0];
  85. @table= $cmd =~/-gff/ ? (3,6,8,9,4):(2,4,6,7);
  86. $GFF=1 if $cmd =~/-gff/;
  87. $data{$sam}{SD}= $cmd =~/-sd/ ?1 :0 ;
  88. }
  89. }
  90. if($cords[1]==$GFF+2){
  91. my $flag=0;
  92. for my $row($ts->rows){
  93. $flag=1 if $$row[0] =~/^Reference size$/;
  94. }
  95. if($flag==0){
  96. @table=map ++$_, @table;
  97. }
  98. }
  99. for my $row ($ts->rows){
  100. if($cords[1]==$table[0]){
  101. if($$row[0] eq 'Number of reads'){
  102. my $str=$$row[1];
  103. $str=~s/,//g;
  104. $data{$sam}{Total}=sprintf("%.1f",$str/1000000);
  105. }
  106. if( $$row[0] eq 'Mapped reads'){
  107. my $str=$$row[1];
  108. $str=~s/%$//;
  109. my ($num,$pct)=split /\//,$str;
  110. $data{$sam}{Map}=$pct;
  111. }
  112. if($$row[0] eq 'Duplicated reads (flagged)' or $$row[0] eq 'Duplicated reads (estimated)'){
  113. my $str=$$row[1];
  114. $str=~s/%$//;
  115. my ($num,$pct)=split /\//,$str;
  116. $data{$sam}{Dup}=$pct;
  117. }
  118. }
  119. if($#table ==4 and $cords[1]==$table[4]){
  120. if($$row[0] eq 'Regions size/percentage of reference'){
  121. my $str=$$row[1];
  122. $str=~s/\s*\/.+$//;
  123. $data{$sam}{T_size}=$str;
  124. }
  125. if( $$row[0] eq 'Mapped reads'){
  126. my $str=$$row[1];
  127. $str=~s/%$//;
  128. my ($num,$pct)=split /\//,$str;
  129. $data{$sam}{T_Map}=$pct;
  130. }
  131. if($$row[0] eq 'Duplicated reads (flagged)' or $$row[0] eq 'Duplicated reads (estimated)'){
  132. my $str=$$row[1];
  133. $str=~s/%$//;
  134. my ($num,$pct)=split /\//,$str;
  135. $data{$sam}{T_Dup}=$pct;
  136. }
  137. }
  138. if($cords[1]==$table[1]){
  139. $data{$sam}{T_Mean}=$$row[1] if $$row[0] eq 'Mean';
  140. if($$row[0] eq 'Mean'){
  141. my $str= $$row[1];
  142. $str=~s/,//g;
  143. $str=sprintf "%.0f",$str;
  144. $data{$sam}{T_Mean}=$str;
  145. }
  146. }
  147. if($cords[1]==$table[2]){
  148. $data{$sam}{Insert}=$$row[1] if $$row[0] eq 'P25/Median/P75';
  149. }
  150. if($cords[1]==$table[3]){
  151. if( $$row[0] eq 'General error rate'){
  152. $data{$sam}{ERROR}=$$row[1];
  153. $data{$sam}{ERROR}=~s/%$//;
  154. }
  155. }
  156. }
  157. $data{$sam}{Dup}=exists $data{$sam}{Dup} ?$data{$sam}{Dup} :0 ;
  158. $data{$sam}{T_Dup}=exists $data{$sam}{T_Dup} ?$data{$sam}{T_Dup} :0 ;
  159. }
  160. my $fcov=$dir.'/raw_data_qualimapReport/coverage_histogram.txt';
  161. open my $fh,'<' ,$fcov or die "Cannot open file:$!";
  162. my (%covs,$total,%coverage);
  163. my $site0=0;
  164. my ($n20, $base0);
  165. my $Btarget=$data{$sam}{T_size};
  166. $Btarget=~s/,//g;
  167. while(<$fh>){
  168. chomp;
  169. next if /^#/;
  170. my ($read,$site)=(split /\t/)[0,1];
  171. $covs{$read}=$site;
  172. if($GFF){
  173. $Btarget=$Btarget-$site if $read==0 or $read==0.0;
  174. $n20=$read if $site0/$Btarget<0.2 and ($site0+$site)/$Btarget>=0.2;
  175. $site0+=$site if $read>0;
  176. $base0+=$site*$read;
  177. }
  178. $total+=$site;
  179. }
  180. close($fh);
  181. for my $read(keys %covs){
  182. for my $lad(@ladder){
  183. $coverage{$lad} += $covs{$read} if $lad < $read;
  184. }
  185. }
  186. $data{$sam}{Cover}=join ("\t", map {my $count=exists $coverage{$_} ? $coverage{$_} :0;
  187. sprintf "%.3f", $count/$total} @ladder);
  188. my $cov30=$coverage{1} ? sprintf("%.3f",$coverage{30}/$coverage{1}) : 0;
  189. $data{$sam}{Cover} .= "\t".$cov30 ;
  190. $data{$sam}{Fold80}=sprintf("%.3f",$base0/($n20*$site0) ) if $GFF;
  191. }
  192. my (@keys, @outkeys);
  193. if($GFF){
  194. @keys=qw(Total Map T_size T_Map T_Dup T_Mean Insert ERROR SD Fold80 Cover);
  195. @outkeys=qw(Total_Read(M) Map(%) T_size On_Target(%) T_Dup(%) T_Mean Insert_Size ERROR(%) SD Fold80);
  196. }else{
  197. @keys=qw(Total Map Dup T_Mean Insert ERROR SD Cover);
  198. @outkeys=qw(Total_Read(M) Map(%) Dup(%) T_Mean Insert_Size ERROR(%) SD);
  199. }
  200. open my $wfh,'>' ,$OUT or die "Cannot write file:$!";
  201. print $wfh join("\t",'Sample',@outkeys,map { '>'.$_.'X'} @ladder),"\tAdjust_30X\n";
  202. for my $sam (sort {
  203. my ($da,$db);
  204. ($da=$data{$a}{T_size})=~s/,//g;
  205. ($db=$data{$b}{T_size})=~s/,//g;
  206. $da <=> $db or $a cmp $b;
  207. } @samples){
  208. print $wfh join("\t",$sam, map { $data{$sam}{$_} } @keys),"\n";
  209. }