trim_cnv_result.pl 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189
  1. #!/usr/bin/perl
  2. use strict;
  3. use warnings;
  4. use utf8;
  5. binmode(STDOUT,":encoding(gbk)");
  6. binmode(STDIN,":encoding(gbk)");
  7. use open ":encoding(gbk)",":std";
  8. ## Version 2 : add AnnoSV 2020.09.07
  9. ## Version 3 : output format 2021.07.10
  10. my $nb_gene=3;
  11. my $prefix=shift;
  12. my $annosv_f=$prefix.".AnnotSV.tsv";
  13. my $inf= -e $prefix.'-M-cnv.input.xls' ? $prefix.'-M-cnv.input.xls' : $prefix.'-F-cnv.input.xls' ;
  14. my $annof=$prefix.'.cnv.anno.xls';
  15. my $out=$prefix.'.CNV.xls';
  16. my $gcfile=shift;
  17. my (%data,@cols,%map);
  18. open my $in_fh, '<', $inf or die "Cannot open file:$!";
  19. while(<$in_fh>){
  20. next if /^Sample/;
  21. my ($chr,$start,$end,$n_probe)=(split /\t/)[2,3,4,5];
  22. my $id=join("\t",$chr,$start,$end);
  23. $data{$id}{np}=$n_probe;
  24. }
  25. close($in_fh);
  26. open my $gcfh,'<', $gcfile or die "Cannot open file:$! $gcfile\n";
  27. while(<$gcfh>){
  28. chomp;
  29. next if /^ID/;
  30. my ($chr,$start,$end,$gc,$map)=(split /\t/)[0,1,2,4,5];
  31. $chr=~s/^chr//;
  32. $map{$chr}{$start.'_'.$end}=$gc.'_'.$map;
  33. }
  34. close($gcfh);
  35. open my $asv_fh, '<' , $annosv_f or die "Cannot open file:$! $annosv_f";
  36. while(<$asv_fh>){
  37. chomp;
  38. if(/^AnnotSV ID/){
  39. my @conts=split /\t/;
  40. for my $i (0..$#conts){
  41. push @cols ,$i if $conts[$i] eq 'AnnotSV ranking' or $conts[$i] eq 'AnnotSV type'
  42. or $conts[$i] eq 'AnnotSV ID' or $conts[$i] eq 'DGV_GAIN_Frequency'
  43. or $conts[$i] eq 'DGV_LOSS_Frequency';
  44. }
  45. next;
  46. }
  47. my ($svid, $type , $gainAF, $lossAF, $rank)=(split /\t/)[@cols];
  48. next unless $type eq 'full';
  49. my $id=join("\t",(split /_/,$svid)[0..2] );
  50. my $svtype=(split /_/, $svid)[3];
  51. my $af= $svtype eq 'DEL' ? $lossAF : $gainAF;
  52. $af=sprintf("%.5f",$af) if $af>0;
  53. $data{$id}{rank}=$rank."\t".$af;
  54. }
  55. close($asv_fh);
  56. open my $anno_fh, '<' , $annof or die "Cannot open file:$! $annof";
  57. open my $w_fh, '>',$out or die "Cannot write file:$!" ;
  58. while(<$anno_fh>){
  59. chomp;
  60. if(/^Sample/){
  61. print $w_fh join("\t", "检出变异\tChr\tStart\tEnd\t变异类型\t变异长度\t包含基因\t重复/缺失",
  62. "Map",'GC','Rank', 'DGV_AF','Num_Probes',"Sample\tGender\tCopy-ratio",
  63. "Band_region\tGene\tGeneReview\tDECIPHER\tClinVarCNV\tOMIM\tHGMD"), "\n";
  64. next;
  65. }
  66. my ($sample,$gender,$chr,$start,$end,$len,$ratio,$band,$gene,$gr,$dp,$cc,$omim,$hgmd)=split /\t/;
  67. my $id=join("\t",$chr,$start,$end);
  68. my $nprobe=$data{$id}{np};
  69. next if $ratio > 0.64 and $nprobe <8 and $ratio < 1;
  70. next if $ratio >1 and $nprobe <8 and $ratio < 1.36;
  71. my $rank=$data{$id}{rank} ? $data{$id}{rank} : " \t ";
  72. my ($map,$gc,$np);
  73. for my $reg (keys %{$map{$chr} }){
  74. my ($t_start,$t_end)=split /_/,$reg;
  75. if($t_start >= $start and $t_end <= $end){
  76. my ($tgc,$tmap)=split /_/,$map{$chr}{$reg};
  77. $map+=$tmap;
  78. $gc+=$tgc;
  79. $np++;
  80. }
  81. }
  82. my $cnv_map=sprintf("%.2f",$map/$np);
  83. my $cnv_gc=sprintf("%.2f",$gc/$np);
  84. my $cnv_type=$ratio>1? 'dup': 'del';
  85. my $hgvs=cnv_hgvs($chr,$start,$end,$band,$cnv_type);
  86. my $tlen=trim_len($len);
  87. my $re_gene=trim_gene($gene,$omim);
  88. $re_gene=~s/;/, /g;
  89. my $c_type=$ratio>1? '重复': '缺失';
  90. $gene=~s/^gene://;
  91. $gr=~s/^geneReview://;
  92. $dp=~s/^Decipher://;
  93. $cc=~s/^clinvarCnv://;
  94. $omim=~s/^OMIM://;
  95. $hgmd=~s/^HGMD://;
  96. my $sv_type='';
  97. if( $gender eq 'M' and ($chr=~/X/ or $chr=~/Y/) ){
  98. $sv_type= $ratio>1 ? '重复' : '半合子缺失';
  99. }elsif($ratio<0.2){
  100. $sv_type='纯合缺失';
  101. }else{
  102. $sv_type= $ratio>1 ? '重复' : '杂合缺失';
  103. }
  104. $chr= $chr=~/chr/ ? $chr : 'chr'.$chr ;
  105. print $w_fh join("\t",$hgvs,$chr,$start,$end,$sv_type,$tlen,$re_gene,$c_type,
  106. $cnv_map,$cnv_gc,$rank,$nprobe,$sample,$gender,$ratio,$band,$gene,$gr,
  107. $dp,$cc,$omim,$hgmd),"\n";
  108. }
  109. close($anno_fh);
  110. sub cnv_hgvs{
  111. my ($chr,$start,$end, $band, $type,$ref)=@_;
  112. unless(defined $ref){
  113. $ref='GRCh37'
  114. }
  115. $chr=~s/chr//;
  116. $band=~s/^[^qp]+([pq])/$1/;
  117. return "seq[$ref] $type($chr)($band)chr$chr:g.$start".'_'."$end$type";
  118. }
  119. sub trim_len{
  120. my $len=shift;
  121. $len=int($len);
  122. if($len <1000){
  123. return $len.'bp';
  124. }elsif($len <1000000){
  125. my $len_kb=sprintf("%.1f", $len/1000);
  126. return "约".$len_kb.'kb';
  127. }elsif($len < 1000000000){
  128. my $len_mb=sprintf("%.1f", $len/1000000);
  129. return "约".$len_mb.'M' ;
  130. }else{
  131. return 'NA';
  132. }
  133. }
  134. sub trim_gene{
  135. my ($genes, $omim)=@_;
  136. my ($num,$fgene);
  137. $genes=~s/gene://;
  138. $num=0;
  139. $fgene='';
  140. my $o_genes=omim_gene($omim);
  141. for my $gene(split /;/,$o_genes){
  142. next if $gene eq '-';
  143. $num++;
  144. if($num< $nb_gene+1){
  145. $fgene= $fgene ? $fgene.';'.$gene : $gene;
  146. }else{
  147. return $fgene."等";
  148. }
  149. $genes=~s/$gene;//;
  150. }
  151. my $re_gene=$fgene.';'.$genes;
  152. $num=0;
  153. $fgene='';
  154. for my $gene(split /;/,$re_gene){
  155. next unless $gene;
  156. $num++;
  157. return $fgene."等" if $num==$nb_gene+1;
  158. $fgene= $fgene ? $fgene.';'.$gene : $gene;
  159. }
  160. return $fgene;
  161. }
  162. sub omim_gene{
  163. my $omim=shift;
  164. my ($genes,%gen);
  165. return '-' unless defined $omim;
  166. $omim=~s/OMIM://;
  167. $omim=~s/;\s*$//;
  168. return '' if$omim eq '-';
  169. for my $ann (split /;/, $omim ){
  170. my ($gene)=(split /&/,$ann)[-1];
  171. unless(exists $gen{$gene}){
  172. $genes= $genes ? $genes.';'.$gene: $gene;
  173. $gen{$gene}=undef;
  174. }
  175. }
  176. return $genes;
  177. }