123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189 |
- #!/usr/bin/perl
- use strict;
- use warnings;
- use utf8;
- binmode(STDOUT,":encoding(gbk)");
- binmode(STDIN,":encoding(gbk)");
- use open ":encoding(gbk)",":std";
- ## Version 2 : add AnnoSV 2020.09.07
- ## Version 3 : output format 2021.07.10
- my $nb_gene=3;
- my $prefix=shift;
- my $annosv_f=$prefix.".AnnotSV.tsv";
- my $inf= -e $prefix.'-M-cnv.input.xls' ? $prefix.'-M-cnv.input.xls' : $prefix.'-F-cnv.input.xls' ;
- my $annof=$prefix.'.cnv.anno.xls';
- my $out=$prefix.'.CNV.xls';
- my $gcfile=shift;
- my (%data,@cols,%map);
- open my $in_fh, '<', $inf or die "Cannot open file:$!";
- while(<$in_fh>){
- next if /^Sample/;
- my ($chr,$start,$end,$n_probe)=(split /\t/)[2,3,4,5];
- my $id=join("\t",$chr,$start,$end);
- $data{$id}{np}=$n_probe;
- }
- close($in_fh);
- open my $gcfh,'<', $gcfile or die "Cannot open file:$! $gcfile\n";
- while(<$gcfh>){
- chomp;
- next if /^ID/;
- my ($chr,$start,$end,$gc,$map)=(split /\t/)[0,1,2,4,5];
- $chr=~s/^chr//;
- $map{$chr}{$start.'_'.$end}=$gc.'_'.$map;
- }
- close($gcfh);
- open my $asv_fh, '<' , $annosv_f or die "Cannot open file:$! $annosv_f";
- while(<$asv_fh>){
- chomp;
- if(/^AnnotSV ID/){
- my @conts=split /\t/;
- for my $i (0..$#conts){
- push @cols ,$i if $conts[$i] eq 'AnnotSV ranking' or $conts[$i] eq 'AnnotSV type'
- or $conts[$i] eq 'AnnotSV ID' or $conts[$i] eq 'DGV_GAIN_Frequency'
- or $conts[$i] eq 'DGV_LOSS_Frequency';
- }
- next;
- }
- my ($svid, $type , $gainAF, $lossAF, $rank)=(split /\t/)[@cols];
- next unless $type eq 'full';
- my $id=join("\t",(split /_/,$svid)[0..2] );
- my $svtype=(split /_/, $svid)[3];
- my $af= $svtype eq 'DEL' ? $lossAF : $gainAF;
- $af=sprintf("%.5f",$af) if $af>0;
- $data{$id}{rank}=$rank."\t".$af;
- }
- close($asv_fh);
- open my $anno_fh, '<' , $annof or die "Cannot open file:$! $annof";
- open my $w_fh, '>',$out or die "Cannot write file:$!" ;
- while(<$anno_fh>){
- chomp;
- if(/^Sample/){
- print $w_fh join("\t", "检出变异\tChr\tStart\tEnd\t变异类型\t变异长度\t包含基因\t重复/缺失",
- "Map",'GC','Rank', 'DGV_AF','Num_Probes',"Sample\tGender\tCopy-ratio",
- "Band_region\tGene\tGeneReview\tDECIPHER\tClinVarCNV\tOMIM\tHGMD"), "\n";
- next;
- }
- my ($sample,$gender,$chr,$start,$end,$len,$ratio,$band,$gene,$gr,$dp,$cc,$omim,$hgmd)=split /\t/;
- my $id=join("\t",$chr,$start,$end);
- my $nprobe=$data{$id}{np};
- next if $ratio > 0.64 and $nprobe <8 and $ratio < 1;
- next if $ratio >1 and $nprobe <8 and $ratio < 1.36;
- my $rank=$data{$id}{rank} ? $data{$id}{rank} : " \t ";
- my ($map,$gc,$np);
- for my $reg (keys %{$map{$chr} }){
- my ($t_start,$t_end)=split /_/,$reg;
- if($t_start >= $start and $t_end <= $end){
- my ($tgc,$tmap)=split /_/,$map{$chr}{$reg};
- $map+=$tmap;
- $gc+=$tgc;
- $np++;
- }
- }
- my $cnv_map=sprintf("%.2f",$map/$np);
- my $cnv_gc=sprintf("%.2f",$gc/$np);
- my $cnv_type=$ratio>1? 'dup': 'del';
- my $hgvs=cnv_hgvs($chr,$start,$end,$band,$cnv_type);
- my $tlen=trim_len($len);
- my $re_gene=trim_gene($gene,$omim);
- $re_gene=~s/;/, /g;
- my $c_type=$ratio>1? '重复': '缺失';
- $gene=~s/^gene://;
- $gr=~s/^geneReview://;
- $dp=~s/^Decipher://;
- $cc=~s/^clinvarCnv://;
- $omim=~s/^OMIM://;
- $hgmd=~s/^HGMD://;
- my $sv_type='';
- if( $gender eq 'M' and ($chr=~/X/ or $chr=~/Y/) ){
- $sv_type= $ratio>1 ? '重复' : '半合子缺失';
- }elsif($ratio<0.2){
- $sv_type='纯合缺失';
- }else{
- $sv_type= $ratio>1 ? '重复' : '杂合缺失';
- }
- $chr= $chr=~/chr/ ? $chr : 'chr'.$chr ;
- print $w_fh join("\t",$hgvs,$chr,$start,$end,$sv_type,$tlen,$re_gene,$c_type,
- $cnv_map,$cnv_gc,$rank,$nprobe,$sample,$gender,$ratio,$band,$gene,$gr,
- $dp,$cc,$omim,$hgmd),"\n";
- }
- close($anno_fh);
- sub cnv_hgvs{
- my ($chr,$start,$end, $band, $type,$ref)=@_;
- unless(defined $ref){
- $ref='GRCh37'
- }
- $chr=~s/chr//;
- $band=~s/^[^qp]+([pq])/$1/;
- return "seq[$ref] $type($chr)($band)chr$chr:g.$start".'_'."$end$type";
- }
- sub trim_len{
- my $len=shift;
- $len=int($len);
- if($len <1000){
- return $len.'bp';
- }elsif($len <1000000){
- my $len_kb=sprintf("%.1f", $len/1000);
- return "约".$len_kb.'kb';
- }elsif($len < 1000000000){
- my $len_mb=sprintf("%.1f", $len/1000000);
- return "约".$len_mb.'M' ;
- }else{
- return 'NA';
- }
- }
- sub trim_gene{
- my ($genes, $omim)=@_;
- my ($num,$fgene);
- $genes=~s/gene://;
- $num=0;
- $fgene='';
- my $o_genes=omim_gene($omim);
- for my $gene(split /;/,$o_genes){
- next if $gene eq '-';
- $num++;
- if($num< $nb_gene+1){
- $fgene= $fgene ? $fgene.';'.$gene : $gene;
- }else{
- return $fgene."等";
- }
- $genes=~s/$gene;//;
- }
- my $re_gene=$fgene.';'.$genes;
- $num=0;
- $fgene='';
- for my $gene(split /;/,$re_gene){
- next unless $gene;
- $num++;
- return $fgene."等" if $num==$nb_gene+1;
- $fgene= $fgene ? $fgene.';'.$gene : $gene;
- }
- return $fgene;
- }
- sub omim_gene{
- my $omim=shift;
- my ($genes,%gen);
- return '-' unless defined $omim;
- $omim=~s/OMIM://;
- $omim=~s/;\s*$//;
- return '' if$omim eq '-';
- for my $ann (split /;/, $omim ){
- my ($gene)=(split /&/,$ann)[-1];
- unless(exists $gen{$gene}){
- $genes= $genes ? $genes.';'.$gene: $gene;
- $gen{$gene}=undef;
- }
- }
- return $genes;
- }
|