123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137 |
- #!/usr/bin/perl
- use strict;
- use warnings;
- use utf8;
- #binmode(STDOUT,":encoding(gbk)");
- use open ":encoding(gbk)",":std";
- my $nb_gene=3;
- my $prefix=shift;
- my $inf= -e $prefix.'-M-cnv.input.xls' ? $prefix.'-M-cnv.input.xls' : $prefix.'-F-cnv.input.xls';
- my $annof=$prefix.'.cnv.anno.xls';
- my (%data);
- open my $in_fh, '<', $inf or die "Cannot open file:$!";
- while(<$in_fh>){
- next if /^Sample/;
- my ($chr,$start,$end,$n_probe)=(split /\t/)[2,3,4,5];
- my $id=join("\t",$chr,$start,$end);
- $data{$id}=$n_probe;
- }
- close($in_fh);
- open my $anno_fh, '<' , $annof or die "Cannot open file:$!";
- open my $w_fh, '>',$prefix.'.CNV.xls' or die "Cannot write file:$!" ;
- while(<$anno_fh>){
- chomp;
- if(/^Sample/){
- print $w_fh join("\t", "检出变异\tChr\tStart\tEnd\t变异类型\t变异长度\t包含基因\t重复/缺失",
- 'Num_Probes',"Sample\tGender\tCopy-ratio",
- "Band_region\tGene\tGeneReview\tDECIPHER\tClinVarCNV\tOMIM\tHGMD"),"\n";
- next;
- }
- #my ($chr,$start,$end,$n_probe)=(split /\t/)[2,3,4,5];
- my ($sample,$gender,$chr,$start,$end,$len,$ratio,
- $band,$gene,$gr,$dp,$cc,$omim,$hgmd)=split /\t/;
- my $id=join("\t",$chr,$start,$end);
- my $nprobe=$data{$id};
-
- my $cnv_type=$ratio>1? 'dup': 'del';
- my $hgvs=cnv_hgvs($chr,$start,$end,$band,$cnv_type);
- my $tlen=trim_len($len);
- my $re_gene=trim_gene($gene,$omim);
- $re_gene=~s/;/, /g;
- my $c_type=$ratio>1? '重复': '缺失';
- $gene=~s/^gene://;
- $gr=~s/^geneReview://;
- $dp=~s/^Decipher://;
- $cc=~s/^clinvarCnv://;
- $omim=~s/^OMIM://;
- $hgmd=~s/^HGMD://;
- my $sv_type='';
- if( $gender eq 'M' and ($chr=~/X/ or $chr=~/Y/) ){
- $sv_type= $ratio>1 ? '重复' : '半合子缺失';
- }elsif($ratio<0.2){
- $sv_type='纯合缺失';
- }else{
- $sv_type= $ratio>1 ? '重复' : '杂合缺失';
- }
- $chr= $chr=~/chr/ ? $chr : 'chr'.$chr ;
- print $w_fh join("\t",$hgvs,$chr,$start,$end,$sv_type,$tlen,$re_gene,$c_type,
- $nprobe,$sample,$gender,$ratio,$band,$gene,$gr,$dp,$cc,$omim,$hgmd),"\n";
- }
- close($anno_fh);
- sub cnv_hgvs{
- my ($chr,$start,$end, $band, $type,$ref)=@_;
- unless(defined $ref){
- $ref='GRCh37'
- }
- $chr=~s/chr//;
- $band=~s/^[^qp]+([pq])/$1/;
- return "seq[$ref] $type($chr)($band)chr$chr:g.$start".'_'."$end$type";
- }
- sub trim_len{
- my $len=shift;
- $len=int($len);
- if($len <1000){
- return $len.'bp';
- }elsif($len <1000000){
- my $len_kb=sprintf("%.1f", $len/1000);
- return "约".$len_kb.'kb';
- }elsif($len < 1000000000){
- my $len_mb=sprintf("%.1f", $len/1000000);
- return "约".$len_mb.'M' ;
- }else{
- return 'NA';
- }
- }
- sub trim_gene{
- my ($genes, $omim)=@_;
- my ($num,$fgene);
- $genes=~s/gene://;
- $num=0;
- $fgene='';
- my $o_genes=omim_gene($omim);
- for my $gene(split /;/,$o_genes){
- next if $gene eq '-';
- $num++;
- if($num< $nb_gene+1){
- $fgene= $fgene ? $fgene.';'.$gene : $gene;
- }else{
- return $fgene."等";
- }
- $genes=~s/$gene;//;
- }
- my $re_gene=$fgene.';'.$genes;
- $num=0;
- $fgene='';
- for my $gene(split /;/,$re_gene){
- next unless $gene;
- $num++;
- return $fgene."等" if $num==$nb_gene+1;
- $fgene= $fgene ? $fgene.';'.$gene : $gene;
- }
- return $fgene;
- }
- sub omim_gene{
- my $omim=shift;
- my ($genes,%gen);
- return '-' unless defined $omim;
- $omim=~s/OMIM://;
- $omim=~s/;\s*$//;
- return '' if$omim eq '-';
- for my $ann (split /;/, $omim ){
- my ($gene)=(split /&/,$ann)[-1];
- unless(exists $gen{$gene}){
- $genes= $genes ? $genes.';'.$gene: $gene;
- $gen{$gene}=undef;
- }
- }
- return $genes;
- }
|