123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116 |
- #!/usr/bin/perl
- use strict;
- use warnings;
- use File::Basename;
- # gcq@20211019
- # Count (TG)mTn For CFTR (hg19 chr7:117188660-117188680)
- # REF: (TG)11T7
- # left seq TCATCTTTTATTTTTGA
- # right seq AACAGGGATTTGGGGA
- #
- # To Be DONE:
- # Ajust For sequencing errors, ex. STR error
- # Ajust For STR Length/Counts, ex. more repeats ,less count
- my $Rlseq="TTTTGA";
- my $Rrseq="AACAGG";
- my $tot=0;
- my (%dat);
- my $bam=shift;
- my $sample=basename($bam,".bam");
- my $chrID;
- open my $fhead, "samtools view -H $bam |" or die "Cannot open pipe:$!\n";
- while(<$fhead>){
- if(/LN:159138663/){
- my $chr=$1 if /SN:([a-z]*7)\s+/;
- $chrID=$chr.":117188660-117188670";
- }
- if(/LN:159345973/){
- my $chr=$1 if /SN:([a-z]*7)\s+/;
- $chrID=$chr.":117548606-117548617"
- }
- }
- close($fhead);
- exit unless $chrID;
- open my $fh, "samtools view $bam $chrID |" or die "Cannot open pipe:$!\n";
- while(<$fh>){
- chomp;
- my ($id, $flag, $mapq, $seq)=(split /\t/)[0, 1, 4, 9];
- next if $flag & 1024 or $flag & 2048;
- next if $flag & 256;
- next unless $seq=~/([ATCG]{6})((?:TG){6,}T{3,})([ATCG]{6})/;
- my ($lseq,$strseq,$rseq)=$seq=~/([ATCG]{6})((?:TG){6,}T{3,})([ATCG]{6})/;
- next if $lseq =~/TG$/;
- next if $rseq =~/^T/;
-
- my $ldist=distance($lseq, $Rlseq);
- my $rdist=distance($rseq, $Rrseq);
- # next if $rdist>1 or $ldist>1;
- next if $rdist + $ldist> 2;
- $tot++;
- $dat{$strseq}++;
- }
- print join("\t","Sample","GT","Depth","GT1","VAF1","GT2","VAF2"),"\n";
- print $sample,"\t";
- my $gts='';
- for my $seq(sort { $dat{$b} <=> $dat{$a}} keys %dat){
- my ($tgseq)=$seq=~/^((?:TG)+)/;
- my ($tseq)=$seq=~/(T+)$/;
- my $tg_len=length($tgseq)/2;
- my $t_len=length($tseq);
- next if $dat{$seq}<$tot/20;
- my $gts_info=join("\t","(TG)$tg_len(T)$t_len",sprintf("%.2f",$dat{$seq}/$tot));
- $gts=$gts? $gts."\t".$gts_info:$gts_info;
- }
- my $gt=get_GT($gts);
- print join("\t",$gt,$tot,$gts),"\n";
- sub distance{
- my ($s1,$s2) = @_;
- my ($len1,$len2) = (length $s1,length $s2);
- return $len2 if ($len1 == 0);
- return $len1 if ($len2 == 0);
-
- my %mat;
- for (my $i = 0; $i <= $len1; ++$i){
- for (my $j = 0; $j <= $len2; ++$j){
- $mat{$i}{$j} = 0;
- $mat{0}{$j} = $j;
- }
- $mat{$i}{0} = $i;
- }
- my @ar1 = split(//,$s1);
- my @ar2 = split(//,$s2);
-
- for (my $i = 1; $i <= $len1; ++$i){
- for (my $j = 1; $j <= $len2; ++$j){
- my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;
- $mat{$i}{$j} = min($mat{$i-1}{$j} + 1,$mat{$i}{$j-1} + 1,$mat{$i-1}{$j-1} + $cost);
- }
- }
- return $mat{$len1}{$len2};
- }
- sub min {
- my @out = sort {$a <=> $b} @_;
- return($out[0]);
- }
- sub get_GT{
- my $GTs=shift;
- my $gt='';
- my @cont=split /\t/,$GTs;
- if($cont[1]>0.8 or $#cont<=1){
- return $cont[0].'/'.$cont[0];
- }elsif($cont[1]/$cont[3] > 3){
- return $cont[0].'/'.$cont[0];
- }else{
- $gt=$cont[0].'/'.$cont[2];
- }
- return join("/", sort(split /\//,$gt));
- }
|