#!/usr/bin/perl use strict; use warnings; use File::Basename; # gcq@20211019 # Count (TG)mTn For CFTR (hg19 chr7:117188660-117188680) # REF: (TG)11T7 # left seq TCATCTTTTATTTTTGA # right seq AACAGGGATTTGGGGA # # To Be DONE: # Ajust For sequencing errors, ex. STR error # Ajust For STR Length/Counts, ex. more repeats ,less count my $Rlseq="TTTTGA"; my $Rrseq="AACAGG"; my $tot=0; my (%dat); my $bam=shift; my $sample=basename($bam,".bam"); my $chrID; open my $fhead, "samtools view -H $bam |" or die "Cannot open pipe:$!\n"; while(<$fhead>){ if(/LN:159138663/){ my $chr=$1 if /SN:([a-z]*7)\s+/; $chrID=$chr.":117188660-117188670"; } if(/LN:159345973/){ my $chr=$1 if /SN:([a-z]*7)\s+/; $chrID=$chr.":117548606-117548617" } } close($fhead); exit unless $chrID; open my $fh, "samtools view $bam $chrID |" or die "Cannot open pipe:$!\n"; while(<$fh>){ chomp; my ($id, $flag, $mapq, $seq)=(split /\t/)[0, 1, 4, 9]; next if $flag & 1024 or $flag & 2048; next if $flag & 256; next unless $seq=~/([ATCG]{6})((?:TG){6,}T{3,})([ATCG]{6})/; my ($lseq,$strseq,$rseq)=$seq=~/([ATCG]{6})((?:TG){6,}T{3,})([ATCG]{6})/; next if $lseq =~/TG$/; next if $rseq =~/^T/; my $ldist=distance($lseq, $Rlseq); my $rdist=distance($rseq, $Rrseq); # next if $rdist>1 or $ldist>1; next if $rdist + $ldist> 2; $tot++; $dat{$strseq}++; } print join("\t","Sample","GT","Depth","GT1","VAF1","GT2","VAF2"),"\n"; print $sample,"\t"; my $gts=''; for my $seq(sort { $dat{$b} <=> $dat{$a}} keys %dat){ my ($tgseq)=$seq=~/^((?:TG)+)/; my ($tseq)=$seq=~/(T+)$/; my $tg_len=length($tgseq)/2; my $t_len=length($tseq); next if $dat{$seq}<$tot/20; my $gts_info=join("\t","(TG)$tg_len(T)$t_len",sprintf("%.2f",$dat{$seq}/$tot)); $gts=$gts? $gts."\t".$gts_info:$gts_info; } my $gt=get_GT($gts); print join("\t",$gt,$tot,$gts),"\n"; sub distance{ my ($s1,$s2) = @_; my ($len1,$len2) = (length $s1,length $s2); return $len2 if ($len1 == 0); return $len1 if ($len2 == 0); my %mat; for (my $i = 0; $i <= $len1; ++$i){ for (my $j = 0; $j <= $len2; ++$j){ $mat{$i}{$j} = 0; $mat{0}{$j} = $j; } $mat{$i}{0} = $i; } my @ar1 = split(//,$s1); my @ar2 = split(//,$s2); for (my $i = 1; $i <= $len1; ++$i){ for (my $j = 1; $j <= $len2; ++$j){ my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1; $mat{$i}{$j} = min($mat{$i-1}{$j} + 1,$mat{$i}{$j-1} + 1,$mat{$i-1}{$j-1} + $cost); } } return $mat{$len1}{$len2}; } sub min { my @out = sort {$a <=> $b} @_; return($out[0]); } sub get_GT{ my $GTs=shift; my $gt=''; my @cont=split /\t/,$GTs; if($cont[1]>0.8 or $#cont<=1){ return $cont[0].'/'.$cont[0]; }elsif($cont[1]/$cont[3] > 3){ return $cont[0].'/'.$cont[0]; }else{ $gt=$cont[0].'/'.$cont[2]; } return join("/", sort(split /\//,$gt)); }