count_TGmTn.pl 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. #!/usr/bin/perl
  2. use strict;
  3. use warnings;
  4. use File::Basename;
  5. # gcq@20211019
  6. # Count (TG)mTn For CFTR (hg19 chr7:117188660-117188680)
  7. # REF: (TG)11T7
  8. # left seq TCATCTTTTATTTTTGA
  9. # right seq AACAGGGATTTGGGGA
  10. #
  11. # To Be DONE:
  12. # Ajust For sequencing errors, ex. STR error
  13. # Ajust For STR Length/Counts, ex. more repeats ,less count
  14. my $Rlseq="TTTTGA";
  15. my $Rrseq="AACAGG";
  16. my $tot=0;
  17. my (%dat);
  18. my $bam=shift;
  19. my $sample=basename($bam,".bam");
  20. my $chrID;
  21. open my $fhead, "samtools view -H $bam |" or die "Cannot open pipe:$!\n";
  22. while(<$fhead>){
  23. if(/LN:159138663/){
  24. my $chr=$1 if /SN:([a-z]*7)\s+/;
  25. $chrID=$chr.":117188660-117188670";
  26. }
  27. if(/LN:159345973/){
  28. my $chr=$1 if /SN:([a-z]*7)\s+/;
  29. $chrID=$chr.":117548606-117548617"
  30. }
  31. }
  32. close($fhead);
  33. exit unless $chrID;
  34. open my $fh, "samtools view $bam $chrID |" or die "Cannot open pipe:$!\n";
  35. while(<$fh>){
  36. chomp;
  37. my ($id, $flag, $mapq, $seq)=(split /\t/)[0, 1, 4, 9];
  38. next if $flag & 1024 or $flag & 2048;
  39. next if $flag & 256;
  40. next unless $seq=~/([ATCG]{6})((?:TG){6,}T{3,})([ATCG]{6})/;
  41. my ($lseq,$strseq,$rseq)=$seq=~/([ATCG]{6})((?:TG){6,}T{3,})([ATCG]{6})/;
  42. next if $lseq =~/TG$/;
  43. next if $rseq =~/^T/;
  44. my $ldist=distance($lseq, $Rlseq);
  45. my $rdist=distance($rseq, $Rrseq);
  46. # next if $rdist>1 or $ldist>1;
  47. next if $rdist + $ldist> 2;
  48. $tot++;
  49. $dat{$strseq}++;
  50. }
  51. print join("\t","Sample","GT","Depth","GT1","VAF1","GT2","VAF2"),"\n";
  52. print $sample,"\t";
  53. my $gts='';
  54. for my $seq(sort { $dat{$b} <=> $dat{$a}} keys %dat){
  55. my ($tgseq)=$seq=~/^((?:TG)+)/;
  56. my ($tseq)=$seq=~/(T+)$/;
  57. my $tg_len=length($tgseq)/2;
  58. my $t_len=length($tseq);
  59. next if $dat{$seq}<$tot/20;
  60. my $gts_info=join("\t","(TG)$tg_len(T)$t_len",sprintf("%.2f",$dat{$seq}/$tot));
  61. $gts=$gts? $gts."\t".$gts_info:$gts_info;
  62. }
  63. my $gt=get_GT($gts);
  64. print join("\t",$gt,$tot,$gts),"\n";
  65. sub distance{
  66. my ($s1,$s2) = @_;
  67. my ($len1,$len2) = (length $s1,length $s2);
  68. return $len2 if ($len1 == 0);
  69. return $len1 if ($len2 == 0);
  70. my %mat;
  71. for (my $i = 0; $i <= $len1; ++$i){
  72. for (my $j = 0; $j <= $len2; ++$j){
  73. $mat{$i}{$j} = 0;
  74. $mat{0}{$j} = $j;
  75. }
  76. $mat{$i}{0} = $i;
  77. }
  78. my @ar1 = split(//,$s1);
  79. my @ar2 = split(//,$s2);
  80. for (my $i = 1; $i <= $len1; ++$i){
  81. for (my $j = 1; $j <= $len2; ++$j){
  82. my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;
  83. $mat{$i}{$j} = min($mat{$i-1}{$j} + 1,$mat{$i}{$j-1} + 1,$mat{$i-1}{$j-1} + $cost);
  84. }
  85. }
  86. return $mat{$len1}{$len2};
  87. }
  88. sub min {
  89. my @out = sort {$a <=> $b} @_;
  90. return($out[0]);
  91. }
  92. sub get_GT{
  93. my $GTs=shift;
  94. my $gt='';
  95. my @cont=split /\t/,$GTs;
  96. if($cont[1]>0.8 or $#cont<=1){
  97. return $cont[0].'/'.$cont[0];
  98. }elsif($cont[1]/$cont[3] > 3){
  99. return $cont[0].'/'.$cont[0];
  100. }else{
  101. $gt=$cont[0].'/'.$cont[2];
  102. }
  103. return join("/", sort(split /\//,$gt));
  104. }