dedep_bed.pl 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. #!/usr/bin/perl
  2. use strict;
  3. use warnings;
  4. use List::Util qw/max min /;
  5. use Data::Dumper;
  6. my $BIN=120;
  7. my (%ids);
  8. # my ($a,$b)= get_pad(105,200,$BIN);
  9. # print $b-$a, " $a, $b \n";
  10. # exit;
  11. my $bedf=shift;
  12. my $out=$bedf;
  13. $out=~s/\.bed/.uniq120.bed/;
  14. open my $bedfh, '<', $bedf or die "Cannot open file: $bedf $! \n";
  15. open my $wfh , '>' , $out or die "Cannot write file: $! \n";
  16. my ($pos, $str, $strand)=('', '', '');
  17. while(<$bedfh>){
  18. chomp;
  19. # last if $. > 50;
  20. my ($chr, $start, $end, $id, $qual , $plus)=split /\t/,$_;
  21. my $posi=join("\t",$chr,$start, $end );
  22. LABEL: my $over=0;
  23. $over=is_overlap($pos, $posi) if $pos;
  24. if($pos eq ''){
  25. $strand=$plus;
  26. $pos=$posi;
  27. $str=$id;
  28. }elsif($pos eq $posi){
  29. $str= length($str) > length($id) ? $id : $str;
  30. }elsif( $over ){
  31. my ($m_chr, $m_start, $m_end)=split /\t/,$pos;
  32. if($end <= $m_end){
  33. my ($last_id)=(split /,/,$str)[-1];
  34. $str=~s/$last_id$/$id/ if length($id) < length($last_id);
  35. }else{
  36. my ($f, $r)=split /_/,$over;
  37. $pos=join("\t",$chr,$f, $r );
  38. $str.=','.$id;
  39. $strand=$plus;
  40. }
  41. }else{
  42. my ($chr,$merge_s, $merge_e)=split /\t/, $pos;
  43. my ($pad_s,$pad_e)=get_pad($merge_s, $merge_e,$BIN);
  44. # print $pad_e-$pad_s," $.:$pad_s, $pad_e \n";
  45. if($str=~/,/){
  46. my @strs=split /,/,$str;
  47. for my $i(0..$#strs){
  48. my $id=$strs[$i];
  49. my $num=0 ;
  50. while(exists $ids{$id}){
  51. if($id=~/\.\d+$/){
  52. $id=~s/\.\d+$/.$num/;
  53. }else{
  54. $id=$id.'.'.$num;
  55. }
  56. $num++;
  57. }
  58. $ids{$id}=undef;
  59. if($i<$#strs){
  60. print $wfh join("\t", $chr, $pad_s, $pad_s+$BIN, $id, 1,$strand),"\n" ;
  61. }else{
  62. print $wfh join("\t", $chr, $pad_s, $pad_e, $id, 1,$strand),"\n" ;
  63. }
  64. $pad_s+=$BIN;
  65. last if $pad_s >= $pad_e;
  66. }
  67. }else{
  68. my $id=$str;
  69. my $num=0 ;
  70. while(exists $ids{$id}){
  71. if($id=~/\.\d+$/){
  72. $id=~s/\.\d+$/.$num/;
  73. }else{
  74. $id=$id.'.'.$num;
  75. }
  76. $num++;
  77. }
  78. $ids{$id}=undef;
  79. print $wfh join("\t", $chr, $pad_s, $pad_e, $id, 0,$strand),"\n" ;
  80. }
  81. last if eof($bedfh) and $posi eq "N:1-2";
  82. $strand=$plus;
  83. $pos=$posi;
  84. $str=$id;
  85. }
  86. if(eof($bedfh) ){
  87. $posi="N:1-2";
  88. goto LABEL;
  89. }
  90. last if eof($bedfh);
  91. }
  92. sub is_overlap{
  93. my ($fstr, $sstr)=@_;
  94. my ($f_chr, $f_start, $f_end)=split /\t/, $fstr;
  95. my ($s_chr, $s_start, $s_end)=split /\t/, $sstr;
  96. return 0 unless $f_chr eq $s_chr;
  97. my $max =max($f_start, $f_end, $s_start, $s_end) ;
  98. my $min =min($f_start, $f_end, $s_start, $s_end) ;
  99. if($max-$min < $f_end - $f_start + $s_end - $s_start +0.2 *$BIN){
  100. return $min.'_'.$max;
  101. }else{
  102. return 0;
  103. }
  104. }
  105. sub get_pad{
  106. my ($start,$end,$bin)=@_;
  107. my $num=($end-$start)/$bin;
  108. $num=int($num);
  109. my $length=($num*$bin-($end-$start))/2;
  110. return(int($start-$length),int($end+$length));
  111. }