123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144 |
- #!/usr/bin/perl
- use strict;
- use warnings;
- use File::Find;
- use Getopt::Std;
- use File::Basename;
- my $usage="Usage:\n\tperl $0 outfile\n
- Options:\n
- -d input directory. default '.' .
- -q GQ,genotype quality. default 30.
- -x suffix,iSNP tab file ending with. default .isnp.tab.
- -s If Scan is required.True for YES,False for NO. default 1 (YES) . \n\n";
-
- my ($in_dir,$GQ,$scan,$suffix);
- my %opts = (d => '.', s=>1,q=>30, x=> '.isnp.tab');
- my %opts1;
- getopts(':d:q:s:x:', \%opts1);
- for my $key (keys %opts1){
- my $val=$opts1{$key};
- $opts{$key}=$val;
- }
- for my $key (keys %opts){
- my $val=$opts{$key};
- if($key eq 'd'){
- $in_dir=$val;
- print "Input directory: $val .\n";
- }
- if($key eq 'q'){
- $GQ=$val;
- print "Minimal genotype quality: $val .\n";
- }
- if($key eq 'x'){
- $suffix=$val;
- print "Input files suffix : $val .\n";
- }
- if($key eq 's'){
- $scan=$val;
- print "Directory Scan is required.\n" if $scan;
- print "Sinple glob $in_dir for VariantsToTable outputs.\n" unless $scan;
- }
- }
- my @files;
- if($scan){
- find(sub {
- push @files,$File::Find::name if -f and /$suffix$/;
- }, $in_dir.'/');
- # print "sanning...\n";
- }else{
- @files=<$in_dir/*$suffix>;
- # print "No sanning...\n";
- }
- unless (@ARGV){
- print $usage;
- exit;
- }
- my (@samples,%data,%diffs);
- my $out=shift;
- my @cols;
- for my $file (sort @files){
- my $sample=basename $file;
- $sample=~s/$suffix//;
- push @samples,$sample;
- open my $fh , '<', $file or die "Cannot open file:$!\n";
- while(<$fh>){
- chomp;
- if($.==1 and /^CHROM/){
- my @conts=split /\t/;
- for my $i(0..$#conts){
- push @cols,$i if $conts[$i] eq 'CHROM' or $conts[$i] eq 'POS' or $conts[$i] eq 'REF'
- or $conts[$i] eq 'ALT' or $conts[$i]=~/GT$/ or $conts[$i]=~/GQ$/;
- }
- next;
- }
- my ($chr,$pos,$ref,$alt,$call,$gq)=(split /\t/)[@cols];
- next if $gq eq 'NA';
- next if $gq<$GQ;
- my $id=join("\t",$chr,$pos,$ref,$alt);
- $data{$id}{$sample}=$call;
- }
- close($fh);
- }
- open my $wfh, '>' ,$out or die "Cannot write file:$!\n";
- print $wfh join("\t",'ID1','ID2','Z0','Z1','Z2','PI_HAT','Num_probe'),"\n";
- for my $i (0 .. $#samples-1 ){
- my $sam_i=$samples[$i];
- for my $j($i+1..$#samples){
-
- my $sam_j=$samples[$j];
- my $sam_id= $sam_i."\t".$sam_j;
- my ($diff0,$diff1,$diff2,$total)=(0,0,0,0);
- for my $id(keys %data){
- my $call_i=exists $data{$id}{$sam_i} ? $data{$id}{$sam_i} : '.' ;
- my $call_j=exists $data{$id}{$sam_j} ? $data{$id}{$sam_j} : '.';
- my $diff=call_diff($call_i,$call_j);
- if(defined $diff){
- if($diff ==2){
- $diff2++;
- }elsif($diff==1){
- $diff1++;
- }elsif($diff==0){
- $diff0++;
- }
- $total++;
- }
- }
- if($total){
- my $z1=sprintf("%.3f",$diff1/$total);
- my $z2=sprintf("%.3f",$diff2/$total);
- my $pi_hat=sprintf("%.3f",$z2+$z1/2);
- print $wfh join("\t", $sam_i, $sam_j, sprintf("%.3f",$diff0/$total), $z1,
- $z2,$pi_hat,$total),"\n";
- }else{
- print $wfh join("\t",$sam_i,$sam_j,0,0,0,0,$total),"\n";
- }
- }
- }
- sub call_diff{
- my ($call_i,$call_j)=@_;
- if ($call_i =~/\./ or $call_j =~/\./){
- return undef;
- }
-
- my ($pi,$mi)=split /\//, $call_i;
- my ($pj,$mj)=split /\//, $call_j;
-
- my $n_hom=1;
- $n_hom=2 if $pi eq $mi;
- $n_hom=2 if $pj eq $mj;
-
- my $match=0;
- for my $f($pi,$mi){
- for my $s($pj,$mj){
- $match++ if $f eq $s;
- }
- }
- return $match/$n_hom;
- }
|