Perl Working On Two Hash References - perl

I would like to compare the values of two hash references.
The data dumper of my first hash is this:
$VAR1 = {
'42-MG-BA' => [
{
'chromosome' => '19',
'position' => '35770059',
'genotype' => 'TC'
},
{
'chromosome' => '2',
'position' => '68019584',
'genotype' => 'G'
},
{
'chromosome' => '16',
'position' => '9561557',
'genotype' => 'G'
},
And the second hash is similar to this but with more hashes in the array. I would like to compare the genotype of my first and second hash if the position and the choromosome matches.
map {print "$_= $cave_snp_list->{$_}->[0]->{chromosome}\n"}sort keys %$cave_snp_list;
map {print "$_= $geno_seq_list->{$_}->[0]->{chromosome}\n"}sort keys %$geno_seq_list;
I could do that for the first array of the hashes.
Could you help me in how to work for all the arrays?
This is my actual code in full
#!/software/bin/perl
use strict;
use warnings;
use Getopt::Long;
use Benchmark;
use Config::Config qw(Sequenom.ini);
useDatabase::Conn;
use Data::Dumper;
GetOptions("sam=s" => \my $sample);
my $geno_seq_list = getseqgenotypes($sample);
my $cave_snp_list = getcavemansnpfile($sample);
#print Dumper($geno_seq_list);
print scalar %$geno_seq_list, "\n";
foreach my $sam (keys %{$geno_seq_list}) {
my $seq_used = $geno_seq_list->{$sam};
my $cave_used = $cave_snp_list->{$sam};
print scalar(#$geno_seq_list->{$_}) if sort keys %$geno_seq_list, "\n";
print scalar(#$cave_used), "\n";
#foreach my $seq2com (# {$seq_used } ){
# foreach my $cave2com( # {$cave_used} ){
# print $seq2com->{chromosome},":" ,$cave2com->{chromosome},"\n";
# }
#}
map { print "$_= $cave_snp_list->{$_}->[0]->{chromosome}\n" } sort keys %$cave_snp_list;
map { print "$_= $geno_seq_list->{$_}->[0]->{chromosome}\n" } sort keys %$geno_seq_list;
}
sub getseqgenotypes {
my $snpconn;
my $gen_list = {};
$snpconn = Database::Conn->new('live');
$snpconn->addConnection(DBI->connect('dbi:Oracle:pssd.world', 'sn', 'ss', { RaiseError => 1, AutoCommit => 0 }),
'pssd');
#my $conn2 =Database::Conn->new('live');
#$conn2->addConnection(DBI->connect('dbi:Oracle:COSI.world','nst_owner','nst_owner', {RaiseError =>1 , AutoCommit=>0}),'nst');
my $id_ind = $snpconn->execute('snp::Sequenom::getIdIndforExomeSample', $sample);
my $genotype = $snpconn->executeArrRef('snp::Sequenom::getGenotypeCallsPosition', $id_ind);
foreach my $geno (#{$genotype}) {
push #{ $gen_list->{ $geno->[1] } }, {
chromosome => $geno->[2],
position => $geno->[3],
genotype => $geno->[4],
};
}
return ($gen_list);
} #end of sub getseqgenotypes
sub getcavemansnpfile {
my $nstconn;
my $caveman_list = {};
$nstconn = Database::Conn->new('live');
$nstconn->addConnection(
DBI->connect('dbi:Oracle:CANP.world', 'nst_owner', 'NST_OWNER', { RaiseError => 1, AutoCommit => 0 }), 'nst');
my $id_sample = $nstconn->execute('nst::Caveman::getSampleid', $sample);
#print "IDSample: $id_sample\n";
my $file_location = $nstconn->execute('nst::Caveman::getCaveManSNPSFile', $id_sample);
open(SNPFILE, "<$file_location") || die "Error: Cannot open the file $file_location:$!\n";
while (<SNPFILE>) {
chomp;
next if /^>/;
my #data = split;
my ($nor_geno, $tumor_geno) = split /\//, $data[5];
# array of hash
push #{ $caveman_list->{$sample} }, {
chromosome => $data[0],
position => $data[1],
genotype => $nor_geno,
};
} #end of while loop
close(SNPFILE);
return ($caveman_list);
}

The problem that I see is that you're constructing a tree for generic storage of data, when what you want is a graph, specific to the task. While you are constructing the record, you could also be constructing the part that groups data together. Below is just one example.
my %genotype_for;
my $record
= { chromosome => $data[0]
, position => $data[1]
, genotype => $nor_geno
};
push #{ $gen_list->{ $geno->[1] } }, $record;
# $genotype_for{ position }{ chromosome }{ name of array } = genotype code
$genotype_for{ $data[1] }{ $data[0] }{ $sample } = $nor_geno;
...
return ( $caveman_list, \%genotype_for );
In the main line, you receive them like so:
my ( $cave_snp_list, $geno_lookup ) = getcavemansnpfile( $sample );
This approach at least allows you to locate similar position and chromosome values. If you're going to do much with this, I might suggest an OO approach.
Update
Assuming that you wouldn't have to store the label, we could change the lookup to
$genotype_for{ $data[1] }{ $data[0] } = $nor_geno;
And then the comparison could be written:
foreach my $pos ( keys %$small_lookup ) {
next unless _HASH( my $sh = $small_lookup->{ $pos } )
and _HASH( my $lh = $large_lookup->{ $pos } )
;
foreach my $chrom ( keys %$sh ) {
next unless my $sc = $sh->{ $chrom }
and my $lc = $lh->{ $chrom }
;
print "$sc:$sc";
}
}
However, if you had limited use for the larger list, you could construct the specific case
and pass that in as a filter when creating the longer list.
Thus, in whichever loop creates the longer list, you could just go
...
next unless $sample{ $position }{ $chromosome };
my $record
= { chromosome => $chromosome
, position => $position
, genotype => $genotype
};
...

Related

Grouping with Perl: finding a faster solution to recursion

The Perl code below works, but it doesn't scale well even with considerable computer resources. I hoping that someone can help me find more efficient code such as by replacing recursion with iteration, if that's the problem.
my data structure looks like this:
my %REV_ALIGN;
$REV_ALIGN{$dna}{$rna} = ();
Any dna key may have multiple rna sub keys. The same rna sub key may appear with multiple different dna keys. The purpose is to group rna ( transcripts ) based on shared dna sequence elements. For example, if dnaA has RNA1, RNA8, RNA9, and RNA4, and dnaB has RNA11, RNA4, and RNA99, then we group all these transcripts together ( RNA1, RNA9, RNA4, RNA11, RNA99 ) and continue to proceed to try and add to the group by selecting other dna. My recusive solution to this problem works but doesn't scale so well when using data from whole genome to transcriptome alignment.
SO MY QUESTION IS: WHAT IS A MORE EFFICIENT SOLUTION TO THIS PROBLEM? THANK YOU VERY MUCH
my #groups;
while ( my $x =()= keys %REV_ALIGN )
{
my #DNA = keys %REV_ALIGN;
my $dna = shift #DNA;
# the corresponding list of rna
my #RNA = keys %{$REV_ALIGN{$dna}};
delete $REV_ALIGN{$dna};
if ( $x == 1 )
{
push #groups, \#RNA;
last;
}
my $ref = group_transcripts ( \#RNA, \%REV_ALIGN );
push #groups, $ref;
}
sub group_transcripts
{
my $tran_ref = shift;
my $align_ref = shift;
my #RNA_A = #$tran_ref;
my %RNA;
# create a null hash with seed list of transcripts
#RNA{#RNA_A} = ();
# get a list of all remaining dna sequences in the alignment
my #DNA = keys %{$align_ref};
my %count;
# select a different list of transcripts
for my $dna ( #DNA )
{
next unless exists $align_ref->{$dna};
my #RNA_B = keys %{$align_ref->{$dna}};
# check to see two list share and transcripts
for my $element ( #RNA_A, #RNA_B )
{
$count{$element}++;
}
for my $rna_a ( keys %count )
{
# if they do, add any new transcripts to the current group
if ( $count{$rna_a} == 2 )
{
for my $rna_b ( #RNA_B )
{
push #RNA_A, $rna_b if $count{$rna_b} == 1;
}
delete $align_ref->{$dna};
delete $count{$_} foreach keys %count;
# recurse to try and continue adding to list
#_ = ( \#RNA_A, $align_ref );
goto &group_transcripts;
}
}
delete $count{$_} foreach keys %count;
}
# if no more transcripts can be added, return a reference to the group
return \#RNA_A;
}
You have a loops nested four deep. It's an pretty safe bet that's why your code scales poorly.
If I understand correctly what you are trying to accomplish, the input
my %REV_ALIGN = (
"DNA1" => { map { $_ => undef } "RNA1", "RNA2" }, # \ Linked by RNA1 \
"DNA2" => { map { $_ => undef } "RNA1", "RNA3" }, # / \ Linked by RNA3 > Group
"DNA3" => { map { $_ => undef } "RNA3", "RNA4" }, # / /
"DNA4" => { map { $_ => undef } "RNA5", "RNA6" }, # \ Linked by RNA5 \ Group
"DNA5" => { map { $_ => undef } "RNA5", "RNA7" }, # / /
"DNA6" => { map { $_ => undef } "RNA8" }, # > Group
);
should result in
my #groups = (
[
dna => [ "DNA1", "DNA2", "DNA3" ],
rna => [ "RNA1", "RNA2", "RNA3", "RNA4" ],
],
[
dna => [ "DNA4", "DNA5" ],
rna => [ "RNA5", "RNA6", "RNA7" ],
],
[
dna => [ "DNA6" ],
rna => [ "RNA8" ],
],
);
If so, you can use the following:
use strict;
use warnings;
use Graph::Undirected qw( );
my %REV_ALIGN = (
"DNA1" => { map { $_ => undef } "RNA1", "RNA2" },
"DNA2" => { map { $_ => undef } "RNA1", "RNA3" },
"DNA3" => { map { $_ => undef } "RNA3", "RNA4" },
"DNA4" => { map { $_ => undef } "RNA5", "RNA6" },
"DNA5" => { map { $_ => undef } "RNA5", "RNA7" },
"DNA6" => { map { $_ => undef } "RNA8" },
);
my $g = Graph::Undirected->new();
for my $dna (keys(%REV_ALIGN)) {
for my $rna (keys(%{ $REV_ALIGN{$dna} })) {
$g->add_edge("dna:$dna", "rna:$rna");
}
}
my #groups;
for my $raw_group ($g->connected_components()) {
my %group = ( dna => [], rna => [] );
for (#$raw_group) {
my ($type, $val) = split(/:/, $_, 2);
push #{ $group{$type} }, $val;
}
push #groups, \%group;
}
use Data::Dumper qw( Dumper );
print(Dumper(\#groups));
If you just want the RNA, the final section simplifies to the following:
my #groups;
for my $raw_group ($g->connected_components()) {
my #group;
for (#$raw_group) {
my ($type, $val) = split(/:/, $_, 2);
push #group, $val if $type eq 'rna';
}
push #groups, \#group;
}

How can I find which keys in a Perl multi-level hash correspond to a given value?

I have a data structure which looks like this:
my %hoh = (
'T431567' => {
machin => '01',
bidule => '02',
truc => '03',
},
'T123456' => {
machin => '97',
bidule => '99',
truc => '69',
},
'T444444' => {
machin => '12',
bidule => '64',
truc => '78',
},
);
I want to search the various values of truc for a particular value and find the top-level attribute which corresponds to that entry. For example, looking for a value of 78, I want to find the result 'T444444', because $hoh{T444444}{truc} is 78.
How can I do this, please?
You can do this with grep:
my #keys = grep { $hoh{$_}{truc} == 78 } keys %hoh;
Note that this can return more than one key, if there are duplicate values in the hash. Also note that this is not particularly efficient, since it has to search the entire hash. In most cases it's probably fine, but if the hash can be very large and you may need to run lots of such queries against it, it may be more efficient to build a reverse index as suggested by Sobrique:
my %trucs;
foreach my $part (keys %hoh) {
my $val = $hoh{$part}{truc};
push #{ $trucs{$val} }, $part;
}
my #keys = #{ $trucs{78} };
or, more generally:
my %index;
foreach my $part (keys %hoh) {
my %data = %{ $hoh{$part} };
foreach my $key (keys %data) {
my $val = $data{$key};
push #{ $index{$key}{$val} }, $part;
}
}
my #keys = #{ $index{truc}{78} };
Can't with that data structure as is - There is no 'backwards' relationship from value to key without you creating it.
You've two options - run a search, or create an 'index'. Practically speaking, these are the same, just one saves the results.
my %index;
foreach my $key ( keys %hoh ) {
my $truc = $hoh{$key}{'truc'};
$index{$truc} = $key;
}
Note - won't do anything clever if the 'truc' numbers are duplicated - it'll overwrite. (Handling this is left as an exercise to the reader).
This solution is similar to those already posted, but it uses the each operator to process the original hash in fewer lines of code, and probably more quickly.
I have added the dump output only so that you can see the form of the structure that is built.
use strict;
use warnings;
my %hoh = (
T123456 => { bidule => '99', machin => '97', truc => '69' },
T431567 => { bidule => '02', machin => '01', truc => '03' },
T444444 => { bidule => '64', machin => '12', truc => '78' },
);
my %trucs;
while ( my ($key, $val) = each %hoh ) {
next unless defined( my $truc = $val->{truc} );
push #{ $trucs{$truc} }, $key ;
}
use Data::Dump;
dd \%trucs;
print "\n";
print "$_\n" for #{ $trucs{78} };
output
{ "03" => ["T431567"], "69" => ["T123456"], "78" => ["T444444"] }
T444444
If you can guarantee that the answer is unique, i.e. that there is never more than one element of the original hash that has a given value for the truc entry, or you are interested only in the last one found, then you can write this still more neatly
my %trucs;
while ( my ($key, $val) = each %hoh ) {
next unless defined( my $truc = $val->{truc} );
$trucs{$truc} = $key ;
}
print $trucs{78}, "\n";
output
T444444
Simplest of all, if there is always a truc entry in each second-level hash, and its values is guaranteed to be unique, then this will do the job
my %trucs = map { $hoh{$_}{truc} => $_ } keys %hoh;
print $trucs{78}, "\n";
with the output as above.

Printing Hash of Hash into a Matrix Table in Perl

I have a data structure like this:
#!/usr/bin/perl -w
my $hash = {
'abTcells' => {
'mesenteric_lymph_node' => {
'Itm2a' => '664.661',
'Gm16452' => '18.1425',
'Sergef' => '142.8205'
},
'spleen' => {
'Itm2a' => '58.07155',
'Dhx9' => '815.2795',
'Ssu72' => '292.889'
}
}
};
What I want to do is to print it out into this format:
mesenteric_lymph_node spleen
Itm2a 664.661 58.07155
Gm16452 18.1425 NA
Sergef 142.8205 NA
Dhx9 NA 815.2795
Ssu72 NA 292.889
What's the way to do it.
I'm currently stuck with the following code https://eval.in/44207
foreach my $ct (keys %{$hash}) {
print "$ct\n\n";
my %hash2 = %{$hash->{$ct}};
foreach my $ts (keys %hash2) {
print "$ts\n";
my %hash3 = %{$hash2{$ts}};
foreach my $gn (keys %hash3) {
print "$gn $hash3{$gn}\n";
}
}
}
Use Text::Table for output. Beautify to taste.
#!/usr/bin/env perl
use strict;
use warnings;
use Text::Table;
my $hash = {
'abTcells' => {
'mesenteric_lymph_node' => {
'Itm2a' => '664.661',
'Gm16452' => '18.1425',
'Sergef' => '142.8205'
},
'spleen' => {
'Itm2a' => '58.07155',
'Dhx9' => '815.2795',
'Ssu72' => '292.889'
}
}
};
my $struct = $hash->{abTcells};
my #cols = sort keys %{ $struct };
my #rows = sort keys %{ { map {
my $x = $_;
map { $_ => undef }
keys %{ $struct->{$x} }
} #cols } };
my $tb = Text::Table->new('', #cols);
for my $r (#rows) {
$tb->add($r, map $struct->{$_}{$r} // 'NA', #cols);
}
print $tb;
Output:
mesenteric_lymph_node spleen
Dhx9 NA 815.2795
Gm16452 18.1425 NA
Itm2a 664.661 58.07155
Sergef 142.8205 NA
Ssu72 NA 292.889
Now, the order of the rows above is different than the one you show because I wanted it to be consistent. If you know the set of all possible rows, then you can specify another order obviously.
First thing would be to separate out the two hashes:
my %lymph_node = %{ $hash->{abTcells}->{mesenteric_lymph_node} };
my %spleen = %{ $hash->{abTcells}->{spleen} };
Now, you have two separate hashes that contains the data you want.
What we need is a list of all the keys. Let's make a third hash that contains your keys.
my %keys;
map { $keys{$_} = 1; } keys %lymph_node, keys %spleen;
Now, we can go through all your keys and print the value for each of the two hashes. If one of the hashes doesn't have the data, we'll set it to NA:
for my $value ( sort keys %keys ) {
my $spleen_value;
my $lymph_nodes_value;
$spleen_value = exists $spleen{$value} ? $spleen{$value} : "NA";
$lymph_node_value = exists $lymph_node{$value} ? $lymph_node{$value} : "NA";
printf "%-20.20s %-9.5f %-9.5f\n", $key, $lymph_node_value, $spleen_value;
}
The printf statement is a nice way to tabularize data. You'll have to create the headings yourself. The ... ? ... : ... statement is an abbreviated if/then/else If the statement before the ? is true, then the value is the value between the ? and the :. Else, the value is the value after the :.
Both of your inner hashes have the same keys, So do a foreach on one of the hashes to get the key, and then print both.

Get value from hash of hashes

I would like to get value from hash of hashes but i do not. My code is :
sub test {
my $filename = $_[0];
open INFILE, ${filename} or die $!;
my %hashCount;
my #firstline = split('\t',<INFILE>);
shift(#firstline);
while (my $line = <INFILE>)
{
my %temp;
chomp($line);
my #line = split('\t', $line);
foreach my $cpt (1..$#line) {
$temp{$firstline[$cpt-1]}=$line[$cpt];
}
$hashCount{$line[0]}={%temp};
}
return %hashCount;
}
sub get_hash_of_hash {
my $h = shift;
foreach my $key (keys %$h) {
if( ref $h->{$key}) {
get_hash_of_hash( $h->{$key} );
}
else {
say $h->{$key};
}
}
}
And when i display my hash :
$VAR10679 = 'M00967_43_1106_2493_14707';
$VAR10680 = {
'A' => '1',
'B' => '0',
'C' => '1',
'D' => '0',
'E' => '0'
};
My first function return my hash of hashes and i get my specific value with the second function.
So I want to get value like that :
my %hashTest = test("FILE.txt");
get_hash_of_hash(%hashTest,"M00967_43_1106_2493_14707","A")
//return value '1'
You can either access nested elements like
$hash{keyA}{keyB}
or we can write a function that walks the data structure, like
sub walk {
my ($hashref, #keys) = #_;
my $pointer = $hashref;
for my $key (#keys) {
if (exists $pointer->{$key}) {
$pointer = $pointer->{$key};
} else {
die "No value at ", join "->", #keys;
}
}
return $pointer;
}
which can be used like
my %hash = (
'M00967_43_1106_2493_14707' => {
'A' => '1',
'B' => '0',
'C' => '1',
'D' => '0',
'E' => '0'
},
);
say walk(\%hash, 'M00967_43_1106_2493_14707', 'A');
Note: When using Data::Dumper, pass references to the Dump function:
print Dump \%hash; # not print Dump %hash
This is neccessary to show the correct data structure.
Your hash holds references to hashes.
You can access them like this:
$hashTest{'M00967_43_1106_2493_14707'}{'A'};
See perlref for more info
Use this subroutine..
sub get_hash_of_hash {
my $h = shift;
foreach my $key (keys %$h) {
if( ref $h->{$key}) {
get_hash_of_hash( $h->{$key} );
}
else {
print $h->{$key};
}
}
}

not able to access hash of hash of array values

I have written the following code in Perl. The code is reading a pdb file and getting some values. Ignore the top part of the code,where everything is working perfect.
Problem is in the sub-routine part, where I try to store arrays in the hash3 with model as key another key position
the array values can be accessed inside the if condition using this :
$hash3{$model}{$coordinates}[1].
but when I go out of all foreach loop and try to access the elements I only get one value.
Please look at the end foreach loop and tell me is it the wrong way to access the hash values.
The pdb file I am using can be downloaded from this link http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=pdb&compression=NO&structureId=1NZS
#!/usr/bin/perl
open(IN,$ARGV[0]);
my #phosphosites;
my $model=1;
my %hash3;
while(<IN>)
{
#findmod(#line);
#finddist;
#findfreq;
if((/^MODRES/) && (/PHOSPHO/))
{
#line=split;
push(#phosphosites, $line[2]);
#print "$line[4]";
}
foreach $elements (#phosphosites){
if(/^HETATM\s+\d+\s+CA\s+$i/)
{
#line1=split;
#print "$line1[5]";
#print "$line1[6] $line1[7] $line1[8]\n";
push(#phosphositesnum, $line1[5]);
}
}
$pos=$line1[5];
#findspatial(\#line,\#line1);
}
my #ori_data=removeDuplicates(#phosphositesnum);
sub removeDuplicates {
my %seen = ();
my #vals = ();
foreach my $i (#_) {
unless ($seen{$i}) {
push #vals, $i;
$seen{$i} = 1;
}
}
return #vals;
}
$a=(#phosphosites);
print "$a\n";
print "#phosphosites\n";
print "#ori_data\n";
close(IN);
open(IN1,$ARGV[0]);
my (#data)=<IN1>;
spatial(\#ori_data);
sub spatial {
my #spatial_array1=#{$_[0]};
foreach $coordinates(#spatial_array1)
{
$model=1;
{foreach $data1(#data){
if($data1=~ m/^HETATM\s+\d+\s+CA\s+[A-Z]*\s+[A-Z]*\s+$coordinates/)
{
#cordivals=split(/\s+/,$data1);
push #{ $sphash{$model} },[$cordivals[6], $cordivals[7], $cordivals[8]];
$hash3{$model}{$coordinates}= \#cordivals;
#print "$model $coordinates $hash3{$model}{$coordinates}[6] $hash3{$model}{$coordinates}[7] $hash3{$model}{$coordinates}[8]\n";
#print "$model $sphash{$model}[$i][0] $sphash{$model}[$i][1] $sphash{$model}[$i][2]\n";
}
elsif($data1=~ m/^ENDMDL/)
{
$model++;
}
#print "$model $coordinates $hash3{$model}{$coordinates}[6] $hash3{$model}{$coordinates}[7] $hash3{$model}{$coordinates}[8]\n";
}
}
}
#foreach $z1 (sort keys %hash3)
# {
# foreach $z2(#spatial_array1){
# print "$z1 $z2";
# print "$hash3{$z1}{$z2}[6]\n";
# print "$z2\n";
# }
# }
}
After using the Data::Dumper option it is giving me this kind of output
$VAR1 = {
'11' => {
'334' => [
'HETATM',
'115',
'CA',
'SEP',
'A',
'343',
'-0.201',
'-2.884',
'1.022',
'1.00',
'99.99',
'C'
],
'342' => $VAR1->{'11'}{'334'},
'338' => $VAR1->{'11'}{'334'},
'335' => $VAR1->{'11'}{'334'},
'340' => $VAR1->{'11'}{'334'},
'343' => $VAR1->{'11'}{'334'},
'336' => $VAR1->{'11'}{'334'}
},
'7' => {
'334' => $VAR1->{'11'}{'334'},
'342' => $VAR1->{'11'}{'334'},
'338' => $VAR1->{'11'}{'334'},
'335' => $VAR1->{'11'}{'334'},
'340' => $VAR1->{'11'}{'334'},
'343' => $VAR1->{'11'}{'334'},
'336' => $VAR1->{'11'}{'334'}
},
'2' => {
'334' => $VAR1->{'11'}{'334'},
'342' => $VAR1->{'11'}{'334'},
...
Change:
#cordivals=split(/\s+/,$data1);
to:
my #cordivals=split(/\s+/,$data1);
What seems to be happening is that all the hash elements contain references to the same array variable, because you're not making the variable local to that iteration.
In general, you should use my with all variables.