I'm working with perl to make a script that will work with Dot products/assorted vector math. I've got a working script ( Still very much in progress/needs refinement ) that will do what I ask.
#!/usr/bin/perl
use strict;
use warnings;
use diagnostics;
use Math::Vector::Real;
use 5.010;
use Math::Trig;
my $source = "./IN";
my $out = "./OUT";
open(IN, '<', $source) or die "Couldn't open $source: $!\n";
open(OUT, '>', $out) or die "Couldn't open $out: $!\n";
my(#data);
while (<IN>) {
push #data, [ split ];
}
my $v;
my $u;
$v = V(0, 0, 0);
$u = V(0, 0, 0);
my $i = 0;
sub maths {
my $dot = $v * $u;
my $mag1 = (sqrt ((#$v[0])**2 + (#$v[1])**2 + (#$v[2])**2 ));
my $mag2 = (sqrt ((#$u[0])**2 + (#$u[1])**2 + (#$u[2])**2 ));
my $prefinal = acos( ( $dot ) / ( $mag1 * $mag2 ) );
my $degrees = ( ( 180 / 3.14159 ) * ( $prefinal ) );
return ($degrees);
}
my $ref1 = $data[$i][0];
my $ref2 = $data[$i][1];
my $ref3 = $data[$i][2];
my $ref4 = $data[$i+1][0];
my $ref5 = $data[$i+1][1];
my $ref6 = $data[$i+1][2];
$v->[0] = $ref1;
$v->[1] = $ref2;
$v->[2] = $ref3;
$u->[0] = $ref4;
$u->[1] = $ref5;
$u->[2] = $ref6;
my $result = maths;
print "$result\n";
A lot of stuff in the script is vestigial and for ease to follow (For me).
What I desire it to do, is to have the script rotate through each line of the input file and perform the calculations on it.
Something akin to having :
foreach $i (#data) {
my $ref1 = $data[$i][0];
my $ref2 = $data[$i][1];
my $ref3 = $data[$i][2];
my $ref4 = $data[$i+1][0];
my $ref5 = $data[$i+1][1];
my $ref6 = $data[$i+1][2];
$v->[0] = $ref1;
$v->[1] = $ref2;
$v->[2] = $ref3;
$u->[0] = $ref4;
$u->[1] = $ref5;
$u->[2] = $ref6;
my $result = maths;
print "$result\n";
}
Anything is appreciated.
As per my comment, most of the functionality of your program is provided by the Math::Vector::Real module that you're already using
It looks like you want the angle in degrees between successive pairs of 3D vectors in your file. This code creates vectors from each line in the file until there are two vectors in #pair, and then uses atan2 to calculate the angle between them. DEG_PER_RAD is precalculated as a constant from the value of atan2(1,1) which is π ÷ 4 by definition
#!/usr/bin/perl
use strict;
use warnings;
use Math::Vector::Real;
use constant DEG_PER_RAD => 45 / atan2(1, 1);
my ( $source, $out ) = qw/ IN OUT /;
open my $in_fh, '<', $source or die qq{Unable to open "$source" for input: $!\n};
open my $out_fh, '<', $out or die qq{Unable to open "$out" for output: $!\n};
select $out_fh;
my #pair;
while ( <$in_fh> ) {
push #pair, V(split);
if ( #pair == 2 ) {
my $degrees = atan2(#pair) * DEG_PER_RAD;
print "$degrees\n";
shift #pair;
}
}
Update
To do the calculation for every possible pair of vectors, it is simplest to read the whole of the file into an array of vectors and process them from there with two nested loops
This code demonstrates. Insert it after the select call in the original program. It labels each line of output with the two array indices that gave rise to the angle
my #data;
push #data, V(split) while <$in_fh>;
for my $i ( 0 .. $#data-1 ) {
for my $j ( $i+1 .. $#data ) {
my $degrees = atan2(#data[$i,$j]) * DEG_PER_RAD;
print "[$i,$j] $degrees\n";
}
}
You could use the following:
my $v = V( #{ $data[$i+0] } );
my $u = V( #{ $data[$i+1] } );
Cleaned up:
#!/usr/bin/perl
use strict;
use warnings;
use Math::Trig qw( acos );
use Math::Vector::Real qw( V );
sub maths {
my ($v, $u) = #_;
my $dot = $v * $u;
my $mag1 = sqrt( $v->[0]**2 + $v->[1]**2 + $v->[2]**2 );
my $mag2 = sqrt( $u->[0]**2 + $u->[1]**2 + $u->[2]**2 );
my $prefinal = acos( $dot / ( $mag1 * $mag2 ) );
my $degrees = ( 180 / 3.14159 ) * $prefinal;
return $degrees;
}
{
my #data = map { V( split ) } <>;
for my $i (0..$#data-1) {
for my $j ($i+1..$#data) {
my $maths = maths(#data[$i, $j]);
print("$i,$j: $maths\n");
}
}
}
Or with Borodin's alternate method:
#!/usr/bin/perl
use strict;
use warnings;
use Math::Trig qw( acos atan2 );
use Math::Vector::Real qw( V );
use constant DEG_PER_RAD => 45 / atan2(1, 1);
{
my #data = map { V( split ) } <>;
for my $i (0..$#data-1) {
for my $j ($i+1..$#data) {
my $maths = atan2(#data[$i, $j]) * DEG_PER_RAD;
print("$i,$j: $maths\n");
}
}
}
Usage:
perl script.pl ./IN >./OUT
Related
I'm trying to understand the piece of code below; I just cannot understand what is being done in line 15.
It seems like it is trying to initialise/assign to %heading but I am just not sure how that syntax works.
$strings = [qw(city state country language code )];
my $file = "fname";
my $fn = $strings;
my $c = 0;
open( FILEH, "< ${file}.txt" ) or die( $! );
while ( <FILEH> ) {
my %heading;
chomp;
$c++;
#heading{ ( #$fn, "One" ) } = split( /[|]/ ); # Line 15
if ( defined( $heading{"One"} ) ) {
my $One = $heading{"One"};
}
That's called a "slice". It assigns to several keys at once:
#hash{ $key1, $key2 } = ($value1, $value2);
is a shorter and faster way of doing
$hash{$key1} = $value1;
$hash{$key2} = $value2;
#$fn is the same as #{ $fn }, i.e. array dereference.
I'm trying to get the Weighted Cosine Similarity of two documents. I'm using Text::Document and Text::DocumentCollection. My code seems to work but it isn't returning a number as I expected.
Here is my code
use strict;
use warnings;
use Text::Document;
use Text::DocumentCollection;
my $newfile = shift #ARGV;
my $newfile2 = shift #ARGV;
##This is in another file.
my $t1 = countFreq($newfile);
my $t2 = countFreq($newfile2);
my $collection = Text::DocumentCollection->new(file => 'coll.db');
$collection->Add("One", $t1);
$collection->Add("Two", $t2);
my $wSim = $t1->WeightedCosineSimilarity( $t2,
\&Text::DocumentCollection::IDF,
$collection
);
print "\nWeighted Cosine Sim is: $wSim\n";
All this returns is Weighted Cosine Sim is: without anything following the colon.
Here is the code for countFreq:
sub countFreq{
my ($file) = #_;
my $t1 = Text::Document->new();
open (my $info, $file) or die "Could not open file.";
while (my $line = <$info>) {
chomp $line;
$line =~ s/[[:punct:]]//g;
foreach my $str (split /\s+/, $line) {
if (!defined $sp{lc($str)}) {
$t1 -> AddContent ($str);
}
}
}
return $t1;
}
###Update
Here's an example program that works fine. It's based on looking at the test code in the distribution for inspiration
I was expecting the test to be much less sensitive, so I was getting zeroes from two wildly different text sources. This example adds three short sentences $d1, $d1, and $d3, to a collection $c, and then compares each of the three documents to $d1
Comparing $d1 to itself produces 1 -- an exact match, as expected, while comparing $d2and $d3 gives 0.087 and 0 respectively -- a partial match and no match at all
I hope this helps you to resolve your specific issue?
use strict;
use warnings 'all';
use Text::Document;
use Text::DocumentCollection;
my $d1 = Text::Document->new;
$d1->AddContent( 'my heart belongs to sally webster' );
my $d2 = Text::Document->new;
$d2->AddContent( 'my heart belongs to the girl next door' );
my $d3 = Text::Document->new;
$d3->AddContent( 'I want nothing to do with my neighbours' );
my $c = Text::DocumentCollection->new( file => 'coll2.db' );
$c->Add('one', $d1);
$c->Add('two', $d2);
$c->Add('three', $d3);
for my $doc ( $d1, $d2, $d3 ) {
my $wcs = $d1->WeightedCosineSimilarity(
$doc,
\&Text::DocumentCollection::IDF,
$c
);
die qq{Invalid parameters for "WeightedCosineSimilarity"} unless defined $wcs;
print $wcs, "\n";
}
###output
1
0.0874311036726221
0
This is the code for Text::Document::WeightedCosineSimilarity
# this is rather rough
sub WeightedCosineSimilarity
{
my $self = shift;
my ($e,$weightFunction,$rock) = #_;
my ($Dv,$Ev) = ($self->{terms}, $e->{terms});
# compute union
my %union = %{$self->{terms}};
my #keyse = keys %{$e->{terms}};
#union{#keyse} = #keyse;
my #allkeys = keys %union;
# weighted D
my #Dw = map(( defined( $Dv->{$_} )?
&{$weightFunction}( $rock, $_ )*$Dv->{$_} : 0.0 ),
#allkeys
);
# weighted E
my #Ew = map(( defined( $Ev->{$_} )?
&{$weightFunction}( $rock, $_ )*$Ev->{$_} : 0.0 ),
#allkeys
);
# dot product of D and E
my $dotProduct = 0.0;
map( $dotProduct += $Dw[$_] * $Ew[$_] , 0..$#Dw );
# norm of D
my $nD = 0.0;
map( $nD += $Dw[$_] * $Dw[$_] , 0..$#Dw );
$nD = sqrt( $nD );
# norm of E
my $nE = 0.0;
map( $nE += $Ew[$_] * $Ew[$_] , 0..$#Ew );
$nE = sqrt( $nE );
# dot product scaled by norm
if( ($nD==0) || ($nE==0) ){
return undef;
} else {
return $dotProduct / $nD / $nE;
}
}
I'm afraid I don't understand the theory behind what it is doing, but it looks like your problem is that either $nD ("norm of D") or $nE ("norm of D") is zero
All I can suggest is that your two text samples may be too similar/different, or perhaps they are too long/short?
Either way, your code should look like this so as to catch an invalid return value from the cosine function:
my $wSim = $t1->WeightedCosineSimilarity( $t2,
\&Text::DocumentCollection::IDF,
$collection
);
die qq{Invalid parameters for "WeightedCosineSimilarity"} unless defined $wSim;
print "\nWeighted Cosine Sim is: $wSim\n";
I'm trying to modify this script:
#!/usr/bin/perl
use strict;
use warnings;
use diagnostics;
use Math::Vector::Real;
use constant DEG_PER_RAD => 45 / atan2(1, 1);
my ( $source, $out ) = qw/ OUT4 OUTABA12 /;
open my $in_fh, '<', $source or die qq{Unable to open "$source" for input: $!\n};
open my $out_fh, '>', $out or die qq{Unable to open "$out" for output: $!\n};
my #data;
push #data, V(split) while <$in_fh>;
my #aoa;
for my $i ( 0 .. $#data ) {
for my $j ( 0 .. $#data ) {
my $val1 = $data[$i];
my $val2 = $data[$j];
if ($val1 != $val2) {
my $math = sqrt(($val1->[0] - $val2->[0])**2 +
($val1->[1] - $val2->[1])**2 +
($val1->[2] - $val2->[2])**2);
if ($math < 2.2) {
push #aoa, "#$val1 #$val2";
}
}
}
}
for my $k ( 0 .. $#aoa ) {
for my $m ( 0 .. $#aoa ) {
my $aoadata1 = $aoa[$k];
my $aoadata2 = $aoa[$m];
my $vect1 = $aoadata1[0..2];
my $vect2 = $aoadata2[0..2];
print "$vect1 $vect2\n";
}
}
.
At the end of the script, I want to be able to do things with the variables $aoadata1 and $aoadata2 in fields 0-2. However, I cannot get them to stop throwing up errors regarding things not referenced right (I think). Can anyone tell me why this is happening/how to fix it?
Thanks.
If you want to use multiple subscripts in an array, you have to change the sigil:
#array[ 0 .. 2 ];
#{ $arra_ref }[ 0 .. 2 ];
It makes no sense to assign the result to a scalar, though. Use an anonymous array:
my $aoadata1 = $aoa[$k];
my $vect1 = [ #{ $aoadata1 }[ 0 .. 2 ] ];
or, without the temp var:
my $vect1 = [ #{ $aoa[$k] }[ 0 .. 2 ] ];
It might still not work, as I noticed you used
push #aoa, "#$val1 #$val2";
Did you mean
push #aoa, [ #$val1, #$val2 ];
or something similar?
So, I have this script :
#!/usr/bin/perl
use strict;
use warnings;
use diagnostics;
use Math::Vector::Real;
use constant DEG_PER_RAD => 45 / atan2(1, 1);
my ( $source, $out ) = qw/ OUT4 OUTABA12 /;
open my $in_fh, '<', $source or die qq{Unable to open "$source" for input: $!\n};
open my $out_fh, '>', $out or die qq{Unable to open "$out" for output: $!\n};
my #data;
push #data, V(split) while <$in_fh>;
my #aoa;
for my $i ( 0 .. $#data ) {
for my $j ( 0 .. $#data ) {
my $val1 = $data[$i];
my $val2 = $data[$j];
if ($val1 != $val2) {
my $math = sqrt(($val1->[0] - $val2->[0])**2 +
($val1->[1] - $val2->[1])**2 +
($val1->[2] - $val2->[2])**2);
if ($math < 2.2) {
print "#$val1 #$val2 $math\n";
push #aoa, [#$val1, #$val2, $math];
}
}
}
}
for my $k ( 0 .. $#aoa-1 ) {
my $aoadata1 = $aoa[$k];
my $aoadata2 = $aoa[$k+1];
my $vect1 = [ #{ $aoa[$k] }[0..2] ];
my $vect2 = [ #{ $aoa[$k+1] }[0..2] ];
my $vect3 = [ #{ $aoa[$k] }[3..5] ];
my $vect4 = [ #{ $aoa[$k+1] }[3..5] ];
my $math1 = [ #{ $aoa[$k] }[6] ];
my $math2 = [ #{ $aoa[$k+1] }[6] ];
my #matha = #$math1;
my #mathb = #$math2;
my #vecta = #$vect1;
my #vectb = #$vect2;
my #vectc = #$vect3;
my #vectd = #$vect4;
if ( #vecta != #vectb ) {
print "1\n";
}
}
Which runs on a test file like so:
18.474525 20.161419 20.33903
21.999333 20.220667 19.786734
18.333228 21.649157 21.125111
20.371077 19.675844 19.77649
17.04323 19.3106 20.148842
22.941106 19.105412 19.069893
and it calculates the distance between each point and every other point, and if it's below a threshold push it to an array for later. (For testing purposes, I also have it printing it. )
What I've been stuck on is the bottom half - I'm trying to eventually get to the point where the bottom half of the script script will iterate between rows like so:
If the first triple set of values on row 1 is not identical to the first set of triple values on row two, print 180, but only if this is the only instance of this line. If there is ever a point where row one's values were equal to row two's, do not print 180 whatsoever.
I cannot, for the life of me, get it to work. Any help is appreciated.
I think this is correct
I've used the facilities of the Math::Vector::Real module as I described in a comment
I leave the vectors as objects and avoid accessing their their contents directly
I calculate the distance between $vec1 and $vec2 as abs($vec1 - $vec2
I use the class's stringify ability to display it instead of extracting the individual values in my code
I have also changed the intermediate data format. The distance is no longer kept because it's not necessary, and the array #groups now contains an array for each group of vector pairs that have a common first vector. Each group is of the form
[ $vec1, $vec2, $vec2, $vec2, ... ]
and I use the first function from List::Util to find the group that each new vector pair belongs in. If an existing group is found with a matching first value then the second vector is just pushed onto the end of the group; otherwise a new group is created that looks like [ $vec1, $vec2 ]
Once the #groups array is built, it is processed again to generate the output
If there are only two values in the group then they are $vec1 and $vec2 of a unique point. $vec1 is printed with 180
If there are more than two elements then a line of output is generated for every pair of $vec2 values, each containing the value $vec1 together with the angle between the two $vec2 vectors in the pair
use strict;
use warnings;
use Math::Vector::Real qw/ V /;
use List::Util qw / first /;
use constant DEG_PER_RAD => 45 / atan2(1, 1);
my ( $source, $out ) = qw/ OUT4 OUTABA12 /;
open my $in_fh, '<', $source or die qq{Unable to open "$source" for input: $!\n};
my #data = map V(split), <$in_fh>;
my #groups;
for my $vec1 ( #data ) {
for my $vec2 ( #data ) {
next if abs($vec1 - $vec2) > 2.2 or $vec1 == $vec2;
my $group = first { $_->[0] == $vec1 } #groups;
if ( $group ) {
push #$group, $vec2;
}
else {
push #groups, [ $vec1, $vec2 ];
}
}
}
open my $out_fh, '>', $out or die qq{Unable to open "$out" for output: $!\n};
select $out_fh;
for my $group ( #groups ) {
my ($vec1, #vec2) = #$group;
if ( #vec2 == 1 ) {
print "$vec1 180\n";
next;
}
for my $i ( 0 .. $#vec2-1 ) {
for my $j ( $i+1 .. $#vec2 ) {
my ($vec2a, $vec2b) = #vec2[$i, $j];
my $angle = atan2( $vec2a, $vec2b ) * DEG_PER_RAD;
print "$vec1 $angle\n";
}
}
}
output
{18.474525, 20.161419, 20.33903} 4.96831567625921
{18.474525, 20.161419, 20.33903} 1.65052300046634
{18.474525, 20.161419, 20.33903} 4.80888617553369
{21.999333, 20.220667, 19.786734} 4.22499387836286
{18.333228, 21.649157, 21.125111} 180
{20.371077, 19.675844, 19.77649} 5.09280172684743
{17.04323, 19.3106, 20.148842} 180
{22.941106, 19.105412, 19.069893} 180
I'm in the process of learning how to use perl for genomics applications. I am trying to clean up paired end reads (1 forward, 1 reverse). These are stored in 2 files, but the lines match. What I'm having trouble doing is getting the relevant subroutines to read from the second file (the warnings I get are for uninitialized values).
These files are set up in 4 line blocks(fastq) where the first line is a run ID, 2nd is a sequence, 3rd is a "+", and the fourth holds quality values for the sequence in line 2.
I had no real trouble with this code when it was applied only for one file, but I think I'm misunderstanding how to handle multiple files.
Any guidance is much appreciated!
My warning in this scenario is as such : Use of uninitialized value $thisline in subtraction (-) at ./pairedendtrim.pl line 137, line 4.
#!/usr/bin/perl
#pairedendtrim.pl by AHU
use strict;
use warnings;
die "usage: readtrimmer.pl <file1> <file2> <nthreshold> " unless #ARGV == 3;
my $nthreshold = "$ARGV[2]";
open( my $fastq1, "<", "$ARGV[0]" );
open( my $fastq2, "<", "$ARGV[1]" );
my #forline;
my #revline;
while ( not eof $fastq2 and not eof $fastq1 ) {
chomp $fastq1;
chomp $fastq2;
$forline[0] = <$fastq1>;
$forline[1] = <$fastq1>;
$forline[2] = <$fastq1>;
$forline[3] = <$fastq1>;
$revline[0] = <$fastq2>;
$revline[1] = <$fastq2>;
$revline[2] = <$fastq2>;
$revline[3] = <$fastq2>;
my $ncheckfor = removen( $forline[1] );
my $ncheckrev = removen( $revline[1] );
my $fortest = 0;
if ( $ncheckfor =~ /ok/ ) { $fortest = 1 }
my $revtest = 0;
if ( $ncheckrev =~ /ok/ ) { $revtest = 1 }
if ( $fortest == 1 and $revtest == 1 ) { print "READ 1 AND READ 2" }
if ( $fortest == 1 and $revtest == 0 ) { print "Read 1 only" }
if ( $fortest == 0 and $revtest == 1 ) { print "READ 2 only" }
}
sub removen {
my ($thisline) = $_;
my $ntotal = 0;
for ( my $i = 0; $i < length($thisline) - 1; $i++ ) {
my $pos = substr( $thisline, $i, 1 );
#print "$pos\n";
if ( $pos =~ /N/ ) { $ntotal++ }
}
my $nout;
if ( $ntotal <= $nthreshold ) #threshold for N
{
$nout = "ok";
} else {
$nout = "bad";
}
return ($nout);
}
The parameters to a subroutine are in #_, not $_
sub removen {
my ($thisline) = #_;
I have a few other tips for you as well:
use autodie; anytime that you're doing file processing.
Assign the values in #ARGV to variables first thing. This quickly documents what the hold.
Do not chomp a file handle. This does not do anything. Instead apply chomp to the values returned from reading.
Do not use the strings ok and bad as boolean values.
tr can be used to count the number times a character is in a string.
The following is a cleaned up version of your code:
#!/usr/bin/perl
#pairedendtrim.pl by AHU
use strict;
use warnings;
use autodie;
die "usage: readtrimmer.pl <file1> <file2> <nthreshold> " unless #ARGV == 3;
my ( $file1, $file2, $nthreshold ) = #ARGV;
open my $fh1, '<', $file1;
open my $fh2, '<', $file2;
while ( not eof $fh2 and not eof $fh1 ) {
chomp( my #forline = map { scalar <$fh1> } ( 1 .. 4 ) );
chomp( my #revline = map { scalar <$fh2> } ( 1 .. 4 ) );
my $ncheckfor = removen( $forline[1] );
my $ncheckrev = removen( $revline[1] );
print "READ 1 AND READ 2" if $ncheckfor and $ncheckrev;
print "Read 1 only" if $ncheckfor and !$ncheckrev;
print "READ 2 only" if !$ncheckfor and $ncheckrev;
}
sub removen {
my ($thisline) = #_;
my $ntotal = $thisline =~ tr/N/N/;
return $ntotal <= $nthreshold; #threshold for N
}