I have code that parses 2000 CSV files and builds hashes from them.
The code runs well and fast until it has read ~100 files; after that it runs at a snail's pace.
Memory consumed is ~1.8 GB (uncompressed).
The goal is to build the global hash %_hist from the CSV files.
File sizes range from 20 KB to 30 MB.
The OS is a Mac with 12 GB of RAM.
64-bit Perl 5.18.
I have declared every variable in the functions with "my", expecting it to be released after the function exits.
The only persistent global variable is %_hist.
Is there a way to improve performance?
foreach my $file (@files){
iLog ("Checking $file");
$| = 1; #flush io
return error("File $file doesn't exist") if not -e $file;
my @records = readCSVFile($file); #reads csv file to 2d array and returns the array
my @formatted_recs;
foreach $rec ( @records ){
my ($time,$c,$user_dst,$client,$ip_src,$first_seen,$last_seen,$first_seen_time,$last_seen_time,$device_ip,$country,$org,$user_agent) = @$rec;
my @newrec = ($time,$c,$client,$first_seen,$last_seen,$ip_src,$user_agent,$device_ip,$country,$org);
next if $time =~ /time/i; #Ignore first record
push(@formatted_recs, \@newrec);
}
baselineHistRecords(@formatted_recs);
}
sub readCSVFile{
my $file = shift;
my @data;
open my $fh, '<', $file or return error("Could not open $file: $!");
my $line = <$fh>; #Read headerline
my $sep_char = ',';
$sep_char = ';' if $line =~ /;"/;
$sep_char = '|' if $line =~ /\|/;
my $csv = Text::CSV->new({ sep_char => "$sep_char" });
push (@data, split(/$sep_char/, $line) );
while( my $row = $csv->getline( $fh ) ) {
push #data, $row;
}
close $fh;
return @data;
}
sub baselineHistRecords{
my @recs = @_;
undef $_ for ($time,$c,$client,$first_seen,$last_seen,$ip_src,$user_agent,$device_ip,$country,$org) ;
undef $_ for (%device_count, %ua_count, %location_count, %org_count );
my ($time,$c,$client,$first_seen,$last_seen,$ip_src,$user_agent,$device_ip,$country,$org) ;
my %loc = {}; my %loc2rec = {};
my %device_count = {}; my %ua_count = {}; my %location_count = {}; my %sorg_count = {};
my $hits=0;
my @suspicious_hits = ();
foreach $rec (@recs){
my $devtag=''; my $os = '';
my @row = @{$rec};
($time,$c,$client,$first_seen,$last_seen,$ip_src,$ua,$device_ip,$country,$org) = @row;
veryverbose("\n$time,$c,$client,$first_seen,$last_seen,$ip_src,$user_agent,$device_ip,$country,$org");
next if not is_ipv4($ip_src);
###### 1. Enrich IP
my $org = getOrgForIP($ip_src);
my ($country_code,$region,$city) = getGeoForIP($ip_src);
my $isp = getISPForIP($ip_src);
my $loc = join(" > ",($country_code, $region));
my $city = join(" > ",($country_code, $region, $city));
my $cidr = $ip_src; $cidr =~ s/\d+\.\d+$/0\.0\/16/; #Removing last octet
# my $packetmail = getPacketmailRep($ip_src);
# push (@suspicious_hits, "$time $c $client $ip_src $ua / $packetmail") if $packetmail !~ /NOTFOUND/;
##### 2. SANITIZE
$ua = cannonize($ua);
$devtag = $& if $ua =~ /\([^\)]+\)/;
@tokens = split(/;/, $devtag);
$os = $tokens[0];
$os =~ s/\+/ /g;$os =~ s/\(//g;$os =~ s/\)//g;
$os = 'Android' if $os !~ /Android/i and $devtag =~ /Android/i;
$os = "Windows NT" if $os =~ /compatible/i or $os =~ /Windows NT/i;
$_hist{$client}{"isp"}{$isp}{c} += 1;
$_hist{$client}{"os"}{$os}{c} += 1;
$_hist{$client}{"ua"}{$ua}{c} += 1 if not is_empty ($ua);
$_hist{$client}{"ua"}{c} += 1 if not is_empty ($ua); #An exception marked since all logs doesn't have UA values
$_hist{$client}{"loc"}{$loc}{c} += 1;
$_hist{$client}{"org"}{$org}{c} += 1;
$_hist{$client}{"cidr"}{$cidr}{c} += 1;
$_hist{$client}{"city"}{$city}{c} += 1;
$_hist{$client}{"c"} += 1;
$hits = $hits + 1;
print "." if $hits%100==0;
debug( "\n$ip_src : $os $loc $isp $org $ua: ".$_hist{$client}{"os"}{$os}{c} );
}
print "\nHITS: $hits";
return if ($hits==0); #return if empty
printf("\n######(( BASELINE for $client (".$_hist{$client}{c} ." records) ))#######################\n");
foreach my $item (qw/os org isp loc ua cidr/){
debug( sprintf ("\n\n--(( %s: %s ))-------------------------------- ",$client,uc($item)) );
## COMPUTE Usage Percent
my @item_values = sort { $_hist{$client}{$item}{$b}{c} <=> $_hist{$client}{$item}{$a}{c} } keys %{ $_hist{$client}{$item} };
my @cvalues = ();
foreach my $key ( @item_values ){
my $count = $_hist{$client}{$item}{$key}{c};
my $total = $_hist{$client}{c};
$total = $_hist{$client}{"ua"}{c} if $item =~ /^ua|os$/i and $_hist{$client}{"ua"}{c}; #Override for User_agent and OS determination as not all logs have them
my $pc = ceil(( $count / $total ) * 100) ;
debug ("Ignoring empty value") if is_empty($key); # Ignoring Empty values
next if is_empty($key);
$_hist{$client}{$item}{$key}{p} = $pc ;
push (@cvalues, $pc);
#printf("\n%3d \% : %s",$pc,$key) if $pc>0;
}
## COMPUTE Cluster Centers
my @clustercenters = getClusterCenters(3,@cvalues);
my ($low, $medium, $high) = @clustercenters;
$_hist{$client}{$item}{low} = $low;
$_hist{$client}{$item}{medium} = $medium;
$_hist{$client}{$item}{high} = $high;
my %tags = ( $low => "rare",
$medium => "normal",
$high =>"most common",
);
debug ("\n(Cluster Centers) : $low \t$medium \t $high\n");
foreach my $key ( @item_values ){
next if is_empty($key);
my $pc = $_hist{$client}{$item}{$key}{p};
$_hist{$client}{$item}{$key}{tag} = $tags{ closest($pc, @clustercenters) };
debug( sprintf("\n%3d \% : %s : %s",$pc, $_hist{$client}{$item}{$key}{tag} , $key) );
}
}
printf("\n\n###################################\n");
saveHistBaselines();
}
Thanks,
Uma
This is more a question for Code Review.
There's a ton of completely unnecessary copying in this code. For example: why copy the data from @$rec into @newrec, and from $rec into @row? Why do you return a plain list of rows from readCSVFile instead of a reference?
You don't really need to read an entire file into memory and then process it; you can process the data line by line and throw each line away immediately after you're done with it.
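For example, a minimal sketch of that line-by-line approach (it reuses your separator detection and your error() helper, and assumes a hypothetical per-row helper, here called baselineHistRecord, that updates %_hist from a single row):
use Text::CSV;

# Sketch only: each row is handed to the per-row helper as soon as it is read,
# and the array reference returned by getline() is used directly instead of
# being copied into new arrays.
sub processCSVFile {
    my ($file) = @_;
    open my $fh, '<', $file or return error("Could not open $file: $!");
    my $header = <$fh>;                 # read and discard the header line
    my $sep_char = ',';
    $sep_char = ';' if $header =~ /;"/;
    $sep_char = '|' if $header =~ /\|/;
    my $csv = Text::CSV->new({ sep_char => $sep_char, binary => 1 });
    while ( my $row = $csv->getline($fh) ) {
        baselineHistRecord($row);       # hypothetical helper: update %_hist from one row
    }
    close $fh;
}
If you'd rather keep the current two-pass structure, the cheaper fix is to return \@data from readCSVFile and pass array references down to baselineHistRecords instead of copying lists of rows at each step.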
I am trying to solve the doublets (word ladder) puzzle using Perl. This is one of my first times using Perl, so please excuse the messy code.
I have everything working, I believe, but I'm having an issue printing the shortest path. Using a queue and BFS I am able to find the target word, but not the actual path taken.
Does anyone have any suggestions? I have been told to keep track of the parents of each element, but it is not working.
#!/usr/bin/perl
use strict;
my $file = 'test';
#my $file = 'wordlist';
open(my $fh, $file);
my $len = length($ARGV[0]);
my $source = $ARGV[0];
my $target = $ARGV[1];
my @words;
# Creates new array of correct length words
while (my $row = <$fh>) {
chomp $row;
my $rowlen = length($row);
if ($rowlen == $len) {
push @words, $row;
}
}
my %wordHash;
# Creates graph for word variations using dictionary
foreach my $word (@words) {
my $wordArr = [];
for (my $i = 0; $i < $len; $i++) {
my $begin = substr($word, 0, $i);
my $end = substr($word, $i+1, $len);
my $key = "$begin" . "_" . "$end";
my $Arr = [];
my $regex = "$begin" . "[a-z]" . "$end";
foreach my $wordTest (@words) {
if ("$wordTest" =~ $regex && "$wordTest" ne "$word") {
push $wordArr, "$wordTest";
}
}
}
$wordHash{"$word"} = $wordArr;
}
my @queue;
push(@queue, "$source");
my $next = $source;
my %visited;
my %parents;
my @path;
# Finds path using BFS and Queue
while ("$next" ne "$target") {
print "$next: ";
foreach my $variation (@{$wordHash{$next}}) {
push(@queue, "$variation");
$parents{"$variation"} = $next;
print "$variation | ";
}
print "\n-----------------\n";
$visited{"$next"} = 1;
push(#path, "$next");
$next = shift(#queue);
while ($visited{"$next"} == 1) {
$next = shift(#queue);
}
}
print "FOUND: $next\n\n";
print "Path the BFS took: ";
print "#path\n\n";
print "Value -> Parent: \n";
for my $key (keys %parents) {
print "$key -> $parents{$key}\n";
}
Before you accept a word from the @queue to become $next, you test to ensure that it hasn't been %visited. By then, though, the damage has been done. The test ensures a visited word won't become the focus again and hence will prevent loops, but the earlier code updated %parents whether the word had been %visited or not.
If a word has been %visited, you not only want to stop it becoming the $next candidate, you want to stop it being considered as a $variation, as that will screw up %parents. I don't have a word dictionary to test with and you haven't given an example of the failure, but I think you can fix this up by shifting the %visited guard into the inner loop where variations are considered:
foreach my $variation (@{$wordHash{$next}}) {
next if $visited{ $variation } ;
push(@queue, "$variation");
... etc ...
This will protect the integrity of your %parents hash as well as stop loops. On a small note, you don't need to use double quotes when indexing into a hash; as I've done above, just state the scalar variable. Using quotes just interpolates the value of the variable, which produces the same result.
Your code, IMHO, is excellent for a beginner, BTW.
Update
I've since got hold of a word dictionary, and the problem above does exist, along with one other. The code does move one letter at a time from the source, but in a near-random direction, not necessarily closer to the target. To correct that, I changed the regex you use to build your graph so that the corresponding letter from the target replaces the generic [a-z]. There are also a couple of minor changes, mostly style related. The updated code looks like this:
use v5.12;
my $file = 'wordlist.txt';
#my $file = 'wordlist';
open(my $fh, $file);
my $len = length($ARGV[0]);
my $source = $ARGV[0];
my $target = $ARGV[1];
chomp $target;
my @target = split('', $target);
my @words;
# Creates new array of correct length words
while (my $row = <$fh>) {
$row =~ s/[\r\n]+$//;
my $rowlen = length($row);
if ($rowlen == $len) {
push @words, $row;
}
}
my %wordHash;
# Creates graph for word variations using dictionary
foreach my $word (@words) {
my $wordArr = [];
for (my $i = 0; $i < $len; $i++) {
my $begin = substr($word, 0, $i);
my $end = substr($word, $i+1, $len);
my $key = "$begin" . "_" . "$end";
my $Arr = [];
# my $re_str = "$begin[a-z]$end";
my $regex = $begin . $target[$i] . $end ;
foreach my $wordTest (@words) {
if ($wordTest =~ / ^ $regex $ /x ) {
next if $wordTest eq $word ;
push $wordArr, "$wordTest";
}
}
}
$wordHash{"$word"} = $wordArr;
}
my @queue;
push(@queue, "$source");
my $next = $source;
my %visited;
my %parents;
my @path;
# Finds path using BFS and Queue
while ($next ne $target) {
print "$next: ";
$visited{$next} = 1;
foreach my $variation (@{$wordHash{$next}}) {
next if $visited{ $variation } ;
push(@queue, $variation);
$parents{$variation} = $next;
print "$variation | ";
}
print "\n-----------------\n";
push(@path, $next);
while ( $visited{$next} ) {
$next = shift @queue ;
}
}
push @path, $target ;
print "FOUND: $next\n\n";
print "Path the BFS took: @path\n\n";
print "Value -> Parent: \n";
for my $key (keys %parents) {
print "$key -> $parents{$key}\n";
}
and when run produces:
./words.pl head tail | more
head: heal |
-----------------
heal: teal | heil |
-----------------
teal:
-----------------
heil: hail |
-----------------
hail: tail |
-----------------
FOUND: tail
Path the BFS took: head heal teal heil hail tail
Value -> Parent:
hail -> heil
heil -> heal
teal -> heal
tail -> hail
heal -> head
You could probably remove the printing of the %parents hash; since hashes come out in a random order, it doesn't tell you much.
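And if you want to print the actual path rather than the order of visitation, a small sketch that walks %parents backwards from the target (same variables as in the code above):
# Rebuild the path by following the %parents links from the target back
# to the source, building it front-to-back with unshift.
my @shortest;
for ( my $word = $target; defined $word; $word = $parents{$word} ) {
    unshift @shortest, $word;
    last if $word eq $source;
}
print "Path from $source to $target: @shortest\n";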
I'm trying to figure out how to use some code that's written in Perl, but I am not very familiar with Perl syntax. I was wondering if someone could tell me what the format of the file read into @metafilecache is? The code is failing to read the sample rate from within the file, but I'm not sure how I have it formatted incorrectly. Here's the excerpt of the code I think is relevant:
my $tnet = $ARGV[0];
my $tsta = $ARGV[1];
my $stadir = $ARGV[2];
if ( ! -d "$targetdir" ) {
mkdir "$targetdir" || die "Cannot create $targetdir: $?\n";
}
die "Cannot find PDF bin base dir: $pdfbinbase\n" if ( ! -d "$pdfbinbase" );
my %targetdays = ();
my %targetchan = ();
# Collect target files in the $pdfbinbase dir, limited by $changlob
foreach my $nsldir (glob("$pdfbinbase/{CHRYS}/$tnet.$tsta.*")) {
next if ( ! -d "$nsldir" ); # Limit to directories
# Extract location ID from directory name
my ($net,$sta,$loc) = $nsldir =~ /\/(\w+)\.(\w+)\.([\w-]+)$/;
if ( $net ne $tnet ) {
print "Target network ($tnet) != network ($net)\n";
next;
}
if ( $sta ne $tsta ) {
print "Target station ($tsta) != station ($sta)\n";
next;
}
foreach my $chandir (glob("$nsldir/$changlob")) {
next if ( ! -d "$chandir" ); # Limit to directories
# Extract channel code from directory name
my ($chan) = $chandir =~ /.*\/([\w\d]+)$/;
foreach my $yeardir (glob("$chandir/Y*")) {
next if ( ! -d "$yeardir" ); # Limit to directories
# Extract year from directory name
my ($year) = $yeardir =~ /^.*\/Y(\d{4,4})$/;
foreach my $daybin (glob("$yeardir/D*.bin")) {
next if ( ! -f "$daybin" ); # Limit to regular files
my ($day) = $daybin =~ /^.*\/D(\d{3,3})\.bin$/;
$targetdays{"$loc.$chan.$year.$day"} = $daybin;
$targetchan{"$loc.$chan"} = 1;
}
}
}
}
if ( $verbose > 1 ) {
print "Target days from PDF bin files:\n";
my $count = 0;
foreach my $tday (sort keys %targetdays) {
print "Target day: $tday => $targetdays{$tday}\n";
$count++;
}
print "Targets: $count\n";
}
# Remove targets that have already been calculated by checking
# results files against targets.
foreach my $tchan ( keys %targetchan ) {
my ($loc,$chan) = split (/\./, $tchan);
# Generate target file name
my $targetfile = undef;
if ( $loc ne "--" ) { $targetfile = "$targetdir/$prefix-$loc.$chan"; }
else { $targetfile = "$targetdir/$prefix-$chan"; }
print "\nChecking target file for previous results: $targetfile\n"
if ( $verbose );
next if ( ! -f "$targetfile" );
# Open result file and remove any targets that are included
open IN, "$targetfile" || next;
foreach my $line (<IN>) {
next if ( $line =~ /^YEAR\.DAY/ );
my ($year,$day) = $line =~ /^(\d+).(\d+)/;
# Delete this target
delete $targetdays{"$loc.$chan.$year.$day"};
}
close IN;
}
if ( $verbose > 1 ) {
print "Remaining target days:\n";
my $count = 0;
foreach my $tday (sort keys %targetdays) {
print "Target day: $tday => $targetdays{$tday}\n";
$count++;
}
print "Remaining Targets: $count\n";
}
my %targetfiles = ();
# Calculate and store PDF mode for each target day
TARGET: foreach my $tday (sort keys %targetdays) {
my ($loc,$chan,$year,$day) = split (/\./, $tday);
my %power = ();
my %count = ();
my @period = ();
# Determine sampling rate
my $samprate = GetSampRate ($tnet,$tsta,$loc,$chan);
print "Samplerate for $tnet $tsta $loc $chan is: $samprate\n" if (
$verbose );
if ( ! defined $samprate ) {
if ( ($tsta eq "ECSD") || ($tsta eq "SFJ") || ($tsta eq "CASEE") ||
($tsta eq "JSC") ){
next;
}
else {
print "Cannot determine sample rate for channel
$tnet.$tsta.$loc.$chan\n";
next;
}
}
This is the subroutine GetSampRate:
sub GetSampRate { # GetSampRate (net,sta,loc,chan)
my $net = shift;
my $sta = shift;
my $loc = shift;
my $chan = shift;
my $samprate = undef;
# Generate source name: Net_Sta_Loc_Chan
my $srcname = "${net}_${sta}_";
$srcname .= ($loc eq "--") ? "_" : "${loc}_";
$srcname .= "$chan";
if ( $#metafilecache < 0 ) {
my $metafile = "$stadir/metadata.txt";
if ( ! -f "$metafile" ) {
print "GetSampRate(): Cannot find metadata file: $metafile\n";
return undef;
}
# Open metadata file
if ( ! (open MF, "<$metafile") ) {
print "GetSampRate(): Cannot open: $metafile\n";
return undef;
}
# Read all lines in the metafilecache
@metafilecache = <MF>;
close MF;
}
# Read all lines starting with srcname into @lines
my @lines = grep { /^$srcname/ } @metafilecache;
# Find maximum of sample rates for this channel
foreach my $line ( @lines ) {
my @fields = split(/\t/, $line);
my $rate = $fields[7];
$samprate = $rate if (!defined $samprate || $rate > $samprate);
}
return $samprate;
}
The code you have shown is very clunky.
As far as this scope is concerned, the file is called $stadir/metadata.txt, and I can't help with $stadir as it's either undefined or a global value that is set elsewhere — not a great design idea.
After that, @metafilecache = <MF> loads the entire file into the array @metafilecache, leaving a trailing newline character at the end of each element.
Then my @lines = grep { /^$srcname/ } @metafilecache copies into @lines all lines beginning with the string held in $srcname. This is another global variable that shouldn't be used.
The following for loop splits each line on tab ("\t" or "\x09") separators and sets $rate to the eighth value ($fields[7]). $samprate is updated at each iteration if the latest value of $rate is greater than the current stored maximum.
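So, as far as this subroutine is concerned, each line of $stadir/metadata.txt must begin with the Net_Sta_Loc_Chan source name and be tab-separated, with the sample rate in the eighth field. A sketch of a line GetSampRate would accept, using entirely made-up values (the contents of fields 2 to 7 don't matter to it):
# Hypothetical metadata.txt line: eight tab-separated fields, beginning with
# the srcname and carrying the sample rate in field 8 ($fields[7]).
my $example_line = join "\t",
    "IU_ANMO_00_BHZ",   # field 1: must match /^$srcname/
    ("x") x 6,          # fields 2..7: ignored by GetSampRate
    "40";               # field 8: the sample rate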
I hope that helps
I'm trying to use a foreach loop to loop through an array and then use a nested while loop to loop through each line of a text file to see if the array element matches a line of text; if so then I push data from that line into a new array to perform calculations.
The outer foreach loop appears to be working correctly (based on the printed results for each array element), but the inner while loop is not looping: the same data is pushed into the array each time.
Any advice?
The code is below:
#! /usr/bin/perl -T
use CGI qw(:cgi-lib :standard);
print "Content-type: text/html\n\n";
my $input = param('sequence');
my $meanexpfile = "final_expression_complete.txt";
open(FILE, $meanexpfile) or print "unable to open file";
my @meanmatches;
@regex = (split /\s/, $input);
foreach $regex (@regex) {
while (my $line = <FILE>) {
if ( $line =~ m/$regex\s(.+\n)/i ) {
push(@meanmatches, $1);
}
}
my $average = average(@meanmatches);
my $std_dev = std_dev($average, @meanmatches);
my $average_round = sprintf("%0.4f", $average);
my $stdev_round = sprintf("%0.4f", $std_dev);
my $coefficient_of_variation = $stdev_round / $average_round;
my $cv_round = sprintf("%0.4f", $coefficient_of_variation);
print font(
{ color => "blue" }, "<br><B>$regex average: $average_round
 Standard deviation: $stdev_round Coefficient of
variation(Cv): $cv_round</B>"
);
}
sub average {
my (@values) = @_;
my $count = scalar @values;
my $total = 0;
$total += $_ for @values;
return $count ? $total / $count : 0;
}
sub std_dev {
my ($average, @values) = @_;
my $count = scalar @values;
my $std_dev_sum = 0;
$std_dev_sum += ($_ - $average)**2 for @values;
return $count ? sqrt($std_dev_sum / $count) : 0;
}
Yes, my advice would be:
Turn on strict and warnings.
perltidy your code,
use 3 argument open: open ( my $inputfile, "<", 'final_expression.txt' );
die if it doesn't open - the rest of your program is irrelevant.
chomp $line
You are iterating your filehandle, but once you've done this you're at end-of-file for the next iteration of the foreach loop, so your while loop becomes a null operation. Simplistically, reading the file into an array with my @lines = <FILE>; would fix this.
So with that in mind:
#!/usr/bin/perl -T
use strict;
use warnings;
use CGI qw(:cgi-lib :standard);
print "Content-type: text/html\n\n";
my $input = param('sequence');
my $meanexpfile = "final_expression_complete.txt";
open( my $input_file, "<", $meanexpfile ) or die "unable to open file";
my @meanmatches;
my @regex = ( split /\s/, $input );
my @lines = <$input_file>;
chomp (@lines);
close($input_file) or warn $!;
foreach my $regex (@regex) {
foreach my $line (@lines) {
if ( $line =~ m/$regex\s(.+)/i ) {
push( @meanmatches, $1 );
}
}
my $average = average(@meanmatches);
my $std_dev = std_dev( $average, @meanmatches );
my $average_round = sprintf( "%0.4f", $average );
my $stdev_round = sprintf( "%0.4f", $std_dev );
my $coefficient_of_variation = $stdev_round / $average_round;
my $cv_round = sprintf( "%0.4f", $coefficient_of_variation );
print font(
{ color => "blue" }, "<br><B>$regex average: $average_round
 Standard deviation: $stdev_round Coefficient of
variation(Cv): $cv_round</B>"
);
}
sub average {
my (@values) = @_;
my $count = scalar @values;
my $total = 0;
$total += $_ for @values;
return $count ? $total / $count : 0;
}
sub std_dev {
my ( $average, @values ) = @_;
my $count = scalar @values;
my $std_dev_sum = 0;
$std_dev_sum += ( $_ - $average )**2 for @values;
return $count ? sqrt( $std_dev_sum / $count ) : 0;
}
The problem here is that, starting from the second iteration of the foreach, you are trying to read from an already-exhausted file handle. You need to rewind to the beginning to read it again:
foreach $regex (@regex) {
seek FILE, 0, 0;
while ( my $line = <FILE> ) {
However, that does not look very performant. Why read the file several times at all, when you can read it once before the foreach starts and then iterate through the list:
my @lines;
while (<FILE>) {
push (@lines, $_);
}
foreach $regex (@regex) {
foreach $line (@lines) {
With the latter, you might also want to consider using grep (or map) instead of the inner loop.
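For instance, a sketch of that last idea using map (grep's close cousin), so that only the captured value ends up in @meanmatches; it assumes the pre-read @lines from the snippet above:
foreach my $regex (@regex) {
    # keep just the capture from each line that matches this pattern
    push @meanmatches, map { /$regex\s(.+)/i ? $1 : () } @lines;
    # ... averaging and printing as before ...
}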
This is the program as it stands right now. It takes in a .fasta file (a file containing genetic code), creates a hash table from the data and prints it; however, it is quite slow. It takes each five-character substring and compares it against every other substring in the file.
use strict;
use warnings;
use Data::Dumper;
my $total = $#ARGV + 1;
my $row;
my $compare;
my %hash;
my $unique = 0;
open( my $f1, '<:encoding(UTF-8)', $ARGV[0] ) or die "Could not open file '$ARGV[0]' $!\n";
my $discard = <$f1>;
while ( $row = <$f1> ) {
chomp $row;
$compare .= $row;
}
my $size = length($compare);
close $f1;
for ( my $i = 0; $i < $size - 6; $i++ ) {
my $vs = ( substr( $compare, $i, 5 ) );
for ( my $j = 0; $j < $size - 6; $j++ ) {
foreach my $value ( substr( $compare, $j, 5 ) ) {
if ( $value eq $vs ) {
if ( exists $hash{$value} ) {
$hash{$value} += 1;
} else {
$hash{$value} = 1;
}
}
}
}
}
foreach my $val ( values %hash ) {
if ( $val == 1 ) {
$unique++;
}
}
my $OUTFILE;
open $OUTFILE, ">output.txt" or die "Error opening output.txt: $!\n";
print {$OUTFILE} "Number of unique keys: " . $unique . "\n";
print {$OUTFILE} Dumper( \%hash );
close $OUTFILE;
Thanks in advance for any help!
It is not clear from the description what is wanted from this script, but if you're looking for matching sets of 5 characters, you don't actually need to do any string matching: you can just run through the whole sequence and keep a tally of how many times each 5-letter sequence occurs.
use strict;
use warnings;
use Data::Dumper;
my $str; # store the sequence here
my %hash;
# slurp in the whole file
open(IN, '<:encoding(UTF-8)', $ARGV[0]) or die "Could not open file '$ARGV[0]' $!\n";
while (<IN>) {
chomp;
$str .= $_;
}
close(IN);
# not sure if you were deliberately omitting the last two letters of sequence
# this looks at all the sequence
my $l_size = length($str) - 4;
for (my $i = 0; $i < $l_size; $i++) {
$hash{ substr($str, $i, 5) }++;
}
# grep in a scalar context will count the values.
my $unique = grep { $_ == 1 } values %hash;
open OUT, ">output.txt" or die "Error opening output.txt: $!\n";
print OUT "Number of unique keys: ". $unique."\n";
print OUT Dumper(\%hash);
close OUT;
It might help to avoid searching for information that you already have. I don't see that $j depends upon $i, so you're also matching values against themselves, and you're getting bad counts as a result. It works for 1 because 1 is the square of 1, but since for each five-character string you count every string that matches it from every matching start position, you're going to get the square of the actual number; a substring that really occurs 3 times, for example, will be reported as 9.
You would actually get better results if you did it this way:
# compute it once.
my $lim = length( $compare ) - 6;
for ( my $i = 0; $i < $lim; $i++ ){
my $vs = substr( $compare, $i, 5 );
# count each unique identity *once*
# if it's in the table, we've already counted it.
next if $hash{ $vs };
$hash{ $vs }++; # we've found it, record it.
for ( my $j = $i + 1; $j < $lim; $j++ ) {
my $value = substr( $compare, $j, 5 );
$hash{ $value }++ if $value eq $vs;
}
}
However, it would be an improvement on this to use index for your second loop and let the C level of Perl do the matching for you:
my $pos = $i;
while ( $pos > -1 ) {
$pos = index( $compare, $vs, ++$pos );
$hash{ $vs }++ if $pos > -1;
}
Also, if you use index and want to omit the last two characters, as you do, it might make sense to remove those from the characters you have to search:
substr( $compare, -2 ) = '';
But you could do all of this in one pass as you loop through the file. I believe the code below is almost equivalent:
my $last_4 = '';
my $last_row = '';
my $discard = <$f1>;
# each row in the file after the first...
while ( $row = <$f1> ) {
chomp $row;
$last_row = $row;
$row = $last_4 . $row;
my $lim = length( $row ) - 5;
for ( my $i = 0; $i < $lim; $i++ ) {
$hash{ substr( $row, $i, 5 ) }++;
}
# four is the maximum we can copy over to the new row and not
# double count a strand of characters at the end.
$last_4 = substr( $row, -4 );
}
# I'm not sure what you're getting by omitting the last two characters of
# the last row, but this would replicate it
foreach my $bad_key ( map { substr( $last_row, $_ ) } ( -5, -6 )) {
--$hash{ $bad_key };
delete $hash{ $bad_key } if $hash{ $bad_key } < 1;
}
# grep in a scalar context will count the values.
$unique = grep { $_ == 1 } values %hash;
You may be interested in this more concise version of your code that uses a global regex match to find all the subsequences of five characters. It also reads the entire input file in one go, and removes the newlines afterwards.
The path to the input file is expected as a parameter on the command line, and the output is sent to STDOUT, so it can be redirected to a file on the command line, like this:
perl subseq5.pl input.txt > output.txt
I've also used Data::Dump instead of Data::Dumper because I believe it to be vastly superior. However it is not a core module, and so you will probably need to install it.
use strict;
use warnings;
use open qw/ :std :encoding(utf-8) /;
use Data::Dump;
my $str = do { local $/; <>; };
$str =~ tr|\n||d;
my %dups;
++$dups{$1} while $str =~ /(?=(.{5}))/g;
my $unique = grep $_ == 1, values %dups;
print "Number of unique keys: $unique\n";
dd \%dups;
I have a hash of lists that is not getting populated.
I checked that the block at the end that adds to the hash is in fact being called on input. It should either add a singleton list if the key doesn't exist, or else push to the back of the list (referenced under the right key) if it does.
I understand that the GOTO is ugly, but I've commented it out and it has no effect.
The problem is that when printhits is called, nothing is printed, as if there are no values in the hash. I also tried each (%genomehits), no dice.
THANKS!
#!/usr/bin/perl
use strict;
use warnings;
my $len = 11; # resolution of the peaks
#$ARGV[0] is input file
#$ARGV[1] is call number
# optional -s = spread number from call
# optional -o specify output file name
my $usage = "see arguments";
my $input = shift @ARGV or die $usage;
my $call = shift @ARGV or die $usage;
my $therest = join(" ",@ARGV) . " ";
print "the rest".$therest."\n";
my $spread = 1;
my $output = $input . ".out";
if ($therest =~ /-s\s+(\d+)\s/) {$spread = $1;}
if ($therest =~ /-o\s+(.+)\s/) {$output = $1;}
# initialize master hash
my %genomehits = ();
foreach (split ';', $input) {
my $mygenename = "err_naming";
if ($_ =~ /^(.+)-/) {$mygenename = $1;}
open (INPUT, $_);
my @wiggle = <INPUT>;
&singlegene(\%genomehits, \@wiggle, $mygenename);
close (INPUT);
}
&printhits;
#print %genomehits;
sub printhits {
foreach my $key (%genomehits) {
print "key: $key , values: ";
foreach (@{$genomehits{$key}}) {
print $_ . ";";
}
print "\n";
}
}
sub singlegene {
# let %hash be the mapping hash
# let @mygene be the gene to currently process
# let $mygenename be the name of the gene to currently process
my (%hash) = %{$_[0]};
my (@mygene) = @{$_[1]};
my $mygenename = $_[2];
my $chromosome;
my $leftbound = -2;
my $rightbound = -2;
foreach (@mygene) {
#print "Doing line ". $_ . "\n";
if ($_ =~ "track" or $_ =~ "output" or $_ =~ "#") {next;}
if ($_ =~ "Step") {
if ($_ =~ /chrom=(.+)\s/) {$chromosome = $1;}
if ($_ =~ /span=(\d+)/) {$1 == 1 or die ("don't support span not equal to one, see wig spec")};
$leftbound = -2;
$rightbound = -2;
next;
}
my @line = split /\t/, $_;
my $pos = $line[0];
my $val = $line[-1];
# above threshold for a call
if ($val >= $call) {
# start of range
if ($rightbound != ($pos - 1)) {
$leftbound = $pos;
$rightbound = $pos;
}
# middle of range, increment rightbound
else {
$rightbound = $pos;
}
if (\$_ =~ $mygene[-1]) {goto FORTHELASTONE;}
}
# else reinitialize: not a call
else {
FORTHELASTONE:
# typical case, in an ocean of OFFs
if ($rightbound != ($pos-1)) {
$leftbound = $pos;
}
else {
# register the range
my $range = $rightbound - $leftbound;
for ($spread) {
$leftbound -= $len;
$rightbound += $len;
}
#print $range . "\n";
foreach ($leftbound .. $rightbound) {
my $key = "$chromosome:$_";
if (not defined $hash{$key}) {
$hash{$key} = [$mygenename];
}
else { push @{$hash{$key}}, $mygenename; }
}
}
}
}
}
You are passing a reference to %genomehits to the function singlegene, and then copying it into a new hash when you do my (%hash) = %{$_[0]};. You then add values to %hash which goes away at the end of the function.
To fix it, use the reference directly with arrow notation. E.g.
my $hash = $_[0];
...
$hash->{$key} = yadda yadda;
I think it's this line:
my (%hash) = %{$_[0]};
You're passing in a reference, but this statement is making a copy of your hash. All additions you make in singlegene are then lost when you return.
Leave it as a hash reference and it should work.
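To make the difference concrete, here is a small self-contained sketch of filling a hash of lists through a reference; the names add_hit, chr1:100, geneA and geneB are invented for the illustration:
use strict;
use warnings;

# Pushes a name onto the list stored under $key via the hash reference,
# so it is the caller's hash that gets updated.
sub add_hit {
    my ($hash, $key, $name) = @_;
    push @{ $hash->{$key} }, $name;   # autovivifies the array ref on first use
}

my %genomehits;
add_hit(\%genomehits, 'chr1:100', 'geneA');
add_hit(\%genomehits, 'chr1:100', 'geneB');
print "chr1:100 -> @{ $genomehits{'chr1:100'} }\n";   # prints: chr1:100 -> geneA geneB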
PS - Data::Dumper is your friend when large data structures are not behaving as expected. I'd sprinkle a few of these in your code...
use Data::Dumper; print Dumper \%genomehits;