I have an input file:
id_1 10 15 20:a:4:c
id_2 1 5 2:2:5:c
id_3 0.4 3 12:1:4:1
id_4 18 2 9:1:0/0:1
id_5 a b c:a:foo:2
I have many files of this type that I want to parse in different programs, so I want to make a function that returns a hash with easily accessible values.
I've not written a function like this before, and I'm not sure how to properly access the returned hashes. Here's what I've got so far:
Library_SO.pm
#!/usr/bin/perl
package Library_SO;

use strict;
use warnings;

# Parse a whitespace-separated file of "id start stop a:b:c:d" records.
# Returns two hash references:
#   \%SVs  - id => [ start, stop, score ]
#   \%info - id => [ colon-separated score fields ]
sub tum_norm_gt {
    my $file = shift;
    open my $in, '<', $file or die $!;

    my %SVs;
    my %info;

    while (<$in>) {
        chomp;
        my ($id, $start, $stop, $score) = split;
        my @vals = (split)[1..2];
        my @score_fields = split(/:/, $score);
        $SVs{$id} = [ $start, $stop, $score ];
        push @{ $info{$id} }, @score_fields;
    }
    return (\%SVs, \%info);
}

1;
And my main script:
get_vals.pl
#!/usr/bin/perl
use Library_SO;
use strict;
use warnings;
use feature qw/ say /;
use Data::Dumper;

my $file = shift or die $!;

my ($SVs, $info) = Library_SO::tum_norm_gt($file);

print Dumper \%$SVs;
print Dumper \%$info;

# for (keys %$SVs){
#     say;
#     my @vals = @{$SVs{$_}};   <- line 20
# }
I call this with:
perl get_vals.pl test_in.txt
The Dumper output is what I was hoping for, but when I try to iterate over the returned hash(?) and access the values (e.g. as in the commented out section) I get:
Global symbol "%SVs" requires explicit package name at get_vals.pl line 20.
Execution of get_vals.pl aborted due to compilation errors.
Have I got this totally upside down?
Your library function returns two hashrefs. If you now want to access the values you'll have to dereference the hashref:
my ($SVs, $info) = Library_SO::tum_norm_gt($file);

#print Dumper \%$SVs;
# Easier and better readable:
print Dumper $SVs;

#print Dumper \%$info;
# Easier and better readable:
print Dumper $info;

for (keys %{ $SVs }) {              # Better visual dereferencing
    say;
    my @vals = @{ $SVs->{$_} };     # it is not $SVs{..} but $SVs->{...}
}
Related
I have a file:
434462PW1 5
76252PPP8 5,714.79
76252PMB2 16,950.17
76252PRC5 25,079.70
76252PNY1 30,324.50
62630WCQ8 1.09
62630WCZ8 1.09
62630WBX4 36,731.90
62630WCQ8 1.07
62630WCZ8 1.07
76252PGB9 1.07
62630WBN6 1.07
62630WBA4 1.07
I need the commas stripped out of the second value, and a comma added between the 1st and 2nd values.
434462PW1,5
76252PPP8,5714.79
76252PMB2,16950.17
76252PRC5,25079.70
76252PNY1,30324.50
62630WCQ8,1.09
62630WCZ8,1.09
62630WBX4,36731.90
62630WCQ8,1.07
62630WCZ8,1.07
76252PGB9,1.07
62630WBN6,1.07
62630WBA4,1.07
Here is the code. I'm having trouble stripping just the number values.
#!/usr/bin/perl
use strict;
use warnings;

open my $handle, '<', "foofile";
chomp(my @positionArray = <$handle>);

foreach my $pos (@positionArray) {
    if ($pos =~ /(\w{9})\s+(.*)/) {
        if ($2 =~ /,/) {
            my $without = $2 =~ s/,//g;
            print "$1,$without\n";
        }
    }
}
Since commas only appear in the 2nd column, you can simply delete all commas from each line. Also, since whitespace only exists between your 2 columns, you can then replace all space with a comma.
foreach my $pos (@positionArray) {
    $pos =~ s/,//g;     # commas only occur in the 2nd column
    $pos =~ s/\s+/,/;   # the first run of whitespace becomes the separator
    print "$pos\n";
}
Another way is to solve this using the map function (the input and output are @array variables).
chomp(my @positionArray = <$handle>);
my @out = map { $_ =~ s/\,//g; $_ =~ s/\s+/,/; $_; } @positionArray;

use Data::Dumper;
print Dumper \@out;
For some inexplicable reason you made the code more complicated than it needs to be
use strict;
use warnings;
use feature 'say';

my $filename = 'foofile';

open my $fh, '<', $filename
    or die "Couldn't open $filename $!";

my @lines = <$fh>;
close $fh;

chomp @lines;   # snip eol

for (@lines) {
    my ($id, $val) = split;
    $val =~ s/,//;  # modifier 'g' might be used if value goes beyond thousands
    say "$id,$val";
}
Output
434462PW1,5
76252PPP8,5714.79
76252PMB2,16950.17
76252PRC5,25079.70
76252PNY1,30324.50
62630WCQ8,1.09
62630WCZ8,1.09
62630WBX4,36731.90
62630WCQ8,1.07
62630WCZ8,1.07
76252PGB9,1.07
62630WBN6,1.07
62630WBA4,1.07
I am trying to write a script that takes in one file and compares it to the second and then outputs the difference. I had it working, but decided I wanted to get rid of any line that starts with '#'. I had to use push as .= was not working. Ever since then I get output like
keys = GLOB(0x23d2d48)
I'm not sure what I am missing.
#!/usr/bin/perl
use warnings;
use lib '/var/www/oooOOoO/lib/Perl';

my @a1;
my @a2;
my %diff1;
my %diff2;
my @diff1;
my @diff2;

my $input_file  = "/etc/mysql/conf.d/replication-slave.cnf";
my $input_file2 = "tA.cnf";

open( my $input_fh,  "<", $input_file )  || die "Can't open $input_file: $!";
open( my $input_fh2, "<", $input_file2 ) || die "Can't open $input_file: $!";

@a1 = ' ';
for ($input_fh) {
    next if /^#/;
    push @a1, $_;
}

@a2 = ' ';
for ($input_fh2) {
    next if /^#/;
    push @a2, $_;
}

@diff1{ @a1 } = @a1;
delete @diff1{ @a2 };
# %diff1 contains elements from '@a1' that are not in '@a2'

@k = (keys %diff1);
print "keys = @k\n";
I've tried changing keys to values, but that didn't work.
Thanks
The problem lies in this bit of code:
for ($input_fh) {
    next if /^#/;
    push @a1, $_;
}
This is creating a single element list containing a filehandle, then pushing that filehandle to #a1. To read from the filehandle you need to wrap it with <>:
while (<$input_fh>) {
    next if /^#/;
    push @a1, $_;
}
Note I've switched the for to a while as for imposes list context and reads the file all at once whereas while will read one line at a time. You can also remove:
@a1 = ' ';
@a2 = ' ';
Which just adds an extra element to both arrays.
It should work, but your code's a little messy. I'm also not sure what you're trying to do when you assign @diff1{@a1} = @a1.
Try this re-write and let me know:
#!/usr/bin/perl
use strict;
use warnings;
use lib '/var/www/ooooOOooOoo/lib/Perl';

my $input_file  = "/etc/mysql/conf.d/replication-slave.cnf";
my $input_file2 = "tA.cnf";

open my $input_fh,  "<", $input_file  or die "Can't open $input_file: $!";
open my $input_fh2, "<", $input_file2 or die "Can't open $input_file: $!";

my (@a1, @a2);

while (<$input_fh>) {
    chomp;
    next if /^#/;
    push @a1, $_;
}

while (<$input_fh2>) {
    chomp;
    next if /^#/;
    push @a2, $_;
}

my %diff1;
@diff1{@a1} = @a1;   # What are you actually trying to do here?
delete @diff1{@a2};
# %diff1 contains elements from '@a1' that are not in '@a2'

my @k = (keys %diff1);
print "keys = @k\n";
But you might want to try this approach instead:
my @nums1 = qw(1 2 3 4 5);
my @nums2 = qw(one two three four 5);

my (%compare1, %compare2);

foreach (@nums1) {
    chomp;
    $compare1{$_} = 1;
}

foreach (@nums2) {
    chomp;
    $compare2{$_} = 1;
}

# print the elements of @nums1 that do not appear in @nums2
foreach my $key (keys %compare1) {
    print "$key\n" unless $compare2{$key};
}
Once you have loaded the two arrays, it's better to use a Perl CPAN module to do this kind of task. I think Array::Utils is a good candidate to achieve your goals. From the module documentation:
use Array::Utils qw(:all);

my @a = qw( a b c d );
my @b = qw( c d e f );

# symmetric difference
my @diff = array_diff(@a, @b);

# intersection
my @isect = intersect(@a, @b);

# unique union
my @unique = unique(@a, @b);

# check if arrays contain same members
if ( !array_diff(@a, @b) ) {
    # do something
}

# get items from array @a that are not in array @b
my @minus = array_minus( @a, @b );
I have 3 or multiple files I need to merge, the data looks like this..
file 1
0334.45656
0334.45678
0335.67899
file 2
0334.89765
0335.12346
0335.56789
file 3
0334.12345
0335.45678
0335.98764
Expected output in file 4,
0334.89765
0334.89765
0334.89765
0334.12345
0335.67899
0335.12346
0335.56789
0335.45678
0335.98764
So far I have tried the following, but the data in the 4th file does not come out in sorted order:
#!/usr/bin/perl
my %hash;
my $outFile = "outFile.txt";

foreach $file (@ARGV)
{
    print "$file\n";
    open (IN, "$file")       || die "cannot open file $!";
    open (OUT, ">>$outFile") || die "cannot open file $!";
    while ( <IN> )
    {
        chomp $_;
        ($timestamp, $data) = split (/\./, $_);
        $hash{$timeStamp}{'data'} = $data;
        if (defined $hash{$timeStamp})
        {
            print "$_\n";
            print OUT "$_\n";
        }
    }
}
close (IN);
close (OUT);
I wouldn't normally suggest this, but unix utilities should be able to handle this just fine.
cat the 3 files together.
use sort to sort the merged file.
However, using perl, could just do the following:
#!/usr/bin/perl
use strict;
use warnings;

my @data;
push @data, $_ while (<>);

# Because the numbers are all equal length, alpha sort will work here
print for sort @data;
However, as we've discussed, it's possible that the files will be extremely large. Therefore it will be more efficient both in memory and speed if you're able to take advantage of the fact that all the files are already sorted.
The following solution therefore streams the files, pulling out the next one in order each loop of the while:
#!/usr/bin/perl
# Could name this catsort.pl
use strict;
use warnings;
use autodie;

# Initialize File handles
my @fhs = map {open my $fh, '<', $_; $fh} @ARGV;

# First Line of each file
my @data = map {scalar <$_>} @fhs;

# Loop while a next line exists
while (@data) {
    # Pull out the next entry.
    my $index = (sort {$data[$a] cmp $data[$b]} (0..$#data))[0];
    print $data[$index];

    # Fill In next Data at index.
    if (! defined($data[$index] = readline $fhs[$index])) {
        # End of that File
        splice @fhs,  $index, 1;
        splice @data, $index, 1;
    }
}
Using Miller's idea in a more reusable way,
use strict;
use warnings;

sub get_sort_iterator {
    my @fhs = map { open my $fh, '<', $_ or die $!; $fh } @_;
    my @d;

    return sub {
        for my $i (0 .. $#fhs) {
            # skip to the next file handle if it doesn't exist or we already have a value in $d[$i]
            next if !$fhs[$i] or defined $d[$i];

            # reading from $fhs[$i] file handle was success?
            if ( defined($d[$i] = readline($fhs[$i])) ) { chomp($d[$i]) }
            # file handle at EOF, not needed any more
            else { undef $fhs[$i] }
        }
        # compare as numbers, return undef if no more data
        my ($index) = sort {$d[$a] <=> $d[$b]} grep { defined $d[$_] } 0..$#d
            or return;

        # return value from $d[$index], and set it to undef
        return delete $d[$index];
    };
}

my $iter = get_sort_iterator(@ARGV);

while (defined(my $x = $iter->())) {
    print "$x\n";
}
output
0334.12345
0334.45656
0334.45678
0334.89765
0335.12346
0335.45678
0335.56789
0335.67899
0335.98764
Suppose every input files are already in ascending order and have at least one line in them, this script could merge them in ascending order:
#!/usr/bin/perl
use warnings;
use strict;

use List::Util 'reduce';

# index of the numerically smallest element in the argument list
sub min_index {
    reduce { $_[$a] < $_[$b] ? $a : $b } 0 .. $#_;
}

my @fhs  = map { open my $fh, '<', $_; $fh } @ARGV;
my @data = map { scalar <$_> } @fhs;

while (@data) {
    my $idx = min_index(@data);
    print "$data[$idx]";
    if (! defined($data[$idx] = readline $fhs[$idx])) {
        splice @data, $idx, 1;
        splice @fhs,  $idx, 1;
    }
}
Note: this is basically the same as the second script offered by @Miller, but a bit clearer and more concise.
I suggest this solution, which uses a sorted array of hashes - each hash corresponding to an input file, and containing a file handle fh, the last line read line and the timestamp extracted from the line timestamp.
The hash at the end of the array always corresponds to the input that has the smallest value for the timestamp, so all that is necessary is to repeatedly pop the next value from the array, print its data, read the next line and (if it hasn't reached eof) insert it back into the array in sorted order.
This could produce an appreciable increase in speed over the repeated sorting of all the data for each output line that other answers use.
Note that the program expects the list of input files as parameters on the command line, and sends its merged output to STDOUT. It also assumes that the input files are already sorted.
use strict;
use warnings;
use autodie;

my @data;

for my $file (@ARGV) {
    my $item;
    open $item->{fh}, '<', $file;
    insert_item($item, \@data);
}

while (@data) {
    my $item = pop @data;
    print $item->{line};
    insert_item($item, \@data);
}

# Read the next line from $item's handle and splice the item into @$array,
# keeping the array sorted in descending timestamp order (smallest at the end).
sub insert_item {
    my ($item, $array) = @_;
    return if eof $item->{fh};
    $item->{line} = readline $item->{fh};
    ($item->{timestamp}) = $item->{line} =~ /^(\d+)/;
    my $i = 0;
    ++$i while $i < @$array and $item->{timestamp} < $array->[$i]{timestamp};
    splice @$array, $i, 0, $item;
}
output
0334.45656
0334.89765
0334.12345
0334.45678
0335.12346
0335.45678
0335.67899
0335.56789
0335.98764
So I have a text file with the following line:
123456789
But then I have a second file:
987654321
So how can I make the first file's contents the keys in a hash, and the second file's values the values? (Each character is a key/value)
Should I store each file into different arrays and then somehow merge them? How would I do that? Anything else?
Honestly, I would give you my code I have tried, but I haven't the slightest idea where to start.
You could use a hash slice.
If each line is a key/value: (s///r requires 5.14, but it can easily be rewritten for earlier versions)
my %h;
@h{ map s/\s+\z//r, <$fh1> } = map s/\s+\z//r, <$fh2>;
If each character is a key/value:
my %h;
{
    local $/ = \1;   # read one character at a time
    @h{ grep !/\n/, <$fh1> } = grep !/\n/, <$fh2>;
}
Just open both files and read them line by line simultaneously:
use strict; use warnings;
use autodie;

my %hash;

open my $keyFile,   '<', 'keyfileName';
open my $valueFile, '<', 'valuefileName';

# Read both files in lock-step: line N of the key file maps to line N of the value file.
while (my $key = <$keyFile>) {
    my $value = <$valueFile>;
    chomp for $key, $value;
    $hash{$key} = $value;
}
Of course this is just a quick sketch on how it could work.
The OP mentions that each character is a key or value, by this I take it that you mean that the output should be a hash like ( 1 => 9, 2 => 8, ... ). The OP also asks:
Should I store each file into different arrays and then somehow merge them? How would I do that?
This is exactly how this answer works. Here get_chars is a function that reads in each file, splits on every char and returns that array. Then use zip from List::MoreUtils to create the hash.
#!/usr/bin/env perl
use strict;
use warnings;

use List::MoreUtils 'zip';

my ($file1, $file2) = @ARGV;

my @file1chars = get_chars($file1);
my @file2chars = get_chars($file2);

my %hash = zip @file1chars, @file2chars;

use Data::Dumper;
print Dumper \%hash;

# Read a file and return its contents as a list of individual characters
# (newlines removed by chomp).
sub get_chars {
    my $filename = shift;
    open my $fh, '<', $filename
        or die "Could not open $filename: $!";
    my @chars;
    while (<$fh>) {
        chomp;
        push @chars, split //;
    }
    return @chars;
}
Iterator madness:
#!/usr/bin/env perl
use autodie;
use strict; use warnings;

my $keyfile_contents   = join("\n", 'A' .. 'J');
my $valuefile_contents = join("\n", map ord, 'A' .. 'E');

# Use get_iterator($keyfile, $valuefile) to read from physical files
my $each = get_iterator(\ ($keyfile_contents, $valuefile_contents) );

my %hash;
while (my ($k, $v) = $each->()) {
    $hash{ $k } = $v;
}

use YAML;
print Dump \%hash;

# Return a closure that yields one (key, value) pair per call,
# reading the two handles in lock-step; returns empty list at EOF of keys.
sub get_iterator {
    my ($keyfile, $valuefile) = @_;
    open my $keyf, '<', $keyfile;
    open my $valf, '<', $valuefile;
    return sub {
        my $key = <$keyf>;
        return unless defined $key;
        my $value = <$valf>;
        chomp for grep defined, $key, $value;
        return $key => $value;
    };
}
Output:
C:\temp> yy
---
A: 65
B: 66
C: 67
D: 68
E: 69
F: ~
G: ~
H: ~
I: ~
J: ~
I would write
my %hash = ('123456789' => '987654321');
I want to do the inverse of sort(1) : randomize every line of stdin to stdout in Perl.
I bet real Perl hackers will tear this apart, but here it goes nonetheless.
use strict;
use warnings;
use List::Util 'shuffle';

my @lines = ();
my $bufsize = 512;

while (<STDIN>) {
    push @lines, $_;
    if (@lines == $bufsize) {
        print shuffle(@lines);
        undef @lines;
    }
}
# flush whatever is left in the final, partial buffer
print shuffle(@lines);
Difference between this and the other solution:
Will not consume all the input and then randomize it (memory hog), but will randomize every $bufsize lines (not truly random and slow as a dog compared to the other option).
Uses a module which returns a new list instead of an in-place Fisher–Yates implementation. They are interchangeable (except that you would have to separate the print from the shuffle). For more information type perldoc -q rand on your shell.
This perl snippet does the trick :
#! /usr/bin/perl
# randomize cat
# fisher_yates_shuffle code copied from Perl Cookbook
# (By Tom Christiansen & Nathan Torkington; ISBN 1-56592-243-3)
use strict;

my @lines = <>;
fisher_yates_shuffle( \@lines );    # permutes @array in place
foreach my $line (@lines) {
    print $line;
}

# fisher_yates_shuffle( \@array ) : generate a random permutation
# of @array in place
sub fisher_yates_shuffle {
    my $array = shift;
    my $i;
    for ($i = @$array; --$i; ) {
        my $j = int rand ($i+1);
        next if $i == $j;
        @$array[$i,$j] = @$array[$j,$i];    # swap via array slice
    }
}
__END__
use List::Util 'shuffle';
# shuffle() returns the input lines from <> in random order
print shuffle <>
Or if you worry about last lines lacking \n,
chomp(my @lines = <>);
print "$_\n" for shuffle @lines;