So I have an array like this:
W,X,Y,Z
-7,6,101,15
-7,6,101,16
-7,6,101,17
-7,6,101,78
-7,6,101,79
-7,6,101,84
-7,6,101,92
-7,9,615,49
-7,9,615,50
-7,10,759,38
-7,10,759,39
Now, I want to print a line when W=X=Y and Z are consecutive numbers.
Expected Output:
W,X,Y,Z
-7,6,101,15-16-17
-7,6,101,78-79
-7,6,101,84
-7,6,101,92
-7,9,615,49-50
-7,10,759,38-39
How do I implement this on Perl?
Thanks,
Art
Here is my script:
while ( $output_line = <FILE_C> ) {
chomp $output_line;
my ( $W, $X, $Y, $C, $D, $E, $F, $Z ) = ( split /\s/, $output_line );
if ( $Y == $Block_previous ) {
print("Yes\t$Block_previous\t$Y\t$Z\n");
push( #Z_array, $Z );
push( #Y_array, $Y );
next;
}
else {
push( #Z_array_nonblkmatch, $Z );
}
foreach $Z_printer (#Z_array) {
print("$Y_array[0]\t$Z_printer\n");
if ( ( $Z_delta == 1 ) || ( $Z_delta == -1 ) ) {
push( #Z_adj, $Z_printer, $Z_printer_prev );
#~ print ("pair: $Z_printer_prev-$Z_printer\n");
}
else {
#~ print ("$Z_printer\n");
}
$Z_printer_prev = $Z_printer;
}
#Z_adj = ();
#Z_array = ();
#Y_array = ();
#Z_array_nonblkmatch = ();
$Block_previous = $Y;
#~ <STDIN>;
}
close(FILE_C);
Thanks, raina77ow! However, this is what the output look like:
-7,6,101,15-16-17-79
-7,6,101,16-17-79
-7,6,101,17-79
-7,6,101,78-79
-7,6,101,79-50
-7,6,101,84-50
-7,6,101,92
-7,6,615,49-50-39
-7,6,615,50
One possible approach (ideone demo):
use warnings;
use strict;
my $prev;
while (<DATA>) {
chomp;
next unless /\S/;
my #numbers = split /,/;
if (defined $prev && $numbers[3] == $prev + 1) {
print '-' . ++$prev;
next;
}
print "\n" if defined $prev;
print join ',', #numbers;
$prev = $numbers[3];
}
__DATA__
-7,6,101,15
-7,6,101,16
-7,6,101,17
-7,6,101,78
-7,6,101,79
-7,6,101,84
-7,6,101,92
-7,9,615,49
-7,9,615,50
-7,10,759,38
-7,10,759,39
I choose not to collect this data into intermediate array, as you did, as the question was simple: print it grouped. The key is storing the value of the last (Z) column, then checking each new line against it: if it matches, you print just the incremented value (that's what print '-' . ++$prev line for), if not, you end this line (for all but the first case) and start a new one with the numbers of this line.
Related
I have a perl code that extract from .xls file. My .xls file is as below
NUMBER NAME ALPHABET
one Jane a
two Adam b
three Josh c
;four
five Agnes e
six Mary f
;seven
eight Lara h
I want to extract the info and only take column 1 and 2. My perl code is as below.
#!/usr/bin/perl
use warnings;
use strict;
use Spreadsheet::ParseExcel;
main ();
sub main {
my $filename = 'Book1.xls';
my $parser = Spreadsheet::ParseExcel->new();
my $workbook = $parser->parse( $filename );
if ( !defined $workbook ) {
die "-E-: cannot parse <$filename>.\n ", $parser->error(), ".\n";
}
my $worksheet = $workbook -> Worksheet ( 'a' ) || die "-E-: cannot parse family pin list.\n";
my ( $row_min, $row_max ) = $worksheet-> row_range();
open ( my $file,"> output.txt");
for my $row ( 1 .. $row_max ) {
my #data;
for my $col ( 0 ) {
my $number = $worksheet-> get_cell( $row, $col );
if ( $number ) {
push #data, $number-> value();
}
else {
push #data, '';
}
}
for my $col ( 2 ) {
my $alphabet = $worksheet->get_cell( $row, $col );
if ( $alphabet ) {
push #data, $alphabet->value();
print $file "#data\n";
}
else {
push #data, '';
}
}
}
close $file;
print "done\n";
}
The result is
one a
two b
three c
;four
five e
six f
;seven
eight h
I want to remove the entire array that start with string ";". I extend my code like below
open ( my $file,"> output.txt");
for my $row ( 1 .. $row_max ) {
my #data;
for my $col ( 0 ) {
my $number = $worksheet-> get_cell( $row, $col );
if ( $number ) {
push #data, $number-> value();
}
else {
push #data, '';
}
}
for my $col ( 11 ) {
my $alphabet = $worksheet->get_cell( $row, $col );
if ( $alphabet ) {
push #data, $alphabet->value();
}
else {
push #data, '';
}
}
my #new_data = grep(!/;/, #data);
my #latest_data = grep ( $_ ne '', #new_data);
print $file "#latest_data\n";
}
close $file;
print "done\n";
}
The output result produce like below.
one a
two b
three c
five e
six f
eight h
I don't want to be empty space. How i want to eliminate the empty space that produce result as below?
one a
two b
three c
five e
six f
eight h
I also try doing like this, but the result is same.
for my $index (reverse 0..$#data) {
if ( $data[$index] =~ /^;/ ) {
splice(#data, $index, 1);
}
}
print $file "#data\n";
I hope you are looking for next
use warnings;
use strict;
#data=("one a", "two b","three c", ";four d","five e","six e","seven g","eight h")
for(#data){
next if (/^;/);
print $_,"\n";
}
let's get your data gist
#data=("one a", "two b","three c", ";four d","five e","six e","seven g","eight h")
for(#data){
s/^;.*//; # // <-- Put Replacer here between `//` , leave it to remove line
push(#new_data,$_)}
print "$_\n" for #new_data
In Perl, how to generate list of numbers from hyphen/comma string like:
1-8,10,12-15,23 ?
Expected output is: [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 13, 14, 15,23]
Ok, I have some solution that seems to work, can I optimize it? :
#cl_list = "0,2-7,13-16"
#cl_list = split(/,/,join(',',#cl_list));
foreach $cl (#cl_list){
if (index($cl, '-') != -1){
#range=split(/-|,/,$cl,2);
$ld=$range[0];
$ud=$range[1];
while ($ld <= $ud) {
push #list, $ld;
print "$ld\n";
$ld++;
}
}
else {
push #list, $ld;
}
}
print "list=#list\n";
I'd probably do it like this:
#!/usr/bin/env perl
use strict;
use warnings;
my $thing = '1-8,10,12-15,23';
my #values;
#split the string on commas.
for ( split /,/, $thing ) {
#split each element in `-`. ($end is undefined if no `-` present)
my ( $start, $end ) = split ( '-' );
#iterate from start to end (or start to start if end is undef)
push ( #values, $_ ) for ( $start .. $end // $start );
}
print join ",", #values;
Prints:
1,2,3,4,5,6,7,8,10,12,13,14,15,23
Here is fixed Sobrique solution:
use strict;
use warnings;
my $thing = '1-8,10,12-15,23';
my #values;
#split the string on commas.
for ( split /,/, $thing ) {
#split each element in `-`. ($end is undefined if no `-` present)
my ( $start, $end ) = split ( '-',$_ );
if (!defined $end) { $end=$start;}
#iterate from start to end (or start to start if end is undef)
push ( #values, $_ ) for ( $start..$end );
}
print join ",", #values;
I am very new at perl and had discovered the solution at:
Perl: Compare Two CSV Files and Print out differences
I have gone through dozens of other solutions and this comes closest, except that instead of finding the differences between 2 CSV files, I want to find where the second CSV file matches the first one in column and row. How could I modify the following script to find the matches in column/row instead of the differences. I am hoping to dissect this code and learn arrays from there, but wanted to find out the solution to this application. Much thanks.
use strict;
my #arr1;
my #arr2;
my $a;
open(FIL,"a.txt") or die("$!");
while (<FIL>)
{chomp; $a=$_; $a =~ s/[\t;, ]*//g; push #arr1, $a if ($a ne '');};
close(FIL);
open(FIL,"b.txt") or die("$!");
while (<FIL>)
{chomp; $a=$_; $a =~ s/[\t;, ]*//g; push #arr2, $a if ($a ne '');};
close(FIL);
my %arr1hash;
my %arr2hash;
my #diffarr;
foreach(#arr1) {$arr1hash{$_} = 1; }
foreach(#arr2) {$arr2hash{$_} = 1; }
foreach $a(#arr1)
{
if (not defined($arr2hash{$a}))
{
push #diffarr, $a;
}
}
foreach $a(#arr2)
{
if (not defined($arr1hash{$a}))
{
push #diffarr, $a;
}
}
print "Diff:\n";
foreach $a(#diffarr)
{
print "$a\n";
}
# You can print to a file instead, by: print FIL "$a\n";
ok, I realize that this was more what I was looking for:
use strict;
use warnings;
use feature qw(say);
use autodie;
use constant {
FILE_1 => "file1.txt",
FILE_2 => "file2.txt",
};
#
# Load Hash #1 with value from File #1
#
my %hash1;
open my $file1_fh, "<", FILE_1;
while ( my $value = <$file1_fh> ) {
chomp $value;
$hash1{$value} = 1;
}
close $file1_fh;
#
# Load Hash #2 with value from File #2
#
my %hash2;
open my $file2_fh, "<", FILE_2;
while ( my $value = <$file2_fh> ) {
chomp $value;
$hash2{$value} = 1;
}
close $file2_fh;
Now I want to search file2's hash to check if there are ANY matches from file1's hash. That is where I am stuck
With new code suggestion, code now looks like this
#!/usr/bin/env perl
use strict;
use warnings;
use feature qw(say);
use autodie;
use constant {
FILE_1 => "masterlist.csv",
FILE_2 => "pastebin.csv",
};
#
# Load Hash #1 with value from File #1
#
my %hash1;
open my $file1_fh, "<", FILE_1;
while ( my $value = <$file1_fh> ) {
chomp $value;
$hash1{$value} = 1;
}
close $file1_fh;
my %hash2;
open my $file2_fh, "<", FILE_2;
while ( my $value = <$file2_fh> ) {
chomp $value;
if ( $hash1{$value} ) {
print "Match found $value\n";
$hash2{$value}++;
}
}
close $file2_fh;
print "Matches found:\n";
foreach my $key ( keys %hash2 ) {
print "$key found $hash2{$key} times\n";
}
I updated one part with split() and it seems to work, but have to test more to confirm if it fits the solution I'm looking for or I have more work to do one it
#
# Load Hash #1 with value from File #1
#
my %hash1;
open my $file1_fh, "<", FILE_1;
while ( my $value = <$file1_fh> ) {
chomp $value;
$hash1{$value} = ( %hash1, (split(/,/, $_))[1,2] );
}
close $file1_fh;
So, with your code there - you've read in 'file1' to a hash.
Why not instead of reading file 2 into a hash, do instead:
my %hash2;
open my $file2_fh, "<", FILE_2;
while ( my $value = <$file2_fh> ) {
chomp $value;
if ( $hash1{$value} ) {
print "Match found $value\n";
$hash2{$value}++;
}
}
close $file2_fh;
print "Matches found:\n";
foreach my $key ( keys %hash2 ) {
print "$key found $hash2{$key} times\n";
}
I think this code identifies every place that a data field in file A matches a data field in file B (at least it does on my limited test data):
use strict;
use warnings;
my #arr1;
my #arr2;
# a.txt -> #arr1
my $file_a_name = "poster_a.txt";
open(FIL,$file_a_name) or die("$!");
my $a_line_counter = 0;
while (my $a_line = <FIL>)
{
$a_line_counter = $a_line_counter + 1;
chomp($a_line);
my #fields = (split /,/,$a_line);
my $num_fields = scalar(#fields);
s{^\s+|\s+$}{}g foreach #fields;
push #arr1, \#fields if ( $num_fields ne 0);
};;
close(FIL);
my $file_b_name = "poster_b.txt";
open(FIL,$file_b_name) or die("$!");
while (my $b_line = <FIL>)
{
chomp($b_line);
my #fields = (split /,/,$b_line);
my $num_fields = scalar(#fields);
s{^\s+|\s+$}{}g foreach #fields;
push #arr2, \#fields if ( $num_fields ne 0)
};
close(FIL);
# b.txt -> #arr2
#print "\n",#arr2, "\n";
my #match_array;
my $file_a_line_ctr = 1;
foreach my $file_a_line_fields (#arr1)
{
my $file_a_column_ctr = 1;
foreach my $file_a_line_field (#{$file_a_line_fields})
{
my $file_b_line_ctr = 1;
foreach my $file_b_line_fields(#arr2)
{
my $file_b_column_ctr = 1;
foreach my $file_b_field (#{$file_b_line_fields})
{
if ( $file_b_field eq $file_a_line_field )
{
my $match_info =
"$file_a_name line $file_a_line_ctr column $file_a_column_ctr" .
" (${file_a_line_field}) matches: " .
"$file_b_name line $file_b_line_ctr column $file_b_column_ctr ";
push(#match_array, $match_info);
print "$match_info \n";
}
$file_b_column_ctr = $file_b_column_ctr + 1;
}
$file_b_line_ctr = $file_b_line_ctr + 1;
}
$file_a_column_ctr = $file_a_column_ctr + 1;
}
$file_a_line_ctr = $file_a_line_ctr + 1;
}
print "there were ", scalar(#match_array)," matches\n";
Hey All below is my code:
#!/bin/usr/perl
#words = ();
#dup = ();
print "Please enter your query sentence:\n";
$input = <STDIN>;
chomp $input;
#words = split(/ /, $input);
#sort = sort { "\L$a" cmp "\L$b" } #words;
for $n ( 0..$#sort ) {
if (lc $sort[$n] eq lc $sort[($n+1)]) {
push (#dup, $sort[$n]);
}
else {
$n+=1;
}
}
if ( #dup == () ) {
print "There are no duplicates in the query sentence.\n";
}
else {
print "#dup \n";
}
The problem I am having is if the word appears more than twice it appears in #dup for each additional occurrence. How do I fix this? Thanks in advance for your help!
It would be much simpler to use a Hash, which will still allow you to track duplicates and saves you from needing to presort your Array:
use strict;
use warnings;
print "Please enter your query sentence:\n";
my $input = <STDIN>;
chomp $input;
my #words = split /\s+/, $input;
my %unique_elems;
for my $word ( #words ) {
$unique_elems{$word}++;
}
if ( scalar keys %unique_elems == scalar #words ) {
print "There are no duplicates in the query sentence.\n";
}
else {
my #dups = grep { $unique_elem{$_} > 1 } keys %unique_elems;
print join ',', #dups;
print "\n";
}
A couple notes:
You should always use strict; use warnings; at the top of your scripts, it will save you more time than you would think
You should be using my to declare lexical variables instead of declaring globals
print "Please enter your query sentence:\n";
$input = <STDIN>;
chomp $input;
my #words = split /\s+/, $input;
my %words;
foreach my $word(#words) {
$words{lc($word)}++;
}
my #dup = grep {$words{$_} > 1} keys %words;
if (#dup == 0) {
print "There are no duplicates in the query sentence.\n";
}
else {
print "#dup \n";
}
Use this:
#!/bin/usr/perl
#words = ();
#dup = ();
print "Please enter your query sentence:\n";
$input = <STDIN>;
chomp $input;
#words = split(/ /, $input);
#sort= sort {"\L$a" cmp "\L$b"}#words;
%hash;
for $n (0..$#sort) {
if (lc $sort[$n] eq lc $sort[($n+1)]) {
if(!defined $hash{$sort[$n]}) {
push (#dup, $sort[$n]);
$hash{$sort[$n]}=1;
}
}
else {$n+=1;}
}
if (#dup==()){print "There are no duplicates in the query sentence.\n";}
else {print "#dup \n";}
just defined an additional hash %hash to keep the unique entries in the array.
I'm not sure exactly how to explain this, so I'll just start with an example.
Given the following data:
Apple
Apricot
Blackberry
Blueberry
Cherry
Crabapple
Cranberry
Elderberry
Grapefruit
Grapes
Kiwi
Mulberry
Nectarine
Pawpaw
Peach
Pear
Plum
Raspberry
Rhubarb
Strawberry
I want to generate an index based on the first letter of my data, but I want the letters grouped together.
Here is the frequency of the first letters in the above dataset:
2 A
2 B
3 C
1 E
2 G
1 K
1 M
1 N
4 P
2 R
1 S
Since my example data set is small, let's just say that the maximum number to combine the letters together is 3. Using the data above, this is what my index would come out to be:
A B C D-G H-O P Q-Z
Clicking the "D-G" link would show:
Elderberry
Grapefruit
Grapes
In my range listing above, I am covering the full alphabet - I guess that is not completely neccessary - I would be fine with this output as well:
A B C E-G K-N P R-S
Obviously my dataset is not fruit, I will have more data (around 1000-2000 items), and my "maximum per range" will be more than 3.
I am not too worried about lopsided data either - so if I 40% of my data starts with an "S", then S will just have its own link - I don't need to break it down by the second letter in the data.
Since my dataset won't change too often, I would be fine with a static "maximum per range", but it would be nice to have that calculated dynamically too. Also, the dataset will not start with numbers - it is guaranteed to start with a letter from A-Z.
I've started building the algorithm for this, but it keeps getting so messy I start over. I don't know how to search google for this - I'm not sure what this method is called.
Here is what I started with:
#!/usr/bin/perl
use strict;
use warnings;
my $index_frequency = { map { ( $_, 0 ) } ( 'A' .. 'Z' ) };
my $ranges = {};
open( $DATASET, '<', 'mydata' ) || die "Cannot open data file: $!\n";
while ( my $item = <$DATASET> ) {
chomp($item);
my $first_letter = uc( substr( $item, 0, 1 ) );
$index_frequency->{$first_letter}++;
}
foreach my $letter ( sort keys %{$index_frequency} ) {
if ( $index_frequency->{$letter} ) {
# build $ranges here
}
}
My problem is that I keep using a bunch of global variables to keep track of counts and previous letters examined - my code gets very messy very fast.
Can someone give me a step in the right direction? I guess this is more of an algorithm question, so if you don't have a way to do this in Perl, pseudo code would work too, I guess - I can convert it to Perl.
Thanks in advance!
Basic approach:
#!/usr/bin/perl -w
use strict;
use autodie;
my $PAGE_SIZE = 3;
my %frequencies;
open my $fh, '<', 'data';
while ( my $l = <$fh> ) {
next unless $l =~ m{\A([a-z])}i;
$frequencies{ uc $1 }++;
}
close $fh;
my $current_sum = 0;
my #letters = ();
my #pages = ();
for my $letter ( "A" .. "Z" ) {
my $letter_weigth = ( $frequencies{ $letter } || 0 );
if ( $letter_weigth + $current_sum > $PAGE_SIZE ) {
if ( $current_sum ) {
my $title = $letters[ 0 ];
$title .= '-' . $letters[ -1 ] if 1 < scalar #letters;
push #pages, $title;
}
$current_sum = $letter_weigth;
#letters = ( $letter );
next;
}
push #letters, $letter;
$current_sum += $letter_weigth;
}
if ( $current_sum ) {
my $title = $letters[ 0 ];
$title .= '-' . $letters[ -1 ] if 1 < scalar #letters;
push #pages, $title;
}
print "Pages : " . join( " , ", #pages ) . "\n";
Problem with it is that it outputs (from your data):
Pages : A , B , C-D , E-J , K-O , P , Q-Z
But I would argue this is actually good approach :) And you can always change the for loop into:
for my $letter ( sort keys %frequencies ) {
if you need.
Here's my suggestion:
# get the number of instances of each letter
my %count = ();
while (<FILE>)
{
$count{ uc( substr( $_, 0, 1 ) ) }++;
}
# transform the list of counts into a map of count => letters
my %freq = ();
while (my ($letter, $count) = each %count)
{
push #{ $freq{ $count } }, $letter;
}
# now print out the list of letters for each count (or do other appropriate
# output)
foreach (sort keys %freq)
{
my #sorted_letters = sort #{ $freq{$_} };
print "$_: #sorted_letters\n";
}
Update: I think that I misunderstood your requirements. The following code block does something more like what you want.
my %count = ();
while (<FILE>)
{
$count{ uc( substr( $_, 0, 1 ) ) }++;
}
# get the maximum frequency
my $max_freq = (sort values %count)[-1];
my $curr_set_count = 0;
my #curr_set = ();
foreach ('A' .. 'Z') {
push #curr_set, $_;
$curr_set_count += $count{$_};
if ($curr_set_count >= $max_freq) {
# print out the range of the current set, then clear the set
if (#curr_set > 1)
print "$curr_set[0] - $curr_set[-1]\n";
else
print "$_\n";
#curr_set = ();
$curr_set_count = 0;
}
}
# print any trailing letters from the end of the alphabet
if (#curr_set > 1)
print "$curr_set[0] - $curr_set[-1]\n";
else
print "$_\n";
Try something like that, where frequency is the frequency array you computed at the previous step and threshold_low is the minimal number of entries in a range, and threshold_high is the max. number. This should give harmonious results.
count=0
threshold_low=3
threshold_high=6
inrange=false
frequency['Z'+1]=threshold_high+1
for letter in range('A' to 'Z'):
count += frequency[letter];
if (count>=threshold_low or count+frequency[letter+1]>threshold_high):
if (inrange): print rangeStart+'-'
print letter+' '
inrange=false
count=0
else:
if (not inrange) rangeStart=letter
inrange=true
use strict;
use warnings;
use List::Util qw(sum);
my #letters = ('A' .. 'Z');
my #raw_data = qw(
Apple Apricot Blackberry Blueberry Cherry Crabapple Cranberry
Elderberry Grapefruit Grapes Kiwi Mulberry Nectarine
Pawpaw Peach Pear Plum Raspberry Rhubarb Strawberry
);
# Store the data by starting letter.
my %data;
push #{$data{ substr $_, 0, 1 }}, $_ for #raw_data;
# Set max page size dynamically, based on the average
# letter-group size (in this case, a multiple of it).
my $MAX_SIZE = sum(map { scalar #$_ } values %data) / keys %data;
$MAX_SIZE = int(1.5 * $MAX_SIZE + .5);
# Organize the data into pages. Each page is an array reference,
# with the first element being the letter range.
my #pages = (['']);
for my $letter (#letters){
my #d = exists $data{$letter} ? #{$data{$letter}} : ();
if (#{$pages[-1]} - 1 < $MAX_SIZE or #d == 0){
push #{$pages[-1]}, #d;
$pages[-1][0] .= $letter;
}
else {
push #pages, [ $letter, #d ];
}
}
$_->[0] =~ s/^(.).*(.)$/$1-$2/ for #pages; # Convert letters to range.
This is an example of how I would write this program.
#! /opt/perl/bin/perl
use strict;
use warnings;
my %frequency;
{
use autodie;
open my $data_file, '<', 'datafile';
while( my $line = <$data_file> ){
my $first_letter = uc( substr( $line, 0, 1 ) );
$frequency{$first_letter} ++
}
# $data_file is automatically closed here
}
#use Util::Any qw'sum';
use List::Util qw'sum';
# This is just an example of how to calculate a threshold
my $mean = sum( values %frequency ) / scalar values %frequency;
my $threshold = $mean * 2;
my #index;
my #group;
for my $letter ( sort keys %frequency ){
my $frequency = $frequency{$letter};
if( $frequency >= $threshold ){
if( #group ){
if( #group == 1 ){
push #index, #group;
}else{
# push #index, [#group]; # copy #group
push #index, "$group[0]-$group[-1]";
}
#group = ();
}
push #index, $letter;
}elsif( sum( #frequency{#group,$letter} ) >= $threshold ){
if( #group == 1 ){
push #index, #group;
}else{
#push #index, [#group];
push #index, "$group[0]-$group[-1]"
}
#group = ($letter);
}else{
push #group, $letter;
}
}
#push #index, [#group] if #group;
push #index, "$group[0]-$group[-1]" if #group;
print join( ', ', #index ), "\n";