I have a perl code that extract from .xls file. My .xls file is as below
NUMBER NAME ALPHABET
one Jane a
two Adam b
three Josh c
;four
five Agnes e
six Mary f
;seven
eight Lara h
I want to extract the info and only take column 1 and 2. My perl code is as below.
#!/usr/bin/perl
use warnings;
use strict;
use Spreadsheet::ParseExcel;
main ();
sub main {
my $filename = 'Book1.xls';
my $parser = Spreadsheet::ParseExcel->new();
my $workbook = $parser->parse( $filename );
if ( !defined $workbook ) {
die "-E-: cannot parse <$filename>.\n ", $parser->error(), ".\n";
}
my $worksheet = $workbook -> Worksheet ( 'a' ) || die "-E-: cannot parse family pin list.\n";
my ( $row_min, $row_max ) = $worksheet-> row_range();
open ( my $file,"> output.txt");
for my $row ( 1 .. $row_max ) {
my #data;
for my $col ( 0 ) {
my $number = $worksheet-> get_cell( $row, $col );
if ( $number ) {
push #data, $number-> value();
}
else {
push #data, '';
}
}
for my $col ( 2 ) {
my $alphabet = $worksheet->get_cell( $row, $col );
if ( $alphabet ) {
push #data, $alphabet->value();
print $file "#data\n";
}
else {
push #data, '';
}
}
}
close $file;
print "done\n";
}
The result is
one a
two b
three c
;four
five e
six f
;seven
eight h
I want to remove the entire array that start with string ";". I extend my code like below
open ( my $file,"> output.txt");
for my $row ( 1 .. $row_max ) {
my #data;
for my $col ( 0 ) {
my $number = $worksheet-> get_cell( $row, $col );
if ( $number ) {
push #data, $number-> value();
}
else {
push #data, '';
}
}
for my $col ( 11 ) {
my $alphabet = $worksheet->get_cell( $row, $col );
if ( $alphabet ) {
push #data, $alphabet->value();
}
else {
push #data, '';
}
}
my #new_data = grep(!/;/, #data);
my #latest_data = grep ( $_ ne '', #new_data);
print $file "#latest_data\n";
}
close $file;
print "done\n";
}
The output result produce like below.
one a
two b
three c
five e
six f
eight h
I don't want to be empty space. How i want to eliminate the empty space that produce result as below?
one a
two b
three c
five e
six f
eight h
I also try doing like this, but the result is same.
for my $index (reverse 0..$#data) {
if ( $data[$index] =~ /^;/ ) {
splice(#data, $index, 1);
}
}
print $file "#data\n";
I hope you are looking for next
use warnings;
use strict;
#data=("one a", "two b","three c", ";four d","five e","six e","seven g","eight h")
for(#data){
next if (/^;/);
print $_,"\n";
}
let's get your data gist
#data=("one a", "two b","three c", ";four d","five e","six e","seven g","eight h")
for(#data){
s/^;.*//; # // <-- Put Replacer here between `//` , leave it to remove line
push(#new_data,$_)}
print "$_\n" for #new_data
Related
I am new to Perl. I have excel file say "sample.xls" which looks like follows.
Sample.xls
There are about data of 1000 rows like this. I want to parse this file and write it in another file say "output.xls" with following output format.
output.xls
I have written a script in perl, however, it doesn't give me the exact output the way I want. Also, looks like the script is not very efficient. Can anyone guide me how I can improve my script as well as have my output as shown in "output.xls" ??
Here's the Script:
#!/usr/bin/perl –w
use strict;
use warnings;
use Spreadsheet::ParseExcel;
use Spreadsheet::WriteExcel;
use Spreadsheet::WriteExcel::Chart;
# Read the input and output filenames.
my $inputfile = "path/sample.xls";
my $outputfile = "path/output.xls";
if ( !$inputfile || !$outputfile ) {
die( "Couldn't find file\n" );
}
my $parser = Spreadsheet::ParseExcel->new();
my $inwb = $parser->parse( $inputfile );
if ( !defined $inwb ) {
die "Parsing error: ", $parser->error(), ".\n";
}
my $outwb = Spreadsheet::WriteExcel->new( $outputfile );
my $inws = $inwb->worksheet( "Sheet1" );
my $outws = $outwb->add_worksheet("Sheet1");
my $out_row = 0;
my ( $row_min, $row_max ) = $inws->row_range();
my ( $col_min, $col_max ) = $inws->col_range();
my $format = $outwb->add_format(
center_across => 1,
bold => 1,
size => 10,
border => 4,
color => 'black',
border_color => 'black',
align => 'vcenter',
);
$outws->write(0,0, "Item Name", $format);
$outws->write(0,1, "Spec", $format);
$outws->write(0,2, "First name", $format);
$outws->write(0,3, "Middle Name", $format);
$outws->write(0,4, "Last Name", $format);
$outws->write(0,5, "Customer Number", $format);
$outws->write(0,6, "Age", $format);
$outws->write(0,7, "Units", $format);
my $col_count = 1;
#$row_min = 1;
for my $inws ( $inwb->worksheets() ) {
my ( $row_min, $row_max ) = $inws->row_range();
my ( $col_min, $col_max ) = $inws->col_range();
for my $in_row ( 2 .. $row_max ) {
for my $col ( 0 .. 0 ) {
my $cell = $inws->get_cell( $in_row, $col);
my #fields = split /_/, $cell->value();
next unless $cell;
$outws->write($in_row,$col, $cell->value());
$outws->write($in_row,$col+1, $fields[1]);
}
}
for my $in_row ( 2 .. $row_max ) {
for my $col ( 1 .. 1 ) {
my $cell = $inws->get_cell( $in_row, $col);
my #fields = split /_/, $cell->value();
next unless $cell;
#$outws->write($in_row,$col+1, $cell->value());
$outws->write($in_row,$col+1, $fields[0]);
$outws->write($in_row,$col+2, $fields[1]);
$outws->write($in_row,$col+3, $fields[2]);
$outws->write($in_row,$col+4, $fields[3]);
}
}
for my $in_row ( 2 .. $row_max ) {
for my $col ( 2 .. 2 ) {
my $cell = $inws->get_cell( $in_row, $col);
my #fields = split /_/, $cell->value();
next unless $cell;
$outws->write($in_row,6, $cell->value());
}
}
for my $in_row ( 2 .. $row_max ) {
for my $col ( 3 .. 9 ) {
my $cell = $inws->get_cell( $in_row, $col);
next unless $cell;
}
}
for my $in_row ( 2 .. $row_max ) {
for my $col ( 10 .. 10 ) {
my $cell = $inws->get_cell( $in_row, $col );
next unless $cell;
$outws->write($in_row,7, $cell->value());
}
}
}
To get your output sorted, you need to collect all the information first before you are writing it out. Right now, you are doing a bit of jumping back and forth between rows and columns.
Here are some changes I would make to get it sorted, and make it more efficient (to read).
Create a data structure $data outside of your loop to store all the information.
If there is only one worksheet, you don't need to loop over sheets. Just work with one sheet.
Loop over the lines.
Inside that loop, use the code you have to parse the individual fields to just parse them. No 2..2 loops. Just a bunch of statements.
my #item_fields = split /_/, $inws->get_cell( $in_row, 0 ) || q{};
my #name_fields = split /_/, $inws->get_cell( $in_row, $col ) || q{};
Store them in $data per item.
push #{ $data } = [ $item_fields[0], ... ];
Done with the loop. Open the output file.
Loop over $data with a sort and write to the output file.
foreach my $row (sort { $a->[0] cmp $b->[0] } #{ $data } ) { ... }
Done.
I suggest you read up on sort and also check out perlref and perlreftut to learn more about references (data structures).
I am very new at perl and had discovered the solution at:
Perl: Compare Two CSV Files and Print out differences
I have gone through dozens of other solutions and this comes closest, except that instead of finding the differences between 2 CSV files, I want to find where the second CSV file matches the first one in column and row. How could I modify the following script to find the matches in column/row instead of the differences. I am hoping to dissect this code and learn arrays from there, but wanted to find out the solution to this application. Much thanks.
use strict;
my #arr1;
my #arr2;
my $a;
open(FIL,"a.txt") or die("$!");
while (<FIL>)
{chomp; $a=$_; $a =~ s/[\t;, ]*//g; push #arr1, $a if ($a ne '');};
close(FIL);
open(FIL,"b.txt") or die("$!");
while (<FIL>)
{chomp; $a=$_; $a =~ s/[\t;, ]*//g; push #arr2, $a if ($a ne '');};
close(FIL);
my %arr1hash;
my %arr2hash;
my #diffarr;
foreach(#arr1) {$arr1hash{$_} = 1; }
foreach(#arr2) {$arr2hash{$_} = 1; }
foreach $a(#arr1)
{
if (not defined($arr2hash{$a}))
{
push #diffarr, $a;
}
}
foreach $a(#arr2)
{
if (not defined($arr1hash{$a}))
{
push #diffarr, $a;
}
}
print "Diff:\n";
foreach $a(#diffarr)
{
print "$a\n";
}
# You can print to a file instead, by: print FIL "$a\n";
ok, I realize that this was more what I was looking for:
use strict;
use warnings;
use feature qw(say);
use autodie;
use constant {
FILE_1 => "file1.txt",
FILE_2 => "file2.txt",
};
#
# Load Hash #1 with value from File #1
#
my %hash1;
open my $file1_fh, "<", FILE_1;
while ( my $value = <$file1_fh> ) {
chomp $value;
$hash1{$value} = 1;
}
close $file1_fh;
#
# Load Hash #2 with value from File #2
#
my %hash2;
open my $file2_fh, "<", FILE_2;
while ( my $value = <$file2_fh> ) {
chomp $value;
$hash2{$value} = 1;
}
close $file2_fh;
Now I want to search file2's hash to check if there are ANY matches from file1's hash. That is where I am stuck
With new code suggestion, code now looks like this
#!/usr/bin/env perl
use strict;
use warnings;
use feature qw(say);
use autodie;
use constant {
FILE_1 => "masterlist.csv",
FILE_2 => "pastebin.csv",
};
#
# Load Hash #1 with value from File #1
#
my %hash1;
open my $file1_fh, "<", FILE_1;
while ( my $value = <$file1_fh> ) {
chomp $value;
$hash1{$value} = 1;
}
close $file1_fh;
my %hash2;
open my $file2_fh, "<", FILE_2;
while ( my $value = <$file2_fh> ) {
chomp $value;
if ( $hash1{$value} ) {
print "Match found $value\n";
$hash2{$value}++;
}
}
close $file2_fh;
print "Matches found:\n";
foreach my $key ( keys %hash2 ) {
print "$key found $hash2{$key} times\n";
}
I updated one part with split() and it seems to work, but have to test more to confirm if it fits the solution I'm looking for or I have more work to do one it
#
# Load Hash #1 with value from File #1
#
my %hash1;
open my $file1_fh, "<", FILE_1;
while ( my $value = <$file1_fh> ) {
chomp $value;
$hash1{$value} = ( %hash1, (split(/,/, $_))[1,2] );
}
close $file1_fh;
So, with your code there - you've read in 'file1' to a hash.
Why not instead of reading file 2 into a hash, do instead:
my %hash2;
open my $file2_fh, "<", FILE_2;
while ( my $value = <$file2_fh> ) {
chomp $value;
if ( $hash1{$value} ) {
print "Match found $value\n";
$hash2{$value}++;
}
}
close $file2_fh;
print "Matches found:\n";
foreach my $key ( keys %hash2 ) {
print "$key found $hash2{$key} times\n";
}
I think this code identifies every place that a data field in file A matches a data field in file B (at least it does on my limited test data):
use strict;
use warnings;
my #arr1;
my #arr2;
# a.txt -> #arr1
my $file_a_name = "poster_a.txt";
open(FIL,$file_a_name) or die("$!");
my $a_line_counter = 0;
while (my $a_line = <FIL>)
{
$a_line_counter = $a_line_counter + 1;
chomp($a_line);
my #fields = (split /,/,$a_line);
my $num_fields = scalar(#fields);
s{^\s+|\s+$}{}g foreach #fields;
push #arr1, \#fields if ( $num_fields ne 0);
};;
close(FIL);
my $file_b_name = "poster_b.txt";
open(FIL,$file_b_name) or die("$!");
while (my $b_line = <FIL>)
{
chomp($b_line);
my #fields = (split /,/,$b_line);
my $num_fields = scalar(#fields);
s{^\s+|\s+$}{}g foreach #fields;
push #arr2, \#fields if ( $num_fields ne 0)
};
close(FIL);
# b.txt -> #arr2
#print "\n",#arr2, "\n";
my #match_array;
my $file_a_line_ctr = 1;
foreach my $file_a_line_fields (#arr1)
{
my $file_a_column_ctr = 1;
foreach my $file_a_line_field (#{$file_a_line_fields})
{
my $file_b_line_ctr = 1;
foreach my $file_b_line_fields(#arr2)
{
my $file_b_column_ctr = 1;
foreach my $file_b_field (#{$file_b_line_fields})
{
if ( $file_b_field eq $file_a_line_field )
{
my $match_info =
"$file_a_name line $file_a_line_ctr column $file_a_column_ctr" .
" (${file_a_line_field}) matches: " .
"$file_b_name line $file_b_line_ctr column $file_b_column_ctr ";
push(#match_array, $match_info);
print "$match_info \n";
}
$file_b_column_ctr = $file_b_column_ctr + 1;
}
$file_b_line_ctr = $file_b_line_ctr + 1;
}
$file_a_column_ctr = $file_a_column_ctr + 1;
}
$file_a_line_ctr = $file_a_line_ctr + 1;
}
print "there were ", scalar(#match_array)," matches\n";
I'm in the process of learning how to use perl for genomics applications. I am trying to clean up paired end reads (1 forward, 1 reverse). These are stored in 2 files, but the lines match. What I'm having trouble doing is getting the relevant subroutines to read from the second file (the warnings I get are for uninitialized values).
These files are set up in 4 line blocks(fastq) where the first line is a run ID, 2nd is a sequence, 3rd is a "+", and the fourth holds quality values for the sequence in line 2.
I had no real trouble with this code when it was applied only for one file, but I think I'm misunderstanding how to handle multiple files.
Any guidance is much appreciated!
My warning in this scenario is as such : Use of uninitialized value $thisline in subtraction (-) at ./pairedendtrim.pl line 137, line 4.
#!/usr/bin/perl
#pairedendtrim.pl by AHU
use strict;
use warnings;
die "usage: readtrimmer.pl <file1> <file2> <nthreshold> " unless #ARGV == 3;
my $nthreshold = "$ARGV[2]";
open( my $fastq1, "<", "$ARGV[0]" );
open( my $fastq2, "<", "$ARGV[1]" );
my #forline;
my #revline;
while ( not eof $fastq2 and not eof $fastq1 ) {
chomp $fastq1;
chomp $fastq2;
$forline[0] = <$fastq1>;
$forline[1] = <$fastq1>;
$forline[2] = <$fastq1>;
$forline[3] = <$fastq1>;
$revline[0] = <$fastq2>;
$revline[1] = <$fastq2>;
$revline[2] = <$fastq2>;
$revline[3] = <$fastq2>;
my $ncheckfor = removen( $forline[1] );
my $ncheckrev = removen( $revline[1] );
my $fortest = 0;
if ( $ncheckfor =~ /ok/ ) { $fortest = 1 }
my $revtest = 0;
if ( $ncheckrev =~ /ok/ ) { $revtest = 1 }
if ( $fortest == 1 and $revtest == 1 ) { print "READ 1 AND READ 2" }
if ( $fortest == 1 and $revtest == 0 ) { print "Read 1 only" }
if ( $fortest == 0 and $revtest == 1 ) { print "READ 2 only" }
}
sub removen {
my ($thisline) = $_;
my $ntotal = 0;
for ( my $i = 0; $i < length($thisline) - 1; $i++ ) {
my $pos = substr( $thisline, $i, 1 );
#print "$pos\n";
if ( $pos =~ /N/ ) { $ntotal++ }
}
my $nout;
if ( $ntotal <= $nthreshold ) #threshold for N
{
$nout = "ok";
} else {
$nout = "bad";
}
return ($nout);
}
The parameters to a subroutine are in #_, not $_
sub removen {
my ($thisline) = #_;
I have a few other tips for you as well:
use autodie; anytime that you're doing file processing.
Assign the values in #ARGV to variables first thing. This quickly documents what the hold.
Do not chomp a file handle. This does not do anything. Instead apply chomp to the values returned from reading.
Do not use the strings ok and bad as boolean values.
tr can be used to count the number times a character is in a string.
The following is a cleaned up version of your code:
#!/usr/bin/perl
#pairedendtrim.pl by AHU
use strict;
use warnings;
use autodie;
die "usage: readtrimmer.pl <file1> <file2> <nthreshold> " unless #ARGV == 3;
my ( $file1, $file2, $nthreshold ) = #ARGV;
open my $fh1, '<', $file1;
open my $fh2, '<', $file2;
while ( not eof $fh2 and not eof $fh1 ) {
chomp( my #forline = map { scalar <$fh1> } ( 1 .. 4 ) );
chomp( my #revline = map { scalar <$fh2> } ( 1 .. 4 ) );
my $ncheckfor = removen( $forline[1] );
my $ncheckrev = removen( $revline[1] );
print "READ 1 AND READ 2" if $ncheckfor and $ncheckrev;
print "Read 1 only" if $ncheckfor and !$ncheckrev;
print "READ 2 only" if !$ncheckfor and $ncheckrev;
}
sub removen {
my ($thisline) = #_;
my $ntotal = $thisline =~ tr/N/N/;
return $ntotal <= $nthreshold; #threshold for N
}
So I have an array like this:
W,X,Y,Z
-7,6,101,15
-7,6,101,16
-7,6,101,17
-7,6,101,78
-7,6,101,79
-7,6,101,84
-7,6,101,92
-7,9,615,49
-7,9,615,50
-7,10,759,38
-7,10,759,39
Now, I want to print a line when W=X=Y and Z are consecutive numbers.
Expected Output:
W,X,Y,Z
-7,6,101,15-16-17
-7,6,101,78-79
-7,6,101,84
-7,6,101,92
-7,9,615,49-50
-7,10,759,38-39
How do I implement this on Perl?
Thanks,
Art
Here is my script:
while ( $output_line = <FILE_C> ) {
chomp $output_line;
my ( $W, $X, $Y, $C, $D, $E, $F, $Z ) = ( split /\s/, $output_line );
if ( $Y == $Block_previous ) {
print("Yes\t$Block_previous\t$Y\t$Z\n");
push( #Z_array, $Z );
push( #Y_array, $Y );
next;
}
else {
push( #Z_array_nonblkmatch, $Z );
}
foreach $Z_printer (#Z_array) {
print("$Y_array[0]\t$Z_printer\n");
if ( ( $Z_delta == 1 ) || ( $Z_delta == -1 ) ) {
push( #Z_adj, $Z_printer, $Z_printer_prev );
#~ print ("pair: $Z_printer_prev-$Z_printer\n");
}
else {
#~ print ("$Z_printer\n");
}
$Z_printer_prev = $Z_printer;
}
#Z_adj = ();
#Z_array = ();
#Y_array = ();
#Z_array_nonblkmatch = ();
$Block_previous = $Y;
#~ <STDIN>;
}
close(FILE_C);
Thanks, raina77ow! However, this is what the output look like:
-7,6,101,15-16-17-79
-7,6,101,16-17-79
-7,6,101,17-79
-7,6,101,78-79
-7,6,101,79-50
-7,6,101,84-50
-7,6,101,92
-7,6,615,49-50-39
-7,6,615,50
One possible approach (ideone demo):
use warnings;
use strict;
my $prev;
while (<DATA>) {
chomp;
next unless /\S/;
my #numbers = split /,/;
if (defined $prev && $numbers[3] == $prev + 1) {
print '-' . ++$prev;
next;
}
print "\n" if defined $prev;
print join ',', #numbers;
$prev = $numbers[3];
}
__DATA__
-7,6,101,15
-7,6,101,16
-7,6,101,17
-7,6,101,78
-7,6,101,79
-7,6,101,84
-7,6,101,92
-7,9,615,49
-7,9,615,50
-7,10,759,38
-7,10,759,39
I choose not to collect this data into intermediate array, as you did, as the question was simple: print it grouped. The key is storing the value of the last (Z) column, then checking each new line against it: if it matches, you print just the incremented value (that's what print '-' . ++$prev line for), if not, you end this line (for all but the first case) and start a new one with the numbers of this line.
I'm not sure exactly how to explain this, so I'll just start with an example.
Given the following data:
Apple
Apricot
Blackberry
Blueberry
Cherry
Crabapple
Cranberry
Elderberry
Grapefruit
Grapes
Kiwi
Mulberry
Nectarine
Pawpaw
Peach
Pear
Plum
Raspberry
Rhubarb
Strawberry
I want to generate an index based on the first letter of my data, but I want the letters grouped together.
Here is the frequency of the first letters in the above dataset:
2 A
2 B
3 C
1 E
2 G
1 K
1 M
1 N
4 P
2 R
1 S
Since my example data set is small, let's just say that the maximum number to combine the letters together is 3. Using the data above, this is what my index would come out to be:
A B C D-G H-O P Q-Z
Clicking the "D-G" link would show:
Elderberry
Grapefruit
Grapes
In my range listing above, I am covering the full alphabet - I guess that is not completely neccessary - I would be fine with this output as well:
A B C E-G K-N P R-S
Obviously my dataset is not fruit, I will have more data (around 1000-2000 items), and my "maximum per range" will be more than 3.
I am not too worried about lopsided data either - so if I 40% of my data starts with an "S", then S will just have its own link - I don't need to break it down by the second letter in the data.
Since my dataset won't change too often, I would be fine with a static "maximum per range", but it would be nice to have that calculated dynamically too. Also, the dataset will not start with numbers - it is guaranteed to start with a letter from A-Z.
I've started building the algorithm for this, but it keeps getting so messy I start over. I don't know how to search google for this - I'm not sure what this method is called.
Here is what I started with:
#!/usr/bin/perl
use strict;
use warnings;
my $index_frequency = { map { ( $_, 0 ) } ( 'A' .. 'Z' ) };
my $ranges = {};
open( $DATASET, '<', 'mydata' ) || die "Cannot open data file: $!\n";
while ( my $item = <$DATASET> ) {
chomp($item);
my $first_letter = uc( substr( $item, 0, 1 ) );
$index_frequency->{$first_letter}++;
}
foreach my $letter ( sort keys %{$index_frequency} ) {
if ( $index_frequency->{$letter} ) {
# build $ranges here
}
}
My problem is that I keep using a bunch of global variables to keep track of counts and previous letters examined - my code gets very messy very fast.
Can someone give me a step in the right direction? I guess this is more of an algorithm question, so if you don't have a way to do this in Perl, pseudo code would work too, I guess - I can convert it to Perl.
Thanks in advance!
Basic approach:
#!/usr/bin/perl -w
use strict;
use autodie;
my $PAGE_SIZE = 3;
my %frequencies;
open my $fh, '<', 'data';
while ( my $l = <$fh> ) {
next unless $l =~ m{\A([a-z])}i;
$frequencies{ uc $1 }++;
}
close $fh;
my $current_sum = 0;
my #letters = ();
my #pages = ();
for my $letter ( "A" .. "Z" ) {
my $letter_weigth = ( $frequencies{ $letter } || 0 );
if ( $letter_weigth + $current_sum > $PAGE_SIZE ) {
if ( $current_sum ) {
my $title = $letters[ 0 ];
$title .= '-' . $letters[ -1 ] if 1 < scalar #letters;
push #pages, $title;
}
$current_sum = $letter_weigth;
#letters = ( $letter );
next;
}
push #letters, $letter;
$current_sum += $letter_weigth;
}
if ( $current_sum ) {
my $title = $letters[ 0 ];
$title .= '-' . $letters[ -1 ] if 1 < scalar #letters;
push #pages, $title;
}
print "Pages : " . join( " , ", #pages ) . "\n";
Problem with it is that it outputs (from your data):
Pages : A , B , C-D , E-J , K-O , P , Q-Z
But I would argue this is actually good approach :) And you can always change the for loop into:
for my $letter ( sort keys %frequencies ) {
if you need.
Here's my suggestion:
# get the number of instances of each letter
my %count = ();
while (<FILE>)
{
$count{ uc( substr( $_, 0, 1 ) ) }++;
}
# transform the list of counts into a map of count => letters
my %freq = ();
while (my ($letter, $count) = each %count)
{
push #{ $freq{ $count } }, $letter;
}
# now print out the list of letters for each count (or do other appropriate
# output)
foreach (sort keys %freq)
{
my #sorted_letters = sort #{ $freq{$_} };
print "$_: #sorted_letters\n";
}
Update: I think that I misunderstood your requirements. The following code block does something more like what you want.
my %count = ();
while (<FILE>)
{
$count{ uc( substr( $_, 0, 1 ) ) }++;
}
# get the maximum frequency
my $max_freq = (sort values %count)[-1];
my $curr_set_count = 0;
my #curr_set = ();
foreach ('A' .. 'Z') {
push #curr_set, $_;
$curr_set_count += $count{$_};
if ($curr_set_count >= $max_freq) {
# print out the range of the current set, then clear the set
if (#curr_set > 1)
print "$curr_set[0] - $curr_set[-1]\n";
else
print "$_\n";
#curr_set = ();
$curr_set_count = 0;
}
}
# print any trailing letters from the end of the alphabet
if (#curr_set > 1)
print "$curr_set[0] - $curr_set[-1]\n";
else
print "$_\n";
Try something like that, where frequency is the frequency array you computed at the previous step and threshold_low is the minimal number of entries in a range, and threshold_high is the max. number. This should give harmonious results.
count=0
threshold_low=3
threshold_high=6
inrange=false
frequency['Z'+1]=threshold_high+1
for letter in range('A' to 'Z'):
count += frequency[letter];
if (count>=threshold_low or count+frequency[letter+1]>threshold_high):
if (inrange): print rangeStart+'-'
print letter+' '
inrange=false
count=0
else:
if (not inrange) rangeStart=letter
inrange=true
use strict;
use warnings;
use List::Util qw(sum);
my #letters = ('A' .. 'Z');
my #raw_data = qw(
Apple Apricot Blackberry Blueberry Cherry Crabapple Cranberry
Elderberry Grapefruit Grapes Kiwi Mulberry Nectarine
Pawpaw Peach Pear Plum Raspberry Rhubarb Strawberry
);
# Store the data by starting letter.
my %data;
push #{$data{ substr $_, 0, 1 }}, $_ for #raw_data;
# Set max page size dynamically, based on the average
# letter-group size (in this case, a multiple of it).
my $MAX_SIZE = sum(map { scalar #$_ } values %data) / keys %data;
$MAX_SIZE = int(1.5 * $MAX_SIZE + .5);
# Organize the data into pages. Each page is an array reference,
# with the first element being the letter range.
my #pages = (['']);
for my $letter (#letters){
my #d = exists $data{$letter} ? #{$data{$letter}} : ();
if (#{$pages[-1]} - 1 < $MAX_SIZE or #d == 0){
push #{$pages[-1]}, #d;
$pages[-1][0] .= $letter;
}
else {
push #pages, [ $letter, #d ];
}
}
$_->[0] =~ s/^(.).*(.)$/$1-$2/ for #pages; # Convert letters to range.
This is an example of how I would write this program.
#! /opt/perl/bin/perl
use strict;
use warnings;
my %frequency;
{
use autodie;
open my $data_file, '<', 'datafile';
while( my $line = <$data_file> ){
my $first_letter = uc( substr( $line, 0, 1 ) );
$frequency{$first_letter} ++
}
# $data_file is automatically closed here
}
#use Util::Any qw'sum';
use List::Util qw'sum';
# This is just an example of how to calculate a threshold
my $mean = sum( values %frequency ) / scalar values %frequency;
my $threshold = $mean * 2;
my #index;
my #group;
for my $letter ( sort keys %frequency ){
my $frequency = $frequency{$letter};
if( $frequency >= $threshold ){
if( #group ){
if( #group == 1 ){
push #index, #group;
}else{
# push #index, [#group]; # copy #group
push #index, "$group[0]-$group[-1]";
}
#group = ();
}
push #index, $letter;
}elsif( sum( #frequency{#group,$letter} ) >= $threshold ){
if( #group == 1 ){
push #index, #group;
}else{
#push #index, [#group];
push #index, "$group[0]-$group[-1]"
}
#group = ($letter);
}else{
push #group, $letter;
}
}
#push #index, [#group] if #group;
push #index, "$group[0]-$group[-1]" if #group;
print join( ', ', #index ), "\n";