I am trying to make the port range of 1-65535 allow the user to exclude certain ranges of ports from it
The input will be a comma separated single or range of port numbers, such as:
1-2,3-4,4-5,5-6,7-8,9-10,1-100
The output should be:
101-65535
I have written code to cover many cases, but for some reason the code I currently have doesn't handle the last exclusion 1-100 because 9 is the current minimum port number
Here is my code:
my #ranges;
push #ranges, '1-65535';
my $bFound = 0;
do {
$bFound = 0;
foreach my $ptrExclusion (#exclusions) {
my %exclusion = %{$ptrExclusion};
print STDERR "handling exclusion # 7964: ".Dumper(\%exclusion);
my $currentPos = 0;
foreach my $range (#ranges) {
$currentPos++;
if ($range =~ /([0-9]+)-([0-9]+)/) {
my $firstPortInRange = $1;
my $secondPortInRange = $2;
if ($secondPortInRange == $exclusion{first} and
$exclusion{second} == $exclusion{first}) {
$bFound = 1;
my #newranges;
if ($exclusion{first} - 1 > 0) {
push #newranges, "$firstPortInRange-".(sprintf("%d", $exclusion{first} - 1));
} else { # Handle port "1"
### Don't put anything, we are excluded from adding this
}
if ($currentPos > 1) {
unshift #newranges, $ranges[1..($currentPos-1)];
}
if ($currentPos + 1 < scalar(#ranges)) {
push #newranges, $ranges[($currentPos+1) .. scalar(#ranges)];
}
print STDERR "newranges # 7985: ".Dumper(\#newranges);
#ranges = #newranges;
last;
}
if ($firstPortInRange == $exclusion{first} and
$exclusion{second} == $exclusion{first}) {
$bFound = 1;
my #newranges;
if ($exclusion{second} + 1 <= 65535) {
push #newranges, (sprintf("%d", $exclusion{second} + 1))."-$secondPortInRange";
} else { # Handle port 65535
#### Don't put anything, we are excluded from adding this
}
if ($currentPos > 1) {
unshift #newranges, $ranges[1..($currentPos-1)];
}
if ($currentPos + 1 < scalar(#ranges)) {
push #newranges, $ranges[($currentPos+1) .. scalar(#ranges)];
}
print STDERR "newranges # 8005: ".Dumper(\#newranges);
#ranges = #newranges;
last;
}
if ($firstPortInRange < $exclusion{first} and
$secondPortInRange > $exclusion{second} # An exclusion is between the ranges we currently have, this doesn't include "hits" on the exact port number, i.e. excluding port 1 and port 65535
) {
print STDERR "exclusion matched # 8022\n";
$bFound = 1;
#printf (STDERR "currentPos # 7973: %d\n", $currentPos);
my #newranges;
push #newranges, "$firstPortInRange-".(sprintf("%d", $exclusion{first} - 1));
push #newranges, (sprintf("%d", $exclusion{second} + 1))."-$secondPortInRange";
if ($currentPos > 1) {
unshift #newranges, $ranges[1..($currentPos-1)];
}
if ($currentPos + 1 < scalar(#ranges)) {
push #newranges, $ranges[($currentPos+1) .. scalar(#ranges)];
}
print STDERR "newranges # 8026: ".Dumper(\#newranges);
#ranges = #newranges;
last;
}
if ($firstPortInRange >= $exclusion{first} and
$firstPortInRange < $exclusion{second} and
$secondPortInRange <= $exclusion{second} and
$secondPortInRange > $exclusion{first} # An exclusion is holding our range inside it
) {
print STDERR "exclusion matched # 8045\n";
$bFound = 1;
splice(#ranges, $currentPos-1, 1); # -1 as our index starts from 1, while #ranges index starts at 0
print STDERR "ranges # 8051: ".Dumper(\#ranges);
last;
}
}
}
if ($bFound) {
last;
}
}
} while ($bFound);
print STDERR "ranges # 7980: ".join(", ", #ranges). "\n";
#exclusions has hash elements under it with a first and second values to them which designate port A and port B (lower range and top range) their value can match if the exclusion is for 1 port.
There are a number of modules for working with sets that will make your life easier. I would recommend Set::IntSpan::Fast:
use strict;
use warnings;
use 5.010;
use Set::IntSpan::Fast;
my $ports = Set::IntSpan::Fast->new('1-65535');
my $exclude = Set::IntSpan::Fast->new('1-2,3-4,4-5,5-6,7-8,9-10,1-100');
say $ports->diff($exclude)->as_string;
Output
101-65535
The Range::Object set of modules is very comprehensive, and Range::Object::Serial does exactly what you're looking for out of the box
This short program demonstrates
use strict;
use warnings;
use v5.10;
use Range::Object::Serial;
my $range = Range::Object::Serial->new('1-65535');
say scalar $range->collapsed;
$range->remove('1-2,3-4,4-5,5-6,7-8,9-10,1-100');
say scalar $range->collapsed;
output
1-65535
101-65535
Here is an attempt, without using a library, instead just using an array (since it is not very large).
#!/usr/bin/env perl
use warnings;
use strict;
my #port_range = ( 1, 65535 );
my #ports = map { 1 } ( 0 .. $port_range[1] );
my $exclusions = '1-2,3-4,4-5,5-6,7-8,9-10,1-100';
for my $exclusion ( split /,/, $exclusions ) {
if ($exclusion =~ m|\-|) {
# Range
my ($start, $stop) = split /-/, $exclusion;
$ports[$_] = 0 for ($start..$stop);
} else {
# Single port
$ports[$exclusion] = 0;
}
}
my #good_ports = grep { $ports[$_] > 0 } ( $port_range[0] .. $port_range[1] );
my $last_good = 0;
for my $i ( 1 .. $#good_ports ) {
if ($good_ports[$i] - $good_ports[$i-1] > 1) {
# gap
print join '-', $good_ports[$last_good], $good_ports[$i-1] . "\n";
$last_good = $i;
}
}
print join '-', $good_ports[$last_good], $good_ports[$#good_ports] . "\n";
Output
101-65535
With a range as small as that you could just keep a list of every port and set flags for which are open & closed, and print out the results.
#!/usr/bin/perl
use warnings;
use strict;
# print a list of open ports, given a string of closed ports
my #ports = map { 1 } 1..65535; # start with all ports flagged as open
while (<DATA>) { # get the string of closed port numbers & ranges
chomp;
/\d/ or next; # ensure we have at least one number to work with
my #exclusions = split /,/;
for (#exclusions) {
# each exclusion is a number, optionally followed
# by a dash and another number
/^(\d+)(?:-(\d+))?$/ or next;
# set the flag to 0 for a single port or a range of ports
if ($1 and ! $2) {
$ports[$1-1] = 0; # single port
}
elsif ($1 and $2) {
#ports[$1-1..$2-1] = map {0} $1..$2; # range of ports
}
}
}
# get a list of all ports which are open
my #open_ports = map {$_ + 1} grep {$ports[$_] == 1} 0..$#ports;
# the final list of open port ranges, to be displayed
my #ranges = ();
# build up the list of open ranges
for (#open_ports) {
my $one_less = $_ - 1;
# either add this open port to the previous range,
# or start a new range with this port
#
(#ranges and $ranges[-1] =~ s/-$one_less$/-$_/)
or push #ranges, "$_-$_";
}
# fix single-number ranges for display
for (#ranges) {
s/^(\d+)-\1$/$1/;
}
# display the result
print join ',', #ranges;
__DATA__
1-2,3-4,4-5,5-6,7-8,9-10,1-100
Output:
101-65535
Related
I have a text file with the following format:
1 4730 1031782 init
4 0 6 events
2190 450 0 top
21413 5928 1 sshd
22355 1970 2009 find
I need to sum the second column of data and return that value to the user.
My current attempt reads the data from the text file like so:
sub read_file{
my $data_failed = 1;
my $file = 'task_file';
if (-z $file){print "No tasks found\n";}
if(open (INFILE, "task_file" || die "$!\n")){
my #COLUMNS = qw( memory cpu program );
my %sort_strings = ( program => sub { $a cmp $b } );
my (%process_details, %sort);
while (<INFILE>) {
$data_failed = 0;
my ($process_id, $memory_size, $cpu_time, $program_name) = split;
$process_details{$process_id} = { memory => $memory_size,
cpu => $cpu_time,
program => $program_name };
undef $sort{memory}{$memory_size}{$process_id};
undef $sort{cpu}{$cpu_time}{$process_id};
undef $sort{program}{$program_name}{$process_id};
}
for my $column ($COLUMNS[2]) {
my $cmp = $sort_strings{$column} || sub { $a <=> $b };
for my $value (sort $cmp keys %{ $sort{$column} }
) {
my #pids = keys %{ $sort{$column}{$value} };
say join ' ', $_, #{ $process_details{$_} }{#COLUMNS}
for #pids;
}
}
if($total_memory_size == 1){
sum_memory_size();
}
} else { print "No tasks found"}
}
And my attempt to sum the values:
sub sum_memory_size{
my #do_sum = qw(memory_size);
my $column;
my $value;
for $column (keys %{ $sort{$column}{$value} }) {
my $found;
my $sum = 0;
for $value (#do_sum) {
next unless exists $sort{$column}{$value};
$sum += $hash{$column}{$value};
delete $hash{$column}{$value};
$found = 1;
}
$sort{$column}{somename} = $sum if $found;
}
print $sum;
}
Use sum form List::Util:
use List::Util qw{ sum };
# ...
my $memory_sum = sum(map $_->{memory}, values %process_details);
The question describes a simple task, which appears to be part of a larger problem. One answer to the simple task is:
perl -ne '#fields=split; $sum+=$fields[1]; END { print "$sum\n" }' foo.txt
For each line (-ne), split it and add the second value ($fields[1]) to a running $sum. Then, after all lines have been processed (END), print the sum. I get 13078 from the program, which is also what I get from a calculator on the input above :) (4730 + 0 + 450 + 5928 + 1970).
I have input file as follow.
ggaaaa (973026 to 973032) ctggag (1849680 to 1849686) = 6
ggaaaa (973056 to 973062) ctggag (1849706 to 1849712) = 6
ggaaaa (97322 to 97328) ctggag (184962 to 184968) = 6
cctgtggataacctgtgga (1849554 to 1849572) tccacaggttatccacagg (1849615 to 1849633) = 19
ggcccccccggagtt (470079 to 470093) aactccgggggggcc (1849574 to 1849588) = 15
ctggag (18497062 to 18497068) ggaaaa (9730562 to 9730568) = 6
First string is pattern with in bracket pattern position. Second string is repeat with in bracket repeat position. First 3 lines pattern and 6th line repeat are same but positions are different. So i want to make it as a cluster my output like this
ggaaaa==>(973026 to 973032)(973056 to 973062)(97322 to 97328)(9730562 to 9730568) ctggag==>(1849680 to 1849686)(1849706 to 1849712)(184962 to 184968)(18497062 to 18497068) 6 8
cctgtggataacctgtgga==>(1849554 to 1849572) tccacaggttatccacagg==>(1849615 to 1849633) 19 2
ggcccccccggagtt==>(470079 to 470093) aactccgggggggcc==>(1849574 to 1849588) 15 2
So first string are pattern and followed by their positions Second string are repeat and followed by their positions and TAB seperator is length of string again TAB separator is total of the pattern and repeat positions
I tried, small file i am getting output but large size file not getting output. I pasted my code below.
my $file = $_[0];
my %hashA;
my %hashB;
my #sorted;
my $i = 1;
my $j=$k=0;
my $tmplen = $len = 0;
my #sorted = `sort -nk10 $file`;
push(#sorted,"***");
open (FLWR,">$file") or die "File can't open $!";
$lengt = $sorted[0];
print FLWR $lengt;
my $linelen = #sorted;
while($i < $linelen)
{
($seqs,$len) = split(/\=/,$sorted[$i]);
$len =~s/\s+//g;
my($first,$second,$third,$fourth) = split(/\s+(?!to|\d+)/,$seqs);
if($len != $tmplen || $sorted[$i] eq "***")
{
if($tmplen != 0)
{
foreach $Alev2 (sort keys %{$hashA{$tmplen}})
{
foreach $Alev3 (sort keys %{$hashA{$tmplen}{$Alev2}})
{
foreach $Blev2 (sort keys %{$hashB{$tmplen}})
{
foreach $Blev3 (sort keys %{$hashB{$tmplen}{$Blev2}})
{
if($Alev3 eq $Blev3 && $Alev2 != $Blev2)
{
($Akey) = keys (%{$hashA{$tmplen}{$Blev2}});
($Akey1) = keys (%{$hashA{$tmplen}{$Blev2}{$Akey}});
foreach $Blev4 (sort keys %{$hashB{$tmplen}{$Blev2}{$Blev3}})
{
$hashA{$tmplen}{$Alev2}{$Alev3}{$Blev4}++;
$hashB{$tmplen}{$Alev2}{$Akey}{$Akey1}++ ;
}
delete($hashB{$tmplen}{$Blev2});
delete($hashA{$tmplen}{$Blev2});
}
}
}
}
}
}
$tmplen = $len;
}
if($first ne $dump_first)
{
$dump_first = $first;
$j++;
}
$hashA{$tmplen}{$j}{$dump_first}{$second}++;
$hashB{$tmplen}{$j}{$third}{$fourth}++;
$i++;
}
foreach $s1(sort keys %hashA)
{
foreach $s2 (sort keys %{$hashA{$s1}})
{
my $seq_concat = "";
my $a_concat = "";
my $b_concat = "";
my $a_inc = 0;
my $b_inc = 0;
foreach $s3 (sort keys %{$hashA{$s1}{$s2}})
{
next if($s3 eq "");
$Aseq_concat = "$s3==>";
foreach $s4 (sort keys %{$hashA{$s1}{$s2}{$s3}})
{
$a_inc++;
$a_concat .= $s4;
}
}
$s3 = "";
foreach $s3 (sort keys %{$hashB{$s1}{$s2}})
{
next if($s3 eq "");
$Bseq_concat = "$s3==>";
foreach $s4 (sort keys %{$hashB{$s1}{$s2}{$s3}})
{
$b_inc++;
$b_concat .= $s4;
}
}
next if($b_concat eq "");
$Bseq_concat = uc($Bseq_concat);
$b_concat = uc($b_concat);
$Aseq_concat = uc($Aseq_concat);
$a_concat = uc($a_concat);
if($a_inc > $b_inc)
{
print FLWR $Bseq_concat.$b_concat,"\t",$Aseq_concat,$a_concat;
}
else
{
print FLWR $Aseq_concat.$a_concat,"\t",$Bseq_concat,$b_concat;
}
print FLWR "\t$s1\t";
print FLWR $a_inc+$b_inc;
print FLWR "\n";
}
}
I'd prefer to suggest some updates to your code rather than hand out a completely re-worked solution - but I simply got lost in the level of nesting you have above.
Just a few key points;
The best attack with this is identifying that the ranges are the same format - therefore use a regex with 'global', /g option in a while loop.
Having decided to center the solution on a regex, its best to use 'extended mode' and lots of whitespace and comments.
The core data structure is a hash of hashes. The first level is keyed on the pattern string, the inner one keyed on the range specification string.
Your specification implies that all patterns on a given line of the data will be of the same length - the code I give below relies on this fact.
I added some checks to the given pattern length and the start and end positions - if the data is rock solid, maybe you don't need them?
Your requested output format added complexity to the code - if there was one report line for each pattern the code would be much simpler. As it is, a pattern appears at a certain line number in the report if it first appears on the corresponding line in the data. Recording the line that a pattern first appears added significantly to the code.
The code is written unix filter style - data supplied via STDIN and printed on STDOUT. Errors and warnings on STDERR
So, given those points;
use v5.12;
use warnings;
my $pattern_regex = qr/
(\w+) # capture actual pattern to $1
\s* # optional whitespace after pattern
( # capture the following to $2
\( # literal open bracket
(\d+) # capture start pos to $3
\s* to \s*
(\d+) # capture end pos to $4
\) # literal close bracket
) # close capture whole range spec to $1
\s* # gobble up any whitespae between patterns
/x ; # close regex - extended mode
my %ranges ;
my #first_seen_on_line ;
while (<>) {
chomp ;
my ($patterns_str, $given_length) = split /\s*=\s*/ ;
die "No length on line $." unless defined $given_length ;
# Repeatedly look for patterns and range-specifications
while ( $patterns_str =~ /$pattern_regex/g ) {
my ($pattern, $range_spec, $start_pos, $end_pos) = ($1,$2,$3,$4);
warn "Incorrect length for '$pattern' on line $.\n"
unless length($pattern) == $end_pos - $start_pos + 1
&& length($pattern) == $given_length ;
# Is this the first time we've seen this pattern?
if ( defined $ranges{ $pattern } ) {
# No its not - add this range to the hash of ranges for this pattern
$ranges{ $pattern }{ $range_spec }++ ;
}
else {
# Yes it is - record the fact that it was on this line and
# initialize the hash of ranges for this pattern
push #{ $first_seen_on_line[ $. ] }, $pattern ;
$ranges{ $pattern } = { $range_spec => 1 } ;
}
}
}
for my $line (1 .. $#first_seen_on_line) {
# Might not be anything to do for this line
next unless defined $first_seen_on_line[$line] ;
# Get the patterns that first appeared on this line
my #patterns = #{ $first_seen_on_line[$line] } ;
my ($pat_length , $range_count) ;
for my $pat (#patterns) {
# Get all the ranges for this pattern and print them
my #ranges = keys %{ $ranges{ $pat } };
print $pat, '==>', #ranges, "\t" ;
$range_count += #ranges ;
$pat_length = length($pat) ;
}
print $pat_length, "\t";
print $range_count, "\n" ;
# print length $pat, "\t", scalar #ranges, "\n" ;
}
Ran on the data above, it produces;
gaaaa==>(973026 to 973032)(973056 to 973062)(97322 to 97328)(9730562 to 9730568) ctggag==>(1849680 to 1849686)(1849706 to 1849712)(184962 to 184968)(18497062 to 18497068) 6 8
cctgtggataacctgtgga==>(1849554 to 1849572) tccacaggttatccacagg==>(1849615 to 1849633) 19 2
ggcccccccggagtt==>(470079 to 470093) aactccgggggggcc==>(1849574 to 1849588) 15 2
I have a file like this one below, where the line starting with a number is an ID for my sample and the following lines are the data.
10001;02/07/98;;PI;M^12/12/59^F^^SP^09/12/55
;;;;;M1|F1|SP1;9;9;9;9;9;9;;D16S539
;;;;;M1|F1|SP1;9;9;9;9;9;9;;D7S820
;;;;;M1|F1|SP1;9;9;9;9;9;9;;D13S317
;;;;;M1|F1|SP1;9;9;9;9;9;9;;D5S818
10002;02/07/98;;RJ;F^20/04/86^SP^
;;;;;F1|SP1;;;12;10;12;11;;D10S212
;;;;;F1|SP1;;;8;8;10;8;;D7S820
;;;;;F1|SP1;;;12;11;14;11;;D13S317
;;;;;F1|SP1;;;13;12;13;8;;D5S818
For the lines containing the data, I would like to test whether or not the fields 6-11 are the same because I want the data only if they are not equal to each other (in the first case they are all '9').
So I thought about splitting the lines and store them as an array, and then compare the arrays with the ~~ operator. But how do I do that if I'm reading the file inside a while loop and the array is redefined each line?
Or maybe there is better ways to do that.
Thanks in advance!
This is a pseudocode to illustrate what I want to do:
open FILE, $ARGV[0] or die $!;
while (<FILE>) {
chomp;
my #field = split /;/;
if ($field[0] eq '') {
if #fields[6 .. 11] is not equal to #fields[6 .. 11] in all the next lines {
do my calculation;
}
}
}
Am I correct in saying that data really represents two records? If so, you want to accumulate the lines for for the full record.
my #super_rec;
while (<>) {
chomp;
my #fields = split /;/;
if ($fields[0] ne '') {
process_rec(\#super_rec) if #super_rec;
#super_rec = \#fields;
} else {
push #super_rec, \#fields;
}
}
process_rec(\#super_rec) if #super_rec;
Then, your question can be answered.
sub process_rec {
my ($super_rec) = #_;
my ($rec, #subrecs) = #$super_rec;
my $do_calc = 0;
for my $i (1..$#subrecs) {
if ( $subrecs[0][ 6] ne $subrecs[$i][ 6]
|| $subrecs[0][ 7] ne $subrecs[$i][ 7]
|| $subrecs[0][ 8] ne $subrecs[$i][ 8]
|| $subrecs[0][ 9] ne $subrecs[$i][ 9]
|| $subrecs[0][10] ne $subrecs[$i][10]
|| $subrecs[0][11] ne $subrecs[$i][11]
) {
$do_calc = 1;
last;
}
}
if ($do_calc) {
...
}
}
I assume you're looking to compare data across lines, not within a single line. If I've got that wrong, ignore the rest of my answer.
The way I would do it is to re-join fields 6 through 11 as a string. Keep the data from the first line as $firstdata, and compare data from each successive line as $nextdata. Each time the data don't match, you up the $differences counter. When you get an ID line, check to see if the previous $differences was greater than zero and if so do your calculation (you may need to save the ID line and other fields in some other variables). Then re-initialize the $differences and $firstdata variable.
my $firstdata = "";
my $nextdata = "";
my $differences = 0;
open FILE, $ARGV[0] or die $!;
while (<FILE>) {
chomp;
my #field = split /;/;
if ($field[0] eq '') {
$nextdata = join(';', #fields[6..11]);
if ($firstdata && ($nextdata ne $firstdata)) {
$differences++;
} else {
$firstdata = $nextdata;
}
} else {
if ($differences) {
# do your calculation for previous ID
}
$firstdata = "";
$differences = 0;
}
}
if ($differences) {
# do your calculation one last time for the last ID
}
Here's a way to do it with Regex. This might be inefficient than other methods, if the indices are fixed from 6 to 11, and are known to be those only, because it will traverse entire String: -
open FILE, $ARGV[0] or die $!;
while (<FILE>) {
chomp;
my $num = 0;
my $same = 1;
while (/;(\d+);/) {
if ($num == 0) { $num = $1; }
elsif ($1 != $num) { $same = 0; last; }
# Substitute current digit matched with x (or any char)
# to avoid infinite loop
s/$1/x/;
}
if ($same) {
print "All digits same";
}
}
Using the Text::CSV_XS module you can do something like this:
use strict;
use warnings;
use Text::CSV_XS;
use feature 'say';
my $csv = Text::CSV_XS->new({
sep_char => ";",
binary => 1,
});
my %data;
my #hdrs; # store initial order of headers
my $hdr;
while (my $row = $csv->getline(*DATA)) {
if ($row->[0] =~ /^\d+$/) {
$csv->combine(#$row) or die "Cannot combine: " .
$csv->error_diag();
$hdr = $csv->string(); # recreate the header
push #hdrs, $hdr; # save list of headers
} else {
push #{ $data{$hdr} }, [ #{$row}[6..11] ];
}
}
for (#hdrs) {
say "$_\n arrays are: " . (is_identical($data{$_}) ? "same":"diff");
}
sub is_identical {
my $last;
for (#{$_[0]}) { # argument is two-dimensional array
$last //= $_;
return 0 unless ( #$_ ~~ #$last );
}
return 1; # default = all arrays were identical
}
__DATA__
10001;02/07/98;;PI;M^12/12/59^F^^SP^09/12/55
;;;;;M1|F1|SP1;9;9;9;9;9;9;;D16S539
;;;;;M1|F1|SP1;9;9;9;9;9;9;;D7S820
;;;;;M1|F1|SP1;9;9;9;9;9;9;;D13S317
;;;;;M1|F1|SP1;9;9;9;9;9;9;;D5S818
10002;02/07/98;;RJ;F^20/04/86^SP^
;;;;;F1|SP1;;;12;10;12;11;;D10S212
;;;;;F1|SP1;;;8;8;10;8;;D7S820
;;;;;F1|SP1;;;12;11;14;11;;D13S317
;;;;;F1|SP1;;;13;12;13;8;;D5S818
Output:
10001;02/07/98;;PI;M^12/12/59^F^^SP^09/12/55
arrays are: same
10002;02/07/98;;RJ;F^20/04/86^SP^
arrays are: diff
I'm not sure exactly how to explain this, so I'll just start with an example.
Given the following data:
Apple
Apricot
Blackberry
Blueberry
Cherry
Crabapple
Cranberry
Elderberry
Grapefruit
Grapes
Kiwi
Mulberry
Nectarine
Pawpaw
Peach
Pear
Plum
Raspberry
Rhubarb
Strawberry
I want to generate an index based on the first letter of my data, but I want the letters grouped together.
Here is the frequency of the first letters in the above dataset:
2 A
2 B
3 C
1 E
2 G
1 K
1 M
1 N
4 P
2 R
1 S
Since my example data set is small, let's just say that the maximum number to combine the letters together is 3. Using the data above, this is what my index would come out to be:
A B C D-G H-O P Q-Z
Clicking the "D-G" link would show:
Elderberry
Grapefruit
Grapes
In my range listing above, I am covering the full alphabet - I guess that is not completely neccessary - I would be fine with this output as well:
A B C E-G K-N P R-S
Obviously my dataset is not fruit, I will have more data (around 1000-2000 items), and my "maximum per range" will be more than 3.
I am not too worried about lopsided data either - so if I 40% of my data starts with an "S", then S will just have its own link - I don't need to break it down by the second letter in the data.
Since my dataset won't change too often, I would be fine with a static "maximum per range", but it would be nice to have that calculated dynamically too. Also, the dataset will not start with numbers - it is guaranteed to start with a letter from A-Z.
I've started building the algorithm for this, but it keeps getting so messy I start over. I don't know how to search google for this - I'm not sure what this method is called.
Here is what I started with:
#!/usr/bin/perl
use strict;
use warnings;
my $index_frequency = { map { ( $_, 0 ) } ( 'A' .. 'Z' ) };
my $ranges = {};
open( $DATASET, '<', 'mydata' ) || die "Cannot open data file: $!\n";
while ( my $item = <$DATASET> ) {
chomp($item);
my $first_letter = uc( substr( $item, 0, 1 ) );
$index_frequency->{$first_letter}++;
}
foreach my $letter ( sort keys %{$index_frequency} ) {
if ( $index_frequency->{$letter} ) {
# build $ranges here
}
}
My problem is that I keep using a bunch of global variables to keep track of counts and previous letters examined - my code gets very messy very fast.
Can someone give me a step in the right direction? I guess this is more of an algorithm question, so if you don't have a way to do this in Perl, pseudo code would work too, I guess - I can convert it to Perl.
Thanks in advance!
Basic approach:
#!/usr/bin/perl -w
use strict;
use autodie;
my $PAGE_SIZE = 3;
my %frequencies;
open my $fh, '<', 'data';
while ( my $l = <$fh> ) {
next unless $l =~ m{\A([a-z])}i;
$frequencies{ uc $1 }++;
}
close $fh;
my $current_sum = 0;
my #letters = ();
my #pages = ();
for my $letter ( "A" .. "Z" ) {
my $letter_weigth = ( $frequencies{ $letter } || 0 );
if ( $letter_weigth + $current_sum > $PAGE_SIZE ) {
if ( $current_sum ) {
my $title = $letters[ 0 ];
$title .= '-' . $letters[ -1 ] if 1 < scalar #letters;
push #pages, $title;
}
$current_sum = $letter_weigth;
#letters = ( $letter );
next;
}
push #letters, $letter;
$current_sum += $letter_weigth;
}
if ( $current_sum ) {
my $title = $letters[ 0 ];
$title .= '-' . $letters[ -1 ] if 1 < scalar #letters;
push #pages, $title;
}
print "Pages : " . join( " , ", #pages ) . "\n";
Problem with it is that it outputs (from your data):
Pages : A , B , C-D , E-J , K-O , P , Q-Z
But I would argue this is actually good approach :) And you can always change the for loop into:
for my $letter ( sort keys %frequencies ) {
if you need.
Here's my suggestion:
# get the number of instances of each letter
my %count = ();
while (<FILE>)
{
$count{ uc( substr( $_, 0, 1 ) ) }++;
}
# transform the list of counts into a map of count => letters
my %freq = ();
while (my ($letter, $count) = each %count)
{
push #{ $freq{ $count } }, $letter;
}
# now print out the list of letters for each count (or do other appropriate
# output)
foreach (sort keys %freq)
{
my #sorted_letters = sort #{ $freq{$_} };
print "$_: #sorted_letters\n";
}
Update: I think that I misunderstood your requirements. The following code block does something more like what you want.
my %count = ();
while (<FILE>)
{
$count{ uc( substr( $_, 0, 1 ) ) }++;
}
# get the maximum frequency
my $max_freq = (sort values %count)[-1];
my $curr_set_count = 0;
my #curr_set = ();
foreach ('A' .. 'Z') {
push #curr_set, $_;
$curr_set_count += $count{$_};
if ($curr_set_count >= $max_freq) {
# print out the range of the current set, then clear the set
if (#curr_set > 1)
print "$curr_set[0] - $curr_set[-1]\n";
else
print "$_\n";
#curr_set = ();
$curr_set_count = 0;
}
}
# print any trailing letters from the end of the alphabet
if (#curr_set > 1)
print "$curr_set[0] - $curr_set[-1]\n";
else
print "$_\n";
Try something like that, where frequency is the frequency array you computed at the previous step and threshold_low is the minimal number of entries in a range, and threshold_high is the max. number. This should give harmonious results.
count=0
threshold_low=3
threshold_high=6
inrange=false
frequency['Z'+1]=threshold_high+1
for letter in range('A' to 'Z'):
count += frequency[letter];
if (count>=threshold_low or count+frequency[letter+1]>threshold_high):
if (inrange): print rangeStart+'-'
print letter+' '
inrange=false
count=0
else:
if (not inrange) rangeStart=letter
inrange=true
use strict;
use warnings;
use List::Util qw(sum);
my #letters = ('A' .. 'Z');
my #raw_data = qw(
Apple Apricot Blackberry Blueberry Cherry Crabapple Cranberry
Elderberry Grapefruit Grapes Kiwi Mulberry Nectarine
Pawpaw Peach Pear Plum Raspberry Rhubarb Strawberry
);
# Store the data by starting letter.
my %data;
push #{$data{ substr $_, 0, 1 }}, $_ for #raw_data;
# Set max page size dynamically, based on the average
# letter-group size (in this case, a multiple of it).
my $MAX_SIZE = sum(map { scalar #$_ } values %data) / keys %data;
$MAX_SIZE = int(1.5 * $MAX_SIZE + .5);
# Organize the data into pages. Each page is an array reference,
# with the first element being the letter range.
my #pages = (['']);
for my $letter (#letters){
my #d = exists $data{$letter} ? #{$data{$letter}} : ();
if (#{$pages[-1]} - 1 < $MAX_SIZE or #d == 0){
push #{$pages[-1]}, #d;
$pages[-1][0] .= $letter;
}
else {
push #pages, [ $letter, #d ];
}
}
$_->[0] =~ s/^(.).*(.)$/$1-$2/ for #pages; # Convert letters to range.
This is an example of how I would write this program.
#! /opt/perl/bin/perl
use strict;
use warnings;
my %frequency;
{
use autodie;
open my $data_file, '<', 'datafile';
while( my $line = <$data_file> ){
my $first_letter = uc( substr( $line, 0, 1 ) );
$frequency{$first_letter} ++
}
# $data_file is automatically closed here
}
#use Util::Any qw'sum';
use List::Util qw'sum';
# This is just an example of how to calculate a threshold
my $mean = sum( values %frequency ) / scalar values %frequency;
my $threshold = $mean * 2;
my #index;
my #group;
for my $letter ( sort keys %frequency ){
my $frequency = $frequency{$letter};
if( $frequency >= $threshold ){
if( #group ){
if( #group == 1 ){
push #index, #group;
}else{
# push #index, [#group]; # copy #group
push #index, "$group[0]-$group[-1]";
}
#group = ();
}
push #index, $letter;
}elsif( sum( #frequency{#group,$letter} ) >= $threshold ){
if( #group == 1 ){
push #index, #group;
}else{
#push #index, [#group];
push #index, "$group[0]-$group[-1]"
}
#group = ($letter);
}else{
push #group, $letter;
}
}
#push #index, [#group] if #group;
push #index, "$group[0]-$group[-1]" if #group;
print join( ', ', #index ), "\n";
I need to generate a list of IP-addresses (IPv4) in Perl. I have start and end addresses, for example 1.1.1.1 and 1.10.20.30. How can I print all the addresses inbetween?
Use Net::IP. From the CPAN documentation:
my $ip = new Net::IP ('195.45.6.7 - 195.45.6.19') || die;
# Loop
do {
print $ip->ip(), "\n";
} while (++$ip);
This approach is more flexible because Net::IP accepts CIDR notation e.g. 193.0.1/24 and also supports IPv6.
Edit: if you are working with netblocks specifically, you might investigate Net::Netmask.
Use Net::IP's looping feature:
The + operator is overloaded in order to allow looping though a whole range of IP addresses:
It's all in how you code it. This is the fastest way I know.
my $start = 0x010101; # 1.1.1
my $end = 0x0a141e; # 10.20.30
for my $ip ( $start..$end ) {
my #ip = ( $ip >> 16 & 0xff
, $ip >> 8 & 0xff
, $ip & 0xff
);
print join( '.', 1, #ip ), "\n";
}
TMTOWTDI:
sub inc_ip { $_[0] = pack "N", 1 + unpack "N", $_[0] }
my $start = 1.1.1.1;
my $end = 1.10.20.30;
for ( $ip = $start; $ip le $end; inc_ip($ip) ) {
printf "%vd\n", $ip;
}
# We can use below code to generate IP range
use warnings;
use strict;
my $startIp = $ARGV[0];
my $endIp = $ARGV[1];
sub range {
my (#ip,#newIp,$i,$newIp,$j,$k,$l,$fh);
my ($j1,$k1,$l1);
open($fh,">","ip.txt") or die "could not open the file $!";
#ip = split(/\./,$startIp);
for($i=$ip[0];$i<=255;$i++) {
for($j=$ip[1];$j<=255;$j++) {
$ip[1]=0 if($j == 255);
for($k=$ip[2];$k<=255;$k++) {
$ip[2]=0 if($k == 255);
for($l=$ip[3];$l<=255;$l++) {
$ip[3]=0 if($l == 255);
#newIp = $newIp = join('.',$i,$j,$k,$l);
print $fh "$newIp \n";
exit if ($newIp eq $endIp);
}
}
}
}
}
range ($startIp, $endIp);