Perl Parallel::ForkManager empty return - perl

I am trying to use Parallel::ForkManager to run processes in parallel, but unfortunately the subroutine parallel does not return any entries.
# Code as posted in the question (sigils restored: "@" had been mangled to "#").
# NOTE: the children fill in their own copies of %hash after fork; the
# parent's %hash stays empty, which is the bug the answers below explain.
sub parallel {
    my ($self, $values) = @_;
    my %hash;
    my $pm = Parallel::ForkManager->new(200);
    foreach my $IP ( keys %{$values} ) {
        my $pid = $pm->start and next;      # parent: start returns child's pid
        $hash{$IP} = $self->getData($IP);   # child only: parent never sees this
        $pm->finish(0, \$hash{$IP});
    }
    $pm->wait_all_children;
    return %hash;
}
print Dumper( parallel(%data) );
What am I doing wrong? Any ideas?

Forking is the creation of a new process that's a copy of the current process. Changing a variable in one process doesn't change similarly named variables in other processes.
You modify the child's process's %hash, but you're dumping the parent's process's %hash.
P::FM does provide a mechanism for passing data back to the parent process. It's documented under the heading "RETRIEVING DATASTRUCTURES from child processes".
use Data::Dumper qw( Dumper );
use Parallel::ForkManager qw( );

use constant MAX_WORKERS => 200;

my %hash;
my $pm = Parallel::ForkManager->new(MAX_WORKERS);

# Collect each child's result in the parent: run_on_finish receives, as its
# sixth argument, the reference passed to finish() by the child.
$pm->run_on_finish(sub {
    my ($pid, $exit_code, $ident, $exit_signal, $core_dump, $result_ref) = @_;
    my $IP = $ident;
    warn("Child $IP killed by signal $exit_signal"), return if $exit_signal;
    warn("Child $IP exited with error $exit_code"), return if $exit_code;
    warn("Child $IP encountered an unknown error"), return if !$result_ref;
    $hash{$IP} = $$result_ref;
});

for my $IP (keys %$values) {
    my $pid = $pm->start($IP) and next;    # $IP doubles as the child's ident
    $pm->finish(0, \$self->getData($IP));  # ship the result back to the parent
}

$pm->wait_all_children();

print(Dumper(\%hash));

Related

How to pass hash contents of a forked subroutine back to main program?

I need to access in a main program the contents of hashes that were generated via subroutines that were forked. Here specifically is what I am trying to do:-
use Benchmark;
use File::Find;
use File::Basename;
use File::Path;
use Data::Dumper;
use strict;
use warnings;

print "Process ID: $$ \n";

my @PartitionRoots = qw(/nfs/dir1 /nfs/dir2 /nfs/dir3 /nfs/dir4);
my @PatternsToCheck = qw(prefix1 prefix2);
my @MatchedDirnames = qw();
my $DirCount = 0;
my $Forks = 0;
my @AllDirs = qw();
my %SweepStats = ();

foreach my $RootPath (@PartitionRoots) {
    foreach my $Pattern (@PatternsToCheck) {
        if (grep {-e} glob ("$RootPath/$Pattern*")) {
            my @Files = glob ("$RootPath/$Pattern*");
            foreach my $FileName (@Files) {
                if (-d $FileName) {
                    $DirCount++;
                    push (@AllDirs, $FileName);
                    my $PID = fork;
                    if (not defined $PID) {
                        warn 'Could not fork!\n';
                        next;
                    }
                    if ($PID) {
                        $Forks++;
                        print "In the parent PID ($$), Child pid: $PID Number of forked child processes: $Forks\n";
                    } else {
                        print "In the child PID ($$)\n";
                        # %SweepStats is filled here, in the CHILD process only
                        find(\&file_stats, $FileName);
                        print "Child ($$) exiting...\n";
                        exit;
                    }
                }
            }
        }
    }
}

for (1 .. $Forks) {
    my $PID = wait();
    print "Parent saw child $PID exit.\n";
}

print "Parent ($$) ending.\n";
print Dumper (\%SweepStats);

foreach my $DirName (@AllDirs) {
    print ("Printing $DirName contents...\n");
    foreach (@{$SweepStats{$DirName}}) {
        my $uname = $_->{uname};
        my $mtime = $_->{mtime};
        my $size = $_->{size};
        my $file = $_->{file};
        print ("$uname $mtime $size $file\n");
    }
}

# File::Find callback: record owner/mtime/size for each plain file seen.
sub file_stats {
    if (-f $File::Find::name) {
        my $FileName = $_;
        my $PathName = dirname($_);
        my $DirName = basename($_);
        my $uid = (stat($_))[4];
        my $uname = getpwuid($uid);
        my $size = (stat($_))[7];
        my $mtime = (stat($_))[9];
        if (defined $uname && $uname ne '') {
            push @{$SweepStats{$FileName}}, {path=>$PathName,dir=>$DirName,uname=>$uname,mtime=>$mtime,size=>$size,file=>$File::Find::name};
        } else {
            push @{$SweepStats{$FileName}}, {path=>$PathName,dir=>$DirName,uname=>$uid,mtime=>$mtime,size=>$size,file=>$File::Find::name};
        }
    }
    return;
}

exit;
...but Dumper is coming up empty, so the dereferencing and printing that immediately follows is empty, too. I know the file stat collecting is working, because if I replace the "push @{$SweepStats{$FileName}}" statements with print statements, I see exactly what is expected. I just need to properly access the hashes from the global level, but I cannot get it quite right. What am I doing wrong here? There are all kinds of posts about passing hashes to subroutines, but not the other way around.
Thanks!
The fork call creates a new, independent process. That child process and its parent cannot write to each other's data. So in order for data to be exchanged between the parent and the child we need to use some Inter-Process-Communication (IPC) mechanism.†
It is by far easiest to use a library that takes care of details, and Parallel::ForkManager seems rather suitable here as it provides an easy way to pass the data from child back to the parent, and it has a simple queue (to keep the number of simultaneous processes limited to a given number).
Here is some working code, and comments follow
use warnings;
use strict;
use feature 'say';
use File::Find;
use File::Spec;
use Parallel::ForkManager;

my %file_stats;  # written from callback in run_on_finish()

my $pm = Parallel::ForkManager->new(16);
$pm->run_on_finish(
    sub {  # 6th argument is what is passed back from finish()
        my ($pid, $exit, $ident, $signal, $core, $dataref) = @_;
        foreach my $file_name (keys %$dataref) {
            $file_stats{$file_name} = $dataref->{$file_name};
        }
    }
);

my @PartitionRoots  = '.';  # For my tests: current directory,
my @PatternsToCheck = '';   # no filter (pattern is empty string)

my %stats;  # for use by File::Find in child processes

foreach my $RootPath (@PartitionRoots) {
    foreach my $Pattern (@PatternsToCheck) {
        my @dirs = grep { -d } glob "$RootPath/$Pattern*";
        foreach my $dir (@dirs) {
            #say "Looking inside $dir";
            $pm->start and next;           # child process
            find(\&get_file_stats, $dir);
            $pm->finish(0, { %stats });    # exits, {%stats} passed back
        }
    }
}
$pm->wait_all_children;

# File::Find callback, run inside a child: record stats per file.
sub get_file_stats {
    return if not -f;
    #say "\t$File::Find::name";
    my ($uid, $size, $mtime) = (stat)[4,7,9];
    my $uname = getpwuid $uid;

    push @{$stats{$File::Find::name}}, {
        path  => $File::Find::dir,
        dir   => ( File::Spec->splitdir($File::Find::dir) )[-1],
        uname => (defined $uname and $uname ne '') ? $uname : $uid,
        mtime => $mtime,
        size  => $size,
        file  => $File::Find::name
    };
}
Comments
The main question in all this is: at which part of your three-level hierarchy to spawn child processes? I left it as in the question, where for each directory a child is forked. This may be suitable if (most of) directories have many files; but if it isn't so and there is little work for each child to do then it may all get too busy and the overhead may reduce/deny the speedup
The %stats hash, necessary for File::Find to store the data it finds, need be declared outside of all loops so that it is seen in the sub. So it is inherited by all child processes, but each gets its own copy as due and we need not worry about data overlap or such
I simplified (and corrected) code other than the forking as well, following what seemed to me to be desired. Please let me know if that is off
See linked documentation, and for example this post and links in it for details
In order to display complex data structures use a library, of which there are many.
I use Data::Dump, intended to simply print nicely,
use Data::Dump qw(dd pp);
...
dd \%file_stats; # or: say "Stats for all files: ", pp \%file_stats;
for its compact output, while the most widely used is the core Data::Dumper
use Data::Dumper
...
say Dumper \%file_stats;
which also "understands" data structures (so you can mostly eval them back).
(Note: In this case there'll likely be a lot of output! So redirect to a file, or exit those loops after the first iteration so just to see how it's all going.)
† As a process is forked the variables and data from the parent are available to the child. They aren't copied right away for efficiency reasons, so initially the child does read parent's data. But any data generated after the fork call in the parent or child cannot be seen by the other process.
Try this module: IPC::Shareable.
It is recommended by perldoc perlipc, and you can find the answer to your question there.

Append resulted data to scalar variable in Parallel::ForkManager Perl

I have a code, which is working as expected. But I have difficulty in storing the output of each of the executed command in $result = $ssh->capture($command_to_execute);. This uses Parallel::ForkManager module to run the commands in different Hosts by using different files as an input parameters.
Once the command execution is done, I want the resulting output to be stored in the $result variable. It should append each host's result to the same variable, and at the end I want to process the values that are in $result. I am using .= to append the resulting data to $result, but it doesn't seem to be working.
Pasting my code here for reference:
.
.
.
my $result;
my $pm = Parallel::ForkManager->new(5);

DATA_LOOP:
foreach my $n (1..$num_buckets) {
    my $pid = $pm->start and next DATA_LOOP;
    $command_to_execute = $my_command." ".$Files{$n};
    my $ssh = SSH_Connection( $list_of_ips[$n-1], 'username', 'pass' );
    $result = $ssh->capture($command_to_execute);
    $result .= "Result from File:$Files{$n} and Host:$list_of_ips[$n-1] is $result\n";
    print "Result: INSIDE: $result";
    $pm->finish;   # child exits here; its copy of $result is lost to the parent
}
$pm->wait_all_children;

print "Result: OUTSIDE: $result";
print "Done\n";

# Open an SSH connection to $host and return the Net::OpenSSH handle;
# dies if the connection cannot be established.
sub SSH_Connection {
    my ( $host, $user, $passwd ) = @_;
    my $ssh = Net::OpenSSH->new($host,
        user => $user,
        password => $passwd,
        master_opts => [-o => "StrictHostKeyChecking=no"]
    );
    $ssh->error and die "Couldn't establish SSH connection: ". $ssh->error;
    return $ssh;
}
print "Result: INSIDE: $result"; is able to print the results one by one. But print "Result: OUTSIDE: $result";
is empty, whereas it should contain the combined results of $result accumulated inside the loop.
As shown in the documentation of Parallel::ForkManager, to get a result from a child, you need to supply a reference to the result as another parameter to finish.
$pm->finish(0, [$Files{$n}, $list_of_ips[$n-1], $result]); # arrayref travels back to the parent's run_on_finish callback
Use run_on_finish to gather the results:
my $result;
# The 6th callback argument is the arrayref passed to finish() above.
$pm->run_on_finish( sub {
    my ($pid, $exit_code, $ident, $exit_signal, $core_dump, $single_result) = @_;
    $result .= "Result from File: $single_result->[0] and Host: $single_result->[1]"
        . " is $single_result->[2]\n";
Each time you run $pm->start, you are forking a new process to run the code until $pm->finish. This forked process cannot affect the parent process in any way except by the mechanism Parallel::ForkManager provides to send data back to the parent. This mechanism is described at https://metacpan.org/pod/Parallel::ForkManager#RETRIEVING-DATASTRUCTURES-from-child-processes.
# Same mechanism with a scalar ref: the child passes \$result to finish(),
# and the parent dereferences it in the callback.
$pm->run_on_finish(sub {
    my ($pid, $exit_code, $ident, $exit_signal, $core_dump, $data) = @_;
    my $result = $$data;
    ...
});

DATA_LOOP:
foreach my $n (1..$num_buckets) {
    my $pid = $pm->start and next DATA_LOOP;
    ...
    $pm->finish(0, \$result);
}
Forking is in fact not needed for these operations if you are willing to restructure a bit. Net::OpenSSH can provide commands that can be managed simultaneously by an event loop such as IO::Async::Loop, thus all Perl operations will occur in the same process (but not necessarily in the order they appear). Since IO::Async::Loop->run_process returns a Future, Future::Utils provides a way to manage concurrency of these commands.
use strict;
use warnings;
use Net::OpenSSH;
use IO::Async::Loop;
use Future::Utils 'fmap_concat';

my $loop = IO::Async::Loop->new;

# Run up to 5 remote commands concurrently in a single process; each
# run_process() returns a Future and fmap_concat gathers the results.
my $future = fmap_concat {
    my $n = shift;
    ...
    my $remote_command = $ssh->make_remote_command($command_to_execute);
    return $loop->run_process(command => $remote_command, capture => ['stdout'])
        ->transform(done => sub { "Result from File:$Files{$n} and Host:$list_of_ips[$n-1] is $_[0]\n"; });
} foreach => [1..$num_buckets], concurrent => 5;

my @results = $future->get;
There is a lot of flexibility to how the individual and overall (returned by fmap) Futures are managed, but by default any failure to execute a process will result in the whole Future failing immediately (causing get to throw an exception) and any nonzero exit will be ignored.

Can Net::LDAP and Parallel::Forkmanager work together?

I need to query different LDAP servers in perl via Net::LDAP. I have something that works well. However, in an attempt to speed up things, I tried to query the different servers in parallel, using Parallel::Forkmanager - and things do not work when I do that.
I get the following types of errors:
decode error 02<=>30 0 8 at /Users/myname/perl5/perlbrew/perls/perl-5.18.2/lib/site_perl/5.18.2/Convert/ASN1/_decode.pm line 113, <> line 18.
decode error 43<=>30 0 8 at /Users/myname/perl5/perlbrew/perls/perl-5.18.2/lib/site_perl/5.18.2/Convert/ASN1/_decode.pm line 113, <> line 25.
at the line that gets the search response:
$mesg = $ldap->search( base => $dn, filter => '(CN=*)');
I am puzzled.
To put it in other words, why does this fail:
use Net::LDAP;
use Parallel::Forkmanager;

...; # bind LDAP servers

# As posted: the Net::LDAP handles are created before the fork, so every
# child shares the same underlying socket (the cause of the decode errors).
while (<>) {
    chop;
    my $dn = $_;
    foreach my $ldap (@servers) {
        my $pid;
        $pid = $pm->start and next; # do the fork
        print $dn, $pid;
        my $mesg;
        try {
            $mesg = $ldap->search( base => $dn, filter => '(CN=*)');
        } catch {
            ...;
        }
        $pm->finish;
    }
}
while this:
use Net::LDAP;

...; # bind LDAP servers

# Serial version: one process, so each connection is used by exactly one
# query at a time — this is why it works where the forked version fails.
while (<>) {
    chop;
    my $dn = $_;
    foreach my $ldap (@servers) {
        print $dn;
        my $mesg;
        try {
            $mesg = $ldap->search( base => $dn, filter => '(CN=*)');
        } catch {
            ...;
        }
    }
}
works perfectly?
Whilst forking doesn't have quite the same thread safety problems of threading - there are still a few places you have gotchas. I think this is what's biting you - your Net::LDAP objects are created in the parent thread, but then (effectively) cloned to each when you fork.
Which means in your code - there's a very real possibility that if you've got the list of names coming in fast enough, that a new fork will try to reuse an existing Net::LDAP connection before a previous one is finished with it.
The easy way of preventing this is call wait_all_children to ensure all your parallel LDAP queries are finished before the next one starts.
If you put your LDAP bind within the ForkManager loop, do you still have the same problem? I appreciate that's a potential overhead as you'll be binding each iteration, but if that addresses it, I'd suggest that it's because Net::LDAP is sharing the same file descriptors between forks.
The next best solution there would be to adopt a 'worker' model, where you've got a bunch of 'workers' each with their own LDAP connections to do the querying. That's easier with threading than forking - it goes a bit like this:
#!/usr/bin/perl
use strict;
use warnings;

use threads;
use Thread::Queue;

# Worker: holds its own LDAP connection and consumes DNs from its queue.
sub server_worker {
    my ( $hostname, $q_ref ) = @_;
    ## connect LDAP to $hostname;
    while ( my $dn = $$q_ref->dequeue ) {
        #query $dn
    }
}

my @queues;
foreach my $server (@list_of_servers) {
    my $server_q = Thread::Queue->new();    # was misspelled "Threads::Queue"
    push( @queues, $server_q );
    # was the undeclared $hostname; the loop variable is $server
    threads->create( \&server_worker, $server, \$server_q );
}

while ( my $dn = <STDIN> ) {
    chomp($dn);
    foreach my $q (@queues) {
        $q->enqueue($dn);
    }
}

foreach my $q ( @queues ) {
    $q -> end;
}
foreach my $thr ( threads->list ) {
    $thr->join();
}
Doing something similar with forking should work:
#!/usr/bin/perl
use strict;
use warnings;

use IO::Pipe;
use Parallel::ForkManager;
use Net::LDAP;

my @list_of_servers = qw ( servername servenama anotherserver );

my $pm = Parallel::ForkManager -> new ( scalar @list_of_servers );

# One pipe per server: parent writes work items, the forked child reads them.
my %pipe_for;
foreach my $server ( @list_of_servers ) {
    my $pipe = IO::Pipe -> new();
    my $pid = $pm -> start;   # was "pm -> start" (missing $ sigil)
    if ( $pid ) {
        print "$$: parent\n";
        $pipe -> writer -> autoflush;
        $pipe_for{$server} = $pipe;
    }
    else {
        print "$$ child connecting to $server\n";
        $pipe -> reader -> autoflush;
        close ( STDIN ); #because this is a child.
        #Net::LDAP setup
        while ( my $item = <$pipe> ) {
            chomp ( $item );
            #ldap_search $item;
        }
    }
    $pm -> finish;
}
And then send stuff:
# Fan each test item out to every server's pipe, then reap the children.
foreach my $number ( 1 .. 10 ) {
    for my $pipe ( values %pipe_for ) {
        print {$pipe} "test$number\n";
    }
}
$pm->wait_all_children();
Edit: Note - autoflush is important, otherwise the IO buffers and doesn't look like it's working. I'm pretty sure closing STDIN is probably a good idea in the child, but perhaps not vitally necessary if they don't use it.

Limiting processes with Parallel::ForkManager

I am trying to use Parallel::ForkManager to control some child processes. I would like to limit the number of processes running concurrently to 10. In total I need to run 20.
I know I could set the process limit at 10 in the first line at the object declaration, but I am also using the $pm object to run child processes that do something different (the current function is much more memory intensive so needs to be limited).
The code I have currently does not work, the run on finish call is never made, so the remaining 10 children never get forked. I don't understand why this is the case- I'd have thought the child would still call the finish code on exit,and decrement the count, but the "if" statement seems to stop this. Could someone explain why this is the case?
Thanks for any help!
# Parallel declarations (code as posted in the question; @ sigils restored)
my $pm = Parallel::ForkManager->new(30);
$pm->run_on_finish(sub {
    my ($pid, $exit_code, $ident, $exit_signal, $core_dump, $data_str_ref) = @_;
    --$active_jobs;
})

my $total_jobs = 0;
my $active_jobs = 0;

while( $total_jobs < 20) {
    sleep 300 and next if $active_jobs > 10;
    my $pid = $pm->start and ++$active_p1_jobs and ++$total_p1_jobs and next;
    my $return = module::function(%args);
    $pm->finish(0, { index => $total_jobs, return => $return });
}
print STDERR "Submitted all jobs, now waiting for children to exit.\n";
$pm->wait_all_children();
I'm going to call "type 2" the jobs that are limited to 10.
This is how I'd do it with P::FM:
use strict;
use warnings;

use List::Util qw( shuffle );
use Parallel::ForkManager qw( );
use POSIX qw( WNOHANG );
use Time::HiRes qw( sleep );

use constant MAX_WORKERS => 30;
use constant MAX_TYPE2_WORKERS => 10;

sub is_type2_job { $_[0]{type} == 2 }

# Demo workload: 20 jobs of each type, interleaved randomly.
my @jobs = shuffle(
    ( map { { type => 1, data => $_ } } 0..19 ),
    ( map { { type => 2, data => $_ } } 0..19 ),
);

my $pm = Parallel::ForkManager->new(MAX_WORKERS);

my $type2_count = 0;
$pm->run_on_finish(sub {
    my ($pid, $exit_code, $ident, $exit_signal, $core_dump, $job) = @_;
    --$type2_count if is_type2_job($job);
    print "Finished: $pid, $job->{type}, $job->{data}, $job->{result}\n";
});

my @postponed_jobs;
while (@postponed_jobs || @jobs) {
    my $job;
    if (@postponed_jobs && $type2_count < MAX_TYPE2_WORKERS) {
        $job = shift(@postponed_jobs);
    }
    elsif (@jobs) {
        $job = shift(@jobs);
        if ($type2_count >= MAX_TYPE2_WORKERS && is_type2_job($job)) {
            push @postponed_jobs, $job;   # defer until a type-2 slot frees up
            redo;
        }
    }
    # elsif (@postponed_jobs) {
    #     # Already max type 2 jobs being processed,
    #     # but there are idle workers.
    #     $job = shift(@postponed_jobs);
    # }
    else {
        # Nothing runnable: nap briefly, reap any finished child, try again.
        local $SIG{CHLD} = sub { };
        select(undef, undef, undef, 0.300);
        $pm->wait_one_child(WNOHANG);
        redo;
    }

    ++$type2_count if is_type2_job($job);
    my $pid = $pm->start and next;
    $job->{result} = $job->{data} + 100; # Or whatever.
    $pm->finish(0, $job);
}

$pm->wait_all_children();
But this is broken. The code that picks the next job should be done in the middle of start (i.e. after it waits for children to finish, but before it forks), not before start. This could cause jobs to be run out of order. This isn't the first time I've wished P::FM has a pre-fork callback. Maybe you could ask the maintainer for one.

How to multithread seeing if a webpage exists in Perl?

I'm writing a Perl script that takes in a list of URLs and checks to see if they exist. (Note that I only care if they exist; I don’t care what their contents are.) Here’s the important part of the program.
use LWP::Simple qw($ua head);
# head() returns header info (true) when the URL responds, false otherwise.
if (head($url))
{
$numberAlive ++;
}
else
{
$numberDead ++;
}
Right now the program works fine; however, I want it to run faster. Thus I'm considering making it multithreaded. I assume that the slow part of my program is contacting the server for each URL; therefore, I'm looking for a way in which I can send out requests to the URLs of other webpages on my list while I'm waiting for the first response. How can I do this? As far as I can tell, the head routine doesn't have a callback that can get called once the server has responded.
Begin with familiar-looking front matter.
#! /usr/bin/env perl
use strict;
use warnings;
use 5.10.0; # for // (defined-or)
use IO::Handle;
use IO::Select;
use LWP::Simple;
use POSIX qw/ :sys_wait_h /;
use Socket;
Global constants control program execution.
my $DEBUG = 0;               # set to 1 for verbose warn() tracing
my $EXIT_COMMAND = "<EXIT>"; # sentinel line that tells a worker to exit
my $NJOBS = 10;              # number of worker processes in the pool
URLs to check arrive one per line on a worker’s end of the socket. For each URL, the worker calls LWP::Simple::head to determine whether the resource is fetchable. The worker then writes back to the socket a line of the form "url: status", where status is either "YES" or "NO".
If the URL is $EXIT_COMMAND, then the worker exits immediately.
# Worker loop: read URLs (one per line) from socket $s, write back
# "url: YES|NO" per URL, and exit when the $EXIT_COMMAND sentinel arrives.
sub check_sites {
    my($s) = @_;
    warn "$0: [$$]: waiting for URL" if $DEBUG;
    while (<$s>) {
        chomp;
        warn "$0: [$$]: got '$_'" if $DEBUG;
        exit 0 if $_ eq $EXIT_COMMAND;
        print $s "$_: ", (head($_) ? "YES" : "NO"), "\n";
    }
    die "NOTREACHED";
}
To create a worker, we start by creating a socketpair. The parent process will use one end and each worker (child) will use the other. We disable buffering at both ends and add the parent end to our IO::Select instance. We also note each child’s process ID so we can wait for all workers to finish.
# Fork one worker connected to the parent via a socketpair; the parent end
# is added to $sel and the child's pid recorded in %$kidpid for reaping.
sub create_worker {
    my($sel,$kidpid) = @_;
    socketpair my $parent, my $kid, AF_UNIX, SOCK_STREAM, PF_UNSPEC
        or die "$0: socketpair: $!";
    $_->autoflush(1) for $parent, $kid;   # disable buffering on both ends

    my $pid = fork // die "$0: fork: $!";
    if ($pid) {
        ++$kidpid->{$pid};
        close $kid or die "$0: close: $!";
        $sel->add($parent);
    }
    else {
        close $parent or die "$0: close: $!";
        check_sites $kid;
        die "NOTREACHED";
    }
}
To dispatch URLs, the parent grabs as many readers as are available and hands out the same number of URLs from the job queue. Any workers that remain after the job queue is empty receive the exit command.
Note that print will fail if the underlying worker has already exited. The parent must ignore SIGPIPE to prevent immediate termination.
# Hand one URL to every writable worker; once the queue is empty, send the
# exit command instead. A failed print means the worker died — drop it.
sub dispatch_jobs {
    my($sel,$jobs) = @_;
    foreach my $s ($sel->can_write) {
        my $url = @$jobs ? shift @$jobs : $EXIT_COMMAND;
        warn "$0 [$$]: sending '$url' to fd ", fileno $s if $DEBUG;
        print $s $url, "\n" or $sel->remove($s);
    }
}
By the time control reaches read_results, the workers have been created and received work. Now the parent uses can_read to wait for results to arrive from one or more workers. A defined result is an answer from the current worker, and an undefined result means the child has exited and closed the other end of the socket.
# Collect answers from all readable workers. A defined line is a result;
# undef (EOF) means that worker exited, so remove its socket from $sel.
sub read_results {
    my($sel,$results) = @_;
    warn "$0 [$$]: waiting for readers" if $DEBUG;
    foreach my $s ($sel->can_read) {
        warn "$0: [$$]: reading from fd ", fileno $s if $DEBUG;
        if (defined(my $result = <$s>)) {
            chomp $result;
            push @$results, $result;
            warn "$0 [$$]: got '$result' from fd ", fileno $s if $DEBUG;
        }
        else {
            warn "$0 [$$]: eof from fd ", fileno $s if $DEBUG;
            $sel->remove($s);
        }
    }
}
The parent must keep track of live workers in order to collect all results.
# Non-blocking reap of exited children; removes each from the %$kidpid map.
sub reap_workers {
    my($kidpid) = @_;
    while ((my $pid = waitpid -1, WNOHANG) > 0) {
        warn "$0: [$$]: reaped $pid" if $DEBUG;
        delete $kidpid->{$pid};
    }
}
Running the pool executes the subs above to dispatch all URLs and return all results.
# Drive the pool: spawn $n workers, then loop dispatching jobs, reading
# results, and reaping children until all three are exhausted.
sub run_pool {
    my($n,@jobs) = @_;
    my $sel = IO::Select->new;
    my %kidpid;
    my @results;

    create_worker $sel, \%kidpid for 1 .. $n;

    local $SIG{PIPE} = "IGNORE"; # writes to dead workers will fail

    while (@jobs || keys %kidpid || $sel->handles) {
        dispatch_jobs $sel, \@jobs;
        read_results $sel, \@results;
        reap_workers \%kidpid;
    }

    warn "$0 [$$]: returning @results" if $DEBUG;
    @results;
}
Using an example main program
# Example driver: check a few URLs (one bogus) through the worker pool.
my @jobs = qw(
    bogus
    http://stackoverflow.com/
    http://www.google.com/
    http://www.yahoo.com/
);

my @results = run_pool $NJOBS, @jobs;
print $_, "\n" for @results;
the output is
bogus: NO
http://www.google.com/: YES
http://stackoverflow.com/: YES
http://www.yahoo.com/: YES
Another option is HTTP::Async.
#!/usr/bin/perl
use strict;
use warnings;

use HTTP::Request;
use HTTP::Async;

my $numberAlive = 0;
my $numberDead = 0;

my @urls = ('http://www.perl.com','http://www.example.xyzzy/foo.html');

my $async = HTTP::Async->new;

# you might want to wrap this in a loop to deal with @urls in batches
foreach my $url (@urls){
    $async->add( HTTP::Request->new( HEAD => $url ) );
}

# Responses arrive in completion order, not submission order.
while ( my $response = $async->wait_for_next_response ) {
    if ($response->code == 200){$numberAlive ++;}
    else{$numberDead ++;}
}

print "$numberAlive Alive, $numberDead Dead\n";
Worker-based parallelisation (using your choice of threads or processes):
use strict;
use warnings;
use feature qw( say );

use threads; # or: use forks;

use LWP::Simple qw( head );
use Thread::Queue::Any qw( );

use constant NUM_WORKERS => 10; # Or whatever.

my $req_q = Thread::Queue::Any->new();
my $resp_q = Thread::Queue::Any->new();

# Workers pull URLs from $req_q and push a 0/1 liveness flag onto $resp_q.
my @workers;
for (1..NUM_WORKERS) {
    push @workers, async {
        while (my $url = $req_q->dequeue()) {
            my $is_alive = head($url) ? 1 : 0;
            $resp_q->enqueue($is_alive);
        }
    };
}

$req_q->enqueue($_) for @urls;

my ($alive, $dead);
for (1..@urls) {   # expect exactly one response per URL
    my $is_alive = $resp_q->dequeue();
    ++( $is_alive ? $alive : $dead );
}

$req_q->enqueue(undef) for @workers;   # one undef per worker ends its loop
$_->join for @workers;

say $alive;
say $dead;