Perl LWP::Simple Get URL String Variable

#!/usr/bin/perl
use LWP::Simple;
use warnings;

$content = 0;
$find = "webvis.edgesuite.net";

open(HOSTLIST, "lists.hosts");
@hosts = <HOSTLIST>;

foreach $host (@hosts) {
    $results = `nslookup www.$host`;
    my $pos = index($results, $find);
    if ($pos > -1) {
        my $url = "http://www.$host";
        $content = get($url);
        print $content;
        my $pos1 = index($content, $url);
        if ($pos1 > -1) {
            print "Content Match\n";
        } else {
            print "No Content Match\n";
        }
        $count++;
        chomp($host);
        print "$count www.$host\n";
    }
}
close(HOSTLIST);
exit($errorcode);
Using the code above, I always get the following error:
IO::Socket::INET: Bad hostname 'www.test.com
If I change $url to:
$url = 'http://www.test.com';
I get the content retrieved from the page.
So my question is: how do I pass a string variable to get() so it doesn't produce the bad hostname error?
Thank you in advance.

When you read in the hosts from <HOSTLIST>, each line (except possibly the last) will have a newline at the end of it. A newline does not belong in a domain name, so it has to be explicitly removed with the chomp function before you try to do anything important with the value; in your code the chomp only happens after the URL has already been built and fetched, which is why get() sees a hostname with a trailing newline.
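A minimal sketch of the fix: chomp each host as soon as it is read, so the newline never reaches nslookup or the URL:

foreach my $host (@hosts) {
    chomp $host;                   # strip the trailing newline first
    my $url = "http://www.$host";  # the hostname is now clean
    my $content = get($url);       # LWP::Simple::get returns the body, or undef on failure
    print defined $content ? "Fetched $url\n" : "Failed to fetch $url\n";
}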


Web crawler using Perl

I want to develop a web crawler which starts from a seed URL and then crawls 100 HTML pages it finds belonging to the same domain as the seed URL, while keeping a record of the traversed URLs to avoid duplicates. I have written the following, but the $url_count value does not seem to be incremented and the retrieved URLs contain links even from other domains. How do I solve this? Here I have used stackoverflow.com as my starting URL.
use strict;
use warnings;
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;

## open file to store links
open my $file1, '>>', 'extracted_links.txt';
select($file1);

## starting URL
my @urls = ('http://stackoverflow.com/');
my $browser = LWP::UserAgent->new('IE 6');
$browser->timeout(10);
my %visited;
my $url_count = 0;

while (@urls)
{
    my $url = shift @urls;
    if (exists $visited{$url})    ## check if URL already exists
    {
        next;
    }
    else
    {
        $url_count++;
    }
    my $request  = HTTP::Request->new(GET => $url);
    my $response = $browser->request($request);
    if ($response->is_error())
    {
        printf "%s\n", $response->status_line;
    }
    else
    {
        my $contents = $response->content();
        $visited{$url} = 1;
        my @lines = split(/\n/, $contents);
        foreach my $line (@lines)
        {
            $line =~ m#(((http\:\/\/)|(www\.))([a-z]|[A-Z]|[0-9]|[/.]|[~]|[-_]|[()])*[^'">])#g;
            print "$1\n";
            push @urls, $$line[2];
        }
        sleep 60;
        if ($visited{$url} == 100)
        {
            last;
        }
    }
}
close $file1;
close $file1;
Several points: your URL parsing is fragile, and you certainly won't get relative links. Also, you don't test for 100 links but for 100 matches of the current URL, which almost certainly isn't what you mean. Finally, I'm not too familiar with LWP, so I'm going to show an example using the Mojolicious suite of tools.
This seems to work; perhaps it will give you some ideas.
#!/usr/bin/env perl

use strict;
use warnings;

use Mojo::UserAgent;
use Mojo::URL;

## open file to store links
open my $log, '>', 'extracted_links.txt' or die $!;

## starting URL
my $base = Mojo::URL->new('http://stackoverflow.com/');
my @urls = ($base);

my $ua = Mojo::UserAgent->new;
my %visited;
my $url_count = 0;

while (@urls) {
    my $url = shift @urls;
    next if exists $visited{$url};

    print "$url\n";
    print $log "$url\n";

    $visited{$url} = 1;
    $url_count++;

    # find all <a> tags and act on each
    $ua->get($url)->res->dom('a')->each(sub {
        return unless defined $_->{href};    # skip anchors without an href
        my $url = Mojo::URL->new($_->{href});
        if ( $url->is_abs ) {
            return unless $url->host eq $base->host;
        }
        else {
            $url = $url->to_abs($base);      # resolve relative links against the base
        }
        push @urls, $url;
    });

    last if $url_count == 100;
    sleep 1;
}
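The is_abs branch is what keeps the crawl on the seed's domain: absolute links are queued only when their host matches the base, while relative links (which are necessarily same-host) are resolved against the base before being queued, so every entry in @urls is a fetchable absolute URL.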

MIME::Parser - Can't save binary attachment

I am using Perl to read messages, and to look for and save attachments. Attachments will always be binary PDF documents and there will never be more than one attachment. I need to read the subject, check for and save an attachment (if one exists), and copy the message to a folder for temporary storage.
The reading, printing, and copying functions all work. I've tried a lot of different scenarios with MIME::Parser (I have MIME::Tools installed) but either get a blank file or a file with 1 or 2 characters. I'd also like to know how to determine/set the file extension rather than just blindly renaming to .pdf.
#!/usr/bin/perl
use Net::IMAP::Simple::SSL;
use Email::Simple;
use MIME::Parser;

print "Content-type: text/html\n\n";

$server = new Net::IMAP::Simple::SSL('xxx');
$server->login('xxx','xxx');

my $folder = 'inbox';
my ($unseen, $recent, $total) = $server->status($folder);
my $newm = $server->select('INBOX');
my $tmp = ($total - 9);    # limit for testing

my $outputdir = "./temp";
my $parser = new MIME::Parser;
$parser->output_dir($outputdir);

for (my $i = $tmp; $i <= $total; $i++) {
    if ($server->seen($i)) {
        print "Message #$i has been seen before...<br />";
    } else {
        my $es = Email::Simple->new(join '', @{$server->top($i)});
        print $es->header('Subject')." on ";
        print $es->header('Date')."<br />";
        print "You've just seen message #$i<br />" if $server->see($i)."<br />";
        $msg = $server->get($i);
        $parser->parse_data($msg);
        $server->copy($i,'dump');
    }
}
$server->quit();
exit;
Error
parse_data: wrong argument ref type: Net::IMAP::Simple::_message at mailextract.pl line x
Don't know why you're using two different parsers...
my $entity = $parser->parse_data($message);
my $from = $entity->head->get('From');
my $subject = $entity->head->get('Subject');
my $timestamp = $entity->head->get('Date');
for my $part ($entity->parts()) {
    if ( $part->mime_type eq 'application/pdf' ) { ### Few different types in use, see what your messages get sent with
        my $filename = $part->bodyhandle->path;
        ...
        ### Do whatever
    }
}
Edit: And your error is happening because you're not feeding in the correct thing to be parsed: a Net::IMAP::Simple::_message object instead of one of the forms parse_data accepts. From the documentation:
parse_data DATA
Instance method. Parse a MIME message that's already in core. This internally creates an "in memory" filehandle on a Perl scalar value using PerlIO.
You may supply the DATA in any of a number of ways:
- A scalar which holds the message. A reference to this scalar will be used internally.
- A ref to a scalar which holds the message. This reference will be used internally.
- DEPRECATED: a ref to an array of scalars. The array is internally concatenated into a temporary string, and a reference to the new string is used internally. It is much more efficient to pass in a scalar reference, so please consider refactoring your code to use that interface instead. If you absolutely MUST pass an array, you may be better off using IO::ScalarArray in the calling code to generate a filehandle, and passing that filehandle to parse().
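For instance, stringifying the message and handing parse_data a scalar reference satisfies that interface (a sketch; it assumes Net::IMAP::Simple's get returns an object that stringifies to the raw message text):

my $raw    = '' . $server->get($i);       # force the _message object to a plain string
my $entity = $parser->parse_data(\$raw);  # a scalar ref is the efficient interface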
Try $parser->parse($server->getfh($i));
#!/usr/bin/perl
use Net::IMAP::Simple::SSL;
use MIME::Parser;

print "Content-type: text/html\n\n";

$server = new Net::IMAP::Simple::SSL('xxx');
$server->login('xxx','xxx');

my $newm = 0;
$newm = $server->select('INBOX');
if ($newm == 0) {
    $server->quit();
    print "No New Messages.";
    exit;
}

my $outputdir = "./temp";
my $parser = new MIME::Parser;
$parser->output_dir($outputdir);

for (my $i = 1; $i <= $newm; $i++) {
    my $entity = $parser->parse($server->getfh($i));
    my $from = $entity->head->get('From');
    my $subject = $entity->head->get('Subject');
    my $timestamp = $entity->head->get('Date');
    print "#$i $from / $subject / $timestamp<br />";
    for my $part ($entity->parts()) {
        print " / ".$part->mime_type;
        if ( $part->mime_type eq 'application/octet-stream' || $part->mime_type eq 'application/pdf' ) {
            my $filename = $part->bodyhandle->path;
            print " / $filename";
        }
        print "<br />";
    }
    $server->copy($i,'dump');
    $server->delete($i);
}
$server->quit();
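As for determining the right file name and extension rather than blindly renaming to .pdf, MIME::Tools can usually recover the sender's file name from the part headers; a sketch (recommended_filename returns undef when no name was supplied):

my $name = $part->head->recommended_filename;    # taken from Content-Disposition/Content-Type
$name = 'attachment.bin' unless defined $name;   # fall back to a generic name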

Perl get webpage error with LWP::Simple

I have a project I'm working on for school but I seem to be getting an error here...
I get "Can't call method 'content' on an undefined value at line 5"
use LWP::Simple;

for (my $id = 0; $id < 55; $id++)
{
    my $response = get("http://www.gamereplays.org/community/index.php?act=medals&CODE=showmedal&MDSID=" + $id);
    my $content = $response->content;
    for (my $id2 = 0; $id2 < 10; $id2++)
    {
        $content =~ /<img src="http:\/\/www\.gamereplays.org\/community\/style_medals\/(.*)$id2\.gif" alt=""\/>/;
        $url = "http://www.gamereplays.org/community/style_medals/" . $1 . $id2 . ".gif";
        getstore($url, $1 . $id2 . ".gif");
    }
}
LWP::Simple's get doesn't return a response object; it returns the response body directly as a string.
And you should put a pause between each request to avoid pounding the targeted website.
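A corrected sketch along those lines; note also that Perl concatenates strings with . (the original builds the URL with +, which is numeric addition):

use LWP::Simple;

for my $id (0 .. 54) {
    my $content = get("http://www.gamereplays.org/community/index.php?act=medals&CODE=showmedal&MDSID=" . $id);
    next unless defined $content;    # get() returns the body as a string, or undef on failure
    # ... scan $content for the medal images and getstore() them as before ...
    sleep 1;                         # pause between requests
}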

How to use a Perl module that you have written?

I've just written my first Perl module and am having trouble getting it to work with a script I also wrote. Here is the error the Perl interpreter displays when I attempt to run the script that uses my newly created module.
Error message:
scraper_tools_v1.pm did not return a true value at getYid.pl line 5.
BEGIN failed--compilation aborted at getYid.pl line 5.
scraper_tools_v1.pm is the Perl module which I have written and getYid.pl is the Perl script which attempts to utilize the scraper_tools_v1.pm module.
Here is the code for the scraper_tools_v1.pm file:
#!/usr/bin/perl
package scraper_tools_v1;

use strict;
use warnings;
use WWW::Curl::Easy;

# Note this function expects a single parameter which should be in the form of a URL
sub getWebPage($)
{
    # Setting up the Curl parameters
    my $curl = WWW::Curl::Easy->new;    # create a variable to store the curl object

    # A parameter set to 1 tells the library to include the header in the body output.
    # This is only relevant for protocols that actually have headers preceding the data (like HTTP).
    $curl->setopt(CURLOPT_HEADER, 1);

    # Setting the target URL to retrieve with the passed parameter
    $curl->setopt(CURLOPT_URL, @_);

    # Declaring a variable to store the response from the Curl request
    my $response_body = '';

    # Creating a file handle for Curl to output to, then redirecting our output to the $response_body variable
    open(my $fileb, ">", \$response_body) or die $!;
    $curl->setopt(CURLOPT_WRITEDATA, $fileb);

    # Getting the return code from the header to see if the GET was successful
    my $return_code = $curl->perform;

    # Capturing the response code from the GET request in the HTTP header, i.e. 200, 404, 500, etc.
    # 200 is success
    my $response_code = $curl->getinfo(CURLINFO_HTTP_CODE);

    # If the return code is zero then the request was a success
    if ($return_code == 0)
    {
        # A little debug output to keep you informed
        print("Success " . $response_code . ": " . @_ . "\n");
        # Return whatever was contained on the web page that we just got using a GET
        return $response_body;
    }
    else
    {
        print("Failure " . $response_code . ": " . @_ . "\n");
    }
    close($fileb);    # close the file-handle
}
And here is the getYid.pl script which attempts to use the above module
#!/usr/bin/perl
use strict;
use warnings;
use scraper_tools_v1;

my %cat_links;    # Hash that stores categories and their numbers (ID's)
my $web_page = scraper_tools_v1->getWebPage("http://something.com/categoryindex.aspx");
my @lines = split(/\n/, $web_page);
foreach my $line (@lines)
{
    chomp($line);
    if ($line =~ /<option value=\"{1}(.+)\">(.+)<\/option>/)
    {
        my $num  = $1;
        my $desc = $2;
        $desc =~ s/\s+&\s+/ & /;
        $cat_links{$desc} = $num;
    }
}

my @allTargetUrls;    # make a new array to store all the links we need to extract listings from
$web_page = '';       # Reset this variable so we can reuse it.
my $totalNumberOfListings = 0;
foreach my $key (keys %cat_links)
{
    my $target = "http://something.com/categorydetail.aspx?id=$cat_links{$key}&exact_phrase=0";
    $web_page = scraper_tools_v1->getWebPage($target);
    @lines = split(/\n/, $web_page);
    foreach my $line (@lines)
    {
        my $pages;
        chomp($line);
        if ($line =~ /We found (\d) listings for your search\./)
        {
            my $listingsInCat = $1;
            print("$cat_links{$key}, $listingsInCat");
            $totalNumberOfListings += $listingsInCat;
        }
        if ($line =~ /Page 1 of (\d)/)
        {
            $pages = $1;
        }
        for (my $i = 1; $i <= $pages; $i++)
        {
            # build the target urls
            my $pageUrl = "http://something.com/categorydetail.aspx?id=$key&search=&exact_phrase=True&city=&state=&zipcode=&page=$i";
            push(@allTargetUrls, $pageUrl);
        }
    }
    print("Total number of listings = " . $totalNumberOfListings);
}
Any help in resolving this issue would be greatly appreciated, and please note that I have tested both files independently for interpreter errors and found nothing. Thanks to all for taking a look.
When you write a Perl module, you should always end the file with the line
1;
Perl executes code at the module level when the module is imported. If you don't return a true value (1 is true), then you'll get the error you describe. Essentially, Perl is informing you that the initialisation code in your module didn't succeed.
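In this case that just means ending scraper_tools_v1.pm with a true value, for example:

package scraper_tools_v1;
use strict;
use warnings;

# ... getWebPage and any other subs as before ...

1;    # a module file must return a true value when it is loaded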

Copy and retain previous output for backup of transformed JSON data

I have a Perl script that transforms JSON data to Perl and saves the output in files called teams.txt and backyard.txt, plus a file called backup.txt that the output of teams.txt is copied to. The following are two snippets from the script, the parts of it that write the data to the text files:
my %manager_to_directs;
my %user_to_manager;
my @users;
my $url = "https://xxxxxxxxxxxxxx.com/api/v1/reports/active/week";
my $useragent = LWP::UserAgent->new();
my $response = $useragent->get(($url));
if ($response->code !~ "200" || $response->code !~ "204" ) {
    while ($url && $url ne "Null") {
        my $data = fetch_json($url);
        last if !defined $data;
        $url = $data->{next};
.
.
.
# write backyard.txt
open my $backyard_fh, ">", "backyard.txt";
foreach my $user (sort keys %user_to_management_chain) {
    my $chain = join ',', @{$user_to_management_chain{$user}};
    print $backyard_fh "$user:$chain\n";
}
close $backyard_fh;

# write teams.txt
open my $team_fh, ">", "teams.txt";
foreach my $user (sort @users) {
    my $followers = $manager_to_followers{$user};
    my $followers_joined = $followers ? join(',', sort @$followers) : "";
    print $team_fh "$user:$followers_joined\n";
}
close $team_fh;

# write backup.txt, backup for teams.txt
open my $backup_fh, ">", "backup.txt";
copy("teams.txt", "backup.txt")
    or die("Can't copy teams.txt \n");
close $backup_fh;
This works almost exactly how I want it to, but now I've been testing a negative scenario, where the JSON URL provided in the script is false/nonexistent, and I have to make sure that a new teams.txt file is not created and the backup.txt file from the last execution is retained.
I tested by replacing
my $url = "https://xxxxxxxxxxxxxx.com/api/v1/reports/active/week";
with
my $url = "https://fakeUrl.com/api/v1/reports/active/week";
And in this scenario, a 404 would be returned and the program is supposed to fail. With this test, I noticed that the contents of teams.txt and backyard.txt get wiped, but the backup.txt file gets wiped too... and that's not good.
I'm fine with teams.txt and backyard.txt being overwritten on each run of the script, but I need the backup.txt file to be retained no matter what, unless the program runs successfully and there's new content from teams.txt to copy over to backup.txt.
Any help I can get is highly appreciated!
The following code snippets are taken almost directly from the documentation for the modules used.
Maybe you should try this approach.
use strict;
use warnings;
use feature 'say';

use LWP::UserAgent ();

my $url = 'https://metacpan.org/pod/HTTP::Tiny';
$url = 'https://fakeUrl.com/api/v1/reports/active/week';

my $ua = LWP::UserAgent->new(timeout => 10);
$ua->env_proxy;

my $response = $ua->get($url);

my $data;
if ($response->is_success) {
    $data = $response->decoded_content;
}
else {
    die $response->status_line;
}

# Process further data
say $data;
Output
500 Can't connect to fakeUrl.com:443 (Bad file descriptor) at C:\....\http_lwp.pl line 19.
use strict;
use warnings;
use feature 'say';

use HTTP::Tiny;

my $url = 'https://metacpan.org/pod/HTTP::Tiny';
$url = 'https://fakeUrl.com/api/v1/reports/active/week';

my $data;
my $response = HTTP::Tiny->new->get($url);
if ($response->{success}) {
    $data = $response->{content};
}
else {
    say "$response->{status} $response->{reason}";
    exit 1;
}

# Process further data
say $data;
Output
403 Forbidden
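To tie that back to the original problem: backup.txt was being wiped because open my $backup_fh, ">", "backup.txt" truncates the file before the copy ever runs, and the original response check (!~ against both "200" and "204") is always true, so failures were never caught. Only touch the output files after a successful fetch; a sketch of that ordering (write_teams and write_backyard are hypothetical subs standing in for your existing file-writing sections):

use File::Copy qw(copy);

my $response = $ua->get($url);
die "Fetch failed: " . $response->status_line . "\n"
    unless $response->is_success;    # bail out before any output file is opened

write_teams();                       # your existing teams.txt writer
write_backyard();                    # your existing backyard.txt writer
copy('teams.txt', 'backup.txt')      # refresh the backup only after a good run
    or die "Can't copy teams.txt: $!\n";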