Parsing a dynamic page with Perl and Gtk3::WebKit - perl

I am using the Gtk3::WebKit to parse a website which loads content to itself with JavaScript. When the site is loaded it adds multiple divs with content to the DOM:
<div class="product-card">...</div>
How do I get this content with Gtk3::WebKit? How to get the nested tag's content? Is there a normal documentation about Gtk3::WebKit, because everything I've already seen is very poor documented.

Here is an example using Gtk3::WebKit2 :
use feature qw(say);
use strict;
use warnings;
use Gtk3 -init;
use Gtk3::WebKit2;
use Gtk3::JavaScriptCore;
{
my $url = 'https://metacpan.org/pod/Gtk3::WebKit2';
my $window = Gtk3::Window->new('toplevel');
$window->set_default_size(800, 600);
$window->signal_connect(destroy => sub { Gtk3->main_quit() });
my $ctx = Gtk3::WebKit2::WebContext::get_default();
my $view = Gtk3::WebKit2::WebView->new_with_context($ctx);
$view->signal_connect('load-changed', sub {
my ($view, $load_event) = #_;
if ($load_event eq 'finished') {
run_javascript(
$view,
'document.getElementsByClassName("logged_out")[1].innerHTML;'
);
}
});
$view->load_uri($url);
my $scrolls = Gtk3::ScrolledWindow->new();
$scrolls->add($view);
$window->add($scrolls);
$window->show_all();
Gtk3::main_iteration while Gtk3::events_pending;
Gtk3->main;
}
sub run_javascript {
my ($view, $javascript_string) = #_;
my $done = 0;
$view->run_javascript($javascript_string, undef, sub {
my ($object, $result, $user_data) = #_;
my $value = $view->run_javascript_finish($result)->get_js_value;
say $value->to_string;
$done = 1;
return "ok";
}, undef);
Gtk3::main_iteration while Gtk3::events_pending and not $done;
}
Output:
<a href="" onclick="alert('Please sign in to add favorites'); return false" class="favorite highlight" title="Add to favorites">
<span>2</span> ++</a>

Related

No elements found for form number 2 in phantomjs

when I am using "--disk-cache=true" in phantomjs_arg then it's getting error In this line:
my $form = $self->{obj_mech}->form_number( 2 );
No elements found for form number 2 at modules/TestLogin.pm line 1129.
at /usr/local/share/perl/5.22.1/WWW/Mechanize/PhantomJS.pm line 796.
WWW::Mechanize::PhantomJS::signal_condition(WWW::Mechanize::PhantomJS=HASH(0x4cfa120),
"No elements found for form number 2") called at
/usr/local/share/perl/5.22.1/WWW/Mechanize/PhantomJS.pm line 1732
WWW::Mechanize::PhantomJS::xpath(WWW::Mechanize::PhantomJS=HASH(0x4cfa120),
"(//form)[2]", "user_info", "form number 2", "single", 1) called at
/usr/local/share/perl/5.22.1/WWW/Mechanize/PhantomJS.pm line 2102
WWW::Mechanize::PhantomJS::form_number(WWW::Mechanize::PhantomJS=HASH(0x4cfa120),
2) called at modules/TestLogin.pm line 1129
TestLogin::TestLogin_login(TestLogin=HASH(0x4f5c8a8)) called at collectBets.pl line 20 Debugged program terminated. Use q to quit
or R to restart, use o inhibit_exit to avoid stopping after program
termination, h q, h R or h o to get additional info.
without disk-cashe it's working fine.
This is my sample code for better understanding.
#!/usr/bin/perl
use strict;
use warnings;
use Helper;
use WWW::Mechanize::PhantomJS;
use DataBase;
use MyConfig;
use JSON;
use DateTime;
use HTML::Entities;
sub new($$) {
my ($class,$params) = #_;
my $self = $params || {};
bless $self, $class;
$self->{obj_mech} = WWW::Mechanize::PhantomJS -> new( phantomjs_arg => ['--ssl-protocol=any','--disk-cache=true','--max-disk-cache-size=1024'], ignore_ssl_errors => 1);
$self->{obj_helper} = new Helper();
#$self->{obj_db} = new DataBase();
$self->{logged_in} = 0;
#$self->setTorProxy();
#$self->init_market_master();
return $self;
}
Login();
print "\nlogin done...\n";
exit;
sub Login {
my ($self) = #_;
my $html = $self->{obj_mech}->get( "https://www.gmail.com/" );
sleep(25);
$html = $self->{obj_mech}->content;
$self->{obj_mech}->viewport_size({ width => 1366, height => 768 });
my $form = $self->{obj_mech}->form_number( 2 );
my $user_name = '*****';
my $password = '******';
$self->{obj_mech}->set_fields('InputEmail' =>$user_name);
$self->{obj_mech}->set_fields('InputPassword' =>$password);
$self->{obj_mech}->click({ xpath => '//button[#class="PrimaryButton"]' });
sleep(20);
my $test_html=$self->{obj_mech}->content;
$self->{obj_helper}->writeFileNew( "TestLoginPage.html" , $test_html );
my $png = $self->{obj_mech}->content_as_png();
$self->{obj_helper}->writeFileNew( "LoginPage.png" , $png );
return 1;
}
Well, before looking at the disk-cache arguments, I found that there are no such elements.
# There is only 1 form. If you want to keep this line,
# you need to change the form number to 1
my $form = $self->{obj_mech}->form_number( 2 );
# I didn't find input field named 'InputEmail'
# The actual field name is 'Email'
$self->{obj_mech}->set_fields('InputEmail' =>$user_name);
# You have to click 'Next' button firstly then the password
# input box is shown. And the field name should be 'Passwd'
$self->{obj_mech}->set_fields('InputPassword' =>$password);
# The xpath of 'Sign in' button is //input[#value="Sign in"]
$self->{obj_mech}->click({ xpath => '//button[#class="PrimaryButton"]' });
A simple working script either with disk cache or without disk cache:
#! /usr/bin/perl
use strict;
use warnings;
use WWW::Mechanize::PhantomJS;
use open ':std', ':encoding(UTF-8)';
#my $p = WWW::Mechanize::PhantomJS->new(phantomjs_arg=>['--ssl-protocol=any','--disk-cache=false','--max-disk-cache-size=1024'],ignore_ssl_errors=>1);
my $p = WWW::Mechanize::PhantomJS->new(phantomjs_arg=>['--ssl-protocol=any','--disk-cache=true','--max-disk-cache-size=1024'],ignore_ssl_errors=>1);
my $html = $p->get("https://www.gmail.com/");
sleep(5);
write_html('first-page.html', $p->content);
$p->viewport_size({width=>1366,height=>768});
my $form = $p->form_number(1);
my $user_name = '*****';
my $password = '*****';
$p->set_fields('Email'=>$user_name);
sleep(5);
$p->click({xpath=>'//input[#value="Next"]'});
sleep(5);
write_html('after-click-next.html', $p->content);
$p->set_fields('Passwd'=>$password);
sleep(5);
$p->click({xpath=>'//input[#value="Sign in"]'});
sleep(5);
write_html('after-login.html', $p->content);
sub write_html {
my ($file, $content) = #_;
open my $fh, '>', $file or die;
print $fh $content;
close $fh;
}

Accessing Mojolicious helpers from a module using Mojo::Base?

I have an existing application (my website) that I'm doing some code tidying in, and the tidy up is following the same sort of idea as the Mojo::Pg example here, with separate model and controller files to keep things defined. My site accesses both Flickr and Last.fm's APIs, and I have a helper defined in Site::Helpers:
$app->helper(
get_base_rest_url => sub {
my ( $self, $config ) = #_;
sswitch ( $config ) {
case 'photos': {
my $base_url = 'https://api.flickr.com/services/rest/';
my $user_id = '7281432#N05';
my $api_key = $self->app->config->{ 'api_token' }{ 'flickr' };
my $url =
"$base_url"
. "?user_id=$user_id"
. "&api_key=$api_key"
. "&per_page=" . $self->session->{ per_page }
. '&format=json'
. '&nojsoncallback=1';
return $url;
}
case 'music': {
my $base_url = 'https://ws.audioscrobbler.com/2.0/';
my $username = 'virtualwolf';
my $api_key = $self->app->config->{ 'api_token' }{ 'last_fm' };
my $per_page = $self->session->{ 'per_page' };
my $url = "$base_url?user=$username&limit=$per_page&api_key=$api_key&format=json";
return $url;
}
}
}
);
The problem I'm running into is that I don't know how to access that helper from the Site::Model::Photos module. The error is
Can't locate object method "get_base_rest_url" via package "Site::Model::Photos"
which is fair enough, but I can't work out how to actually get at that get_base_rest_url helper (or alternatively, how to access the api_token config).
The problem is that your module have not got app attribute/method which get access to your app.
So, when you create instance of Site::Model::Photos you need to pass app to it in param and make it weaken something like that:
package Site::Model::Photos
use Scalar::Util 'weaken';
sub new {
my $class = shift;
my $app = shift;
my $hash = {app => $app, ...};
weaken $hash->{app};
return bless $hash, $class;
}
sub your_method {
my $self = shift;
$self->{app}->get_base_rest_url(...);
}
1;
Or you may to use this module https://metacpan.org/release/Mojolicious-Plugin-Model which do it for you:
package Site::Model::Photos
use Mojo::Base 'MojoX::Model';
... code of your module ...
sub your_method {
my $self = shift;
$self->app->get_base_rest_url(...);
}
1;
And in your App.pm need to add this:
$app->plugin('Model', {namespaces => ['Site::Model']});
And use it that in controller:
$c->model('photos');
$c->app->model('photos');

Getting the body of an http POST request, using mod-perl 2

I am writing a quick script to munge a submitted file, and return that content to the user.
My test code looks like this:
#!/path/to/bin/perl
use strict;
use warnings;
use utf8;
use Apache2::RequestRec;
use Apache2::RequestIO;
my ( $xmlin, $accepts ) = (q{}, q{});
my $format = 'json';
# read the posted content
while (
Apache2::RequestIO::read($xmlin, 1024)
) {};
{
no warnings;
$accepts = $Apache2::RequestRec::headers_in{'Accepts'};
}
if ($accepts) {
for ($accepts) {
/application\/xml/i && do {
$format = 'xml';
last;
};
/text\/plain/i && do {
$format = 'text';
last;
};
} ## end for ($accepts)
} ## end if ($accepts)
print "format: $format; xml: $xmlin\n";
This code fails to compile with Undefined subroutine &Apache2::RequestIO::read
If I comment out the while loop, the code runs fine.
Unfortunately the Apache2::RequestIO code is pulled in via Apache2::XSLoader::load __PACKAGE__; so I can't check the actual code.... but I don't understand why this doesn't work
(and yes, I've also tried $r->read(...), to no avail)
I think I have a good idea of why your code is not working.
The module Apache2::RequestIO added new functionality to Apache2::RequestRec.
In other words to add new methods/functions to the Apache2::RequestRec namespace.
I would first change Apache2::RequestIO::read to Apache2::RequestRec::read.
If that does not work move use a handler.
I have code that works which does a similar the thing
In your httpd.conf
PerlSwitches -I/path/to/module_dir
PerlLoadModule ModuleName
PerlResponseHandler ModuleName
ModuleName.pm
package ModuleName;
use strict;
use warnings;
use Apache2::RequestIO();
use Apache2::RequestRec();
use Apache2::Const -compile => qw(OK);
sub handler {
my ($r) = #_;
{
use bytes;
my $content = '';
my $offset = 0;
my $cnt = 0;
do {
$cnt = $r->read($content,8192,$offset);
$offset += $cnt;
} while($cnt == 8192);
}
return Apache2::Const::HTTP_OK;
}
I also use Apache2::RequestIO to read the body:
sub body {
my $self = shift;
return $self->{ body } if defined $self->{ body };
$self->apr->read( $self->{ body }, $self->headers_in->get( 'Content-Length' ) );
$self->{ body };
}
In this case you should subclass original Apache2::Request. Especially pay attention to our #ISA = qw(Apache2::Request);
I do not know why, but standard body method return me:
$self->body # {}
$self->body_status # Missing parser
when Content-Type is application/json. So I work around that in such way. Then parse body myself:
sub content {
my $self = shift;
return $self->{ content } if defined $self->{ content };
my $content_type = $self->headers_in->get('Content-Type');
$content_type =~ s/^(.*?);.*$/$1/;
return unless exists $self->{ $content_type };
return $self->{ content } = $self->{ $content_type }( $self->body, $self );
}
where:
use JSON;
sub new {
my ($proto, $r) = #_;
my $self = $proto->SUPER::new($r);
$self->{ 'application/json' } = sub {
decode_json shift;
};
return $self;
}

Accessing __DATA__ from super class

I have a super class called Response :
package Response;
use strict;
use warnings;
use HTML::Template;
sub response {
my ( $class, $request ) = #_;
return $request->new_response( $class->status, $class->headers, $class->body );
}
sub body {
my $class = shift;
my $template = HTML::Template->new( 'filehandle' => eval("$class::DATA") );
return $template->output() . $class;
}
sub status {
return 200;
}
sub headers {
return [ 'Content-Type' => 'text/html' ];
}
1;
__DATA__
Default content
and a subclass called URIError :
package URIError;
use strict;
use warnings;
use Response;
our #ISA = qw(Response);
1;
__DATA__
Invalid URI
When URIError->response is called, line
my $template = HTML::Template->new( 'filehandle' => eval("$class::DATA") );
in Response class does not takes DATA section content from URIError class.
What's the syntax to achieve this ?
Your code will work if you change the body method like this. There is no need for eval: all you have to do is disable strict 'refs' and dereference the string "${class}::DATA"
sub body {
my $class = shift;
my $data_fh = do {
no strict 'refs';
*{"${class}::DATA"};
};
my $template = HTML::Template->new( filehandle => $data_fh );
$template->output . $class;
}

How to clean-up HTTP::Async if still in use

I am using Perl library HTTP::Async as follows:
use strict;
use warnings;
use HTTP::Async;
use Time::HiRes;
...
my $async = HTTP::Async->new( ... );
my $request = HTTP::Request->new( GET => $url );
my $start = [Time::HiRes::gettimeofday()];
my $id = $async->add($request);
my $response = undef;
while (!$response) {
$response = $async->wait_for_next_response(1);
last if Time::HiRes::tv_interval($start) > TIME_OUT;
}
...
When while loop timeout and script ends, I experience the the following error message:
HTTP::Async object destroyed but still in use at script.pl line 0
HTTP::Async INTERNAL ERROR: 'id_opts' not empty at script.pl line 0
What are my options? How can I "clean-up" HTTP::Async object if still in use, but not needed anymore?
I would suggest that you remove incomplete requests, but the module does not provide any interface to do so.
Option 1: Add removal functionality.
Add the following to your script:
BEGIN {
require HTTP::Async;
package HTTP::Async;
if (!defined(&remove)) {
*remove = sub {
my ($self, $id) = #_;
my $hashref = $self->{in_progress}{$id}
or return undef;
my $s = $hashref->{handle};
$self->_io_select->remove($s);
delete $self->{fileno_to_id}{ $s->fileno };
delete $self->{in_progress}{$id};
delete $self->{id_opts}{$id};
return $hashref->{request};
};
}
if (!defined(&remove_all)) {
*remove_all = sub {
my ($self) = #_;
return map $self->remove($_), keys %{ $self->{in_progress} };
};
}
}
You should contact the author and see if he can add this feature. $id is the value returned by add.
Option 2: Silence all warnings from the destructor.
If you're ok with not servicing all the requests, there's no harm in silencing the warnings. You can do so as follows:
use Sub::ScopeFinalizer qw( scope_finalizer );
my $async = ...;
my $anchor = scope_finalizer {
local $SIG{__WARN__} = sub { };
$async = undef;
};
...
Note that this will silence all warnings that occur during the object's destruction, so I don't like this as much.
It's not too hard to subclass HTTP::Async for a more general solution. I use this to be able to abort all pending requests:
package HTTP::Async::WithFlush;
use strict;
use warnings;
use base 'HTTP::Async';
use Time::HiRes qw(time);
sub _flush_to_send {
my $self = shift;
for my $request (#{ $self->{to_send} }) {
delete $self->{id_opts}->{$request->[1]};
}
$self->{to_send} = [];
}
sub _flush_in_progress {
my $self = shift;
# cause all transfers to time out
for my $id (keys %{ $self->{in_progress} }) {
$self->{in_progress}->{$id}->{finish_by} = time - 1;
}
$self->_process_in_progress;
}
sub _flush_to_return {
my $self = shift;
while($self->_next_response(-1)) { }
}
sub flush_pending_requests {
my $self = shift;
$self->_flush_to_send;
$self->_flush_in_progress;
$self->_flush_to_return;
return;
}
1;
This is (maybe) easier on using the module internals than the code by #ikegami.