I am struggling to use HTML::TokeParser
to parse an HTML document that contains Japanese characters.
Here is my code:
use utf8;
use Encode qw(decode encode is_utf8);
use Encode::Guess;
use Data::Dumper;
use LWP::UserAgent;
use HTTP::Cookies;
use Cwd;
use HTML::TokeParser;
my $local_dir = getcwd;
my $browser = LWP::UserAgent->new();
my $cookie_jar = HTTP::Cookies->new(
file => $local_dir . "/cookies.lwp",
autosave => 1,
);
$browser->cookie_jar( $cookie_jar );
push #{ $browser->requests_redirectable }, 'POST';
$browser->requests_redirectable;
my $response = $browser->get("http://www.yahoo.co.jp/");
my $html = $response->content;
print $html;
utf8::decode($html);
my $p = HTML::TokeParser->new( \$html );
# dispatch table with subs to handle the different types of tokens
my %dispatch = (
S => sub { $_[0]->[4] }, # Start tag
E => sub { $_[0]->[2] }, # End tag
T => sub { $_[0]->[1] }, # Text
C => sub { $_[0]->[1] }, # Comment
D => sub { $_[0]->[1] }, # Declaration
PI => sub { $_[0]->[2] }, # Process Instruction
);
while ( my $token = $p->get_tag('a') ) {
print $p->get_trimmed_text if $p->get_trimmed_text eq '社会的責任';
print "\n";
}
This doesn't display anything on my terminal, but if I just do a print $p->get_trimmed_text then the output is OK.
Here are a few hexdump lines corresponding to print $p->get_trimmed_text:
0000000 490a 746e 7265 656e 2074 7845 6c70 726f
0000010 7265 81e3 e4ae 92ba 8fe6 e89b a8a1 a4e7
0000020 e3ba ab81 81e3 e3a4 8481 81e3 0aa6 9fe7
0000030 e5b3 9db7 81e9 e3bc 8982 9be5 e5bd 8586
0000040 a4e5 e396 ae81 83e3 e397 ad83 82e3 e3b4
0000050 ab83 83e3 e395 a182 83e3 e3bc 8c81 86e7
0000060 e68a ac9c 94e6 e6af b48f 320a e334 ab82
0000070 89e6 e380 ae81 b4e7 e885 8991 90e5 e68d
0000080 8089 82e3 e692 a597 b8e5 e3b0 8a82 82e3
0000090 e3b3 bc83 82e3 e4b9 95bb abe7 e38b a681
00000a0 81e3 e7a7 b9b4 bbe4 0a8b 83e3 e39e af82
00000b0 83e3 e389 8a83 83e3 e3ab 8983 82e3 e384
00000c0 8783 83e3 e38b bc83 82e3 e3ba ae81 81e3
00000d0 e58a 97be 81e3 e3aa af82 83e3 e3bc 9d83
00000e0 83e3 e9b3 8d85 bfe4 0aa1 a8e8 e88e 96ab
00000f0 bce4 e39a 8c80 83e3 e392 a983 83e3 e3aa
0000100 bc83 b0e6 e58f 9d8b 88e5 e3a9 8d80 3235
0000110 e525 9986 9ce7 4e9f 5745 e50a a7a4 98e9
It seems like the comparison does not work.
I can use only HTML::TokeParser because that's the only module installed on the server and I can't install anything else.
You expect your two calls to $p->get_trimmed_text to return the same string, but it returns a different token each time it's called. Replace
print $p->get_trimmed_text if $p->get_trimmed_text eq '社会的責任';
with
my $text = $p->get_trimmed_text;
print $text if $text eq '社会的責任';
You shouldn't assume the HTML is encoded using UTF-8. Replace
my $html = $response->content;
utf8::decode($html);
with
my $html = $response->decoded_content;
Also need to encode your outputs. One way is by adding the following:
use encode ':std', ':encoding(UTF-8)';
Please see ikegami's answer. Mine is just an alternate approach which does not address the actual issue with your code.
Unicode::Collate to the rescue!
Note that I added below in your code.
use Unicode::Collate;
use open qw/:std :utf8/;
my $Collator = Unicode::Collate->new();
sub compare_strs
{
my ( $str1, $str2 ) = #_;
# Treat vars as strings by quoting.
# Possibly incorrect/irrelevant approach.
return $Collator->cmp("$str1", "$str2");
}
Note: compare_strs subroutine will return 1 (when $str1 is greater than $str2) or 0 (when $str1 is equal to $str2) or -1 (when $str1 is less than $str2).
Below is the complete working code:
use strict;
use warnings;
use utf8;
use Unicode::Collate;
use open qw/:std :utf8/;
use Encode qw(decode encode is_utf8);
use Encode::Guess;
use Data::Dumper;
use LWP::UserAgent;
use HTTP::Cookies;
use Cwd;
use HTML::TokeParser;
my $local_dir = getcwd;
my $browser = LWP::UserAgent->new();
my $cookie_jar = HTTP::Cookies->new(
file => $local_dir . "/cookies.lwp",
autosave => 1,
);
$browser->cookie_jar( $cookie_jar );
push #{ $browser->requests_redirectable }, 'POST';
$browser->requests_redirectable;
my $Collator = Unicode::Collate->new();
sub compare_strs
{
my ( $str1, $str2 ) = #_;
# Treat vars as strings by quoting.
# Possibly incorrect/irrelevant approach.
return $Collator->cmp("$str1", "$str2");
}
my $response = $browser->get("http://www.yahoo.co.jp/");
my $html = $response->content;
#print $html;
utf8::decode($html);
my $p = HTML::TokeParser->new( \$html );
# dispatch table with subs to handle the different types of tokens
my %dispatch = (
S => sub { $_[0]->[4] }, # Start tag
E => sub { $_[0]->[2] }, # End tag
T => sub { $_[0]->[1] }, # Text
C => sub { $_[0]->[1] }, # Comment
D => sub { $_[0]->[1] }, # Declaration
PI => sub { $_[0]->[2] }, # Process Instruction
);
my $string = '社会的責任';
while ( my $token = $p->get_tag('a') ) {
my $text = $p->get_trimmed_text;
unless (compare_strs($text, $string)){
print $text;
print "\n";
}
}
Output:
chankeypathak#perl:~/Desktop$ perl test.pl
社会的責任
Related
I would like to log SOAP::Lite transport request/response contents using a custom identifier (e.g. a transaction-id or txn_id in my example below):
#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper;
use SOAP::Lite +trace => [ transport => \&log_transport, ];
sub log_transport {
my ($in) = #_;
if (ref($in) eq "HTTP::Request") {
# INSERT INTO logs ( txn_id, request ) VALUES ( $tnx_id, $in->content )
say STDERR Dumper(ref($in), $in->content);
}
elsif (ref($in) eq "HTTP::Response") {
# UPDATE logs SET response = '$in->content' WHERE txn_id = $tnx_id
say STDERR Dumper(ref($in), $in->content);
}
}
$ENV{PERL_LWP_SSL_VERIFY_HOSTNAME}=0;
my $soap = SOAP::Lite->proxy('https://www.dataaccess.com/webservicesserver/NumberConversion.wso?op=NumberToWords');
$soap->serializer()->register_ns('http://www.dataaccess.com/webservicesserver/' ,"ns");
sub getWords
{
my ($number, $txn_id) = #_ ;
my $method = SOAP::Data->name("ns:NumberToWords");
my #params = ( SOAP::Data->name("ubiNum" => $number) );
my $response = $soap->call($method => #params);
if (!$response->fault) {
say STDOUT "NumberToWords = " . $response->result;
}
else {
say STDERR "error: " . (defined $response->faultstring? $response->faultstring : $soap->transport->status);
}
}
getWords(444, '123abc');
In my example above, how can I pass the transaction-id 123abc to my logger?
P.S. I do not wish to use:
$soap->outputxml(1)->call($method => #params)
It does not seem like the SOAP::Trace transport callback supports extra argument passing. As a workaround you could use a lexical variable declared in the outer scope like this:
use strict;
use warnings;
use Data::Dumper;
my $TXN_ID;
use SOAP::Lite +trace => [ transport => \&log_transport, ];
sub log_transport {
my ($in) = #_;
say STDERR "Logging transaction id: $TXN_ID:";
if (ref($in) eq "HTTP::Request") {
# INSERT INTO logs ( txn_id, request ) VALUES ( $tnx_id, $in->content )
say STDERR Dumper(ref($in), $in->content);
}
elsif (ref($in) eq "HTTP::Response") {
# UPDATE logs SET response = '$in->content' WHERE txn_id = $tnx_id
say STDERR Dumper(ref($in), $in->content);
}
}
$ENV{PERL_LWP_SSL_VERIFY_HOSTNAME}=0;
my $soap = SOAP::Lite->proxy('https://www.dataaccess.com/webservicesserver/NumberConversion.wso?op=NumberToWords');
$soap->serializer()->register_ns('http://www.dataaccess.com/webservicesserver/' ,"ns");
sub getWords
{
my ($number, $txn_id) = #_ ;
$TXN_ID = $txn_id;
my $method = SOAP::Data->name("ns:NumberToWords");
my #params = ( SOAP::Data->name("ubiNum" => $number) );
my $response = $soap->call($method => #params);
if (!$response->fault) {
say STDOUT "NumberToWords = " . $response->result;
}
else {
say STDERR "error: " . (defined $response->faultstring? $response->faultstring : $soap->transport->status);
}
}
getWords(444, '123abc');
when I am using "--disk-cache=true" in phantomjs_arg then it's getting error In this line:
my $form = $self->{obj_mech}->form_number( 2 );
No elements found for form number 2 at modules/TestLogin.pm line 1129.
at /usr/local/share/perl/5.22.1/WWW/Mechanize/PhantomJS.pm line 796.
WWW::Mechanize::PhantomJS::signal_condition(WWW::Mechanize::PhantomJS=HASH(0x4cfa120),
"No elements found for form number 2") called at
/usr/local/share/perl/5.22.1/WWW/Mechanize/PhantomJS.pm line 1732
WWW::Mechanize::PhantomJS::xpath(WWW::Mechanize::PhantomJS=HASH(0x4cfa120),
"(//form)[2]", "user_info", "form number 2", "single", 1) called at
/usr/local/share/perl/5.22.1/WWW/Mechanize/PhantomJS.pm line 2102
WWW::Mechanize::PhantomJS::form_number(WWW::Mechanize::PhantomJS=HASH(0x4cfa120),
2) called at modules/TestLogin.pm line 1129
TestLogin::TestLogin_login(TestLogin=HASH(0x4f5c8a8)) called at collectBets.pl line 20 Debugged program terminated. Use q to quit
or R to restart, use o inhibit_exit to avoid stopping after program
termination, h q, h R or h o to get additional info.
without disk-cashe it's working fine.
This is my sample code for better understanding.
#!/usr/bin/perl
use strict;
use warnings;
use Helper;
use WWW::Mechanize::PhantomJS;
use DataBase;
use MyConfig;
use JSON;
use DateTime;
use HTML::Entities;
sub new($$) {
my ($class,$params) = #_;
my $self = $params || {};
bless $self, $class;
$self->{obj_mech} = WWW::Mechanize::PhantomJS -> new( phantomjs_arg => ['--ssl-protocol=any','--disk-cache=true','--max-disk-cache-size=1024'], ignore_ssl_errors => 1);
$self->{obj_helper} = new Helper();
#$self->{obj_db} = new DataBase();
$self->{logged_in} = 0;
#$self->setTorProxy();
#$self->init_market_master();
return $self;
}
Login();
print "\nlogin done...\n";
exit;
sub Login {
my ($self) = #_;
my $html = $self->{obj_mech}->get( "https://www.gmail.com/" );
sleep(25);
$html = $self->{obj_mech}->content;
$self->{obj_mech}->viewport_size({ width => 1366, height => 768 });
my $form = $self->{obj_mech}->form_number( 2 );
my $user_name = '*****';
my $password = '******';
$self->{obj_mech}->set_fields('InputEmail' =>$user_name);
$self->{obj_mech}->set_fields('InputPassword' =>$password);
$self->{obj_mech}->click({ xpath => '//button[#class="PrimaryButton"]' });
sleep(20);
my $test_html=$self->{obj_mech}->content;
$self->{obj_helper}->writeFileNew( "TestLoginPage.html" , $test_html );
my $png = $self->{obj_mech}->content_as_png();
$self->{obj_helper}->writeFileNew( "LoginPage.png" , $png );
return 1;
}
Well, before looking at the disk-cache arguments, I found that there are no such elements.
# There is only 1 form. If you want to keep this line,
# you need to change the form number to 1
my $form = $self->{obj_mech}->form_number( 2 );
# I didn't find input field named 'InputEmail'
# The actual field name is 'Email'
$self->{obj_mech}->set_fields('InputEmail' =>$user_name);
# You have to click 'Next' button firstly then the password
# input box is shown. And the field name should be 'Passwd'
$self->{obj_mech}->set_fields('InputPassword' =>$password);
# The xpath of 'Sign in' button is //input[#value="Sign in"]
$self->{obj_mech}->click({ xpath => '//button[#class="PrimaryButton"]' });
A simple working script either with disk cache or without disk cache:
#! /usr/bin/perl
use strict;
use warnings;
use WWW::Mechanize::PhantomJS;
use open ':std', ':encoding(UTF-8)';
#my $p = WWW::Mechanize::PhantomJS->new(phantomjs_arg=>['--ssl-protocol=any','--disk-cache=false','--max-disk-cache-size=1024'],ignore_ssl_errors=>1);
my $p = WWW::Mechanize::PhantomJS->new(phantomjs_arg=>['--ssl-protocol=any','--disk-cache=true','--max-disk-cache-size=1024'],ignore_ssl_errors=>1);
my $html = $p->get("https://www.gmail.com/");
sleep(5);
write_html('first-page.html', $p->content);
$p->viewport_size({width=>1366,height=>768});
my $form = $p->form_number(1);
my $user_name = '*****';
my $password = '*****';
$p->set_fields('Email'=>$user_name);
sleep(5);
$p->click({xpath=>'//input[#value="Next"]'});
sleep(5);
write_html('after-click-next.html', $p->content);
$p->set_fields('Passwd'=>$password);
sleep(5);
$p->click({xpath=>'//input[#value="Sign in"]'});
sleep(5);
write_html('after-login.html', $p->content);
sub write_html {
my ($file, $content) = #_;
open my $fh, '>', $file or die;
print $fh $content;
close $fh;
}
I have a requirement to fetch many http urls and I use AnyEvent::HTTP to do this
For every URL I need to measure the time taken how can I do this ?
My code (stripped down) is here
#!/usr/bin/perl
use strict;
use AnyEvent::HTTP;
use AnyEvent::Socket;
use Data::Dumper;
my $internal_ip=v192.168.2.103; #Use this ip to bind instead of default ip. Harcoding necessary :-( using v$ip
sub prep_cb {
my ($socket)=#_;
my $bind = AnyEvent::Socket::pack_sockaddr undef, $internal_ip;
# I need to start the time here
bind $socket, $bind
or die "bind: $!";
}
my $url="http://192.168.2.105/echo.php";
my $anyevent = AnyEvent->condvar;
$anyevent->begin;
http_request(
"GET" => $url,
on_prepare =>\&prep_cb,
sub {
my ($data, $hdr) = #_;
$anyevent->end;
# I need to measure the time taken
print Dumper([$data,$hdr]);
}
);
$anyevent->recv;
What if you replace your http_request() with the following:
my $timer;
http_request(
"GET" => $url,
on_prepare => sub {$timer = time; prep_cb},
sub {
my ($data, $hdr) = #_;
$anyevent->end;
print "Took " . (time - $timer) . " seconds.\n";
print Dumper([$data,$hdr]);
}
);
Simpler way is to have a variable and update it on on_prepare and log it after $anyevent->end as mentioned by TheAmigo
A general way to profile/time any function:
Assuming your function is fetchHttpUrl($url),
you could call it like this
profile(\&fetchHttpUrl, $url);
sub profile {
my($function, #arguments) = #_;
my $startTime = currentTimeInMilliseconds();
$function->(#arguments);
my $durationInMs = currentTimeInMilliseconds() - $startTime;
print"{".getMethodNameFromPointer($function)."(".join(",", #arguments).")"."} : $durationInMs ms";
}
I am trying to pass parameter into packages with import but it override my export. I saw some suggested $main:debugLevel in Debugger.pm but it doesn't work. How to fix this?
main.pl
our $debugLevel = 5;
our $dDebug=TRUE;
our $dPkg=__PACKAGE__;
our $dMsg="";
use MyPkg::Debugger qw( :all );
# ^-- how do I pass in the variables declared above?
...
dPrintLog(4, 'testsub', 'msg', $mydata);
# ^-- this generate error, if i put in the "import" sub in the Debugger.pm
Debugger.pm
use strict;
package MyPkg::Debugger;
our $VERSION = 1.00;
our #ISA = qw(Exporter);
our #EXPORT_OK = qw(dPrintLog );
our %EXPORT_TAGS = (
all => \#EXPORT_OK
);
use DateTime::Format::Strptime;
use POSIX qw(strftime);
use Data::Dumper;
our $debugLevel = 5;
our $dDebug=TRUE;
our $dPkg=__PACKAGE__;
our $dMsg="";
sub import {
my ($debugLevel , $dDebug, $dMsg, $data) = #_;
}
sub dPrintLog {
my ($level, $sub, $msg, $data) = #_;
if ($level == 5) {
print "L:" . $level . ";" . "Pkg:" . $sub . ";". "Msg: " . $msg . "\n";
print ' '.Dumper($data) unless (!defined $data);
}elsif ($level == 3){
}elsif ($level == 1){
}else{
}
}
Read Exporter's documentation carefully. It contains all the information you need.
App.pl
#!/usr/bin/perl
use warnings;
use strict;
use MyDbg (':all', 5); # Try removing the 5.
dPrintLog(undef, 'message');
MyDbg.pm
package MyDbg;
use warnings;
use strict;
our #ISA = qw(Exporter);
our #EXPORT_OK = qw(dPrintLog );
our %EXPORT_TAGS = ( all => \#EXPORT_OK );
use Exporter ();
my $debugLevel = 3;
sub import {
my ($class, $tag, $level) = #_;
$debugLevel = $level if $level;
$class->Exporter::export_to_level(1, $class, $tag);
}
sub dPrintLog {
my ($level, $msg) = #_;
$level ||= $debugLevel;
print { 5 => "L:$level $msg",
3 => "$msg",
1 => substr $msg, 0 ,1,
}->{$level}, "\n";
}
__PACKAGE__
Interestingly, it doesn't work if you remove the #ISA line. You have to declare the export subroutine to make it work (based on my experiments, no documentation found):
sub export { Exporter::export(#_) }
or
*export = *Exporter::export{CODE};
or, even
sub export { goto &Exporter::export }
#!/usr/bin/perl -w
use WWW::LinkedIn;
use CGI; # load CGI routines
use CGI::Session;
$q = CGI->new; # create new CGI object
print $q->header, # create the HTTP header
$q->start_html('hello world'), # start the HTML
$q->h1('hello world'), # level 1 header
$q->end_html; # end the HTML
my $consumer_key = 'xxxxxxx';
my $consumer_secret = 'xxxxxxxxx';
my $li = WWW::LinkedIn->new(
consumer_key => $consumer_key,
consumer_secret => $consumer_secret,
);
if ( length( $ENV{'QUERY_STRING'} ) > 0 ) {
$buffer = $ENV{'QUERY_STRING'};
#pairs = split( /&/, $buffer );
foreach $pair (#pairs) {
( $name, $value ) = split( /=/, $pair );
$value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
$in{$name} = $value;
}
$sid = $q->cookie('CGISESSID') || $q->param('CGISESSID') || undef;
$session = new CGI::Session( undef, $sid, { Directory => '/tmp' } );
my $access_token = $li->get_access_token(
verifier => $in{'oauth_verifier'},
request_token => $session->param("request_token"),
request_token_secret => $session->param("request_token_secret"),
);
undef($session);
my $profile_xml = $li->request(
request_url =>
'http://api.linkedin.com/v1/people/~:(id,first-name,last-name,positions,industry,distance)',
access_token => $access_token->{token},
access_token_secret => $access_token->{secret},
);
print $profile_xml;
}
The output is printing in single line. I want to print that is separate line.
OUTPUT
aAVGFD34 jj DD 456456 2003 6 true ara systems Technology and Services Technology and Services 0
How can i get the each column value from the profile_xml variable?
id avsdff
first name jj
lastname dd
Simply use Data::Dumper and XML::Simple.
use Data::Dumper;
use XML::Simple; #you may want to install a specific package from your distribution
{...}
my $hash_ref = SimpeXML::XMLin($profile_xml);
print Dumper($hash_ref);
I do not know if you would like more beautifully output.
try just to make simple print out from your hash reference
foreach $key (keys %{$profile_xml}) {
print "$key $profile_xml->{$key}\n";
}
Here i am going the show the way to parse the data and print in the individual lines.
my $parser = XML::Parser->new( Style => 'Tree' );
my $tree = $parser->parse( $profile_xml );
#print Dumper( $tree ); you can use this see the data displayed in the tree formatted
my $UID = $tree->[1]->[4]->[2],"\n";
print "User ID:$UID";
print"</br>";
my $FirstName = $tree->[1]->[8]->[2],"\n";
print "First Name:$FirstName";
print"</br>";
For sample i have showed for UID and FirstName. And this is working fine.