perl xml::sax parsing - perl

Hi I have an xml as below:
<employees>
<employee>
<firstname>John</firstname>
<lastname>Doe</lastname>
<age>gg</age>
<department>Operations</department>
<amount Ccy="EUR">100</amount>
<joinDate> 12/12/2011 </joinDate>
</employee>
<employee>
<firstname>John1111</firstname>
<lastname>Doe1111</lastname>
<age>gg</age>
<department>Operations</department>
<amount Ccy="EUR">200</amount>
<joinDate> 12/13/2011 </joinDate>
</employee>
</employees>
I would like to parse this using xml::sax. So I override start_element and end_element and characters methods. I have to handle two scenarios which i am not sure how to handle. 1) if joinDate is null then take the joinDate as sysdate. 2) sum up the amount and get a value as totalAmount. How to do this using XML::SAX module in perl.
Throw me some light which method i need to override to do this and small snippet of code would help.
Thanks,
Srikanth

It's pretty straightforward:
#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper;
use XML::SAX::ParserFactory;
use XML::SAX::PurePerl;
use Date::Calc 'Today';
my $today = sprintf("%02d/%02d/%4d", (Today())[1,2,0]);
# Alternatively if you can't use Date::Calc:
# my #localtime = localtime;
# my $today = sprintf("%02d/%02d/%4d", $localtime[4]+1, $localtime[2], $localtime[5]+1900);
my (#joindates, $total_amount, $buffer);
my $factory = new XML::SAX::ParserFactory;
my $handler = new XML::SAX::PurePerl;
my $parser = $factory->parser(
Handler => $handler,
Methods => {
characters => sub {
$buffer = shift->{Data};
$buffer =~ s/^\s+|\s+$//g;
},
end_element => sub {
return unless shift->{LocalName} =~ /^(amount|joinDate)$/;
push #joindates, $buffer || $today if $1 eq 'joinDate';
$total_amount += $buffer if $1 eq 'amount';
}
}
);
$parser->parse_uri("sample.xml");
print "Total amount: $total_amount\n";
printf("Join dates:\n%s\n", join("\n", #joindates));
Output:
Total amount: 300
Join dates:
12/12/2011
12/13/2012

Related

Issue comparing Japanese characters

I am struggling to use HTML::TokeParser
to parse an HTML document that contains Japanese characters.
Here is my code:
use utf8;
use Encode qw(decode encode is_utf8);
use Encode::Guess;
use Data::Dumper;
use LWP::UserAgent;
use HTTP::Cookies;
use Cwd;
use HTML::TokeParser;
my $local_dir = getcwd;
my $browser = LWP::UserAgent->new();
my $cookie_jar = HTTP::Cookies->new(
file => $local_dir . "/cookies.lwp",
autosave => 1,
);
$browser->cookie_jar( $cookie_jar );
push #{ $browser->requests_redirectable }, 'POST';
$browser->requests_redirectable;
my $response = $browser->get("http://www.yahoo.co.jp/");
my $html = $response->content;
print $html;
utf8::decode($html);
my $p = HTML::TokeParser->new( \$html );
# dispatch table with subs to handle the different types of tokens
my %dispatch = (
S => sub { $_[0]->[4] }, # Start tag
E => sub { $_[0]->[2] }, # End tag
T => sub { $_[0]->[1] }, # Text
C => sub { $_[0]->[1] }, # Comment
D => sub { $_[0]->[1] }, # Declaration
PI => sub { $_[0]->[2] }, # Process Instruction
);
while ( my $token = $p->get_tag('a') ) {
print $p->get_trimmed_text if $p->get_trimmed_text eq '社会的責任';
print "\n";
}
This doesn't display anything on my terminal, but if I just do a print $p->get_trimmed_text then the output is OK.
Here are a few hexdump lines corresponding to print $p->get_trimmed_text:
0000000 490a 746e 7265 656e 2074 7845 6c70 726f
0000010 7265 81e3 e4ae 92ba 8fe6 e89b a8a1 a4e7
0000020 e3ba ab81 81e3 e3a4 8481 81e3 0aa6 9fe7
0000030 e5b3 9db7 81e9 e3bc 8982 9be5 e5bd 8586
0000040 a4e5 e396 ae81 83e3 e397 ad83 82e3 e3b4
0000050 ab83 83e3 e395 a182 83e3 e3bc 8c81 86e7
0000060 e68a ac9c 94e6 e6af b48f 320a e334 ab82
0000070 89e6 e380 ae81 b4e7 e885 8991 90e5 e68d
0000080 8089 82e3 e692 a597 b8e5 e3b0 8a82 82e3
0000090 e3b3 bc83 82e3 e4b9 95bb abe7 e38b a681
00000a0 81e3 e7a7 b9b4 bbe4 0a8b 83e3 e39e af82
00000b0 83e3 e389 8a83 83e3 e3ab 8983 82e3 e384
00000c0 8783 83e3 e38b bc83 82e3 e3ba ae81 81e3
00000d0 e58a 97be 81e3 e3aa af82 83e3 e3bc 9d83
00000e0 83e3 e9b3 8d85 bfe4 0aa1 a8e8 e88e 96ab
00000f0 bce4 e39a 8c80 83e3 e392 a983 83e3 e3aa
0000100 bc83 b0e6 e58f 9d8b 88e5 e3a9 8d80 3235
0000110 e525 9986 9ce7 4e9f 5745 e50a a7a4 98e9
It seems like the comparison does not work.
I can use only HTML::TokeParser because that's the only module installed on the server and I can't install anything else.
You expect your two calls to $p->get_trimmed_text to return the same string, but it returns a different token each time it's called. Replace
print $p->get_trimmed_text if $p->get_trimmed_text eq '社会的責任';
with
my $text = $p->get_trimmed_text;
print $text if $text eq '社会的責任';
You shouldn't assume the HTML is encoded using UTF-8. Replace
my $html = $response->content;
utf8::decode($html);
with
my $html = $response->decoded_content;
Also need to encode your outputs. One way is by adding the following:
use encode ':std', ':encoding(UTF-8)';
Please see ikegami's answer. Mine is just an alternate approach which does not address the actual issue with your code.
Unicode::Collate to the rescue!
Note that I added below in your code.
use Unicode::Collate;
use open qw/:std :utf8/;
my $Collator = Unicode::Collate->new();
sub compare_strs
{
my ( $str1, $str2 ) = #_;
# Treat vars as strings by quoting.
# Possibly incorrect/irrelevant approach.
return $Collator->cmp("$str1", "$str2");
}
Note: compare_strs subroutine will return 1 (when $str1 is greater than $str2) or 0 (when $str1 is equal to $str2) or -1 (when $str1 is less than $str2).
Below is the complete working code:
use strict;
use warnings;
use utf8;
use Unicode::Collate;
use open qw/:std :utf8/;
use Encode qw(decode encode is_utf8);
use Encode::Guess;
use Data::Dumper;
use LWP::UserAgent;
use HTTP::Cookies;
use Cwd;
use HTML::TokeParser;
my $local_dir = getcwd;
my $browser = LWP::UserAgent->new();
my $cookie_jar = HTTP::Cookies->new(
file => $local_dir . "/cookies.lwp",
autosave => 1,
);
$browser->cookie_jar( $cookie_jar );
push #{ $browser->requests_redirectable }, 'POST';
$browser->requests_redirectable;
my $Collator = Unicode::Collate->new();
sub compare_strs
{
my ( $str1, $str2 ) = #_;
# Treat vars as strings by quoting.
# Possibly incorrect/irrelevant approach.
return $Collator->cmp("$str1", "$str2");
}
my $response = $browser->get("http://www.yahoo.co.jp/");
my $html = $response->content;
#print $html;
utf8::decode($html);
my $p = HTML::TokeParser->new( \$html );
# dispatch table with subs to handle the different types of tokens
my %dispatch = (
S => sub { $_[0]->[4] }, # Start tag
E => sub { $_[0]->[2] }, # End tag
T => sub { $_[0]->[1] }, # Text
C => sub { $_[0]->[1] }, # Comment
D => sub { $_[0]->[1] }, # Declaration
PI => sub { $_[0]->[2] }, # Process Instruction
);
my $string = '社会的責任';
while ( my $token = $p->get_tag('a') ) {
my $text = $p->get_trimmed_text;
unless (compare_strs($text, $string)){
print $text;
print "\n";
}
}
Output:
chankeypathak#perl:~/Desktop$ perl test.pl
社会的責任

How do I extract an attribute/property in Perl using XML::Twig module?

If I have the below sample XML, how do I extract the _Id from the field using XML::Twig?
<note>
<to _Id="100">Share</to>
<from>Jane</from>
<heading>Reminder</heading>
<body>A simple text</body>
</note>
I've tried combinations of the below with no luck.
sub getId {
my ($twig, $mod) = #_;
##my $to_id = $mod->field('to')->{'_Id'}; ## does not work
##my $to_id = $mod->{'atts'}->{_Id}; ## does not work
##my $to_id = $mod->id; ## does not work
$twig->purge;
}
This is one way to get 100. It uses the first_child method:
use warnings;
use strict;
use XML::Twig;
my $xml = <<XML;
<note>
<to _Id="100">Share</to>
<from>Jane</from>
<heading>Reminder</heading>
<body>A simple text</body>
</note>
XML
my $twig = XML::Twig->new(twig_handlers => { note => \&getId });
$twig->parse($xml);
sub getId {
my ($twig, $mod) = #_;
my $to_id = $mod->first_child('to')->att('_Id');
print "$to_id \n";
}

perl parsing using sax

I would like to write a xml parsing script in Perl that prints all the firstname values from the following xml file using XML::SAX module.
<employees>
<employee>
<firstname>John</firstname>
<lastname>Doe</lastname>
<age>gg</age>
<department>Operations</department>
<amount Ccy="EUR">100</amount>
</employee>
<employee>
<firstname>Larry</firstname>
<lastname>Page</lastname>
<age>45</age>
<department>Accounts</department>
<amount Ccy="EUR">200</amount>
</employee>
<employee>
<firstname>Harry</firstname>
<lastname>Potter</lastname>
<age>50</age>
<department>Human Resources</department>
<amount Ccy="EUR">300</amount>
</employee>
</employees>
Can anyone help me with sample script?
I am a new to Perl.
Here's an example using XML::SAX. I've used XML::SAX::PurePerl.
#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper;
use XML::SAX::ParserFactory;
use XML::SAX::PurePerl;
my $characters;
my #firstnames;
my $factory = new XML::SAX::ParserFactory;
#Let's see which handlers we have available
#print Dumper $factory;
my $handler = new XML::SAX::PurePerl;
my $parser = $factory->parser(
Handler => $handler,
Methods => {
characters => sub {
$characters = shift->{Data};
},
end_element => sub {
push #firstnames, $characters if shift->{LocalName} eq 'firstname';
}
}
);
$parser->parse_uri("sample.xml");
print Dumper \#firstnames;
Output:
$VAR1 = [
'John',
'Larry',
'Harry'
];
I use $characters to hold character data, and push its contents onto #firstnames whenever I see a closing firstname tag.
Do you have any reason to stick with XML::Sax; If not then probably you can look for some other XML parsers in Perl (XML::Twig, XML::LibXML, XML::LibXMLReader, XML::Simple) and many more.
Here is a sample code to retrieve the firstname using XML::Twig.
use XML::Twig;
my $twig = XML::Twig->new ();
$twig->parsefile ('sample.xml');
my #firstname = map { $_->text } $twig->findnodes ('//firstname');

How to read value of a node in Perl libXML::Reader

My XML looks like this-
<info>
<name>NameGoesHere</name>
<age>99</age>
</info>
Using XML::LibXML::Reader, I would like to print the age.
I read the documentation from here, and I am looking for an example. I am able to use $reader->nextElement('info') and then print innerXML but not able to figure out how do I print only age and not the whole innerxml.
Try localName:
#!/usr/bin/perl
use strict;
use warnings;
use XML::LibXML::Reader;
my $reader = XML::LibXML::Reader->new(IO => \*DATA)
or die "Cannot read from \\*DATA\n";
while ( $reader->read ) {
print $reader->readInnerXml if $reader->localName eq 'age';
}
$reader->finish;
__DATA__
<info>
<name>NameGoesHere</name>
<age>99</age>
</info>
Mostly from Regex: keep everything in <profession></profession> tags
use strict;
use warnings;
use feature qw( say );
use XML::LibXML::Reader qw(
XML_READER_TYPE_ELEMENT
XML_READER_TYPE_END_ELEMENT
XML_READER_TYPE_TEXT
);
my $reader = XML::LibXML::Reader->new(IO => \*DATA);
my $in_age = 0;
my $age;
while ($reader->read()) {
if ($reader->nodeType() == XML_READER_TYPE_ELEMENT && $reader->name() eq 'age') {
$age = '' if !$in_age;
++$in_age;
}
elsif ($reader->nodeType() == XML_READER_TYPE_END_ELEMENT && $reader->name() eq 'age') {
--$in_age;
say $age if !$in_age;
}
elsif ($in_age && $reader->nodeType() == XML_READER_TYPE_TEXT) {
$age .= $reader->value();
}
}
__DATA__
<info>
<name>NameGoesHere</name>
<age>99</age>
</info>

What module can I use to parse RSS feeds in a Perl CGI script?

I am trying to find a RSS parser that can be used with a Perl CGI script. I found simplepie and that's really easy parser to use in PHP scripting. Unfortunately that doesn't work with a Perl CGI script. Please let me know if there is anything that's easy to use like simplepie.
I came across this one RssDisplay but I am not sure about the usage and also how good it is.
From CPAN: XML::RSS::Parser.
XML::RSS::Parser is a lightweight liberal parser of RSS feeds. This parser is "liberal" in that it does not demand compliance of a specific RSS version and will attempt to gracefully handle tags it does not expect or understand. The parser's only requirements is that the file is well-formed XML and remotely resembles RSS.
#!/usr/bin/perl
use strict; use warnings;
use XML::RSS::Parser;
use FileHandle;
my $parser = XML::RSS::Parser->new;
unless ( -e 'uploads.rdf' ) {
require LWP::Simple;
LWP::Simple::getstore(
'http://search.cpan.org/uploads.rdf',
'uploads.rdf',
);
}
my $fh = FileHandle->new('uploads.rdf');
my $feed = $parser->parse_file($fh);
print $feed->query('/channel/title')->text_content, "\n";
my $count = $feed->item_count;
print "# of Items: $count\n";
foreach my $i ( $feed->query('//item') ) {
print $i->query('title')->text_content, "\n";
}
Available Perl Modules
XML::RSS::Tools
XML::RSS::Parser:
#!/usr/bin/perl -w
use strict;
use XML::RSS::Parser;
use FileHandle;
my $p = XML::RSS::Parser->new;
my $fh = FileHandle->new('/path/to/some/rss/file');
my $feed = $p->parse_file($fh);
# output some values
my $feed_title = $feed->query('/channel/title');
print $feed_title->text_content;
my $count = $feed->item_count;
print " ($count)\n";
foreach my $i ( $feed->query('//item') ) {
my $node = $i->query('title');
print ' '.$node->text_content;
print "\n";
}
XML::RSS::Parser::Lite (Pure Perl):
use XML::RSS::Parser::Lite;
use LWP::Simple;
my $xml = get("http://url.to.rss");
my $rp = new XML::RSS::Parser::Lite;
$rp->parse($xml);
print join(' ', $rp->get('title'), $rp->get('url'), $rp->get('description')), "\n";
for (my $i = 0; $i < $rp->count(); $i++) {
my $it = $rp->get($i);
print join(' ', $it->get('title'), $it->get('url'), $it->get('description')), "\n";
}
dirtyRSS:
use dirtyRSS;
$tree = parse($in);
die("$tree\n") unless (ref $tree);
disptree($tree, 0);