Perl word Stemming English text - perl

I am trying to stem an English text, I read a lot of forums but I couldn't see a clear example. I am using porter stemmer as in using Text::ENglish.
This is how far I got:
use Lingua::StopWords qw(getStopWords);
my $stopwords = getStopWords('en');
use Text::English;
#stopwords = grep { $stopwords->{$_} } (keys %$stopwords);
chdir("c:/Test Facility/input");
#files = <*>;
foreach $file (#files)
{
open (input, $file);
while (<input>)
{
open (output,">>c:/Test Facility/normalized/".$file);
chomp;
for my $w (#stopwords)
{
s/\b\Q$w\E\b//ig;
}
$_ =~s/<[^>]*>//g;
$_ =~ s/[[:punct:]]//g;
##What should I write here to apply porter stemming using Text::English##
print output "$_\n";
}
}
close (input);
close (output);

Run the following code like this:
perl stemmer.pl /usr/lib/jvm/java-6-sun-1.6.0.26/jre/LICENSE
It produces output similar to:
operat system distributor licens java version sun microsystems inc sun willing to license java platform standard edition developer kit jdk
Note that strings with length 1 and numeric values are removed, besides stopwords.
#!/usr/bin/env perl
use common::sense;
use Encode;
use Lingua::Stem::Snowball;
use Lingua::StopWords qw(getStopWords);
use Scalar::Util qw(looks_like_number);
my $stemmer = Lingua::Stem::Snowball->new(
encoding => 'UTF-8',
lang => 'en',
);
my %stopwords = map {
lc
} keys %{getStopWords(en => 'UTF-8')};
local $, = ' ';
say map {
sub {
my #w =
map {
encode_utf8 $_
} grep {
length >= 2
and not looks_like_number($_)
and not exists $stopwords{lc($_)}
} split
/[\W_]+/x,
shift;
$stemmer->stem_in_place(\#w);
map {
lc decode_utf8 $_
} #w
}->($_);
} <>;

Related

Read ini files without section names

I want to make a configuration file which hold some objects, like this (where of course none of the paramaters can be considered as a primary key)
param1=abc
param2=ghj
param1=bcd
param2=hjk
; always the sames parameters
This file could be read, lets say with Config::IniFiles, because it has a direct transcription into ini file, like this
[0]
param1=abc
param2=ghj
[1]
param1=bcd
param2=hjk
with, for example, something like
perl -pe 'if (m/^\s*$/ || !$section ) print "[", ($section++ || 0) , "]"'
And finish with
open my $fh, '<', "/path/to/config_file.ini" or die $!;
$cfg = Config::IniFiles->new( -file => $fh );
(...parse here the sections starting with 0.)
But, I here ask me some question about the thing becoming quite complex....
(A) Is There a way to transform the $fh, so that it is not required to execute the perl one-liner BEFORE reading the file sequentially? So, to transform the file during perl is actually reading it.
or
(B) Is there a module to read my wonderfull flat database? Or something approching? I let myslef said, that Gnu coreutils does this kind of flat file reading, but I cannot remember how.
You can create a simple subclass of Config::INI::Reader:
package MyReader;
use strict;
use warnings;
use base 'Config::INI::Reader';
sub new {
my $class = shift;
my $self = $class->SUPER::new( #_ );
$self->{section} = 0;
return $self;
}
sub starting_section { 0 };
sub can_ignore { 0 };
sub parse_section_header {
my ( $self, $line ) = #_;
return $line =~ /^\s*$/ ? ++$self->{section} : undef ;
}
1;
With your input this gives:
% perl -MMyReader -MData::Dumper -e 'print Dumper( MyReader->read_file("cfg") )'
$VAR1 = {
'1' => {
'param2' => 'hjk',
'param1' => 'bcd'
},
'0' => {
'param2' => 'ghj',
'param1' => 'abc'
}
};
You can use a variable reference instead of a file name to create a filehandle that reads from it:
use strict;
use warnings;
use autodie;
my $config = "/path/to/config_file.ini";
my $content = do {
local $/;
open my $fh, "<", $config;
"\n". <$fh>;
};
# one liner replacement
my $section = 0;
$content =~ s/^\s*$/ "\n[". $section++ ."]" /mge;
open my $fh, '<', \$content;
my $cfg = Config::IniFiles->new( -file => $fh );
# ...
You can store the modified data in a real file or a string variable, but I suggest that you use paragraph mode by setting the input record separator $/ to the empty string. Like this
use strict;
use warnings;
{
local $/ = ''; # Read file in "paragraphs"
my $section = 0;
while (<DATA>) {
printf "[%d]\n", $section++;
print;
}
}
__DATA__
param1=abc
param2=ghj
param1=bcd
param2=hjk
output
[0]
param1=abc
param2=ghj
[1]
param1=bcd
param2=hjk
Update
If you read the file into a string, adding section identifiers as above, then you can read the result directly into a Config::IniFiles object using a string reference, for instance
my $config = Config::IniFiles->new(-file => \$modified_contents)
This example shows the tie interface, which results in a Perl hash that contains the configuration information. I have used Data::Dump only to show the structure of the resultant hash.
use strict;
use warnings;
use Config::IniFiles;
my $config;
{
open my $fh, '<', 'config_file.ini' or die "Couldn't open config file: $!";
my $section = 0;
local $/ = '';
while (<$fh>) {
$config .= sprintf "[%d]\n", $section++;
$config .= $_;
}
};
tie my %config, 'Config::IniFiles', -file => \$config;
use Data::Dump;
dd \%config;
output
{
# tied Config::IniFiles
"0" => {
# tied Config::IniFiles::_section
param1 => "abc",
param2 => "ghj",
},
"1" => {
# tied Config::IniFiles::_section
param1 => "bcd",
param2 => "hjk",
},
}
You may want to perform operations on a flux of objects (as Powershell) instead of a flux of text, so
use strict;
use warnings;
use English;
sub operation {
# do something with objects
...
}
{
local $INPUT_RECORD_SEPARATOR = '';
# object are separated with empty lines
while (<STDIN>) {
# key value
my %object = ( m/^ ([^=]+) = ([[:print:]]*) $ /xmsg );
# key cannot have = included, which is the delimiter
# value are printable characters (one line only)
operation ( \%object )
}
A like also other answers.

How to search a string in web page and print that full line in which search string is present?

I'm new to programming, learning perl as well.
Here's my question: How do I search a string in web page and print that full line in which search string is present?
Is it possible to find/hit directly that string and then print that full line in which search string is present? Do we need to use xpaths compulsory for this?
If it is just a very basic string you are looking for you can use LWP::Simple and a small regular expression like this:
use LWP::Simple;
my $doc = get('http://stackoverflow.com/q/11771655/479133') || die "GET failed";
foreach my $line (split("\n", $doc)) {
print $line and last if $line =~ m/Here's my query/;
}
There are countless modules available on CPAN to do such things. Have a look at Task::Kensho::WebCrawling if you need something "bigger".
LWP::UserAgent and HTML::Parser can be used:
#!/usr/bin/env perl
use strict;
use warnings;
use HTML::Parser;
use LWP::UserAgent;
my $ua = LWP::UserAgent->new;
my $response = $ua->get('http://search.cpan.org/');
if ( !$response->is_success ) {
print "No matches\n";
exit 1;
}
my $parser = HTML::Parser->new( 'text_h' => [ \&text_handler, 'dtext' ] );
$parser->parse( $response->decoded_content );
sub text_handler {
chomp( my $text = shift );
if ( $text =~ /language/i ) {
print "Matched: $text\n";
}
}

diff behaviour between perl 5.8.8 and 5.10.1

I have this script:
#!/usr/bin/perl
$LOGFILE = "Soccer.txt";
open(LOGFILE) or die("Could not open log file.");
foreach $line (<LOGFILE>) {
chomp $line;
($hour, $match, $idh, $back1, $lay1, $idd, $backx, $layx, $ida, $back2, $lay2) = split(/\s*,\s*/,$line);
$match =~ s/^(\d{2}):(\d{2}) //g; #/ # fix highlighting
($hteam,$ateam) = split(/\s*ยง\s*/,$match,2);
$hteam = get_name($hteam);
$ateam = get_name($ateam);
$match = "$hteam - $ateam";
$foo=qq($hour "$match" $idh $back1 $lay1 $idd $backx $layx $ida $back2 $lay2 \n) ;
print $foo;
}
sub get_name {
# Return the full name for the team, if it exists, otherwise return the original
my %alias = (
"Aalen" => "Vfr Aalen",
"Accrington" => "Accrington Stanley",
"Accrington St" => "Accrington Stanley",
"Adelaide Utd" => "Adelaide United Fc"
);
return $alias{$_[0]} // $_[0];
}
This scrpit works perfect in my system:
perl, v5.10.1 (*) built for i686-linux-gnu-thread-multi
Now I would like run it in a different system:
perl, v5.8.8 built for x86_64-linux-thread-multi
When I try to run it, I get this error:
Search pattern not terminated at ./scriptbd.robot.pl line 458.
Why am I getting an error?
return $alias{$_[0]} // $_[0];
The // operator was added in 5.10. To make it work on earlier Perls, rewrite it:
return (defined $alias{$_[0]}) ? $alias{$_[0]} : $_[0];

What module can I use to parse RSS feeds in a Perl CGI script?

I am trying to find a RSS parser that can be used with a Perl CGI script. I found simplepie and that's really easy parser to use in PHP scripting. Unfortunately that doesn't work with a Perl CGI script. Please let me know if there is anything that's easy to use like simplepie.
I came across this one RssDisplay but I am not sure about the usage and also how good it is.
From CPAN: XML::RSS::Parser.
XML::RSS::Parser is a lightweight liberal parser of RSS feeds. This parser is "liberal" in that it does not demand compliance of a specific RSS version and will attempt to gracefully handle tags it does not expect or understand. The parser's only requirements is that the file is well-formed XML and remotely resembles RSS.
#!/usr/bin/perl
use strict; use warnings;
use XML::RSS::Parser;
use FileHandle;
my $parser = XML::RSS::Parser->new;
unless ( -e 'uploads.rdf' ) {
require LWP::Simple;
LWP::Simple::getstore(
'http://search.cpan.org/uploads.rdf',
'uploads.rdf',
);
}
my $fh = FileHandle->new('uploads.rdf');
my $feed = $parser->parse_file($fh);
print $feed->query('/channel/title')->text_content, "\n";
my $count = $feed->item_count;
print "# of Items: $count\n";
foreach my $i ( $feed->query('//item') ) {
print $i->query('title')->text_content, "\n";
}
Available Perl Modules
XML::RSS::Tools
XML::RSS::Parser:
#!/usr/bin/perl -w
use strict;
use XML::RSS::Parser;
use FileHandle;
my $p = XML::RSS::Parser->new;
my $fh = FileHandle->new('/path/to/some/rss/file');
my $feed = $p->parse_file($fh);
# output some values
my $feed_title = $feed->query('/channel/title');
print $feed_title->text_content;
my $count = $feed->item_count;
print " ($count)\n";
foreach my $i ( $feed->query('//item') ) {
my $node = $i->query('title');
print ' '.$node->text_content;
print "\n";
}
XML::RSS::Parser::Lite (Pure Perl):
use XML::RSS::Parser::Lite;
use LWP::Simple;
my $xml = get("http://url.to.rss");
my $rp = new XML::RSS::Parser::Lite;
$rp->parse($xml);
print join(' ', $rp->get('title'), $rp->get('url'), $rp->get('description')), "\n";
for (my $i = 0; $i < $rp->count(); $i++) {
my $it = $rp->get($i);
print join(' ', $it->get('title'), $it->get('url'), $it->get('description')), "\n";
}
dirtyRSS:
use dirtyRSS;
$tree = parse($in);
die("$tree\n") unless (ref $tree);
disptree($tree, 0);

Where can I find an array of the (un)assigned Unicode code points for a particular block?

At the moment, I'm writing these arrays by hand.
For example, the Miscellaneous Mathematical Symbols-A block has an entry in hash like this:
my %symbols = (
...
miscellaneous_mathematical_symbols_a => [(0x27C0..0x27CA), 0x27CC,
(0x27D0..0x27EF)],
...
)
The simpler, 'continuous' array
miscellaneous_mathematical_symbols_a => [0x27C0..0x27EF]
doesn't work because Unicode blocks have holes in them. For example, there's nothing at 0x27CB. Take a look at the code chart [PDF].
Writing these arrays by hand is tedious, error-prone and a bit fun. And I get the feeling that someone has already tackled this in Perl!
Perhaps you want Unicode::UCD? Use its charblock routine to get the range of any named block. If you want to get those names, you can use charblocks.
This module is really just an interface to the Unicode databases that come with Perl already, so if you have to do something fancier, you can look at the lib/5.x.y/unicore/UnicodeData.txt or the various other files in that same directory to get what you need.
Here's what I came up with to create your %symbols. I go through all the blocks (although in this sample I skip that ones without "Math" in their name. I get the starting and ending code points and check which ones are assigned. From that, I create a custom property that I can use to check if a character is in the range and assigned.
use strict;
use warnings;
digest_blocks();
my $property = 'My::InMiscellaneousMathematicalSymbolsA';
foreach ( 0x27BA..0x27F3 )
{
my $in = chr =~ m/\p{$property}/;
printf "%X is %sin $property\n",
$_, $in ? '' : ' not ';
}
sub digest_blocks {
use Unicode::UCD qw(charblocks);
my $blocks = charblocks();
foreach my $block ( keys %$blocks )
{
next unless $block =~ /Math/; # just to make the output small
my( $start, $stop ) = #{ $blocks->{$block}[0] };
$blocks->{$block} = {
assigned => [ grep { chr =~ /\A\p{Assigned}\z/ } $start .. $stop ],
unassigned => [ grep { chr !~ /\A\p{Assigned}\z/ } $start .. $stop ],
start => $start,
stop => $stop,
name => $block,
};
define_my_property( $blocks->{$block} );
}
}
sub define_my_property {
my $block = shift;
(my $subname = $block->{name}) =~ s/\W//g;
$block->{my_property} = "My::In$subname"; # needs In or Is
no strict 'refs';
my $string = join "\n", # can do ranges here too
map { sprintf "%X", $_ }
#{ $block->{assigned} };
*{"My::In$subname"} = sub { $string };
}
If I were going to do this a lot, I'd use the same thing to create a Perl source file that has the custom properties already defined so I can just use them right away in any of my work. None of the data should change until you update your Unicode data.
sub define_my_property {
my $block = shift;
(my $subname = $block->{name}) =~ s/\W//g;
$block->{my_property} = "My::In$subname"; # needs In or Is
no strict 'refs';
my $string = num2range( #{ $block->{assigned} } );
print <<"HERE";
sub My::In$subname {
return <<'CODEPOINTS';
$string
CODEPOINTS
}
HERE
}
# http://www.perlmonks.org/?node_id=87538
sub num2range {
local $_ = join ',' => sort { $a <=> $b } #_;
s/(?<!\d)(\d+)(?:,((??{$++1})))+(?!\d)/$1\t$+/g;
s/(\d+)/ sprintf "%X", $1/eg;
s/,/\n/g;
return $_;
}
That gives me output suitable for a Perl library:
sub My::InMiscellaneousMathematicalSymbolsA {
return <<'CODEPOINTS';
27C0 27CA
27CC
27D0 27EF
CODEPOINTS
}
sub My::InSupplementalMathematicalOperators {
return <<'CODEPOINTS';
2A00 2AFF
CODEPOINTS
}
sub My::InMathematicalAlphanumericSymbols {
return <<'CODEPOINTS';
1D400 1D454
1D456 1D49C
1D49E 1D49F
1D4A2
1D4A5 1D4A6
1D4A9 1D4AC
1D4AE 1D4B9
1D4BB
1D4BD 1D4C3
1D4C5 1D505
1D507 1D50A
1D50D 1D514
1D516 1D51C
1D51E 1D539
1D53B 1D53E
1D540 1D544
1D546
1D54A 1D550
1D552 1D6A5
1D6A8 1D7CB
1D7CE 1D7FF
CODEPOINTS
}
sub My::InMiscellaneousMathematicalSymbolsB {
return <<'CODEPOINTS';
2980 29FF
CODEPOINTS
}
sub My::InMathematicalOperators {
return <<'CODEPOINTS';
2200 22FF
CODEPOINTS
}
Maybe this?
my #list =
grep {chr ($_) =~ /^\p{Assigned}$/}
0x27C0..0x27EF;
#list = map { $_ = sprintf ("%X", $_ )} #list;
print "#list\n";
Gives me
27C0 27C1 27C2 27C3 27C4 27C5 27C6 27C7 27C8 27C9 27CA 27D0 27D1 27D2 27D3
27D4 27D5 27D6 27D7 27D8 27D9 27DA 27DB 27DC 27DD 27DE 27DF 27E0 27E1 27E2
27E3 27E4 27E5 27E6 27E7 27E8 27E9 27EA 27EB
I don't know why you wouldn't say miscellaneous_mathematical_symbols_a => [0x27C0..0x27EF], because that's how the Unicode standard is defined according to the PDF.
What do you mean when you say it doesn't "work"? If it's giving you some sort of error when you check the existence of the character in the block, then why not just weed them out of the block when your checker comes across an error?