This simple Perl script is translating stories from a database into XML, but this one section is giving me problems. The function makeUrl is called for each story, but needs to ensure that duplicate URLs aren't created.
my #headlines = ();
my $hlCount = 1;
.
.
.
sub makeUrl {
my $headline;
open( URLSOUT, '>>/var/mtkoan/harris/urls' );
$url = $_[0];
print URLSOUT "Before: $url\n";
$url =~ s/\x{90}//g;
$url =~ s/\s+$//g;
$url =~ s/^\s+//g;
$url =~ s/\s/_/g;
$url =~ s/\W//g;
push #headlines, $url;
foreach $headline (#headlines) {
if( $headline eq $url ) {
$url .= "_$hlCount";
$hlCount++;
}
}
print URLSOUT "After: $url\n\n";
print URLSOUT "Headline Array Dump:\n";
print URLSOUT "#headlines\n";
close URLSOUT;
}
When the array is printed, only the last value is shown. Push isn't appending to the end of the array, I can't figure it out!
You can check for uniqueness (and remove duplicates from a list) in two main ways:
With a hash:
my %urls;
# construct your URL in the function...
$urls{$url}++;
# get all the (unique) URLs:
my #urls = keys %urls;
With a library call that returns the unique values in a list (see List::MoreUtils):
use List::MoreUtils 'uniq`;
#urls = uniq #urls;
Related
I want to pair two array and add char '/' between them. Let say, two arrays are like below
#array1 = (FileA .. FileZ);
#array2 = (FileA.txt .. FileZ.txt);
The output that I want is like below
../../../experiment/fileA/fileA.txt
.
.
../../../experiment/fileZ/fileZ.txt
here is my code
my #input_name = input();
my $dirname = "../../../experiment/";
# CREATE FOLDER PATH
my #fileDir;
foreach my $input_name (#input_name){
chomp $input_name;
$_ = $dirname . $input_name;
push #fileDir, $_;
}
# CREATE FILE NAME
my #filename;
my $extension = '.txt';
foreach my $input_name (#input_name){
chomp $input_name;
$_ = $input_name . $extension;
push #filename, $_;
}
The code that I'd try is like below. But it seem doesn't work
#CREATE FULL PATH
foreach my $test_path (#test_path){
foreach my $testname (#testname){
my $test = map "$test_path[$_]/$testname[$_]", 0..$#test_path;
push #file, $test;
}
}
print #file;
I assume input() returns something like ('fileA', 'fileB').
The problem with your code is the nested loop here:
foreach my $test_path (#test_path){
foreach my $testname (#testname){
This combines every $test_path with every possible $testname. You don't want that. Also, it doesn't make much sense to assign the result of map to a scalar: All you'll get is the number of elements in the list created by map.
(Also, you have random chomp calls sprinkled throughout your code. None of those should be there.)
You only need a single array and a single loop:
use strict;
use warnings;
sub input {
return ('fileA', 'fileB');
}
my #input = input();
my $dirname = '../../../experiment';
my #files = map "$dirname/$_/$_.txt", #input;
for my $file (#files) {
print "got $file\n";
}
Here the loop is hidden in the map ..., #input call. If you want to write it as a for loop, it would look like this:
my #files;
for my $input (#input) {
push #files, "$dirname/$input/$input.txt";
}
The problem is your algorithm. You're iterating all filenames and all dirnames at the same time.
I mean, your code says "For every directory, create every file".
Try something along the lines of this and you'll be fine:
# WRITE TESTFILE
foreach my $filename (#filename){
chomp $filename;
if ( -e "$filename/$filename" and -d "$filename/$filename" ){
print "File already exists\n";
}
else {
open ( TXT_FILE, ">$filename/$filename" );
print TXT_FILE "Hello World";
close TXT_FILE;
}
}
my $url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=journal+of+medical+virology[journal]+AND+2014[Date+-+Publication]:3000[Date+-+Publication]&usehistory=y";
print "\n before url \n";
print $url;
#post the esearch URL
my $output = get($url);
print $output;
I have not used perl ever before.
If I hit this URL in browser, I do get the XML.
However, From what I see in output from script, $output is empty and
print $output;
returns
Use of uninitialized value in print at ./extractEmails.pl line 48.
Please suggest what's wrong and how to fix it
Edit:
As suggested, complete code:
#!/usr/bin/perl -w
# A perlscript written by Joseph Hughes, University of Glasgow
# use this perl script to parse the email addressed from the affiliations in PubMed
use strict;
use LWP::Simple;
my ($query,#queries);
#Query the Journal of Virology from 2014 until the present (use 3000)
$query = 'journal+of+virology[journal]+AND+2014[Date+-+Publication]:3000[Date+-+Publication]';
push(#queries,$query);
#Journal of General Virology
$query = 'journal+of+general+virology[journal]+AND+2014[Date+-+Publication]:3000[Date+-+Publication]';
push(#queries,$query);
#Virology
$query = 'virology[journal]+AND+2014[Date+-+Publication]:3000[Date+-+Publication]';
push(#queries,$query);
#Archives of Virology
$query = 'archives+of+virology[journal]+AND+2014[Date+-+Publication]:3000[Date+-+Publication]';
push(#queries,$query);
#Virus Research
$query = 'virus+research[journal]+AND+2014[Date+-+Publication]:3000[Date+-+Publication]';
push(#queries,$query);
#Antiviral Research
$query = 'antiviral+research[journal]+AND+2014[Date+-+Publication]:3000[Date+-+Publication]';
push(#queries,$query);
#Viruses
$query = 'viruses[journal]+AND+2014[Date+-+Publication]:3000[Date+-+Publication]';
push(#queries,$query);
#Journal of Medical Virology
$query = 'journal+of+medical+virology[journal]+AND+2014[Date+-+Publication]:3000[Date+-+Publication]';
# global variables
push(#queries,$query);
my %emails;
my $emailcnt=0;
my $count=1;
#assemble the esearch URL
foreach my $query (#queries){
my $base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/';
#my $url = $base . "esearch.fcgi?db=pubmed&term=$query&usehistory=y";
my $url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=journal+of+medical+virology[journal]+AND+2014[Date+-+Publication]:3000[Date+-+Publication]&usehistory=y";
print "\n before url \n";
print $url;
#post the esearch URL
my $output = get($url);
print "\n before output \n";
print get($url);
print $output;
#parse WebEnv, QueryKey and Count (# records retrieved)
my $web = $1 if ($output =~ /<WebEnv>(\S+)<\/WebEnv>/);
my $key = $1 if ($output =~ /<QueryKey>(\d+)<\/QueryKey>/);
my $count = $1 if ($output =~ /<Count>(\d+)<\/Count>/);
#retrieve data in batches of 500
my $retmax = 500;
for (my $retstart = 0; $retstart < $count; $retstart += $retmax) {
my $efetch_url = $base ."efetch.fcgi?db=pubmed&WebEnv=$web";
$efetch_url .= "&query_key=$key&retmode=xml";
my $efetch_out = get($efetch_url);
my #matches = $efetch_out =~ m(<Affiliation>(.*)</Affiliation>)g;
#print "$_\n" for #matches;
for my $match (#matches){
if ($match=~/\s([a-zA-Z0-9\.\_\-]+\#[a-zA-Z0-9\.\_\-]+)$/){
my $email=$1;
$email=~s/\.$//;
$emails{$email}++;
}
}
}
my $cnt= keys %emails;
print "$query\n$cnt\n";
}
print "Total number of emails: ";
my $cnt= keys %emails;
print "$cnt\n";
my #email = keys %emails;
my #VAR;
push #VAR, [ splice #email, 0, 100 ] while #email;
my $batch=100;
foreach my $VAR (#VAR){
open(OUT, ">Set_$batch\.txt") || die "Can't open file!\n";
print OUT join(",",#$VAR);
close OUT;
$batch=$batch+100;
}
I recommend against using LWP::Simple for any reason because it is impossible to configure it or handle errors usefully. Using LWP::UserAgent which it wraps is nearly as simple anyway (though the error handling is a bit complicated). The below examples would replace the use LWP::Simple; and my $output = get($url); lines.
use strict;
use warnings;
use LWP::UserAgent;
my $ua = LWP::UserAgent->new(timeout => 30);
my $response = $ua->get($url);
unless ($response->is_success) {
# the Client-Warning, Client-Aborted, and X-Died headers each may be set on client/transport errors
die $response->status_line;
}
my $output = $response->decoded_content;
The core HTTP::Tiny is also simple.
use strict;
use warnings;
use HTTP::Tiny;
my $ua = HTTP::Tiny->new;
my $response = $ua->get($url);
unless ($response->{success}) {
die $response->{status} == 599 ? $response->{content} : "$response->{status} $response->{reason}";
}
my $output = $response->{content};
If you really want an LWP::Simple approach that will at least report transport errors, try ojo from Mojolicious:
perl -Mojo -E'say g(shift)->text' http://example.com
In a script rather than a oneliner, you can use Mojo::UserAgent directly, and also handle HTTP errors like above:
use strict;
use warnings;
use Mojo::UserAgent;
my $ua = Mojo::UserAgent->new;
my $response = $ua->get($url)->result;
unless ($response->is_success) {
die $response->code . ' ' . $response->message;
}
my $output = $response->text;
Here is the code, I know it is not perfect perl. If you have insight on how I an do better let me know. My main question is how would I print out the arrays without using Data::Dumper?
#!/usr/bin/perl
use Data::Dumper qw(Dumper);
use strict;
use warnings;
open(MYFILE, "<", "move_headers.txt") or die "ERROR: $!";
#First split the list of files and the headers apart
my #files;
my #headers;
my #file_list = <MYFILE>;
foreach my $source_parts (#file_list) {
chomp($source_parts);
my #parts = split(/:/, $source_parts);
unshift(#files, $parts[0]);
unshift(#headers, $parts[1]);
}
# Next get a list of unique headers
my #unique_files;
foreach my $item (#files) {
my $found = 0;
foreach my $i (#unique_files) {
if ($i eq $item) {
$found = 1;
last;
}
}
if (!$found) {
unshift #unique_files, $item;
}
}
#unique_files = sort(#unique_files);
# Now collect the headers is a list per file
my %hash_table;
for (my $i = 0; $i < #files; $i++) {
unshift #{ $hash_table{"$files[$i]"} }, "$headers[$i]";
}
# Process the list with regex
while ((my $key, my $value) = each %hash_table) {
if (ref($value) eq "ARRAY") {
print "$value", "\n";
}
}
The Perl documentation has a tutorial on "Printing of a HASH OF ARRAYS" (without using Data::Dumper)
perldoc perldsc
You're doing a couple things the hard way. First, a hash will already uniqify its keys, so you don't need the loop that does that. It appears that you're building a hash of files, with the values meant to be the headers found in those files. The input data is "filename:header", one per line. (You could use a hash of hashes, since the headers may need uniquifying, but let's let that go for now.)
use strict;
use warnings;
open my $files_and_headers, "<", "move_headers.txt" or die "Can't open move_headers: $!\n";
my %headers_for_file;
while (defined(my $line = <$files_and_headers> )) {
chomp $line;
my($file, $header) = split /:/, $line, 2;
push #{ $headers_for_file{$file} }, $header;
}
# Print the arrays for each file:
foreach my $file (keys %headers_for_file) {
print "$file: #{ $headers_for_file{$file}}\n";
}
We're letting Perl do a chunk of the work here:
If we add keys to a hash, they're always unique.
If we interpolate an array into a print statement, Perl adds spaces between them.
If we push onto an empty hash element, Perl automatically puts an empty anonymous array in the element and then pushes onto that.
An alternative to using Data::Dumper is to use Data::Printer:
use Data::Printer;
p $value;
You can also use this to customise the format of the output. E.g. you can have it all in a single line without the indexes (see the documentation for more options):
use Data::Printer {
index => 0,
multiline => 0,
};
p $value;
Also, as a suggestion for getting unique files, put the elements into a a hash:
my %unique;
#unique{ #files } = #files;
my #unique_files = sort keys %unique;
Actually, you could even skip that step and put everything into %hash_table in one pass:
my %hash_table;
foreach my $source_parts (#file_list) {
chomp($source_parts);
my #parts = split(/:/, $source_parts);
unshift #{ $hash_table{$parts[0]} }, $parts[1];
}
I am new to perl and can't seem to find why this snippet is giving me a 500 error.
#!/usr/bin/perl
use strict;
use warnings;
use CGI::Carp qw( fatalsToBrowser );
my ($distance, $weight, $total_gas, $mph, $buffer, $pair, #pairs, $value, $form, $name);
our %FORM = ();
read(STDIN, $buffer, $ENV{'CONTENT_LENGTH'});
#pairs = split(/&/, $buffer);
foreach $pair (#pairs) {
($name, $value) = split(/=/, $pair);
$value =~ tr/+/ /;
$value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
$FORM{$name} = $value;
}
Everything I tried on the %FORM = (); gives me variable declaration errors.
Are you certain that #pairs contains the values you expect (ie that they are name value pairs split with "="? More than likely $name isn't defined and you can't add an undefined key pair to a hash. Why are you using STDIN to read in values from the query string? Try:
my $q = CGI->new;
my #keys = $q->param;
my %FORM;
foreach my $name (#keys)
{
my $value = $q->param($name);
$FORM{$name} = $value;
}
or
my $q = CGI->new;
my %FORM = $q->Vars;
http://perldoc.perl.org/CGI.html
I think you're missing HTTP header. Try adding to put following line before any print:
print "Content-type: text/html\n\n";
Make sure you have enough permissions for script to run. It'll depend on OS you're using.
Also you'd consider using CGI module as mentioned in scrappedcola's answer. This code will work for both POST and GET:
use strict; use warnings;
use CGI;
my $form = CGI->Vars;
print "Content-type: text/html\n\n";
print "name=".$form->{name};
input text file contain the following:
....
ponies B-pro
were I-pro
used I-pro
A O
report O
of O
indirect B-cd
were O
. O
...
output XML file
<sen>
<base id="pro">
<w id="1">ponies</w>
<w id="2">were</w>
<w id="3">were</w>
</base>A report of
<base id="cd">indirect</base> were
</sen>
i want to make an XML file by reading the text file, B- means the begining of my tag and I- means an include words inside the tag while "O" means outside the base tag which means it only exist in the tag.
i try the following codes:
#!/usr/local/bin/perl -w
open(my $f, "input.txt") or die "Can't";
open(my $o, ">output.xml") or die "Can't";
my $c;
sub read_line {
my $fh = shift;
if ($fh and my $line = <$fh>) {
chomp($line);
my #words = split(/\t/, $line);
my $word = $words[0];
my $group = $words[1];
if($word eq "."){
return;
}
else{
if($group ne 'O'){
my #b = split(/\-/, $group);
if($b[0] eq 'B'){
my $e = "<e id=\"";
$e .= " . $b[1] . "\">";
$e .= $word . "</e>";
return $e;
}
if($b[0] eq 'I'){
my $w = "<w id=\"";
$w .= $c . "\">";
$w .= $word . "</w>";
$c++;
return $w;
}
}
else{
$c = 2;
return $word;
}
}
}
return;
}
sub get_text(){
my $txt = "";
my $r = read_line($f);
while($r){
if($r =~ m/[[:punct:]]/){
chop($txt);
$txt .= " " . $r . " ";
}
else{
$txt .= $r . " ";
}
$r = read_line($f);
}
chop($txt);
return "<sen>" . $txt . ".</sen>";
}
instead im getting as output:
<sen>
<base id="pro"> ponies </base>
<w id="2">were</w>
<w id="3">were</w>
A report of
<base id="cd">indirect</base> were
</sen>
i really need help.
Thanks
Writing XML "by hand" will only get you in trouble. Use a module from CPAN.
In your case, I would first put the data in a proper Perl data structure (maybe a hash containing some arrays, or something similar) and then using a module (i.e. XML::Simple for starters) to output to a file.
As Javs said, you want to use a module rather than do this by hand. For your purposes, since you have mixed content, I recommend XML::LibXML. Here is an example I made to test that you can indeed to mixed content like you've got:
use XML::LibXML;
my $doc = XML::LibXML::Document->new();
my $root = $doc->createElement('html');
$doc->setDocumentElement($root);
my $body = $doc->createElement('body');
$root->appendChild($body);
my $link = $doc->createElement('a');
$link->setAttribute('href', 'http://google.com');
$link->appendText('Google');
$body->appendChild($link);
$body->appendText('Inline Text');
print $doc->toString;