For this script, I am pulling a csv file that includes what needs to be found and what the replacement is. Those values, $pattern1 and $replacement1 are then inserted into a find & replace function. Ideally this will take the csv key file & do an inplace replacement of the raw data file.
use English;
use strict;
use warnings;
sub inplace_sanitize {
my ( $datafile, $pattern1, $replacement1 ) = #_;
local #ARGV = ( $datafile ),
my $INPLACE_EDIT = '.back';
while ( <> ) {
s/\Q$pattern1/$replacement1/g;
#print;
}
}
sub main
{
# Select Key for Find & Replace
my $filename = 'stmre_fr_key.csv';
open(INPUT, $filename) or die "Cannot open $filename";
# Read the header line.
my $line = <INPUT>;
# Read the lines one by one.
while($line = <INPUT>)
{
chomp($line);
#Split & Assign
my ($replacement1, $pattern1) = split(',', $line);
# Select Data File
my $datafile = 'rawdata.csv';
#Find & Replace Data File
&inplace_sanitize( $datafile, $pattern1, $replacement1 );
}
}
close(INPUT);
main();
So this is not working, as it doesn't perform the replacement. Without the inplace_sanitizecall it prints out the $replacement1 & $pattern1 correctly. The inplace_sanitize works by itself if you define $replacement1 = 'replace'; and $pattern1 = 'find';. But together there it doesn't work. Any ideas?
Samples:
$replacement1 = '7306e005';
$pattern1 = 'leighs_satcon011016001_00753b94';
stmre_fr_key.csv:
find,replace
leighs_satcon011016001_00753b94,7306e005
leighs_satcon011016001_00753b95,7306e006
.
.
.
You're use of my $INPLACE_EDIT is your problem. You want to effect the global variable:
local $INPLACE_EDIT = '.back';
The same way you're treating #ARGV
Related
I have two files. The first has a sequence of words on each line
bus do car
car tree
The second file is an XML file
<title>i have a car. i take bus..</title>
I want to search the text file for each word in the XML file. If it is found, I want to insert all lines from the text file where it appears, with any spaces replaced by x.
The result file would be
<title>i have a car busxdoxcar carxtree. i take bus busxdoxcar..</title>
I try this
use strict;
use warnings;
use autodie;
my $QueryFile = "query.txt";
my $SequenceFile = "Seq_2_terms_150.txt";
my %hashlist;
open NewQueryFile ,">./NewQuery.txt"
or die "Cannot create NewQuery.txt";
open(my $fh,$SequenceFile)
or die "$SequenceFile : $!";
while ( <$fh> ) {
chop;
s/^\s+|\s+$//g;
my $h = \%hashlist;
foreach ( split('\s+', $_) ) {
$h->{$_} //= {};
$h = $h->{$_};
}
$h->{'#'} = 1;
}
close $fh;
open(my $fd, $QueryFile)
or die "$QueryFile : $!";
for my $xml (<$fd>) {
foreach my $line (split(/\n/, $xml)) {
my #words = split(/\s/, $line);
if $words = #hashlist[$_] {
print NewQueryFile join ('x',$words) ;
}
}
}
close NewQueryFile ;
close($fd);
I have put together a quick script to indicate how one might go about this.
I have not bothered with xml, because that may well have left me in a bad mood.
My advice would be: do use variables, whatever you save from not doing so is lost as your code gets confusing and then buggy.
#!/usr/bin/env perl
use strict;
use warnings;
# Notes:
# - more than one space or tab in a row are mangled: They become one space only
# - the query file is not checked for containing actual words to match on,
# it is assumed to be suitable
# - I have made no attempt to parse xml. You should use a parser for that.
# Search Stack Overflow or Google or CPAN or all of those for examples.
# - The replace_xml_text function can be used on the text supplied by the
# parser to get the desired output
# - a feeble attempt is made to deal with punctuation in replace_xml_text
# - This code is not really tested
my %query_words;
my $query_fn = 'query.txt';
open (my $fh, "<",$query_fn) or die "could not open file '$query_fn'";
# build list of words from query file
while ( <$fh> ){
chomp;
# Words mentioned in line.
my #words = split(/\s+/,$_);
# Words joined by 'x'. Seems a strange choice *shrug*.
# This is used to replace words with later.
my $line = join("x",#words);
# Storing in arrayref, this seems easier to me
# than concatening repeatedly and getting the spaces right.
for my $word ( #words ){
push #{$query_words{$word}}, $line;
}
}
# Expects the text to replace.
# Returns the text to replace it with.
sub replace_xml_text {
my $original_text = shift;
my #words;
for my $word ( split(/\s+/,$original_text) ){
my $punctuation = '';
# Remove punctuation before matching,
# but do preserve it.
if ( $word =~s /(\s*[,.]\s*)$// ){
$punctuation = $1;
}
if ( my $additions = $query_words{$word} ){
$word = join(" ",$word,#$additions);
}
# Put punctuation back.
$word .= $punctuation;
# Done replacing in this word, next
push #words,$word;
}
return join(" ",#words);
}
I've asked this question before how to do this with AWK but it doesn't handle it all that well.
The data has semicolons in quoted fields, which AWK doesn't take into account. So I was trying it in perl with the text::csv module so I don't have to think about that. The problem is I don't know how to output it to files based on a column value.
Short example from previous question, the data:
10002394;"""22.98""";48;New York;http://testdata.com/bla/29012827.jpg;5.95;93962094820
10025155;27.99;65;Chicago;http://testdata.com/bla/29011075.jpg;5.95;14201021349
10003062;19.99;26;San Francisco;http://testdata.com/bla/29002816.jpg;5.95;17012725049
10003122;13.0;53;"""Miami""";http://testdata.com/bla/29019899.jpg;5.95;24404000059
10029650;27.99;48;New York;http://testdata.com/bla/29003007.jpg;5.95;3692164452
10007645;20.99;65;Chicago;"""http://testdata.com/bla/28798580.jpg""";5.95;10201848233
10025825;12.99;65;Chicago;"""http://testdata.com/bla/29017837.jpg""";5.95;93962025367
The desired result:
File --> 26.csv
10003062;19.99;26;San Francisco;http://testdata.com/bla/29002816.jpg;5.95;17012725049
File --> 48.csv
10002394;22.98;48;New York;http://testdata.com/bla/29012827.jpg;5.95;93962094820
10029650;27.99;48;New York;http://testdata.com/bla/29003007.jpg;5.95;3692164452
File --> 53.csv
10003122;13.0;53;Miami;http://testdata.com/bla/29019899.jpg;5.95;24404000059
File --> 65.csv
10025155;27.99;65;Chicago;http://testdata.com/bla/29011075.jpg;5.95;14201021349
10007645;20.99;65;Chicago;http://testdata.com/bla/28798580.jpg;5.95;10201848233
10025825;12.99;65;Chicago;http://testdata.com/bla/29017837.jpg;5.95;93962025367
This is what I have so far. EDIT: Modified code:
#!/usr/bin/perl
use strict;
use warnings;
use Text::CSV_XS;
#use Data::Dumper;
use Time::Piece;
my $inputfile = shift || die "Give input and output names!\n";
open my $infile, '<', $inputfile or die "Sourcefile in use / not found :$!\n";
#binmode($infile, ":encoding(utf8)");
my $csv = Text::CSV_XS->new({binary => 1,sep_char => ";",quote_space => 0,eol => $/});
my %fh;
my %count;
my $country;
my $date = localtime->strftime('%y%m%d');
open(my $fh_report, '>', "report$date.csv");
$csv->getline($infile);
while ( my $elements = $csv->getline($infile)){
EDITED IN:
__________
next unless ($elements->[29] =~ m/testdata/);
for (#$elements){
next if ($elements =~ /apple|orange|strawberry/);
}
__________
for (#$elements){
s/\"+/\"/g;
}
my $filename = $elements->[2];
$shop = $elements->[3] .";". $elements->[2];
$count{$country}++;
$fh{$filename} ||= do {
open(my $fh, '>:encoding(UTF-8)', $filename . ".csv") or die "Could not open file '$filename'";
$fh;
};
$csv->print($fh{$filename}, $elements);
}
#print $fh_report Dumper(\%count);
foreach my $name (reverse sort { $count{$a} <=> $count{$b} or $a cmp $b } keys %count) {
print $fh_report "$name;$count{$name}\n";
}
close $fh_report;
Errors:
Can't call method "print" on an undefined value at sort_csv_delimiter.pl line 28, <$infile> line 2
I've been messing around with this but I'm totally at a loss. Can someone help me?
My guess is that you want hash of cached file handles,
my %fh;
while ( my $elements = $csv->getline( $infile ) ) {
my $filename = $elements->[2];
$fh{$filename} ||= do {
open my $fh, ">", "$filename.csv" or die $!;
$fh;
};
# $csv->combine(#$elements);
$csv->print($fh{$filename}, $elements);
}
I don't see an instance of your stated problem -- occurrences of the semicolon separator character ; within quoted fields -- but you are correct that Text::CSV will handle it correctly.
This short program reads your example data from the DATA file handle and prints the result to STDOUT. I presume you know how to read from or write to different files if you wish.
use strict;
use warnings;
use Text::CSV;
my $csv = Text::CSV->new({ sep_char => ';', eol => $/ });
my #data;
while ( my $row = $csv->getline(\*DATA) ) {
push #data, $row;
}
my $file;
for my $row ( sort { $a->[2] <=> $b->[2] or $a->[0] <=> $b->[0] } #data ) {
unless (defined $file and $file == $row->[2]) {
$file = $row->[2];
printf "\nFile --> %d.csv\n", $file;
}
$csv->print(\*STDOUT, $row);
}
__DATA__
10002394;22.98;48;http://testdata.com/bla/29012827.jpg;5.95;93962094820
10025155;27.99;65;http://testdata.com/bla/29011075.jpg;5.95;14201021349
10003062;19.99;26;http://testdata.com/bla/29002816.jpg;5.95;17012725049
10003122;13.0;53;http://testdata.com/bla/29019899.jpg;5.95;24404000059
10029650;27.99;48;http://testdata.com/bla/29003007.jpg;5.95;3692164452
10007645;20.99;65;http://testdata.com/bla/28798580.jpg;5.95;10201848233
10025825;12.99;65;http://testdata.com/bla/29017837.jpg;5.95;93962025367
output
File --> 26.csv
10003062;19.99;26;http://testdata.com/bla/29002816.jpg;5.95;17012725049
File --> 48.csv
10002394;22.98;48;http://testdata.com/bla/29012827.jpg;5.95;93962094820
10029650;27.99;48;http://testdata.com/bla/29003007.jpg;5.95;3692164452
File --> 53.csv
10003122;13.0;53;http://testdata.com/bla/29019899.jpg;5.95;24404000059
File --> 65.csv
10007645;20.99;65;http://testdata.com/bla/28798580.jpg;5.95;"10201848233 "
10025155;27.99;65;http://testdata.com/bla/29011075.jpg;5.95;14201021349
10025825;12.99;65;http://testdata.com/bla/29017837.jpg;5.95;93962025367
Update
I have just realised that your "desired result" isn't the output that you expect to see, but rather the way separate records are written to different files. This program solves that.
It looks from your question as though you want the data sorted in order of the first field as well, and so I have read all of the file into memory and printed a sorted version to the relevant files. I have also used autodie to avoid having to code status checks for all the IO operations.
use strict;
use warnings;
use autodie;
use Text::CSV;
my $csv = Text::CSV->new({ sep_char => ';', eol => $/ });
my #data;
while ( my $row = $csv->getline(\*DATA) ) {
push #data, $row;
}
my ($file, $fh);
for my $row ( sort { $a->[2] <=> $b->[2] or $a->[0] <=> $b->[0] } #data ) {
unless (defined $file and $file == $row->[2]) {
$file = $row->[2];
open $fh, '>', "$file.csv";
}
$csv->print($fh, $row);
}
close $fh;
__DATA__
10002394;22.98;48;http://testdata.com/bla/29012827.jpg;5.95;93962094820
10025155;27.99;65;http://testdata.com/bla/29011075.jpg;5.95;14201021349
10003062;19.99;26;http://testdata.com/bla/29002816.jpg;5.95;17012725049
10003122;13.0;53;http://testdata.com/bla/29019899.jpg;5.95;24404000059
10029650;27.99;48;http://testdata.com/bla/29003007.jpg;5.95;3692164452
10007645;20.99;65;http://testdata.com/bla/28798580.jpg;5.95;10201848233
10025825;12.99;65;http://testdata.com/bla/29017837.jpg;5.95;93962025367
FWIW I have done this using Awk (gawk):
awk --assign col=2 'BEGIN { if(!(col ~/^[1-9]/)) exit 2; outname = "part-%s.txt"; } !/^#/ { out = sprintf(outname, $col); print > out; }' bigfile.txt
other_process data | awk --assign col=2 'BEGIN { if(!(col ~/^[1-9]/)) exit 2; outname = "part-%s.txt"; } !/^#/ { out = sprintf(outname, $col); print > out; }'
Let me explain the awk script:
BEGIN { # execution block before reading any file (once)
if(!(col ~/^[1-9]/)) exit 2; # assert the `col` variable is a positive number
outname = "part-%s.txt"; # formatting string of the output file names
}
!/^#/ { # only process lines not starting with '#' (header/comments in various data files)
out = sprintf(outname, $col); # format the output file name, given the value in column `col`
print > out; # put the line to that file
}
If you like you can add a variable to specify a custom filename or use the current filename (or STDIN) as prefix:
NR == 1 { # at the first file (not BEGIN, as we might need FILENAME)
if(!(col ~/^[1-9]/)) exit 2; # assert the `col` variable is a positive number
if(!outname) outname = (FILENAME == "-" ? "STDIN" : FILENAME); # if `outname` variable was not provided (with `-v/--assign`), use current filename or STDIN
if(!(outname ~ /%s/)) outname = outname ".%s"; # if `outname` is not a formatting string - containing %s - append it
}
!/^#/ { # only process lines not starting with '#' (header/comments in various data files)
out = sprintf(outname, $col); # format the output file name, given the value in column `col`
print > out; # put the line to that file
}
Note: if you provide multiple input files, only the first file's name will be used as output prefix. To support multiple input files and multiple prefixes, you can use FNR == 1 instead and add another variable to distinguish between user-provided outname and the auto-generated one.
I’m a beginner in perl and I’m trying to compare two files with perl. One contains a list of id’s the other one has strings which contain id’s and more text. I want to copy the lines with matching id’s to a third file, but instead of the correct strings I only get a number. What have I done wrong?
use strict;
use warnings;
open ( IDS , "<id.txt");
my #ids = <IDS>;
chomp(#ids);
close IDS;
my $id = #ids;
open ( META , "<meta.txt");
my #metas = <META>;
chomp(#metas);
my $meta = #metas;
open ( OUT1, ">>", "outtest.txt");
foreach $id (#metas){
print OUT1 "$meta"."\n";
}
close OUT1;
close META;
Try With hash variables to get the output:
use strict;
use warnings;
open ( META , "<meta.txt");
my %idsValues = (); #Create one new HASH Variable
while(<META>)
{
my $line = $_;
if($line=~m{<id>(\d+)</id>\s*<string>([^<>]*)</string>})
{
$idsValues{$1} = $2; #Store the values and text into the HASH Variable
}
}
close(META); #Close the opened file
my #Values;
open ( IDS , "<id.txt");
while(<IDS>)
{
my $line = $_;
if($line=~m/<id>(\d+)<\/id>/i)
{
#Check if the value presents in the file and push them into ARRAY Variable.
push(#Values, "IDS: $1\tVALUES: $idsValues{$1}") if(defined $idsValues{$1} );
}
}
close(IDS); #Close the opened file
open ( OUT1, ">>", "outtest.txt");
print OUT1 join "\n", #Values; #Join with newline and Print the output line in the output file.
close OUT1; #Close the opened file
I have been learning perl for the past two weeks. I have been writing some perl scripts for my school project. I need to parse a text file for multiple strings. I searched perl forums and got some information.The below function parses a text file for one string and returns a result. However I need the script to search the file for multiple strings.
use strict;
use warnings;
sub find_string {
my ($file, $string) = #_;
open my $fh, '<', $file;
while (<$fh>) {
return 1 if /\Q$string/;
}
die "Unable to find string: $string";
}
find_string('filename', 'string');
Now for instance if the file contains multiple strings with regular expressions as listed below
"testing"
http://www.yahoo.com =1
http://www.google.com=2
I want the function to search for multiple strings like
find_string('filename', 'string1','string2','string3');
Please can somebody explain me how i need to do that.It would be really helpful
Going through this very quickly here:
You right now pass the name of a file, and one string. What if you pass multiple strings:
if ( find_string ( $file, #strings ) ) {
print "Found a string!\n";
}
else {
print "No string found\n";
}
..
sub find_string {
my $file = shift;
my #strings = #_;
#
# Let's make the strings into a regular expression
#
my $reg_exp = join "|" ,#strings; # Regex is $string1|$string2|$string3...
open my $fh, "<", $file or die qq(Can't open file...);
while ( my $line = <$fh> ) {
chomp $line;
if ( $line =~ $reg_exp ) {
return 1; # Found the string
}
}
return 0; # String not found
}
I am about to go into a meeting, so I haven't really even tested this, but the idea is there. A few things:
You want to handle characters in your strings that could be regular expression characters. You can use either the quotemeta command, or use \Q and \E before and after each string.
Think about using use autodie to handle files that can't be open. Then, you don't have to check your open statement (like I did above).
There are limitations. This would be awful if you were searching for 1,000 different strings, but should be okay with a few.
Note how I use a scalar file handle ($fh). Instead of opening your file via the subroutine, I would pass in a scalar file handle. This would allow you to take care of an invalid file issue in your main program. That's the big advantage of scalar file handles: They can be easily passed to subroutines and stored in class objects.
Tested Program
#! /usr/bin/env perl
#
use strict;
use warnings;
use autodie;
use feature qw(say);
use constant {
INPUT_FILE => 'test.txt',
};
open my $fh, "<", INPUT_FILE;
my #strings = qw(foo fo+*o bar fubar);
if ( find_string ( $fh, #strings ) ) {
print "Found a string!\n";
}
else {
print "No string found\n";
}
sub find_string {
my $fh = shift; # The file handle
my #strings = #_; # A list of strings to look for
#
# We need to go through each string to make sure there's
# no special re characters
for my $string ( #strings ) {
$string = quotemeta $string;
}
#
# Let's join the stings into one big regular expression
#
my $reg_exp = join '|', #strings; # Regex is $string1|$string2|$string3...
$reg_exp = qr($reg_exp); # This is now a regular expression
while ( my $line = <$fh> ) {
chomp $line;
if ( $line =~ $reg_exp ) {
return 1; # Found the string
}
}
return 0; # String not found
}
autodie handles issues when I can't open a file. No need to check for it.
Notice I have three parameters in my open. This is the preferred way.
My file handle is $fh which allows me to pass it to my find_string subroutine. Open the file in the main program, and I can handle read errors there.
I loop through my #strings and use the quotemeta command to automatically escape special regular expression characters.
Note that when I change $string in my loop, it actually modifies the #strings array.
I use qr to create a regular expression.
My regular expression is /foo|fo\+\*o|bar|fubar/.
There are a few bugs For example, the string fooburberry will match with foo. Do you want that, or do you want your strings to be whole words?
I'm happy to see use strict and use warnings in your script. Here is one basic way to do it.
use strict;
use warnings;
sub find_string {
my ($file, $string1, $string2, $string3) = #_;
my $found1 = 0;
my $found2 = 0;
my $found3 = 0;
open my $fh, '<', $file;
while (<$fh>) {
if ( /$string1/ ) {
$found1 = 1;
}
if ( /$string2/ ) {
$found2 = 1;
}
if ( /$string3/ ) {
$found3 = 1;
}
}
if ( $found1 == 1 and $found2 == 1 and $found3 == 1 ) {
return 1;
} else {
return 0;
}
}
my $result = find_string('filename', 'string1'. 'string2', 'string3');
if ( $result == 1 ) {
print "Found all three strings\n";
} else {
print "Didn't find all three\n";
}
I think you can store the file content in an array first, then grep the input in the array.
use strict;
use warnings;
sub find_multi_string {
my ($file, #strings) = #_;
my $fh;
open ($fh, "<$file");
#store the whole file in an array
my #array = <$fh>;
for my $string (#strings) {
if (grep /$string/, #array) {
next;
} else {
die "Cannot find $string in $file";
}
}
return 1;
}
I have a list of words and I want to group them into different groups depending on whether they are verbs/adjectives/nouns/etc. So, basically I am looking for a Perl module which tells whether a word is verb/noun etc.
I googled but couldn't find what I was looking for. Thanks.
Lingua::EN::Tagger, Lingua::EN::Semtags::Engine, Lingua::EN::NamedEntity
See the Lingua::EN:: namespace in CPAN. Specifically, Link Grammar and perhaps Lingua::EN::Tagger can help you. Also WordNet provides that kind of information and you can query it using this perl module.
follow code perl help you to find all this thing in your text file in your folder only give the path of directory and it will process all file at once and save result in report.txt file strong text
#!/usr/local/bin/perl
# for loop execution
# Perl Program to calculate Factorial
sub fact
{
# Retriving the first argument
# passed with function calling
my $x = $_[0];
my #names = #{$_[1]};
my $length = $_[2];
# checking if that value is 0 or 1
if ($x < $length)
{
#print #names[$x],"\n";
use Lingua::EN::Fathom;
my $text = Lingua::EN::Fathom->new();
# Analyse contents of a text file
$dirlocation="./2015/";
$path =$dirlocation.$names[$x];
$text->analyse_file($path); # Analyse contents of a text file
$accumulate = 1;
# Analyse contents of a text string
$text->analyse_block($text_string,$accumulate);
# TO Do, remove repetition
$num_chars = $text->num_chars;
$num_words = $text->num_words;
$percent_complex_words = $text->percent_complex_words;
$num_sentences = $text->num_sentences;
$num_text_lines = $text->num_text_lines;
$num_blank_lines = $text->num_blank_lines;
$num_paragraphs = $text->num_paragraphs;
$syllables_per_word = $text->syllables_per_word;
$words_per_sentence = $text->words_per_sentence;
# comment needed
%words = $text->unique_words;
foreach $word ( sort keys %words )
{
# print("$words{$word} :$word\n");
}
$fog = $text->fog;
$flesch = $text->flesch;
$kincaid = $text->kincaid;
use strict;
use warnings;
use 5.010;
my $filename = 'report.txt';
open(my $fh, '>>', $filename) or die "Could not open file '$filename' $!";
say $fh $text->report;
close $fh;
say 'done';
print($text->report);
$x = $x+1;
fact($x,\#names,$length);
}
# Recursively calling function with the next value
# which is one less than current one
else
{
done();
}
}
# Driver Code
$a = 0;
#names = ("John Paul", "Lisa", "Kumar","touqeer");
opendir DIR1, "./2015" or die "cannot open dir: $!";
my #default_files= grep { ! /^\.\.?$/ } readdir DIR1;
$length = scalar #default_files;
print $length;
# Function call and printing result after return
fact($a,\#default_files,$length);
sub done
{
print "Done!";
}