I am writing a Perl program to convert my local language ASCII characters to Unicode characters (Tamil).
This is my program
#!/bin/perl
use strict;
use warnings;
use open ':std';
use open ':encoding(UTF-8)';
use Encode qw( encode decode );
use Data::Dump qw(dump);
use Getopt::Long qw(GetOptions);
Getopt::Long::Configure qw(gnu_getopt);
my $font;
my %map;
GetOptions(
'font|f=s' => \$font,
'help|h' => \&usage,
) or die "Try $0 -h for help";
print "Do you want to map $font? (y/n)";
chomp( my $answer = lc <STDIN> );
$font = lc( $font );
$font =~ s/ /_/;
$font =~ s/(.*?)\.ttf/$1/;
if ( $answer eq "y" ) {
map_font();
}
else {
restore_map();
}
foreach ( #ARGV ) {
my $modfile = "$_";
$modfile =~ s/.*\/(.*)/uni$1/;
process_file( $_, $modfile );
}
sub process_file {
my #options = #_;
open my $source, '<', "$options[0]";
my $result = $options[1];
my $test = "./text";
my $missingchar = join( "|", map( quotemeta, sort { length $b <=> length $a } keys %map ) );
while ( <$source> ) {
$/ = undef;
s/h;/u;/g; #Might need change based on the tamil font
s/N(.)/$1N/g; #Might need change based on the tamil font
s/n(.)/$1n/g; #Might need change based on the font
s/($missingchar)/$map{$1}/g;
print "$_";
open my $final, '>:utf8', "$result";
print $final "$_";
close $final;
}
}
sub map_font {
my #oddhexes = qw/0B95 0B99 0B9A 0B9E 0B9F 0BA3 0BA4 0BA8 0BAA 0BAE 0BAF 0BB0 0BB2 0BB5 0BB3 0BB4 0BB1 0BA9/;
my #missingletters = qw/0BC1 0BC2/;
my #rest = qw/0B85 0B86 0B87 0B88 0B89 0B8A 0B8E 0B8F 0B90 0B92 0B93 0B83 0BBE 0BBF 0BC0 0BC6 0BC7 0BC8 0BCD 0B9C 0BB7 0BB8 0BB9 0BCB 0BCA 0BCC/;
foreach ( #oddhexes ) {
my $oddhex = $_;
$_ = encode( 'utf8', chr( hex( $_ ) ) );
print "Press the key for $_ :";
chomp( my $bole = <STDIN> );
if ( $bole eq "" ) {
next;
}
$map{$bole} = $_;
foreach ( #missingletters ) {
my $oddchar = encode( 'utf8', chr( hex( $oddhex ) ) . chr( hex( $_ ) ) );
print "Press the key for $oddchar :";
chomp( my $missingchar = <STDIN> );
if ( $missingchar eq "" ) {
next
}
$map{$missingchar} = $oddchar;
}
}
foreach ( #rest ) {
$_ = encode( 'utf8', chr( hex( $_ ) ) );
print "Press the key for $_ :";
chomp( my $misc = <STDIN> );
if ( $misc eq "" ) {
next
}
$map{$misc} = $_;
}
open my $OUTPUT, '>', $font || die "can't open file";
print $OUTPUT dump( \%map );
close $OUTPUT;
}
sub restore_map {
open my $in, '<', "$font" || die "can't open file: $!";
{
local $/;
%map = %{ eval <$in> };
}
close $in;
}
sub usage {
print "\nUsage: $0 [options] {file1.txt file2.txt..} \neg: $0 -f TamilBible.ttf chapter.txt\n\nOptions:\n -f --font - used to pass font name\n -h --help - Prints help\n\nManual mapping of font is essential for using this program\n";
exit;
}
In subroutine process_file, output of print "$_"; displays proper Tamil Unicode characters in the terminal.
However the output to the file handle $final is very different.
The %map is here.
Why are the outputs different?
How can I correct this behaviour?
I have seen this question but this is not the same. In my case the terminal displays the result correctly while the filehandle output is different.
Your open statement
open my $final, '>:utf8', "$result";
sets your file handle to expect characters, and to encode into UTF-8 sequences then on the way out. But you are sending it pre-encoded byte sequences from the %map hash, which causes those bytes to be treated as character and encoded again by Perl IO
In contrast, your terminal is set to expect UTF-8-encoded data, but STDOUT isn't set to do any encoding at all (use open ':std' has no effect on its own, see below) so it passes your UTF-8-encoded bytes through unchanged which happens to be what the terminal expects
By the way, you have set a default open mode of :encoding(UTF-8) for input and output streams with
use open ':encoding(UTF-8)'
but have overridden it in your call to open. The :utf8 mode does a very basic translation from wide characters to byte sequences, but :encoding(UTF-8) is far more useful because it checks that each character being printed is a valid Unicode value. There is a good chance that it would have caught a mistake like this, and it would have been better to allow the default and write just
open my $final, '>', $result;
To keep things clean and tidy, your program should work in characters, and the file handles should be set to encode those characters to UTF-8 when those characters are printed
You can set UTF-8 as the default encoding for all newly-opened file handles as well as STDIN and STDOUT by adding
use open qw/ :std :encoding(utf-8) /;
to the top of your program (:encoding(utf-8) is preferable to :utf8) and remove all calls to encode. You had it almost right, but the :std and :encoding(utf-8) need to be in the same use statement
You should also add
use utf8;
at the very top so that you can use UTF-8 characters in the program itself
You also have a few incidental errors. For instance
In the statement
open my $in, '<', "$font" || die "can't open file: $!";
it is almost always wrong to quote a single scalar variable like $font unless it happens to be an object and you want to invoke the stringification method
You need or instead of ||, otherwise you're just testing the truth of $font
If I asked you what a variable called $in might contain I think you'd be hesitant; $in_fh is better and is a common idiom
It's always nice to put the name of the file into the die string as well as the reason from $!
Taking all of those into account makes your statement look like this
open my $in_fh, '<', $font or die qq{Unable to open "$font" for input: $!};
You should be consistent between upper and lower case scalar variables, and lower case is the correct choice. So
open my $OUTPUT, '>', $font || die "can't open file";
should be something like
open my $out_fh, '>', $font or die qq{Unable to open "$font" for output: $!};
The line
$/ = undef;
should be local $/ as you have used elsewhere, otherwise you are permanently modifying the input record separator for the rest of your program and modules. It also appears after the first read from the file handle, so your program will read and process one line, and then the whole of the rest of the file in the next iteration of the while loop
Related
I'm trying to build a primary key into a new file from an original File which has the following structure (tbl_20180615.txt):
573103150033,0664,54,MSS02VEN*',INT,zxzc,,,,,
573103150033,0665,54,MSS02VEN,INT,zxzc,,,,,
573103150080,0659,29,MSS05ARA',INT,zxzc,,,,,
573103150080,0660,29,MSS05ARA ,INT,zxzc,,,,,
573103154377,1240,72,MSSTRI01,INT,zxzc,,,,,
573103154377,1240,72,MSSTRI01,INT,zxzc,,,,,
I launch my perl Verify.pl then I send the arguments, the first one is the number of columns to build the primary key in the new file, after I have to send the name of file (original file).
(Verify.pl)
#!/usr/bin/perl
use strict;
use warnings;
my $n1 = $ARGV[0];
my $name = $ARGV[1];
$n1 =~ s/"//g;
my $n2 = $n1 + 1;
my %seen;
my ( $file3 ) = qw(log.txt);
open my $fh3, '>', $file3 or die "Can't open $file3: $!";
print "Loading file ...\n";
open( my $file, "<", "$name" ) || die "Can't read file somefile.txt: $!";
while ( <$file> ) {
chomp;
my #rec = split( /,/, $_, $n2 ); #$n2 sirve para armar la primary key, hacer le split en los campos deseados
for ( my $i = 0; $i < $n1; $i++ ) {
print $fh3 "#rec[$i],";
}
print $fh3 "\n";
}
close( $file );
print "Done!\n";
#########to check duplicates
my ($file4) = qw(log.txt);
print "Checking duplicates records...\n\n";
open (my $file4, "<", "log.txt") || die "Can't read file log.txt: $!";
while ( <$file4> ) {
print if $seen{$_}++;
}
close($file4);
if I send the following instruction
perl Verify.pl 2 tbl_20180615.txt
this code build a new file called "log.txt" with the following structure, splitting the original file () into two columns given by the first argument:
(log.txt)
573103150033,0664,
573103150033,0665,
573103150080,0659,
573103150080,0660,
573103154377,1240,
573103154377,1240,
That works ok, but if I want to read the new file log.txt to check duplicates, it doesn't work, but If I comment the lines to generate the file log.txt (listed above) before the line in the code (###############to check duplicates################) launch the next part of the code it works ok, giving me two duplicates lines and looks like this:
(Result in command line)
573103154377,1240
573103154377,1240
How can I solve this issue?
I think this does what you're asking for. It builds a unique list of derived keys before printing any of them, using a hash to check whether a key has already been generated
Note that I have assigned values to #ARGV to emulate input values. You must remove that statement before running the program with input from the command line
#!/usr/bin/perl
use strict;
use warnings;
use autodie; # Handle bad IO statuses automatically
local #ARGV = qw/ 2 tbl_20180615.txt /; # For testing only
tr/"//d for #ARGV; # "
my ($key_fields, $input_file) = #ARGV;
my $output_file = 'log.txt';
my (#keys, %seen);
print "Loading input ... ";
open my $in_fh, '<', $input_file;
while ( <$in_fh> ) {
chomp;
my #rec = split /,/;
my $key = join ',', #rec[0..$key_fields-1];
push #keys, $key unless $seen{$key}++;
}
print "Done\n";
open my $out_fh, '>', $output_file;
print $out_fh "$_\n" for #keys;
close $out_fh;
output log.txt
573103150033,0664
573103150033,0665
573103150080,0659
573103150080,0660
573103154377,1240
Below code works fine but I want $ip to be printed after closing the file.
use strict;
use warnings;
use POSIX;
my $file = "/tmp/example";
open(FILE, "<$file") or die $!;
while ( <FILE> ) {
my $lines = $_;
if ( $lines =~ m/address/ ) {
my ($string, $ip) = (split ' ', $lines);
print "IP address is: $ip\n";
}
}
close(FILE);
sample data in /tmp/example file
$cat /tmp/example
country us
ip_address 192.168.1.1
server dell
This solution looks for the first line that contains ip_address followed by some space and a sequence of digits and dots
Wrapping the search in a block makes perl delete the lexical variable $fh. Because it is a file handle, that handle will also be automatically closed
Note that I've used autodie to avoid the need to explicitly check the status of the open call
This algorithm will find the first occurrence of ip_address and stop reading the file immediately
use strict;
use warnings 'all';
use autodie;
my $file = '/tmp/example';
my $ip;
{
open my $fh, '<', $file;
while ( <$fh> ) {
if ( /ip_address\h+([\d.]+)/ ) {
$ip = $1;
last;
}
}
}
print $ip // 'undef', "\n";
output
192.168.1.1
Store all ips in an array and you'll then have it for later processing.
The shown code can also be simplified a lot. This assumes a four-number ip and data like that shown in the sample
use warnings;
use strict;
use feature 'say';
my $file = '/tmp/example';
open my $fh, '<', $file or die "Can't open $file: $!";
my #ips;
while (<$fh>) {
if (my ($ip) = /ip_address\s*(\d+\.\d+\.\d+\.\d+)/) {
push #ips, $ip;
}
}
close $fh;
say for #ips;
Or, once you open the file, process all lines with a map
my #ips = map { /ip_address\s*(\d+\.\d+\.\d+\.\d+)/ } <$fh>;
The filehandle is here read in a list context, imposed by map, so all lines from the file are returned. The block in map applies to each in turn, and map returns a flattened list with results.
Some notes
Use three-argument open, it is better
Don't assign $_ to a variable. To work with a lexical use while (my $line = <$fh>)
You can use split but here regex is more direct and it allows you to assign its match so that it is scoped. If there is no match the if fails and nothing goes onto the array
use warnings;
use strict;
my $file = "test";
my ( $string,$ip);
open my $FH, "<",$file) or die $!;
while (my $lines = <FH>) {
if ($lines =~ m/address/){
($string, $ip) = (split ' ', $lines);
}
}
print "IP address is: $ip\n";
This will give you the output you needed. But fails in the case of multiple IP match lines in the input file overwrites the last $ip variable.
I am trying to send a variable that is defined in an if statement $abc to a new file. The code seems correct but, I know that it is not working because the file is not being created.
Data File Sample:
bos,control,x1,x2,29AUG2016,y1,y2,76.4
bos,control,x2,x3,30AUG2016,y2,y3,78.9
bos,control,x3,x4,01SEP2016,y3,y4,72.5
bos,control,x4,x5,02SEP2016,y4,y5,80.5
Perl Code:
#!/usr/bin/perl
use strict;
use warnings 'all';
use POSIX qw(strftime); #Pull in date
my $currdate = strftime( "%Y%m%d", localtime ); #Date in YYYYMMDD format
my $modded = strftime( "%d%b%Y", localtime ); #Date in DDMONYYYY format
my $newdate = uc $modded; #converts lowercase to uppercase
my $filename = '/home/.../.../text_file'; #Define full file path before opening
open(FILE, '<', $filename) or die "Uh, where's the file again?\n"; #Open file else give up and relay snarky error
while(<FILE>) #Open While Loop
{
chomp;
my #fields = split(',' , $_); #Identify columns
my $site = $fields[0];
my $var1 = $fields[1];
my $var2 = $fields[4];
my $var3 = $fields[7];
my $abc = print "$var1,$var2,$var3\n" if ($var1 =~ "control" && $var2 =~ "$newdate");
open my $abc, '>', '/home/.../.../newfile.txt';
close $abc;
}
close FILE;
In your code you have a few odd things that are likely mistakes.
my $abc = print "$var1,$var2,$var3\n" if ($var1 =~ "c01" && $var2 =~ "$newdate");
print will return success, which it does as 1. So you will print out the string to STDOUT, and then assign 1 to a new lexical variable $abc. $abc is now 1.
All of that only happens if that condition is met. Don't do conditional assignments. The behavior for this is undefined. So if the condition is false, your $abc might be undef. Or something else. Who knows?
open my $abc, '>', '/home/.../.../newfile.txt';
close $abc;
You are opening a new filehandle called $abc. The my will redeclare it. That's a warning that you would get if you had use warnings in your code. It also overwrites your old $abc with a new file handle object.
You don't write anything to the file
... are weird foldernames, but that's probably just obfuscation for your example
I think what you actually want to do is this:
use strict;
use warnings 'all';
# ...
open my $fh, '<', $filename or die $!;
while ( my $line = <$fh> ) {
chomp $line;
my #fields = split( ',', $line );
my $site = $fields[0];
my $var1 = $fields[1];
my $var2 = $fields[4];
my $var3 = $fields[7];
open my $fh_out, '>', '/home/.../.../newfile.txt';
print $fh_out "$var1,$var2,$var3\n" if ( $var1 =~ "c01" && $var2 =~ "$newdate" );
close $fh_out;
}
close $fh;
You don't need the $abc variable in between at all. You can just print to your new file handle $fh_out that's open for writing.
Note that you will overwrite the newfile.txt file every time you have a match in a line inside $filename.
Your current code:
Prints the string
Assigns the result of printing it to a variable
Immediately overwrites that variable with a file handle (assuming open succeeded)
Closes that file handle without using it
Your logic should look more like this:
if ( $var1 =~ "c01" && $var2 =~ "$newdate" ) {
my $abc = "$var1,$var2,$var3\n"
open (my $file, '>', '/home/.../.../newfile.txt') || die("Could not open file: " . $!);
print $file $abc;
close $file;
}
You have a number of problems with your code. In addition to what others have mentioned
You create a new output file every time you find a matching input line. That will leave the file containing only the last printed string
Your test checks whether the text in the second column contains c01, but all of the lines in your sample input have control in the second column, so nothing will be printed
I'm guessing that you want to test for string equality, in which case you need eq instead of =~ which does a regular expression pattern match
I think it should look something more like this
use strict;
use warnings 'all';
use POSIX 'strftime';
my $currdate = uc strftime '%d%b%Y', localtime;
my ($input, $output) = qw/ data.txt newfile.txt /;
open my $fh, '<', $input or die qq{Unable to open "$input" for input: $!};
open my $out_fh, '>', $output or die qq{Unable to open "$output" for output: $!};
while ( <$fh> ) {
chomp;
my #fields = split /,/;
my ($site, $var1, $var2, $var3) = #fields[0,1,4,7];
next unless $var1 eq 'c01' and $var2 eq $currdate;
print $out_fh "$var1,$var2,$var3\n";
}
close $out_fh or die $!;
As the title says, I have a program or better two functions to read and write a file either in an array or to one. But now to the mean reason why I write this: when running my test several times my test program that tests my functions produces more and more white space. Is there somebody that could explain my fail and correct me?
my code
Helper.pm:
#!/usr/bin/env perl
package KconfCtl::Helper;
sub file_to_array($) {
my $file = shift();
my ( $filestream, $string );
my #rray;
open( $filestream, $file ) or die("cant open $file: $!");
#rray = <$filestream>;
close($filestream);
return #rray;
}
sub array_to_file($$;$) {
my #rray = #{ shift() };
my $file = shift();
my $mode = shift();
$mode='>' if not $mode;
my $filestream;
if ( not defined $file ) {
$filestream = STDOUT;
}
else {
open( $filestream, $mode, $file ) or die("cant open $file: $!");
}
my $l = #rray; print $l,"\n";
foreach my $line (#rray) {
print $filestream "$line\n";
}
close($filestream);
}
1;
test_helper.pl:
use KconfCtl::Helper;
use strict;
my #t;
#t= KconfCtl::Helper::file_to_array("kconf.test");
#print #t;
my $t_index=#t;
#t[$t_index]="n";
KconfCtl::Helper::array_to_file(\#t, "kconf.test", ">");
the result after the first:
n
and the 2nd run:
n
n
When you read from a file, the data includes the newline characters at the end of each line. You're not stripping those off, but you are adding an additional newline when you output your data again. That means your file is gaining additional blank lines each time you read and write it
Also, you must always use strict and use warnings 'all' at the top of every Perl script; you should avoid using subroutine prototypes; and you should declare all of your variables as late as possible
Here's a more idiomatic version of your module code which removes the newlines on input using chomp. Note that you don't need the #! line on the module file as it won't be run from the command line, but you my want it on the program file. It's also more normal to export symbols from a module using the Exporter module so that you don't have to qualify the subroutine names by prefixing them with the full package name
use strict;
use warnings 'all';
package KconfCtl::Helper;
sub file_to_array {
my ($file) = #_;
open my $fh, '<', $file or die qq{Can't open "$file" for input: $!}; #'
chomp(my #array = <$fh>);
return #array;
}
sub array_to_file {
my ($array, $file, $mode) = #_;
$mode //= '>';
my $fh;
if ( $file ) {
open $fh, $mode, $file or die qq{Can't open "$file" for output: $!}; #'
}
else {
$fh = \*STDOUT;
}
print $fh $_, "\n" for #$array;
}
1;
and your test program would be like this
#!/usr/bin/env perl
use strict;
use warnings 'all';
use KconfCtl::Helper;
use constant FILE => 'kconf.test';
my #t = KconfCtl::Helper::file_to_array(FILE);
push #t, 'n';
KconfCtl::Helper::array_to_file(\#t, FILE);
When you read in from your file, you need to chomp() the lines, or else the \n at the end of the line is included.
Try this and you'll see what's happening:
use Data::Dumper; ## add this line
sub file_to_array($) {
my $file = shift();
my ( $filestream, $string );
my #rray;
open( $filestream, '<', $file ) or die("cant open $file: $!");
#rray = <$filestream>;
close($filestream);
print Dumper( \#rray ); ### add this line
return #rray;
}
you can add
foreach(#rray){
chomp();
}
into your module to stop this happening.
I am trying to bring a file loop through it and remove any strings that have less than four characters in it and then print the list. I come from a javascript world and perl is brand new to me.
use strict;
use warnings;
sub lessThan4 {
open( FILE, "<names.txt" );
my #LINES = <FILE>;
close( FILE );
open( FILE, ">names.txt" );
foreach my $LINE ( #LINES ) {
print FILE $LINE unless ( $LINE.length() < 4 );
}
close( FILE );
}
use strict;
use warnings;
# automatically throw exception if open() fails
use autodie;
sub lessThan4 {
my #LINES = do {
# modern perl uses lexical, and three arg open
open(my $FILE, "<", "names.txt");
<$FILE>;
};
# remove newlines
chomp(#LINES);
open(my $FILE, ">", "names.txt");
foreach my $LINE ( #LINES ) {
print $FILE "$LINE\n" unless length($LINE) < 4;
# possible alternative to 'unless'
# print $FILE "$LINE\n" if length($LINE) >= 4;
}
close($FILE);
}
You're basically there. I hope you'll find some comments on your code useful.
# Well done for including these. So many new Perl users don't
use strict;
use warnings;
# Perl programs traditionally use all lower-case subroutine names
sub lessThan4 {
# 1/ You should use lexical variables for filehandles
# 2/ You should use the three-argument version of open()
# 3/ You should always check the return value from open()
open( FILE, "<names.txt" );
# Upper-case variable names in Perl are assumed to be global variables.
# This is a lexical variable, so name it using lower case.
my #LINES = <FILE>;
close( FILE );
# Same problems with open() here.
open( FILE, ">names.txt" );
foreach my $LINE ( #LINES ) {
# This is your biggest problem. Perl doesn't yet embrace the idea of
# calling methods to get properties of a variable. You need to call
# length() as a function.
print FILE $LINE unless ( $LINE.length() < 4 );
}
close( FILE );
}
Rewriting to take all that into account, we get the following:
use strict;
use warnings;
sub less_than_4 {
open( my $in_file_h, '<', 'names.txt' ) or die "Can't open file: $!";
my #lines = <$in_file_h>;
close( $in_file_h );
open( my $out_file_h, '>', 'names.txt' ) or die "Can't open file: $!";
foreach my $line ( #lines ) {
# Note: $line will include the newline character, so you might need
# to increase 4 to 5 here
print $out_file_h $line unless length $line < 4;
}
close( $out_file_h );
}
I am trying to bring a file loop through it and remove any strings that have less than four characters in it and then print the list.
I suppose you need to remove strings from the file which are less than 4 chars in length.
#!/usr/bin/perl
use strict;
use warnings;
open ($FH, "<", "names.txt");
my #final_list;
while (my $line = <$FH>) {
map {
length($_) > 4 and push (#final_list, $_) ;
} split (/\s/, $line);
}
print "\nWords with more than 4 chars: #final_list\n";
#Please try this one:
use strict;
use warnings;
my #new;
while(<DATA>)
{
#Push all the values less than 4 characters
push(#new, $_) unless(length($_) > '4');
}
print #new;
__DATA__
Williams
John
Joe
Lee
Albert
Francis
Sun