Find and Replace multiple texts from file a in file b - perl

I have a csv file which has on column A (chinese phrases) and on column B (their english equivalents).
On another .xml file I have large amounts of chinese texts. I want to make the script to search from csv file and if found replace the terms on xml file.
The closest that I got to my aim is this code.
use Cwd;
use File::Basename;
use File::Copy;
use strict;
use warnings;
my $DIR = $0;
my $filename = basename($0);
$DIR = '/Users/moody/Desktop/chinese/';
#my $tablecounter = 0;
my #GLOSSARY;
my $glossaryFile = 'Chinese.csv';
my $glossaryCount = 1;
my $RAWFILE = "Chinese.xml";
my #MODDEDFILES = ('Chinese-modded.html');
BuildGlossary();
my $FILETEXT = "";
my #MODDED_FILETEXT = ('','','','','','','','','');
ReadRaw($RAWFILE);
my $k = 1;
while ($k < $glossaryCount) {
my $search = $GLOSSARY[$k][0];
my $replace = $GLOSSARY[$k][1];
$FILETEXT =~ s/$search/$replace/g;
$k += 1;
}
$FILETEXT =~ s/<p><\/p>/<\/br>/g;
PrintText($MODDEDFILES[0]);
SplitText();
$k = 1;
while ($k < 3) {
PrintTextModded($MODDEDFILES[$k], $k);
$k += 1;
}
exit;
sub PrintText {
my $filename = $_[0];
open(my $fh, '>:encoding(UTF-8)', $filename)
or die "Could not open file '$filename' $!";
print $fh "<html><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" /><head><style>body {background-color: #bfbfbf; margin-right: 20%; margin-left: 20%;} p {margin: 0px; text-indent: 1.5em; word-wrap: break-word;} h3 {margin: 0px; text-align: center;} hr {border-color:black;}</style></head><body>\n\n";
print $fh $FILETEXT;
print $fh "</body></html>\n";
close $fh;
}
sub SplitText {
my #temp = split(/(<p>------------------------------------<\/p>)/, $FILETEXT);
shift #temp;
my $tempSize = #temp;
for (my $i=1; $i <= $tempSize; $i+=4) {
$temp[$i] =~ s/<p>/<h3>/g;
$temp[$i] =~ s/<\/p>/<\/h3>/g;
}
for (my $i=0; $i <= $tempSize; $i+=2) {
$temp[$i] = "<hr noshade>\n";
}
my #volumes = ([0,111],[112,224]);
for (my $k=0; $k < 2; $k++) {
my $temptext = "";
my $i = int($volumes[$k][0]);
my $u = $volumes[$k][1];
for (my $j=$i; $j <= $u; $j++) {
$temptext = $temptext.$temp[$j];
#$temp[$j] = "";
}
$MODDED_FILETEXT[$k] = $temptext;
}
}
sub PrintTextModded {
my $filename = $_[0];
my $num = $_[1];
open(my $fh, '>:encoding(UTF-8)', $filename)
or die "Could not open file '$filename' $!";
print $fh "<html><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" /><head><style>body {background-color: #bfbfbf; margin-right: 20%; margin-left: 20%;} p {margin: 0px; text-indent: 1.5em; word-wrap: break-word;} h3 {margin: 0px; text-align: center;} hr {border-color:black;}</style></head><body>\n\n";
print $fh $MODDED_FILETEXT[$num-1];
print $fh "</body></html>\n";
close $fh;
}
sub ReadRaw {
my $filename = $_[0];
open(my $fh, '<:encoding(UTF-8)', $filename)
or die "Could not open file '$filename' $!";
$FILETEXT = "";
while (my $row = <$fh>) {
chomp $row;
$FILETEXT = $FILETEXT."<p>".$row."</p>\n";
}
close $fh;
}
sub BuildGlossary {
open(my $fh, '<:encoding(UTF-8)', $glossaryFile)
or die "Could not open file '$glossaryFile' $!";
my $num = 1;
while (my $row = <$fh>) {
chomp $row;
my #temp = split "," , $row;
$GLOSSARY[$num][0] = $temp[0];
$GLOSSARY[$num][1] = $temp[1];
$num += 1;
$glossaryCount += 1;
}
close $fh;
}
However, the problem is that it still doesn't work. The problems that I face at the moment are:
Use of uninitialized value in concatenation (.) or string at find.pl line 74.
Use of uninitialized value $filename in open at find.pl line 85.
Use of uninitialized value $filename in concatenation (.) or string at find.pl line 85.
Could not open file '' No such file or directory at find.pl line 85.
Is there anyone who could help me out?

The issue was with $DIR = '/Users/moody/Desktop/chinese/';
It had to be changed to $DIR =~ s/$FILENAME//g;

Related

Compare two files and write matching data from first file using perl

First file
FirstName:LastName:Location:Country:ID
FirstName1:LastName1:Location1:Country1:ID1
FirstName2:LastName2:Location2:Country2:ID2
FirstName3:LastName3:Location3:Country3:ID3
FirstName4:LastName4:Location4:Country4:ID4
Second file
FirstName:LastName:Location:Country:Old_ID
FirstName2:LastName2:Location2:Country2:Old_ID2
FirstName4:LastName4:Location4:Country4:Old_ID4
Have to compare first and second file and print matching rows with data from first file which is have new ID's.
Below script fetches me Old_ID's from second file and not the new ones from first file
use warnings;
use strict;
my $details = 'file2.txt';
my $old_details = 'file1.txt';
my %names;
open my $data, '<', $details or die $!;
while (<$data>)
{
my ($name, #ids) = split;
push #{ $names{$_} }, $name for #ids;
}
open my $old_data, '<', $old_details or die $!;
while (<$old_data>)
{
chomp;
print #{ $names{$_} // [$_] }, "\n";
}
Output:
FirstName:LastName:Location:Country:Old_ID
FirstName2:LastName2:Location2:Country2:Old_ID2
FirstName4:LastName4:Location4:Country4:Old_ID4
Expected output:
FirstName:LastName:Location:Country:ID
FirstName2:LastName2:Location2:Country2:ID2
FirstName4:LastName4:Location4:Country4:ID4
Just try this way:
use strict; # Use strict Pragma
use warnings;
my ($file1, $filecnt1, $file2, $filecnt2) = ""; #Declaring variables
$file1 = "a1.txt"; $file2 = "b1.txt"; #Sample files
readFileinString($file1, \$filecnt1); # Reading first file
readFileinString($file2, \$filecnt2); # Reading second file
$filecnt2=~s/\:Old\_ID/\:ID/g; # Replacing that difference content
my #firstfle = split "\n", $filecnt1; # Move content to array variable to compare
my #secndfle = split "\n", $filecnt2;
my %firstfle = map { $_ => 1 } #firstfle; #Mapping the array into hash variable
my #scdcmp = grep { $firstfle{$_} } #secndfle;
print join "\n", #scdcmp;
#---------------> File reading
sub readFileinString
#--------------->
{
my $File = shift;
my $string = shift;
open(FILE1, "<$File") or die "\nFailed Reading File: [$File]\n\tReason: $!";
read(FILE1, $$string, -s $File, 0);
close(FILE1);
}
#---------------> File Writing
sub writeFileinString
#--------------->
{
my $File = shift;
my $string = shift;
my #cDir = split(/\\/, $File);
my $tmp = "";
for(my $i = 0; $i < $#cDir; $i++)
{
$tmp = $tmp . "$cDir[$i]\\";
mkdir "$tmp";
}
if(-f $File){
unlink($File);
}
open(FILE, ">$File") or die "\n\nFailed File Open for Writing: [$File]\n\nReason: $!\n";
print FILE $$string;
close(FILE);
}

Perl compilation error

sorry if it seems obvious but Im pretty new at Perl and programming and I've been working over a week and can't get it done.
My idea is simple. I've got a .csv where I've got the names in the first column, a number from -1 to 1 in the second and a position on the third. Then another file where I have got the names (line starts with >) and the info with 80 characters per line.
What I want to do is keep the name lines of the first file and grab the 'position' given from -20 to +60. But I cannot get it to work and I've got to the point where don't know where to follow.
use strict; #read file line by line
use warnings;
my $outputfile = "Output1.txt";
my $filename = "InputP.txt";
my $inputfasta = "Inputfasta.txt";
open my $fh, '<', $filename or die "Couldn't open '$filename'";
open my $fh2, '>', $outputfile or die "Couldn't create '$outputfile'";
open my $fh3, '<', $inputfasta or die "Couldn't open '$inputfasta'";
my $Psequence = 0;
my $seqname = 0;
while (my $line = <$fh>) {
chomp $line;
my $length = index ($line, ",");
$seqname = substr ($line, 0, $length);
my $length2 = index ($line, ",", $length);
my $score = substr ($line, $length +1, $length2);
my $length3 = index ($line, ",", $length2);
my $position = substr ($line, $length2 +1, $length3);
#print $fh2 "$seqname"."\t"."$score"."\t"."$position"."\n"; }
my $Rlength2 = index ($score, ",");
my $Rscore = substr ($score, 0, $Rlength2);
#print "$Rscore"."\n";}
while (my $linea = <$fh3>){ #same order.
chomp $linea;
if ($linea=~/^>(.+)/) {
print $fh3 "\n"."$linea"."\n"; }
else { $linea =~ /^\s*(.*)\s*$/;
chomp $linea;
print $fh3 "$linea". "\n"; }
}
if ($Rscore >= 0.5){
$Psequence = substr ($linea, -20, 81);
print "$seqname"."\n"."$Psequence";}
}
Please, learn to indent the code correctly. Then the error will be more obvious:
while (my $linea = <$fh3>){ #same order.
chomp $linea;
if ($linea =~ /^>(.+)/) {
print $fh3 "\n$linea\n";
} else {
# Commented out as it does nothing.
# $linea =~ /^\s*(.*)\s*$/;
# chomp $linea;
print $fh3 "$linea\n";
}
}
if ($Rscore >= 0.5){
$Psequence = substr $linea, -20, 81;
print "$seqname\n$Psequence";
}
$linea exists only in the while loop, but you try to use it in the following paragraph, too. The variable disappears when the loop ends.
Create a hash from the CSV where the key is the name and the value is the position.
use Text::CSV_XS qw( );
my %pos_by_name;
{
open(my $fh, '<', $input_qfn)
or die("Can't open $input_qfn: $!\n");
my $csv = Text::CSV_XS->new({ auto_diag => 1, binary => 1 });
while (my $row = $csv_in->getline($fh)) {
$pos_by_name{ $row->[0] } = $row->[2];
}
}
Then, it's just a question of extracting the names from the other file, and using the hash to find the associated position.
open(my $fh, '<', $fasta_qfn)
or die("Can't open $fasta_qfn: $!\n");
while (<$fh>) {
chomp;
my ($name) = /^>(.*)/
or next;
my $pos = $pos_by_name{$name};
if (!defined($pos)) {
die("Can't find position for $name\n");
}
... Do something with $name and $pos ...
}

Is there a better way of writing this code to avoid redundancy?

I have a segment of code in a program, which accepts GNU style input from a pipe (which is a list of file names). If STDIN does not contain data, I need to accept input from a predetermined text file containing file names.
I find myself needing to write redundant code. Is it possible to simplify this bit of code to avoid redundancy?
sub downloadlinkgen {
my $fh;
print "Printing links\n";
if ($getfilelist==1) {
open $fh, '<', "fuzzyfile" or die $!;
while (<$fh>) {
chomp ($_);
(my $fname,my $path, my $suffix) = fileparse($_);
my ($name, $ext) = $fname =~ /(.*)\.(.*)/;
my $newfile=$path.$name.".$ext";
$newfile =~ s/\s/%20/g;
$newfile =~ s/\/root/http:\/\/myip/;
print $newfile."\n";
}
} else {
while (<>) {
chomp ($_);
(my $fname,my $path, my $suffix) = fileparse($_);
my ($name, $ext) = $fname =~ /(.*)\.(.*)/;
my $newfile=$path.$name.".$ext";
$newfile =~ s/\s/%20/g;
$newfile =~ s/\/root/http:\/\/myip/;
print $newfile."\n";
}
}
}
Yes, just make the default ARGV filehandle open the file:
sub downloadlinkgen {
#ARGV = 'fuzzyfile' if $getfilelist == 1;
print "Printing links\n";
while (<>) {
chomp ($_);
(my $fname,my $path, my $suffix) = fileparse($_);
my ($name, $ext) = $fname =~ /(.*)\.(.*)/;
my $newfile=$path.$name.".$ext";
$newfile =~ s/\s/%20/g;
$newfile =~ s/\/root/http:\/\/myip/;
print $newfile."\n";
}
}
sub downloadlinkgen {
# default file handle
my $fh = \*ARGV;
print "Printing links\n";
if ($getfilelist==1) {
open $fh, '<', "fuzzyfile" or die $!;
}
while (<$fh>) {
chomp ($_);
(my $fname,my $path, my $suffix) = fileparse($_);
my ($name, $ext) = $fname =~ /(.*)\.(.*)/;
my $newfile=$path.$name.".$ext";
$newfile =~ s/\s/%20/g;
$newfile =~ s/\/root/http:\/\/myip/;
print $newfile."\n";
}
}
From perldoc -f readline
Reads from the filehandle whose typeglob is contained in EXPR (or from *ARGV if EXPR is not provided)
so \*ARGV is reference to file handle used when reading from <>, and you can use $fh in both cases.
Even if you don't know about ARGV, you could do something simple like this:
sub downloadlinkgen {
my $fh;
print "Printing links\n";
if ($getfilelist==1) {
open $fh, '<', "fuzzyfile" or die $!;
while (<$fh>) {
process_line($_);
}
} else {
while (<>) {
process_line($_);
}
}
}
sub process_line {
my $line = shift;
chomp ($line);
(my $fname,my $path, my $suffix) = fileparse($line);
my ($name, $ext) = $fname =~ /(.*)\.(.*)/;
my $newfile=$path.$name.".$ext";
$newfile =~ s/\s/%20/g;
$newfile =~ s/\/root/http:\/\/myip/;
print $newfile."\n";
}

How can I print lines from a file to separate files

I have a file which has lines like this:
1 107275 447049 scaffold1443 465 341154 -
There are several lines which starts with one, after that a blank line separates and start lines with 2 and so on.
I want to separate these lines to different files based on their number.
I wrote this script but it prints in every file only the first line.
#!/usr/bin/perl
#script for choosing chromosome
use strict;
my $filename= $ARGV[0];
open(FILE, $filename);
while (my $line = <FILE>) {
my #data = split('\t', $line);
my $length = #data;
#print $length;
my $num = $data[0];
if ($length == 6) {
open(my $fh, '>', $num);
print $fh $line;
}
$num = $num + 1;
}
please, i need your help!
use >> to open file for appending to end of it as > always truncates desired file to zero bytes,
use strict;
my $filename = $ARGV[0];
open(my $FILE, "<", $filename) or die $!;
while (my $line = <$FILE>) {
my #data = split('\t', $line);
my $length = #data;
#print $length;
my $num = $data[0];
if ($length == 6) {
open(my $fh, '>>', $num);
print $fh $line;
}
$num = $num + 1;
}
If I understand your question correctly, then paragraph mode might be useful. This breaks a record on two or more new-lines, instead of just one:
#ARGV or die "Supply a filename\n";
my $filename= $ARGV[0];
local $/ = ""; # Set paragraph mode
open(my $file, $filename) or die "Unable to open '$filename' for read: $!";
while (my $lines = <$file>) {
my $num = (split("\t", $lines))[0];
open(my $fh, '>', $num) or die "Unable to open '$num' for write: $!";
print $fh $lines;
close $fh;
}
close $file;

Extracting multiple lines of record/data using a subroutine or functions

Can you show me how to create a subroutine or function using this code?
Basically I want to make my code into a subroutine so I'll be able to re-use it without making my script too long.
Here is my script:
#!/usr/local/bin/perl
use strict;
use warnings;
use Data::Dumper;
use Carp qw(croak);
my #fields;
my ($tmp_var, $rec_type, $country, $header, $Combline, $records, $line);
my $filename = 'data5.txt';
open (my $input_fh, '<', $filename ) or croak "Can't open $filename: $!";
open my $OUTPUTA, ">", 'drp1.txt' or die $!;
open my $OUTPUTB, ">", 'drp2.txt' or die $!;
while (<$input_fh>) {
$line = _trim($_);
#fields = split (/\|/, $line);
$rec_type = $fields[0];
$country = $fields[1];
my $string = substr $fields[1], 0, 1;
$header = $line if(/^INVHDR/);
if ($rec_type eq 'INVDET') {
if ($string eq 'I') {
$records = $header . $line;
print $OUTPUTA $records, scalar <$input_fh>;
}
else {
$records = $header . $line;
print $OUTPUTB $records, scalar <$input_fh>;
}
}
}
close $OUTPUTA or die $!;
close $OUTPUTB or die $!;
sub _trim {
my $word = shift;
if ( $word ) {
$word =~ s/\s*\|/\|/g; #remove trailing spaces
$word =~ s/"//g; #remove double quotes
}
return $word;
}
This is the part of the script that I wanted to put in a subroutine or function:
$line = _trim($_);
#fields = split (/\|/, $line);
$rec_type = $fields[0];
$country = $fields[1];
my $string = substr $fields[1], 0, 1;
$header = $line if (/^INVHDR/);
if ($rec_type eq 'INVDET') {
if ($string eq 'I') {
$records = $header . $line;
print $OUTPUTA $records, scalar <$input_fh>;
}
else {
$records = $header . $line;
print $OUTPUTB $records, scalar <$input_fh>;
}
}
I would suggest breaking it out a little differently and expand on your _trim function, turning it into a parse function:
use strict;
use warnings;
open( my $input_fh, '<', 'data5.txt' ) or die "Can't open $filename: $!";
open( my $OUTPUTA, '>', 'drp1.txt' ) or die $!;
open( my $OUTPUTB, '>', 'drp2.txt' ) or die $!;
my $header = '';
while (<$input_fh>) {
if ($_ =~ /^INVHDR/) {
$header = $_;
}
if ($_ =~ /^INVDET/) {
my #data = parse($_);
my $line = $header . join('|', #data);
# scalar <$input_fh> is almost certainly not doing what you expect,
# though I'm not sure what you're try to accomplish with it
if ( $data[1] =~ /^I/ ) {
print $OUTPUTA $line;
} else {
print $OUTPUTB $line;
}
}
}
sub parse {
my $input = shift || return;
my $input =~ s/"//g; # remove double quotes
# Here I've combined the removal of trailing spaces with the split.
my #fields = split( m{\s*\|}, $input );
return #fields;
}