Split list of delimited lines to hash - perl

The following produces what i want.
#!/usr/bin/env perl
use 5.020;
use warnings;
use Data::Dumper;
sub command {
<DATA>
#in the reality instead of the DATA I have
#qx(some weird shell command what produces output like in the DATA);
}
my #lines = grep { !/^\s*$/ } command();
chomp #lines;
my $data;
#how to write the following nicer - more compact, elegant, etc.. ;)
for my $line (#lines) {
my #arr = split /:/, $line;
$data->{$arr[0]}->{text} = $arr[1];
$data->{$arr[0]}->{par} = $arr[2];
$data->{$arr[0]}->{val} = $arr[3];
}
say Dumper $data;
__DATA__
line1:some text1:par1:val1
line2:some text2:par2:val2
line3:some text3:par3:val3
Wondering how to write the loop in more perlish form. ;)

You can assign to a hash slice:
for my $line (#lines) {
my ($id, #arr) = split /:/, $line;
#{ $data->{$id} }{qw{ text par val }} = #arr;
}
Also, use the following instead of qx, so you don't need to store all the lines in an array:
open my $PIPE, '-|', 'command' or die $!;
while (<$PIPE>) {
# ...
}

Related

I need the output in following way in perl

# File-
# a,b,c,d,e,f
# 1,2,3,4,3,2
# 9,8,7,6,5,0
# 2,3,4,6,7,8
# i need output like this:-
# a=1,d=4,c=3,a=9,d=6,c=7,a=2,d=6,c=4
# but my program gives this:-
# a=1,d=4,c=3a=9,d=6,c=7a=2,d=6,c=4 (there is no , after c and a)
my script is :-
open ($fh, 'parse.txt');
my #arr;
my $dummy=<$fh>;
while (<$fh>) {
chomp;
$a = substr $_, 0,1;
$b = substr $_, 6,1;
$c = substr $_, 4,1;
print "a=$a,d=$b,c=$c";
}
close (IN);
my $prefix = '';
while (<$fh>) {
chomp;
my #fields = split /,/;
print $prefix."a=$fields[0],d=$fields[3],c=$fields[2]";
$prefix = ',';
}
print("\n");
or
my #recs;
while (<$fh>) {
chomp;
my #fields = split /,/;
push #recs, "a=$fields[0],d=$fields[3],c=$fields[2]";
}
print(join(',', #recs), "\n");
Instead of printing out the values you could append them to a string and include a comma after the "c" value. Then at the end of the loop, erase the final comma from the string and print it out. There are some scalability problems if your input file is too large. But if it's a reasonable size there shouldn't be any substantial issue.
my $output;
my $dummy=<$fh>;
while (<$fh>) {
chomp;
$a = substr $_, 0,1;
$b = substr $_, 6,1;
$c = substr $_, 4,1;
$output .= "a=$a,d=$b,c=$c,";
}
chop $output;
print $output;
If you have fields with separators split the line and collect needed elements
use warnings;
use strict;
use feature 'say';
my $file = 'parse.txt';
open my $fh, '<', $file or die "Can't open $file: $!";
my $dummy = <$fh>;
my #res;
while (<$fh>)
{
my ($a, $d, $c) = (split /,/)[0,3,2];
push #res, "a=$a,d=$d,c=$c";
}
say join ',', #res;
or pick the order in the assignment
my ($a, $c, $d) = (split /,/)[0,2,3];

Print a variable which is inside two loops

I couldn't figure it out how to escape this.
I would like to print the variable $rfam_column, which is inside two loops. But I cannot just write the print command right after the place where $rfam_column appears, because I would like to print other things which will be outside the loop and combine them to the printed content.
I would appreciate any advice as to what I'm doing wrong here.
use warnings;
use strict;
my $in;
GetOptions('input' => \$in) or die;
if ( $in ) {
my $input = $ARGV[0] or die;
open (my $fh, '<', $input) or die "Can't open $input $!\n";
chomp (my #db_file = <$fh>);
close $fh;
my #list = grep /RNA/, #db_file;
my $column;
my #column = ();
foreach $column ( #list ) {
my #all_columns = split (/\t/, $column);
my $rfam_column = $all_columns[0];
# insert "|" between RFs
foreach $_ ( $rfam_column ) {
s/^/|/;
}
}
}
print "$rfam_column";
Global symbol "$rfam_column" requires explicit package name at script_vbeta.pl line 90.
Execution of script_vbeta.pl aborted due to compilation errors.
EDITED to include all the code and information of the input--output as suggested:
Input file is a table with n lines vs n columns like this (I extracted a few columns otherwise it would be much long to represent in a line):
RF00001 1302 5S ribosomal RNA
RF00006 1307 Vault RNA
RF00007 1308 U12 minor spliceosomal RNA
RF00008 1309 Hammerhead ribozyme (type III)
Output should be like this:
|RF00001|RF00006|RF00007
And the code (usage: script.pl -i input_file):
use warnings;
use strict;
use Getopt::Long;
Getopt::Long::Configure("pass_through");
my $in;
GetOptions('input' => \$in) or die;
if ( $in ) {
my $input = $ARGV[0] or die;
open (my $fh, '<', $input) or die "Can't open $input $!\n";
chomp (my #db_file = <$fh>);
close $fh;
my #list = grep /RNA/, #db_file;
my $column;
my #column = ();
foreach $column ( #list ) {
my #all_columns = split (/\t/, $column);
my $rfam_column = $all_columns[0];
# insert "|" between RFs
foreach $_ ( $rfam_column ) {
s/^/|/;
}
}
}
print "$rfam_column";
I think you want
if ($in) {
...
my #rfams;
for my $row (#list) {
my #fields = split(/\t/, $row);
my $rfam = $fields[0];
push #rfams, $rfam;
}
my $rfams = join('|', #rfams);
print("$rfams\n");
}
I would like to print other things which will be outside the loop and combine them to the $rfam_column content
You can include anything that is in an outer scope in print. You can just put your print statement inside the inner loop
By the way, I don't know what you mean by
# insert "|" between RFs
foreach $_ ($rfam_column) {
s/^/|/;
}
That is the same as
$rfam_column =~ s/^/|/;
which just adds a pipe | character to the beginning of the string
What is an RF?

Perl : Need to append two columns if the ID's are repeating

If id gets repeated I am appending app1, app2 and printing it once.
Input:
id|Name|app1|app2
1|abc|234|231|
2|xyz|123|215|
1|abc|265|321|
3|asd|213|235|
Output:
id|Name|app1|app2
1|abc|234,265|231,321|
2|xyz|123|215|
3|asd|213|235|
Output I'm getting:
id|Name|app1|app2
1|abc|234,231|
2|xyz|123,215|
1|abc|265,321|
3|asd|213,235|
My Code:
#! usr/bin/perl
use strict;
use warnings;
my $basedir = 'E:\Perl\Input\\';
my $file ='doctor.txt';
my $counter = 0;
my %RepeatNumber;
my $pos=0;
open(OUTFILE, '>', 'E:\Perl\Output\DoctorOpFile.csv') || die $!;
open(FH, '<', join('', $basedir, $file)) || die $!;
my $line = readline(FH);
unless ($counter) {
chomp $line;
print OUTFILE $line;
print OUTFILE "\n";
}
while ($line = readline(FH)) {
chomp $line;
my #obj = split('\|',$line);
if($RepeatNumber{$obj[0]}++) {
my $str1= join("|",$obj[0]);
my $str2=join(",",$obj[2],$obj[3]);
print OUTFILE join("|",$str1,$str2);
print OUTFILE "\n";
}
}
This should do the trick:
use strict;
use warnings;
my $file_in = "doctor.txt";
open (FF, "<$file_in");
my $temp = <FF>; # remove first line
my %out;
while (<FF>)
{
my ($id, $Name, $app1, $app2) = split /\|/, $_;
$out{$id}[0] = $Name;
push #{$out{$id}[1]}, $app1;
push #{$out{$id}[2]}, $app2;
}
foreach my $key (keys %out)
{
print $key, "|", $out{$key}[0], "|", join (",", #{$out{$key}[1]}), "|", join (",", #{$out{$key}[2]}), "\n";
}
EDIT
To see what the %out contains (in case it's not clear), you can use
use Data::Dumper;
and print it via
print Dumper(%out);
I'd tackle it like this:
#!/usr/bin/env perl
use strict;
use warnings;
use Data::Dumper;
use 5.14.0;
my %stuff;
#extract the header row.
#use the regex to remove the linefeed, because
#we can't chomp it inline like this.
#works since perl 5.14
#otherwise we could just chomp (#header) later.
my ( $id, #header ) = split( /\|/, <DATA> =~ s/\n//r );
while (<DATA>) {
#turn this row into a hash of key-values.
my %row;
( $id, #row{#header} ) = split(/\|/);
#print for diag
print Dumper \%row;
#iterate each key, and insert into $row.
foreach my $key ( keys %row ) {
push( #{ $stuff{$id}{$key} }, $row{$key} );
}
}
#print for diag
print Dumper \%stuff;
print join ("|", "id", #header ),"\n";
#iterate ids in the hash
foreach my $id ( sort keys %stuff ) {
#join this record by '|'.
print join('|',
$id,
#turn inner arrays into comma separated via map.
map {
my %seen;
#use grep to remove dupes - e.g. "abc,abc" -> "abc"
join( ",", grep !$seen{$_}++, #$_ )
} #{ $stuff{$id} }{#header}
),
"\n";
}
__DATA__
id|Name|app1|app2
1|abc|234|231|
2|xyz|123|215|
1|abc|265|321|
3|asd|213|235|
This is perhaps a bit overkill for your application, but it should handle arbitrary column headings and arbitary numbers of duplicates. I'll coalesce them though - so the two abc entries don't end up abc,abc.
Output is:
id|Name|app1|app2
1|abc|234,265|231,321
2|xyz|123|215
3|asd|213|235
Another way of doing it which doesn't use a hash (in case you want to be more memory efficient), my contribution lies under the opens:
#!/usr/bin/perl
use strict;
use warnings;
my $basedir = 'E:\Perl\Input\\';
my $file ='doctor.txt';
open(OUTFILE, '>', 'E:\Perl\Output\DoctorOpFile.csv') || die $!;
select(OUTFILE);
open(FH, '<', join('', $basedir, $file)) || die $!;
print(scalar(<FH>));
my #lastobj = (undef);
foreach my $obj (sort {$a->[0] <=> $b->[0]}
map {chomp;[split('|')]} <FH>) {
if(defined($lastobj[0]) &&
$obj[0] eq $lastobj[0])
{#lastobj = (#obj[0..1],
$lastobj[2].','.$obj[2],
$lastobj[3].','.$obj[3])}
else
{
if($lastobj[0] ne '')
{print(join('|',#lastobj),"|\n")}
#lastobj = #obj[0..3];
}
}
print(join('|',#lastobj),"|\n");
Note that split, without it's third argument ignores empty elements, which is why you have to add the last bar. If you don't do a chomp, you won't need to supply the bar or the trailing hard return, but you would have to record $obj[4].

Split a line on every 16th comma

I am using perl to extract "Yes," or "No," from a large CSV, and output to a file using this code
open my $fin, "leads.csv";
my $str;
for (<$fin>) {
if (/^\s*\d+\.\s*(\w+)/) {
$str .= $1 . ",";
}
}
open (MYFILE, '>>data.txt');
print MYFILE $str;
close (MYFILE);
This is working correctly, and outputting data like this http://pastebin.com/r7Lwwz8p, however I need to break
to a new line after the 16th element so it looks like this on output: http://pastebin.com/xC8Lyk5R
Any tips/tricks greatly appreciated!
The following splits a line by commas, and then regroups them by 16 elements:
use strict;
use warnings;
while (my $line = <DATA>) {
chomp $line;
my #fields = split ',', $line;
while (my #data = splice #fields, 0, 16) {
print join(',', #data), "\n";
}
}
__DATA__
LineA,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,LineB,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,LineC,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,LineD,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,LineE,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,LineF,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,LineG,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,LineH,2,3,4,5,6,7,8,9,10,11,12
Outputs:
LineA,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
LineB,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
LineC,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
LineD,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
LineE,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
LineF,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
LineG,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
LineH,2,3,4,5,6,7,8,9,10,11,12
Use a variable to count the number of yes/no matches that you find, and then use the mod (%) operator to insert a newline into the string.
#!/usr/bin/perl
use strict;
use warnings;
open my $fin, "leads.csv";
my $str;
my $count = 0;
for (<$fin>) {
if (/^\s*\d+\.\s*(\w+)/) {
$str .= $1 . ",";
$count++;
}
$str .= "\n" unless ($count % 16);
}
open (MYFILE, '>>data.txt');
print MYFILE $str;
close (MYFILE);

How many different ways are there to concatenate two files line by line using Perl?

Suppose file1 looks like this:
bye bye
hello
thank you
And file2 looks like this:
chao
hola
gracias
The desired output is this:
bye bye chao
hello hola
thank you gracias
I myself have already come up with five different approaches to solve this problem. But I think there must be more ways, probably more concise and more elegant ways, and I hope I can learn more cool stuff :)
The following is what I have tried so far, based on what I've learnt from the many solutions of my previous problems. Also, I'm trying to sort of digest or internalize the knowledge I've acquired from the Llama book.
Code 1:
#!perl
use autodie;
use warnings;
use strict;
open my $file1,'<','c:/file1.txt';
open my $file2,'<','c:/file2.txt';
while(defined(my $line1 = <$file1>)
and defined(my $line2 = <$file2>)){
die "Files are different sizes!\n" unless eof(file1) == eof(file2);
$line1 .= $line2;
$line1 =~ s/\n/ /;
print "$line1 \n";
}
Code 2:
#!perl
use autodie;
use warnings;
use strict;
open my $file1,'<','c:/file1.txt';
my #file1 = <$file1>;
open my $file2,'<','c:/file2.txt';
my #file2 =<$file2>;
for (my $n=0; $n<=$#file1; $n++) {
$file1[$n] .=$file2[$n];
$file1[$n]=~s/\n/ /;
print $file1[$n];
}
Code 3:
#!perl
use autodie;
use warnings;
use strict;
open my $file1,'<','c:/file1.txt';
open my $file2,'<','c:/file2.txt';
my %hash;
while(defined(my $line1 = <$file1>)
and defined(my $line2 = <$file2>)) {
chomp $line1;
chomp $line2;
my ($key, $val) = ($line1,$line2);
$hash{$key} = $val;
}
print map { "$_ $hash{$_}\n" } sort keys %hash;
Code 4:
#!perl
use autodie;
use warnings;
use strict;
open my $file1,'<','c:/file1.txt';
open my $file2,'<','c:/file2.txt';
while(defined(my $line1 = <$file1>)
and defined(my $line2 = <$file2>)) {
$line1 =~ s/(.+)/$1 $line2/;
print $line1;
}
Code 5:
#!perl
use autodie;
use warnings;
use strict;
open my $file1,'<','c:/file1.txt';
my #file1 =<$file1>;
open my $file2,'<','c:/file2.txt';
my #file2 =<$file2>;
while ((#file1) && (#file2)){
my $m = shift (#file1);
chomp($m);
my $n = shift (#file2);
chomp($n);
$m .=" ".$n;
print "$m \n";
}
I have tried something like this:
foreach $file1 (#file2) && foreach $file2 (#file2) {...}
But Perl gave me a syntactic error warning. I was frustrated. But can we run two foreach loops simultaneously?
Thanks, as always, for any comments, suggestions and of course the generous code sharing :)
This works for any number of files:
use strict;
use warnings;
use autodie;
my #handles = map { open my $h, '<', $_; $h } #ARGV;
while (#handles){
#handles = grep { ! eof $_ } #handles;
my #lines = map { my $v = <$_>; chomp $v; $v } #handles;
print join(' ', #lines), "\n";
}
close $_ for #handles;
The most elegant way doesn't involve perl at all:
paste -d' ' file1 file2
If I were a golfing man, I could rewrite #FM's answer as:
($,,$\)=(' ',"\n");#_=#ARGV;open $_,$_ for #_;print
map{chomp($a=<$_>);$a} #_=grep{!eof $_} #_ while #_
which you might be able to turn into a one-liner but that is just evil. ;-)
Well, here it is, under 100 characters:
C:\Temp> perl -le "$,=' ';#_=#ARGV;open $_,$_ for #_;print map{chomp($a =<$_>);$a} #_=grep{!eof $_ }#_ while #_" file1 file2
If it is OK to slurp (and why the heck not — we are looking for different ways), I think I have discovered the path the insanity:
#_=#ARGV;chomp($x[$.-1]{$ARGV}=$_) && eof
and $.=0 while<>;print "#$_{#_}\n" for #x
C:\Temp> perl -e "#_=#ARGV;chomp($x[$.-1]{$ARGV}=$_) && eof and $.=0 while<>;print qq{#$_{#_}\n} for #x" file1 file2
Output:
bye bye chao
hello hola
thank you gracias
An easier alternative to your Code 5 which allows for an arbitrary number of lines and does not care if files have different numbers of lines (hat tip #FM):
#!/usr/bin/perl
use strict; use warnings;
use File::Slurp;
use List::AllUtils qw( each_arrayref );
my #lines = map [ read_file $_ ], #ARGV;
my $it = each_arrayref #lines;
while ( my #lines = grep { defined and chomp and length } $it->() ) {
print join(' ', #lines), "\n";
}
And, without using any external modules:
#!perl
use autodie; use warnings; use strict;
my ($file1, $file2) = #ARGV;
open my $file1_h,'<', $file1;
my #file1 = grep { chomp; length } <$file1_h>;
open my $file2_h,'<', $file2;
my #file2 = grep { chomp; length } <$file2_h>;
my $n_lines = #file1 > #file2 ? #file1 : #file2;
for my $i (0 .. $n_lines - 1) {
my ($line1, $line2) = map {
defined $_ ? $_ : ''
} $file1[$i], $file2[$i];
print $line1, ' ', $line2, "\n";
}
If you want to concatenate only the lines that appear in both files:
#!perl
use autodie; use warnings; use strict;
my ($file1, $file2) = #ARGV;
open my $file1_h,'<', $file1;
my #file1 = grep { chomp; length } <$file1_h>;
open my $file2_h,'<', $file2;
my #file2 = grep { chomp; length } <$file2_h>;
my $n_lines = #file1 < #file2 ? #file1 : #file2;
for my $i (0 .. $n_lines - 1) {
print $file1[$i], ' ', $file2[$i], "\n";
}
An easy one with minimal error checking:
#!/usr/bin/perl -w
use strict;
open FILE1, '<file1.txt';
open FILE2, '<file2.txt';
while (defined(my $one = <FILE1>) or defined(my $twotemp = <FILE2>)){
my $two = $twotemp ? $twotemp : <FILE2>;
chomp $one if ($one);
chomp $two if ($two);
print ''.($one ? "$one " : '').($two ? $two : '')."\n";
}
And no, you can't run two loops simultaneous within the same thread, you'd have to fork, but that would not be guaranteed to run synchronously.