Perl : Need to append two columns if the ID's are repeating - perl

If id gets repeated I am appending app1, app2 and printing it once.
Input:
id|Name|app1|app2
1|abc|234|231|
2|xyz|123|215|
1|abc|265|321|
3|asd|213|235|
Output:
id|Name|app1|app2
1|abc|234,265|231,321|
2|xyz|123|215|
3|asd|213|235|
Output I'm getting:
id|Name|app1|app2
1|abc|234,231|
2|xyz|123,215|
1|abc|265,321|
3|asd|213,235|
My Code:
#! usr/bin/perl
use strict;
use warnings;
my $basedir = 'E:\Perl\Input\\';
my $file ='doctor.txt';
my $counter = 0;
my %RepeatNumber;
my $pos=0;
open(OUTFILE, '>', 'E:\Perl\Output\DoctorOpFile.csv') || die $!;
open(FH, '<', join('', $basedir, $file)) || die $!;
my $line = readline(FH);
unless ($counter) {
chomp $line;
print OUTFILE $line;
print OUTFILE "\n";
}
while ($line = readline(FH)) {
chomp $line;
my #obj = split('\|',$line);
if($RepeatNumber{$obj[0]}++) {
my $str1= join("|",$obj[0]);
my $str2=join(",",$obj[2],$obj[3]);
print OUTFILE join("|",$str1,$str2);
print OUTFILE "\n";
}
}

This should do the trick:
use strict;
use warnings;
my $file_in = "doctor.txt";
open (FF, "<$file_in");
my $temp = <FF>; # remove first line
my %out;
while (<FF>)
{
my ($id, $Name, $app1, $app2) = split /\|/, $_;
$out{$id}[0] = $Name;
push #{$out{$id}[1]}, $app1;
push #{$out{$id}[2]}, $app2;
}
foreach my $key (keys %out)
{
print $key, "|", $out{$key}[0], "|", join (",", #{$out{$key}[1]}), "|", join (",", #{$out{$key}[2]}), "\n";
}
EDIT
To see what the %out contains (in case it's not clear), you can use
use Data::Dumper;
and print it via
print Dumper(%out);

I'd tackle it like this:
#!/usr/bin/env perl
use strict;
use warnings;
use Data::Dumper;
use 5.14.0;
my %stuff;
#extract the header row.
#use the regex to remove the linefeed, because
#we can't chomp it inline like this.
#works since perl 5.14
#otherwise we could just chomp (#header) later.
my ( $id, #header ) = split( /\|/, <DATA> =~ s/\n//r );
while (<DATA>) {
#turn this row into a hash of key-values.
my %row;
( $id, #row{#header} ) = split(/\|/);
#print for diag
print Dumper \%row;
#iterate each key, and insert into $row.
foreach my $key ( keys %row ) {
push( #{ $stuff{$id}{$key} }, $row{$key} );
}
}
#print for diag
print Dumper \%stuff;
print join ("|", "id", #header ),"\n";
#iterate ids in the hash
foreach my $id ( sort keys %stuff ) {
#join this record by '|'.
print join('|',
$id,
#turn inner arrays into comma separated via map.
map {
my %seen;
#use grep to remove dupes - e.g. "abc,abc" -> "abc"
join( ",", grep !$seen{$_}++, #$_ )
} #{ $stuff{$id} }{#header}
),
"\n";
}
__DATA__
id|Name|app1|app2
1|abc|234|231|
2|xyz|123|215|
1|abc|265|321|
3|asd|213|235|
This is perhaps a bit overkill for your application, but it should handle arbitrary column headings and arbitary numbers of duplicates. I'll coalesce them though - so the two abc entries don't end up abc,abc.
Output is:
id|Name|app1|app2
1|abc|234,265|231,321
2|xyz|123|215
3|asd|213|235

Another way of doing it which doesn't use a hash (in case you want to be more memory efficient), my contribution lies under the opens:
#!/usr/bin/perl
use strict;
use warnings;
my $basedir = 'E:\Perl\Input\\';
my $file ='doctor.txt';
open(OUTFILE, '>', 'E:\Perl\Output\DoctorOpFile.csv') || die $!;
select(OUTFILE);
open(FH, '<', join('', $basedir, $file)) || die $!;
print(scalar(<FH>));
my #lastobj = (undef);
foreach my $obj (sort {$a->[0] <=> $b->[0]}
map {chomp;[split('|')]} <FH>) {
if(defined($lastobj[0]) &&
$obj[0] eq $lastobj[0])
{#lastobj = (#obj[0..1],
$lastobj[2].','.$obj[2],
$lastobj[3].','.$obj[3])}
else
{
if($lastobj[0] ne '')
{print(join('|',#lastobj),"|\n")}
#lastobj = #obj[0..3];
}
}
print(join('|',#lastobj),"|\n");
Note that split, without it's third argument ignores empty elements, which is why you have to add the last bar. If you don't do a chomp, you won't need to supply the bar or the trailing hard return, but you would have to record $obj[4].

Related

Reading text file into hash and accessing values perl

I am trying to read text file content into hash but having some problem reading as well as accessing it.
resctrl_top
/path/to/a/
vdm05top
/path/to/b/
/path/to/c/
/path/to/d/
/path/to/e/
/path/to/f/
The file format will be as above. My desired output is a hash with the non spacing line as key, and the path lines as values. I would like to know also how to access each values for different keys.
resctrl_top => /path/to/a/
vdm05top => /path/to/b/,/path/to/c/,...
Below are the effort I tried:
use strict;
use warnings;
my %hash;
open FILE, "filename.txt" or die $!;
my $key;
while (my $line = <FILE>) {
chomp($line);
if ($line !~ /^\s/) {
($key) = $line =~ /^\S+/g;
$hash{$key} = [];
} else {
$line =~ s/^\s+//;
push #{ $hash{$key} }, $line;
}
}
close FILE;
foreach (keys %hash){
print "$key => $hash{$key}\n";
}
Try this way:
use strict;
use warnings;
use Data::Dumper;
my %hash;
my $key;
while (my $line = <DATA>) {
chomp($line);
if ($line !~ /^\s/) {
$key = $line;
} else {
$line =~ s/\s//g;
push (#{$hash{$key}} , $line);
}
}
my %final;
foreach my $k (keys %hash){
my $val = join(",", #{$hash{$k}});
$final{$k} = $val; #New hash will have key and respective values
}
print Dumper(\%final);
__DATA__
resctrl_top
/path/to/a/
vdm05top
/path/to/b/
/path/to/c/
/path/to/d/
/path/to/e/
/path/to/f/
Result:
$VAR1 = {
'vdm05top' => '/path/to/b/,/path/to/c/,/path/to/d/,/path/to/e/,/path/to/f/',
'resctrl_top' => '/path/to/a/'
};
Hope this solves your problem.
Here's a pretty simple solution.
#!/usr/bin/perl
use strict;
use warnings;
use feature 'say';
use Data::Dumper; # Just for output
my ($key, %hash); # Declare globals
while (<DATA>) { # Quick hack - read from DATA
chomp;
if (/^\s/) { # If the line starts with a space
s/^\s+//;
push #{$hash{$key}}, $_;
} else { # The line is a key
$key = $_;
}
}
say Dumper \%hash;
__DATA__
resctrl_top
/path/to/a/
vdm05top
/path/to/b/
/path/to/c/
/path/to/d/
/path/to/e/
/path/to/f/

how to display the hash value from my sample data

I'm learning perl at the moment, i wanted to ask help to answer this exercise.
My objective is to display the hash value of PartID 1,2,3
the sample output is displaying lot, wafer, program, version, testnames, testnumbers, hilimit, lolimit and partid values only.
sample data
lot=lot123
wafer=1
program=prgtest
version=1
Testnames,T1,T2,T3
Testnumbers,1,2,3
Hilimit,5,6,7
Lolimit,1,2,3
PartID,,,,
1,3,0,5
2,4,3,2
3,5,6,3
This is my code:
#!/usr/bin/perl
use strict;
use Getopt::Long;
my $file = "";
GetOptions ("infile=s" => \$file ) or die("Error in command line arguments\n");
my $lotid = "";
open(DATA, $file) or die "Couldn't open file $file";
while(my $line = <DATA>) {
#print "$line";
if ( $line =~ /^lot=/ ) {
#print "$line \n";
my ($dump, $lotid) = split /=/, $line;
print "$lotid\n";
}
elsif ($line =~ /^program=/ ) {
my ($dump, $progid) = split /=/, $line;
print "$progid \n";
}
elsif ($line =~ /^wafer=/ ) {
my ($dump, $waferid) = split /=/, $line;
print "$waferid \n";
}
elsif ($line =~ /^version=/ ) {
my ($dump, $verid) = split /=/, $line;
print "$verid \n";
}
elsif ($line =~ /^testnames/i) {
my ($dump, #arr) = split /\,/, $line;
foreach my $e (#arr) {
print $e, "\n";
}
}
elsif ($line =~ /^testnumbers/i) {
my ($dump, #arr1) = split /\,/, $line;
foreach my $e1 (#arr1) {
print $e1, "\n";
}
}
elsif ($line =~ /^hilimit/i) {
my ($dump, #arr2) = split /\,/, $line;
foreach my $e2 (#arr2) {
print $e2, "\n";
}
}
elsif ($line =~ /^lolimit/i) {
my ($dump, #arr3) = split /\,/, $line;
foreach my $e3 (#arr3) {
print $e3, "\n";
}
}
}
Kindly help add to my code to display Partid 1,2,3 hash.
So I've rewritten your code a little to use a few more modern Perl idioms (along with some comments to explain what I've done). The bit I've added is near the bottom.
#!/usr/bin/perl
use strict;
# Added 'warnings' which you should always use
use warnings;
# Use say() instead of print()
use feature 'say';
use Getopt::Long;
my $file = "";
GetOptions ("infile=s" => \$file)
or die ("Error in command line arguments\n");
# Use a lexical variable for a filehandle.
# Use the (safer) 3-argument version of open().
# Add $! to the error message.
open(my $fh, '<', $file) or die "Couldn't open file $file: $!";
# Read each record into $_ - which makes the following code simpler
while (<$fh>) {
# Match on $_
if ( /^lot=/ ) {
# Use "undef" instead of a $dump variable.
# split() works on $_ by default.
my (undef, $lotid) = split /=/;
# Use say() instead of print() - less punctuation :-)
say $lotid;
}
elsif ( /^program=/ ) {
my (undef, $progid) = split /=/;
say $progid;
}
elsif ( /^wafer=/ ) {
my (undef, $waferid) = split /=/;
say $waferid;
}
elsif ( /^version=/ ) {
my (undef, $verid) = split /=/;
say $verid;
}
elsif ( /^testnames/i) {
my (undef, #arr) = split /\,/;
# Changed all of these similar pieces of code
# to use the same variable names. As they are
# defined in different code blocks, they are
# completely separate variables.
foreach my $e (#arr) {
say $e;
}
}
elsif ( /^testnumbers/i) {
my (undef, #arr) = split /\,/;
foreach my $e (#arr) {
say $e;
}
}
elsif ( /^hilimit/i) {
my (undef, #arr) = split /\,/;
foreach my $e (#arr) {
say $e;
}
}
elsif ( /^lolimit/i) {
my (undef, #arr) = split /\,/;
foreach my $e (#arr) {
say $e;
}
}
# And here's the new bit.
# If we're on the "partid" line, then read the next
# three lines, split each one and print the first
# element from the list returned by split().
elsif ( /^partid/i) {
say +(split /,/, <$fh>)[0] for 1 .. 3;
}
}
Update: By the way, there are no hashes anywhere in this code :-)
Update 2: I've just realised that you only have three different ways to process the data. So you can simplify your code drastically by using slightly more complex regexes.
#!/usr/bin/perl
use strict;
use warnings;
use feature 'say';
use Getopt::Long;
my $file = "";
GetOptions ("infile=s" => \$file)
or die ("Error in command line arguments\n");
open(my $fh, '<', $file) or die "Couldn't open file $file: $!";
while (<$fh>) {
# Single value - just print it.
if ( /^(?:lot|program|wafer|version)=/ ) {
my (undef, $value) = split /=/;
say $value;
}
# List of values - split and print.
elsif ( /^(?:testnames|testnumbers|hilimit|lolimit)/i) {
my (undef, #arr) = split /\,/;
foreach my $e (#arr) {
say $e;
}
}
# Extract values from following lines.
elsif ( /^partid/i) {
say +(split /,/, <$fh>)[0] for 1 .. 3;
}
}

I need the output in following way in perl

# File-
# a,b,c,d,e,f
# 1,2,3,4,3,2
# 9,8,7,6,5,0
# 2,3,4,6,7,8
# i need output like this:-
# a=1,d=4,c=3,a=9,d=6,c=7,a=2,d=6,c=4
# but my program gives this:-
# a=1,d=4,c=3a=9,d=6,c=7a=2,d=6,c=4 (there is no , after c and a)
my script is :-
open ($fh, 'parse.txt');
my #arr;
my $dummy=<$fh>;
while (<$fh>) {
chomp;
$a = substr $_, 0,1;
$b = substr $_, 6,1;
$c = substr $_, 4,1;
print "a=$a,d=$b,c=$c";
}
close (IN);
my $prefix = '';
while (<$fh>) {
chomp;
my #fields = split /,/;
print $prefix."a=$fields[0],d=$fields[3],c=$fields[2]";
$prefix = ',';
}
print("\n");
or
my #recs;
while (<$fh>) {
chomp;
my #fields = split /,/;
push #recs, "a=$fields[0],d=$fields[3],c=$fields[2]";
}
print(join(',', #recs), "\n");
Instead of printing out the values you could append them to a string and include a comma after the "c" value. Then at the end of the loop, erase the final comma from the string and print it out. There are some scalability problems if your input file is too large. But if it's a reasonable size there shouldn't be any substantial issue.
my $output;
my $dummy=<$fh>;
while (<$fh>) {
chomp;
$a = substr $_, 0,1;
$b = substr $_, 6,1;
$c = substr $_, 4,1;
$output .= "a=$a,d=$b,c=$c,";
}
chop $output;
print $output;
If you have fields with separators split the line and collect needed elements
use warnings;
use strict;
use feature 'say';
my $file = 'parse.txt';
open my $fh, '<', $file or die "Can't open $file: $!";
my $dummy = <$fh>;
my #res;
while (<$fh>)
{
my ($a, $d, $c) = (split /,/)[0,3,2];
push #res, "a=$a,d=$d,c=$c";
}
say join ',', #res;
or pick the order in the assignment
my ($a, $c, $d) = (split /,/)[0,2,3];

Print a variable which is inside two loops

I couldn't figure it out how to escape this.
I would like to print the variable $rfam_column, which is inside two loops. But I cannot just write the print command right after the place where $rfam_column appears, because I would like to print other things which will be outside the loop and combine them to the printed content.
I would appreciate any advice as to what I'm doing wrong here.
use warnings;
use strict;
my $in;
GetOptions('input' => \$in) or die;
if ( $in ) {
my $input = $ARGV[0] or die;
open (my $fh, '<', $input) or die "Can't open $input $!\n";
chomp (my #db_file = <$fh>);
close $fh;
my #list = grep /RNA/, #db_file;
my $column;
my #column = ();
foreach $column ( #list ) {
my #all_columns = split (/\t/, $column);
my $rfam_column = $all_columns[0];
# insert "|" between RFs
foreach $_ ( $rfam_column ) {
s/^/|/;
}
}
}
print "$rfam_column";
Global symbol "$rfam_column" requires explicit package name at script_vbeta.pl line 90.
Execution of script_vbeta.pl aborted due to compilation errors.
EDITED to include all the code and information of the input--output as suggested:
Input file is a table with n lines vs n columns like this (I extracted a few columns otherwise it would be much long to represent in a line):
RF00001 1302 5S ribosomal RNA
RF00006 1307 Vault RNA
RF00007 1308 U12 minor spliceosomal RNA
RF00008 1309 Hammerhead ribozyme (type III)
Output should be like this:
|RF00001|RF00006|RF00007
And the code (usage: script.pl -i input_file):
use warnings;
use strict;
use Getopt::Long;
Getopt::Long::Configure("pass_through");
my $in;
GetOptions('input' => \$in) or die;
if ( $in ) {
my $input = $ARGV[0] or die;
open (my $fh, '<', $input) or die "Can't open $input $!\n";
chomp (my #db_file = <$fh>);
close $fh;
my #list = grep /RNA/, #db_file;
my $column;
my #column = ();
foreach $column ( #list ) {
my #all_columns = split (/\t/, $column);
my $rfam_column = $all_columns[0];
# insert "|" between RFs
foreach $_ ( $rfam_column ) {
s/^/|/;
}
}
}
print "$rfam_column";
I think you want
if ($in) {
...
my #rfams;
for my $row (#list) {
my #fields = split(/\t/, $row);
my $rfam = $fields[0];
push #rfams, $rfam;
}
my $rfams = join('|', #rfams);
print("$rfams\n");
}
I would like to print other things which will be outside the loop and combine them to the $rfam_column content
You can include anything that is in an outer scope in print. You can just put your print statement inside the inner loop
By the way, I don't know what you mean by
# insert "|" between RFs
foreach $_ ($rfam_column) {
s/^/|/;
}
That is the same as
$rfam_column =~ s/^/|/;
which just adds a pipe | character to the beginning of the string
What is an RF?

Perl - need to store the column values into hash

I want to create a hash with column header as hash key and column values as hash values in Perl.
For example, if my csv file has the following data:
A,B,C,D,E
1,2,3,4,5
6,7,8,9,10
11,12,13,14,15 ...
I want to create a hash as follows:
A=> 1,6,11
B=>2,7,12
c=>3,8,13 ...
So that just by using the header name I can use that column values.
Is there a way in PERL to do this? Please help me.
I was able to store required column values as array using the following script
use strict;
use warnings;
open( IN, "sample.csv" ) or die("Unable to open file");
my $wanted_column = "A";
my #cells;
my #colvalues;
my $header = <IN>;
my #column_names = split( ",", $header );
my $extract_col = 0;
for my $header_line (#column_names) {
last if $header_line =~ m/$wanted_column/;
$extract_col++;
}
while ( my $row = <IN> ) {
last unless $row =~ /\S/;
chomp $row;
#cells = split( ",", $row );
push( #colvalues, $cells[$extract_col] );
}
my $sizeofarray = scalar #colvalues;
print "Size of the coulmn= $sizeofarray";
But I want to do this to all my column.I guess Hash of arrays will be the best solution but I dont know how to implement it.
Text::CSV is a useful helper module for this sort of thing.
use strict;
use warnings;
use Text::CSV;
use Data::Dumper;
my %combined;
open( my $input, "<", "sample.csv" ) or die("Unable to open file");
my $csv = Text::CSV->new( { binary => 1 } );
my #headers = #{ $csv->getline($input) };
while ( my $row = $csv->getline($input) ) {
for my $header (#headers) {
push( #{ $combined{$header} }, shift(#$row) );
}
}
print Dumper \%combined;
Since you requested without a module - you can use split but you need to bear in mind the limitations. CSV format allows for things like commas nested in quotes. split won't handle that case very well.
use strict;
use warnings;
use Data::Dumper;
my %combined;
open( my $input, "<", "sample.csv" ) or die("Unable to open file");
my $line = <$input>;
chomp ( $line );
my #headers = split( ',', $line );
while (<$input>) {
chomp;
my #row = split(',');
for my $header (#headers) {
push( #{ $combined{$header} }, shift(#row) );
}
}
print Dumper \%combined;
Note: Both of these will effectively ignore any extra columns that don't have headers. (And will get confused by duplicate column names).
Another solution by using for loop :
use strict;
use warnings;
my %data;
my #columns;
open (my $fh, "<", "file.csv") or die "Can't open the file : ";
while (<$fh>)
{
chomp;
my #list=split(',', $_);
for (my $i=0; $i<=$#list; $i++)
{
if ($.==1) # collect the columns, if its first line.
{
$columns[$i]=$list[$i];
}
else #collect the data, if its not the first line.
{
push #{$data{$columns[$i]}}, $list[$i];
}
}
}
foreach (#columns)
{
local $"="\,";
print "$_=>#{$data{$_}}\n";
}
Output will be like this :
A=>1,6,11
B=>2,7,12
C=>3,8,13
D=>4,9,14
E=>5,10,15