Perl parse CSV file "fill" and "null" fields - perl

Okay - I'm going to post my entire script since I get chastised when I don't do it - even though, last time I did that I got chastised for posting the whole script. I simply need to know if the one line I originally asked about would work. ENTIRE SCRIPT (which was working just fine until the other dept gave me their data entirely differently than what we were originally told it would be) TO FOLLOW AT THE END
I'm parsing through and scrubbing a CSV file to make it ready to be loaded in a MySQL table. It is loaded through the table via someone else's "batch Java program" and if any field is empty the batch file stops with an error.
I've been told to just put in a blank space whenever there's an empty field in any record. Would something as simple as this work?
if ( ! length $fields[2] ) {
$_ = ' ' for $fields[2];
}
And would there be a way to check either various multiple fields at once? Or what might be better would be to check ALL the fields (this is after the record has been split) as the last thing I do just before writing the record back out to the CSV file.
Here's the entire script. Please don't tell me how what I'm doing within the already working script is not how you would do it. -
#!/usr/bin/perl/
use strict;
use warnings;
use Data::Dumper;
use Time::Piece;
my $filename = 'mistints_1505_comma.csv';
#my $filename = 'test.csv';
# Open input file
open my $FH, $filename
or die "Could not read from $filename <$!>, program halting.";
# Open error handling file
open ( my $ERR_FH, '>', "errorFiles1505.csv" ) or die $!;
# Read the header line of the input file and print to screen.
chomp(my $line = <$FH>);
my #fields = split(/,/, $line);
print Dumper(#fields), $/;
my #data;
# Read the lines one by one.
while($line = <$FH>) {
chomp($line);
# Scrub data of characters that cause scripting problems down the line.
$line =~ s/[\'\\]/ /g;
# split the fields of each record
my #fields = split(/,/, $line);
# Check if the storeNbr field is empty. If so, write record to error file.
if (!length $fields[28]) {
chomp (#fields);
my $str = join ',', #fields;
print $ERR_FH "$str\n";
}
else
{
# Concatenate the first three fields and add to the beginning of each record
unshift #fields, join '_', #fields[28..30];
# Format the DATE fields for MySQL
$_ = join '-', (split /\//)[2,0,1] for #fields[10,14,24,26];
# Scrub colons from the data
$line =~ s/:/ /g;
# If Spectro_Model is "UNKNOWN", change
if($fields[22] eq "UNKNOWN"){
$_ = 'UNKNOW' for $fields[22];
}
# If tran_date is blank, insert 0000-00-00
if(!length $fields[10]){
$_ = '0000-00-00' for $fields[10];
}
# If init_tran_date is blank, insert 0000-00-00
if(!length $fields[14]){
$_ = '0000-00-00' for $fields[14];
}
# If update_tran_date is blank, insert 0000-00-00
if(!length $fields[24]){
$_ = '0000-00-00' for $fields[24];
}
# If cancel_date is blank, insert 0000-00-00
if(!length $fields[26]){
$_ = '0000-00-00' for $fields[26];
}
# Format the PROD_NBR field by deleting any leading zeros before decimals.
$fields[12] =~ s/^\s*0\././;
# put the records back
push #data, \#fields;
}
}
close $FH;
close $ERR_FH;
print "Unsorted:\n", Dumper(#data); #, $/;
#Sort the clean files on Primary Key, initTranDate, updateTranDate, and updateTranTime
#data = sort {
$a->[0] cmp $b->[0] ||
$a->[14] cmp $b->[14] ||
$a->[26] cmp $b->[26] ||
$a->[27] cmp $b-> [27]
} #data;
#open my $OFH, '>', '/swpkg/shared/batch_processing/mistints/parsedMistints.csv';
open my $OFH, '>', '/swpkg/shared/batch_processing/mistints/cleaned1505.csv';
print $OFH join(',', #$_), $/ for #data;
close $OFH;
exit;

As far as I can tell you have split a record on commas ,, and you want to alter all fields that are empty strings to contain a single space
I would write this
use strict;
use warnings 'all';
my $record = 'a,b,c,,e,,g,,i,,k,,m,n,o,p,q,r,s,t';
my #fields = map { $_ eq "" ? ' ' : $_ } split /,/, $record;
use Data::Dump;
dd \#fields;
output
[ "a", "b", "c", " ", "e", " ", "g", " ", "i", " ", "k", " ", "m" .. "t" ]
Alternatively, if you have some fields that need to be set to something different if they are empty, you can set up an array of defaults
That would look like this. All of the #defaults array is set to spaces except for fields 10, 11 and 12, which are 0000-00-00. These are picked up after the record is split
use strict;
use warnings 'all';
my #defaults = (' ') x 20;
$defaults[$_] = '0000-00-00' for 9, 10, 11;
my $record = 'a,b,c,,e,,g,,i,,k,,m,n,o,p,q,r,s,t';
my #fields = split /,/, $record;
for my $i ( 0 .. $#fields ) {
$fields[$i] = $defaults[$i] if $fields[$i] eq '';
}
use Data::Dump;
dd \#fields;
output
[ "a", "b", "c", " ", "e", " ", "g", " ", "i", "0000-00-00", "k", "0000-00-00", "m" .. "t" ]
Having seen your full program, I recommend something like this. If you had shown a sample of your input data then I could have used a hash to refer to column names instead of numbers, making it much more readable
#!/usr/bin/perl/
use strict;
use warnings 'all';
use Data::Dumper;
use Time::Piece;
my $filename = 'mistints_1505_comma.csv';
#my $filename = 'test.csv';
open my $FH, $filename
or die "Could not read from $filename <$!>, program halting.";
open( my $ERR_FH, '>', "errorFiles1505.csv" ) or die $!;
chomp( my $line = <$FH> );
my #fields = split /,/, $line; #/
print Dumper( \#fields ), "\n";
my #data;
# Read the lines one by one.
while ( <$FH> ) {
chomp;
# Scrub data of characters that cause scripting problems down the line.
tr/'\\/ /; #'
my #fields = split /,/; #/
# Check if the storeNbr field is empty. If so, write record to error file.
if ( $fields[28] eq "" ) {
my $str = join ',', #fields;
print $ERR_FH "$str\n";
next;
}
# Concatenate the first three fields and add to the beginning of each record
unshift #fields, join '_', #fields[ 28 .. 30 ];
# Format the DATE fields for MySQL
$_ = join '-', ( split /\// )[ 2, 0, 1 ] for #fields[ 10, 14, 24, 26 ];
# Scrub colons from the data
tr/://d; #/
my $i = 0;
for ( #fields ) {
# If "Spectro_Model" is "UNKNOWN" then change to "UNKNOW"
if ( $i == 22 ) {
$_ = 'UNKNOW' if $_ eq 'UNKNOWN';
}
# If a date field is blank then insert 0000-00-00
elsif ( grep { $i == $_ } 10, 14, 24, 26 ) {
$_ = '0000-00-00' if $_ eq "";
}
# Format the PROD_NBR field by deleting any leading zeros before decimals.
elsif ( $i == 12 ) {
s/^\s*0\././;
}
# Change all remaining empty fields to a single space
else {
$_ = ' ' if $_ eq "";
}
++$i;
}
push #data, \#fields;
}
close $FH;
close $ERR_FH;
print "Unsorted:\n", Dumper(#data); #, $/;
#Sort the clean files on Primary Key, initTranDate, updateTranDate, and updateTranTime
#data = sort {
$a->[0] cmp $b->[0] or
$a->[14] cmp $b->[14] or
$a->[26] cmp $b->[26] or
$a->[27] cmp $b->[27]
} #data;
#open my $OFH, '>', '/swpkg/shared/batch_processing/mistints/parsedMistints.csv';
open my $OFH, '>', '/swpkg/shared/batch_processing/mistints/cleaned1505.csv' or die $!;
print $OFH join(',', #$_), $/ for #data;
close $OFH;

Well if you did it before splitting into $fields, you ought to be able to do something like
# assuming a CSV line is in $_
#pad null at start of line
s/^,/ ,/;
#pad nulls in the middle
s/,,/, ,/g;
#pad null at the end
s/,$/, /;

Don't try to roll out your own CSV parsing code. Use Text::CSV or Text::CSV::Slurp.
With Text::CSV you could do something like
$line = $csv->string(); # get the combined string
$status = $csv->parse($line); # parse a CSV string into fields
#columns = map {defined $_ ? $_ : " "} $csv->fields(); # get the parsed fields
Are you really sure you want to replace nulls with spaces? I'd say if the field is undefined it should be NULL in db.

Related

How to check whether one file's value contains in another text file? (perl script)

I would like to check one of the file's values contains on another file. if one of the value contains it will show there is existing bin for that specific, if no, it will show there is no existing bin limit. the problem is I am not sure how to check all values at once.
first DID1 text file value contain :
L84A:D:O:M:
L84C:B:E:D:
second DID text file value contain :
L84A:B:E:Q:X:F:i:M:Y:
L84C:B:E:Q:X:F:i:M:Y:
L83A:B:E:Q:X:F:i:M:Y:
if first 4words value are match, need to check all value for that line.
for example L84A in first text file & second text file value has M . it should print out there is an existing M bin
below is my code :
use strict;
use warnings;
my $filename = 'DID.txt';
my $filename1 = 'DID1.txt';
my $count = 0;
open( FILE2, "<$filename1" )
or die("Could not open log file. $!\n");
while (<FILE2>) {
my ($number) = $_;
chomp($number);
my #values1 = split( ':', $number );
open( FILE, "<$filename" )
or die("Could not open log file. $!\n");
while (<FILE>) {
my ($line) = $_;
chomp($line);
my #values = split( ':', $line );
foreach my $val (#values) {
if ( $val =~ /$values1[0]/ ) {
$count++;
if ( $values[$count] =~ /$values1[$count]/ ) {
print
"Yes ,There is an existing bin & DID\n #values1\n";
}
else {
print "No, There is an existing bin & DID\n";
}
}
}
}
}
I cannot check all value. please help to give any advice on it since this is my first time learning for perl language. Thanks a lot :)
Based on my understanding I write this code:
use strict;
use warnings;
#use ReadWrite;
use Array::Utils qw(:all);
use vars qw($my1file $myfile1cnt $my2file $myfile2cnt #output);
$my1file = "did1.txt"; $my2file = "did2.txt";
We are going to read both first and second files (DID1 and DID2).
readFileinString($my1file, \$myfile1cnt); readFileinString($my2file, \$myfile2cnt);
In first file, as per the OP's request the first four characters should be matched with second file and then if they matched we need to check rest of the characters in the first file with the second one.
while($myfile1cnt=~m/^((\w){4})\:([^\n]+)$/mig)
{
print "<LineStart>";
my $lineChk = $1; my $full_Line = $3; #print ": $full_Line\n";
my #First_values = split /\:/, $full_Line; #print join "\n", #First_values;
If the first four digit matched then,
if($myfile2cnt=~m/^$lineChk\:([^\n]+)$/m)
{
Storing the rest of the content in the same and to be split with colon and getting the characters to be matched with first file contents.
my $FullLine = $1; my #second_values = split /:/, $FullLine;
Then search each letter first and second content which matched line...
foreach my $sngletter(#First_values)
{
If the letters are matched with first and second file its going to be printed.
if( grep {$_ eq "$sngletter"} #second_values)
{
print "Matched: $sngletter\t";
}
}
}
else { print "Not Matched..."; }
This is just information that the line end.
print "<LineEnd>\n"
}
#------------------>Reading a file
sub readFileinString
#------------------>
{
my $File = shift;
my $string = shift;
use File::Basename;
my $filenames = basename($File);
open(FILE1, "<$File") or die "\nFailed Reading File: [$File]\n\tReason: $!";
read(FILE1, $$string, -s $File, 0);
close(FILE1);
}
Read search pattern and data into hash (first field is a key), then go through data and select only field included into pattern for this key.
use strict;
use warnings;
use feature 'say';
my $input1 = 'DID1.txt'; # look for key,pattern(array)
my $input2 = 'DID.txt'; # data - key,elements(array)
my $pattern;
my $data;
my %result;
$pattern = file2hash($input1); # read pattern into hash
$data = file2hash($input2); # read data into hash
while( my($k,$v) = each %{$data} ) { # walk through data
next unless defined $pattern->{$k}; # skip those which is not in pattern hash
my $find = join '|', #{ $pattern->{$k} }; # form search pattern for grep
my #found = grep {/$find/} #{ $v }; # extract only those of interest
$result{$k} = \#found; # store in result hash
}
while( my($k,$v) = each %result ) { # walk through result hash
say "$k has " . join ':', #{ $v }; # output final result
}
sub file2hash {
my $filename = shift;
my %hash;
my $fh;
open $fh, '<', $filename
or die "Couldn't open $filename";
while(<$fh>) {
chomp;
next if /^\s*$/; # skip empty lines
my($key,#data) = split ':';
$hash{$key} = \#data;
}
close $fh;
return \%hash;
}
Output
L84C has B:E
L84A has M

Parsing string in multiline data with positive lookbehind

I am trying to parse data like:
header1
-------
var1 0
var2 5
var3 9
var6 1
header2
-------
var1 -3
var3 5
var5 0
Now I want to get e.g. var3 for header2. Whats the best way to do this?
So far I was parsing my files line-by-line via
open(FILE,"< $file");
while (my $line = <FILE>){
# do stuff
}
but I guess it's not possible to handle multiline parsing properly.
Now I am thinking to parse the file at once but wasn't successful so far...
my #Input;
open(FILE,"< $file");
while (<FILE>){ #Input = <FILE>; }
if (#Input =~ /header2/){
#...
}
The easier way to handle this is "paragraph mode".
local $/ = "";
while (<>) {
my ($header, $body) =~ /^([^\n]*)\n-+\n(.*)/s
or die("Bad data");
my #data = map [ split ], split /\n/, $body;
# ... Do something with $header and #data ...
}
The same can be achieved without messing with $/ as follows:
my #buf;
while (1) {
my $line = <>;
$line =~ s/\s+\z// if !defined($line);
if (!length($line)) {
if (#buf) {
my $header = shift(#buf);
shift(#buf);
my #data = map [ split ], splice(#buf);
# ... Do something with $header and #data ...
}
last if !defined($line);
next;
}
push #buf, $line;
}
(In fact, the second snippet includes a couple of small improvements over the first.)
Quick comments on your attempt:
The while loop is useless because #Input = <FILE> places the remaining lines of the file in #Input.
#Input =~ /header2/ matches header2 against the stringification of the array, which is the stringification of the number of elements in #Input. If you want to check of an element of #Input contains header2, will you will need to loop over the elements of #Inputs and check them individually.
while (<FILE>){ #Input = <FILE>; }
This doesn't make much sense. "While you can read a record from FILE, read all of the data on FILE into #Input". I think what you actually want is just:
my #Input = <FILE>;
if (#Input =~ /header2/){
This is quite strange too. The binding operator (=~) expects scalar operands, so it evaluates both operands in scalar context. That means #Input will be evaluated as the number of elements in #Input. That's an integer and will never match "header2".
A couple of approaches. Firstly a regex approach.
#!/usr/bin/perl
use strict;
use warnings;
use feature 'say';
my $file = 'file';
open my $fh, '<', $file or die $!;
my $data = join '', <$fh>;
if ($data =~ /header2.+var3 (.+?)\n/s) {
say $1;
} else {
say 'Not found';
}
The key to this is the /s on the m// operator. Without it, the two dots in the regex won't match newlines.
The other approach is more of a line by line parser.
#!/usr/bin/perl
use strict;
use warnings;
use feature 'say';
my $file = 'file';
open my $fh, '<', $file or die $!;
my $section = '';
while (<$fh>) {
chomp;
# if the line all word characters,
# then we've got a section header.
if ($_ !~ /\W/) {
$section = $_;
next;
}
my ($key, $val) = split;
if ($section eq 'header2' and $key eq 'var3') {
say $val;
last;
}
}
We read the file a line at a time and make a note of the section headers. For data lines, we split on whitespace and check to see if we're in the right section and have the right key.
In both cases, I've switched to using a more standard approach (lexical filehandles, 3-arg open(), or die $!) for opening the file.

Print a variable which is inside two loops

I couldn't figure it out how to escape this.
I would like to print the variable $rfam_column, which is inside two loops. But I cannot just write the print command right after the place where $rfam_column appears, because I would like to print other things which will be outside the loop and combine them to the printed content.
I would appreciate any advice as to what I'm doing wrong here.
use warnings;
use strict;
my $in;
GetOptions('input' => \$in) or die;
if ( $in ) {
my $input = $ARGV[0] or die;
open (my $fh, '<', $input) or die "Can't open $input $!\n";
chomp (my #db_file = <$fh>);
close $fh;
my #list = grep /RNA/, #db_file;
my $column;
my #column = ();
foreach $column ( #list ) {
my #all_columns = split (/\t/, $column);
my $rfam_column = $all_columns[0];
# insert "|" between RFs
foreach $_ ( $rfam_column ) {
s/^/|/;
}
}
}
print "$rfam_column";
Global symbol "$rfam_column" requires explicit package name at script_vbeta.pl line 90.
Execution of script_vbeta.pl aborted due to compilation errors.
EDITED to include all the code and information of the input--output as suggested:
Input file is a table with n lines vs n columns like this (I extracted a few columns otherwise it would be much long to represent in a line):
RF00001 1302 5S ribosomal RNA
RF00006 1307 Vault RNA
RF00007 1308 U12 minor spliceosomal RNA
RF00008 1309 Hammerhead ribozyme (type III)
Output should be like this:
|RF00001|RF00006|RF00007
And the code (usage: script.pl -i input_file):
use warnings;
use strict;
use Getopt::Long;
Getopt::Long::Configure("pass_through");
my $in;
GetOptions('input' => \$in) or die;
if ( $in ) {
my $input = $ARGV[0] or die;
open (my $fh, '<', $input) or die "Can't open $input $!\n";
chomp (my #db_file = <$fh>);
close $fh;
my #list = grep /RNA/, #db_file;
my $column;
my #column = ();
foreach $column ( #list ) {
my #all_columns = split (/\t/, $column);
my $rfam_column = $all_columns[0];
# insert "|" between RFs
foreach $_ ( $rfam_column ) {
s/^/|/;
}
}
}
print "$rfam_column";
I think you want
if ($in) {
...
my #rfams;
for my $row (#list) {
my #fields = split(/\t/, $row);
my $rfam = $fields[0];
push #rfams, $rfam;
}
my $rfams = join('|', #rfams);
print("$rfams\n");
}
I would like to print other things which will be outside the loop and combine them to the $rfam_column content
You can include anything that is in an outer scope in print. You can just put your print statement inside the inner loop
By the way, I don't know what you mean by
# insert "|" between RFs
foreach $_ ($rfam_column) {
s/^/|/;
}
That is the same as
$rfam_column =~ s/^/|/;
which just adds a pipe | character to the beginning of the string
What is an RF?

Perl - parse file - write out to two different files

I have written a Perl script to parse through a file, scrub it, and put it in a new file. Was using test data that I was originally given to work with, but now I've gotten all the actual data and it turns out there are a good deal of records I will NOT want in the newly scrubbed file (mainly because too many of the fields in those records are empty).
So I now need to check if a particular field in a record is empty and if so, write it out to an "error" file and not write it out to the scrubbed data file. Below is my script (and before people bring it up, I do not have the Text::CSV module nor will I ever have it available)
NOTE - until I tried putting the IF/ELSE statement in there, the code was working with the data I had prior to being given the actual data with these problem records.
#!/usr/bin/perl/
use strict;
use warnings;
use Data::Dumper;
use Time::Piece;
my $filename = 'uncleanData.csv';
open my $FH, $filename
or die "Could not read from $filename <$!>, program halting.";
# Read the header line.
chomp(my $line = <$FH>);
my #fields = split(/,/, $line);
print Dumper(#fields), $/;
my #data;
# Read the lines one by one.
while($line = <$FH>) {
chomp($line);
Here is the new IF statement I put in with the code below the ELSE having not changed from my prior working script -
# Check if the storeNbr field is empty. If so, write record to error file.
if (!length $fields[28]) {
open ( my $ERR_FH, '>', "errorFiles.csv" ) or die $!;
print $ERR_FH join(',', #$_), $/ for #data;
close $ERR_FH;
}
else
{
# Scrub data of characters that cause scripting problems down the line.
$line =~ s/[\'\\]/ /g;
# split the fields, concatenate fields 28-30, and add the
# concatenated field to the beginning of each line in the file
my #fields = split(/,/, $line);
unshift #fields, join '_', #fields[28..30];
# Format the DATE fields for MySQL
$_ = join '-', (split /\//)[2,0,1] for #fields[10,14,24,26];
# Scrub colons from the data
$line =~ s/:/ /g;
# If Spectro_Model is "UNKNOWN", change
if($fields[22] eq "UNKNOWN"){
$_ = 'UNKNOW' for $fields[22];
}
# If tran_date is blank, insert 0000-00-00
if(!length $fields[10]){
$_ = '0000-00-00' for $fields[10];
}
# If init_tran_date is blank, insert 0000-00-00
if(!length $fields[14]){
$_ = '0000-00-00' for $fields[14];
}
# If update_tran_date is blank, insert 0000-00-00
if(!length $fields[24]){
$_ = '0000-00-00' for $fields[24];
}
# If cancel_date is blank, insert 0000-00-00
if(!length $fields[26]){
$_ = '0000-00-00' for $fields[26];
}
# Format the PROD_NBR field by deleting any leading zeros before decimals.
$fields[12] =~ s/^\s*0\././;
# put the records back
push #data, \#fields;
}
}
close $FH;
print "Unsorted:\n", Dumper(#data); #, $/;
#Sort the clean files on Primary Key, initTranDate, updateTranDate, and updateTranTime
#data = sort {
$a->[0] cmp $b->[0] ||
$a->[14] cmp $b->[14] ||
$a->[26] cmp $b->[26] ||
$a->[27] cmp $b-> [27]
} #data;
#open my $OFH, '>', '/swpkg/shared/batch_processing/mistints/parsedMistints.csv';
open my $OFH, '>', '/swpkg/shared/batch_processing/mistints/cleaned1502.csv';
print $OFH join(',', #$_), $/ for #data;
close $OFH;
exit;
I'm guessing my problem is where I am putting the closing brace } for the ELSE part of the statement. Here are some sample records from the file with the last file being one of the "problem" records -
650096571,1,1,used as store paint,14,IFC 8012NP,Standalone-9,3596,56,1/31/2015,80813,A97W01251,,1/16/2015,0.25,0.25,,SW,CUSTOM MATCH,TRUE,O,xts,,,,,,,1568,61006,1,FALSE
650368376,1,3,Tinted Wrong Color,16,IFC 8012NP,01DX8015206,,6,1/31/2015,160720,A87W01151,MATCH,1/31/2015,1,1,ENG,CUST,CUSTOM MATCH,TRUE,O,Ci52,,,,,,,1584,137252,1,FALSE
650175433,3,1,not tinted - e.w.,16,COROB MODULA HF,Standalone-7,,2,1/31/2015,95555,B20W02651,,1/29/2015,3,3,,COMP,CUSTOM MATCH,TRUE,P,xts,,,,,,,1627,68092,5,FALSE
650187016,2,1,checked out under cash ,,,,,,,,,,,,,,,,,,,,,,,,,,,,
When I run this script, it's still processing the "error records" and throwing up all kinds of "unitialized value" warnings.
Text::CSV is useful if you need to handle quotes or embedded linefeeds. Text::ParseWords can do as a substitute instead if you need that capability.
But as long as you don't have quoting to worry about, split works just fine.
You can do something like:
#!/usr/bin/env perl
use strict;
use warnings;
open ( my $normal_fh, '>', "output.txt" ) or die $!;
open ( my $err_fh, '>', "errors.txt" ) or die $!;
while ( <> ) {
if ( ( split /,/ ) [27] =~ /\w/ ) {
select $normal_fh;
}
else {
select $err_fh;
}
print;
}

Perl : Need to append two columns if the ID's are repeating

If id gets repeated I am appending app1, app2 and printing it once.
Input:
id|Name|app1|app2
1|abc|234|231|
2|xyz|123|215|
1|abc|265|321|
3|asd|213|235|
Output:
id|Name|app1|app2
1|abc|234,265|231,321|
2|xyz|123|215|
3|asd|213|235|
Output I'm getting:
id|Name|app1|app2
1|abc|234,231|
2|xyz|123,215|
1|abc|265,321|
3|asd|213,235|
My Code:
#! usr/bin/perl
use strict;
use warnings;
my $basedir = 'E:\Perl\Input\\';
my $file ='doctor.txt';
my $counter = 0;
my %RepeatNumber;
my $pos=0;
open(OUTFILE, '>', 'E:\Perl\Output\DoctorOpFile.csv') || die $!;
open(FH, '<', join('', $basedir, $file)) || die $!;
my $line = readline(FH);
unless ($counter) {
chomp $line;
print OUTFILE $line;
print OUTFILE "\n";
}
while ($line = readline(FH)) {
chomp $line;
my #obj = split('\|',$line);
if($RepeatNumber{$obj[0]}++) {
my $str1= join("|",$obj[0]);
my $str2=join(",",$obj[2],$obj[3]);
print OUTFILE join("|",$str1,$str2);
print OUTFILE "\n";
}
}
This should do the trick:
use strict;
use warnings;
my $file_in = "doctor.txt";
open (FF, "<$file_in");
my $temp = <FF>; # remove first line
my %out;
while (<FF>)
{
my ($id, $Name, $app1, $app2) = split /\|/, $_;
$out{$id}[0] = $Name;
push #{$out{$id}[1]}, $app1;
push #{$out{$id}[2]}, $app2;
}
foreach my $key (keys %out)
{
print $key, "|", $out{$key}[0], "|", join (",", #{$out{$key}[1]}), "|", join (",", #{$out{$key}[2]}), "\n";
}
EDIT
To see what the %out contains (in case it's not clear), you can use
use Data::Dumper;
and print it via
print Dumper(%out);
I'd tackle it like this:
#!/usr/bin/env perl
use strict;
use warnings;
use Data::Dumper;
use 5.14.0;
my %stuff;
#extract the header row.
#use the regex to remove the linefeed, because
#we can't chomp it inline like this.
#works since perl 5.14
#otherwise we could just chomp (#header) later.
my ( $id, #header ) = split( /\|/, <DATA> =~ s/\n//r );
while (<DATA>) {
#turn this row into a hash of key-values.
my %row;
( $id, #row{#header} ) = split(/\|/);
#print for diag
print Dumper \%row;
#iterate each key, and insert into $row.
foreach my $key ( keys %row ) {
push( #{ $stuff{$id}{$key} }, $row{$key} );
}
}
#print for diag
print Dumper \%stuff;
print join ("|", "id", #header ),"\n";
#iterate ids in the hash
foreach my $id ( sort keys %stuff ) {
#join this record by '|'.
print join('|',
$id,
#turn inner arrays into comma separated via map.
map {
my %seen;
#use grep to remove dupes - e.g. "abc,abc" -> "abc"
join( ",", grep !$seen{$_}++, #$_ )
} #{ $stuff{$id} }{#header}
),
"\n";
}
__DATA__
id|Name|app1|app2
1|abc|234|231|
2|xyz|123|215|
1|abc|265|321|
3|asd|213|235|
This is perhaps a bit overkill for your application, but it should handle arbitrary column headings and arbitary numbers of duplicates. I'll coalesce them though - so the two abc entries don't end up abc,abc.
Output is:
id|Name|app1|app2
1|abc|234,265|231,321
2|xyz|123|215
3|asd|213|235
Another way of doing it which doesn't use a hash (in case you want to be more memory efficient), my contribution lies under the opens:
#!/usr/bin/perl
use strict;
use warnings;
my $basedir = 'E:\Perl\Input\\';
my $file ='doctor.txt';
open(OUTFILE, '>', 'E:\Perl\Output\DoctorOpFile.csv') || die $!;
select(OUTFILE);
open(FH, '<', join('', $basedir, $file)) || die $!;
print(scalar(<FH>));
my #lastobj = (undef);
foreach my $obj (sort {$a->[0] <=> $b->[0]}
map {chomp;[split('|')]} <FH>) {
if(defined($lastobj[0]) &&
$obj[0] eq $lastobj[0])
{#lastobj = (#obj[0..1],
$lastobj[2].','.$obj[2],
$lastobj[3].','.$obj[3])}
else
{
if($lastobj[0] ne '')
{print(join('|',#lastobj),"|\n")}
#lastobj = #obj[0..3];
}
}
print(join('|',#lastobj),"|\n");
Note that split, without it's third argument ignores empty elements, which is why you have to add the last bar. If you don't do a chomp, you won't need to supply the bar or the trailing hard return, but you would have to record $obj[4].