match variable name in reading each line of a file to create view ddl - perl

I have an input file,
TableName1.Column1
TableName1.Column2
TableName2.Column1
TableName2.Column2
TableName3.Column3 etc
I would like to read each line and work out which columns belong to TableName1, so I can build a view DDL like this: CREATE VIEW TABLENAME1 AS SELECT Column1, Column2 From TableName1; and next will be the view for TableName2, etc.
# Asker's original attempt, kept as posted with only the array sigils restored.
# NOTE: $first is taken from the very line it is then compared against, so the
# "Same Table" branch always runs and "Next Table" is never printed.
my $file = "summary.csv";
open (my $FH, '<', $file) or die "Can't open '$file' for read: $!";
# Read every "Table.Column" line into @lines.
my @lines;
while (my $line = <$FH>) {
    push (@lines, $line);
}
close $FH or die "Cannot close $file: $!";
my $ln=@lines;    # number of lines read
for (my $x=0; $x<$ln; $x++){
    print("---Start->\n") if($x == 0);
    print "---------------->\n";
    my $first = (split /\./, $lines[$x] )[0];    # table name of the current line
    my $second = $first;
    print "Second is: $second \n";
    if ((split /\./, $lines[$x] )[0] eq $first )
    {
        print "Same Table: $lines[$x]";
    }
    else
    {
        print "Next Table: $lines[$x]";
    }
    print("---End-->\n") if($x == $ln -1);
}

I'd do it something like this.
Parse the data into a data structure. I'm using an array of anonymous arrays. In the anonymous arrays, the first element is the table name and any other elements are columns.
#!/usr/bin/perl
use strict;
use warnings;
use feature 'say';
# Each element of @tables is an array ref: [ table_name, column, column, ... ].
my @tables;
my $curr_table = '';
# Note: I used a DATA filehandle to test this. You'll need to
# insert your file-opening code here.
while (<DATA>) {
    chomp;
    next unless /\S/;    # ignore blank lines
    my ($table, $column) = split /\./;
    # A change of table name starts a new entry, so the input must keep
    # each table's lines together.
    if ($table ne $curr_table) {
        push @tables, [ $table ];
        $curr_table = $table;
    }
    push @{ $tables[-1] }, $column;
}
And then walk the data structure to do whatever you want with the data (here, I'm just displaying it).
# Walk @tables: the first element of each entry is the table name,
# the remaining elements are its columns.
for my $t (@tables) {
    my ($table, @columns) = @{ $t };
    say "Table: $table";
    say " * $_" for @columns;
}

Related

How to filter columns from CSV file based on names of columns

I am using the CSV data like below. I don't want to use user and timestamp from csv file. I may add few columns or delete columns.
I didn't find any suitable method in Text::CSV.
Please let me know if any method or module is available
UniqueId, Name, description, user,timestamp
1,jana,testing,janardar,12-10-2018:00:
# Copy the CSV file $csvfile to "$ENV{HOME}/output/temp_test.csv", leaving out
# every column whose (whitespace-trimmed) header matches one of the unwanted
# name patterns (user, timestamp). The header row is copied too.
#
# Takes:   path of the input CSV file.
# Returns: path of the filtered output file.
# Dies if a file cannot be opened or closed.
sub _filter_common_columns_from_csv {
    my $csvfile = shift;

    # NOTE(review): the original also passed allow_quotes => 0. That is not a
    # documented Text::CSV_XS attribute, and unknown attributes make new()
    # fail, so it has been dropped.
    my $CSV = Text::CSV_XS->new(
        {
            binary    => 1,
            auto_diag => 3,
            eol       => $/,
        });

    # Read only the header row and trim surrounding whitespace from each name.
    open(my $header_fh, '<', $csvfile) or die $!;
    my @columns = @{ $CSV->getline($header_fh) };
    close $header_fh or die $!;
    for (@columns) { s/^\s+//; s/\s+$//; }

    # Record the index of every column whose name matches an unwanted pattern.
    # The range runs to $#columns so the last column is examined as well.
    my %deleted;
    my @regexes = qw(user timestamp);
    foreach my $regex (@regexes) {
        foreach my $i (0 .. $#columns) {
            $deleted{$i} = $columns[$i] if $columns[$i] =~ /$regex/;
        }
    }

    # exists, not truthiness: a deleted column could be named "0" or "".
    my @wanted_columns = grep { !exists $deleted{$_} } 0 .. $#columns;

    my $input_temp = "$ENV{HOME}/output/temp_test.csv";
    open my $tem, ">",$input_temp or die "$input_temp: $!";
    open(my $fh, '<', $csvfile) or die $!;
    while (my $row = $CSV->getline($fh)) {
        my @fields = @$row;
        $CSV->print($tem, [ @fields[@wanted_columns] ]) or $CSV->error_diag;
    }
    close $fh or die $!;
    close $tem or die $!;
    return $input_temp;
}
See getline_hr
use warnings;
use strict;
use feature 'say';
use List::MoreUtils qw(any);
use Text::CSV;
# Print every row of the CSV file named on the command line, without the
# columns listed in @exclude_cols.
my $file = shift @ARGV or die "Usage: $0 filename\n";
my @exclude_cols = qw(user timestamp);
my $csv = Text::CSV->new ( { binary => 1 } )
    or die "Cannot use CSV: ".Text::CSV->error_diag ();
open my $fh, '<', $file or die "Can't open $file: $!";
# The header row supplies the column names. Trim stray whitespace so that a
# header written as "a, user" still matches the exclusion list.
my @cols = @{ $csv->getline($fh) };
s/^\s+|\s+$//g for @cols;
my @wanted_cols = grep {
    my $name = $_;
    not any { $name eq $_ } @exclude_cols;
} @cols;
# Bind each column to the hash slot of the same name; getline then fills
# %$row in place on every call.
my $row = {};
$csv->bind_columns (\@{$row}{@cols});
while ($csv->getline($fh)) {
    my @wanted_fields = @$row{ @wanted_cols };
    say "@wanted_fields";
}
close $fh;
The syntax @$row{@wanted_cols} is a hash slice, which returns the list of values for the keys in @wanted_cols from the hashref $row.
Actual example using Text::AutoCSV to remove given named columns from arbitrary CSV files like in your posted code (More complicated than the examples in the documentation for only writing specific columns):
#!/usr/bin/perl
use warnings;
use strict;
use Text::AutoCSV qw/remove_accents/;
# Copy $infile to $outfile, leaving out the columns named in the array
# reference $drop.
sub remove_columns {
    my ($infile, $outfile, $drop) = @_;
    my $csv = Text::AutoCSV->new(in_file => $infile, out_file => $outfile);
    # Normalize column names the same way that Text::AutoCSV does
    my %drops = map { my $h = remove_accents $_;
                      $h =~ s/[^[:alnum:]_]//gi;
                      $h = uc $h;
                      $h => 1 } @$drop;
    my @cols = grep { not exists $drops{$_} } $csv->get_fields_names;
    # Hack to avoid reading the file twice.
    $csv->{out_fields} = \@cols;
    $csv->write();
}
remove_columns "in.csv", "out.csv", [ "user", "timestamp" ];
If you want to modify your CSV in other ways, too, and if SQL would be convenient for those modifications, then consider using DBD::CSV.
You can then open a database handle on your CSV file, select the desired columns with a SELECT query, and write the results with Text::CSV or Text::CSV_XS.
For more details, see the DBD::CSV documentation or e.g. this simple wrapper script for querying CSV files.

Print a variable which is inside two loops

I couldn't figure it out how to escape this.
I would like to print the variable $rfam_column, which is inside two loops. But I cannot just write the print command right after the place where $rfam_column appears, because I would like to print other things which will be outside the loop and combine them to the printed content.
I would appreciate any advice as to what I'm doing wrong here.
# Asker's original script, kept as posted with only the array sigils restored.
use warnings;
use strict;
my $in;
GetOptions('input' => \$in) or die;
if ( $in ) {
    my $input = $ARGV[0] or die;
    open (my $fh, '<', $input) or die "Can't open $input $!\n";
    chomp (my @db_file = <$fh>);
    close $fh;
    my @list = grep /RNA/, @db_file;
    my $column;
    my @column = ();
    foreach $column ( @list ) {
        my @all_columns = split (/\t/, $column);
        my $rfam_column = $all_columns[0];
        # insert "|" between RFs
        foreach $_ ( $rfam_column ) {
            s/^/|/;
        }
    }
}
# $rfam_column was declared with "my" inside the foreach body above, so it is
# out of scope here and strict rejects this line at compile time.
print "$rfam_column";
Global symbol "$rfam_column" requires explicit package name at script_vbeta.pl line 90.
Execution of script_vbeta.pl aborted due to compilation errors.
EDITED to include all the code and information of the input--output as suggested:
Input file is a table with n lines vs n columns like this (I extracted a few columns otherwise it would be much long to represent in a line):
RF00001 1302 5S ribosomal RNA
RF00006 1307 Vault RNA
RF00007 1308 U12 minor spliceosomal RNA
RF00008 1309 Hammerhead ribozyme (type III)
Output should be like this:
|RF00001|RF00006|RF00007
And the code (usage: script.pl -i input_file):
# Asker's complete script, kept as posted with only the array sigils restored.
use warnings;
use strict;
use Getopt::Long;
Getopt::Long::Configure("pass_through");
my $in;
GetOptions('input' => \$in) or die;
if ( $in ) {
    my $input = $ARGV[0] or die;
    open (my $fh, '<', $input) or die "Can't open $input $!\n";
    chomp (my @db_file = <$fh>);
    close $fh;
    # Keep only the lines that mention RNA.
    my @list = grep /RNA/, @db_file;
    my $column;
    my @column = ();
    foreach $column ( @list ) {
        my @all_columns = split (/\t/, $column);
        my $rfam_column = $all_columns[0];
        # insert "|" between RFs
        foreach $_ ( $rfam_column ) {
            s/^/|/;
        }
    }
}
# $rfam_column was declared with "my" inside the foreach body above, so it is
# out of scope here and strict rejects this line at compile time.
print "$rfam_column";
I think you want
if ($in) {
    ...
    # Collect the first tab-separated field of every row, then join them
    # once after the loop has finished.
    my @rfams;
    for my $row (@list) {
        my @fields = split(/\t/, $row);
        my $rfam = $fields[0];
        push @rfams, $rfam;
    }
    my $rfams = join('|', @rfams);
    print("$rfams\n");
}
I would like to print other things which will be outside the loop and combine them to the $rfam_column content
You can include anything that is in an outer scope in print. You can just put your print statement inside the inner loop
By the way, I don't know what you mean by
# insert "|" between RFs
foreach $_ ($rfam_column) {
s/^/|/;
}
That is the same as
$rfam_column =~ s/^/|/;
which just adds a pipe | character to the beginning of the string
What is an RF?

Perl - need to store the column values into hash

I want to create a hash with column header as hash key and column values as hash values in Perl.
For example, if my csv file has the following data:
A,B,C,D,E
1,2,3,4,5
6,7,8,9,10
11,12,13,14,15 ...
I want to create a hash as follows:
A=> 1,6,11
B=>2,7,12
c=>3,8,13 ...
So that just by using the header name I can use that column values.
Is there a way in PERL to do this? Please help me.
I was able to store required column values as array using the following script
use strict;
use warnings;
# Collect the values of one named column of sample.csv into @colvalues.
open( my $in, '<', "sample.csv" ) or die("Unable to open file");
my $wanted_column = "A";
my @cells;
my @colvalues;
my $header = <$in>;
chomp $header;    # otherwise the last column name keeps its newline
my @column_names = split( ",", $header );
# Find the index of the wanted column by exact name.
my $extract_col = 0;
for my $header_line (@column_names) {
    last if $header_line eq $wanted_column;
    $extract_col++;
}
while ( my $row = <$in> ) {
    last unless $row =~ /\S/;    # stop at the first blank line
    chomp $row;
    @cells = split( ",", $row );
    push( @colvalues, $cells[$extract_col] );
}
close $in;
my $sizeofarray = scalar @colvalues;
print "Size of the coulmn= $sizeofarray";
But I want to do this for all my columns. I guess a hash of arrays would be the best solution, but I don't know how to implement it.
Text::CSV is a useful helper module for this sort of thing.
use strict;
use warnings;
use Text::CSV;
use Data::Dumper;
# Build %combined: header name => array ref of that column's values.
my %combined;
open( my $input, "<", "sample.csv" ) or die("Unable to open file");
my $csv = Text::CSV->new( { binary => 1 } );
my @headers = @{ $csv->getline($input) };
while ( my $row = $csv->getline($input) ) {
    # Consume the row's fields left to right, one per header.
    for my $header (@headers) {
        push( @{ $combined{$header} }, shift(@$row) );
    }
}
print Dumper \%combined;
Since you requested without a module - you can use split but you need to bear in mind the limitations. CSV format allows for things like commas nested in quotes. split won't handle that case very well.
use strict;
use warnings;
use Data::Dumper;
# Build %combined: header name => array ref of that column's values,
# using a plain split on commas (no support for quoted fields).
my %combined;
open( my $input, "<", "sample.csv" ) or die("Unable to open file");
my $line = <$input>;
chomp ( $line );
my @headers = split( ',', $line );
while (<$input>) {
    chomp;
    my @row = split(',');
    # Consume the row's fields left to right, one per header.
    for my $header (@headers) {
        push( @{ $combined{$header} }, shift(@row) );
    }
}
print Dumper \%combined;
Note: Both of these will effectively ignore any extra columns that don't have headers. (And will get confused by duplicate column names).
Another solution by using for loop :
use strict;
use warnings;
# %data maps each header name to an array ref of that column's values;
# @columns keeps the header names in file order.
my %data;
my @columns;
open (my $fh, "<", "file.csv") or die "Can't open the file : $!";
while (<$fh>)
{
    chomp;
    my @list=split(',', $_);
    for (my $i=0; $i<=$#list; $i++)
    {
        if ($.==1) # collect the columns, if its first line.
        {
            $columns[$i]=$list[$i];
        }
        else #collect the data, if its not the first line.
        {
            push @{$data{$columns[$i]}}, $list[$i];
        }
    }
}
foreach (@columns)
{
    # $" is the separator used when an array is interpolated into a string.
    local $"="\,";
    print "$_=>@{$data{$_}}\n";
}
Output will be like this :
A=>1,6,11
B=>2,7,12
C=>3,8,13
D=>4,9,14
E=>5,10,15

Perl - have a comma separated output , want to write that in a CSV

Here is the code:
# Print the matrix as comma-separated text on STDOUT: a header line, then one
# line per row name. %colnames, %rownames and %mat are set up elsewhere.
my @col= sort keys %colnames;
print "mRNA,".join(",",@col)."\n";
foreach my $row(keys %rownames){
    print "$row";
    foreach my $col(@col){
        my $num=$mat{$col}->{$row};
        $num=~s/(\.\d\d)\d+/$1/;    # keep only two digits after the decimal point
        print ",$num";
    }
    print "\n";
}
Output:
mRNA,Benzopyrene12h_replica1,Benzopyrene12h_replica2
E2F1,5.01,4.72
REV1,2.76,2.67
POLK,1.21,1.87
POLH,1.49,1.56
POLI,1.94,2.45
Please help me write this output to .csv file.
Something like this might work... Combining with Miller's answer. I didn't test it, just giving you an idea. And it definitely could be written more cleanly and less redundantly.
use strict;
use warnings;
use autodie;
use Text::CSV;
# Echo the matrix to STDOUT as before and also write it to new.csv.
# %colnames, %rownames and %mat come from the asker's own code.
my $csvFile = Text::CSV->new ( { binary => 1, eol => "\n" } )
    or die "Cannot use CSV: ".Text::CSV->error_diag ();
my @col= sort keys %colnames;
# @csv is a list of rows; each row is an array ref of cells.
my @csv;
# Header row: one cell per column name. Text::CSV adds the separators itself,
# so the cells must not be pre-joined with commas.
push @csv, [ 'mRNA', @col ];
foreach my $row(keys %rownames){
    print "$row";
    # The row name is the first cell; the values follow it.
    my @cells = ($row);
    foreach my $col(@col){
        my $num=$mat{$col}->{$row};
        $num=~s/(\.\d\d)\d+/$1/;    # keep only two digits after the decimal point
        print ",$num";
        push @cells, $num;
    }
    print "\n";
    push @csv, \@cells;
}
open my $fh, '>', "new.csv" or die "Couldn't open csv file: $! \n";
for (@csv) {
    $csvFile->print($fh, $_);
}
close $fh;
To write to a CSV file, use Text::CSV
use strict;
use warnings;
use autodie;
use Text::CSV;
# Your Data Initialization
my %colnames; # = Something
my %rownames; # = Something else
my %mat; # = a hash of hash
# Prepare CSV
my $csv = Text::CSV->new ( { binary => 1, eol => "\n" } )
    or die "Cannot use CSV: ".Text::CSV->error_diag ();
# autodie makes open and close throw on failure; the handle must still be
# declared with "my" under strict.
open my $fh, '>', "new.csv";
my @col = sort keys %colnames;
# Output Header
$csv->print($fh, ['mRNA', @col]);
# Output Rows
for my $row (keys %rownames){
    my @data = ($row);
    for my $col (@col){
        my $num = $mat{$col}{$row};
        $num =~ s/(\.\d\d)\d+/$1/;    # keep only two digits after the decimal point
        push @data, $num;
    }
    $csv->print($fh, \@data);
}
close $fh;

Reading and comparing lines in Perl

I am having trouble with getting my perl script to work. The issue might be related to the reading of the Extract file line by line within the while loop, any help would be appreciated. There are two files
Bad file that contains a list of bad IDs (100s of IDs)
2
3
Extract that contains a delimited data with the ID in field 1 (millions of rows)
1|data|data|data
2|data|data|data
2|data|data|data
2|data|data|data
3|data|data|data
4|data|data|data
5|data|data|data
I am trying to remove all the rows from the large extract file where the IDs match. There can be multiple rows where the ID matches. The extract is sorted.
# Asker's original script, kept as posted with only the array sigils restored.
#use strict;
#use warnning;
$SourceFile = $ARGV[0];
$ToRemove = $ARGV[1];
$FieldNum = $ARGV[2];
$NewFile = $ARGV[3];
$LargeRecords = $ARGV[4];
open(INFILE, $SourceFile) or die "Can't open source file: $SourceFile \n";
open(REMOVE, $ToRemove) or die "Can't open toRemove file: $ToRemove \n";
open(OutGood, "> $NewFile") or die "Can't open good output file \n";
open(OutLarge, "> $LargeRecords") or die "Can't open Large Records output file \n";
#Read in the list of bad IDs into array
@array = <REMOVE>;
#Loop through each bad record
foreach (@array)
{
    $badID = $_;
    #read the extract line by line
    # NOTE: this loop reads INFILE to the end for the first bad ID, so nothing
    # is left to read for the following ones.
    while(<INFILE>)
    {
        #take the line and split it into
        @fields = split /\|/, $_;
        my $extractID = $fields[$FieldNum];
        #print "Here's what we got: $badID and $extractID\n";
        while($extractID == $badID)
        {
            #Write out bad large records
            print OutLarge join '|', @fields;
            #Get the next line in the extract file
            @fields = split /\|/, <INFILE>;
            # NOTE: this "my" declares a new variable; the $extractID tested
            # by the enclosing while condition is never updated.
            my $extractID = $fields[$FieldNum];
            $found = 1; #true
            #print " We got a match!!";
            #remove item after it has been found
            my $input_remove = $badID;
            @array = grep {!/$input_remove/} @array;
        }
        print OutGood join '|', @fields;
    }
}
Try this:
# The BEGIN block loads the first file (bad IDs) into %bad; the -n loop then prints
# each line of the second file whose first field is not a bad ID. The -F pattern is a
# regex, so the pipe is escaped; a bare | would split the line into single characters.
$ perl -F'\|' -nae 'BEGIN {while(<>){chomp; $bad{$_}++;last if eof;}} print unless $bad{$F[0]};' bad good
First, you are lucky: The number of bad IDs is small. That means, you can read the list of bad IDs once, stick them in a hash table without running into any difficulty with memory usage. Once you have them in a hash, you just read the big data file line by line, skipping output for bad IDs.
#!/usr/bin/env perl
use strict;
use warnings;
# Driver: load the bad IDs, then print the data file without the rows
# that carry one of them.
# hardwired for convenience
my $bad_id_file = 'bad.txt';
my $data_file = 'data.txt';
# $bad_ids is a hash reference whose keys are the bad IDs.
my $bad_ids = read_bad_ids($bad_id_file);
remove_data_with_bad_ids($data_file, $bad_ids);
# Print to the currently selected output handle every line of $file whose
# leading numeric ID is not a key of the hash reference $bad. Lines that do
# not start with digits are not printed.
# Dies if $file cannot be opened or closed. Returns nothing.
sub remove_data_with_bad_ids {
    my ($file, $bad) = @_;

    open my $in, '<', $file
        or die "Cannot open '$file': $!";

    while (defined(my $line = <$in>)) {
        # extract_id yields an empty list when the line has no leading ID.
        my @found = extract_id(\$line);
        next unless @found;
        print $line unless exists $bad->{ $found[0] };
    }

    close $in
        or die "Cannot close '$file': $!";

    return;
}
# Read $file and return a hash reference whose keys are the leading numeric
# IDs found on its lines (the values are all undef). Lines without a leading
# ID are ignored. Dies if $file cannot be opened or closed.
sub read_bad_ids {
    my ($file) = @_;

    open my $in, '<', $file
        or die "Cannot open '$file': $!";

    my %bad;
    while (defined(my $line = <$in>)) {
        # extract_id yields an empty list when the line has no leading ID.
        my @found = extract_id(\$line);
        next unless @found;
        $bad{ $found[0] } = undef;
    }

    close $in
        or die "Cannot close '$file': $!";

    return \%bad;
}
# Given a reference to a string, return the run of digits at the very start
# of that string. Returns nothing (empty list, or undef in scalar context)
# when the string does not begin with a digit.
sub extract_id {
    my ($line_ref) = @_;
    return unless ${$line_ref} =~ m{\A ([0-9]+) }x;
    return $1;
}
I'd use a hash as follows:
use warnings;
use strict;
# Lookup table of bad IDs.
my @bad = qw(2 3);
my %bad = map { $_ => 1 } @bad;
my @file = qw (1|data|data|data 2|data|data|data 2|data|data|data 2|data|data|data 3|data|data|data 4|data|data|data 5|data|data|data);
# Print each row directly instead of storing rows in a hash keyed by ID:
# that keeps the input order and does not collapse good rows that share an ID.
foreach my $line (@file){
    my ($id) = split(/\|/, $line);
    print "$line\n" unless exists $bad{$id};
}
Which gives:
   
1|data|data|data
4|data|data|data
5|data|data|data