perl: persist set of strings with commit support - perl

I have a set of strings that is modified inside a loop of 25k iterations. It's empty at the beginning, but 0-200 strings are randomly added or removed from it in each cycle. At the end, the set contains about 80k strings.
I want to make it resumable. The set should be saved to disk after each cycle and be loaded on resume.
What library can I use? The amount of raw data is ~16M, but the changes are usually small. I don't want it to rewrite the whole store on each iteration.
Since the strings are paths, I'm thinking of storing them in a log file like this:
+a
+b
commit
-b
+d
commit
In the beginning the file is loaded into a hash and then compacted. If there's no commit line in the end, the last block is not taken into account.

The Storable package brings persistence to your Perl data structures (SCALAR, ARRAY, HASH or REF objects), i.e. anything that can be conveniently stored to disk and retrieved at a later time.

I've decided to put away the heavy artillery and write something simple:
package LoL::IMadeADb;
sub new {
my $self;
( my $class, $self->{dbname} ) = #_;
# open for read, then write. create if not exist
#msg "open $self->{dbname}";
open(my $fd, "+>>", $self->{dbname}) or die "cannot open < $self->{dbname}: $!";
seek($fd, 0, 0);
$self->{fd} = $fd;
#msg "opened";
$self->{paths} = {};
my $href = $self->{paths};
$self->{nlines} = 0;
my $lastcommit = 0;
my ( $c, $rest );
while(defined($c = getc($fd)) && substr(($rest = <$fd>), -1) eq "\n") {
$self->{nlines}++;
chomp($rest);
if ($c eq "c") {
$lastcommit = tell($fd);
#msg "lastcommit: " . $lastcommit;
} elsif ($c eq "+") {
$href->{$rest} = undef;
} elsif ($c eq "-") {
delete $href->{$rest};
}
#msg "line: '" . $c . $rest . "'";
}
if ($lastcommit < tell($fd)) {
print STDERR "rolling back incomplete file: " . $self->{dbname} . "\n";
seek($fd, $lastcommit, 0);
while(defined($c = getc($fd)) && substr(($rest = <$fd>), -1) eq "\n") {
$self->{nlines}--;
chomp($rest);
if ($c eq "+") {
delete $href->{$rest};
} else {
$href->{$rest} = undef;
}
}
truncate($fd, $lastcommit) or die "cannot truncate $self->{dbname}: $!";
print STDERR "rolling back incomplete file; done\n";
}
#msg "entries = " . (keys( %{ $href })+0) . ", nlines = " . $self->{nlines} . "\n";
bless $self, $class
}
sub add {
my ( $self , $path ) = #_;
if (!exists $self->{paths}{$path}) {
$self->{paths}{$path} = undef;
print { $self->{fd} } "+" . $path . "\n";
$self->{nlines}++;
$self->{changed} = 1;
}
undef
}
sub remove {
my ( $self , $path ) = #_;
if (exists $self->{paths}{$path}) {
delete $self->{paths}{$path};
print { $self->{fd} } "-" . $path . "\n";
$self->{nlines}++;
$self->{changed} = 1;
}
undef
}
sub save {
my ( $self ) = #_;
return undef unless $self->{changed};
my $fd = $self->{fd};
my #keys = keys %{$self->{paths}};
if ( $self->{nlines} - #keys > 5000 ) {
#msg "compacting";
close($fd);
my $bkpdir = dirname($self->{dbname});
($fd, my $bkpname) = tempfile(DIR => $bkpdir , SUFFIX => ".tmp" ) or die "cannot create backup file in: $bkpdir: $!";
$self->{nlines} = 1;
for (#keys) {
print { $fd } "+" . $_ . "\n" or die "cannot write backup file: $!";
$self->{nlines}++;
}
print { $fd } "c\n";
close($fd);
move($bkpname, $self->{dbname})
or die "cannot rename " . $bkpname . " => " . $self->{dbname} . ": $!";
open($self->{fd}, ">>", $self->{dbname}) or die "cannot open < $self->{dbname}: $!";
} else {
print { $fd } "c\n";
$self->{nlines}++;
# flush:
my $previous_default = select($fd);
$| ++;
$| --;
select($previous_default);
}
$self->{changed} = 0;
#print "entries = " . (#keys+0) . ", nlines = " . $self->{nlines} . "\n";
undef
}
1;

Related

Unable to print after while loop in perl

BEGIN {
use FindBin;
$scriptsDir = $FindBin::RealBin;
}
sub print_log {
($log, $msg) = ($_[0], $_[1]);
print $log $msg;
}
$opt_rh_gsr = "path_to_file";
open(FO, "$opt_rh_gsr") || die "-F-: Can not open file \n";
while(<FO>) {
if(/vdd_nets/) {
$vdd_net = 1;
$vdd_string = "VDD_NETS \{ \n";
}
if(/gnd_nets/) {
$gnd_net = 1;
}
if(($gnd_net == 1)) {
chomp();
$new_line = $_;
#split_new_line = split(":", $new_line);
}
if(($gnd_net == 1) && /\}/) {
$gnd_net = 0;
$gnd_string .= "\} \n";
exit;
}
if($vdd_net) {
if(/^\s*\S+\s+\S+\s+{/) {
$paren++;
}
if (0 != $paren && /^\s*(\w+)\s*$/) {
$vdd_nets{$1} = $parenvolt;
next;
}
if(/^\s*}\s*$/ || /^\s+$/) {
if (0 == $paren) {
$vdd_net = 0; next;
}
else {
$paren--; next;
}
}
chomp();
if(/\s*\}\s*$/ && ($vdd_net == 1)){
s/\'//g;
$vdd_net = 0;
#_ = split(":");
$vdd_string .= "$_[0] $_[1] \n";
$vdd_string .= "\} \n";
next;
}
if($gnd_net) {
if(/^\s*\}\s+$/ || /^\s+$/) {
$gnd_net = 0;
next;
}
#chomp();
if(/\s*\}\s*$/ && ($gnd_net == 1)){
s/\'//g;
$gnd_net = 0;
}
#_ = split();
$GNDNET = $_[0];
if ($_[0] =~ /^\w+$/) {
$groundnets{$_[0]} = 1;
}
}
}
}
print " done reading \n";
close(FO);
print "closed file \n";
The above is not printing the last 2 print statement (before and after the close of file handle). I tried print STDOUT, that didn't work. I also tried to flush, that didn't work either.
The script is exiting after executing, so it is not stuck in a infinite loop anywhere. I tries using perl5.6 and 5.8, but both of them have the same problem.
To exit a loop, you should use the keyword last instead of exit (which exits the whole program). This if:
if(($gnd_net == 1) && /\}/) {
$gnd_net = 0;
$gnd_string .= "\} \n";
print "exiting loop $gnd_string \n";
exit;
}
Should thus be:
if(($gnd_net == 1) && /\}/) {
$gnd_net = 0;
$gnd_string .= "\} \n";
print "exiting loop $gnd_string \n";
last;
}
(unless you actually wanted to exit the program, in which case the print should rather have been print "exiting program...")
A few tips:
Always add use strict and use warnings at the beginning of your scripts. It will catch many mistakes and save you a lot of time.
Use 3-operand open to open files (ie, open FILEHANDLE,MODE,EXPR instead of open FILEHANDLE,EXPR), and lexical filehandles (ie, $FO instead of FO). Your open should thus have been: open my $FO, '<', $opt_rh_gsr instead of open(FO, "$opt_rh_gsr").
Adding || die "-F-: Can not open file \n" after open is a good idea, but 1) you should do or die instead of || die (in this specific case it doesn't matter, but with or rather than ||, you can omit the parenthesis around open's arguments), and 2) you should add the name of the file you were trying to open (in that case, you'd print die "-F-: Can not open file '$opt_rh_gsr'). 3) add $! to the die to have the error message (die "-F-: Can not open file '$opt_rh_gsr': $!). And 4), as suggested by TLP, don't add a newline at the end of a die string.
sub print_log { ($log, $msg) = ($_[0], $_[1]); ... could have been sub print_log { ($log, $msg) = #_;; it's a bit more idiomatic and concise.
Indent properly your code. It's possible that indentation was lost in the copy-paste, but, if it's not the case, then you should indent better your code. This will save you a lot of time when writing/reading your code, and will save other people even more time when they'll read your code. Most IDEs have indentation features that can help you indent the code.

Perl Error - Global symbol requires explicit package name

I am getting some errors while trying to run a Perl file which is I need to run. (I usually use Python but I do not know any other programming languages including Perl.) I will show errors firstly, then I will attach the whole code in the end.
The error:
$ perl C:/databases/ModelSEEDDatabase/Scripts/Archived_Perl_Scripts/DumpSOLRTables_Master.pl
Global symbol "$fba" requires explicit package name (did you forget to declare "my $fba"?) at C:/databases/ModelSEEDDatabase/Scripts/Archived_Perl_Scripts/DumpSOLRTables_Master.pl line 171.
Global symbol "$fba" requires explicit package name (did you forget to declare "my $fba"?) at C:/databases/ModelSEEDDatabase/Scripts/Archived_Perl_Scripts/DumpSOLRTables_Master.pl line 172.
Global symbol "$fba" requires explicit package name (did you forget to declare "my $fba"?) at C:/databases/ModelSEEDDatabase/Scripts/Archived_Perl_Scripts/DumpSOLRTables_Master.pl line 173.
Global symbol "$fba" requires explicit package name (did you forget to declare "my $fba"?) at C:/databases/ModelSEEDDatabase/Scripts/Archived_Perl_Scripts/DumpSOLRTables_Master.pl line 174.
Global symbol "$map" requires explicit package name (did you forget to declare "my $map"?) at C:/databases/ModelSEEDDatabase/Scripts/Archived_Perl_Scripts/DumpSOLRTables_Master.pl line 202.
Global symbol "$pmap" requires explicit package name (did you forget to declare "my $pmap"?) at C:/databases/ModelSEEDDatabase/Scripts/Archived_Perl_Scripts/DumpSOLRTables_Master.pl line 233.
Execution of C:/databases/ModelSEEDDatabase/Scripts/Archived_Perl_Scripts/DumpSOLRTables_Master.pl aborted due to compilation errors.
When I put "my" at before global symbols which are mentioned in the error, this time I am getting another error below:
$ perl C:/databases/ModelSEEDDatabase/Scripts/Archived_Perl_Scripts/DumpSOLRTables_Master.pl
"my" variable $fba masks earlier declaration in same statement at C:/databases/ModelSEEDDatabase/Scripts/Archived_Perl_Scripts/DumpSOLRTables_Master.pl line 172.
"my" variable $fba masks earlier declaration in same statement at C:/databases/ModelSEEDDatabase/Scripts/Archived_Perl_Scripts/DumpSOLRTables_Master.pl line 173.
"my" variable $fba masks earlier declaration in same statement at C:/databases/ModelSEEDDatabase/Scripts/Archived_Perl_Scripts/DumpSOLRTables_Master.pl line 174.
So, I do not know what should I do now actually. This Perl script will dump/extract two necessary files which are "ComplexRoles.tsv" and "TemplateReactions.tsv".
I am using Python 3.7 and Cygwin64. ModelSEEDDatabase Github link:
https://github.com/ModelSEED/ModelSEEDDatabase
If you can help me, I would be really glad. Thank you in advance.
The whole script is below: (I indicated the problematic lines which are 171, 172, 173, 174, 202, 233.)
#!/usr/bin/env perl
use warnings;
use strict;
use Getopt::Long::Descriptive;
my ($opt, $usage) = describe_options("%c %o <directory>",
[ "compounds=s", "path to master compounds file", { default => "../Biochemistry/compounds.master.tsv" } ],
[ "compartments=s", "path to master compartments file", { default => "../Biochemistry/compartments.master.tsv" } ],
[ "reactions=s", "path to master reactions file", { default => "../Biochemistry/reactions.master.tsv" } ],
[ "aliasdir=s", "path to directory with alias files", { default => "../Aliases/" } ],
[ "pathwaydir=s", "path to directory with pathway files", { default => "../Pathways/" } ],
[ "structuredir=s", "path to directory with structure files", { default => "../Structures/" } ],
[ "master=s", "path to output master biochemistry json file", { default => "../Biochemistry/biochemistry.master.json" } ],
[ "help|h", "print usage message and exit" ]
);
print($usage->text), exit if $opt->help;
my $directory = $ARGV[0];
exit if !$directory || !-d $directory;
$directory.="/" if $directory !~ /\/$/;
my #temp=();
my $header = 1;
#######################################################
#Initialization
#######################################################
#Collect Aliases
opendir(my $AliasDir, $opt->aliasdir);
my #Files = grep { $_ =~ /\.aliases$/ } readdir($AliasDir);
closedir($AliasDir);
my $rxn_alias_hash = {};
my $p_rxn_alias_hash = {};
my $cpd_alias_hash = {};
my $p_cpd_alias_hash = {};
my %alias_cpd_hash = ();
my %alias_rxn_hash = ();
foreach my $file (sort #Files){
$file =~ /^(\w+)\.aliases/;
my $aliasSet = $1;
$aliasSet = join(" ", split(/_/,$aliasSet)) if $aliasSet eq "Enzyme_Class";
open(FH, "< ".$opt->aliasdir.$file);
$header = 1;
while(<FH>){
chomp;
if($header){$header--;next}
#temp=split(/\t/,$_,-1);
if($temp[1] =~ /^cpd/ || $temp[2] =~ /^cpd/){
foreach my $cpd (split(/\|/,$temp[1])){
$cpd_alias_hash->{$aliasSet}->{$temp[0]}->{$cpd}=1;
$alias_cpd_hash{$cpd}{$aliasSet}{$temp[0]}=1;
}
foreach my $cpd (split(/\|/,$temp[2])){
$p_cpd_alias_hash->{$aliasSet}->{$temp[0]}->{$cpd}=1;
#Need to revise the decision to forgo aliases found in more recent database
if(!exists($alias_cpd_hash{$cpd}) && !exists($alias_cpd_hash{$cpd}{$aliasSet})){
$alias_cpd_hash{$cpd}{$aliasSet}{$temp[0]}=1;
}
}
}
if($temp[1] =~ /^rxn/ || $temp[2] =~ /^rxn/){
foreach my $rxn (split(/\|/,$temp[1])){
$rxn_alias_hash->{$aliasSet}->{$temp[0]}->{$rxn}=1;
$alias_rxn_hash{$rxn}{$aliasSet}{$temp[0]}=1;
}
foreach my $rxn (split(/\|/,$temp[2])){
$p_rxn_alias_hash->{$aliasSet}->{$temp[0]}->{$rxn}=1;
#Need to revise the decision to forgo aliases found in more recent database
if(!exists($alias_rxn_hash{$rxn}) && !exists($alias_rxn_hash{$rxn}{$aliasSet})){
$alias_rxn_hash{$rxn}{$aliasSet}{$temp[0]}=1;
}
}
}
}
close(FH);
}
my $rxn_pathways = {};
open(my $fh, "< ".$opt->pathwaydir."HopeScenarios.txt");
while (my $line = <$fh>) {
chomp($line);
my $array = [split(/\t/,$line)];
my $patharray = [split(/:/,$array->[0])];
pop(#{$patharray});
pop(#{$patharray});
if (defined($rxn_alias_hash->{KEGG}->{$array->[1]})) {
foreach my $rxn (keys(%{$rxn_alias_hash->{KEGG}->{$array->[1]}})) {
$rxn_pathways->{$rxn}->{KEGG}->{$patharray->[0]} = 1;
$rxn_pathways->{$rxn}->{Scenario}->{join("/",#{$patharray})} = 1;
}
} elsif (defined($p_rxn_alias_hash->{KEGG}->{$array->[1]})) {
foreach my $rxn (keys(%{$p_rxn_alias_hash->{KEGG}->{$array->[1]}})) {
$rxn_pathways->{$rxn}->{KEGG}->{$patharray->[0]} = 1;
$rxn_pathways->{$rxn}->{Scenario}->{join("/",#{$patharray})} = 1;
}
}
}
close($fh);
open($fh, "< ".$opt->pathwaydir."plantdefault.pathways.tsv");
my #headers = split(/\t/,<$fh>);
shift(#headers);
chomp($headers[$#headers]);
while(<$fh>){
chomp;
#temp=split(/\t/,$_,-1);
my $id = shift (#temp);
for(my $i=0;$i<scalar(#headers);$i++){
next if $temp[$i] eq "null";
foreach my $path (split(/\|/,$temp[$i])){
$rxn_pathways->{$id}{$headers[$i]}{$temp[$i]}=1;
}
}
}
close($fh);
my $cpd_structure = {};
open($fh, "< ".$opt->structuredir."KEGG_Charged_InChI.txt");
while (my $line = <$fh>) {
chomp($line);
my $array = [split(/\t/,$line)];
if (defined($cpd_alias_hash->{KEGG}->{$array->[0]})) {
foreach my $cpdid (keys(%{$cpd_alias_hash->{KEGG}->{$array->[0]}})){
if (!defined($cpd_structure->{$cpdid})) {
$cpd_structure->{$cpdid} = $array->[1];
}
}
}
if (defined($p_cpd_alias_hash->{KEGG}->{$array->[0]})) {
foreach my $cpdid (keys(%{$p_cpd_alias_hash->{KEGG}->{$array->[0]}})){
if (!defined($cpd_structure->{$cpdid})) {
$cpd_structure->{$cpdid} = $array->[1];
}
}
}
}
close($fh);
open($fh, "< ".$opt->structuredir."MetaCyc_Charged_InChI.txt");
while (my $line = <$fh>) {
chomp($line);
my $array = [split(/\t/,$line)];
if (defined($cpd_alias_hash->{MetaCyc}->{$array->[0]})) {
foreach my $cpdid (keys(%{$cpd_alias_hash->{MetaCyc}->{$array->[0]}})){
if (!defined($cpd_structure->{$cpdid})) {
$cpd_structure->{$cpdid} = $array->[1];
}
}
}
if (defined($p_cpd_alias_hash->{MetaCyc}->{$array->[0]})) {
foreach my $cpdid (keys(%{$p_cpd_alias_hash->{MetaCyc}->{$array->[0]}})){
if (!defined($cpd_structure->{$cpdid})) {
$cpd_structure->{$cpdid} = $array->[1];
}
}
}
}
close($fh);
#Retreiving templates
# Need to get these from source file
my $templates = [
$fba->_get_msobject("ModelTemplate","KBaseTemplateModels","GramPosModelTemplate"), ###line171
$fba->_get_msobject("ModelTemplate","KBaseTemplateModels","GramNegModelTemplate"), ###line172
$fba->_get_msobject("ModelTemplate","KBaseTemplateModels","CoreModelTemplate"), ###line173
$fba->_get_msobject("ModelTemplate","KBaseTemplateModels","PlantModelTemplate") ###line174
];
#Printing template table
my $templatlist = ["template.0","template.1","template.2","template.3"];
my $templatedata = [
["template.0","gram_positive_template","genome_scale_model","Bacteria","0","1","chenry"],
["template.1","gram_negative_template","genome_scale_model","Bacteria","0","1","chenry"],
["template.2","core_template","core_model","Bacteria","0","1","chenry"],
["template.3","plant_template","genome_scale_model","Plant","0","1","seaver"],
];
#Printing complex roles
open($fh, ">", $directory."ComplexRoles.tsv");
my $columns = [qw(
complex_id
complex_name
complex_source
complex_type
role_id
role_name
role_type
role_source
role_aliases
role_exemplar
type
triggering
optional
)];
print $fh join("\t",#{$columns})."\n";
my $cpxs = $map->complexes(); # Need to get these from source file ###line202
my $idhash;
my $count=0;
for (my $i=0; $i < #{$cpxs}; $i++) {
my $cpx = $cpxs->[$i];
$idhash->{$cpx->id()} = "cpx.".$i;
my $cpxroles = $cpx->complexroles();
for (my $j=0; $j < #{$cpxroles}; $j++) {
my $cpxrole = $cpxroles->[$j];
my $roleid = $cpxrole->role()->id();
$roleid =~ s/ms//;
my $data = [
"cpx.".$i,
"cpx.".$i,
"ModelSEED",
"SEED_role_complex",
$roleid,
$cpxrole->role()->name(),
"SEED_role",
"SEED",
"searchname:".$cpxrole->role()->searchname(),
"null",
"role_mapping",
$cpxrole->triggering(),
$cpxrole->optionalRole(),
];
print $fh join("\t",#{$data})."\n";
}
$count = $i;
}
$cpxs = $pmap->complexes(); # Need to get these from source file ###line233
for (my $i=0; $i < #{$cpxs}; $i++) {
$count++;
my $cpx = $cpxs->[$i];
$idhash->{$cpx->id()} = "cpx.".$count;
my $cpxroles = $cpx->complexroles();
for (my $j=0; $j < #{$cpxroles}; $j++) {
my $cpxrole = $cpxroles->[$j];
my $roleid = $cpxrole->role()->id();
$roleid =~ s/ms//;
my $data = [
"cpx.".$count,
"cpx.".$count,
"ModelSEED",
"SEED_role_complex",
$roleid,
$cpxrole->role()->name(),
"SEED_role",
"SEED",
"searchname:".$cpxrole->role()->searchname(),
"null",
"role_mapping",
$cpxrole->triggering(),
$cpxrole->optionalRole(),
];
print $fh join("\t",#{$data})."\n";
}
}
close($fh);
#Printing compounds
#As it stands, it's a copy of the master compounds file with the aliases integrated
open(FH, "< ".$opt->compounds);
open($fh, ">", $directory."Compounds.tsv");
$header = 1;
undef(#headers);
my %Compounds=();
while(<FH>){
chomp;
if($header){
#headers = split(/\t/,$_,-1);
print $fh $_."\n";
$header--;
next;
}
#temp=split(/\t/,$_,-1);
#map values to keys
#probably not that necessary, but useful if column order changes
my %cpdHash=();
for(my $i=0;$i<scalar(#headers);$i++){
$cpdHash{$headers[$i]}=$temp[$i];
}
my #aliases = ();
foreach my $aliasSet (keys %{$alias_cpd_hash{$cpdHash{id}}}){
foreach my $alias (keys %{$alias_cpd_hash{$cpdHash{id}}{$aliasSet}}){
push(#aliases, "\"".$aliasSet.":".$alias."\"");
}
}
$cpdHash{aliases}= scalar(#aliases)>0 ? join(";",#aliases) : "null";
print $fh join("\t", map { $cpdHash{$_} } #headers),"\n";
$Compounds{$cpdHash{id}}=\%cpdHash;
}
close($fh);
#Printing reactions
#As it stands, it's a copy of the master reactions file with the pathways, aliases, and ec numbers integrated
open(FH, "< ".$opt->reactions);
open($fh, ">", $directory."Reactions.tsv");
$header = 1;
undef(#headers);
my %Reactions=();
while(<FH>){
chomp;
if($header){
#headers = split(/\t/,$_,-1);
print $fh join("\t", grep { $_ ne 'is_obsolete' && $_ ne 'linked_reaction' } #headers),"\n";
$header--;
next;
}
#temp=split(/\t/,$_,-1);
#map values to keys
#probably not that necessary, but useful if column order changes
my %rxnHash=();
for(my $i=0;$i<scalar(#headers);$i++){
$rxnHash{$headers[$i]}=$temp[$i];
}
my #ecnums = ();
my #aliases = ();
foreach my $aliasSet (keys %{$alias_rxn_hash{$rxnHash{id}}}){
foreach my $alias (keys %{$alias_rxn_hash{$rxnHash{id}}{$aliasSet}}){
#Only include full ec numbers (?)
if ($aliasSet eq "Enzyme Class"){
if($alias =~ m/\d+\.\d+\.\d+\.\d+/){
push(#ecnums, $alias);
}
}else{
push(#aliases, "\"".$aliasSet.":".$alias."\"");
}
}
}
$rxnHash{aliases}= scalar(#aliases)>0 ? join(";",#aliases) : "null";
$rxnHash{ec_numbers}= scalar(#ecnums)>0 ? join(";",#ecnums) : "null";
my #pathways = ();
if (defined($rxn_pathways->{$rxnHash{id}})) {
foreach my $type (keys(%{$rxn_pathways->{$rxnHash{id}}})) {
foreach my $path (keys(%{$rxn_pathways->{$rxnHash{id}}{$type}})) {
push(#pathways, $type.":".$path);
}
}
}
$rxnHash{pathways}= scalar(#pathways)>0 ? join(";",#pathways) : "null";
print $fh join("\t", map { $rxnHash{$_} } grep { $_ ne 'is_obsolete' && $_ ne 'linked_reaction' } #headers),"\n";
$Reactions{$rxnHash{id}}=\%rxnHash;
}
close($fh);
#Printing template biomasses reactions
open($fh, ">", $directory."TemplateBiomasses.tsv");
$columns = [qw(
id
name
type
other
dna
rna
protein
lipid
cellwall
cofactor
energy
template_id
template_name
template_modeltype
template_domain
template_version
template_is_current
template_owner
compartment_ids
compound_ids
compound_data
)];
print $fh join("\t",#{$columns})."\n";
for (my $i=0; $i < #{$templates}; $i++) {
my $biomasses = $templates->[$i]->templateBiomasses();
for (my $j=0; $j < #{$biomasses}; $j++) {
my $compounds = {};
my $comps = {};
my $bio = $biomasses->[$j];
my $biocpds = $bio->templateBiomassComponents();
my #compounddata = ();
for (my $k=0; $k < #{$biocpds}; $k++) {
my $biocpd = $biocpds->[$k];
my $biocpd_id = $biocpd->compound()->id();
my #links = ();
my $linkrefs = $biocpd->linked_compounds();
for (my $m=0; $m < #{$linkrefs}; $m++) {
push(#links, $linkrefs->[$m]->id()."{".$biocpd->link_coefficients()->[$m]."}");
}
$compounds->{$biocpd_id} = 1;
$comps->{$biocpd->compartment()->id()} = 1;
push(#compounddata, $biocpd_id.":\"".$Compounds{$biocpd_id}{name}."\":".$biocpd->coefficient().":".$biocpd->coefficientType().":".$biocpd->class().":".join("|",#links));
}
my $data = [
$templatedata->[$i]->[0].".".$bio->id(),
$bio->name(),
"growth",
$bio->other(),
$bio->dna(),
$bio->rna(),
$bio->protein(),
$bio->lipid(),
$bio->cellwall(),
$bio->cofactor(),
$bio->energy(),
$templatedata->[$i]->[0],
$templatedata->[$i]->[1],
$templatedata->[$i]->[2],
$templatedata->[$i]->[3],
$templatedata->[$i]->[4],
$templatedata->[$i]->[5],
$templatedata->[$i]->[6],
"0:".join(";0:",keys(%{$comps})),
join(";",keys(%{$compounds})),
join(";",#compounddata)
];
print $fh join("\t",#{$data})."\n";
}
}
close($fh);
#Printing template reactions
open($fh, ">", $directory."TemplateReactions.tsv");
$columns = [qw(
id
reaction_id
abbreviation
name
code
stoichiometry
is_transport
equation
definition
model_direction
gapfill_direction
type
base_cost
forward_penalty
reverse_penalty
pathways
aliases
ec_numbers
deltag
deltagerr
template_id
template_name
template_modeltype
template_domain
template_version
template_is_current
template_owner
compartment_ids
complex_ids
compound_ids
)];
print $fh join("\t",#{$columns})."\n";
for (my $i=0; $i < #{$templates}; $i++) {
my $rxns = $templates->[$i]->templateReactions();
for (my $j=0; $j < #{$rxns}; $j++) {
my $rxn = $rxns->[$j];
my $complexes = {};
my $cpxs = $rxn->complexs();
for (my $j=0; $j < #{$cpxs}; $j++) {
$complexes->{$idhash->{$cpxs->[$j]->id()}} = 1;
}
my $compounds = {};
my $comps = {};
# my $rgts = [split(/;/,$Reactions{$rxn->reaction()->id()}{stoichiometry})];
my $rgts = $rxn->reaction()->reagents();
for (my $j=0; $j < #{$rgts}; $j++) {
# my ($coef,$cpd,$cmpt) = split(/:/,$rgts->[$j]);
my ($cpd,$cmpt) = ($rgts->[$j]->compound()->id(),$rgts->[$j]->compartment()->id());
$compounds->{$cpd}=1;
$comps->{$cmpt}=1;
}
my $rxn_id = $rxn->reaction()->id();
my $compid = "c0";
my $data = [
$templatedata->[$i]->[0].".".$rxn_id."_".$compid,
$rxn_id,
$Reactions{$rxn_id}{abbreviation}."_".$compid,
$Reactions{$rxn_id}{name}."_".$compid,
# $Reactions{$rxn_id}{code},
# $Reactions{$rxn_id}{stoichiometry},
$rxn->reaction()->code(),
$rxn->reaction()->stoichiometry(),
$Reactions{$rxn_id}{is_transport},
# $Reactions{$rxn_id}{equation},
# $Reactions{$rxn_id}{definition},
$rxn->reaction()->equation(),
$rxn->reaction()->definition(),
$rxn->direction(),
defined($rxn->GapfillDirection()) ? $rxn->GapfillDirection() : "=",
"null",
defined($rxn->base_cost()) ? $rxn->base_cost() : 0,
defined($rxn->forward_penalty()) ? $rxn->base_cost() : 0,
defined($rxn->reverse_penalty()) ? $rxn->base_cost() : 0,
$Reactions{$rxn_id}{pathways},
$Reactions{$rxn_id}{aliases},
$Reactions{$rxn_id}{ec_numbers},
$Reactions{$rxn_id}{deltag},
$Reactions{$rxn_id}{deltagerr},
$templatedata->[$i]->[0],
$templatedata->[$i]->[1],
$templatedata->[$i]->[2],
$templatedata->[$i]->[3],
$templatedata->[$i]->[4],
$templatedata->[$i]->[5],
$templatedata->[$i]->[6],
"0:".join(";0:",keys(%{$comps})),
join(";",keys(%{$complexes})),
join(";",keys(%{$compounds}))
];
print $fh join("\t",#{$data})."\n";
}
}
close($fh);
my $templates = [
$fba->_get_msobject("ModelTemplate","KBaseTemplateModels","GramPosModelTemplate"), ###line171
$fba->_get_msobject("ModelTemplate","KBaseTemplateModels","GramNegModelTemplate"), ###line172
$fba->_get_msobject("ModelTemplate","KBaseTemplateModels","CoreModelTemplate"), ###line173
$fba->_get_msobject("ModelTemplate","KBaseTemplateModels","PlantModelTemplate") ###line174
];
The code is calling a method on the object in $fba without first declaring it it or even assigning an object to it. (In Python terms, the code does fba._get_msobject(...) without first doing fba = ....)
Not only will you need to declare the variable (my $fba), you will need to assign to it whatever object it's supposed to have.

GetOption - Perl - Referencing

So I have stumbled upon a little issue when trying to build out a simple "Airport Search Script" in Perl.
my $filename = '/home/student/perl-basic/topic-07/iata_airports.csv';
my $number = '1';
my $matching;
my $latitude;
my $longitude;
my $word = 'false';
GetOptions (
"filename=s" => \$filename,
"number=i" => \$number,
"matching=s" => \$matching,
"latitude=f" => \$latitude,
"longitude=f" => \$longitude,
"word=s" => \$word
);
sub parse_airports {
my $file = shift;
my $csv = Text::CSV->new( { binary => 1, eol => $/ } );
open ( my $fh, "<", $file ), or die "Error opening input file: $!";
my $ra_colnames = $csv->getline ( $fh );
$csv->column_names( #$ra_colnames );
my $ra_airports = $csv->getline_hr_all( $fh );
close ( $fh );
return $ra_airports;
}
sub get_name_matching_airports {
}
my $rah_airports = parse_airports( $filename );
my $rah_airports_found = [];
if ($matching) {
say "Up to $number airports matching $matching in $filename:";
$rah_airports_found = get_name_matching_airports(
airports => $rah_airports,
matching_string => $matching,
word => $word,
);
}
elsif ($latitude && $longitude) {
say "Up to $number airports near [$latitude, $longitude] in $filename:"
}
else {
say "Must have at least --matching, or --latitude and --longitude as arguments";
}
print pp($rah_airports_found);
So where I am struggling is in the "sub get_name_matching_airports"
Because you do not have the file let me explain the file structure.
It is a hash (ALL IATA Airports) with hashes (DETAILS of each airport). There are around 15 keys in each airport hash and one of the keys titles is (NAME). I have opened the file and parsed all the info into a hash ref which is returned at the end of the sub "parse_airports".
In the sub "get_name_matching_airports" I need to find additional airports with similar names based on the argument I passed in, into ($matching).
EXAMPLE: I parse (case-insensitive) "London" as an argument from the command line e.g. ./search_airports2 --matching London. In the sub "get_name_matching_airports" I will need to respond with any airport that has london (case-insensitive) in key(name).
Then push these newly found airports which are similar into the array "rah_airports_found" and in the end print this out.
SO I SOLVED MY PROBLEM WITH THE FOLLOWING CODE:
sub get_name_matching_airports {
my %params = (
airports => undef,
matching_string => undef,
word => undef,
#_
);
my #rah_airports_found;
my $ra_airports = $params{airports};
my $counter = 0;
foreach my $i ( #$ra_airports ) {
if ( $params{word} ) {
if ( $i->{name} eq $params{matching_string} ) {
push #rah_airports_found, $i;
$counter++;
}
}
else {
if ( $i->{name} =~ /$params{matching_string}/i ) {
push #rah_airports_found, $i;
$counter++;
}
if ( defined( $number ) && $counter == $number ) {
return \#rah_airports_found;
}
}
}
return \#rah_airports_found;
}
Example:
for my $Airport_rf (keys %{$rah_airports}) {
if ( $Airport_rf->{NAME} =~ m{\Q$matching\E}xi) {
# do your stuff here
}
}
If you donĀ“t know the exact key of the hashref, you have to match the CLI parameter against all values.

Perl script to convert xls to csv

This script coverts xls to csv ok.
The challenge is that it does not convert blank cell in the xls to blanks in csv file.
Any help is appreciated: UPDATED SCRIPT
#!/usr/bin/perl
use strict;
use Spreadsheet::ParseExcel;
use Text::CSV;
my $sourcename = shift #ARGV or die "invocation: $0 <source file>\n";
my $source_excel = new Spreadsheet::ParseExcel;
my $source_book = $source_excel->Parse($sourcename)
or die "Could not open source Excel file $sourcename: $!";
my $storage_book;
foreach my $source_sheet_number (0 .. $source_book->{SheetCount}-1) {
my $source_sheet = $source_book->{Worksheet}[$source_sheet_number];
print "--------- SHEET:", $source_sheet->{Name}, "\n";
next unless defined $source_sheet->{MaxRow};
next unless $source_sheet->{MinRow} <= $source_sheet->{MaxRow};
next unless defined $source_sheet->{MaxCol};
next unless $source_sheet->{MinCol} <= $source_sheet->{MaxCol};
foreach my $row_index ($source_sheet->{MinRow} .. $source_sheet->{MaxRow}) {
foreach my $col_index ($source_sheet->{MinCol} .. $source_sheet->{MaxCol}) {
my $source_cell = $source_sheet->{Cells}[$row_index][$col_index];
if ($source_cell && $source_cell->Value) {
#print "( $row_index , $col_index ) =>", $source_cell->Value, "\t;";
print $source_cell->Value, ";";
}
else
{
print ";"
}
}
}
}
sample excel
EFG KDD ABS JME
FGO POP JET
converted as:
EFG;KDD;ABS;JME;
FGO;POP;JET;
but it should be:
EFG;KDD;ABS;JME;
FGO;;POP;JET;
You have to check if the value of the cell is initialized, not the cell it self.
Change:
if ($source_cell) {
#print "( $row_index , $col_index ) =>", $source_cell->Value, "\t;";
print $source_cell->Value, ";";
}
To:
if ($source_cell && $source_cell->Value) {
#print "( $row_index , $col_index ) =>", $source_cell->Value, "\t;";
print $source_cell->Value, ";";
} else {
print ";";
}
should work.
UPDATE:
foreach my $row_index ($source_sheet->{MinRow} .. $source_sheet->{MaxRow}) {
foreach my $col_index ($source_sheet->{MinCol} .. $source_sheet->{MaxCol}) {
my $source_cell = $source_sheet->{Cells}[$row_index][$col_index];
if ($source_cell && $source_cell->Value) {
print $source_cell->Value.";";
} else {
print ";";
}
}
print "\n";
}
}

Generate Excel output using Win32::OLE in Perl

I am a beginner in Perl and have tried playing around with Perl much to understand its ways and working! I have a basic knowledge of arays, hashes and related topics. I have to develop a script for a topic and i am quite unsure how to go about it. I desperately need help and am very grateful to anyone who can explain the 'how to do' part!
I have a code with 3 parts in it which does the same thing thrice for 3 different lets say components. Basic idea is, it takes all the components marked 'A' from an excel file, iterates through the excel file, adds up its corresponding RAM and ROM values and prints out the output without duplicate entries. The 2nd and 3rd part are the same but for components 'B' and 'C'. So far i am able to print out the output of all 3 parts in a text file. But now i want all three outputs in an excel workbook as 3 separate worksheets!
I am not particularly sure how to go about it. Any ideas are really welcome!!!
PS: Please forgive me if i have not typed the code right in the forum! This is my first post!!
Here is how my code looks so far:
# This Test script was created to try out the possible methods to extract all the Names from the
# excel report without duplicate entries and find their corresponding RAM/ROM size sum
# -excel D:\Abc\Test.xlsx -out D:\Abc\Output
sub usage($)
{
return shift(#_) . <<"END_USAGE";
Usage: $0 -excel Specify the file path.
-out outputdirectory Specify output directiory
END_USAGE
}
use Getopt::Long;
use Win32::OLE;
use List::Util qw(sum);
use Data::Dumper qw(Dumper);
my $output_path = ();
my $excel_path = ();
my $no_rows = ();
my $lastCol = ();
GetOptions("excel=s" => \$excel_path,
"out=s" => \$output_path,
"h|help" => \$help,
);
#help message
die usage("") if ($help);
system(cls);
print "\n*******************************************************************\n";
print "Component Overview \n";
print "*******************************************************************\n";
print "Please wait, Processing may take couple of minutes... \n";
##File handler for the script file.
$log_path = $output_path."\\log.txt";
$output_file_path = $output_path."\\TestExcel.xlsx";
open LogFile,">",$log_path or die "Cannot create the log file:$log_path !!!";
print LogFile "Start time :".localtime()."\n";
# Start Excel and make it visible
my $xlApp = Win32::OLE->GetActiveObject('Excel.Application') || Win32::OLE->new('Excel.Application', 'Quit');
$xlApp->{Visible} = 0;
#Opening the work book
my $workBook = $xlApp->Workbooks->Open($excel_path);
#print "X: " . $workBook . " - " . $excel_path . "\n";
my $excelSheet = $workBook->Worksheets("Report");
$excelSheet->Activate();
print "Reading the file...\n";
&ReadExcel();
print LogFile "Completed time :".localtime()."\n";
print "\nCompleted.Please close this window...\n" ;
print "*******************************************************************\n";
# Sub routine to parse the cosipa file
sub ReadExcel()
{
my $row_index;
#Findings the number of valid rows
$no_rows = $excelSheet->UsedRange->Rows->{'Count'};
$lastCol = $excelSheet->UsedRange->Columns->{'Count'};
$row_index = findRowindex();
my #comp_array = ();
# Name => ResourceType => size
my $resultData = {};
for(my $index=($row_index+1);$index<=$no_rows;$index++)
{
my $X = $excelSheet->Cells($index,6)->Value();
my $Y = $excelSheet->Cells($index,7)->Value();
my $name = $excelSheet->Cells($index,9)->Value();
my $resourceType = $excelSheet->Cells($index,3)->Value();
my $size = $excelSheet->Cells($index,2)->Value();
#Name Overview
my $currNameTypeMap;
if ( ! exists $resultNameData->{ $name } ) # ->: arrow operator is used to dereference reference to arrays or hashes.
{
$resultNameData->{ $name } = {};
}
$currNameTypeMap = $resultNameData->{ $name };
$currNameTypeMap->{ $resourceType } += $size;
# Y Overview
my $currYTypeMap;
if ( ! exists $resultYData->{ $Y } ) # ->: arrow operator is used to dereference reference to arrays or hashes.
{
$resultYData->{ $cluster } = {};
}
$currYTypeMap = $resultYData->{ $Y };
$currYTypeMap->{ $resourceType } += $size;
# X Overview
my $currXTypeMap;
if ( ! exists $resultXData->{ $X } ) # ->: arrow operator is used to dereference reference to arrays or hashes.
{
$resultXData->{ $X } = {};
}
$currXTypeMap = $resultXData->{ $X };
$currXTypeMap->{ $resourceType } += $size;
}
my #uniqNameArr = sort keys %$resultNameData;
my #uniqYArr = sort keys %$resultYData;
my #uniqXArr = sort keys %$resultXData;
for my $currName ( #uniqNameArr )
{
print $currName . "\n". " RAM: " . $resultNameData->{ $currName }-> { "RAM" } . ", ROM: " . $resultNameData->{ $currName }-> { "ROM" } . "\n";
#print Dumper %$resultData;
}
print "----------------------------------------------------------------------- \n";
for my $currY ( #uniqYArr )
{
print $currY. "\n". " RAM: " . $resultYData->{ $currY }-> { "RAM" } . ", ROM: " . $resultYData->{ $currY }-> { "ROM" } . "\n";
}
print "------------------------------------------------------------------------ \n";
for my $currX ( #uniqXArr )
{
print $currX . "\n". " RAM: " . $resultXData->{ $currX }-> { "RAM" } . ", ROM: " . $resultXData->{ $currX }-> { "ROM" } . "\n";
}
}
#Sub routine to find the starting row index
sub findRowindex()
{
my $ret = ();
for(my $index=1;$index<$no_rows;$index++)
{
if(defined($excelSheet->Cells($index,1)))
{
my $cel_value = $excelSheet->Cells($index,1)->Value();
if($cel_value =~ m/^Name$/i)
{
$ret = $index;
last;
}
}
}
return $ret;
}
#Trim function
sub trim {
(my $s = $_[0]) =~ s/^\s+|\s+$//g;
return $s;
}
A workaround: You could use Excel::Writer::XLSX to create Excel files, it is working fine and quite robust. Here is how you could convert a tab separated file to Excel.
Reading excel: Spreadsheet::XLSX
use Text::Iconv;
my $converter = Text::Iconv -> new ("utf-8", "windows-1251");
use Spreadsheet::XLSX;
my $excel = Spreadsheet::XLSX -> new ('test.xlsx', $converter);
foreach my $sheet (#{$excel -> {Worksheet}}) {
printf("Sheet: %s\n", $sheet->{Name});
$sheet -> {MaxRow} ||= $sheet -> {MinRow};
foreach my $row ($sheet -> {MinRow} .. $sheet -> {MaxRow}) {
$sheet -> {MaxCol} ||= $sheet -> {MinCol};
foreach my $col ($sheet -> {MinCol} .. $sheet -> {MaxCol}) {
my $cell = $sheet -> {Cells} [$row] [$col];
if ($cell) {
printf("( %s , %s ) => %s\n", $row, $col, $cell -> {Val});
}
}
}
}
Writing excel: Excel::Writer::XLSX
my $workbook = Excel::Writer::XLSX->new( $xls_filename );
my $worksheet = $workbook->add_worksheet('data');
# Create a format for the headings
my $header_format = $workbook->add_format();
$header_format->set_bold();
$header_format->set_size( 18 );
$header_format->set_color( 'black' );
$header_format->set_align( 'center' );
my $row=0;
while (my $line = <$fh>){
chomp($line);
my #cols = split(/\t/,$line);
for(my $col=0;$col<#cols;$col++){
if ($row == 0 ){
$worksheet->write_string( $row, $col, $cols[$col],$header_format );
} else {
$worksheet->write_string( $row, $col, $cols[$col] );
}
}
$row++;
}
close($fh);
I hope this helps you.
Regards,