I'm working with automating the creation of a Word document in Perl by means of Win32::OLE. My current code looks like this, and it's leaving instances of WINWORD.EXE in memory:
my $range = $select->Range;
my $table = $doc->Tables->Add( $range, scalar #rows, scalar #{ $rows[0] } );
for my $rownum ( 0 .. $#rows ) {
for my $colnum ( 0 .. $#{ $rows[$rownum] } ) {
my #cellpos = ( $rownum + 1, $colnum + 1 );
my $data = $rows[$rownum][$colnum];
$table->Cell(#cellpos)->Range->{'Text'} = $data;
1;
}
}
However, if I were to refactor my code per the Microsoft recommendation for Visual Studio .NET, it would look like this:
my $range = $select->Range;
my $tables = $doc->Tables;
my $table = $tables->Add( $range, scalar #rows, scalar #{ $rows[0] } );
for my $rownum ( 0 .. $#rows ) {
for my $colnum ( 0 .. $#{ $rows[$rownum] } ) {
my #cellpos = ( $rownum + 1, $colnum + 1 );
my $data = $rows[$rownum][$colnum];
my $cell = $table->Cell(#cellpos);
my $cell_range = $cell->Range;
$cell_range->{'Text'} = $data;
}
}
That code does the job, but it's awfully "noisy" to my mind. Is there a cleaner way to do this?
It can be improved marginally. There is no need for the #cellpos and $data variables, and it is tidier to extract a reference to the current element of #rows for use within the inner loop.
my $range = $select->Range;
my $tables = $doc->Tables;
my $table = $tables->Add( $range, scalar #rows, scalar #{ $rows[0] } );
for my $rownum ( 0 .. $#rows ) {
my $cols = $rows[$rownum];
for my $colnum ( 0 .. $#$cols ) {
my $cell = $table->Cell($rownum + 1, $colnum + 1);
my $cell_range = $cell->Range;
$cell_range->{Text} = $cols->[$colnum];
}
}
Related
I'm trying to understand the piece of code below; I just cannot understand what is being done in line 15.
It seems like it is trying to initialise/assign to %heading but I am just not sure how that syntax works.
$strings = [qw(city state country language code )];
my $file = "fname";
my $fn = $strings;
my $c = 0;
open( FILEH, "< ${file}.txt" ) or die( $! );
while ( <FILEH> ) {
my %heading;
chomp;
$c++;
#heading{ ( #$fn, "One" ) } = split( /[|]/ ); # Line 15
if ( defined( $heading{"One"} ) ) {
my $One = $heading{"One"};
}
That's called a "slice". It assigns to several keys at once:
#hash{ $key1, $key2 } = ($value1, $value2);
is a shorter and faster way of doing
$hash{$key1} = $value1;
$hash{$key2} = $value2;
#$fn is the same as #{ $fn }, i.e. array dereference.
I am new to Perl. I have excel file say "sample.xls" which looks like follows.
Sample.xls
There are about data of 1000 rows like this. I want to parse this file and write it in another file say "output.xls" with following output format.
output.xls
I have written a script in perl, however, it doesn't give me the exact output the way I want. Also, looks like the script is not very efficient. Can anyone guide me how I can improve my script as well as have my output as shown in "output.xls" ??
Here's the Script:
#!/usr/bin/perl –w
use strict;
use warnings;
use Spreadsheet::ParseExcel;
use Spreadsheet::WriteExcel;
use Spreadsheet::WriteExcel::Chart;
# Read the input and output filenames.
my $inputfile = "path/sample.xls";
my $outputfile = "path/output.xls";
if ( !$inputfile || !$outputfile ) {
die( "Couldn't find file\n" );
}
my $parser = Spreadsheet::ParseExcel->new();
my $inwb = $parser->parse( $inputfile );
if ( !defined $inwb ) {
die "Parsing error: ", $parser->error(), ".\n";
}
my $outwb = Spreadsheet::WriteExcel->new( $outputfile );
my $inws = $inwb->worksheet( "Sheet1" );
my $outws = $outwb->add_worksheet("Sheet1");
my $out_row = 0;
my ( $row_min, $row_max ) = $inws->row_range();
my ( $col_min, $col_max ) = $inws->col_range();
my $format = $outwb->add_format(
center_across => 1,
bold => 1,
size => 10,
border => 4,
color => 'black',
border_color => 'black',
align => 'vcenter',
);
$outws->write(0,0, "Item Name", $format);
$outws->write(0,1, "Spec", $format);
$outws->write(0,2, "First name", $format);
$outws->write(0,3, "Middle Name", $format);
$outws->write(0,4, "Last Name", $format);
$outws->write(0,5, "Customer Number", $format);
$outws->write(0,6, "Age", $format);
$outws->write(0,7, "Units", $format);
my $col_count = 1;
#$row_min = 1;
for my $inws ( $inwb->worksheets() ) {
my ( $row_min, $row_max ) = $inws->row_range();
my ( $col_min, $col_max ) = $inws->col_range();
for my $in_row ( 2 .. $row_max ) {
for my $col ( 0 .. 0 ) {
my $cell = $inws->get_cell( $in_row, $col);
my #fields = split /_/, $cell->value();
next unless $cell;
$outws->write($in_row,$col, $cell->value());
$outws->write($in_row,$col+1, $fields[1]);
}
}
for my $in_row ( 2 .. $row_max ) {
for my $col ( 1 .. 1 ) {
my $cell = $inws->get_cell( $in_row, $col);
my #fields = split /_/, $cell->value();
next unless $cell;
#$outws->write($in_row,$col+1, $cell->value());
$outws->write($in_row,$col+1, $fields[0]);
$outws->write($in_row,$col+2, $fields[1]);
$outws->write($in_row,$col+3, $fields[2]);
$outws->write($in_row,$col+4, $fields[3]);
}
}
for my $in_row ( 2 .. $row_max ) {
for my $col ( 2 .. 2 ) {
my $cell = $inws->get_cell( $in_row, $col);
my #fields = split /_/, $cell->value();
next unless $cell;
$outws->write($in_row,6, $cell->value());
}
}
for my $in_row ( 2 .. $row_max ) {
for my $col ( 3 .. 9 ) {
my $cell = $inws->get_cell( $in_row, $col);
next unless $cell;
}
}
for my $in_row ( 2 .. $row_max ) {
for my $col ( 10 .. 10 ) {
my $cell = $inws->get_cell( $in_row, $col );
next unless $cell;
$outws->write($in_row,7, $cell->value());
}
}
}
To get your output sorted, you need to collect all the information first before you are writing it out. Right now, you are doing a bit of jumping back and forth between rows and columns.
Here are some changes I would make to get it sorted, and make it more efficient (to read).
Create a data structure $data outside of your loop to store all the information.
If there is only one worksheet, you don't need to loop over sheets. Just work with one sheet.
Loop over the lines.
Inside that loop, use the code you have to parse the individual fields to just parse them. No 2..2 loops. Just a bunch of statements.
my #item_fields = split /_/, $inws->get_cell( $in_row, 0 ) || q{};
my #name_fields = split /_/, $inws->get_cell( $in_row, $col ) || q{};
Store them in $data per item.
push #{ $data } = [ $item_fields[0], ... ];
Done with the loop. Open the output file.
Loop over $data with a sort and write to the output file.
foreach my $row (sort { $a->[0] cmp $b->[0] } #{ $data } ) { ... }
Done.
I suggest you read up on sort and also check out perlref and perlreftut to learn more about references (data structures).
I'm trying to count the number of bases using a for loop and the substr function but the counts are off and I'm not sure why! Please help! I have to use these functions in my assignment. Where am I going wrong? Here is my code:
use strict;
use warnings;
my $user_input = "accgtutf5";
#initalizing the lengths
my $a_base_total = 0;
my $c_base_total = 0;
my $g_base_total = 0;
my $t_base_total = 0;
my $other_total = 0;
for ( my $position = 0; $position < length $user_input; $position++ ) {
my $nucleotide = substr( $user_input, $position, 1 );
if ( $nucleotide eq "a" ) {
$a_base_total++;
} elsif ( $nucleotide eq "c" ) {
$c_base_total++;
} elsif ( $nucleotide eq "g" ) {
$g_base_total++;
} elsif ( $nucleotide eq "t" ) {
$t_base_total++;
} else {
$other_total++;
}
$position++;
}
print "a = $a_base_total\n";
print "c = $c_base_total\n";
print "g = $g_base_total\n";
print "t = $t_base_total\n";
print "other = $other_total\n";
The output I'm getting is :
a=1
c=1
g=0
t=2
other=1
When it should be:
a = 1
c = 2
g = 1
t = 2
other = 3
Thanks in advance! :)
You're incrementing twice.
Simply remove this line:
$position++;
Also, instead of iterating on position, I would suggest iterating on character.
Your script can be simplified to just:
use strict;
use warnings;
my $user_input = "accgtutf5";
my %count;
for my $nucleotide (split '', $user_input) {
$nucleotide = 'other' unless $nucleotide =~ /[acgt]/;
$count{$nucleotide}++;
}
printf "%s = %d\n", $_, $count{$_} // 0 for qw(a c g t other);
You are incrementing $position twice: once at the for and once at the end of the loop. Remove the second $position++.
I'm comparing a reference sequence of size 5500 bases and query sequence of size 3600, using dynamic programming (semi global alignment), in fact I don't know much about complexity and performance and the code is blowing up and giving me the error "out of memory". Knowing that it works normally on smaller sequences, my question is: This behavior is normal or I might have another problem in code ?if it's normal any hint to solve this problem ? Thanks in advance.
sub semiGlobal {
my ( $seq1, $seq2,$MATCH,$MISMATCH,$GAP ) = #_;
# initialization: first row to 0 ;
my #matrix;
$matrix[0][0]{score} = 0;
$matrix[0][0]{pointer} = "none";
for ( my $j = 1 ; $j <= length($seq1) ; $j++ ) {
$matrix[0][$j]{score} = 0;
$matrix[0][$j]{pointer} = "none";
}
for ( my $i = 1 ; $i <= length($seq2) ; $i++ ) {
$matrix[$i][0]{score} = $GAP * $i;
$matrix[$i][0]{pointer} = "up";
}
# fill
my $max_i = 0;
my $max_j = 0;
my $max_score = 0;
print "seq2: ".length($seq2);
print "seq1: ".length($seq1);
for ( my $i = 1 ; $i <= length($seq2) ; $i++ ) {
for ( my $j = 1 ; $j <= length($seq1) ; $j++ ) {
my ( $diagonal_score, $left_score, $up_score );
# calculate match score
my $letter1 = substr( $seq1, $j - 1, 1 );
my $letter2 = substr( $seq2, $i - 1, 1 );
if ( $letter1 eq $letter2 ) {
$diagonal_score = $matrix[ $i - 1 ][ $j - 1 ]{score} + $MATCH;
}
else {
$diagonal_score = $matrix[ $i - 1 ][ $j - 1 ]{score} + $MISMATCH;
}
# calculate gap scores
$up_score = $matrix[ $i - 1 ][$j]{score} + $GAP;
$left_score = $matrix[$i][ $j - 1 ]{score} + $GAP;
# choose best score
if ( $diagonal_score >= $up_score ) {
if ( $diagonal_score >= $left_score ) {
$matrix[$i][$j]{score} = $diagonal_score;
$matrix[$i][$j]{pointer} = "diagonal";
}
else {
$matrix[$i][$j]{score} = $left_score;
$matrix[$i][$j]{pointer} = "left";
}
}
else {
if ( $up_score >= $left_score ) {
$matrix[$i][$j]{score} = $up_score;
$matrix[$i][$j]{pointer} = "up";
}
else {
$matrix[$i][$j]{score} = $left_score;
$matrix[$i][$j]{pointer} = "left";
}
}
# set maximum score
if ( $matrix[$i][$j]{score} > $max_score ) {
$max_i = $i;
$max_j = $j;
$max_score = $matrix[$i][$j]{score};
}
}
}
my $align1 = "";
my $align2 = "";
my $j = $max_j;
my $i = $max_i;
while (1) {
if ( $matrix[$i][$j]{pointer} eq "none" ) {
$stseq1 = $j;
last;
}
if ( $matrix[$i][$j]{pointer} eq "diagonal" ) {
$align1 .= substr( $seq1, $j - 1, 1 );
$align2 .= substr( $seq2, $i - 1, 1 );
$i--;
$j--;
}
elsif ( $matrix[$i][$j]{pointer} eq "left" ) {
$align1 .= substr( $seq1, $j - 1, 1 );
$align2 .= "-";
$j--;
}
elsif ( $matrix[$i][$j]{pointer} eq "up" ) {
$align1 .= "-";
$align2 .= substr( $seq2, $i - 1, 1 );
$i--;
}
}
$align1 = reverse $align1;
$align2 = reverse $align2;
return ( $align1, $align2, $stseq1 ,$max_j);
}
One way to possibly solve the problem is to tie the #matrix with a file. However, this will dramatically slow down the program. Consider this:
sub semiGlobal {
use Tie::Array::CSV;
tie my #matrix, 'Tie::Array::CSV', 'temp.txt'; # Don't forget to add your own error handler.
my ( $seq1, $seq2,$MATCH,$MISMATCH,$GAP ) = #_;
# initialization: first row to 0 ;
$matrix[0][0] = '0 n';
for ( my $j = 1 ; $j <= length($seq1) ; $j++ ) {
$matrix[0][$j] = '0 n';
}
for ( my $i = 1 ; $i <= length($seq2) ; $i++ ) {
my $score = $GAP * $i;
$matrix[$i][0] = join ' ',$score,'u';
}
#print Dumper(\#matrix);
# fill
my $max_i = 0;
my $max_j = 0;
my $max_score = 0;
print "seq2: ".length($seq2)."\n";
print "seq1: ".length($seq1)."\n";
for ( my $i = 1 ; $i <= length($seq2) ; $i++ ) {
for ( my $j = 1 ; $j <= length($seq1) ; $j++ ) {
my ( $diagonal_score, $left_score, $up_score );
# calculate match score
my $letter1 = substr( $seq1, $j - 1, 1 );
my $letter2 = substr( $seq2, $i - 1, 1 );
my $score = (split / /, $matrix[ $i - 1 ][ $j - 1 ])[0];
if ( $letter1 eq $letter2 ) {
$diagonal_score = $score + $MATCH;
}
else {
$diagonal_score = $score + $MISMATCH;
}
# calculate gap scores
$up_score = (split / /,$matrix[ $i - 1 ][$j])[0] + $GAP;
$left_score = (split / /,$matrix[$i][ $j - 1 ])[0] + $GAP;
# choose best score
if ( $diagonal_score >= $up_score ) {
if ( $diagonal_score >= $left_score ) {
$matrix[$i][$j] = join ' ',$diagonal_score,'d';
}
else {
$matrix[$i][$j] = join ' ', $left_score, 'l';
}
}
else {
if ( $up_score >= $left_score ) {
$matrix[$i][$j] = join ' ', $up_score, 'u';
}
else {
$matrix[$i][$j] = join ' ', $left_score, 'l';
}
}
# set maximum score
if ( (split / /, $matrix[$i][$j])[0] > $max_score ) {
$max_i = $i;
$max_j = $j;
$max_score = (split / /, $matrix[$i][$j])[0];
}
}
}
my $align1 = "";
my $align2 = "";
my $stseq1;
my $j = $max_j;
my $i = $max_i;
while (1) {
my $pointer = (split / /, $matrix[$i][$j])[1];
if ( $pointer eq "n" ) {
$stseq1 = $j;
last;
}
if ( $pointer eq "d" ) {
$align1 .= substr( $seq1, $j - 1, 1 );
$align2 .= substr( $seq2, $i - 1, 1 );
$i--;
$j--;
}
elsif ( $pointer eq "l" ) {
$align1 .= substr( $seq1, $j - 1, 1 );
$align2 .= "-";
$j--;
}
elsif ( $pointer eq "u" ) {
$align1 .= "-";
$align2 .= substr( $seq2, $i - 1, 1 );
$i--;
}
}
$align1 = reverse $align1;
$align2 = reverse $align2;
untie #matrix; # Don't forget to add your own error handler.
unlink 'temp.txt'; # Don't forget to add your own error handler.
return ( $align1, $align2, $stseq1 ,$max_j);
}
You can still use your original sub for short sequences, and switch to this sub for long ones.
I think that #j_random_hacker and #Ashalynd are on the right track regarding using this algorithm in most Perl implementations. The datatypes you're using are going to use more memory that absolutely needed for the calculations.
So this is "normal" in that you should expect to see this kind of memory usage for how you've written this algorithm in perl. You may have other problems in surrounding code that are using a lot of memory but this algorithm will hit your memory hard with large sequences.
You can address some of the memory issues by changing the datatypes that you're using as #Ashalynd suggests. You could try changing the hash which holds score and pointer into an array and changing the string pointers into integer values. Something like this might get you some benefit while still maintaining readability:
use strict;
use warnings;
# define constants for array positions and pointer values
# so the code is still readable.
# (If you have the "Readonly" CPAN module you may want to use it for constants
# instead although none of the downsides of the "constant" pragma apply in this code.)
use constant {
SCORE => 0,
POINTER => 1,
DIAGONAL => 0,
LEFT => 1,
UP => 2,
NONE => 3,
};
...
sub semiGlobal2 {
my ( $seq1, $seq2,$MATCH,$MISMATCH,$GAP ) = #_;
# initialization: first row to 0 ;
my #matrix;
# score and pointer are now stored in an array
# using the defined constants as indices
$matrix[0][0][SCORE] = 0;
# pointer value is now a constant integer
$matrix[0][0][POINTER] = NONE;
for ( my $j = 1 ; $j <= length($seq1) ; $j++ ) {
$matrix[0][$j][SCORE] = 0;
$matrix[0][$j][POINTER] = NONE;
}
for ( my $i = 1 ; $i <= length($seq2) ; $i++ ) {
$matrix[$i][0][SCORE] = $GAP * $i;
$matrix[$i][0][POINTER] = UP;
}
... # continue to make the appropriate changes throughout the code
However, when I tested this I didn't get a huge benefit when attempting to align a 3600 char string in a 5500 char string of random data. I programmed my code to abort when it consumed more than 2GB of memory. The original code aborted after 23 seconds while the one using constants and an array instead of a hash aborted after 32 seconds.
If you really want to use this specific algorithm I'd check out the performance of Algorithm::NeedlemanWunsch. It doesn't look like it's very mature but it may have addressed your performance issues. Otherwise look into writing an Inline or Perl XS wrapper around a C implementation
I would like to write a Perl function that gets a GFF3 filename and a range (i.e. 100000 .. 2000000). and returns a reference to an array containing all names/accessions of genes found in this range.
I guess using bioperl will make sense, but I have very little experience with it. I can write a script that parses a GFF3 by my self, but if using bioperl (or another packagae) is not too complicated - I'd rather reuse their code.
use Bio::Tools::GFF;
my $range_start = 100000;
my $range_end = 200000;
my #features_in_range = ( );
my $gffio = Bio::Tools::GFF->new(-file => $gff_file, -gff_version => 3);
while (my $feature = $gffio->next_feature()) {
## What about features that are not contained within the coordinate range but
## do overlap it? Such features won't be caught by this check.
if (
($feature->start() >= $range_start)
&&
($feature->end() <= $range_end)
) {
push #features_in_range, $feature;
}
}
$gffio->close();
DISCLAIMER: Naive implementation. I just banged that out, it's had no testing. I won't even guarantee it compiles.
You do want to use BioPerl for this, using possibly the Bio::Tools::GFF module.
You should really ask on the BioPerl mailing list. It's very friendly and the subscribers are very knowledgeable -- they'll definitely be able to help you. And once you do get an answer (and if you don't get one here first), I suggest answering your own question here with the answer so we can all benefit!
The following function takes a hash of targets and ranges and returns a function that will iterate over all targets that overlap any of the ranges. The targets should be a reference to an array of references:
my $targets =
[
[
$start,
$end,
],
...,
]
The ranges should be a reference to an array of hashes:
my $ranges =
[
{
seqname => $seqname,
source => $source,
feature => $feature,
start => $start,
end => $end,
score => $score,
strand => $strand,
frame => $frame,
attribute => $attribute,
},
...,
]
You can, of course, just pass a single target.
my $brs_iterator
= binary_range_search( targets => $targets, ranges => $ranges );
while ( my $gff_line = $brs_iterator->() ) {
# do stuff
}
sub binary_range_search {
my %options = #_;
my $targets = $options{targets} || croak 'Need a targets parameter';
my $ranges = $options{ranges} || croak 'Need a ranges parameter';
my ( $low, $high ) = ( 0, $#{$ranges} );
my #iterators = ();
TARGET:
for my $range (#$targets) {
RANGE_CHECK:
while ( $low <= $high ) {
my $try = int( ( $low + $high ) / 2 );
$low = $try + 1, next RANGE_CHECK
if $ranges->[$try]{end} < $range->[0];
$high = $try - 1, next RANGE_CHECK
if $ranges->[$try]{start} > $range->[1];
my ( $down, $up ) = ($try) x 2;
my %seen = ();
my $brs_iterator = sub {
if ( $ranges->[ $up + 1 ]{end} >= $range->[0]
and $ranges->[ $up + 1 ]{start} <= $range->[1]
and !exists $seen{ $up + 1 } )
{
$seen{ $up + 1 } = undef;
return $ranges->[ ++$up ];
}
elsif ( $ranges->[ $down - 1 ]{end} >= $range->[0]
and $ranges->[ $down - 1 ]{start} <= $range->[1]
and !exists $seen{ $down - 1 }
and $down > 0 )
{
$seen{ $down - 1 } = undef;
return $ranges->[ --$down ];
}
elsif ( !exists $seen{$try} ) {
$seen{$try} = undef;
return $ranges->[$try];
}
else {
return;
}
};
push #iterators, $brs_iterator;
next TARGET;
}
}
# In scalar context return master iterator that iterates over the list of range iterators.
# In list context returns a list of range iterators.
return wantarray
? #iterators
: sub {
while (#iterators) {
if ( my $range = $iterators[0]->() ) {
return $range;
}
shift #iterators;
}
return;
};
}