Read MS Word table data row wise using win32:ole perl - perl

I am new to win32:ole module in perl. I am trying to print MS word table data row wise on command prompt. But I am able to print only last row of the table. Can you please help me to solve this problem? Thanks in advance.
Below is my code:
#!/usr/bin/perl
use strict;
use warnings;
use File::Spec::Functions qw( catfile );
use Win32::OLE qw(in);
use Win32::OLE::Const 'Microsoft Word';
$Win32::OLE::Warn = 3;
my $word = get_word();
$word->{DisplayAlerts} = wdAlertsNone;
$word->{Visible} = 1;
my $doc = $word->{Documents}->Open('C:\\PerlScripts\\myTest.docx');
my $tables = $word->ActiveDocument->{'Tables'};
for my $table (in $tables)
{
my $tableText = $table->ConvertToText({ Separator => wdSeparateByTabs });
print "Table: ". $tableText->Text(). "\n";
}
$doc->Close(0);
sub get_word
{
my $word;
eval { $word = Win32::OLE->GetActiveObject('Word.Application');};
die "$#\n" if $#;
unless(defined $word)
{
$word = Win32::OLE->new('Word.Application', sub { $_[0]->Quit })
or die "Oops, cannot start Word: ", Win32::OLE->LastError, "\n";
}
return $word;
}

Not a perfect solution by any means but here's an advancement on the problem.
I used a string separator "\n\n" which produces the following output ...
Further hacking required :(
C:\StackOverflow>perl word.pl meTest.docx
Table: Header1
Header2
Header3
Header4
Row1-Cell1
Row1-Cell2
Row1-Cell3
Row1-Cell4
Row2-Cell1
Row2-Cell2
Row2-Cell3
Row2-Cell4
Row2-Cell5
Here's the code. I have commented out some other code in the tables loop that I used to hack on the data returned by $tableRange->{Text} Uncomment to experiment further.
#!/usr/bin/perl
use strict;
use warnings;
use File::Spec::Functions qw( catfile );
use Win32::OLE qw(in);
use Win32::OLE::Const 'Microsoft Word';
$Win32::OLE::Warn = 3;
my $word = get_word();
$word->{DisplayAlerts} = wdAlertsNone;
$word->{Visible} = 1;
my $doc = $word->{Documents}->Open('meTest.docx');
my $tables = $word->ActiveDocument->{'Tables'};
for my $table (in $tables)
{
my $tableRange = $table->ConvertToText({ Separator => "\n\n" });
print "Table: \n" . $tableRange->{Text}. "\n";
# foreach $word (split/\n/, $tableRange->{Text}) {
# print $word . "\n" ;
# # $userinput = <STDIN>;
# }
}
$doc->Close(0);
sub get_word
{
my $word;
eval { $word = Win32::OLE->GetActiveObject('Word.Application');};
die "$#\n" if $#;
unless(defined $word)
{
$word = Win32::OLE->new('Word.Application', sub { $_[0]->Quit })
or die "Oops, cannot start Word: ", Win32::OLE->LastError, "\n";
}
return $word;
}
Sorry I couldn't be of more help.

extract all the doc tables into a single xls file
sub doParseDoc {
my $msg = '' ;
my $ret = 1 ; # assume failure at the beginning ...
$msg = 'START --- doParseDoc' ;
$objLogger->LogDebugMsg( $msg );
$msg = 'using the following DocFile: "' . $DocFile . '"' ;
$objLogger->LogInfoMsg( $msg );
#-----------------------------------------------------------------------
#Using OLE + OLE constants for Variants and OLE enumeration for Enumerations
# Create a new Excel workbook
my $objWorkBook = Spreadsheet::WriteExcel->new("$DocFile" . '.xls');
# Add a worksheet
my $objWorkSheet = $objWorkBook->add_worksheet();
my $var1 = Win32::OLE::Variant->new(VT_BOOL, 'true');
Win32::OLE->Option(Warn => \&Carp::croak);
use constant true => 0;
# at this point you should have the Word application opened in UI with t
# the DocFile
# build the MS Word object during run-time
my $objMSWord = Win32::OLE->GetActiveObject('Word.Application')
or Win32::OLE->new('Word.Application', 'Quit');
# build the doc object during run-time
my $objDoc = $objMSWord->Documents->Open($DocFile)
or die "Could not open ", $DocFile, " Error:", Win32::OLE->LastError();
#Set the screen to Visible, so that you can see what is going on
$objMSWord->{'Visible'} = 1;
# try NOT printing directly to the file
#$objMSWord->ActiveDocument->SaveAs({Filename => 'AlteredTest.docx',
#FileFormat => wdFormatDocument});
my $tables = $objMSWord->ActiveDocument->Tables();
my $tableText = '' ;
my $xlsRow = 1 ;
for my $table (in $tables){
# extract the table text as a single string
#$tableText = $table->ConvertToText({ Separator => 'wdSeparateByTabs' });
# cheated those properties from here:
# https://msdn.microsoft.com/en-us/library/aa537149(v=office.11).aspx#officewordautomatingtablesdata_populateatablewithdata
my $RowsCount = $table->{'Rows'}->{'Count'} ;
my $ColsCount = $table->{'Columns'}->{'Count'} ;
# disgard the tables having different than 5 columns count
next unless ( $ColsCount == 5 ) ;
$msg = "Rows Count: $RowsCount " ;
$msg .= "Cols Count: $ColsCount " ;
$objLogger->LogDebugMsg ( $msg ) ;
#my $tableRange = $table->ConvertToText({ Separator => '##' });
# OBS !!! simple print WILL print to your doc file use Select ?!
#$objLogger->LogDebugMsg ( $tableRange . "\n" );
# skip the header row
foreach my $row ( 0..$RowsCount ) {
foreach my $col (0..$ColsCount) {
# nope ... $table->cell($row,$col)->->{'WrapText'} = 1 ;
# nope $table->cell($row,$col)->{'WordWrap'} = 1 ;
# so so $table->cell($row,$col)->WordWrap() ;
my $txt = '';
# well some 1% of the values are so nasty that we really give up on them ...
eval {
$txt = $table->cell($row,$col)->range->{'Text'};
#replace all the ctrl chars by space
$txt =~ s/\r/ /g ;
$txt =~ s/[^\040-\176]/ /g ;
# perform some cleansing - ColName<primary key>=> ColName
#$txt =~ s#^(.[a-zA-Z_0-9]*)(\<.*)#$1#g ;
# this will most probably brake your cmd ...
# $objLogger->LogDebugMsg ( "row: $row , col: $col with txt: $txt \n" ) ;
} or $txt = 'N/A' ;
# Write a formatted and unformatted string, row and column notation.
$objWorkSheet->write($xlsRow, $col, $txt);
} #eof foreach col
# we just want to dump all the tables into the one sheet
$xlsRow++ ;
} #eof foreach row
sleep 1 ;
} #eof foreach table
# close the opened in the UI document
$objMSWord->ActiveDocument->Close;
# OBS !!! now we are able to print
$objLogger->LogDebugMsg ( $tableText . "\n" );
# exit the whole Word application
$objMSWord->Quit;
return ( $ret , $msg ) ;
}
#eof sub doParseDoc

Use below lines of code
my $doc = $word->Documents->Open('C:\\PerlScripts\\myTest.docx');
my $tables = $word->{'Tables'};
instead of below code
my $doc = $word->{Documents}->Open('C:\\PerlScripts\\myTest.docx');
my $tables = $word->ActiveDocument->{'Tables'};
your problem get solved.

Related

Unable to retrieve multiple column values from file in Perl

I have a file with following contents:
TIME
DATE TIME DAY
191227 055526 FRI
RC DEV SERVER
RC1 SERVER1
RC2 SERVER2
RC3 SERVER3
END
I am fetching argument values from this file, say if I pass DATE as an argument to the script I am getting corresponding value of the DATE. i.e., 191227
When I pass multiple arguments say DATE, DAY I should get values:
DATE=191227
DAY=FRI
But what I am getting here is:
DATE=191227
DAY=NULL
And if I pass RC as an argument I should get:
RC=RC1,RC2,RC3
The script looks below:
#!/usr/bin/perl
use strict;
use Data::Dumper;
print Dumper(\#ARGV);
foreach my $name(#ARGV){
print "NAME:$name\n";
my ($result, $start, $stop, $width) = "";
while(my $head = <STDIN>)
{
if( $head =~ (m/\b$name\b/g))
{
$start = (pos $head) - length($name);
$stop = (pos $head);
my $line = <STDIN>;
pos $head = $stop+1;
$head =~ (m/\b/g);
$width = (pos $head) - $start;
$result = substr($line,$start,$width);
}
}
$result =~ s/^\s*(.*?)\s*$/$1/;
print "$name=";
$result = "NULL" if ( $result eq "" );
print "$result\n";
}
Can someone please help me to get values if I pass multiple arguments also if suppose argument value have data in multiple lines it should be printed comma separated values (ex: for RC, RC=RC1,RC2,RC3).
Here is an example, assuming the input file is named file.txt and the values are starting at the same horizontal position as the keys:
package Main;
use feature qw(say);
use strict;
use warnings;
use Data::Dumper qw(Dumper);
my $self = Main->new(fn => 'file.txt', params => [#ARGV]);
$self->read_file();
$self->print_values();
sub read_file {
my ( $self ) = #_;
my $fn = $self->{fn};
open ( my $fh, '<', $fn ) or die "Could not open file '$fn': $!";
local $/ = ""; #Paragraph mode
my #blocks = <$fh>;
close $fh;
$self->{values} = {};
for my $block (#blocks) {
$self->parse_block( $block );
}
}
sub parse_block {
my ( $self, $block ) = #_;
my #lines = split /\n/, $block;
my $header = shift #lines;
my ($keys, $startpos) = $self->get_block_keys( $header );
for my $line ( #lines ) {
for my $key (#$keys) {
my $startpos = $startpos->{$key};
my $str = substr $line, $startpos;
my ( $value ) = $str =~ /^(\S+)/;
if ( defined $value ) {
push #{$self->{values}{$key}}, $value;
}
}
}
}
sub get_block_keys {
my ( $self, $header ) = #_;
my $values = $self->{values};
my #keys;
my %spos;
while ($header =~ /(\S+)/g) {
my $key = $1;
my $startpos = $-[1];
$spos{$key} = $startpos;
push #keys, $key;
}
for my $key (#keys) {
if ( !(exists $values->{$key}) ) {
$values->{$key} = [];
}
}
return (\#keys, \%spos);
}
sub new {
my ( $class, %args ) = #_;
return bless \%args, $class;
}
sub print_values {
my ( $self ) = #_;
my $values = $self->{values};
for my $key (#{$self->{params}}) {
my $value = "<NO VALUE FOUND>";
if ( exists $values->{$key} ) {
$value = join ",", #{$values->{$key}};
}
say "$key=$value";
}
}
Edit
If you want to read the file from STDIN instead, change the following part of the code:
# [...]
my $self = Main->new(params => [#ARGV]);
$self->read_file();
$self->print_values();
sub read_file {
my ( $self ) = #_;
local $/ = ""; #Paragraph mode
my #blocks = <STDIN>;
$self->{values} = {};
for my $block (#blocks) {
$self->parse_block( $block );
}
}
# [...]

Reading 2000 files and building hashes in perl

I have a code to parse 2000 csv files and build hashes based on them.
code is running good and fast until it reads ~100 files and there after it is running at snail pace
Memory consumed is ~ 1.8 GB uncompressed
Goal is to build global hash %_hist from the csv files.
File sizes range between 20KB to 30 MB
OS is Mac with 12 GB RAM
64 bit perl 5.18
I have create every variable in the functions as "my" expecting it to be released after the function exits.
The only persistent global variable is %_hist
Is there a way to improve performance?
foreach my $file (#files){
iLog ("Checking $file");
$| = 1; #flush io
return error("File $file doesn't exist") if not -e $file;
my #records = readCSVFile($file); #reads csv file to 2d array and returns the array
my #formatted_recs;
foreach $rec ( #records ){
my ($time,$c,$user_dst,$client,$ip_src,$first_seen,$last_seen,$first_seen_time,$last_seen_time,$device_ip,$country,$org,$user_agent) = #$rec;
my #newrec = ($time,$c,$client,$first_seen,$last_seen,$ip_src,$user_agent,$device_ip,$country,$org);
next if $time =~ /time/i; #Ignore first record
push(#formatted_recs, \#newrec);
}
baselineHistRecords(#formatted_recs);
}
sub readCSVFile{
my $file = shift;
my #data;
open my $fh, '<', $file or return error("Could not open $file: $!");
my $line = <$fh>; #Read headerline
my $sep_char = ',';
$sep_char = ';' if $line =~ /;"/;
$sep_char = '|' if $line =~ /\|/;
my $csv = Text::CSV->new({ sep_char => "$sep_char" });
push (#data, split(/$sep_char/, $line) );
while( my $row = $csv->getline( $fh ) ) {
push #data, $row;
}
close $fh;
return #data;
}
sub baselineHistRecords{
my #recs = #_;
undef $_ for ($time,$c,$client,$first_seen,$last_seen,$ip_src,$user_agent,$device_ip,$country,$org) ;
undef $_ for (%device_count, %ua_count, %location_count, %org_count );
my ($time,$c,$client,$first_seen,$last_seen,$ip_src,$user_agent,$device_ip,$country,$org) ;
my %loc = {}; my %loc2rec = {};
my %device_count = {}; my %ua_count = {}; my %location_count = {}; my %sorg_count = {};
my $hits=0;
my #suspicious_hits = ();
foreach $rec (#recs){
my $devtag=''; my $os = '';
my #row = #{$rec};
($time,$c,$client,$first_seen,$last_seen,$ip_src,$ua,$device_ip,$country,$org) = #row;
veryverbose("\n$time,$c,$client,$first_seen,$last_seen,$ip_src,$user_agent,$device_ip,$country,$org");
next if not is_ipv4($ip_src);
###### 1. Enrich IP
my $org = getOrgForIP($ip_src);
my ($country_code,$region,$city) = getGeoForIP($ip_src);
my $isp = getISPForIP($ip_src);
my $loc = join(" > ",($country_code, $region));
my $city = join(" > ",($country_code, $region, $city));
my $cidr = $ip_src; $cidr =~ s/\d+\.\d+$/0\.0\/16/; #Removing last octet
# my $packetmail = getPacketmailRep($ip_src);
# push (#suspicious_hits, "$time $c $client $ip_src $ua / $packetmail") if $packetmail !~ /NOTFOUND/;
##### 2. SANITIZE
$ua = cannonize($ua);
$devtag = $& if $ua =~ /\([^\)]+\)/;
#tokens = split(/;/, $devtag);
$os = $tokens[0];
$os =~ s/\+/ /g;$os =~ s/\(//g;$os =~ s/\)//g;
$os = 'Android' if $os !~ /Android/i and $devtag =~ /Android/i;
$os = "Windows NT" if $os =~ /compatible/i or $os =~ /Windows NT/i;
$_hist{$client}{"isp"}{$isp}{c} += 1;
$_hist{$client}{"os"}{$os}{c} += 1;
$_hist{$client}{"ua"}{$ua}{c} += 1 if not is_empty ($ua);
$_hist{$client}{"ua"}{c} += 1 if not is_empty ($ua); #An exception marked since all logs doesn't have UA values
$_hist{$client}{"loc"}{$loc}{c} += 1;
$_hist{$client}{"org"}{$org}{c} += 1;
$_hist{$client}{"cidr"}{$cidr}{c} += 1;
$_hist{$client}{"city"}{$city}{c} += 1;
$_hist{$client}{"c"} += 1;
$hits = $hits + 1;
print "." if $hits%100==0;
debug( "\n$ip_src : $os $loc $isp $org $ua: ".$_hist{$client}{"os"}{$os}{c} );
}
print "\nHITS: $hits";
return if ($hits==0); #return if empty
printf("\n######(( BASELINE for $client (".$_hist{$client}{c} ." records) ))#######################\n");
foreach my $item (qw/os org isp loc ua cidr/){
debug( sprintf ("\n\n--(( %s: %s ))-------------------------------- ",$client,uc($item)) );
## COMPUTE Usage Percent
my #item_values = sort { $_hist{$client}{$item}{$b}{c} <=> $_hist{$client}{$item}{$a}{c} } keys %{ $_hist{$client}{$item} };
my #cvalues = ();
foreach my $key ( #item_values ){
my $count = $_hist{$client}{$item}{$key}{c};
my $total = $_hist{$client}{c};
$total = $_hist{$client}{"ua"}{c} if $item =~ /^ua|os$/i and $_hist{$client}{"ua"}{c}; #Over for User_agent and OS determination as all logs doesn't have them
my $pc = ceil(( $count / $total ) * 100) ;
debug ("Ignoring empty value") if is_empty($key); # Ignoring Empty values
next if is_empty($key);
$_hist{$client}{$item}{$key}{p} = $pc ;
push (#cvalues, $pc);
#printf("\n%3d \% : %s",$pc,$key) if $pc>0;
}
## COMPUTE Cluster Centers
my #clustercenters = getClusterCenters(3,#cvalues);
my ($low, $medium, $high) = #clustercenters;
$_hist{$client}{$item}{low} = $low;
$_hist{$client}{$item}{medium} = $medium;
$_hist{$client}{$item}{high} = $high;
my %tags = ( $low => "rare",
$medium => "normal",
$high =>"most common",
);
debug ("\n(Cluster Centers) : $low \t$medium \t $high\n");
foreach my $key ( #item_values ){
next if is_empty($key);
my $pc = $_hist{$client}{$item}{$key}{p};
$_hist{$client}{$item}{$key}{tag} = $tags{ closest($pc, #clustercenters) };
debug( sprintf("\n%3d \% : %s : %s",$pc, $_hist{$client}{$item}{$key}{tag} , $key) );
}
}
printf("\n\n###################################\n");
saveHistBaselines();
}
Thanks,
Uma
This is more question for code review.
There's a ton of completely useless copying around in the code. E.g.: why the hell you copy data from my #$rec to #newrec? $rec to #row? Why do you return plain list of lines from readCSVFile instead of reference?
You don't really need to read entire file in memory and then process it - you can process data line by line and throw it away immideately after you done with it.

Perl line/delimiter file formatting syntax

I'm trying to figure out how to use a code that's written in perl, but am not very familiar with perl syntax. I was wondering if someone could tell me what the format of the file #metafilecache is? The code is failing to read the samplerate within the file, but I'm not sure how I have it formatted incorrectly. Here's the excerpt of the code I think is appropriate:
my $tnet = $ARGV[0];
my $tsta = $ARGV[1];
my $stadir = $ARGV[2];
if ( ! -d "$targetdir" ) {
mkdir "$targetdir" || die "Cannot create $targetdir: $?\n";
}
die "Cannot find PDF bin base dir: $pdfbinbase\n" if ( ! -d "$pdfbinbase" );
my %targetdays = ();
my %targetchan = ();
# Collect target files in the $pdfbinbase dir, limited by $changlob
foreach my $nsldir (glob("$pdfbinbase/{CHRYS}/$tnet.$tsta.*")) {
next if ( ! -d "$nsldir" ); # Limit to directories
# Extract location ID from directory name
my ($net,$sta,$loc) = $nsldir =~ /\/(\w+)\.(\w+)\.([\w-]+)$/;
if ( $net ne $tnet ) {
print "Target network ($tnet) != network ($net)\n";
next;
}
if ( $sta ne $tsta ) {
print "Target station ($tsta) != station ($sta)\n";
next;
}
foreach my $chandir (glob("$nsldir/$changlob")) {
next if ( ! -d "$chandir" ); # Limit to directories
# Extract channel code from directory name
my ($chan) = $chandir =~ /.*\/([\w\d]+)$/;
foreach my $yeardir (glob("$chandir/Y*")) {
next if ( ! -d "$yeardir" ); # Limit to directories
# Extract year from directory name
my ($year) = $yeardir =~ /^.*\/Y(\d{4,4})$/;
foreach my $daybin (glob("$yeardir/D*.bin")) {
next if ( ! -f "$daybin" ); # Limit to regular files
my ($day) = $daybin =~ /^.*\/D(\d{3,3})\.bin$/;
$targetdays{"$loc.$chan.$year.$day"} = $daybin;
$targetchan{"$loc.$chan"} = 1;
}
}
}
}
if ( $verbose > 1 ) {
print "Target days from PDF bin files:\n";
my $count = 0;
foreach my $tday (sort keys %targetdays) {
print "Target day: $tday => $targetdays{$tday}\n";
$count++;
}
print "Targets: $count\n";
}
# Remove targets that have already been calculated by checking
# results files against targets.
foreach my $tchan ( keys %targetchan ) {
my ($loc,$chan) = split (/\./, $tchan);
# Generate target file name
my $targetfile = undef;
if ( $loc ne "--" ) { $targetfile = "$targetdir/$prefix-$loc.$chan"; }
else { $targetfile = "$targetdir/$prefix-$chan"; }
print "\nChecking target file for previous results: $targetfile\n"
if ( $verbose );
next if ( ! -f "$targetfile" );
# Open result file and remove any targets that are included
open IN, "$targetfile" || next;
foreach my $line (<IN>) {
next if ( $line =~ /^YEAR\.DAY/ );
my ($year,$day) = $line =~ /^(\d+).(\d+)/;
# Delete this target
delete $targetdays{"$loc.$chan.$year.$day"};
}
close IN;
}
if ( $verbose > 1 ) {
print "Remaining target days:\n";
my $count = 0;
foreach my $tday (sort keys %targetdays) {
print "Target day: $tday => $targetdays{$tday}\n";
$count++;
}
print "Remaining Targets: $count\n";
}
my %targetfiles = ();
# Calculate and store PDF mode for each target day
TARGET: foreach my $tday (sort keys %targetdays) {
my ($loc,$chan,$year,$day) = split (/\./, $tday);
my %power = ();
my %count = ();
my #period = ();
# Determine sampling rate
my $samprate = GetSampRate ($tnet,$tsta,$loc,$chan);
print "Samplerate for $tnet $tsta $loc $chan is: $samprate\n" if (
$verbose );
if ( ! defined $samprate ) {
if ( ($tsta eq "ECSD") || ($tsta eq "SFJ") || ($tsta eq "CASEE") ||
($tsta eq "JSC") ){
next;
}
else {
print "Cannot determine sample rate for channel
$tnet.$tsta.$loc.$chan\n";
next;
}
}
This is the subroutine GetSampRate:
sub GetSampRate { # GetSampRate (net,sta,loc,chan)
my $net = shift;
my $sta = shift;
my $loc = shift;
my $chan = shift;
my $samprate = undef;
# Generate source name: Net_Sta_Loc_Chan
my $srcname = "${net}_${sta}_";
$srcname .= ($loc eq "--") ? "_" : "${loc}_";
$srcname .= "$chan";
if ( $#metafilecache < 0 ) {
my $metafile = "$stadir/metadata.txt";
if ( ! -f "$metafile" ) {
print "GetSampRate(): Cannot find metadata file: $metafile\n";
return undef;
}
# Open metadata file
if ( ! (open MF, "<$metafile") ) {
print "GetSampRate(): Cannot open: $metafile\n";
return undef;
}
# Read all lines in the metafilecache
#metafilecache = <MF>;
close MF;
}
# Read all lines starting with srcname into #lines
my #lines = grep { /^$srcname/ } #metafilecache;
# Find maximum of sample rates for this channel
foreach my $line ( #lines ) {
my #fields = split(/\t/, $line);
my $rate = $fields[7];
$samprate = $rate if (!defined $samprate || $rate > $samprate);
}
return $samprate;
}
The code you have shown is very clunky.
As far this scope is concerned, the file is called $stadir/metadata.txt and I can't help with $stadir as it's either undefined or a global value that is set elsewhere — not a great design idea
After that, #metafilecache = <MF> loads the entire file into the array #metafilecache, leaving a trailing newline character at the end of each element
Then my #lines = grep { /^$srcname/ } #metafilecache duplicates to #lines all lines beginning with the string held in $srcname. This is another global variable that shouldn't be used
The following for loop splits the line on tab ("\t" or "\x09") separators and sets $rate to the eighth value ($fields[7]). $samprate is updated at each iteration if the latest value of $rate is greater than the current stored maximum
I hope that helps

formatting output data to an excel file

This program gets numeric values from the web for each of the values in my #values array
I want these values to be printed out in a table which looks like
il9 il8 il7
2012 v1 b1
2011 v2 b2
2010 v3 b3
.
.
2000 v12 b12
where v1 .. v12 are values for the first variable in #values etc. here is my program please help me structure it. Is there an escape character that could take me back to the first line of the program in perl
thanks
#!/usr/bin/perl -w
use strict;
use LWP::UserAgent;
use URI;
my $browser = LWP::UserAgent->new;
$browser->timeout(10);
$browser->env_proxy;
open(OUT, ">out");
my $i = 2013;
while ($i-- > 2000){print OUT "$i\n"}
my $a = 2013 ;
my $base = 'http://webtools.mf.uni-lj.si/public/summarisenumbers.php';
my #values = ('il9', 'il8', 'il6' );
foreach my $value (#values) {
print OUT "$value \n"
while ($a-- > 2000){
my $b = $a + 1;
my $c = $b + 1;
my $query = '?query=('.$value.')'.$a.'[dp] NOT '.$b.'[dp] NOT '.$c.'[dp]';
my $add = $base.$query;
#my $url = URI->new($add);
#my $response = $browser->get($url);
#if($response->is_success) {print OUT $response->decoded_content;}
#else {die $response->status_line};
print OUT "$query\n";
} $a = 2013; print OUT
}
close(OUT);
Pay more attention to formatting/indentation and variable naming - it will help you a lot.
#!/usr/bin/perl
use strict;
use warnings;
use LWP::UserAgent;
my $base_url = 'http://webtools.mf.uni-lj.si/public/summarisenumbers.php';
my #values = ( 'il9', 'il8', 'il6' );
my $stat_data = {};
my $browser = LWP::UserAgent->new;
$browser->timeout(10);
$browser->env_proxy;
for my $value ( #values ) {
for my $year ( 2010 .. 2013 ) {
my $query = '?query=(' . $value . ')' . $year .'[dp] NOT ' . ($year+1) . '[dp] NOT ' . ($year+2) .'[dp]';
my $url = "$base_url$query";
my $response = $browser->get( $url );
if( $response->is_success ) {
## store the fetched content in a hash structure
$stat_data->{$year}->{$value} = $response->decoded_content;
}
else {
die $response->status_line;
}
}
}
## print the header
print "\t", join( "\t", #values ), "\n";
## print the data by the years in reverse order
for my $year ( reverse sort keys %{$stat_data} ) {
print $year;
for my $value ( #values ) {
print "\t", $stat_data->{$year}->{$value};
}
print "\n";
}

Why does perl "hash of lists" do this?

I have a hash of lists that is not getting populated.
I checked that the block at the end that adds to the hash is in fact being called on input. It should either add a singleton list if the key doesn't exist, or else push to the back of the list (referenced under the right key) if it does.
I understand that the GOTO is ugly, but I've commented it out and it has no effect.
The problem is that when printhits is called, nothing is printed, as if there are no values in the hash. I also tried each (%genomehits), no dice.
THANKS!
#!/usr/bin/perl
use strict;
use warnings;
my $len = 11; # resolution of the peaks
#$ARGV[0] is input file
#$ARGV[1] is call number
# optional -s = spread number from call
# optional -o specify output file name
my $usage = "see arguments";
my $input = shift #ARGV or die $usage;
my $call = shift #ARGV or die $usage;
my $therest = join(" ",#ARGV) . " ";
print "the rest".$therest."\n";
my $spread = 1;
my $output = $input . ".out";
if ($therest =~ /-s\s+(\d+)\s/) {$spread = $1;}
if ($therest =~ /-o\s+(.+)\s/) {$output = $1;}
# initialize master hash
my %genomehits = ();
foreach (split ';', $input) {
my $mygenename = "err_naming";
if ($_ =~ /^(.+)-/) {$mygenename = $1;}
open (INPUT, $_);
my #wiggle = <INPUT>;
&singlegene(\%genomehits, \#wiggle, $mygenename);
close (INPUT);
}
&printhits;
#print %genomehits;
sub printhits {
foreach my $key (%genomehits) {
print "key: $key , values: ";
foreach (#{$genomehits{$key}}) {
print $_ . ";";
}
print "\n";
}
}
sub singlegene {
# let %hash be the mapping hash
# let #mygene be the gene to currently process
# let $mygenename be the name of the gene to currently process
my (%hash) = %{$_[0]};
my (#mygene) = #{$_[1]};
my $mygenename = $_[2];
my $chromosome;
my $leftbound = -2;
my $rightbound = -2;
foreach (#mygene) {
#print "Doing line ". $_ . "\n";
if ($_ =~ "track" or $_ =~ "output" or $_ =~ "#") {next;}
if ($_ =~ "Step") {
if ($_ =~ /chrom=(.+)\s/) {$chromosome = $1;}
if ($_ =~ /span=(\d+)/) {$1 == 1 or die ("don't support span not equal to one, see wig spec")};
$leftbound = -2;
$rightbound = -2;
next;
}
my #line = split /\t/, $_;
my $pos = $line[0];
my $val = $line[-1];
# above threshold for a call
if ($val >= $call) {
# start of range
if ($rightbound != ($pos - 1)) {
$leftbound = $pos;
$rightbound = $pos;
}
# middle of range, increment rightbound
else {
$rightbound = $pos;
}
if (\$_ =~ $mygene[-1]) {goto FORTHELASTONE;}
}
# else reinitialize: not a call
else {
FORTHELASTONE:
# typical case, in an ocean of OFFs
if ($rightbound != ($pos-1)) {
$leftbound = $pos;
}
else {
# register the range
my $range = $rightbound - $leftbound;
for ($spread) {
$leftbound -= $len;
$rightbound += $len;
}
#print $range . "\n";
foreach ($leftbound .. $rightbound) {
my $key = "$chromosome:$_";
if (not defined $hash{$key}) {
$hash{$key} = [$mygenename];
}
else { push #{$hash{$key}}, $mygenename; }
}
}
}
}
}
You are passing a reference to %genomehits to the function singlegene, and then copying it into a new hash when you do my (%hash) = %{$_[0]};. You then add values to %hash which goes away at the end of the function.
To fix it, use the reference directly with arrow notation. E.g.
my $hash = $_[0];
...
$hash->{$key} = yadda yadda;
I think it's this line:
my (%hash) = %{$_[0]};
You're passing in a reference, but this statement is making a copy of your hash. All additions you make in singlegene are then lost when you return.
Leave it as a hash reference and it should work.
PS - Data::Dumper is your friend when large data structures are not behaving as expected. I'd sprinkle a few of these in your code...
use Data::Dumper; print Dumper \%genomehash;