Correct use of Perl "exists" - perl

I have two files. The first two columns in both are chromosome loci and genotypes, for instance chr1:1736464585 and T/G.
I have put the first two columns into a hash. I want to check whether the hash key (the chromosome locus) exists in the second file.
I have written this Perl program and have tried many variations but I'm not sure if I'm using exists correctly: it gives the error exists is not an HASH or ARRAY element or a subroutine.
#!/usr/bin/perl
use strict;
use warnings;
my $output = "annotated.txt";
open( O, ">>$output" );
my $filename = "datatest.txt";
my $filename2 = "MP2.txt";
chomp $filename;
chomp $filename2;
my %hash1 = ();
open( FN1, $filename ) or die "Can't open $filename: $!";
my #lines = <FN1>;
foreach my $line (#lines) {
my #split = split /\t/, $line;
if ( $line =~ /^chr/ ) {
my ( $key, $value ) = ( $split[0], $split[1] );
$hash1{$key} = $value;
}
}
my $DATA;
open( $DATA, $filename2 ) or die $!;
my #lines2 = <$DATA>;
foreach my $line2 (#lines2) {
my #split2 = split /\t/, $line2;
if ( $line2 =~ /^chr/ ) {
if ( exists %hash1{$key} ) {
print "$line2\n";
}
}
}

The syntax of the following line is incorrect:
if (exists %hash1{$key}) { ... }
This should be:
if (exists $hash1{$key}) { ... }

Related

how to display the hash value from my sample data

I'm learning perl at the moment, i wanted to ask help to answer this exercise.
My objective is to display the hash value of PartID 1,2,3
the sample output is displaying lot, wafer, program, version, testnames, testnumbers, hilimit, lolimit and partid values only.
sample data
lot=lot123
wafer=1
program=prgtest
version=1
Testnames,T1,T2,T3
Testnumbers,1,2,3
Hilimit,5,6,7
Lolimit,1,2,3
PartID,,,,
1,3,0,5
2,4,3,2
3,5,6,3
This is my code:
#!/usr/bin/perl
use strict;
use Getopt::Long;
my $file = "";
GetOptions ("infile=s" => \$file ) or die("Error in command line arguments\n");
my $lotid = "";
open(DATA, $file) or die "Couldn't open file $file";
while(my $line = <DATA>) {
#print "$line";
if ( $line =~ /^lot=/ ) {
#print "$line \n";
my ($dump, $lotid) = split /=/, $line;
print "$lotid\n";
}
elsif ($line =~ /^program=/ ) {
my ($dump, $progid) = split /=/, $line;
print "$progid \n";
}
elsif ($line =~ /^wafer=/ ) {
my ($dump, $waferid) = split /=/, $line;
print "$waferid \n";
}
elsif ($line =~ /^version=/ ) {
my ($dump, $verid) = split /=/, $line;
print "$verid \n";
}
elsif ($line =~ /^testnames/i) {
my ($dump, #arr) = split /\,/, $line;
foreach my $e (#arr) {
print $e, "\n";
}
}
elsif ($line =~ /^testnumbers/i) {
my ($dump, #arr1) = split /\,/, $line;
foreach my $e1 (#arr1) {
print $e1, "\n";
}
}
elsif ($line =~ /^hilimit/i) {
my ($dump, #arr2) = split /\,/, $line;
foreach my $e2 (#arr2) {
print $e2, "\n";
}
}
elsif ($line =~ /^lolimit/i) {
my ($dump, #arr3) = split /\,/, $line;
foreach my $e3 (#arr3) {
print $e3, "\n";
}
}
}
Kindly help add to my code to display Partid 1,2,3 hash.
So I've rewritten your code a little to use a few more modern Perl idioms (along with some comments to explain what I've done). The bit I've added is near the bottom.
#!/usr/bin/perl
use strict;
# Added 'warnings' which you should always use
use warnings;
# Use say() instead of print()
use feature 'say';
use Getopt::Long;
my $file = "";
GetOptions ("infile=s" => \$file)
or die ("Error in command line arguments\n");
# Use a lexical variable for a filehandle.
# Use the (safer) 3-argument version of open().
# Add $! to the error message.
open(my $fh, '<', $file) or die "Couldn't open file $file: $!";
# Read each record into $_ - which makes the following code simpler
while (<$fh>) {
# Match on $_
if ( /^lot=/ ) {
# Use "undef" instead of a $dump variable.
# split() works on $_ by default.
my (undef, $lotid) = split /=/;
# Use say() instead of print() - less punctuation :-)
say $lotid;
}
elsif ( /^program=/ ) {
my (undef, $progid) = split /=/;
say $progid;
}
elsif ( /^wafer=/ ) {
my (undef, $waferid) = split /=/;
say $waferid;
}
elsif ( /^version=/ ) {
my (undef, $verid) = split /=/;
say $verid;
}
elsif ( /^testnames/i) {
my (undef, #arr) = split /\,/;
# Changed all of these similar pieces of code
# to use the same variable names. As they are
# defined in different code blocks, they are
# completely separate variables.
foreach my $e (#arr) {
say $e;
}
}
elsif ( /^testnumbers/i) {
my (undef, #arr) = split /\,/;
foreach my $e (#arr) {
say $e;
}
}
elsif ( /^hilimit/i) {
my (undef, #arr) = split /\,/;
foreach my $e (#arr) {
say $e;
}
}
elsif ( /^lolimit/i) {
my (undef, #arr) = split /\,/;
foreach my $e (#arr) {
say $e;
}
}
# And here's the new bit.
# If we're on the "partid" line, then read the next
# three lines, split each one and print the first
# element from the list returned by split().
elsif ( /^partid/i) {
say +(split /,/, <$fh>)[0] for 1 .. 3;
}
}
Update: By the way, there are no hashes anywhere in this code :-)
Update 2: I've just realised that you only have three different ways to process the data. So you can simplify your code drastically by using slightly more complex regexes.
#!/usr/bin/perl
use strict;
use warnings;
use feature 'say';
use Getopt::Long;
my $file = "";
GetOptions ("infile=s" => \$file)
or die ("Error in command line arguments\n");
open(my $fh, '<', $file) or die "Couldn't open file $file: $!";
while (<$fh>) {
# Single value - just print it.
if ( /^(?:lot|program|wafer|version)=/ ) {
my (undef, $value) = split /=/;
say $value;
}
# List of values - split and print.
elsif ( /^(?:testnames|testnumbers|hilimit|lolimit)/i) {
my (undef, #arr) = split /\,/;
foreach my $e (#arr) {
say $e;
}
}
# Extract values from following lines.
elsif ( /^partid/i) {
say +(split /,/, <$fh>)[0] for 1 .. 3;
}
}

Duplicate values in column

I have a original file which has following columns,
02-May-2018,AAPL,Sell,0.25,1000
02-May-2018,C,Sell,0.25,2000
02-May-2018,JPM,Sell,0.25,3000
02-May-2018,WFC,Sell,0.25,5000
02-May-2018,AAPL,Sell,0.25,7000
02-May-2018,GOOG,Sell,0.25,8000
02-May-2018,GOOG,Sell,0.25,9000
02-May-2018,C,Sell,0.25,2000
02-May-2018,AAPL,Sell,0.25,3000
I am trying to print this original line if I see value in the second column more then 2 times.. for example, if I see AAPL more then 2 times desired result should print
02-May-2018,AAPL,Sell,0.25,1000
02-May-2018,AAPL,Sell,0.25,7000
02-May-2018,AAPL,Sell,0.25,3000
So Far, I have written the following which prints results multiple times which is wrong.. can you please help on what I am doing wrong?
open (FILE, "<$TMPFILE") or die "Could not open $TMPFILE";
open (OUT, ">$TMPFILE1") or die "Could not open $TMPFILE1";
%count = ();
#symbol = ();
while ($line = <FILE>)
{
chomp $line;
(#data) = split(/,/,$line);
$count{$data[1]}++;
#keys = sort {$count{$a} cmp $count{$b}} keys %count;
for my $key (#keys)
{
if ( $count{$key} > 2 )
{
print "$line\n";
}
}
}
I'd do it something like this - store lines you've seen in a 'buffer' and print them out again if the condition is hit (before continuing to print as you go):
#!/usr/bin/env perl
use strict;
use warnings;
my %buffer;
my %count_of;
while ( my $line = <> ) {
my ( $date, $ticker, #values ) = split /,/, $line;
#increment the count
$count_of{$ticker}++;
if ( $count_of{$ticker} < 3 ) {
#count limit not hit, so stash the current line in the buffer.
$buffer{$ticker} .= $line;
next;
}
#print the buffer if the count has been hit
if ( $count_of{$ticker} == 3 ) {
print $buffer{$ticker};
}
#only gets to here once the limit is hit, so just print normally.
print $line;
}
With your input data, this outputs:
02-May-2018,AAPL,Sell,0.25,1000
02-May-2018,AAPL,Sell,0.25,7000
02-May-2018,AAPL,Sell,0.25,3000
Simple answer:
push #{ $lines{(split",")[1]} }, $_ while <>;
print #{ $lines{$_} } for grep #{ $lines{$_} } > 2, sort keys %lines;
perl program.pl inputfile > outputfile
You need to read the input file twice, because you don't know the final counts until you get to the end of the file
use strict;
use warnings 'all';
my ($TMPFILE, $TMPFILE1) = qw/ infile outfile /;
my %counts;
{
open my $fh, '<', $TMPFILE or die "Could not open $TMPFILE: $!";
while ( <$fh> ) {
my #fields = split /,/;
++$counts{$fields[1]};
}
}
open my $fh, '<', $TMPFILE or die "Could not open $TMPFILE: $!";
open my $out_fh, '>', $TMPFILE1 or die "Could not open $TMPFILE1: $!";
while ( <$fh> ) {
my #fields = split /,/;
print $out_fh $_ if $counts{$fields[1]} > 2;
}
output
02-May-2018,AAPL,Sell,0.25,1000
02-May-2018,AAPL,Sell,0.25,7000
02-May-2018,AAPL,Sell,0.25,3000
This should work:
use strict;
use warnings;
open (FILE, "<$TMPFILE") or die "Could not open $TMPFILE";
open (OUT, ">$TMPFILE1") or die "Could not open $TMPFILE1";
my %data;
while ( my $line = <FILE> ) {
chomp $line;
my #line = split /,/, $line;
push(#{$data{$line[1]}}, $line);
}
foreach my $key (keys %data) {
if(#{$data{$key}} > 2) {
print "$_\n" foreach #{$data{$key}};
}
}

How do i pattern match and keep writing to new file until another pattern match

My goal is to find and print all the lines in a "big.v" file starting from pattern match "module" until "endmodule" into individual files.
big.v: module test;
<bunch of code>
endmodule
module foo;
<bunch of code>
endmodule
And the individual files would look like:
test.v : module test;
..
endmodule
foo.v: module test1;
..
endmodule
I got most of it working using:
use strict;
use warnings;
#open(my $fh, ">", $f1) || die "Couldn't open '".$f."' for writing because: ".$!;
while (<>) {
my $line = $_;
if ($line =~ /(module)(\s+)(\w+)(.*)/) {
my $modname = $3;
open(my $fh1, ">", $modname.".v") ;
print $fh1 $line."\n";
## how do i keep writing next lines to this file until following pattern
if ($line =~ /(endmodule)(\s+)(.*)/) { close $fh1;}
}
}
Thanks,
There's a useful perl construct called the 'range operator':
http://perldoc.perl.org/perlop.html#Range-Operators
It works like this:
while ( <$file> ) {
if ( m/startpattern/ .. m/endpattern/ ) {
print;
}
}
So given your example - I think this should do the trick:
my $output;
while ( my $line = <STDIN> ) {
if ( $line =~ m/module/ .. m/endmodule/ ) {
my ( $modname ) = ( $line =~ m/module\s+(\w+)/ );
if ( defined $modname) {
open ( $output, ">", "$modname.v" ) or warn $!;
}
print {$output} $line;
}
}
Edit: But given your source data - you don't actually need to use a range operator I don't think. You could just close/reopen new 'output' files as you go. This assumes that you could 'cut up' your file based on 'module' lines, which isn't necessarily a valid assumption.
But sort of more like this:
use strict;
use warnings;
open ( my $input, "<", "big.v" ) or die $!;
my $output;
while ( my $line = <$input> ) {
if ( $line =~ m/^\s*module/ ) {
#start of module line found
#close filehandle if it's open
close($output) if defined $output;
#extract the module name from the line.
my ($modulename) = ( $line =~ m/module\s+(\w+)/ );
#open new output file (overwriting)
open( $output, ">", "$modulename.v" ) or warn $!;
}
#this test might not be necessary.
if ( defined $output ) {
print {$output} $line;
}
}

Add counter to if statement

How can I add a counter to this statement.
# go through each reference file
for my $file (#reference_files)
{
open my $ref, "<", $file or die "Can't open reference file '$file': $!";
while (my $line = <$ref>)
{
chomp $line;
my ($scaffold, undef, $type, $org_snp, $new_snp, undef, undef, undef, $info) = split /\t/, $line;
next if not $scaffold =~ /^KB/;
next if not $type =~ /^GENE/i;
my ($transcript_id, $gene_name, $auto) = split /[;][ ]/, $info;
$gene_name = $1 if $gene_name =~ /["]([^"]*)["]/;
if (my $matching_genes = $genes{$scaffold})
{
say join "\t", $gene_name, $_ for values %$matching_genes;
}
}
say "###";
}
I would like the script to additionally count all $matching_genes. Is there a way to incorporate this? I've been unsuccessful with standard counters (i.e. $i++) as it's pulling all values in the hash.
You can have a global counter variable on the top intialized to 0 before your for loop, say:
my $counter = 0;
# go through each reference file
for my $file (#reference_files)
# ... Rest of your code ...
Then, you can increment $counter inside of the if statement where $matching_genes is assigned:
if (my $matching_genes = $genes{$scaffold})
{
$counter++;
say join "\t", $gene_name, $_ for values %$matching_genes;
}
my $count=0;
# go through each reference file
for my $file (#reference_files)
{
open my $ref, "<", $file or die "Can't open reference file '$file': $!";
while (my $line = <$ref>)
{
chomp $line;
my ($scaffold, undef, $type, $org_snp, $new_snp, undef, undef, undef, $info) = split /\t/, $line;
next if not $scaffold =~ /^KB/;
next if not $type =~ /^GENE/i;
my ($transcript_id, $gene_name, $auto) = split /[;][ ]/, $info;
$gene_name = $1 if $gene_name =~ /["]([^"]*)["]/;
if (my $matching_genes = $genes{$scaffold})
{
say join "\t", $gene_name, $_ for values %$matching_genes;
$count =+ scalar(keys %$matching_genes);
}
}
say "###";
}
print "total: $count\n";

Extracting multiple lines of record/data using a subroutine or functions

Can you show me how to create a subroutine or function using this code?
Basically I want to make my code into a subroutine so I'll be able to re-use it without making my script too long.
Here is my script:
#!/usr/local/bin/perl
use strict;
use warnings;
use Data::Dumper;
use Carp qw(croak);
my #fields;
my ($tmp_var, $rec_type, $country, $header, $Combline, $records, $line);
my $filename = 'data5.txt';
open (my $input_fh, '<', $filename ) or croak "Can't open $filename: $!";
open my $OUTPUTA, ">", 'drp1.txt' or die $!;
open my $OUTPUTB, ">", 'drp2.txt' or die $!;
while (<$input_fh>) {
$line = _trim($_);
#fields = split (/\|/, $line);
$rec_type = $fields[0];
$country = $fields[1];
my $string = substr $fields[1], 0, 1;
$header = $line if(/^INVHDR/);
if ($rec_type eq 'INVDET') {
if ($string eq 'I') {
$records = $header . $line;
print $OUTPUTA $records, scalar <$input_fh>;
}
else {
$records = $header . $line;
print $OUTPUTB $records, scalar <$input_fh>;
}
}
}
close $OUTPUTA or die $!;
close $OUTPUTB or die $!;
sub _trim {
my $word = shift;
if ( $word ) {
$word =~ s/\s*\|/\|/g; #remove trailing spaces
$word =~ s/"//g; #remove double quotes
}
return $word;
}
This is the part of the script that I wanted to put in a subroutine or function:
$line = _trim($_);
#fields = split (/\|/, $line);
$rec_type = $fields[0];
$country = $fields[1];
my $string = substr $fields[1], 0, 1;
$header = $line if (/^INVHDR/);
if ($rec_type eq 'INVDET') {
if ($string eq 'I') {
$records = $header . $line;
print $OUTPUTA $records, scalar <$input_fh>;
}
else {
$records = $header . $line;
print $OUTPUTB $records, scalar <$input_fh>;
}
}
I would suggest breaking it out a little differently and expand on your _trim function, turning it into a parse function:
use strict;
use warnings;
open( my $input_fh, '<', 'data5.txt' ) or die "Can't open $filename: $!";
open( my $OUTPUTA, '>', 'drp1.txt' ) or die $!;
open( my $OUTPUTB, '>', 'drp2.txt' ) or die $!;
my $header = '';
while (<$input_fh>) {
if ($_ =~ /^INVHDR/) {
$header = $_;
}
if ($_ =~ /^INVDET/) {
my #data = parse($_);
my $line = $header . join('|', #data);
# scalar <$input_fh> is almost certainly not doing what you expect,
# though I'm not sure what you're try to accomplish with it
if ( $data[1] =~ /^I/ ) {
print $OUTPUTA $line;
} else {
print $OUTPUTB $line;
}
}
}
sub parse {
my $input = shift || return;
my $input =~ s/"//g; # remove double quotes
# Here I've combined the removal of trailing spaces with the split.
my #fields = split( m{\s*\|}, $input );
return #fields;
}