I'm trying to use a foreach loop to loop through an array and then use a nested while loop to loop through each line of a text file to see if the array element matches a line of text; if so then I push data from that line into a new array to perform calculations.
The outer foreach loop appears to be working correctly (based on printed results with each array element) but the inner while loop is not looping (same data pushed into array each time).
Any advice?
The code is below
#! /usr/bin/perl -T
use CGI qw(:cgi-lib :standard);
# Emit the CGI header, then print mean / standard deviation / coefficient of
# variation for every whitespace-separated term of the "sequence" parameter.
print "Content-type: text/html\n\n";
my $input = param('sequence');
my $meanexpfile = "final_expression_complete.txt";
open(FILE, $meanexpfile) or print "unable to open file";
my @meanmatches;
@regex = (split /\s/, $input);
foreach $regex (@regex) {
    # NOTE: FILE is read to end-of-file during the first pass of the outer
    # loop and is never rewound, so later passes read nothing and
    # @meanmatches keeps the first term's data. This is the behaviour the
    # question asks about and is left exactly as posted.
    while (my $line = <FILE>) {
        # Capture everything after "<term><whitespace>" on a matching line.
        if ( $line =~ m/$regex\s(.+\n)/i ) {
            push(@meanmatches, $1);
        }
    }
    my $average = average(@meanmatches);
    my $std_dev = std_dev($average, @meanmatches);
    my $average_round = sprintf("%0.4f", $average);
    my $stdev_round = sprintf("%0.4f", $std_dev);
    my $coefficient_of_variation = $stdev_round / $average_round;
    my $cv_round = sprintf("%0.4f", $coefficient_of_variation);
    print font(
{ color => "blue" }, "<br><B>$regex average: $average_round
 Standard deviation: $stdev_round Coefficient of
variation(Cv): $cv_round</B>"
    );
}
# Arithmetic mean of the numbers passed in.
# Returns 0 for an empty list so callers never divide by zero here.
sub average {
    my (@values) = @_;
    my $count = scalar @values;
    my $total = 0;
    $total += $_ for @values;
    return $count ? $total / $count : 0;
}
# Population standard deviation of @values around a precomputed mean.
# Arguments: ($average, @values). Returns 0 when @values is empty.
sub std_dev {
    my ($average, @values) = @_;
    my $count = scalar @values;
    my $std_dev_sum = 0;
    # Sum of squared deviations from the supplied mean.
    $std_dev_sum += ($_ - $average)**2 for @values;
    return $count ? sqrt($std_dev_sum / $count) : 0;
}
Yes, my advice would be:
Turn on strict and warnings.
perltidy your code,
use 3 argument open: open ( my $inputfile, "<", 'final_expression.txt' );
die if it doesn't open - the rest of your program is irrelevant.
chomp $line
You are iterating over your filehandle, but once the first pass of the foreach loop has read it to the end, every later pass starts at end-of-file, so the inner while loop becomes a no-op. The simplest fix is to read the file into an array first: my @lines = <FILE>;
So with that in mind:
#!/usr/bin/perl -T
use strict;
use warnings;
use CGI qw(:cgi-lib :standard);
# Emit the CGI header, load the expression file once, then print mean /
# standard deviation / coefficient of variation for every whitespace-separated
# term of the "sequence" parameter.
print "Content-type: text/html\n\n";
my $input = param('sequence');
my $meanexpfile = "final_expression_complete.txt";
open( my $input_file, "<", $meanexpfile ) or die "unable to open file";
my @regex = ( split /\s/, defined $input ? $input : '' );
my @lines = <$input_file>;
chomp(@lines);
close($input_file) or warn $!;
foreach my $regex (@regex) {
    # Start from an empty list for every term; otherwise each report would
    # also include the values matched for all previous terms.
    my @meanmatches;
    foreach my $line (@lines) {
        # The lines are chomped, so the capture must not require a trailing
        # newline (the old "(.+\n)" could never match after chomp).
        if ( $line =~ m/$regex\s(.+)/i ) {
            push( @meanmatches, $1 );
        }
    }
    my $average = average(@meanmatches);
    my $std_dev = std_dev( $average, @meanmatches );
    my $average_round = sprintf( "%0.4f", $average );
    my $stdev_round = sprintf( "%0.4f", $std_dev );
    # A term with no matches (or a zero mean) has no meaningful Cv; report 0
    # instead of dying with "Illegal division by zero".
    my $coefficient_of_variation =
        $average_round != 0 ? $stdev_round / $average_round : 0;
    my $cv_round = sprintf( "%0.4f", $coefficient_of_variation );
    print font(
{ color => "blue" }, "<br><B>$regex average: $average_round
 Standard deviation: $stdev_round Coefficient of
variation(Cv): $cv_round</B>"
    );
}
# Arithmetic mean of the numbers passed in; 0 for an empty list.
sub average {
    my (@values) = @_;
    my $count = scalar @values;
    my $total = 0;
    $total += $_ for @values;
    return $count ? $total / $count : 0;
}
# Population standard deviation of @values around the supplied mean.
# Arguments: ($average, @values). Returns 0 when @values is empty.
sub std_dev {
    my ( $average, @values ) = @_;
    my $count = scalar @values;
    my $std_dev_sum = 0;
    # Accumulate the squared deviations from the mean.
    $std_dev_sum += ( $_ - $average )**2 for @values;
    return $count ? sqrt( $std_dev_sum / $count ) : 0;
}
The problem here is that, starting from the second iteration of the foreach loop, you are trying to read from a file handle that has already been read to the end. You need to rewind it to the beginning to read it again:
foreach $regex (@regex) {
    # Rewind FILE to offset 0 so every term scans the file from the start.
    seek FILE, 0, 0;
    while ( my $line = <FILE> ) {
However, that is not very efficient. Why read the file several times at all when you can read it once, before the foreach loop starts, and then iterate over the list of lines:
# Read every line of FILE into memory once ...
my @lines;
while (<FILE>) {
    push (@lines, $_);
}
# ... then scan the in-memory lines for each term.
foreach $regex (@regex) {
    foreach $line (@lines) {
With the lines held in an array, you might also want to consider using grep instead of the inner loop.
Related
I have written the code below to read a file and store data in the array @s_arr.
But when I try to print the @s_arr array outside the loop, it shows nothing.
use Data::Dumper;
my @s_arr;
my @err;
my %sort_h_1;
$fname = '/qv/Error.log';
open( IN, "<$fname" );
# Collect every error code of the form CODE+XXXnnnnn as "XXXnnnnn".
foreach $line ( <IN> ) {
    if ( $line =~ /CODE\+(\w{3})(\d{5})/ ) {
        $a = "$1$2";
        push @err, $a;
    }
}
close IN;
$prev = "";
$count = 0;
my %hash;
# Count runs of identical codes in the sorted list; a run is stored when the
# next different code is seen.
foreach ( sort @err ) {
    if ( $prev ne $_ ) {
        if ( $count ) {
            $hash{$prev} = $count;
        }
        $prev = $_;
        $count = 0;
    }
    $count++;
}
print Dumper \%hash;
printf( "%s:%d\n", $prev, $count ) if $count;
# The final run is still pending after the loop, so store it here.
$hash{$prev} = $count;
my $c = 0;
print "Today Error Count\n";
# Flatten (code, count) pairs into @s_arr, most frequent code first.
foreach my $name ( sort { $hash{$b} <=> $hash{$a} } keys %hash ) {
    #printf "%-8s %s\n", $name, $hash{$name};
    #my %sort_h ;
    push @s_arr, $name;
    push @s_arr, $hash{$name};
    #$sort_h{$name} = $hash{$name} ;
    #print Dumper \%sort_h ;
    #print Dumper \@s_arr ;
    $c++;
    # NOTE: exit ends the whole program, so once 30 codes have been pushed
    # the Dumper call after this loop is never reached. This is the behaviour
    # the question asks about and is left exactly as posted.
    if ( $c eq 30 ) {
        exit;
    }
}
print Dumper \@s_arr; # It's showing nothing
You are calling exit inside your foreach loop. That makes the whole program stop, so the print Dumper \@s_arr line is never reached.
To break out of a loop you need to use last.
foreach my $name ( sort ... ) {
# ...
$c++;
last if $c == 30; # break out of the loop when $c reaches 30
}
I used the postfix variant of if here because that makes it way easier to read. Also note that as zdim pointed out above, you should use the numerical equality check == when checking for numbers. eq is for strings.
I have code that parses 2000 CSV files and builds hashes from them.
The code runs well and fast until it has read ~100 files; thereafter it runs at a snail's pace.
Memory consumed is ~ 1.8 GB uncompressed
Goal is to build global hash %_hist from the csv files.
File sizes range between 20KB to 30 MB
OS is Mac with 12 GB RAM
64 bit perl 5.18
I have declared every variable in the functions with "my", expecting it to be released after the function exits.
The only persistent global variable is %_hist
Is there a way to improve performance?
# For every input file: read it into rows, reorder the columns of each row,
# and feed the resulting records to baselineHistRecords().
foreach my $file (@files){
    iLog ("Checking $file");
    $| = 1; #flush io
    return error("File $file doesn't exist") if not -e $file;
    my @records = readCSVFile($file); #reads csv file to 2d array and returns the array
    my @formatted_recs;
    foreach $rec ( @records ){
        my ($time,$c,$user_dst,$client,$ip_src,$first_seen,$last_seen,$first_seen_time,$last_seen_time,$device_ip,$country,$org,$user_agent) = @$rec;
        # Keep only the columns baselineHistRecords() unpacks, in its order.
        my @newrec = ($time,$c,$client,$first_seen,$last_seen,$ip_src,$user_agent,$device_ip,$country,$org);
        next if $time =~ /time/i; #Ignore first record
        push(@formatted_recs, \@newrec);
    }
    baselineHistRecords(@formatted_recs);
}
# Read a delimited text file into a list of rows.
#
# The separator is sniffed from the first line: ',' by default, ';' when the
# line contains ';"', and '|' when it contains a pipe. Every row -- the header
# included -- is returned as an array reference of fields, so callers can
# dereference each element the same way.
#
# Returns the list of row array references (empty for an empty file). If the
# file cannot be opened, returns whatever error() returns.
sub readCSVFile{
    my $file = shift;
    my @data;
    open my $fh, '<', $file or return error("Could not open $file: $!");
    my $line = <$fh>; #Read headerline
    if ( not defined $line ) {    # empty file: nothing to parse
        close $fh;
        return @data;
    }
    my $sep_char = ',';
    $sep_char = ';' if $line =~ /;"/;
    $sep_char = '|' if $line =~ /\|/;
    my $csv = Text::CSV->new({ sep_char => $sep_char });
    # Parse the header with the same CSV parser and store it as ONE row.
    # Splitting on /$sep_char/ treated '|' as regex alternation and pushed
    # the header fields as loose scalars among the row references.
    $line =~ s/\r?\n\z//;
    push @data, [ $csv->fields ] if $csv->parse($line);
    while ( my $row = $csv->getline( $fh ) ) {
        push @data, $row;
    }
    close $fh;
    return @data;
}
# Fold a batch of records into the global %_hist baseline and tag each
# observed value as rare / normal / most common.
#
# Arguments: a list of array references, each holding
#   ($time, $c, $client, $first_seen, $last_seen, $ip_src, $ua,
#    $device_ip, $country, $org).
# Side effects: updates %_hist, prints progress, and calls saveHistBaselines()
# when at least one record had a valid IPv4 source address.
# NOTE(review): the percentage/cluster pass runs only for the client of the
# LAST record in the batch -- presumably each batch holds one client; confirm.
sub baselineHistRecords{
    my @recs = @_;
    # Clear the same-named package globals (kept from the original; the
    # lexicals declared just below shadow the scalars from here on).
    undef $_ for ($time,$c,$client,$first_seen,$last_seen,$ip_src,$user_agent,$device_ip,$country,$org) ;
    undef $_ for (%device_count, %ua_count, %location_count, %org_count );
    my ($time,$c,$client,$first_seen,$last_seen,$ip_src,$device_ip,$country,$org) ;
    my $hits=0;
    my @suspicious_hits = ();
    foreach $rec (@recs){
        my $devtag=''; my $os = '';
        my @row = @{$rec};
        ($time,$c,$client,$first_seen,$last_seen,$ip_src,$ua,$device_ip,$country,$org) = @row;
        # Log the user agent that was actually unpacked ($ua); $user_agent was
        # never assigned in this function.
        veryverbose("\n$time,$c,$client,$first_seen,$last_seen,$ip_src,$ua,$device_ip,$country,$org");
        next if not is_ipv4($ip_src);
        ###### 1. Enrich IP
        my $org = getOrgForIP($ip_src);
        my ($country_code,$region,$city) = getGeoForIP($ip_src);
        my $isp = getISPForIP($ip_src);
        my $loc = join(" > ",($country_code, $region));
        my $city = join(" > ",($country_code, $region, $city));
        my $cidr = $ip_src; $cidr =~ s/\d+\.\d+$/0\.0\/16/; #Replace the last two octets with 0.0/16
        # my $packetmail = getPacketmailRep($ip_src);
        # push (@suspicious_hits, "$time $c $client $ip_src $ua / $packetmail") if $packetmail !~ /NOTFOUND/;
        ##### 2. SANITIZE
        $ua = cannonize($ua);
        # First parenthesised group of the user agent, parentheses included.
        $devtag = $1 if $ua =~ /(\([^\)]+\))/;
        @tokens = split(/;/, $devtag);
        $os = defined $tokens[0] ? $tokens[0] : '';
        $os =~ s/\+/ /g;$os =~ s/\(//g;$os =~ s/\)//g;
        $os = 'Android' if $os !~ /Android/i and $devtag =~ /Android/i;
        $os = "Windows NT" if $os =~ /compatible/i or $os =~ /Windows NT/i;
        $_hist{$client}{"isp"}{$isp}{c} += 1;
        $_hist{$client}{"os"}{$os}{c} += 1;
        $_hist{$client}{"ua"}{$ua}{c} += 1 if not is_empty ($ua);
        $_hist{$client}{"ua"}{c} += 1 if not is_empty ($ua); #An exception marked since all logs doesn't have UA values
        $_hist{$client}{"loc"}{$loc}{c} += 1;
        $_hist{$client}{"org"}{$org}{c} += 1;
        $_hist{$client}{"cidr"}{$cidr}{c} += 1;
        $_hist{$client}{"city"}{$city}{c} += 1;
        $_hist{$client}{"c"} += 1;
        $hits = $hits + 1;
        print "." if $hits%100==0;
        debug( "\n$ip_src : $os $loc $isp $org $ua: ".$_hist{$client}{"os"}{$os}{c} );
    }
    print "\nHITS: $hits";
    return if ($hits==0); #return if empty
    # print, not printf: the client name must not be interpreted as a format.
    print("\n######(( BASELINE for $client (".$_hist{$client}{c} ." records) ))#######################\n");
    foreach my $item (qw/os org isp loc ua cidr/){
        debug( sprintf ("\n\n--(( %s: %s ))-------------------------------- ",$client,uc($item)) );
        ## COMPUTE Usage Percent
        my $item_hist = $_hist{$client}{$item} ||= {};
        # Only per-value sub-hashes are values to rank. The same level also
        # holds scalar bookkeeping keys ("c" for ua; low/medium/high written
        # below on an earlier call), which must not be dereferenced as hashes.
        my @item_values = sort { $item_hist->{$b}{c} <=> $item_hist->{$a}{c} }
                          grep { ref $item_hist->{$_} eq 'HASH' } keys %{ $item_hist };
        my @cvalues = ();
        foreach my $key ( @item_values ){
            if ( is_empty($key) ) {
                debug ("Ignoring empty value"); # Ignoring Empty values
                next;
            }
            my $count = $item_hist->{$key}{c};
            my $total = $_hist{$client}{c};
            $total = $_hist{$client}{"ua"}{c} if $item =~ /^(?:ua|os)$/i and $_hist{$client}{"ua"}{c}; #Over for User_agent and OS determination as all logs doesn't have them
            next if not $total;    # nothing counted for this client: avoid division by zero
            my $pc = ceil(( $count / $total ) * 100) ;
            $item_hist->{$key}{p} = $pc ;
            push (@cvalues, $pc);
            #printf("\n%3d \% : %s",$pc,$key) if $pc>0;
        }
        ## COMPUTE Cluster Centers
        my @clustercenters = getClusterCenters(3,@cvalues);
        my ($low, $medium, $high) = @clustercenters;
        $_hist{$client}{$item}{low} = $low;
        $_hist{$client}{$item}{medium} = $medium;
        $_hist{$client}{$item}{high} = $high;
        my %tags = ( $low => "rare",
                     $medium => "normal",
                     $high =>"most common",
        );
        debug ("\n(Cluster Centers) : $low \t$medium \t $high\n");
        foreach my $key ( @item_values ){
            next if is_empty($key);
            my $pc = $_hist{$client}{$item}{$key}{p};
            $_hist{$client}{$item}{$key}{tag} = $tags{ closest($pc, @clustercenters) };
            # "%%" is the literal percent sign in a format string.
            debug( sprintf("\n%3d %% : %s : %s",$pc, $_hist{$client}{$item}{$key}{tag} , $key) );
        }
    }
    printf("\n\n###################################\n");
    saveHistBaselines();
}
Thanks,
Uma
This is more of a question for Code Review.
There is a lot of needless copying in the code. For example, why copy the data from @$rec into @newrec, and from $rec into @row? Why return a plain list of rows from readCSVFile instead of a reference?
You don't really need to read an entire file into memory and then process it - you can process the data line by line and discard each line immediately after you are done with it.
I can open one file in a directory and run the following code. However, when I try to use the same code on multiple files within a directory, I get an error regarding there not being a file.
I have tried to make sure that I am naming the files correctly, that they are in the right format, that they are located in my current working directory, and that things are referenced correctly.
I know a lot of people have had this error before and have posted similar questions, but any help would be appreciated.
Working code:
#!/usr/bin/perl
use warnings;
use strict;
use diagnostics;
use List::Util qw( min max );
my $RawSequence = loadSequence("LDTest.fasta");
my $windowSize = 38;
my $stepSize = 1;
my %hash;
my $s1;
my $s2;
my $dist;
# Slide a $windowSize-wide window over the raw sequence and remember, for each
# distance value, the most recent window that produced it.
for ( my $windowStart = 0; $windowStart <= 140; $windowStart += $stepSize ) {
    my $s1 = substr( $$RawSequence, $windowStart, $windowSize );
    my $s2 = 'CGGAGCTTTACGAGCCGTAGCCCAAACAGTTAATGTAG';
    # the 28 nt forward primer after the barcode plus the first 10 nt of the mtDNA sequence
    my $dist = levdist( $s1, $s2 );
    $hash{$dist} = $s1;
    #print "Distance between '$s1' and '$s2' is $dist\n";
}

# Edit distance between $seq1 and $seq2 with free end gaps: the result is the
# minimum over the last row and last column of the dynamic-programming table.
#
# Defined outside the loop on purpose. A named sub nested in the loop body
# captures only the first iteration's $s1/$s2 ("Variable will not stay
# shared"), so the lengths must come from the arguments.
sub levdist {
    my ( $seq1, $seq2 ) = (@_)[ 0, 1 ];
    my $l1 = length($seq1);
    my $l2 = length($seq2);
    my @s1 = split '', $seq1;
    my @s2 = split '', $seq2;
    my $distances;
    # First column / first row: cost of aligning a prefix against nothing.
    for ( my $i = 0; $i <= $l1; $i++ ) {
        $distances->[$i]->[0] = $i;
    }
    for ( my $j = 0; $j <= $l2; $j++ ) {
        $distances->[0]->[$j] = $j;
    }
    for ( my $i = 1; $i <= $l1; $i++ ) {
        for ( my $j = 1; $j <= $l2; $j++ ) {
            my $cost;
            if ( $s1[ $i - 1 ] eq $s2[ $j - 1 ] ) {
                $cost = 0;
            }
            else {
                $cost = 1;
            }
            # substitution/match, insertion, deletion
            $distances->[$i]->[$j] = minimum(
                $distances->[ $i - 1 ]->[ $j - 1 ] + $cost,
                $distances->[$i]->[ $j - 1 ] + 1,
                $distances->[ $i - 1 ]->[$j] + 1,
            );
        }
    }
    my $min_distance = $distances->[$l1]->[$l2];
    for ( my $i = 0; $i <= $l1; $i++ ) {
        $min_distance = minimum( $min_distance, $distances->[$i]->[$l2] );
    }
    for ( my $j = 0; $j <= $l2; $j++ ) {
        $min_distance = minimum( $min_distance, $distances->[$l1]->[$j] );
    }
    return $min_distance;
}
# Smallest of the numeric arguments (undef when called with none).
sub minimum {
    my $min = shift @_;
    foreach (@_) {
        if ( $_ < $min ) {
            $min = $_;
        }
    }
    return $min;
}
# Load a FASTA file and return a REFERENCE to one string holding all of its
# sequence lines joined together (lines starting with '>' are skipped).
# Dies with the system error text if the file cannot be opened.
sub loadSequence {
    my ($sequenceFile) = @_;
    my $sequence = "";
    # Lexical handle: a bareword FASTA handle was global and never closed.
    open( my $fasta, "<", $sequenceFile ) or die $!;
    while ( my $line = <$fasta> ) {
        chomp($line);
        if ( $line !~ /^>/ ) {
            $sequence .= $line;    #if the line doesn't start with > it is the sequence
        }
    }
    close($fasta);
    return \$sequence;
}
# The smallest distance seen is the first key in numeric order; its value is
# the window that produced it.
my @keys = sort { $a <=> $b } keys %hash;
my $BestMatch = @keys ? $hash{ $keys[0] } : undef;
# Only rewrite and report when a window was scored and the best one is close
# enough to the primer (distance below 8).
if ( @keys and $keys[0] < 8 ) {
    $$RawSequence =~ s/\Q$BestMatch\E/CGGAGCTTTACGAGCCGTAGCCCAAACAGTTAATGTAG/g;
    print ">|Forward|Distance_of_Best_Match: $keys[0] |Sequence_of_Best_Match: $BestMatch", "\n",
        "$$RawSequence", "\n";
}
Here is an abbreviated version of my non-working code. Things that haven't changed I didn't included:
Headers and Globals:
my $dir = ("/Users/roblogan/Documents/FakeFastaFiles");
my @ArrayofFiles = glob "$dir/*.fasta";
foreach my $file ( @ArrayofFiles ) {
    open( my $Opened, $file ) or die "can't open file: $!";
    # NOTE: $OpenedFile holds one LINE of the file, not a file name, yet it is
    # handed to loadSequence() as if it were a path. This is the behaviour the
    # question asks about and is left exactly as posted.
    while ( my $OpenedFile = <$Opened> ) {
        my $RawSequence = loadSequence($OpenedFile);
        for ( ... ) {
            ...;
            print
                ">|Forward|Distance_of_Best_Match: $keys[0] |Sequence_of_Best_Match: $BestMatch",
                "\n", "$$RawSequence", "\n";
        }
    }
}
The exact error is:
Uncaught exception from user code:
No such file or directory at ./levenshtein_for_directory.pl line 93, <$Opened> line 1.
main::loadSequence('{\rtf1\ansi\ansicpg1252\cocoartf1404\cocoasubrtf470\x{a}') called at ./levenshtein_for_directory.pl line 22
line 93:
89 sub loadSequence{
90 my ($sequenceFile) = #_;
91 my $sequence = "";
92 unless (open(FASTA, "<", $sequenceFile)){
93 die $!;
94 }
Line 22:
18 foreach my $file ( #ArrayofFiles ) {
19 open (my $Opened, $file) or die "can't open file: $!";
20 while (my $OpenedFile = <$Opened>) {
21
22 my $RawSequence = loadSequence($OpenedFile);
23
I just learned that "FASTA file" is an established term. I wasn't aware of that and previously thought these were ordinary files containing filenames or something similar. As @zdim already said, you're opening these files twice.
The following code gets a list of FASTA files (only the filenames) and then calls loadSequence with each filename. That subroutine opens the given file, concatenates all lines that do not start with > into one long string, and returns it.
# input: the NAME of a FASTA file
# return: all sequences in that file as one very long string
# input: the NAME of a FASTA file
# return: all sequences in that file as one very long string
# Dies with a message naming the file if it cannot be opened.
sub loadSequence
{
    my ($fasta_filename) = @_;
    my $sequence = "";
    open( my $fasta_fh, '<', $fasta_filename ) or die "Cannot open $fasta_filename: $!\n";
    while ( my $line = <$fasta_fh> ) {
        chomp($line);
        if ( $line !~ /^>/ ) {
            $sequence .= $line;    #if the line doesn't start with > it is the sequence
        }
    }
    close($fasta_fh);
    return $sequence;
}
# ...
my $dir = '/Users/roblogan/Documents/FakeFastaFiles';
# glob returns the matching file NAMES; each name is passed straight to
# loadSequence(), which does the opening and reading itself.
my @ArrayofFiles = glob "$dir/*.fasta";
foreach my $filename (@ArrayofFiles) {
    my $RawSequence = loadSequence($filename);
    # ...
}
You seem to be trying to open files twice. The line
my #ArrayofFiles = glob "$dir/*.fasta";
Gives you the list of files. Then
foreach my $file (#ArrayofFiles){
open (my $Opened, $file) or die "can't open file: $!";
while (my $OpenedFile = <$Opened>) {
my $RawSequence = loadSequence($OpenedFile);
# ...
does the following, line by line. It iterates through files, opens each, reads a line from it, and then submits that line to the function loadSequence().
However, in that function you attempt to open a file again
sub loadSequence{
my ($sequenceFile) = #_;
my $sequence = "";
unless (open(FASTA, "<", $sequenceFile)){
# ...
The $sequenceFile variable in the function is passed to the function as $OpenedFile -- which is a line in the file that is already opened and being read from, not the file name. While I am not certain about details of your code, the error you show seems to be consistent with this.
It may be that you are confusing the glob, which gives you the list of files, with the opendir which would indeed need a following readdir to access the files.
Try renaming $OpenedFile to, say, $line (which it is) and see how it looks then.
This is the program as it stands right now. It takes in a .fasta file (a file containing genetic code), builds a hash table from the data, and prints it; however, it is quite slow. It splits the sequence into five-character substrings and compares each one against every other five-character substring in the file.
use strict;
use warnings;
use Data::Dumper;
# Tally 5-character substrings of the sequence in the file named by $ARGV[0]
# and write the tally, plus the number of substrings counted exactly once, to
# output.txt.
# Number of command-line arguments; not used again in this script.
my $total = $#ARGV + 1;
my $row;
my $compare;
my %hash;
my $unique = 0;
open( my $f1, '<:encoding(UTF-8)', $ARGV[0] ) or die "Could not open file '$ARGV[0]' $!\n";
# The first line of the file is read and thrown away.
my $discard = <$f1>;
# Join every remaining line (newline removed) into one string.
while ( $row = <$f1> ) {
chomp $row;
$compare .= $row;
}
my $size = length($compare);
close $f1;
# Outer loop: start offsets 0 .. $size - 7, so the last two 5-character
# windows of the string are never visited.
for ( my $i = 0; $i < $size - 6; $i++ ) {
my $vs = ( substr( $compare, $i, 5 ) );
# Inner loop: the same offsets again; every window is compared with $vs,
# including the window at $i itself.
for ( my $j = 0; $j < $size - 6; $j++ ) {
# foreach over a one-element list: $value is simply the window at $j.
foreach my $value ( substr( $compare, $j, 5 ) ) {
if ( $value eq $vs ) {
if ( exists $hash{$value} ) {
$hash{$value} += 1;
} else {
$hash{$value} = 1;
}
}
}
}
}
# Count the substrings whose tally is exactly 1.
foreach my $val ( values %hash ) {
if ( $val == 1 ) {
$unique++;
}
}
my $OUTFILE;
open $OUTFILE, ">output.txt" or die "Error opening output.txt: $!\n";
print {$OUTFILE} "Number of unique keys: " . $unique . "\n";
print {$OUTFILE} Dumper( \%hash );
close $OUTFILE;
Thanks in advance for any help!
It is not clear from the description what is wanted from this script, but if you're looking for matching sets of 5 characters, you don't actually need to do any string matching: you can just run through the whole sequence and keep a tally of how many times each 5-letter sequence occurs.
use strict;
use warnings;
use Data::Dumper;
my $str = ''; # store the sequence here
my %hash;
# slurp in the whole file
open(my $in, '<:encoding(UTF-8)', $ARGV[0]) or die "Could not open file '$ARGV[0]' $!\n";
# The first line is the header, not sequence data; discard it as the
# question's script does, so it is not tallied as if it were sequence.
my $discard = <$in>;
while (my $line = <$in>) {
    chomp $line;
    $str .= $line;
}
close($in);
# not sure if you were deliberately omitting the last two letters of sequence
# this looks at all the sequence
my $l_size = length($str) - 4;
for (my $i = 0; $i < $l_size; $i++) {
    $hash{ substr($str, $i, 5) }++;
}
# grep in a scalar context will count the values.
my $unique = grep { $_ == 1 } values %hash;
# Three-argument open with a lexical handle; close is checked because
# buffered write errors only surface there.
open(my $out, '>', 'output.txt') or die "Error opening output.txt: $!\n";
print {$out} "Number of unique keys: ". $unique."\n";
print {$out} Dumper(\%hash);
close($out) or die "Error closing output.txt: $!\n";
It might help to remove searching for information that you already have.
I don't see that $j depends upon $i so you're actually matching values to themselves.
So you're getting bad counts as well. It works for 1, because 1 is the square of 1.
But if for each five-character string you're counting strings that match, you're going
to get the square of the actual number.
You would actually get better results if you did it this way:
# compute it once.
# Upper bound on window start offsets, computed a single time.
my $lim = length( $compare ) - 6;
for my $i ( 0 .. $lim - 1 ) {
    my $vs = substr( $compare, $i, 5 );
    # A substring that already has a tally was fully counted when it was
    # first met, so it is skipped here.
    next if $hash{ $vs };
    # First sighting: count this occurrence ...
    $hash{ $vs }++;
    # ... then every later occurrence to its right.
    for my $j ( $i + 1 .. $lim - 1 ) {
        $hash{ $vs }++ if substr( $compare, $j, 5 ) eq $vs;
    }
}
However, it could be an improvement on this to do an index for your second loop
and let the c-level of perl do your matching for you.
my $pos = $i;
while ( $pos > -1 ) {
$pos = index( $compare, $vs, ++$pos );
$hash{ $vs }++ if $pos > -1;
}
Also, if you used index, and wanted to omit the last two characters--as you do, it might make sense to remove those from the characters you have to search:
substr( $compare, -2 ) = ''
But you could do all of this in one pass, as you loop through file. I believe the code
below is almost an equivalent.
my $last_4 = '';    # up to 4 trailing characters carried into the next row
my $tail   = '';    # up to 6 trailing characters of the whole sequence so far
my $discard = <$f1>;
# each row in the file after the first...
while ( $row = <$f1> ) {
    chomp $row;
    # Track the end of the full sequence independently of row boundaries, so
    # the final two windows can be found even when the last row is short.
    $tail .= $row;
    $tail = substr( $tail, -6 ) if length( $tail ) > 6;
    $row = $last_4 . $row;
    # A string of length L has L - 4 windows of 5 characters (offsets
    # 0 .. L - 5); a bound of L - 5 would drop the last window of every row.
    my $lim = length( $row ) - 4;
    for ( my $i = 0; $i < $lim; $i++ ) {
        $hash{ substr( $row, $i, 5 ) }++;
    }
    # four is the maximum we can copy over to the new row and not
    # double count a strand of characters at the end.
    $last_4 = length( $row ) > 4 ? substr( $row, -4 ) : $row;
}
# I'm not sure what you're getting by omitting the last two windows of the
# sequence, but this replicates it: un-count the windows that start 5 and 6
# characters before the end (each exactly 5 characters long).
my @omitted;
push @omitted, substr( $tail, -5 )    if length( $tail ) >= 5;
push @omitted, substr( $tail, -6, 5 ) if length( $tail ) >= 6;
foreach my $bad_key ( @omitted ) {
    --$hash{ $bad_key };
    delete $hash{ $bad_key } if $hash{ $bad_key } < 1;
}
# grep in a scalar context will count the values.
$unique = grep { $_ == 1 } values %hash;
You may be interested in this more concise version of your code that uses a global regex match to find all the subsequences of five characters. It also reads the entire input file in one go, and removes the newlines afterwards.
The path to the input file is expected as a parameter on the command line, and the output is sent to STDOUT, so it can be redirected to a file on the command line, like this
perl subseq5.pl input.txt > output.txt
I've also used Data::Dump instead of Data::Dumper because I believe it to be vastly superior. However it is not a core module, and so you will probably need to install it.
use strict;
use warnings;
use open qw/ :std :encoding(utf-8) /;
use Data::Dump;
# Slurp all input (files named on the command line, or STDIN) in one read.
my $str = do { local $/; <>; };
$str = '' unless defined $str;
# Remove the newlines. tr/// does not interpolate variables, so writing $/
# in the search list would delete literal '$' and '/' characters instead.
$str =~ tr/\n//d;
my %dups;
# The lookahead captures 5 characters without consuming them, so overlapping
# subsequences are all visited.
++$dups{$1} while $str =~ /(?=(.{5}))/g;
my $unique = grep $_ == 1, values %dups;
print "Number of unique keys: $unique\n";
dd \%dups;
I am working on some genome data and I have 2 files ->
File1
A1 1 10
A1 15 20
A2 2 11
A2 13 16
File2
>A1
CTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACCTACTA
AAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAAT
>A2
GTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAATTTCCACCA
AACCCCCCCCTCCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGC
CAAACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCAGATTTCAAAT
In File1, the 2nd and 3rd columns are start and end positions within the sequences of File2. If the name in column 1 of File1 matches the name that follows the > symbol in File2, I want the substring of the sequence on the following lines of File2 that is delimited by the positions in columns 2 and 3 of File1. (Sorry, I know it's complicated.) Here is the desired output ->
Output
>A1#1:10
CTATTATTTA
>A1#15:20
ACCTA
>A2#2:11
TCTGCACAGC
>A2#13:16
GCTT
I know if I have only 1 string I can take out sub-string very easily ->
@ARGV or die "No input file specified";
open $first, '<',$ARGV[0] or die "Unable to open input file: $!";
$string="GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT";
# Each input line is "name start end" with 1-based, inclusive positions.
while (<$first>)
{
    @cols = split /\s+/;
    $co=$cols[1]-1;             # 0-based start offset
    $length=$cols[2]-$co;       # inclusive end, so end - (start - 1)
    $fragment = substr $string, $co, $length;
    print ">",$cols[0],"#",$cols[1],":",$cols[2],"\n",$fragment,"\n";
}
but here my problem is when should I input my second file and how should I match the character in col1 (of file1) with character in file2 (followed by > symbol) and then how to get substring?
I wasn't sure whether each sequence was meant to be one continuous line or several separate lines.
I have treated each one as a single continuous sequence for now.
Basically, read the 2nd file as master.
Then you can process as many index files as you need.
You can use hash of arrays to help with the indexing.
push @{$index{$key}}, [$start,$stop];
use strict;
my $master_file = "dna_master.txt";
# Require at least one index file. Testing $#ARGV was wrong: it is 0 (false)
# for exactly one argument and true for both none and several, so running
# with more than one index file printed the usage text and exited.
if (!@ARGV) {
    print "Usage: $0 [filename(s)]\n";
    exit 1;
}
my %Data = read_master($master_file);
foreach my $index_file (@ARGV) {
    my %Index = read_index($index_file);
    foreach my $key (sort keys %Index) {
        # An index entry without a matching ">key" in the master file has no
        # sequence to slice.
        if (!defined $Data{$key}) {
            warn "No sequence named '$key' in $master_file\n";
            next;
        }
        foreach my $i (@{$Index{$key}}) {
            my ($start,$stop) = @$i;
            print ">$key#$start:$stop\n";
            my $pos = $start - 1;               # positions are 1-based
            my $count = $stop - $start + 1;     # and inclusive
            print substr($Data{$key},$pos,$count)."\n";
        }
    }
}
# Return the non-blank lines of $file with the newline and any leading or
# trailing whitespace removed. Dies if the file cannot be opened.
sub read_file {
    my $file = shift;
    my @lines;
    # Three-argument open with a lexical handle: the file name can never be
    # read as an open mode, and the handle is private to this call.
    open(my $fh, '<', $file) or die "Error: cannot open $file\n$!";
    while(<$fh>){
        chomp;  #remove newline
        s/(^\s+|\s+$)//g;  # strip lead/trail whitespace
        next if /^$/;  # skip blanks
        push @lines, $_;
    }
    close $fh;
    return @lines;
}
# Read an index file of "key start stop" lines.
# Returns a hash mapping each key to an array ref of [start, stop] pairs, in
# file order.
sub read_index {
    my $file = shift;
    my @lines = read_file($file);
    my %index;
    foreach (@lines) {
        my ($key,$start,$stop) = split /\s+/;
        push @{$index{$key}}, [$start,$stop];
    }
    return %index;
}
# Read the master sequence file.
# A line starting with ">name" opens a new record; every following line is
# appended to that record, giving one continuous string per name.
# Returns a hash of name => sequence.
sub read_master {
    my $file = shift;
    my %master;
    my $key;
    my @lines = read_file($file);
    foreach (@lines) {
        if ( m{^>(\w+)} ) { $key = $1 }
        # Sequence text before the first ">name" header belongs to no record.
        elsif ( defined $key ) { $master{$key} .= $_ }
    }
    return %master;
}
Load File2 in a Hash, with A1, A2... as keys, and the DNA sequence as value. This way you can get the DNA sequence easily.
This 2nd update turns the master file into a hash of arrays as well.
This treats each row in the 2nd file as individual sequences.
use strict;
my $master_file = "dna_master.txt";
# Require at least one index file. Testing $#ARGV was wrong: it is 0 (false)
# for exactly one argument and true for both none and several, so running
# with more than one index file printed the usage text and exited.
if (!@ARGV) {
    print "Usage: $0 [filename(s)]\n";
    exit 1;
}
my %Data = read_master($master_file);
foreach my $index_file (@ARGV) {
    my %Index = read_index($index_file);
    foreach my $key (sort keys %Index) {
        foreach my $i (@{$Index{$key}}) {
            my ($start,$stop) = @$i;
            print ">$key#$start:$stop\n";
            my $pos = $start - 1;               # positions are 1-based
            my $count = $stop - $start + 1;     # and inclusive
            # Every row stored under this key is sliced separately; a key
            # with no rows in the master file yields no sequence lines.
            foreach my $seq (@{ $Data{$key} || [] }) {
                print substr($seq,$pos,$count)."\n";
            }
        }
    }
}
# Return the non-blank lines of $file with the newline and any leading or
# trailing whitespace removed. Dies if the file cannot be opened.
sub read_file {
    my $file = shift;
    my @lines;
    # Three-argument open with a lexical handle: the file name can never be
    # read as an open mode, and the handle is private to this call.
    open(my $fh, '<', $file) or die "Error: cannot open $file\n$!";
    while(<$fh>){
        chomp;  #remove newline
        s/(^\s+|\s+$)//g;  # strip lead/trail whitespace
        next if /^$/;  # skip blanks
        push @lines, $_;
    }
    close $fh;
    return @lines;
}
# Read an index file of "key start stop" lines.
# Returns a hash mapping each key to an array ref of [start, stop] pairs, in
# file order.
sub read_index {
    my $file = shift;
    my @lines = read_file($file);
    my %index;
    foreach (@lines) {
        my ($key,$start,$stop) = split /\s+/;
        push @{$index{$key}}, [$start,$stop];
    }
    return %index;
}
# Read the master sequence file.
# A line starting with ">name" opens a new record; every following line is
# stored as its own entry under that name.
# Returns a hash of name => array ref of sequence rows.
sub read_master {
    my $file = shift;
    my %master;
    my $key;
    my @lines = read_file($file);
    foreach (@lines) {
        if ( m{^>(\w+)} ) { $key = $1 }
        # Sequence text before the first ">name" header belongs to no record.
        elsif ( defined $key ) { push @{ $master{$key} }, $_ }
    }
    return %master;
}
Output:
>A1#1:10
CTATTATTTA
AAGTGTGTTA
>A1#15:20
ACCTAC
ATTAAT
>A2#2:11
TCTGCACAGC
ACCCCCCCCT
AAACCCCAAA
>A2#13:16
GCTT
CCCC
ACAA