#t = qw(a b c d e + g h + j k m n l + h +);
#q = qw(a b c d e f g h i j k l m l j h h);
#s = qw(a b c d e f g h k j k l m l j h h);
foreach (0..$#q){
if($t[$_] eq ($q[$_] && $s[$_])){
print "$t[$_]";
}
print "$t[$_]-$t[$_]\n";
elsif($t[$_] eq '+' && $q[$_] eq $s[$_]){
print"$t[$_]";
}
else{
print "\n";
}
}
Expected Output:
abcde+gh [1-8]
jk [10-11]
l+h+ [14-17]
Here #t based on matching both #q and #s, and print the intervals also based on #t.
I am not able to get an intervals as mismatching. please give me a good solution
Your code has an syntax error you introduced with your 4th edit. You can't put any code outside an if's block and its elseif. If I understood it right you wanted to know when the arrays #q, #s and #t line up, where #t is allowed to have '+' as a wildcard.
Here is one solution. It uses a $start variable to check if we are inside an interval and stores the beginning. If we are at the end of an interval or the arrays. We print the interval lengths. There are probably nicer ways to format this. The best would be to introduce more complex ad-hoc objects. The code would be much easier if you were't interested in the indices of the beginning and end of the intervals.
For the test: I restructured it a bit. Furthermore if you already know that $q[$_] eq $s[$_] you won't have to check both $t[$_] eq $s[$_] and $t[$_] eq $q[$_]. You don't have to make that check at all if $t[$_] eq "+"
#!/usr/bin/env perl
use strict; # These aren't optional!
use warnings; # Always use them!
use 5.01; # for the // operator and say
my #t = qw(a b c d e + g h + j k m n l + h +);
my #q = qw(a b c d e f g h i j k l m l j h h);
my #s = qw(a b c d e f g h k j k l m l j h h);
my ($start);
sub print_interval{
my $end = shift;
printf((' 'x(8+$start-$end)). # inserting the whitespaces
"[%2d-%-2d]\n", $start, $end);
}
foreach (0..$#q){
my ($te, $qe, $se) = ($t[$_], $q[$_], $s[$_]); # just shorthands
if($qe eq $se && ($te eq "+" || $te eq $qe)){
$start //= $_; # if not set, set it to the current index
print $te;
}elsif (defined $start){
print_interval($_-1);
undef $start;
}
}
if (defined $start){
# if we are still in an interval at the end,
# we'll have to print that too.
print_interval($#q)
}
If you're uncomfortable with the definedness checks, you also can set $start to -1 and check 0 <= $start.
Here is a solution that uses intermediate objects and saves the results in an array, this makes for nicer formatting and the code is structured better:
# … strict, warnings, array declarations
my ($res,#results);
foreach (0..$#q){
my ($te, $qe, $se) = ($t[$_], $q[$_], $s[$_]);
if($qe eq $se && ($te eq "+" || $te eq $qe)){
$res = {start => $_, string => ''} unless defined $res;
$res->{string} .= $te;
}elsif (defined $res){
$res->{end} = $_-1;
push #results, $res;
undef $res;
}
}
if (defined $res){ # still in interval
$res->{end} = $#q;
push #results, $res;
}
printf "%-9s[%2d-%-2d]\n", #{$_}{qw|string start end|} for #results;
#!/usr/bin/perl
use strict;
use warnings;
my #t = qw(a b c d e + g h + j k m n l + h +);
my #q = qw(a b c d e f g h i j k l m l j h h);
my #s = qw(a b c d e f g h k j k l m l j h h);
my #current_interval = (); #will store the interval we are currently working on
my #intervals = (); #keeps track of all those intervals
for(0 .. $#t){
if($q[$_] eq $s[$_] and ($q[$_] eq $t[$_] or $t[$_] eq '+')){
push(#current_interval, $_);
}
else{
if(#current_interval){
push(#intervals, [$current_interval[0], $current_interval[$#current_interval]]);
#current_interval = ();
}
}
}
#when exiting the loop we dont want to lose our current interval!
if(#current_interval){
push(#intervals, [$current_interval[0], $current_interval[$#current_interval]]);}
#print intervals
for (#intervals){
my #c = #{$_};
print $c[0],"\t",$c[1],"\n";
}
I got the intervals for you.
Please note that I added "use strict; use warnings" - before adding this solution to your project.
Greetings Tim
Related
The code is getting messy somewhere in the loop! Please help me to solve it.
Details
Replicate and/or reassign most of the array elements in the multi-dimensional array, using reference elements.
File-1: List of array indices & the elements that needs to be maintained in the original array.
File-2: The original multi-dimensional array that needs to be rewritten with the above info. Except the elements from the above, the rest of all elements have to be reassigned.
File-3: Expected output (reassigned array elements)
Note: Addition to the array indices from file1, rest of all the indices will be replaced with the reference line. Reference line is usually present in the first line of the array. In the modified array, the reference line is not needed.
File-1:
ID1 2 E1,E4
ID2 5 E6,E7,E9
ID3 1 E3
File-2:
ID1.txt
Ref K L M N O P A B C D
E1 S H G U S K R E K K
E2 S L G N O P A B C D
E3 S L G N O P A B C D
E4 U L G G O P A B C D
E5 U L M G O P A J C D
E6 U L M G O P A J C D
E7 U L M G O P A J C D
E8 U L M G O P A J C D
E9 S L M N O P A J C D
E10 S L M N O P A J C D
.
.
.
File-3: Expected output
new_ID1.txt
E1 K L G N O P A B C D
E2 K L M N O P A B C D
E3 K L M N O P A B C D
E4 K L G N O P A B C D
E5 K L M N O P A B C D
E6 K L M N O P A B C D
E7 K L M N O P A B C D
E8 K L M N O P A B C D
E9 K L M N O P A B C D
E10 K L M N O P A B C D
.
.
.
In the expected output, (new_ID1.txt), second index of the array for "E1" and "E4" is maintained from the original array. Everything else is replaced by the reference line in "E2,E3,E5...".
Code
#!/usr/bin/perl
use strict;
use warnings;
my %HoHoA = ();
open(IN,"ids.txt");
my #ids = <IN>; chomp #ids; close IN;
open(IN2,"indices_and_values.txt");
while(my $l = <IN2>)
{
chomp $l;
my #tmp = split "\t", $l;
my $lid = $tmp[0];
my $pos = $tmp[1];
my #gps = #tmp[2..$#tmp];
foreach my $g (#gps)
{
push #{$HoHoA{$lid}{$g}}, $pos;
}
}
close IN2;
foreach my $outer (sort keys %HoHoA)
{
open(IN3,"$outer.txt");
my #rS = <IN3>; chomp #rS; close IN3;
my #orgArr = (); my #refArr = (); my #newArr = ();
foreach my $unk (#rS)
{
#orgArr = split "\t", $unk;
if($unk =~ /^Ref/)
{
#refArr = split "\t", $unk;
next;
}
foreach my $inner (sort keys %{$HoHoA{$outer}})
{
if($inner =~ /^$orgArr[0]/)
{
foreach my $ele (sort {$a <=> $b} #{$HoHoA{$outer}{$inner}})
{
$refArr[$ele] = $orgArr[$ele];
}
}
#else
#{
#}
}
print ">$orgArr[0]\t";
print join("\t",#refArr[1..$#refArr]);
print "\n";
}
#rS = ();
print "\n";
}
The shown code is well-meant but a bit too complicated; you may have lost your way in the maneuvers over the nested data structure. Here's another, simpler, approach.
Parse the information from the "reference" file (File-1) into a hash (E1 => [2, ...], ..). I put indices for data to be kept in an arrayref to allow for multiple indices for a row. Then go line by line, replacing data at these indices for rows that have a key, and print output as you go.
use warnings;
use strict;
use feature 'say';
my ($ref_file, $data_file) = #ARGV;
die "Usage: $0 ref-file data-file\n" if not $ref_file or not $data_file;
open my $fh, '<', $ref_file or die "Can't open $ref_file: $!";
my %rows;
while (<$fh>) {
my (undef, $idx, $row_id) = split;
for (split /,/, $row_id) {
push #{$rows{$_}}, $idx; # elem => [ indices ]
}
}
my $outfile = 'new_' . $data_file;
open $fh, '<', $data_file or die "Can't open $data_file: $!";
open my $fh_out, '>', $outfile or die "Can't open $outfile: $!";
my #ref = split ' ', <$fh>;
shift #ref; # toss the first field
while (<$fh>) {
my ($row_id, #data) = split;
if (exists $rows{$row_id}) { # this row needs attention
my #new_row = #ref;
foreach my $idx (#{$rows{$row_id}}) { # keep data at these indices
$new_row[$idx] = $data[$idx];
}
say $fh_out join "\t", $row_id, #new_row;
}
else { # use whole reference line
say $fh_out join "\t", $row_id, #ref;
}
}
The new file (shown with two spaces instead of the actual tabs, for readability)
E1 K L G N O P A B C D
E2 K L M N O P A B C D
E3 K L M N O P A B C D
E4 K L G N O P A B C D
E5 K L M N O P A B C D
E6 K L M N O P A B C D
E7 K L M N O P A B C D
E8 K L M N O P A B C D
E9 K L M N O P A B C D
E10 K L M N O P A B C D
Note that the given input file happens to have the same entries as the reference line to use in replacement at many indices of interest -- so we can't see those "changes" in the above output. (I tested by changing the input file so to be able to see.)
This is one way to do it, if I understood your problem statement correctly:
#!/usr/bin/perl
use strict;
use warnings;
my %keep_idx;
open FILE, "file-1" or die "Couldn't open file-1";
while(<FILE>) {
my (undef, $idx, $id_str) = split /\s+/;
my #ids = split /,/, $id_str;
foreach my $id (#ids) {
$keep_idx{$id}{$idx} = 1;
}
}
close FILE;
open FILE, "file-2" or die "Couldn't open file-2";
open OUTFILE, ">file-3" or die "Couldn't open file-3";
my (undef, #ref) = split /\s+/, <FILE>;
while(<FILE>) {
my ($id, #src) = split /\s+/;
my $line = "$id";
for (my $i = 0; $i <= $#src; $i++) {
my $e = $keep_idx{$id}{$i} ? $src[$i] : $ref[$i];
$line .= " $e";
}
print OUTFILE "$line\n";
}
close OUTFILE;
close FILE;
I have a tab separated large file like this:
input.txt
a b c
s t e
a b c
f q y
r e x
to delete the repeated lines (rows) in this file, i use:
my %seen;
my #lines;
while (<>) {
my #cols = split /\s+/;
unless ($seen{$cols[0]}++) {
push #lines, $_;
}
}
print #lines;
the output here is:
a b c
s t e
f q y
r e x
Now if I want to delete those lines too that contain repeted values (means: that value once appear anywhere in upper rows/columns, here "e") and keep only the uppermost value containing line, please suggest what will be the most preffered approach keeping in mind that my input file is very large with many columns and rows.
model output that I want for the above input.txt would be:
a b c
s t e
f q y
Thank you
You also need to iterate over the #cols and examine every item instead of just the first one, $cols[0].
You need something like
unless ($seen{$cols[0]}++ || $seen{$cols[1]}++ || $seen{$cols[2]}++ ...) {
push #lines, $_;
}
Of course that would be bad style and impossible if you don't know the number of columns in advance.
I would do it with grep:
my %seen;
my #lines;
while (<DATA>) {
my #cols = split /\s+/;
unless ( grep { $seen{$_}++ } #cols ) {
push #lines, $_;
}
}
print #lines;
__DATA__
a b c
s t e
a b c
f q y
r e x
Output:
a b c
s t e
f q y
grep processes the code between the curlies { $seen{$_}++ } for each element in the list #cols and returns (in scalar context) the number of items that evaluated to true.
It's not the fastest approach because it always iterates over the whole array (even if the first evaluation would be sufficient for your particular test). But give it a try; perhaps it's fast enough for you.
As I wrote in my comments, split /\s+/ is very rarely correct
And the solution you have mishandles lines with duplicate fields
It's also more efficient to replace grep with any from the core List::Util module
I suggest that you store the fields of each line in a hash %cols, like this
use strict;
use warnings 'all';
use List::Util 'any';
my ( #lines, %seen );
while ( <DATA> ) {
my %cols = map { $_ => 1 } split;
push #lines, $_ unless any { $seen{$_}++ } keys %cols;
}
print for #lines;
__DATA__
a b c
p p p
p q r
s t e
a b c
f q y
r e x
output
a b c
p p p
s t e
Even this may not be what you want, as the line f q y is omitted because q has already been "seen" in the omitted line p q r. You will have to clarify the required behaviour in this situation
I've created a hash that lists each unique word from a text file and the number of times each word appears. The resulting output from this hash indicates it has read the entire file correctly.
However, later efforts to parse text from the same file appear to only capture some of the contents.
For illustrative purposes, a second hash designed to capture each word from the infile as a hash value and that word's relative ordering in the document as the hash key captures only a small fraction of all the words in the document.
Any insights as to the cause of this phenomenon?
#!/usr/bin/perl
use strict;
use warnings;
my $path = "U:/Perl";
chdir($path) or die "Cant chdir to $path $!";
# Starting off computing a simple word count for each word in the 10-K.
my %hash;
my $word;
my %words;
while (<>) {
my #words = split;
# Here creating an index of each word in the 10-K and the number of times
# it appears. This hash works correctly.
foreach my $i (0 .. $#words) {
my $word = $words[$i];
++$hash{$word};
# Here trying to create a hash where each word from the infile is a value,
# and the word's order in the doc is the key.
$words{$i} = $word;
}
}
# The code below simply sorts each hash and prints it to an external file.
my #keys = sort { "\L$a" <=> "\L$b" } keys %words;
open(my $fh2, '>', 'report2.txt');
foreach (#keys) {
print $fh2 "$_ \t $words{$_}\n ";
}
close $fh2;
#keys = sort {
"\L$hash{$a}" <=> "\L$hash{$b}" or
"\L$a" cmp "\L$b"
} keys %hash;
open(my $fh3, '>', 'report3.txt');
foreach (#keys) {
print $fh3 "$_ \t $hash{$_}\n ";
}
close $fh3;
I presume you're talking about the %words hash? You are keying that hash by the position of each word on each line, so it will only ever have as many entries as the longest line in the file has words.
When processing this data
a b c d e f
g h i j k
l m n o
p q r
s t
u
your program will build %hash with 21 elements, with the keys a to u and all the values equal to 1 as they are all different.
But the hash %words will have six elements -- the number of words in the longest line -- and the values will be overwritten by the last word at that position in the line. So your hash will look like
{ 0 => 'u', 1 => 't', 2 => 'r', 3 => 'o', 4 => 'k', 5 => 'f' }
as the last word in the first position on the line is u, the last one in the second position is t, etc.
Surely you don't want a hash indexing the words across the whole file? That would be an array!
Update
Thank you for explaining your intention. I don't think putting all the words from the file into a hash one by one is going to help you to count all the unique six-word sequences.
In any case, using numeric keys from 1 to N, where N is the number of words in the file, is misusing a hash, and as I intimated above what you really want is an array that is meant to be indexed by integers.
I think you should keep an array that holds the current six-word sequence. If you add each word to the end of the array and drop them from the beginning then it will always hold the most recent six words from the file.
Something like this, perhaps
use strict;
use warnings;
my #sequence;
my %sequences;
while (<DATA>) {
for (split) {
push #sequence, $_;
if (#sequence >= 6) {
shift #sequence while #sequence > 6;
++$sequences{"#sequence"};
}
}
}
use Data::Dump;
dd \%sequences;
__DATA__
a b c d e f
g h i j k
l m n o
p q r
s t
u
output
{
"a b c d e f" => 1,
"b c d e f g" => 1,
"c d e f g h" => 1,
"d e f g h i" => 1,
"e f g h i j" => 1,
"f g h i j k" => 1,
"g h i j k l" => 1,
"h i j k l m" => 1,
"i j k l m n" => 1,
"j k l m n o" => 1,
"k l m n o p" => 1,
"l m n o p q" => 1,
"m n o p q r" => 1,
"n o p q r s" => 1,
"o p q r s t" => 1,
"p q r s t u" => 1,
}
When looping through two arrays, I am confused about how to move the pointer through one loop but keep it constant in another. So for example:
Array 1: A T C G T C G A G C G
Array 2: A C G T C C T G T C G
So A in the first array matches A in the second array so we move on to next elements. But since the T doesn't match the C in the 2nd index, I want the program to compare that T to the next G in array 2 and so on until it finds the matching T.
my ($array1ref, $array2ref) = #_;
my #array1 = #$array1ref;
my #array2= #$array2ref;
my $count = 0;
foreach my $element (#array1) {
foreach my $element2 (#array2) {
if ($element eq $element2) {
$count++;
}else { ???????????
}
You could use a while loop to search for matches. If you find a match, advance in both arrays. If you don't, advance the second array. At the end you could print the remaining unmatched characters from the first array:
# [1, 2, 3] is a reference to an anonymous array (1, 2, 3)
# qw(1, 2, 3) is shorthand quoted-word for ('1', '2', '3')
my $arr1 = [qw(A T C G T C G A G C G)];
my $arr2 = [qw(A C G T C C T G T C G)];
my $idx1 = 0;
my $idx2 = 0;
# Find matched characters
# #$arr_ref is the size of the array referenced by $arr_ref
while ($idx1 < #$arr1 && $idx2 < #$arr2) {
my $char1 = $arr1->[$idx1];
my $char2 = $arr2->[$idx2];
if ($char1 eq $char2) {
# Matched character, advance arr1 and arr2
printf("%s %s -- arr1[%d] matches arr2[%d]\n", $char1, $char2, $idx1, $idx2);
++$idx1;
++$idx2;
} else {
# Unmatched character, advance arr2
printf(". %s -- skipping arr2[%d]\n", $char2, $idx2);
++$idx2;
}
}
# Remaining unmatched characters
while ($idx1 < #$arr1) {
my $char1 = $arr1->[$idx1];
printf("%s . -- arr1[%d] is beyond the end of arr2\n", $char1, $idx1);
$idx1++;
}
The script prints:
A A -- arr1[0] matches arr2[0]
. C -- skipping arr2[1]
. G -- skipping arr2[2]
T T -- arr1[1] matches arr2[3]
C C -- arr1[2] matches arr2[4]
. C -- skipping arr2[5]
. T -- skipping arr2[6]
G G -- arr1[3] matches arr2[7]
T T -- arr1[4] matches arr2[8]
C C -- arr1[5] matches arr2[9]
G G -- arr1[6] matches arr2[10]
A . -- arr1[7] is beyond the end of arr2
G . -- arr1[8] is beyond the end of arr2
C . -- arr1[9] is beyond the end of arr2
G . -- arr1[10] is beyond the end of arr2
Nested loops makes no sense. You don't want to loop over anything more than once.
You didn't specify what you wanted to happen after you resync, so you'll want to start with the following and tailor it to your needs.
my ($array1, $array2) = #_;
my $idx1 = 0;
my $idx2 = 0;
while ($idx1 < #$array1 && $idx2 < #$array2) {
if ($array1->[$idx1] eq $array2->[$idx2]) {
++$idx1;
++$idx2;
} else {
++$idx2;
}
}
...
As is, the above snippet will leave $idx1 at the last index it couldn't (eventually) resync. If instead you want to stop as soon as you first resync, you want
my ($array1, $array2) = #_;
my $idx1 = 0;
my $idx2 = 0;
my $mismatch = 0;
while ($idx1 < #$array1 && $idx2 < #$array2) {
if ($array1->[$idx1] eq $array2->[$idx2]) {
last if $mismatched;
++$idx1;
++$idx2;
} else {
++$mismatched;
++$idx2;
}
}
...
The foreach loops won't cut it: We'll either want to loop while there are available elements in both arrays, or iterate through all indices, which we can increment as we like:
EL1: while (defined(my $el1 = shift #array1) and #array2) {
EL2: while(defined(my $el2 = shift #array2)) {
++$count and next EL1 if $el1 eq $el2; # break out of inner loop
}
}
or
my $j = 0; # index of #array2
for (my $i = 0; $i <= $#array1; $i++) {
$j++ until $j > $#array or $array1[$i] eq $array2[$j];
last if $j > $#array;
$count++;
}
or any combination.
This is to complex a condition for for loops use while loops instead
my ($array1ref, $array2ref) = #_;
my #array1 = #$array1ref;
my #array2= #$array2ref;
my $count = 0;
my ($index, $index2) = (0,0);
#loop while indexs are in arrays
while($index <= ##array1 && $index2 <= ##array2) {
if($array1[$index] eq $array2[$index2]) {
$index++;
$index2++;
} else {
#increment index until we find a match
$index2++ until $array1[$index] eq $array2[$index2];
}
}
Here is one possibility. It will use indexes to go through both lists.
my #array1 = qw(A T C G T C G A G C G);
my #array2 = qw(A C G T C C T G T C G);
my $count = 0;
my $idx1 = 0;
my $idx2 = 0;
while(($idx1 < scalar #array1) && ($idx2 < scalar #array2)) {
if($array1[$idx1] eq $array2[$idx2]) {
print "Match of $array1[$idx1] array1 \# $idx1 and array2 \# $idx2\n";
$idx1++;
$idx2++;
$count++;
} else {
$idx2++;
}
}
print "Count = $count\n";
It seems like you could do this pretty easily with a 'grep', if you're guaranteed that array2 is always as long as or longer than array1. Something like this:
sub align
{
my ($array1, $array2) = #_;
my $index = 0;
return grep
{
$array1->[$index] eq $array2->[$_] ? ++$index : 0
} 0 .. scalar( #$array2 ) - 1;
}
Basically, the grep is saying "Return me the list of increasing indices into array2 that match contiguous elements from array1."
If you run the above with this test code, you can see it returns the expected alignment array:
my #array1 = qw(A T C G T C G A G C G);
my #array2 = qw(A C G T C C T G T C G);
say join ",", align \#array1, \#array2;
This outputs the expected mapping:
0,3,4,7,8,9,10. That list means that #array1[0 .. 6] correspond to #array2[0,3,4,7,8,9,10].
(Note: You need to use Modern::Perl or similar to use say.)
Now, you haven't really said what you need the output of the operation to be. I've assumed you wanted this mapping array. If you just need a count of the number of elements skipped in #array2 while aligning it with #array1, you can still use the grep above, but instead of the list, just return scalar(#$array2) - $index at the end.
As you may know, your problem is called Sequence Alignment. There are well-developed algorithms to do this efficiently, and one such module Algorithm::NeedlemanWunsch is available on CPAN. Here's how you might apply it to your problem.
#!/usr/bin/perl
use Algorithm::NeedlemanWunsch;
my $arr1 = [qw(A T C G T C G A G C G)];
my $arr2 = [qw(A C G T C C T G T C G)];
my $matcher = Algorithm::NeedlemanWunsch->new(sub {#_==0 ? -1 : $_[0] eq $_[1] ? 1 : -2});
my (#align1, #align2);
my $result = $matcher->align($arr1, $arr2,
{
align => sub {unshift #align1, $arr1->[shift]; unshift #align2, $arr2->[shift]},
shift_a => sub {unshift #align1, $arr1->[shift]; unshift #align2, '.'},
shift_b => sub {unshift #align1, '.'; unshift #align2, $arr1->[shift]},
});
print join("", #align1), "\n";
print join("", #align2), "\n";
That prints out an optimal solution in terms of the costs we specified in the constructor:
ATCGT.C.GAGCG
A.CGTTCGG.TCG
A very different method from the one in your original question, but I think it's worth knowing about.
The array I want to query does not change during execution:
my #const_arr=qw( a b c d e f g);
The input is a string containing the indices I want to access, for example:
my $str ="1,4";
Is there something (besides iterating over the indices in $str) along the lines of #subarray = #const_arr[$str] that will result in #subarray containing [b,e]
?
If the indices are in a string, you can split the string to get them:
#array = qw(a b c d e);
$indices = '1,4';
#subarray = #array[split /,/, $indices];
print "#subarray\n";
An array slice will do this:
#const_arr=qw(a b c d e);
#subarray=(#const_arr)[1,4];
print "#subarray"'
my #const_arr = qw(a b c d e f); # the {...} creates a hash reference,
# not what you wanted
my $str = "1,4";
my #idx = split /,/ => $str;
my #wanted = #const_arr[#idx];
or in one line:
my #wanted = #const_arr[split /,/ => $str];
#const_arr should initiate like this:
my #const_arr = qw(a b c d e f);
then you can access to 1 and 4 element by:
#const_arr[1,4]