With reference to the question Calculating the distance between atomic coordinates, where the input is
ATOM 920 CA GLN A 203 39.292 -13.354 17.416 1.00 55.76 C
ATOM 929 CA HIS A 204 38.546 -15.963 14.792 1.00 29.53 C
ATOM 939 CA ASN A 205 39.443 -17.018 11.206 1.00 54.49 C
ATOM 947 CA GLU A 206 41.454 -13.901 10.155 1.00 26.32 C
ATOM 956 CA VAL A 207 43.664 -14.041 13.279 1.00 40.65 C
.
.
.
ATOM 963 CA GLU A 208 45.403 -17.443 13.188 1.00 40.25 C
there is an answer reported as
use strict;
use warnings;

my @line;
while (<>) {
    push @line, $_;                 # add line to buffer
    next if @line < 2;              # skip unless buffer is full
    print proc(@line), "\n";        # process and print
    shift @line;                    # remove used line
}

sub proc {
    my @a = split ' ', shift;       # line 1
    my @b = split ' ', shift;       # line 2
    my $x = ($a[6]-$b[6]);          # calculate the diffs
    my $y = ($a[7]-$b[7]);
    my $z = ($a[8]-$b[8]);
    my $dist = sprintf "%.1f",      # format the number
        sqrt($x**2+$y**2+$z**2);    # do the calculation
    return "$a[3]-$b[3]\t$dist";    # return the string for printing
}
The output of the above code is the distance from the first CA to the second, the second to the third, and so on...
How do I modify this code to find the distance from the first CA to all the remaining CAs (2, 3, ...), from the second CA to all the remaining CAs (3, 4, ...), and so on, printing only those that are less than 5 Angstrom?
I figured that the push @line, $_; statement should be altered to increase the buffer size, but it is not clear to me how to do that.
To get the pairs, read the file into an array, @data_array. Then loop over the entries.
Update: Added file opening and loading of @data_array.
open my $fh, '<', 'atom_file.pdb' or die $!;
my @data_array = <$fh>;
close $fh or die $!;

for my $i (0 .. $#data_array) {
    for my $j ($i+1 .. $#data_array) {
        process(@data_array[$i,$j]);
    }
}
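Filling the process call in with the distance arithmetic from the question's proc, a minimal sketch might look like this (it assumes the whitespace-split fields 3, 6, 7 and 8 hold the residue name and the x, y, z coordinates, as in the original code):
use strict;
use warnings;

open my $fh, '<', 'atom_file.pdb' or die $!;
my @data_array = <$fh>;
close $fh or die $!;

for my $i (0 .. $#data_array) {
    for my $j ($i+1 .. $#data_array) {
        my @a = split ' ', $data_array[$i];
        my @b = split ' ', $data_array[$j];
        my $dist = sqrt( ($a[6]-$b[6])**2
                       + ($a[7]-$b[7])**2
                       + ($a[8]-$b[8])**2 );
        # print only pairs closer than 5 Angstrom
        printf "%s-%s\t%.1f\n", $a[3], $b[3], $dist if $dist < 5;
    }
}
Each pair is considered exactly once because the inner loop starts at $i+1.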
Maybe try this:
use strict;
use warnings;

my @alllines = ();
while (<DATA>) { push(@alllines, $_); }

# Each current line
for (my $i = 0; $i <= $#alllines; $i++)
{
    # Each next line
    for (my $j = $i+1; $j <= $#alllines; $j++)
    {
        if ($alllines[$i])
        {
            # Split the line on tabs
            my ($line1_tb_1, $line1_tb_2, $line1_tb_3) = split /\t/, $alllines[$i];
            print "Main_Line: $line1_tb_1\t$line1_tb_2\t$line1_tb_3";
            if ($alllines[$j])
            {
                # Split the line on tabs
                my ($line_nxt_tb1, $line_nxt_tb2, $line_nxt_tb3) = split /\t/, $alllines[$j];
                print "Next_Line: $line_nxt_tb1\t$line_nxt_tb2\t$line_nxt_tb3";
                # Do your calculation/regex here
            }
        }
        #system 'pause';   # testing purposes only
    }
}
__DATA__
tab1 123 456
tab2 789 012
tab3 345 678
tab4 901 234
tab5 567 890
I hope this will help you.
Related
My question is similar to this question posted earlier.
I have many files that I need to merge based on the presence or absence of the ID in the first column, but while merging I am getting lots of empty values in my output file; I want those empty values to be zero when the ID is not present in the other file. The example below is based on only two files' content, but I have many sample files in this (tabular) format.
For example:
File1
ID Value
123 1
231 2
323 3
541 7
File2
ID Value
541 6
123 1
312 3
211 4
Expected Output:
ID File1 File2
123 1 1
231 2 0
323 3 0
541 7 6
312 0 3
211 0 4
Obtaining Output:
ID File1 File2
123 1 1
231 2
323 3
541 7 6
312 undef 3
211 undef 4
As you can see above, I am getting output, but in the File2 column it is not adding a zero and just leaves the field empty, and in the File1 column it has undef values. I replace undef values so that my final output prints zeros in their place, but I am still left with those empty fields. Please find my code below (hardcoded for only two files).
#!/usr/bin/perl
use strict;
use warnings;
use diagnostics;
use Data::Dumper;

my $path = "/home/pranjay/Projects/test";
my @files = ("s1.txt","s2.txt");
my %classic_com;
my $cnt;
my $classic_txt;
my $sample_cnt = 0;
my $classic_txtcomb = "test_classic.txt";

open($classic_txt, ">$path/$classic_txtcomb") or die "Couldn't open file $classic_txtcomb for writing, $!";
print $classic_txt "#ID\t"."file1\tfile2\n";

foreach my $file (@files) {
    $sample_cnt++;
    print "$sample_cnt\n";
    open($cnt, "<$path/$file") or die "Couldn't open file $file for reading, $!";
    while (<$cnt>) {
        chomp($_);
        my @count = ();
        next if ($_ =~ /^ID/);
        my @record = ();
        @record = split(/\t/, $_);
        my $scnt = $sample_cnt - 1;
        if ((exists($classic_com{$record[0]})) and ($sample_cnt > 0)) {
            ${$classic_com{$record[0]}}[$scnt] = $record[1];
        } else {
            $count[$scnt] = "$record[1]";
            $classic_com{$record[0]} = [@count];
        }
    }
}

my %final_txt = ();
foreach my $key ( keys %classic_com ) {
    #print "$key: ";
    my @val = @{ $classic_com{$key} };
    my @v;
    foreach my $i ( @val ) {
        if (not defined($i)) {
            $i = 0;
            push(@v, $i);
        } else {
            push(@v, $i);
            next;
        }
    }
    $final_txt{$key} = [@v];
}

#print Dumper %classic_com;
while (my ($key, $value) = each(%final_txt)) {
    my $val = join("\t", @{$value});
    print $classic_txt "$key\t"."@{$value}"."\n";
}
Just read the input files into a hash of arrays. The topmost key is the ID, each inner array contains the value for file i on the i-th position. When printing, use the // defined-or operator to replace undefs with zeroes:
#!/usr/bin/perl
use warnings;
use strict;
use feature qw{ say };

my %merged;
my $file_tally = 0;
while (my $file = shift) {
    open my $in, '<', $file or die "$file: $!";
    <$in>;                                  # skip the header
    while (<$in>) {
        my ($id, $value) = split;
        $merged{$id}[$file_tally] = $value;
    }
    ++$file_tally;
}

for my $value (keys %merged) {
    my @values = @{ $merged{$value} };
    say join "\t", $value, map $_ // 0, @values[0 .. $file_tally - 1];
}
program.pl
my %val;
/ (\d+) \s+ (\d+) /x and $val{$1}{$ARGV} = $2 while <>;
pr( 'ID', my @f = sort keys %{{map %$_, values %val}} );
pr( $_, map $_//0, @{$val{$_}}{@f} ) for sort keys %val;
sub pr { print join("\t", @_)."\n" }
Run:
perl program.pl s1.txt s2.txt
ID s1.txt s2.txt
123 1 1
211 0 4
231 2 0
312 0 3
323 3 0
541 7 6
I have written a script in Perl that reads each file in the current directory and calculates distances between protein and ligand atoms. Whenever it encounters a distance <= 5 it should break the loop (last) and move the file into another directory (mp) that was made previously. For some reason I have a problem with this script: it calculates the distances correctly but does not move the files into the other folder, and it does not give me any errors either. I am trying to figure out what the problem is.
This is the script that I have been using:
#!/usr/local/bin/perl
use strict;
use warnings;
use File::Glob;

mkdir "mp";
my $txt;
for my $txt ( glob '*.txt' )
{
    open my $fh, '<', $txt;
    my $out_fh;
    my (@refer, @points);
    my $part = 0;
    my $dist;
    while (my $line = <$fh>)
    {
        chomp($line);
        $part++ if ($line =~ /^HETATM/);
        my @array = (substr($line, 30, 8), substr($line, 38, 8), substr($line, 46, 8));
        #print "@array\n";
        if ($part == 0)
        {
            push @refer, [ @array ];
        }
        elsif ($part)
        {
            push @points, [ @array ];
        }
        my $atom;
        foreach my $ref (@refer)
        {
            my ($x1, $y1, $z1) = @{$ref};
            foreach my $atom (@points)
            {
                my ($x, $y, $z) = @{$atom};
                my $dist = sqrt( ($x-$x1)**2 + ($y-$y1)**2 + ($z-$z1)**2 );
                if ($dist <= 5)
                {
                    print "Distance for calculation between $ref and $atom is $dist\n";
                    last;
                    system ("mv $fh mp");
                }
            }
        }
    }
}
The content of my input file looks like this:
ATOM 1593 HD21 LEU D 46 11.528 -8.800 5.301 1.00 0.00 H
ATOM 1594 HD22 LEU D 46 12.997 -9.452 4.535 1.00 0.00 H
ATOM 1595 HD23 LEU D 46 11.722 -8.718 3.534 1.00 0.00 H
HETATM 1597 N1 308 A 1 0.339 6.314 -9.091 1.00 0.00 N
HETATM 1598 C10 308 A 1 -0.195 5.226 -8.241 1.00 0.00 C
The result the script gives me in the terminal looks like this:
Distance for calculation between ARRAY(0x1c61fa8) and ARRAY(0x1c6f950) is 4.98553437456809
Distance for calculation between ARRAY(0x1c62098) and ARRAY(0x1c6ffe0) is 4.98962253081333
But it does not move the files.
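Two things in the posted script would explain this: the system call sits after last, so it is never reached, and $fh is a filehandle, so interpolating it into the mv command passes something like GLOB(0x...) instead of the file name. A possible restructured sketch (not a drop-in replacement: it reads each file fully before comparing, uses File::Copy's move on the file name $txt, and jumps to the next file once one contact under 5 Å is found):
#!/usr/local/bin/perl
use strict;
use warnings;
use File::Copy qw(move);   # move() takes file names, not filehandles

mkdir "mp" unless -d "mp";

FILE: for my $txt ( glob '*.txt' ) {
    open my $fh, '<', $txt or die "Cannot open $txt: $!";
    my (@refer, @points);
    my $part = 0;
    while (my $line = <$fh>) {
        chomp $line;
        $part++ if $line =~ /^HETATM/;
        my @xyz = (substr($line, 30, 8), substr($line, 38, 8), substr($line, 46, 8));
        if ($part == 0) { push @refer,  [ @xyz ] }
        else            { push @points, [ @xyz ] }
    }
    close $fh;
    for my $ref (@refer) {
        for my $atom (@points) {
            my $dist = sqrt( ($atom->[0]-$ref->[0])**2
                           + ($atom->[1]-$ref->[1])**2
                           + ($atom->[2]-$ref->[2])**2 );
            if ($dist <= 5) {
                print "$txt: distance $dist is within 5\n";
                move($txt, "mp/$txt") or warn "Could not move $txt: $!";
                next FILE;             # done with this file, go to the next one
            }
        }
    }
}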
I have a PDB file with two parts separated by TER; I call the part before TER part 1. I want to take the x,y,z of ATOM 1 of the first part (i.e. before TER) and find its distance to all x,y,z coordinates after TER, then the second ATOM of part one to all ATOMs of part two, and so on. This has to be repeated for all ATOMs of the first part against all ATOMs of the second part. I also have to automate it for 20 files, whose names are 1_0.pdb, 2_0.pdb, ..., 20_0.pdb.
This is a distance calculation. I have tried something in Perl but it is very rough. Can someone help a bit?
The File looks like:
----long file (I truncated it)----
ATOM 1279 C ALA 81 -1.925 -11.270 1.404
ATOM 1280 O ALA 81 -0.279 9.355 15.557
ATOM 1281 OXT ALA 81 -2.188 10.341 15.346
TER
ATOM 1282 N THR 82 29.632 5.205 5.525
ATOM 1283 H1 THR 82 30.175 4.389 5.768
ATOM 1284 H2 THR 82 28.816 4.910 5.008
The code is below; in the end it finds the maximum distance and its coordinates.
my @points = ();
open(IN, $ARGV[0]) or die "$!";
while (my $line = <IN>) {
    chomp($line);
    my @array = (split (/\s+/, $line))[5, 6, 7];
    print "@array\n";
    push @points, [ @array ];
}
close(IN);

$max = 0;
for my $i1 ( 0 .. $#points )
{
    my ( $x1, $y1, $z1 ) = @{ $points[$i1] };
    my $dist = sqrt( ($x1+1.925)**2 + ($y1+11.270)**2 + ($z1-1.404)**2 );
    print "distance from (-1.925 -11.270 1.404) to ( $x1, $y1, $z1 ) is $dist\n";
    if ( $dist > $max )
    {
        $max = $dist;
        $x = $x1;
        $y = $y1;
        $z = $z1;
    }
}
print "maximum value is : $max\n";
print "co ordinates are : $x $y $z\n";
Not sure I clearly understand what you want, but how about:
#!/usr/local/bin/perl
use strict;
use warnings;

my (@refer, @points);
my $part = 0;
while (my $line = <DATA>) {
    chomp($line);
    if ($line =~ /^TER/) {
        $part++;
        next;
    }
    my @array = (split (/\s+/, $line))[5, 6, 7];
    if ($part == 0) {
        push @refer, [ @array ];
    } else {
        push @points, [ @array ];
    }
}

my %max = (val=>0, x=>0, y=>0, z=>0);
foreach my $ref (@refer) {
    my ($x1, $y1, $z1) = @{$ref};
    foreach my $atom (@points) {
        my ($x, $y, $z) = @{$atom};
        my $dist = sqrt( ($x-$x1)**2 + ($y-$y1)**2 + ($z-$z1)**2 );
        if ($dist > $max{val}) {
            $max{val} = $dist;
            $max{x} = $x;
            $max{y} = $y;
            $max{z} = $z;
        }
    }
}
print "max is $max{val}; coord: x=$max{x}, y=$max{y}, z=$max{z}\n";
__DATA__
ATOM 1279 C ALA 81 -1.925 -11.270 1.404
ATOM 1280 O ALA 81 -0.279 9.355 15.557
ATOM 1281 OXT ALA 81 -2.188 10.341 15.346
TER
ATOM 1282 N THR 82 29.632 5.205 5.525
ATOM 1283 H1 THR 82 30.175 4.389 5.768
ATOM 1284 H2 THR 82 28.816 4.910 5.008
output:
max is 35.9813670807545; coord: x=30.175, y=4.389, z=5.768
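To automate this for the 20 files named 1_0.pdb through 20_0.pdb from the question, one possible wrapper is the sketch below; it assumes every file uses the same whitespace-separated columns and TER separator, and simply repeats the logic above per file with a lexical filehandle instead of DATA:
#!/usr/local/bin/perl
use strict;
use warnings;

for my $file ( map { "${_}_0.pdb" } 1 .. 20 ) {
    open my $fh, '<', $file or die "Cannot open $file: $!";
    my (@refer, @points);
    my $part = 0;
    while (my $line = <$fh>) {
        chomp($line);
        if ($line =~ /^TER/) { $part++; next; }
        my @array = (split (/\s+/, $line))[5, 6, 7];
        if ($part == 0) { push @refer,  [ @array ] }
        else            { push @points, [ @array ] }
    }
    close $fh;

    my %max = (val=>0, x=>0, y=>0, z=>0);
    foreach my $ref (@refer) {
        my ($x1, $y1, $z1) = @{$ref};
        foreach my $atom (@points) {
            my ($x, $y, $z) = @{$atom};
            my $dist = sqrt( ($x-$x1)**2 + ($y-$y1)**2 + ($z-$z1)**2 );
            @max{qw(val x y z)} = ($dist, $x, $y, $z) if $dist > $max{val};
        }
    }
    print "$file: max is $max{val}; coord: x=$max{x}, y=$max{y}, z=$max{z}\n";
}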
The main issue here is reading the data. First, note that one cannot use split with PDB text files since the fields are defined by position and not by separators. See Coordinate File Description (PDB Format).
To separate the ATOM records of different polymer chains you can start with a simplified version like this:
my $iblock = 0;
my @atoms = ();
while (my $line = <IN>) {
    chomp($line);
    # Switch blocks at TER lines
    if ($line =~ /^TER/) {
        $iblock++;
    # Read ATOM lines
    } elsif ($line =~ m/^ATOM/) {
        my @xyz = (substr($line, 7-1, 9), substr($line, 16-1, 9), substr($line, 25-1, 9));
        printf "Block %d: atom at (%s)\n", $iblock, join(",", @xyz);
        push @{$atoms[$iblock]}, \@xyz;
    # Parse additional line types (if needed)
    } else {
        ...
    }
}
Followed by a loop over all pairs of coordinates from different blocks, structured as follows:
# 1st block
for my $iblock1 (0..$#atoms) {
    # 2nd block
    for my $iblock2 ($iblock1+1..$#atoms) {
        # Compare all pairs of atoms
        ...
        for my $xyz1 (@{$atoms[$iblock1]}) {
            for my $xyz2 (@{$atoms[$iblock2]}) {
                # Calculate distance and compare with $max_dist
                ...
            }
        }
        # Print the maximal distance between these two blocks
        ...
    }
}
Of course, the code could be more general if a more elaborate data structure is used or by applying one of the available PDB parsers, such as Bioperl's.
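For what it's worth, the inner part of that skeleton could be completed along these lines (a sketch that only prints the per-pair maximum instead of storing it):
for my $iblock1 (0..$#atoms) {
    for my $iblock2 ($iblock1+1..$#atoms) {
        # Compare all pairs of atoms from the two blocks
        my $max_dist = 0;
        for my $xyz1 (@{$atoms[$iblock1]}) {
            for my $xyz2 (@{$atoms[$iblock2]}) {
                my $dist = sqrt( ($xyz1->[0]-$xyz2->[0])**2
                               + ($xyz1->[1]-$xyz2->[1])**2
                               + ($xyz1->[2]-$xyz2->[2])**2 );
                $max_dist = $dist if $dist > $max_dist;
            }
        }
        # Report the maximal distance between these two blocks
        printf "Max distance between blocks %d and %d: %.3f\n",
            $iblock1, $iblock2, $max_dist;
    }
}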
With proper encapsulation, this is pretty simple, and requires minor modifications of your code.
ETA: Added fixed width solution I had on hand. It would probably be best to read all the fields instead of discarding the first 31 chars, and then return them all in a hash reference. That way, you could process all the lines with the same subroutine, and simply switch between parts when the first field turns out to be TER. It should be easy for you to extrapolate this from the given code.
You'll note that the reference values are read in with a loop, because we need to break the loop at the break point. The rest of the values are slurped up with a map statement. Then we simply feed the data to the subroutine we made from your initial code (with some improvements). I used the same names for the lexical variables to make it easier to read the code.
use strict;
use warnings;

my @points;
while (<DATA>) {
    last if /^TER$/;
    push @points, getpoints($_);
}
my @ref = map getpoints($_), <DATA>;

for my $p (@points) {
    getcoords($p, \@ref);
}

sub getpoints {
    my $line = shift;
    my @data = unpack "A31 A8 A8 A8", $line;
    shift @data;
    return \@data;
}

sub getcoords {
    my ($p, $ref) = @_;
    my ($p1, $p2, $p3) = @$p;
    my $max = 0;
    my ($x, $y, $z);
    for my $aref ( @$ref ) {
        my ( $x1, $y1, $z1 ) = @$aref;
        my $dist = sqrt(
            ($x1-$p1)**2 +
            ($y1-$p2)**2 +
            ($z1-$p3)**2
        );
        print "distance from ($p1 $p2 $p3) to ( $x1, $y1, $z1 ) is $dist\n";
        if ( $dist > $max ) {
            $max = $dist;
            $x = $x1;
            $y = $y1;
            $z = $z1;
        }
    }
    print "maximum value is : $max\n";
    print "co ordinates are : $x $y $z\n";
}
__DATA__
ATOM 1279 C ALA 81 -1.925 -11.270 1.404
ATOM 1280 O ALA 81 -0.279 9.355 15.557
ATOM 1281 OXT ALA 81 -2.188 10.341 15.346
TER
ATOM 1282 N THR 82 29.632 5.205 5.525
ATOM 1283 H1 THR 82 30.175 4.389 5.768
ATOM 1284 H2 THR 82 28.816 4.910 5.008
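The hash-reference variant hinted at in the answer might look roughly like the following sketch; the A6/A25 split of the first 31 characters and the field names are assumptions, and it reads from the same __DATA__ section as the script above:
# Return one hash reference per line; the caller switches parts at TER
sub getrecord {
    my $line = shift;
    my ($type, $rest, $x, $y, $z) = unpack "A6 A25 A8 A8 A8", $line;
    return { type => $type, x => $x, y => $y, z => $z };
}

my (@points, @ref);
my $seen_ter = 0;
while (my $line = <DATA>) {
    my $rec = getrecord($line);
    if ($rec->{type} =~ /^TER/) { $seen_ter = 1; next; }
    push @{ $seen_ter ? \@ref : \@points }, [ @{$rec}{qw(x y z)} ];
}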
I have a file that looks like:
ATOM 2517 O VAL 160 8.337 12.679 -2.487
ATOM 2518 OXT VAL 160 7.646 12.461 -0.386
TER
ATOM 2519 N VAL 161 -14.431 5.789 -25.371
ATOM 2520 H1 VAL 161 -15.336 5.698 -25.811
ATOM 2521 H2 VAL 161 -13.416 10.529 17.708
ATOM 2522 H3 VAL 161 -14.363 9.436 18.498
ATOM 2523 CA VAL 161 4.400 9.233 16.454
ATOM 2524 HA VAL 161 3.390 9.170 16.047
I have to remove "TER", the line before "TER", and the 3 lines after the line just after TER, and make the file continuous like this:
ATOM 2517 O VAL 160 8.337 12.679 -2.487
ATOM 2519 N VAL 161 -14.431 5.789 -25.371
ATOM 2523 CA VAL 161 4.400 9.233 16.454
ATOM 2524 HA VAL 161 3.390 9.170 16.047
A simple line-by-line script.
Usage: perl script.pl -i.bak fileglob
E.g. perl script.pl -i.bak File*MINvac.pdb
This will alter the original file, and save a backup of each file with the extension .bak. Note that if TER lines appear too close to the end of the file, it will cause warnings. On the other hand, so will the other solutions presented.
If you do not wish to save backups (use caution, since changes are irreversible!), use -i instead.
Code:
#!/usr/bin/perl
use v5.10;
use strict;
use warnings;
my $prev;
while (<>) {
    if (/^TER/) {
        print scalar <>;      # print next line
        <> for 1 .. 3;        # skip 3 lines
        $prev = undef;        # remove previous line
    } else {
        print $prev if defined $prev;
        $prev = $_;
    }
    if (eof) {                # New file next iteration?
        print $prev;
        $prev = undef;
    }
}
I realized I was supposed to write it in Perl, but by then I had already written it in Python. I'm posting it anyway as it may prove useful; I don't see any harm in that.
#!/usr/bin/python2.7
import sys
import glob
import os

try:
    dir = sys.argv[1]
except IndexError:
    print "Usage: "+sys.argv[0]+" dir"
    print "Example: "+sys.argv[0]+" /home/user/dir/"
    sys.exit(1)

for file in glob.glob(os.path.join(dir, 'File*_*MINvac.pdb')):
    fin = open(file, "r")
    content = fin.readlines()
    fin.close()
    for i in range(0, len(content)):
        try:
            if "TER" in content[i]:
                del content[i]
                del content[i-1]
                del content[i:i+3]
        except IndexError:
            break
    fout = open(file, "w")
    fout.writelines(content)
    fout.close()
Edit: Added support for multiple files, like the OP wanted.
So, for each set of 6 consecutive lines, you want to discard all but the third line if the second line is a TER?
TIMTOWTDI, but this should work:
my @queue;
while (<>) {
    push @queue, $_;
    @queue = $queue[2] if @queue == 6 and $queue[1] =~ /^TER$/;
    print shift @queue if @queue == 6;
}
print @queue;   # assume no TERs in last 4 lines
use strict;
use warnings;
use Tie::File;

my @array;
tie @array, 'Tie::File', 'myFile.txt' or die "Unable to tie file";

my %unwanted = map  { $_ => 1 }                  # Hashify ...
               map  { $_-1, $_, $_+2 .. $_+4 }   # ... the five lines ...
               grep { $array[$_] =~ /^TER/ }     # ... around 'TER' ...
               0 .. $#array;                     # ... in the file

# Remove the unwanted lines
@array = map { $array[$_] } grep { ! $unwanted{$_} } 0 .. $#array;

untie @array;   # The end
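Note that Tie::File maps the array directly onto myFile.txt, so the reassignment above rewrites the file in place; keep a backup copy of the file while testing.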
I have a list of numbers in a file, in one column, such as:
144
542
123
54
234
233
I want to group the numbers every n lines.
For example: if n=2, then 144,542 is one group, 123,54 the second, 234,233 the third, and so on until the end of the file.
The loop I wrote just gives me the first group of numbers and not the entire list:
What changes should I make?
use strict;
open ( IN, "$inputfile") || die ("cannot open ! ");
my @list;
my $N = 2;
while (@list = <IN>) {
    chomp;
    for ( $i=1; $i<=$N; $i++ ) {
        print "@list[$i]";
    }
}
Use natatime from List::MoreUtils
use warnings;
use strict;
use List::MoreUtils qw(natatime);

my $n = 2;
my @list;
while (<DATA>) {
    chomp;
    push @list, $_;
}

my $it = natatime($n, @list);
while (my @vals = $it->()) {
    print "@vals\n";
}
__DATA__
144
542
123
54
234
233
Prints:
144 542
123 54
234 233
You can use the by function from List::Gen to partition a list into equal size segments:
use List::Gen qw(by);

my $pairs = by 2 =>                   # partition by 2
    grep {s/^\s+|\s+$//g; length}     # remove whitespace and empty lines
    <DATA>;                           # read all lines

print "@$_\n" for @$pairs;
__DATA__
144
542
123
54
234
233
which prints:
144 542
123 54
234 233
I have to applaud your use of strict and would like to encourage you to also add warnings. :)
And a solution that makes the semantics a bit more clear:
use strict;
use warnings;
use File::Slurp 'read_file';
use Array::Split qw( split_by );

my $inputfile = 'file';
my @lines = read_file( "$inputfile" );
$_ =~ s/[\r\n]//g for @lines;   # remove newlines

my @grouped_lines = split_by( 2, @lines );

for my $group ( @grouped_lines ) {
    print join ',', @{$group};
    print "\n";
}
__END__
144
542
123
54
234
233
becomes:
144,542
123,54
234,233