I want to calculate overlap number(#) and percentage (%) from a series of ranges values distributed in four different files initiated with a specific identifier(id) (like NP_111111.4) . The initial list of ids are taken from file1.txt (starting file) and if the id matches with ids of other file, overlaps are calculated. Suppose my files are like this:
file1.txt
NP_111111.4: 1-9 12-20 30-41
YP_222222.2: 3-30 40-80
file2.txt
NP_111111.4: 1-6, 13-22, 31-35, 36-52
NP_414690.4: 360-367, 749-755
YP_222222.2: 19-24, 22-40
file3.txt
NP_418214.2: 1-133, 135-187, 195-272
YP_222222.2: 1-10
file4.txt
NP_418119.2
YP_222222.2 GO:0016878, GO:0051108
NP_111111.4 GO:0005887
From these input file, I want to create a .csv or excel output with separate columns with header as:
id overlap_file1_file2(#) overlap_file1_file2(%) overlap_file1_file3(#) overlap_file1_file3(%) overlap_file1_file2_file3(#) overlap_file1_file2_file3(%) Go_Terms(File4)
I am learning perl and found a perl module "strictures" for this type of range comparison. I am calculating overlapping number and percentage from two ranges as:
#!/usr/bin/perl
use strictures;
use Number::Range;
my $seq1 = Number::Range->new(8..356); #Start and stop for file1.txt
my $seq2 = Number::Range->new(156..267); #Start and stop for file2.txt
my $overlap = 0;
my $sseq1 = $seq1->size;
my $percent = (($seq2->size * 100) / $seq1->size);
foreach my $int ($seq2->range) {
if ( $seq1->inrange($int) ) {
$overlap++;
}
else {
next;
}
}
print "Total size= $sseq1 Number overlapped= $overlap Percentage overlap= $percent \n";
But I could not find a way to match ids of (file1.txt) with other files to extract specific information and to print them in a output csv file.
Please help. Thanks for your consideration.
This is a fragile solution in that it can only check 3 files for overlaps. If more files are involved, the code would need to be restructured. It uses Set::IntSpan to calculate the overlaps (and percent of overlaps.
#!/usr/bin/perl
use strict;
use warnings;
use Set::IntSpan;
use autodie;
my $file1 = 'file1';
my #files = qw/file2 file3/;
my %data;
my %ids;
open my $fh1, '<', $file1;
while (<$fh1>) {
chomp;
my ($id, $list) = split /:\s/;
$ids{$id}++;
$data{$file1}{$id} = Set::IntSpan->new(split ' ', $list);
}
close $fh1;
for my $file (#files) {
open my $fh, '<', $file;
while (<$fh>) {
chomp;
my ($id, $list) = split /:\s/;
next unless exists $ids{$id};
$data{$file}{$id} = Set::IntSpan->new(split /,\s/, $list);
}
close $fh;
}
my %go_terms;
open my $go, '<', 'file4';
while (<$go>) {
chomp;
my ($id, $terms) = split ' ', $_, 2;
$go_terms{$id} = $terms =~ tr/,//dr;
}
close $go;
my %output;
for my $file (#files) {
for my $id (keys %ids) {
my $count = ($data{$file1}{$id} * $data{$file}{$id})->size;
my $percent = sprintf "%.0f", 100 * $count / $data{$file1}{$id}->size;
$output{$id}{$file} = [$count, $percent];
}
}
for my $id (keys %ids) {
my $count = ($data{$file1}{$id} * $data{$files[0]}{$id} * $data{$files[1]}{$id})->size;
my $percent = sprintf "%.0f", 100 * $count / $data{$file1}{$id}->size;
$output{$id}{all_files} = [$count, $percent];
}
# output saved as f2.csv
print join(",", qw/ID f1f2_overlap f1f2_%overlap
f1f3_overlap f1f3_%overlap
f1f2f3_overlap f1f2f3_%overlap Go_terms/), "\n";
for my $id (keys %output) {
print "$id,";
for my $file (#files, 'all_files') {
my $aref = $output{$id}{$file};
print join(",", #$aref), ",";
}
print +($go_terms{$id} // ''), "\n";
}
The Excel sheet looks like this.
Related
I have a PDB file which contains information about a specific protein. One of the information it holds is the positions of the different atoms (xyz coordinates).
The file is the following https://files.rcsb.org/view/6U9D.pdb . With this file I want to calculate the geometric center of the atoms. In theory I know what I need to do, but the script I wrote does not seem to work.
The first part of the program, before the for loop, is another part of the assignment which requires me to read the sequence and convert it from the 3 letter nomenclature to the 1 letter one. The part that interests me is the for loop until the end. I tried to pattern match in order to isolate the XYZ coordinates. Then I used a counter that I had set up in the beginning which is the $k variable. When I check the output on cygwin the only output I get is the sequence 0 0 0 instead of the sum of each dimension divided by $k. Any clue what has gone wrong?
$k=0;
open (IN, '6U9D.pdb.txt');
%amino_acid_conversion = (
ALA=>'A',TYR=>'Y',MET=>'M',LEU=>'L',CYS=>'C',
GLY=>'G',ARG=>'R',ASN=>'N',ASP=>'D',GLN=>'Q',
GLU=>'E',HIS=>'H',TRP=>'W',LYS=>'K',PHE=>'F',
PRO=>'P',SER=>'S',THR=>'T',ILE=>'I',VAL=>'V'
);
while (<IN>) {
if ($_=~m/HEADER\s+(.*)/){
print ">$1\n"; }
if ($_=~m/^SEQRES\s+\d+\s+\w+\s+\d+\s+(.*)/){
$seq.=$1;
$seq=~s/ //g;
}
}
for ($i=0;$i<=length $seq; $i+=3) {
print "$amino_acid_conversion{substr($seq,$i,3)}";
if ($_=~m/^ATOM\s+\d+\s+\w+\s+\w+\s+\w+\s+\d+\s+(\S+)\s+(\S+)\s+(\S+)/) {
$x+=$1; $y+=$2; $z+=$3; $k++;
}
}
print "\n";
#print $k;
$xgk=($x/$k); $ygk=($y/$k); $zgk=($z/$k);
print "$xgk $ygk $zgk \n";
I do not know bioinformatics but it seems like you should do something like this:
use feature qw(say);
use strict;
use warnings;
my $fn = '6U9D.pdb';
open ( my $IN, '<', $fn ) or die "Could not open file '$fn': $!";
my $seq = '';
my $x = 0;
my $y = 0;
my $z = 0;
my $k = 0;
while (<$IN>) {
if ($_ =~ m/HEADER\s+(.*)/) {
say ">$1";
}
if ($_=~m/^SEQRES\s+\d+\s+\w+\s+\d+\s+(.*)/){
$seq .= $1;
}
if ($_ =~ m/^ATOM\s+\d+\s+\w+\s+\w+\s+\w+\s+\d+\s+(\S+)\s+(\S+)\s+(\S+)/) {
$x+=$1; $y+=$2; $z+=$3; $k++;
}
}
close $IN;
$seq =~ s/ //g;
my %amino_acid_conversion = (
ALA=>'A',TYR=>'Y',MET=>'M',LEU=>'L',CYS=>'C',
GLY=>'G',ARG=>'R',ASN=>'N',ASP=>'D',GLN=>'Q',
GLU=>'E',HIS=>'H',TRP=>'W',LYS=>'K',PHE=>'F',
PRO=>'P',SER=>'S',THR=>'T',ILE=>'I',VAL=>'V'
);
my %unknown_keys;
my $conversion = '';
say "Sequence length: ", length $seq;
for (my $i=0; $i < length $seq; $i += 3) {
my $key = substr $seq, $i, 3;
if (exists $amino_acid_conversion{$key}) {
$conversion.= $amino_acid_conversion{$key};
}
else {
$unknown_keys{$key}++;
}
}
say "Conversion: $conversion";
say "Unknown keys: ", join ",", keys %unknown_keys;
say "Number of atoms: ", $k;
my $xgk=($x/$k);
my $ygk=($y/$k);
my $zgk=($z/$k);
say "Geometric center: $xgk $ygk $zgk";
This gives me the following output:
[...]
Number of atoms: 76015
Geometric center: 290.744642162734 69.196842162731 136.395842938893
I want to print a random new word English in dictionary file in terminal Unix by Perl. I want to select and print a random line and 2 follow lines.
But my code doesn't complete this work.
Please help me to improve it.
An example of the output I wish:
#inspire: ....
ghk
lko...
Dictionary file:
#inspiration: mean....
abc def...
ghk lmn
...
#inspire: ....
ghk
lko...
#people: ...
...
The complete dictionary file is here anhviet109K.txt. It's about 14MB
My code:
use strict;
use warnings;
use File::Copy qw(copy move);
my $files = 'anhviet109K.txt';
my $fh;
my $linewanted = 16 + int( rand( 513796 - 16 ) );
# 513796: number of lines of file dic.txt
open( $fh, "<", $files ) or die "cannot open < $fh: $!";
my $del = " {2,}";
my $temp = 0;
my $count = 0;
while ( my $line = <$fh> ) {
if ( ( $line =~ "#" ) && ( $. > $linewanted ) ) {
$count = 4;
}
else {
next;
}
if ( $count > 0 ) {
print $line;
$count--;
}
else {
last;
}
}
close $fh;
Something like this, perhaps?
Your data has helped me to exclude the header entries in your dictionary file
This program finds the location of all of the entries (lines beginning with #) in the file, then chooses one at random and prints it
Tốt học tiếng Anh may mắn
use strict;
use warnings 'all';
use Fcntl ':seek';
use constant FILE => 'anhviet109K.txt';
open my $fh, '<', FILE or die qq{Unable to open "#{[FILE]}" for input: $!};
my #seek; # Locations of all the definitions
my $addr = tell $fh;
while ( <$fh> ) {
push #seek, $addr if /^\#(?!00-)/;
$addr = tell $fh;
}
my $choice = $seek[rand #seek];
seek $fh, $choice, SEEK_SET;
print scalar <$fh>;
while ( <$fh> ) {
last if /^\#/;
print;
}
output
#finesse /fi'nes/
* danh từ
- sự khéo léo, sự phân biệt tế nhị
- mưu mẹo, mánh khoé
* động từ
- dùng mưu đoạt (cái gì); dùng mưu đẩy (ai) làm gì; dùng mưu, dùng kế
=to finesse something away+ dùng mưu đoạt cái gì
A single pass approach:
use strict;
use warnings;
use autodie;
open my $fh, '<:utf8', 'anhviet109K.txt';
my $definition = '';
my $count;
my $select;
while (my $line = <$fh>) {
if ($line =~ /^#(?!00-)/) {
++$count;
$select = rand($count) < 1;
if ($select) {
$definition = $line;
}
}
elsif ($select) {
$definition .= $line;
}
}
# remove blank line that some entries have
$definition =~ s/^\s+\z//m;
binmode STDOUT, ':utf8';
print $definition;
This iterative random selection always selects the first item, has a 1/2 chance of replacing it with the second item, a 1/3 for the third, and so on.
I'm new to Perl. I have two text files and I need to check matching string on both lists.
For example matching strings are:
line - file 1: fe/bla/blablabla/abcdefg
line - file 2: blablabla/abcdefg
There is a match!
In addition, the location (line number) of the matching strings is not the same on both files.
I tried put the lists in arrays and compare the arrays with nested loop, but the running time of the program is huge (the lists contain thousand of lines) and I believe there is another way, less naïve and more productive.
This is the way I put the data in the array:
my $list1 = /path/to/the/file;
open (FILE , '<' , $list1) or die ("Could not open the file");
while ( my $line = <FILE> ) {
chomp ($line);
$list_1[$i] = $line;
$i = $i+1;
}
close FILE;
I did it to the other list as well.
And this is my nested loop.
for ( $k = 0 ; $k < #list_1 ; $k = $k+1 ) {
for ($i = 0 ; $i < #list_2 ; $i = $i+1 ) {
if (index($list_1[$k] , $list_2[$i]) != -1) {
splice (#list_2 , $i , 1);
last;
}
}
}
As long as file2 isn't enormous, the simplest way is to build a regular expression pattern from its contents and check each line in file1 against the pattern.
You don't say what output you want, so I have printed all lines in file1 that have a match in file2.
use strict;
use warnings;
use 5.010;
use autodie;
my ($list1, $list2) = qw( /path/to/list1 /path/to/list2 );
open my $fh, '<', $list2;
my $re = join '|', map { chomp; quotemeta; } <$fh>;
$re = qr/$re/;
open $fh, '<', $list2;
while ( <$fh> ) {
print if /$re/;
}
I have a csv with about 160,000 lines, it looks like this:
chr1,160,161,3,0.333333333333333,+
chr1,161,162,4,0.5,-
chr1,309,310,14,0.0714285714285714,+
chr1,311,312,2,0.5,-
chr1,499,500,39,0.717948717948718,+
chr2,500,501,8,0.375,-
chr2,510,511,18,0.5,+
chr2,511,512,6,0.333333333333333,-
I would like to pair lines where column 1 is the same, column 3 matches column 2 and where column 6 is a '+' while on the other line it is a '-'. If this is true I would like to sum column 4 and column 5.
My desired out put would be
chr1,160,161,7,0.833333333333333,+
chr1,309,310,14,0.0714285714285714,+
chr1,311,312,2,0.5,-
chr1,499,500,39,0.717948717948718,+
chr2,500,501,8,0.375,-
chr2,510,511,24,0.833333333333333,-
the best solution I can think of is to duplicate the file and then match columns between the file and it's duplicate with perl:
#!/usr/bin/perl
use strict;
use warnings;
open my $firstfile, '<', $ARGV[0] or die "$!";
open my $secondfile, '<', $ARGV[1] or die "$!";
my ($chr_a, $chr_b,$start,$end,$begin,$finish, $sum_a, $sum_b, $total_a,
$total_b,$sign_a,$sign_b);
while (<$firstfile>) {
my #col = split /,/;
$chr_a = $col[0];
$start = $col[1];
$end = $col[2];
$sum_a = $col[3];
$total_a = $col[4];
$sign_a = $col[5];
seek($secondfile,0,0);
while (<$secondfile>) {
my #seccol = split /,/;
$chr_b = $seccol[0];
$begin = $seccol[1];
$finish = $seccol[2];
$sum_b = $seccol[3];
$total_b = $seccol[4];
$sign_b = $seccol[5];
print join ("\t", $col[0], $col[1], $col[2], $col[3]+=$seccol[3],
$col[4]+=$seccol[4], $col[5]),
"\n" if ($chr_a eq $chr_b and $end==$begin and $sign_a ne $sign_b);
}
}
And that works fine, but ideally I'd like to be able to do this within the file itself without having to duplicate it, because I have many files and so I would like to run a script over all of them that is less time-consuming.
Thanks.
In the absence of a response to my comment, this program will do as you ask with the data you provide.
use strict;
use warnings;
my #last;
while (<DATA>) {
s/\s+\z//;
my #line = split /,/;
if (#last
and $last[0] eq $line[0]
and $last[2] eq $line[1]
and $last[5] eq '+' and $line[5] eq '-') {
$last[3] += $line[3];
$last[4] += $line[4];
print join(',', #last), "\n";
#last = ()
}
else {
print join(',', #last), "\n" if #last;
#last = #line;
}
}
print join(',', #last), "\n" if #last;
__DATA__
chr1,160,161,3,0.333333333333333,+
chr1,161,162,4,0.5,-
chr1,309,310,14,0.0714285714285714,+
chr1,311,312,2,0.5,-
chr1,499,500,39,0.717948717948718,+
chr2,500,501,8,0.375,-
chr2,510,511,18,0.5,+
chr2,511,512,6,0.333333333333333,-
output
chr1,160,161,7,0.833333333333333,+
chr1,309,310,14,0.0714285714285714,+
chr1,311,312,2,0.5,-
chr1,499,500,39,0.717948717948718,+
chr2,500,501,8,0.375,-
chr2,510,511,24,0.833333333333333,+
I am new to perl require your help to build a logic.
I have some let say 10 files in a directory, each file has some data like below. Each file contain lines depends upon the number of users setting. For example if 4 users are there then 4 lines will get printed from the server.
1405075666889,4044,SOA_breade,200,OK,Thread Group 1-1,text,true,623,4044
1405075666889,4041,SOA_breade,200,OK,Thread Group 1-1,text,true,623,4041
1405075666889,4043,SOA_breade,200,OK,Thread Group 1-1,text,true,623,4043
1405075666889,4045,SOA_breade,200,OK,Thread Group 1-1,text,true,623,4044
I want to some piece of logic that should create single file in a output directory and that file should contain 10 lines
Min_Value, Max_Value, Avg_Value, User1, User2, User3......User4
and their corresponding values from second line in this case corresponding values are coming from second column.
Min_Value, Max_Value, Avg_Value, User1, User2, User3......User4
4.041,4.045,4.044,4.041,4.043,4.045
.
.
.
.
.
10th file data
Here is my code... It is working however I am not getting how to print user1, user2... in sequence and its corresponding values
my #soaTime;
my #soaminTime;
my #soamaxTime;
my #soaavgTime;
my $soadir = $Json_Time;
foreach my $inputfile (glob("$soadir/*Overview*.txt")) {
open(INFILE, $inputfile) or die("Could not open file.");
foreach my $line (<INFILE>) {
my #values = split(',', $line); # parse the file
my $time_ms = $values[1]/1000;
push (#soaTime, $time_ms);
}
my $min = min #soaTime;
push (#soaminTime, $min);
print $soaminTime[0];
my $max = max #soaTime;
push (#soamaxTime, $max);
sub mean { return #_ ? sum(#_) / #_ : 0 };
#print mean(#soaTime);
push (#soaavgTime, mean());
close(INFILE);
}
my $outputfile = $report_path."abc.txt";
open (OUTFILE, ">$outputfile");
print OUTFILE ("Min_Value,Max_Value,User1,User2,User3,User4"."\n"); # Prining the data
for (my $count = 0; $count <= $#soaTC; $count++) {
print OUTFILE("$soaminTime[0],$soamaxTime[0],$soaTime[0],$soaTime[1],$soaTime[2],$soaTime[3]"."\n" ); #Prining the data
}
close(OUTFILE);
Please help.
is it?
use strict;
use List::Util qw( min max );
my $Json_Time="./test";
my $report_path="./out/";
my #soaTime;
my #soaminTime;
my #soamaxTime;
my #soaavgTime;
my #users;
my $maxusers;
my $soadir = $Json_Time;
foreach my $inputfile (glob("$soadir/*Overview*.txt")) {
open(INFILE, $inputfile) or die("Could not open file.");
my $i=0;
my #m_users;
my #m_soaTime;
foreach my $line (<INFILE>) {
my #values = split(',', $line); # parse the file
my $time_ms = $values[1]/1000;
push (#m_soaTime, $time_ms);
$i++;
push(#m_users, "User".$i);
}
push(#soaTime,\#m_soaTime);
if ($maxusers<$#m_users) {
#users=#m_users;
$maxusers=$#m_users;
}
my $min = min(#m_soaTime);
push (#soaminTime, $min);
my $max =max(#m_soaTime);
push (#soamaxTime, $max);
sub mean { return #_ ? sum(#_) / #_ : 0 };
push (#soaavgTime, mean());
close(INFILE);
}
my $outputfile = $report_path."abc.txt";
open (OUTFILE, ">$outputfile");
print OUTFILE "Min_Value,Max_Value,Avg_Value,".join(',',#users)."\n"; # Prining the data
for (my $count = 0; $count <= $#soaavgTime; $count++) {
print OUTFILE $soaminTime[$count].","
.$soamaxTime[$count].","
.$soaavgTime[$count].","
.join(',',#{$soaTime[$count]})
."\n"; #Prining the data
}
close(OUTFILE);