Compare two directories for differences in regular files - perl

I need to compare two directories and check every file in both directories for files with the same name. When the same name occurs in both, I check whether the files have the same content: if so, I print > file <; if the file has the same name but not the same content, I print < file >. If there is a file that's in directory 1 but not in directory 2, I print <<< file1, and likewise >>> file2 for a file in dir 2 but not dir 1.
I have been having trouble: my code doesn't even reach the comparison when I test whether the files have the same name.
#!/usr/bin/perl -w
use File::Basename;
@files1 = `/usr/bin/find $ARGV[0] -print`;
chop @files1;
@files2 = `/usr/bin/find $ARGV[1] -print`;
chop @files2;
here:
for ($i=1; @files1 >= $i; $i++) {
    for ($x=1; @files2 >= $x; $x++) {
        $file1 = basename($files1[$i]);
        $file2 = basename($files2[$x]);
        if ($file1 eq $file2) {
            shift @files1;
            shift @files2;
            $result = `/usr/bin/diff -q $files1[$i] $files2[$x]`;
            chop $result;
            if ($result eq "Files $files1[$i] and $files2[$x] differ") {
                print "< $file1 >\n";
                next here;
            }
            else {
                print "> $file1 <\n";
            }
        }
        else {
            if ( !-e "$files1[$i]/$file2") { print ">>> $file2\n";}
            unless ( -e "$files2[$x]/$file1") { print "<<< $file1\n";}
        }
    }
}

Try using this:
diff -aqr /dir1 /dir2
or:
#!/bin/bash
for f; do
    for g; do
        if [[ "$f" != "$g" ]]; then
            cmp "$f" "$g" &>/dev/null || echo "$f differs from $g"
        fi
    done
done
Use it like this:
./script dir1/* dir2/*


Eliminate empty files in a subroutine in Perl

I want to add code to the following script to eliminate the empty output files.
The script converts a single fastq file, or all the fastq files in a folder, to fasta format; each output fasta file keeps the name of its fastq file. The script has an option to exclude all sequences that contain a given number of N repeats (for example NNNNNNNNNNNNNNNNNNATAGTGAAGAATGCGACGTACAGGATCATCTA). I added this option because some sequences contain only NNNNN; for example, if the -n option is set to 15 (-n 15) it will exclude all sequences with 15 or more consecutive Ns. Up to this point the code works well, but it generates empty files whenever every sequence in a fastq file has 15 or more N repeats and is therefore excluded. I want to remove all the empty files (those without sequences) and report a count of how many files were removed because they were empty.
Code:
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;
my ($infile, $file_name, $file_format, $N_repeat, $help, $help_descp,
$options, $options_descrp, $nofile, $new_file, $count);
my $fastq_extension = "\\.fastq";
GetOptions (
'in=s' => \$infile,
'N|n=i' =>\$N_repeat,
'h|help' =>\$help,
'op' =>\$options
);
# Help
$help_descp =(qq(
Usage:
fastQF -in fastq_folder/ -n 15
or
fastQF -in file.fastq -n 15
));
$options_descrp =(qq(
-in infile.fastq or fastq_folder/ required
-n exclude sequences with more than N repeat optional
-h Help description optional
-op option section optional
));
$nofile =(qq(
ERROR: "No File or Folder Were Chosen !"
Usage:
fastQF -in folder/
Or See -help or -op section
));
# Check Files
if ($help){
print "$help_descp\n";
exit;
}
elsif ($options){
print "$options_descrp\n";
exit;
}
elsif (!$infile){
print "$nofile\n";
exit;
}
#Subroutine to convert from fastq to fasta
sub fastq_fasta {
my $file = shift;
($file_name = $file) =~ s/(.*)$fastq_extension.*/$1/;
# eliminate old files
my $oldfiles= $file_name.".fasta";
if ($oldfiles){
unlink $oldfiles;
}
open LINE, '<', $file or die "can't read or open $file\n";
open OUTFILE, '>>', "$file_name.fasta" or die "can't write $file_name\n";
while (
defined(my $head = <LINE>) &&
defined(my $seq = <LINE>) &&
defined(my $qhead = <LINE>) &&
defined(my $quality = <LINE>)
) {
substr($head, 0, 1, '>');
if (!$N_repeat){
print OUTFILE $head, $seq;
}
elsif ($N_repeat){
my $number_n=$N_repeat-1;
if ($seq=~ m/(n)\1{$number_n}/ig){
next;
}
else{
print OUTFILE $head, $seq;
}
}
}
close OUTFILE;
close LINE;
}
# execute the subroutine to extract the sequences
if (-f $infile) { # -f checks for a plain file
fastq_fasta($infile);
}
else {
foreach my $file (glob("$infile/*.fastq")) {
fastq_fasta($file);
}
}
exit;
I have tried to use the following code outside of the subroutine (before exit), but it only works for the last file:
$new_file = $file_name.".fasta";
foreach ($new_file){
    if (-z $new_file){
        $count++;
        if ($count==1){
            print "\n\"The chosen file contains no sequences\"\n";
            print " \"or was excluded due to -n $N_repeat\"\n\n";
        }
        elsif ($count >=1){
            print "\n\"$count files contain no sequences\"\n";
            print " \"or were excluded due to -n $N_repeat\"\n\n";
        }
        unlink $new_file;
    }
}
and I have tried something similar inside the subroutine, but this last code doesn't work!
Any advice?
Thanks so much!
You should check whether anything was written to your new file at the end of your fastq_fasta subroutine. Just put your check after the close OUTFILE statement:
close OUTFILE;
close LINE;

my $outfile = $file_name.".fasta";
if (-z $outfile)
{
    unlink $outfile or die "Error while deleting '$outfile': $!";
}
Additionally, it would be better to add a die/warn statement to the other unlink line as well, so that failures there are not silent either. Empty files should be deleted.
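For example, a minimal sketch of that other unlink (the cleanup of a stale .fasta file near the top of the subroutine), with a warning added:

# remove a stale output file before appending, and report if that fails
my $oldfiles = $file_name . ".fasta";
if (-e $oldfiles) {
    unlink $oldfiles or warn "Could not delete '$oldfiles': $!";
}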
Another possible solution, if you are not tied to Perl but can use sed and a bash loop:
for i in *.fastq
do
    out=$(dirname "$i")/$(basename "$i" .fastq).fasta
    sed -n '1~4{s/^@/>/;N;p}' "$i" > "$out"
    if [ ! -s "$out" ]
    then
        echo "Empty output file $out"
        rm "$out"
    fi
done
Hope that helps!
Best Frank
The easiest thing to do is probably to add a counter to your subroutine to keep track of the number of sequences in the outfile:
sub fastq_fasta {
my $counter1 = 0;
my $file = shift;
($file_name = $file) =~ s/(.*)$fastq_extension.*/$1/;
# eliminate old files
my $oldfiles= $file_name.".fasta";
if ($oldfiles){
unlink $oldfiles;
}
open LINE, '<', $file or die "can't read or open $file\n";
open OUTFILE, '>>', "$file_name.fasta" or die "can't write $file_name\n";
while (
defined(my $head = <LINE>) &&
defined(my $seq = <LINE>) &&
defined(my $qhead = <LINE>) &&
defined(my $quality = <LINE>)
) {
$counter1 ++;
substr($head, 0, 1, '>');
if (!$N_repeat){
print OUTFILE $head, $seq;
}
elsif ($N_repeat){
my $number_n=$N_repeat-1;
if ($seq=~ m/(n)\1{$number_n}/ig){
$counter1 --;
next;
}
else{
print OUTFILE $head, $seq;
}
}
}
close OUTFILE;
close LINE;
return $counter1;
}
You can then delete files when the returned count is zero:
if (-f $infile) { # -f checks for a plain file
fastq_fasta($infile);
}
else {
foreach my $file (glob("$infile/*.fastq")) {
if (fastq_fasta($file) == 0) {
$file =~ s/(.*)$fastq_extension.*/$1.fasta/;
unlink $file;
}
}
}

Automate paired analysis

I have a growing number of files to process using a simple Perl script I wrote. The script takes two files as input and prints an output. I want to use a bash script (or anything really) to automate the following usage:
perl Program.pl GeneLevels_A GeneLevels_B > GeneLevels_A_B
with every paired, non-directional combination of files in a particular directory.
Here is the Perl script:
#!/usr/bin/perl
use strict;
use warnings;

die "Usage: $0 <File_1> <File_2>\n" unless @ARGV == 2;

my $file1 = shift @ARGV;
my $file2 = shift @ARGV;
my %hash1;
my %hash2;
my $counter = 0;
my $amt = 25;
my $start = 244 - $amt;

open (REF, $file1);
while (<REF>) {
    my $line = $_;
    chomp $line;
    if ($counter < $start) {
        $counter++;
        next;
    }
    my @cells = split('\t', $line);
    my $ID = $cells[2];
    my $row = $cells[0];
    $hash1{$ID} = $row;
    $counter++;
}
close REF;

$counter = 0;
open (FILE, $file2);
while (<FILE>) {
    my $line = $_;
    chomp $line;
    if ($counter < $start) {
        $counter++;
        next;
    }
    my @cells = split('\t', $line);
    my $ID = $cells[2];
    my $row = $cells[0];
    $hash2{$ID} = $row;
    $counter++;
}
close FILE;

while ( my ($key, $value) = each(%hash1) ) {
    if ( exists $hash2{$key} ) {
        print "$key\t$value\t$hash2{$key}\n";
    }
}
A good solution would allow me to run the Perl script on every file with an appropriate suffix.
An even better solution would assess the suffixes of existing files to determine which pairs of files have already been processed this way and omit those. For example if File_A, File_B, File_C, and File_B_C exist then only File_A_B and File_A_C would be produced. Note that File_A_B and File_B_A are equivalent.
This should work. Better checks for bad arguments would be a good thing to add:
#!/bin/bash
if [ $# != 2 ]; then
    echo "usage: pair <suffix1> <suffix2>"
    exit
fi

suffix1=$1
suffix2=$2

for file1 in *_${suffix1}; do
    fileCheck=$(echo $file1 | sed -e "s#_$suffix2##")
    if [ "$fileCheck" = "$file1" ]; then
        file2=${file1/_$suffix1/_$suffix2}
        if [[ ( ! -f ${file1}_${suffix2} ) && ( ! -f ${file2}_${suffix1} ) ]]; then
            echo processing ${file1}_${suffix2}
            perl Program.pl $file1 $file2 > ${file1}_${suffix2}
        fi
    fi
done
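If you would rather drive the pairing from Perl itself, here is a rough sketch. It assumes the inputs follow the GeneLevels_A naming from the question, that suffixes contain no underscore, and that an existing output named either A_B or B_A means the pair is already done; adjust the pattern to your real file names.

#!/usr/bin/perl
use strict;
use warnings;

# Collect only the single-suffix input files, e.g. GeneLevels_A, GeneLevels_B
my @inputs = grep { /^GeneLevels_[^_]+$/ } glob('GeneLevels_*');

for my $i (0 .. $#inputs) {
    for my $j ($i + 1 .. $#inputs) {
        my ($a, $b) = ($inputs[$i], $inputs[$j]);
        my ($suffix_a) = $a =~ /^GeneLevels_(.+)$/;
        my ($suffix_b) = $b =~ /^GeneLevels_(.+)$/;
        my $out_ab = "GeneLevels_${suffix_a}_${suffix_b}";
        my $out_ba = "GeneLevels_${suffix_b}_${suffix_a}";

        # skip non-directional pairs that were already processed in either order
        next if -e $out_ab or -e $out_ba;

        print "processing $out_ab\n";
        system("perl Program.pl $a $b > $out_ab") == 0
            or warn "Program.pl failed for $a / $b\n";
    }
}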

How to compare two directories and their files in perl

Fred here again with a little issue I'm having that I hope you guys can help me with.
I'm reviewing for midterms and going over an old file I found on here and I wanted to get it working. I can't find it on here anymore but I still have the source code so I'll make another question on it.
So here was his assignment:
Write a perl script that will compare two directories for differences in regular files. All regular files with the same names should be tested with the unix command /usr/bin/diff -q, which will determine whether they are identical. A file in dir1 which does not have a similarly named file in dir2 will have its name printed after the string <<<, while a file in dir2 without a corresponding dir1 entry will be prefixed with the string >>>. If two files have the same name but are different, the file name will be surrounded by > <.
Here is the script:
#!/usr/bin/perl -w
use File::Basename;
@files1 = `/usr/bin/find $ARGV[0] -print`;
chop @files1;
@files2 = `/usr/bin/find $ARGV[1] -print`;
chop @files2;
statement:
for ($i=1; @files1 >= $i; $i++) {
    for ($x=1; @files2 >= $x; $x++) {
        $file1 = basename($files1[$i]);
        $file2 = basename($files2[$x]);
        if ($file1 eq $file2) {
            shift @files1;
            shift @files2;
            $result = `/usr/bin/diff -q $files1[$i] $files2[$x]`;
            chop $result;
            if ($result eq "Files $files1[$i] and $files2[$x] differ") {
                print "< $file1 >\n";
                next statement;
            } else {
                print "> $file1 <\n";
            }
        } else {
            if ( !-e "$files1[$i]/$file2") { print ">>> $file2\n";}
            unless ( -e "$files2[$x]/$file1") { print "<<< $file1\n";}
        }
    }
}
This is the output:
> file2 <
>>> file5
<<< file1
The output should be:
> file1 <
> file2 <
<<< file4
>>> file5
I already checked the files to make sure that they all match and such but still having problems. If anyone can help me out I would greatly appreciate it!
First off, always use these:
use strict;
use warnings;
It comes with a short learning curve, but they more than make up for it in the long run.
Some notes:
You should use the File::Find module instead of using a system call.
You start your loops at array index 1. In Perl, the first array index is 0, so you skip the first element.
Your loop condition is wrong. @files >= $x means you will iterate to 1 more than the max index (normally). You want either $x < @files or $x <= $#files (see the small example after these notes).
You should use chomp, which is a safer version of chop.
Altering the arrays you are iterating over is a sure way to cause yourself some confusion.
Why use if (! -e ...) and then unless (-e ...)? That surely just adds confusion.
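For instance, here is a small corrected skeleton illustrating just the chomp and loop-bound points above (not a full solution):

my @files1 = `/usr/bin/find $ARGV[0] -print`;
chomp @files1;                            # chomp removes only the trailing newline

for (my $i = 0; $i <= $#files1; $i++) {   # start at 0, stop at the last index
    print "$files1[$i]\n";
}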
And this part:
$file1 = basename($files1[$i]);
...
if ( !-e "$files1[$i]/$file2" )
Assuming @files1 contains file names and not just directories, this will never match anything. For example:
$file2 = basename("dir/bar.html");
$file1 = basename("foo/bar.html");
-e "foo/bar.html/bar.html"; # does not compute
I would recommend using hashes for the lookup, assuming you only want to match against identical file names and missing file names:
use strict;
use warnings;
use File::Find;
use List::MoreUtils qw(uniq);

my (%files1, %files2);
my ($dir1, $dir2) = @ARGV;

find( sub { $files1{$_} = $File::Find::name if -f }, $dir1);
find( sub { $files2{$_} = $File::Find::name if -f }, $dir2);

my @all = uniq(keys %files1, keys %files2);

for my $file (@all) {
    my $result;
    if ($files1{$file} && $files2{$file}) {   # file exists in both dirs
        $result = qx(/usr/bin/diff -q $files1{$file} $files2{$file});
        # ... etc
    } elsif ($files1{$file}) {                # file only exists in dir1
    } else {                                  # file only exists in dir2
    }
}
In the find() subroutine, $_ represents the base name, and $File::Find::name the name including the path (which is suitable for use with diff). The -f check ensures that you only include regular files in your hash.
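To get the markers the assignment asks for, the elided branches could be filled in roughly like this. This is only a sketch: it assumes identical files produce no output (as in your expected output) and relies on diff -q printing nothing when the files match.

for my $file (@all) {
    if ($files1{$file} && $files2{$file}) {    # file exists in both dirs
        my $result = qx(/usr/bin/diff -q $files1{$file} $files2{$file});
        print "> $file <\n" if $result ne '';  # diff -q is silent when files are identical
    } elsif ($files1{$file}) {                 # only in dir1
        print "<<< $file\n";
    } else {                                   # only in dir2
        print ">>> $file\n";
    }
}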

How can I compare file list from a tar archive and directory?

I am still learning Perl. Can anyone please suggest Perl code to compare the files in a .tar.gz archive with those under a directory path?
Let's say I have a tar.gz backup of the following directory tree, which I made a few days ago.
a/file1
a/file2
a/file3
a/b/file4
a/b/file5
a/c/file5
a/b/d/file and so on..
Now I want to compare files and directories under this path with the tar.gz backup file.
Please suggest Perl code to do that.
See Archive::Tar.
The Archive::Tar and File::Find modules will be helpful. A basic example is shown below. It just prints information about the files in a tar and the files in a directory tree.
It was not clear from your question how you want to compare the files. If you need to compare the actual content, the get_content() method in Archive::Tar::File will likely be needed. If a simpler comparison is adequate (for example, name, size, and mtime), you won't need much more than methods used in the example below.
#!/usr/bin/perl
use strict;
use warnings;

# A utility function to display our results.
sub Print_file_info {
    print map("$_\n", @_), "\n";
}

# Print some basic information about files in a tar.
use Archive::Tar qw();
my $tar_file = 'some_tar_file.tar.gz';
my $tar = Archive::Tar->new($tar_file);
for my $ft ( $tar->get_files ){
    # The variable $ft is an Archive::Tar::File object.
    Print_file_info(
        $ft->name,
        $ft->is_file ? 'file' : 'other',
        $ft->size,
        $ft->mtime,
    );
}

# Print some basic information about files in a directory tree.
use File::Find;
my $dir_name = 'some_directory';
my @files;
find(sub {push @files, $File::Find::name}, $dir_name);
Print_file_info(
    $_,
    -f $_ ? 'file' : 'other',
    -s,
    (stat)[9],
) for @files;
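If a name/size comparison is enough, a minimal sketch of the cross-check could look like this. It assumes the archive members are stored as paths relative to the directory you pass in, which you may need to adjust.

use strict;
use warnings;
use Archive::Tar;

my ($tar_file, $dir) = @ARGV;              # e.g. a backup .tar.gz and the directory to compare against
my $tar = Archive::Tar->new($tar_file);

for my $ft ( $tar->get_files ) {
    next unless $ft->is_file;
    my $on_disk = "$dir/" . $ft->name;     # assumes relative member paths
    if ( !-e $on_disk ) {
        print "missing on disk: ", $ft->name, "\n";
    }
    elsif ( (-s $on_disk) != $ft->size ) {
        print "size differs:    ", $ft->name, "\n";
    }
}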
Perl is kind of overkill for this, really. A shell script would do fine. The steps you need to take though:
Extract the tar to a temporary folder somewhere.
diff -uR the two folders and redirect the output somewhere (or perhaps pipe to less as appropriate)
Clean up the temporary folder.
And you're done. Shouldn't be more than 5-6 lines. Something quick and untested:
#!/bin/sh
mkdir "$TEMP/$$"
tar -xz -f ../backups/backup.tgz -C "$TEMP/$$"
diff -uR "$TEMP/$$" ./ | less
rm -rf "$TEMP/$$"
Here's an example that checks whether every file in an archive also exists in a folder.
# $1 is the file to test
# $2 is the base folder
for file in $( tar --list -f $1 | perl -pe'chomp;$_=qq["'$2'$_" ]' )
do
# work around bash deficiency
if [[ -e "$( perl -eprint$file )" ]]
then
echo " $file"
else
echo "no $file"
fi
done
This is how I tested this:
I removed / renamed config, then ran the following:
bash test Downloads/update-dnsomatic-0.1.2.tar.gz Downloads/
Which gave the output of:
"Downloads/update-dnsomatic-0.1.2/"
no "Downloads/update-dnsomatic-0.1.2/config"
"Downloads/update-dnsomatic-0.1.2/update-dnsomatic"
"Downloads/update-dnsomatic-0.1.2/README"
"Downloads/update-dnsomatic-0.1.2/install.sh"
I am new to bash / shell programming, so there is probably a better way to do this.
This might be a good starting point for a good Perl program. It does what the question asked for though.
It was just hacked together, and ignores most of the best practices for Perl.
perl test.pl full \
Downloads/update-dnsomatic-0.1.2.tar.gz \
Downloads/ \
update-dnsomatic-0.1.2
#! /usr/bin/env perl
use strict;
use 5.010;
use warnings;
use autodie;
use Archive::Tar;
use File::Spec::Functions qw'catfile catdir';

my($action,$file,$directory,$special_dir) = @ARGV;

if( @ARGV == 1 ){
  $file = *STDOUT{IO};
}
if( @ARGV == 3 ){
  $special_dir = '';
}

sub has_file(_);
sub same_size($$);
sub find_missing(\%$);

given( lc $action ){

  # only compare names
  when( @{[qw'simple name names']} ){
    my @list = Archive::Tar->list_archive($file);
    say qq'missing file: "$_"' for grep{ ! has_file } @list;
  }

  # compare names, sizes, contents
  when( @{[qw'full aggressive']} ){
    my $next = Archive::Tar->iter($file);

    my( %visited );
    while( my $file = $next->() ){
      next unless $file->is_file;
      my $name = $file->name;
      $visited{$name} = 1;

      unless( has_file($name) ){
        say qq'missing file: "$name"';
        next;
      }

      unless( same_size( $name, $file->size ) ){
        say qq'different size: "$name"';
        next;
      }

      next unless $file->size;

      unless( same_checksum( $name, $file->get_content ) ){
        say qq'different checksums: "$name"';
        next;
      }
    }

    say qq'file not in archive: "$_"' for find_missing %visited, $special_dir;
  }
}

sub has_file(_){
  my($file) = @_;
  if( -e catfile $directory, $file ){
    return 1;
  }
  return;
}

sub same_size($$){
  my($file,$size) = @_;
  if( -s catfile($directory,$file) == $size ){
    return $size || '0 but true';
  }
  return; # empty list/undefined
}

sub same_checksum{
  my($file,$contents) = @_;
  require Digest::SHA1;
  my($outside,$inside);
  my $sha1 = Digest::SHA1->new;
  {
    open my $io, '<', catfile $directory, $file;
    $sha1->addfile($io);
    close $io;
    $outside = $sha1->digest;
  }
  $sha1->add($contents);
  $inside = $sha1->digest;
  return 1 if $inside eq $outside;
  return;
}

sub find_missing(\%$){
  my($found,$current_dir) = @_;

  my(@dirs,@files);
  {
    my $open_dir = catdir($directory,$current_dir);
    opendir my($h), $open_dir;

    while( my $elem = readdir $h ){
      next if $elem =~ /^[.]{1,2}[\\\/]?$/;

      my $path = catfile $current_dir, $elem;
      my $open_path = catfile $open_dir, $elem;

      given($open_path){
        when( -d ){
          push @dirs, $path;
        }
        when( -f ){
          push @files, $path unless $found->{$path};
        }
        default{
          die qq'not a file or a directory: "$path"';
        }
      }
    }
  }

  for my $path ( @dirs ){
    push @files, find_missing %$found, $path;
  }

  return @files;
}
After renaming config to config.rm, adding an extra character to README, changing a character in install.sh, and adding a file .test, this is what it output:
missing file: "update-dnsomatic-0.1.2/config"
different size: "update-dnsomatic-0.1.2/README"
different checksums: "update-dnsomatic-0.1.2/install.sh"
file not in archive: "update-dnsomatic-0.1.2/config.rm"
file not in archive: "update-dnsomatic-0.1.2/.test"

Removing files with duplicate content from single directory [Perl, or algorithm]

I have a folder with a large number of files, some of which have exactly the same contents. I want to remove the files with duplicate contents, meaning that if two or more files with duplicate content are found, I'd like to keep one of them and delete the others.
The following is what I came up with, but I don't know if it works :) as I haven't tried it yet.
How would you do it? Perl or general algorithm.
use strict;
use warnings;

my @files = <"./files/*.txt">;

my $current = 0;
while( $current <= $#files ) {

    # read contents of $files[$current] into $contents1 scalar

    my $compareTo = $current + 1;
    while( $compareTo <= $#files ) {

        # read contents of $files[$compareTo] into $contents2 scalar

        if( $contents1 eq $contents2 ) {
            splice(@files, $compareTo, 1);
            # delete $files[$compareTo] here
        }
        else {
            $compareTo++;
        }
    }
    $current++;
}
Here's a general algorithm (edited for efficiency now that I've shaken off the sleepies -- and I also fixed a bug that no one reported)... :)
It's going to take forever (not to mention a lot of memory) if I compare every single file's contents against every other. Instead, why don't we apply the same search to their sizes first, and then compare checksums for those files of identical size.
So then, when we calculate their sizes (rather than md5summing every file right away; see Digest::MD5), we can use a hash table to do our matching for us, storing the matches together in arrayrefs:
use strict;
use warnings;
use Digest::MD5 qw(md5_hex);

my %files_by_size;
foreach my $file (@ARGV)
{
    push @{$files_by_size{-s $file}}, $file;  # store filename in the bucket for this file size (in bytes)
}
Now we just have to pull out the potential duplicates and check if they are the same (by creating a checksum for each, using Digest::MD5), using the same hashing technique:
while (my ($size, $files) = each %files_by_size)
{
    next if @$files == 1;

    my %files_by_md5;
    foreach my $file (@$files)
    {
        open my $filehandle, '<', $file or die "Can't open $file: $!";
        # enable slurp mode
        local $/;
        my $data = <$filehandle>;
        close $filehandle;

        my $md5 = md5_hex($data);
        push @{$files_by_md5{$md5}}, $file;  # store filename in the bucket for this MD5
    }

    while (my ($md5, $files) = each %files_by_md5)
    {
        next if @$files == 1;
        print "These files are equal: " . join(", ", @$files) . "\n";
    }
}
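If you want the script to actually delete the duplicates instead of just reporting them, the final loop could be varied along these lines (a sketch only; it keeps the first file in each MD5 bucket and is meant to slot into the code above):

while (my ($md5, $files) = each %files_by_md5)
{
    next if @$files == 1;
    my ($keep, @remove) = @$files;     # keep the first file, remove the rest
    print "Keeping $keep, removing: @remove\n";
    # unlink returns the number of files it removed
    unlink @remove or warn "Could not remove duplicates of $keep: $!";
}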
-fini
Perl, with Digest::MD5 module.
use Digest::MD5;

%seen = ();
while( <*> ){
    -d and next;
    $filename = "$_";
    print "doing .. $filename\n";
    $md5 = getmd5($filename) . "\n";
    if ( ! defined( $seen{$md5} ) ){
        $seen{$md5} = "$filename";
    }else{
        print "Duplicate: $filename and $seen{$md5}\n";
    }
}

sub getmd5 {
    my $file = shift;   # use the filename passed in rather than relying on $_
    open(FH, "<", $file) or die "Cannot open file: $!\n";
    binmode(FH);
    my $md5 = Digest::MD5->new;
    $md5->addfile(FH);
    close(FH);
    return $md5->hexdigest;
}
If Perl is not a must and you are working on *nix, you can use shell tools
find /path -type f -print0 | xargs -0 md5sum | \
awk '($1 in seen){ print "duplicate: "$2" and "seen[$1] } \
( ! ($1 in seen ) ) { seen[$1]=$2 }'
md5sum *.txt | perl -ne '
    chomp;
    ($sum, $file) = split(" ");
    push @{$files{$sum}}, $file;
    END {
        foreach (keys %files) {
            shift @{$files{$_}};
            unlink @{$files{$_}} if @{$files{$_}};
        }
    }
'
Perl is kinda overkill for this:
md5sum * | sort | uniq -w 32 -D | cut -b 35- | tr '\n' '\0' | xargs -0 rm
(If you are missing some of these utilities or they don't have these flags/functions,
install GNU findutils and coreutils.)
Variations on a theme:
md5sum *.txt | perl -lne '
my ($sum, $file) = split " ", $_, 2;
unlink $file if $seen{$sum} ++;
'
No need to go and keep a list, just to remove one from the list and delete the rest; simply keep track of what you've seen before, and remove any file matching a sum that's already been seen. The 2-limit split is to do the right thing with filenames containing spaces.
Also, if you don't trust this, just change the word unlink to print and it will output a list of files to be removed. You can even tee that output to a file, and then rm $(cat to-delete.txt) in the end if it looks good.
A bash script is more expressive than Perl in this case:
md5sum * |sort -k1|uniq -w32 -d|cut -f2 -d' '|xargs rm
I'd recommend that you do it in Perl, and use File::Find while you're at it.
Who knows what you're doing to generate your list of files, but you might want to combine it with your duplicate checking.
perl -MFile::Find -MDigest::MD5 -e '
    my %m;
    find(sub{
        if(-f && -r){
            open(F, "<", $File::Find::name);
            binmode F;
            $d = Digest::MD5->new->addfile(F);
            if(exists($m{$d->hexdigest})){
                $m{$d->hexdigest}[5]++;
                push @{$m{$d->hexdigest}[0]}, $File::Find::name;
            }else{
                $m{$d->hexdigest} = [[$File::Find::name],0,0,0,0,1];
            }
            close F
        }
    }, ".");
    foreach $d (keys %m) {
        if ($m{$d}[5] > 1) {
            print "Probable duplicates: ".join(" , ", @{$m{$d}[0]})."\n\n";
        }
    }'
Here is a way of filtering by size first and by md5 checksum second:
#!/usr/bin/perl

use strict; use warnings;

use Digest::MD5 qw( md5_hex );
use File::Slurp;
use File::Spec::Functions qw( catfile rel2abs );
use Getopt::Std;

my %opts;

getopt('de', \%opts);
$opts{d} = '.' unless defined $opts{d};
$opts{d} = rel2abs $opts{d};

warn sprintf "Checking %s\n", $opts{d};

my $files = get_same_size_files( \%opts );

$files = get_same_md5_files( $files );

for my $size ( keys %$files ) {
    for my $digest ( keys %{ $files->{$size} } ) {
        print "$digest ($size)\n";
        print "$_\n" for @{ $files->{$size}->{$digest} };
        print "\n";
    }
}

sub get_same_md5_files {
    my ($files) = @_;

    my %out;

    for my $size ( keys %$files ) {
        my %md5;
        for my $file ( @{ $files->{$size} } ) {
            my $contents = read_file $file, {binmode => ':raw'};
            push @{ $md5{ md5_hex($contents) } }, $file;
        }
        for my $k ( keys %md5 ) {
            delete $md5{$k} unless @{ $md5{$k} } > 1;
        }
        $out{$size} = \%md5 if keys %md5;
    }
    return \%out;
}

sub get_same_size_files {
    my ($opts) = @_;

    my $checker = defined($opts->{e})
                ? sub { scalar ($_[0] =~ /\.$opts->{e}\z/) }
                : sub { 1 };

    my %sizes;
    my @files = grep { $checker->($_) } read_dir $opts->{d};

    for my $file ( @files ) {
        my $path = catfile $opts->{d}, $file;
        next unless -f $path;

        my $size = (stat $path)[7];
        push @{ $sizes{$size} }, $path;
    }

    for my $k (keys %sizes) {
        delete $sizes{$k} unless @{ $sizes{$k} } > 1;
    }

    return \%sizes;
}
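A hypothetical invocation (assuming the script is saved as dup_report.pl; with Getopt::Std's getopt, both -d and -e take a value, so -e txt restricts the scan to .txt files):
perl dup_report.pl -d /path/to/folder -e txt
With no options it defaults to the current directory and all file names.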
You might want to have a look at how I find duplicate files and remove them, though you will have to modify it to your needs.
http://priyank.co.in/remove-duplicate-files