Perl recursion through filesystem (without File::Find) follows only one path from the top

The general problem: iterate through a Windows folder structure, without the use of File::Find, and rename all folders to some short value. This is intended to fix the classic "Windows file paths are longer than 260 characters" problem.
My problem: I've got everything working, except that it correctly processes a single path through the tree but not any others, and I can't see why.
Note: use File::Find is still present, though not used, and the code is likely inelegant. It renames the current folder, then iterates through subfolders.
Code:
use strict;
use File::Find;
use File::Copy;
use File::Path;

my $target = "E:\\bl10s\\";
opendir( DIR, $target );
my $newFolderName = 0;
my $file;
while ( $file = readdir(DIR) ) {
    # A file test to check that it is a directory
    # Use -f to test for a file
    next if ( $file eq "." );
    next if ( $file eq ".." );
    next if ( -f "$target\\$file" );
    print "$target/$file" . "\n";
    while ( -e $target . $newFolderName ) {
        $newFolderName++;
    }
    print $target . $file . " rename to " . $target . $newFolderName . "\n";
    rename( $target . $file, $target . $newFolderName );
}
closedir(DIR);
opendir( DIR, $target );
while ( $file = readdir(DIR) ) {
    next if ( $file eq "." );
    next if ( $file eq ".." );
    if ( -f "$target\\$file" ) {
        print "Failed name check on iterator main line\n";
    }
    my $nextDirectoryPathCalled = $target . $file;
    print "Re-iterating on: " . $nextDirectoryPathCalled;
    my $someint = &stripper($nextDirectoryPathCalled);
}
closedir(DIR);
# find( \&dir_names, $target );

sub stripper {
    print "\nNew level\n";
    print "$_[0] . \n";
    my $target = $_[0] . "\\";
    opendir( DIR, $target );
    my $newFolderName = 0;
    my $file;
    while ( $file = readdir(DIR) ) {
        # A file test to check that it is a directory
        # Use -f to test for a file
        next if ( $file eq "." );
        next if ( $file eq ".." );
        next if ( -f "$target\\$file" );
        print "TARGET: $target FILE: $file" . "\n";
        while ( -e $target . $newFolderName ) {
            $newFolderName++;
        }
        print $target . $file . " rename to " . $target . $newFolderName . "\n";
        rename( $target . $file, $target . $newFolderName );
    }
    closedir(DIR);
    opendir( DIR, $target );
    while ( $file = readdir(DIR) ) {
        next if ( $file eq "." );
        next if ( $file eq ".." );
        next if ( -f "$target\\$file" );
        my $nextDirectoryPathCalled = $target . $file;
        print "Re-iterating on: " . $nextDirectoryPathCalled;
        &stripper($nextDirectoryPathCalled);
    }
    closedir(DIR);
}
Any ideas? Feel free to be critical...
Thanks

When doing recursion, it is important that each level has its own set of variables and does not change those of upper levels. Take, for example, this implementation of the factorial function, defined as:
fac(n) = 1 · 2 · 3 · … · n
In naïve Perl code, we could write
sub fac {
    $n = shift;
    return 1 if $n < 2;
    return $n * fac($n-1);
}
Output for fac(10) is 1. This is clearly wrong; we expected 1*2*3*4*5*6*7*8*9*10 = 3628800.
The problem is that each recursion level assigns to the same $n variable. We have the following call stack:
fac(10)
  $n = 10
  fac(9) * $n
    $n = 9
    fac(8) * $n * $n
      $n = 8
      ...
        fac(1) * $n * $n * $n * $n * $n * $n * $n * $n * $n * $n
          $n = 1
          return 1
So all of that evaluates to 1. Duh.
We can give each level a different $n variable by declaring it with my:
sub fac {
    my $n = shift;
    return 1 if $n < 2;
    return $n * fac($n-1);
}
Now, everything works as expected.
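For example:
print fac(10), "\n";    # prints 3628800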
In Perl, bareword filehandles are global variables. When you do
opendir DIR, $somepath;
then you assign a handle to that global bareword. While recursing, you constantly reassign to it. When your function returns from a lower level, the handle does not get the previous value back.
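A tiny self-contained demonstration of the clobbering (the paths are only illustrative):
opendir DIR, '/tmp' or die $!;
my $first = readdir DIR;        # DIR walks /tmp
opendir DIR, '/etc' or die $!;  # reopening DIR discards the old handle
# from here on, readdir DIR walks /etc; the rest of the /tmp listing is lost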
Instead, you should use lexical variables for your handles as well:
opendir my $dir, $somepath;
Then each invocation of your subroutine has its own, non-global dirhandle, and everything should work as expected.
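Applied to your stripper, the skeleton of the recursion with a lexical handle would look something like this (a minimal sketch; the renaming logic is left out):
sub stripper {
    my ($target) = @_;
    opendir my $dh, $target or die "Cannot open $target: $!";
    my @entries = grep { !/^\.\.?$/ } readdir $dh;   # skip . and ..
    closedir $dh;
    for my $entry (@entries) {
        my $path = "$target\\$entry";
        stripper($path) if -d $path;   # each level keeps its own $dh
    }
}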
Using File::Find handles all of that complexity behind the scenes and just makes the correct element directly available to your callback. It is robust and widely used, so you should really consider using the find function from that module instead.
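For illustration, a hedged sketch of the renaming with finddepth (which visits children before their parents, so renaming a directory cannot disturb entries still waiting to be visited; the target path is your own):
use strict;
use warnings;
use File::Find;

finddepth(sub {
    return unless -d $_ && $_ ne '.';   # act on directories only, skip the top
    my $n = 0;
    $n++ while -e $n;                   # smallest free numeric name in this directory
    rename $_, $n or warn "Cannot rename $File::Find::name: $!";
}, 'E:\\bl10s');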

Related

How to modify "for" loop in existing program

I want to modify this code
#!/usr/bin/perl -w
# Call of CPAN
use warnings;
use strict;
use Cwd;

# Variables
my $i = 0;
my $directory = getcwd;
my $file = "options";

# Opening output file and adding the header on first row
open( FILE, ">>OLTP.txt" ) or die( "Could not create file OLTP.txt" );
print FILE "User script,Serveur Name,Instance Name,Date of script,Serveur Name2,Instance Name2,ADVANCED_COMPRESSION~HEADER,TABLE_COMPRESSION~HEADER,2,count,DATA_DICTIONARY_VIEW,TABLE_OWNER,TABLE_NAME,PARTITION_NAME,COMPRESSION,COMPRESS_FOR,\n";
close FILE;

# Loop while files are found
foreach my $files ( list_files( $directory, 1, $file ) ) {
    print "File : $files\n";
    singlefile( $files, $file );
}

# Recursion and list integration
sub list_files {
    my ( $directory, $recurse, $file ) = @_;
    require File::Spec;
    # Search in subdirectories or not
    if ( ( not defined $recurse ) || ( $recurse != 1 ) ) {
        $recurse = 0;
    }
    # Verify the directory
    if ( not defined $directory ) {
        die "No named directory\n";
    }
    # Open the directory
    opendir my $fh_rep, $directory or die "Can not open directory $directory\n";
    # List files and directories, except (. and ..)
    my @fic_rep = grep { !/^\.\.?$/ } readdir $fh_rep;
    # Close the directory
    closedir $fh_rep or die "Unable to close directory $directory\n";
    # Fill list with found files
    my @files;
    # File or folder? If file: add it to the list. If folder: start the recursion
    foreach my $nom ( @fic_rep ) {
        my $mycurrentfile = File::Spec->catdir( $directory, $nom );
        if ( -f $mycurrentfile
            and $mycurrentfile =~ m/$file/
            and $mycurrentfile =~ m/\.csv$/i ) {
            push( @files, $mycurrentfile );
        }
        elsif ( -d $mycurrentfile and $recurse == 1 ) {
            push( @files, list_files( $mycurrentfile, $recurse, $file ) ); # recursion
        }
    }
    return @files;
}

## Merge data after filtering
sub singlefile {
    my ( $file, $out ) = @_;
    # Open file
    open( FILE, $file ) or die( "error occurred while opening file" );
    # Create list from file
    my @save = <FILE>;
    close( FILE );
    # Empty the rows which do not meet the criteria
    foreach ( @save ) {
        $_ = "" unless ( $_ =~ m/"ENABLED","OLTP",/ && $_ =~ m/^GREP/ );
        $_ = "" if ( $_ =~ m/SYSMAN/ || m/SYS/ );
        chomp $_;
    }
    # Open output file, add data, close
    open( FILE, ">>OLTP.txt" ) or die( "error occurred while opening file OLTP.txt" );
    foreach ( @save ) {
        print FILE $_ . "\n" if ( $_ );
    }
    close( FILE );
}
It seems to do the following:
1. Create a txt file with a header.
2. Create a list of all files that match the criteria (option, csv).
3. For each file in the list, load all of its rows, then blank out the rows that do not match the unless/if criteria.
4. Push everything into the file with the header (OLTP.txt).
My goal:
1. Create a txt file with a header.
2. Create a list of all files that match the criteria (option, csv).
3. For each file in the list, keep only the first row that matches the unless/if criteria.
4. Push everything into the file with the header (OLTP.txt). The final result should be the header followed by at most one line per file, for the files where the criteria match.
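A minimal sketch of singlefile() rewritten to keep at most one matching row per input file (same criteria as the original; the lexical filehandles are a stylistic choice):
sub singlefile {
    my ( $file, $out ) = @_;
    open my $in, '<', $file or die "error occurred while opening file $file";
    while ( my $line = <$in> ) {
        # same filters as before: keep only rows matching the criteria
        next unless $line =~ m/"ENABLED","OLTP",/ && $line =~ m/^GREP/;
        next if $line =~ m/SYSMAN/ || $line =~ m/SYS/;
        # write the first match, then stop reading this file
        open my $out_fh, '>>', 'OLTP.txt' or die "error occurred while opening OLTP.txt";
        print $out_fh $line;
        close $out_fh;
        last;
    }
    close $in;
}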

Fetch and upload files and prefixed files to s3 using perl

I am trying to fetch files from S3 using the Amazon::S3 module in Perl. I can successfully download files that are not prefixed, but I am unable to fetch prefixed files like test/abc.txt.
I am using the code below.
sub export_bucket {
    my ($conn, $bucket, $directory) = @_;
    $bucket = $conn->bucket($bucket);
    my $response = $bucket->list();
    print $response->{bucket} . "\n";
    for my $key (@{ $response->{keys} }) {
        print "\t" . $key->{key} . "\n";
        _export_file($conn, $bucket, $key->{key}, $directory . '/' . $key->{key});
    }
}

sub _export_file {
    my ($conn, $bucket, $name, $path) = @_;
    print "Downloading $name file", "\n";
    my $test = $bucket->get_key_filename($name, 'GET', $path);
    print Dumper($test);
    my $acl = $bucket->get_acl($name);
    print Dumper($acl);
    open my $acl_file, '>', $path . '.acl';
    print $acl_file $acl;
    close $acl_file;
}
Please suggest what changes I should make so that when a prefixed key (a folder) comes along, I can download the folder as well.
Thanks
You need to modify your code to create the target directory on your local filesystem if it does not already exist. It should look something like this:
use File::Path qw[make_path];
sub export_bucket {
    my ( $conn, $bucket, $directory ) = @_;
    $bucket = $conn->bucket($bucket);
    my $response = $bucket->list();
    print $response->{bucket} . "\n";
    for my $key ( @{ $response->{keys} } ) {
        print "\t" . $key->{key} . "\n";
        _export_file( $conn, $bucket, $key->{key}, $directory . '/' . $key->{key} );
    }
}

sub _export_file {
    my ( $conn, $bucket, $name, $path ) = @_;
    print "Downloading $name file", "\n";
    ## get path directory part
    my ($dir_part) = $path =~ /(.+)\/[^\/]+$/;
    ## make sure the directory exists before downloading into it
    unless ( -d $dir_part ) {
        make_path($dir_part);
    }
    my $test = $bucket->get_key_filename( $name, 'GET', $path );
    print Dumper($test);
    my $acl = $bucket->get_acl($name);
    print Dumper($acl);
    open my $acl_file, '>', $path . '.acl';
    print $acl_file $acl;
    close $acl_file;
}
Perhaps $directory already has a trailing "/" in the case that is failing (or $key->{key} has a leading slash)? Try debug-printing $directory and $path in the code to see exactly what your string arguments are.
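For example (a quick diagnostic, assuming the same variables as above):
my $path = $directory . '/' . $key->{key};
print "DEBUG: directory=[$directory] key=[$key->{key}] path=[$path]\n";
$path =~ s{/{2,}}{/}g;   # collapse any doubled slashes before using the path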

How to rename directories recursively

I want to rename directories recursively using File::Find::Rule, e.g. remove extra spaces in each directory found, but as I understand it the module doesn't do finddepth and renames only one. Is there a way to do that? Thanks.
use autodie;
use strict;
use warnings;
use File::Find::Rule;

my $dir = 'D:/Test';
my @fd = File::Find::Rule->directory
                         ->in( $dir );
for my $fd ( @fd ) {
    my $new = $fd;
    $new =~ s/\s\s+/ /g;
    print "$new\n";
    rename $fd, $new;
}
You want to process the deeper results first, so process the list in reverse. You can only rename the leaf part of the path; you'll get to the shallower parts later.
use Path::Class qw( dir );

for ( reverse @fd ) {
    my $dir = dir($_);
    my $parent = $dir->parent;
    my $old_leaf = my $new_leaf = $dir->dir_list(-1);
    $new_leaf =~ s/\s+/ /g;
    if ($new_leaf ne $old_leaf) {
        my $old_file = $parent->dir($old_leaf);
        my $new_file = $parent->dir($new_leaf);
        # Prevent accidental deletion of files.
        if (-e $new_file) {
            warn("$new_file already exists\n");
            next;
        }
        rename($old_file, $new_file);
    }
}
Answer to original question:
I don't see how FFR comes into play.
rename 'Test1/Test2/Test3', 'Test1/Test2/Dir3';
rename 'Test1/Test2', 'Test1/Dir2';
rename 'Test1', 'Dir1';
For arbitrary paths,
use Path::Class qw( dir );

my @parts1 = dir('Test1/Test2/Test3')->dir_list();
my @parts2 = dir('Dir1/Dir2/Dir3'   )->dir_list();
die if @parts1 != @parts2;

for (reverse 0..$#parts1) {
    my $path1 = dir(@parts1[ 0..$_ ]);
    my $path2 = dir(@parts2[ 0..$_ ]);
    rename($path1, $path2);
}
Or maybe you want to rename all Test1 to Dir1, Test2 to Dir2, and Test3 to Dir3; again, process the list in reverse order.
my %map = (
    'Test1' => 'Dir1',
    'Test2' => 'Dir2',
    'Test3' => 'Dir3',
);

my $pat = join '|', map quotemeta, keys %map;

for ( reverse @fd ) {
    my $o = $_;
    my $n = $_;
    $n =~ s{/\K($pat)\z}{$map{$1}};
    if ($n ne $o) {
        if (-e $n) {
            warn("$n already exists\n");
            next;
        }
        rename($o, $n);
    }
}
I have a module for doing actions recursively in a directory tree. It didn't have the ability to act on the directories themselves, though, so it took a little updating. I have uploaded version 0.03 of my File::chdir::WalkDir; it can be installed from its GitHub repo and is now also available using your favorite CPAN utility. This script will then remove extra spaces from directory names inside the base directory 'Test', relative to the working directory:
#!/usr/bin/env perl
use strict;
use warnings;
use File::chdir::WalkDir 0.030;
use File::Copy;

my $job = sub {
    my ($name, $in_dir) = @_;
    # ONLY act on directories
    return 0 unless (-d $name);
    my $new_name = $name;
    if ($new_name =~ s/\s\s+/ /g) {   # collapse runs of whitespace; true only if something changed
        move($name, $new_name);
    }
};

walkdir( 'Test', $job, {'act_on_directories' => 1} );

How to create the next file or folder in a series of progressively numbered files?

Sorry for the bad title but this is the best I could do! :D
I have a script which creates a new project every time the specified function is called.
Each project must be stored in its own folder, with the name of the project. But, if you don't specify a name, the script will just name it "new projectX", where X is a progressive number.
With time the user could rename the folders or delete some, so every time the script runs, it checks for the smallest number available (not used by another folder) and creates the relevant folder.
Now I managed to make a program which I think works as wanted, but I would like to hear from you if it's OK or there's something wrong which I'm unable to spot, given my inexperience with the language.
while ( defined( $file = readdir $projects_dir ) )
{
    # check for files whose name starts with "new project"
    if ( $file =~ m/^new project/i )
    {
        push( @files, $file );
    }
}

# remove letters from filenames, only the number is left
foreach $file ( @files )
{
    $file =~ s/[a-z]//ig;
}

@files = sort { $a <=> $b } @files;

# find the smallest number available
my $smallest_number = 0;
foreach $file ( @files )
{
    if ( $smallest_number != $file )
    {
        last;
    }
    $smallest_number += 1;
}
print "Smallest number is $smallest_number";
Here's a basic approach for this sort of problem:
sub next_available_dir {
    my $n = 1;
    my $d;
    $n++ while -e ($d = "new project$n");
    return $d;
}

my $project_dir = next_available_dir();
mkdir $project_dir;
If you're willing to use a naming pattern that plays nicely with Perl's string auto-increment feature, you can simplify the code further, eliminating the need for $n. For example, newproject000.
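For example, the magic string increment carries through the trailing digits and preserves the zero-padding:
my $name = 'newproject008';
$name++;    # 'newproject009'
$name++;    # 'newproject010' (the width is preserved)
print "$name\n";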
I think I would use something like:
use strict;
use warnings;

sub new_project_dir
{
    my($base) = @_;
    opendir(my $dh, $base) || die "Failed to open directory $base for reading";
    my $file;
    my @numbers;
    while ($file = readdir $dh)
    {
        $numbers[$1] = 1 if ($file =~ m/^new project(\d+)$/);
    }
    closedir($dh) || die "Failed to close directory $base";
    my $i;
    my $max = $#numbers;
    for ($i = 0; $i < $max; $i++)
    {
        next if (defined $numbers[$i]);
        # Directory did not exist when we scanned the directory
        # But maybe it was created since then!
        my $dir = "new project$i";
        next unless mkdir "$base/$dir";
        return $dir;
    }
    # All numbers from 0..$max were in use...so try adding new numbers...
    while ($i < $max + 100)
    {
        my $dir = "new project$i";
        $i++;
        next unless mkdir "$base/$dir";
        return $dir;
    }
    # Still failed - give in...
    die "Something is amiss - all directories 0..$i in use?";
}
Test code:
my $basedir = "base";
mkdir $basedir unless -d $basedir;

for (my $j = 0; $j < 10; $j++)
{
    my $dir = new_project_dir($basedir);
    print "Create: $dir\n";
    if ($j % 3 == 2)
    {
        my $k = int($j / 2);
        my $o = "new project$k";
        rmdir "$basedir/$o";
        print "Remove: $o\n";
    }
}
Try this:
#!/usr/bin/env perl
use strict;
use warnings;

# get the current list of files
# see `perldoc -f glob` for details.
my @files = glob( 'some/dir/new\\ project*' );

# set to first name, in case there are no others
my $next_file = 'new project1';

# check for others
if( @files ){
    # a Schwartzian transform
    @files = map  { $_->[0] }               # get original
             sort { $a->[1] <=> $b->[1] }   # sort by second field, which holds the numbers
             map  { [ $_, do{ ( my $n = $_ ) =~ s/\D//g; $n } ] }  # pair each name with its digits-only form
             @files;
    # last file name is the biggest
    $next_file = $files[-1];
    # add one to it
    $next_file =~ s/(.*?)(\d+)$/$1.($2+1)/e;
}
print "next file: $next_file\n";
Nothing wrong per se, but that's an awful lot of code to achieve a single objective (getting the minimum free index among the directories).
A core module, a couple of subs and a few Schwartzian transforms will make the code more flexible:
use strict;
use warnings;
use List::Util 'min';

sub num { my ($name) = @_; $name =~ s|\D+||g; return $name } # 'new project4' -> '4', 'new1_project4' -> '14' (!)

sub min_index {
    my ( $dir, $filter ) = @_;
    $filter = qr/./ unless defined $filter;  # match all if no filter specified
    opendir my $dirHandle, $dir or die $!;
    my @dirs = grep { -d "$dir/$_" }         # ... all directories ...
               grep { /$filter/ }            # ... that match the filter ...
               readdir $dirHandle;           # ... from the directory contents (readdir is exhausted after this)
    closedir $dirHandle;
    my $lowest_index = min                   # get the smallest ...
                       map { num($_) }       # ... numerical value ...
                       @dirs;                # ... among the matching directories
    $lowest_index++ while grep { $lowest_index == num($_) } @dirs;
    return $lowest_index;
}

# Ready to use!
my $index = min_index( 'some/dir', qr/^new project/ );
my $new_project_name = "new project $index";

Removing files with duplicate content from single directory [Perl, or algorithm]

I have a folder with a large number of files, some of which have exactly the same contents. I want to remove files with duplicate contents: if two or more files with the same content are found, I'd like to keep one of them and delete the others.
The following is what I came up with, but I don't know if it works :) as I haven't tried it yet.
How would you do it? Perl or general algorithm.
use strict;
use warnings;

my @files = <"./files/*.txt">;
my $current = 0;
while ( $current <= $#files ) {
    # read contents of $files[$current] into $contents1 scalar
    my $compareTo = $current + 1;
    while ( $compareTo <= $#files ) {
        # read contents of $files[$compareTo] into $contents2 scalar
        if ( $contents1 eq $contents2 ) {
            splice(@files, $compareTo, 1);
            # delete $files[$compareTo] here
        }
        else {
            $compareTo++;
        }
    }
    $current++;
}
Here's a general algorithm (edited for efficiency now that I've shaken off the sleepies -- and I also fixed a bug that no one reported)... :)
It's going to take forever (not to mention a lot of memory) if I compare every single file's contents against every other. Instead, why don't we apply the same search to their sizes first, and then compare checksums for those files of identical size.
So instead of md5summing every file up front (see Digest::MD5), we first calculate their sizes, and we can use a hash table to do our matching for us, storing the matches together in arrayrefs:
use strict;
use warnings;
use Digest::MD5 qw(md5_hex);

my %files_by_size;
foreach my $file (@ARGV)
{
    push @{$files_by_size{-s $file}}, $file; # store filename in the bucket for this file size (in bytes)
}
Now we just have to pull out the potential duplicates and check if they are the same (by creating a checksum for each, using Digest::MD5), using the same hashing technique:
while (my ($size, $files) = each %files_by_size)
{
    next if @$files == 1;
    my %files_by_md5;
    foreach my $file (@$files)
    {
        open my $filehandle, '<', $file or die "Can't open $file: $!";
        # enable slurp mode
        local $/;
        my $data = <$filehandle>;
        close $filehandle;
        my $md5 = md5_hex($data);
        push @{$files_by_md5{$md5}}, $file; # store filename in the bucket for this MD5
    }
    while (my ($md5, $files) = each %files_by_md5)
    {
        next if @$files == 1;
        print "These files are equal: " . join(", ", @$files) . "\n";
    }
}
-fini
Perl, with Digest::MD5 module.
use Digest::MD5;

%seen = ();
while( <*> ){
    -d and next;
    $filename = "$_";
    print "doing .. $filename\n";
    $md5 = getmd5($filename) . "\n";
    if ( ! defined( $seen{$md5} ) ){
        $seen{$md5} = "$filename";
    }else{
        print "Duplicate: $filename and $seen{$md5}\n";
    }
}

sub getmd5 {
    my $file = "$_";
    open(FH, "<", $file) or die "Cannot open file: $!\n";
    binmode(FH);
    my $md5 = Digest::MD5->new;
    $md5->addfile(FH);
    close(FH);
    return $md5->hexdigest;
}
If Perl is not a must and you are working on *nix, you can use shell tools:
find /path -type f -print0 | xargs -0 md5sum | \
    awk '($1 in seen){ print "duplicate: "$2" and "seen[$1] } \
         ( ! ($1 in seen ) ) { seen[$1]=$2 }'
md5sum *.txt | perl -ne '
    chomp;
    ($sum, $file) = split(" ");
    push @{$files{$sum}}, $file;
    END {
        foreach (keys %files) {
            shift @{$files{$_}};
            unlink @{$files{$_}} if @{$files{$_}};
        }
    }
'
Perl is kinda overkill for this:
md5sum * | sort | uniq -w 32 -D | cut -b 35- | tr '\n' '\0' | xargs -0 rm
(If you are missing some of these utilities or they don't have these flags/functions,
install GNU findutils and coreutils.)
Variations on a theme:
md5sum *.txt | perl -lne '
    my ($sum, $file) = split " ", $_, 2;
    unlink $file if $seen{$sum}++;
'
No need to go and keep a list, just to remove one from the list and delete the rest; simply keep track of what you've seen before, and remove any file matching a sum that's already been seen. The 2-limit split is to do the right thing with filenames containing spaces.
Also, if you don't trust this, just change the word unlink to print and it will output a list of files to be removed. You can even tee that output to a file, and then rm $(cat to-delete.txt) in the end if it looks good.
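For example, a dry run along those lines:
md5sum *.txt | perl -lne '
    my ($sum, $file) = split " ", $_, 2;
    print $file if $seen{$sum}++;
' | tee to-delete.txt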
A bash one-liner is more expressive than Perl in this case:
md5sum * |sort -k1|uniq -w32 -d|cut -f2 -d' '|xargs rm
I'd recommend that you do it in Perl, and use File::Find while you're at it.
Who knows what you're doing to generate your list of files, but you might want to combine it with your duplicate checking.
perl -MFile::Find -MDigest::MD5 -e '
    my %m;
    find(sub{
        if(-f && -r){
            open(F, "<", $File::Find::name);
            binmode F;
            $h = Digest::MD5->new->addfile(F)->hexdigest;  # take the digest once; hexdigest resets the object
            if(exists($m{$h})){
                $m{$h}[5]++;
                push @{$m{$h}[0]}, $File::Find::name;
            }else{
                $m{$h} = [[$File::Find::name],0,0,0,0,1];
            }
            close F;
        }
    }, ".");
    foreach $d (keys %m) {
        if ($m{$d}[5] > 1) {
            print "Probable duplicates: " . join(" , ", @{$m{$d}[0]}) . "\n\n";
        }
    }'
Here is a way of filtering by size first and by md5 checksum second:
#!/usr/bin/perl
use strict;
use warnings;
use Digest::MD5 qw( md5_hex );
use File::Slurp;
use File::Spec::Functions qw( catfile rel2abs );
use Getopt::Std;

my %opts;
getopt('de', \%opts);
$opts{d} = '.' unless defined $opts{d};
$opts{d} = rel2abs $opts{d};

warn sprintf "Checking %s\n", $opts{d};

my $files = get_same_size_files( \%opts );
$files = get_same_md5_files( $files );

for my $size ( keys %$files ) {
    for my $digest ( keys %{ $files->{$size} } ) {
        print "$digest ($size)\n";
        print "$_\n" for @{ $files->{$size}->{$digest} };
        print "\n";
    }
}

sub get_same_md5_files {
    my ($files) = @_;
    my %out;
    for my $size ( keys %$files ) {
        my %md5;
        for my $file ( @{ $files->{$size} } ) {
            my $contents = read_file $file, {binmode => ':raw'};
            push @{ $md5{ md5_hex($contents) } }, $file;
        }
        for my $k ( keys %md5 ) {
            delete $md5{$k} unless @{ $md5{$k} } > 1;
        }
        $out{$size} = \%md5 if keys %md5;
    }
    return \%out;
}

sub get_same_size_files {
    my ($opts) = @_;
    my $checker = defined($opts->{e})
                ? sub { scalar ($_[0] =~ /\.$opts->{e}\z/) }
                : sub { 1 };
    my %sizes;
    my @files = grep { $checker->($_) } read_dir $opts->{d};
    for my $file ( @files ) {
        my $path = catfile $opts->{d}, $file;
        next unless -f $path;
        my $size = (stat $path)[7];
        push @{ $sizes{$size} }, $path;
    }
    for my $k (keys %sizes) {
        delete $sizes{$k} unless @{ $sizes{$k} } > 1;
    }
    return \%sizes;
}
You might want to have a look at how I find duplicate files and remove them, though you will have to modify it to your needs.
http://priyank.co.in/remove-duplicate-files