Recursively delete files older than 2 years (with a specific extension like .zip, .log, etc.) - python-3.7

I'm new to Python and want to write a script to recursively delete files in a directory that are older than 2 years and have a specific extension like .zip, .txt, etc.

I know this isn't GitHub, but I spent quite some time trying to figure it out, and I have to admit the answer isn't that obvious, but I found it eventually. I have no idea why I spent half an hour on this random program, but I did.
It's lucky I'm using Python 3.7 as well, because I didn't see your tag at the bottom of the post. The image is a demo of me running what is titled The Program.
Features
- Deletes files from the directory and all of its subdirectories
- Lets you change the extension to whatever you want, e.g. txt, bat, png, jpg
- Lets you change the folder you want cleaned out, e.g. from your C drive to Pictures
The Program
import glob, os, sys, datetime

os.chdir("C:\\Users\\")  # ------> PLEASE CHANGE THIS TO PREVENT YOUR C DRIVE GETTING DESTROYED, THIS IS JUST AN EXAMPLE
src = os.getcwd()  # scans src, which is set to the current working directory
cn = 0             # number of files found
filedate = '2019'  # year to look for in each file's creation date
clrd = 0           # total bytes that would be cleared

def random_function_name():
    print("No files match the given criteria!")
    return

def find(path, *exts):
    dirs = [a[0] for a in os.walk(path)]
    f_filter = [d + e for d in dirs for e in exts]
    return [f for files in [glob.iglob(files) for files in f_filter] for f in files]

print(src)
my_files = find(src, '\*py', '\*txt')  # you can also add patterns like '\*txt', '\*jpg', etc.

for f in my_files:
    cn += 1
    created = datetime.datetime.fromtimestamp(os.path.getctime(f)).strftime('%Y/%m/%d|%H:%M:%S')
    if filedate in created:
        print(' | CREATED:', created, '|', 'Folder:', '[', os.path.basename(os.path.dirname(f)), ']',
              'File:', os.path.split(os.path.abspath(f))[1], ' Bytes:', os.stat(f).st_size)
        clrd += os.stat(f).st_size

        def delete():
            if cn != 0:
                x = str(input("Delete {} file(s)? >>> ".format(cn)))
                if x.lower() == 'yes':
                    os.remove(f)
                    print("You have cleared {} bytes of data".format(clrd))
                    sys.exit()
                if x.lower() == 'no':
                    print('Aborting...')
                    sys.exit()
                if x.lower() not in ('yes', 'no'):
                    if x != '':
                        print("type yes or no")
                    delete()
            if cn == 0:
                print("No files to delete.")
                sys.exit()

        delete()
    if filedate not in created:
        sys.setrecursionlimit(2500)
        random_function_name()
On its own
This is for applying it to your own code
import glob, os, datetime

os.chdir('C:\\Users')
src = os.getcwd()
filedate = '2019'  # year to look for in each file's creation date

def find(path, *exts):
    dirs = [a[0] for a in os.walk(path)]
    f_filter = [d + e for d in dirs for e in exts]
    return [f for files in [glob.iglob(files) for files in f_filter] for f in files]

my_files = find(src, '\*py', '\*txt')  # to add extensions do \*extension

for f in my_files:
    if filedate in datetime.datetime.fromtimestamp(os.path.getctime(f)).strftime('%Y/%m/%d|%H:%M:%S'):
        os.remove(f)
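If you want to match what the question literally asks for (files with a given extension that are older than two years), a more direct approach is to compare each file's modification time against a cutoff instead of matching a year string. A minimal sketch, untested; the root path and extensions are placeholders, and two years is approximated as 730 days:
import os
import time

ROOT = "C:\\path\\to\\clean"                   # placeholder - change before running
EXTENSIONS = ('.zip', '.log')                  # extensions you want removed
CUTOFF = time.time() - 2 * 365 * 24 * 60 * 60  # roughly two years ago

for dirpath, dirnames, filenames in os.walk(ROOT):
    for name in filenames:
        if name.lower().endswith(EXTENSIONS):
            full_path = os.path.join(dirpath, name)
            if os.path.getmtime(full_path) < CUTOFF:  # last modified more than ~2 years ago
                print('Deleting', full_path)
                os.remove(full_path)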

Related

How to process the data from a table.txt file from a series of folders and save the output in the same folder using Matlab?

Could you please help me read the data from a table.txt file in a series of subfolders of a directory? In all the subfolders, the file to read has the same name, 'table.txt'. I want to process the data and save the output in the same folder.
I can process it using the following code.
a = readmatrix('table.txt');
a4 = a(:,4);
a4 = a4 - mean(a4);
N = 2^(nextpow2(length(a4)));
freq = (abs(fftshift(fft(a4,N))));
t=[0:1e-12:20e-9].';
ts=t(2)-t(1);
F = ((-N/2:N/2-1)/N)*(1/ts);
fmr=[(F(N/2+1:end)/1e9)' freq(N/2+1:end)];
writematrix(fmr, 'fmr.csv');
cd folder
But how to perform the same action on all the subfolders?
Could somebody please help me out?
You can use the "find files in subfolders" behaviour of dir. Something like this:
allTables = dir('**/table.txt');
for ii = 1:numel(allTables)
    thisFolder = allTables(ii).folder;
    inFile = fullfile(thisFolder, allTables(ii).name);
    a = readmatrix(inFile);
    % do stuff ...
    fmr = ...
    outFile = fullfile(thisFolder, 'fmr.csv');
    writematrix(fmr, outFile);
end

Is there a Pythonic way to add a string, made from the file name without its extension, to the start of each file in a directory?

I have written code to prepend a string, built from the file name, to the file with that name, but instead of adding just one line it adds a line for every file in the folder, and the lines get added after the existing data in all the files. All I want is to add the string to the start of each file.
With my code, I am getting all three lines printed in all the files, at the end of each file, i.e.
previous data...
parent a A B C D
parent b A B C D
parent c A B C D
This is my code
import os
import glob

os.chdir("C://Users//folder_naming_test_python//")
files = os.listdir()
#print("files=" )
#print(files)

d = []
for k in os.listdir():
    d.append(k.split('.')[0])
#print("names=")
#print(d)

prefix = 'parent '
postfix = ' A B C D'
Headers = list(map(lambda orig_string: prefix + orig_string + postfix, d))
#print("Headers = ")
#print(Headers)

array_len = len(Headers)
for file in files:
    for i in range(array_len):
        f = open(file, 'a+')
        a = f.read()
        f.seek(0)
        f.write(Headers[i] + '\n')
        f.close()
        f = open(file, 'r')
        print(f.read())
My input data example would be 3 files in a folder with the names
a.txt, b.txt, c.txt
What I expect is that, irrespective of the data in the files, either
parent a A B C D or
parent b A B C D or
parent c A B C D
followed by the data in the file
is printed on the first line of each file, respectively. (Note: the a, b and c strings have to go into their individual files, not all together in all the files.)
It can be done very easily using f-strings:
import os
from pathlib import Path

for filename in os.listdir():
    with open(filename, "r+") as f:
        content = f.read()
        f.seek(0, 0)
        f.write(f"parent {Path(filename).stem} A B C D\n")
        f.write(content)

Storage values using os.stat(filename)

I'm trying to create an EDUCATIONAL PURPOSES ONLY virus. I do not plan on spreading it. Its purpose is to grow a file to the point where your storage is full and your computer slows down. It prints the size of the file every 0.001 seconds. With that, I also want to know how fast it is growing the file. The following code doesn't seem to let it run:
class Vstatus():
    def _init_(Status):
        Status.countspeed == True
        Status.active == True
        Status.growingspeed == 0

import time
import os

#Your storage is at risk of over-expansion. Please do not let this file run forever, as your storage will fill continuously.
#This is for educational purposes only.

while Vstatus.Status.countspeed == True:
    f = open('file.txt', 'a')
    f.write('W')
    fsize = os.stat('file.txt')
    Key1 = fsize
    time.sleep(1)
    Key2 = fsize
    Vstatus.Status.growingspeed = (Key2 - Key1)
    Vstatus.Status.countspeed = False

while Vstatus.Status.active == True:
    time.sleep(0.001)
    f = open('file.txt', 'a')
    f.write('W')
    fsize = os.stat('file.txt')
    print('size:' + fsize.st_size.__str__() + ' at a speed of ' + Vstatus.Status.growingspeed + 'bytes per second.')
This is for Educational Purposes ONLY
The main error I keep getting when running the file is here:
TypeError: unsupported operand type(s) for -: 'os.stat_result' and 'os.stat_result'
What does this mean? I thought os.stat returned an integer. Can I get a fix on this?
Vstatus.Status.growingspeed = (Key2 - Key1)
You can't subtract os.stat_result objects. Your code also has some other problems: the two loops run sequentially, so the first loop tries to estimate how quickly the file is growing before the main writing loop has even started, and since Key1 and Key2 come from the same stat call it never measures any growth anyway.
import time  # Imports at the top
import os

class VStatus:
    def __init__(self):  # double underscores around __init__
        self.countspeed = True  # Assignment, not equality test
        self.active = True
        self.growingspeed = 0

status = VStatus()  # Make a VStatus instance

# You need to do the speed estimation and file appending in the same loop
with open('file.txt', 'a+') as f:  # Only open the file once
    start = time.time()  # Get the current time
    starting_size = os.fstat(f.fileno()).st_size
    while status.active:  # Access the attribute of the VStatus instance
        size = os.fstat(f.fileno()).st_size  # Send file descriptor to stat
        f.write('W')  # Writing more than one character at a time will be your biggest speed up
        f.flush()  # make sure the byte is written
        if status.countspeed:
            diff = time.time() - start
            if diff >= 1:  # More than a second has gone by
                status.countspeed = False
                status.growingspeed = (os.fstat(f.fileno()).st_size - starting_size) / diff  # get rate of growth
        else:
            print(f"size: {size} at a speed of {status.growingspeed}")

Selecting random files from a folder tree

I have this folder organization
root/folder_1/file1_1 --up to-- file_5693
root/folder_2/file2_1 --up to-- file_100
root/folder_3/file3_1 --up to-- file_600
root/folder_4/file4_1 --up to-- file_689
I'd like to select a number (1000, for example) of random files from each folder and put them all together in an output folder, but for folders with fewer than 200 files I'd like to copy all of the files.
root_2/output:
file1_350
.
.
.
file2_1 --> file2_100
.
.
.
etc
How can I do this?
I tried to list all the folder names in the directory with the dir command, but the folder numbers are not sequential. Any help?
I might misunderstand, but I do not see a reason for ordering the folder names, since you will copy them anyway.
The following script copies files from the folders that sit inside the root directory.
You just need to change the four variables ROOT_DIR, OUT_DIR, THRESHOLD_COPY and N_RANDOM_COPY.
% Define
ROOT_DIR = './';       % where the subdirectories are located
OUT_DIR = './root2';   % copy destination
THRESHOLD_COPY = 200;  % threshold for copying all files
N_RANDOM_COPY = 100;   % number of files that you want to copy

dirList = dir(ROOT_DIR);
dirList = dirList(3:end); % first two are ./ and ../
dirOnlyIndicators = cell2mat({dirList.isdir});
dirs = dirList(dirOnlyIndicators);

for dirIterator = transpose(dirs)
    subdirList = dir([ROOT_DIR dirIterator.name]);
    fileIndicators = ~cell2mat({subdirList.isdir});
    subfileList = subdirList(fileIndicators);
    nFiles = sum(fileIndicators);

    copyIndices = [];
    if nFiles > THRESHOLD_COPY
        copyIndices = randperm(nFiles);
        copyIndices = copyIndices(1:N_RANDOM_COPY);
    else
        copyIndices = 1:nFiles;
    end

    for copyIndex = copyIndices
        copyfile([ROOT_DIR dirIterator.name '/' subfileList(copyIndex).name], ...
                 [OUT_DIR '/' subfileList(copyIndex).name], ...
                 'f');
    end
end
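If Python is also an option for this task, the same idea fits in a short script. This is only a rough sketch under the same assumptions (ROOT_DIR, OUT_DIR and the two numbers are placeholders to adjust, and name collisions between folders are not handled):
import os
import random
import shutil

ROOT_DIR = 'root'                            # where the subdirectories are located
OUT_DIR = os.path.join('root_2', 'output')   # copy destination
THRESHOLD_COPY = 200                         # folders with fewer files than this are copied in full
N_RANDOM_COPY = 1000                         # number of random files to take from larger folders

os.makedirs(OUT_DIR, exist_ok=True)

for entry in os.scandir(ROOT_DIR):
    if not entry.is_dir():
        continue
    files = [f.path for f in os.scandir(entry.path) if f.is_file()]
    if len(files) < THRESHOLD_COPY:
        chosen = files                                                 # small folder: take everything
    else:
        chosen = random.sample(files, min(N_RANDOM_COPY, len(files)))  # large folder: random subset
    for src in chosen:
        shutil.copy2(src, OUT_DIR)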

Is it possible to copy all files from one S3 bucket to another with s3cmd?

I'm pretty happy with s3cmd, but there is one issue: How to copy all files from one S3 bucket to another? Is it even possible?
EDIT: I've found a way to copy files between buckets using Python with boto:
import time
from boto.s3.connection import S3Connection

def copyBucket(srcBucketName, dstBucketName, maxKeys = 100):
    conn = S3Connection(awsAccessKey, awsSecretKey)
    srcBucket = conn.get_bucket(srcBucketName)
    dstBucket = conn.get_bucket(dstBucketName)

    resultMarker = ''
    while True:
        keys = srcBucket.get_all_keys(max_keys = maxKeys, marker = resultMarker)
        for k in keys:
            print 'Copying ' + k.key + ' from ' + srcBucketName + ' to ' + dstBucketName
            t0 = time.clock()
            dstBucket.copy_key(k.key, srcBucketName, k.key)
            print time.clock() - t0, ' seconds'
        if len(keys) < maxKeys:
            print 'Done'
            break
        resultMarker = keys[maxKeys - 1].key
Syncing is almost as straightforward as copying. There are fields for ETag, size, and last-modified available for keys.
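For example, the loop above could skip keys that already look identical on the destination. A rough, untested sketch (credentials are passed in explicitly here, and the ETag comparison is only meaningful for simple, non-multipart uploads):
from boto.s3.connection import S3Connection

def syncBucket(awsAccessKey, awsSecretKey, srcBucketName, dstBucketName, maxKeys = 100):
    conn = S3Connection(awsAccessKey, awsSecretKey)
    srcBucket = conn.get_bucket(srcBucketName)
    dstBucket = conn.get_bucket(dstBucketName)

    resultMarker = ''
    while True:
        keys = srcBucket.get_all_keys(max_keys = maxKeys, marker = resultMarker)
        for k in keys:
            existing = dstBucket.get_key(k.key)
            # Copy only if the key is missing or differs in size/ETag
            if existing is None or existing.size != k.size or existing.etag != k.etag:
                dstBucket.copy_key(k.key, srcBucketName, k.key)
        if len(keys) < maxKeys:
            break
        resultMarker = keys[maxKeys - 1].key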
Maybe this helps others as well.
s3cmd sync s3://from/this/bucket/ s3://to/this/bucket/
For available options, please use:
$s3cmd --help
AWS CLI seems to do the job perfectly, and has the bonus of being an officially supported tool.
aws s3 sync s3://mybucket s3://backup-mybucket
http://docs.aws.amazon.com/cli/latest/reference/s3/sync.html
The answer with the most upvotes as I write this is this one:
s3cmd sync s3://from/this/bucket s3://to/this/bucket
It's a useful answer. But sometimes sync is not what you need (it deletes files, etc.). It took me a long time to figure out this non-scripting alternative to simply copy multiple files between buckets. (OK, in the case shown below it's not between buckets. It's between not-really-folders, but it works between buckets equally well.)
# Slightly verbose, slightly unintuitive, very useful:
s3cmd cp --recursive --exclude=* --include=file_prefix* s3://semarchy-inc/source1/ s3://semarchy-inc/target/
Explanation of the above command:
--recursive: In my mind, my requirement is not recursive. I simply want multiple files. But recursive in this context just tells s3cmd cp to handle multiple files. Great.
--exclude: It's an odd way to think of the problem. Begin by recursively selecting all files. Next, exclude all files. Wait, what?
--include: Now we're talking. Indicate the file prefix (or suffix or whatever pattern) that you want to include.
s3://sourceBucket/ s3://targetBucket/: This part is intuitive enough. Though technically it seems to violate the documented example from s3cmd help, which indicates that a source object must be specified:
s3cmd cp s3://BUCKET1/OBJECT1 s3://BUCKET2[/OBJECT2]
You can also use the web interface to do so:
Go to the source bucket in the web interface.
Mark the files you want to copy (use shift and mouse clicks to mark several).
Press Actions->Copy.
Go to the destination bucket.
Press Actions->Paste.
That's it.
I needed to copy a very large bucket, so I adapted the code in the question into a multi-threaded version and put it up on GitHub.
https://github.com/paultuckey/s3-bucket-to-bucket-copy-py
It's actually possible. This worked for me:
import boto

AWS_ACCESS_KEY = 'Your access key'
AWS_SECRET_KEY = 'Your secret key'

conn = boto.s3.connection.S3Connection(AWS_ACCESS_KEY, AWS_SECRET_KEY)
bucket = boto.s3.bucket.Bucket(conn, SRC_BUCKET_NAME)

for item in bucket:
    # Note: here you can also put a path inside the DEST_BUCKET_NAME,
    # if you want your item to be stored inside a folder, like this:
    # bucket.copy(DEST_BUCKET_NAME, '%s/%s' % (folder_name, item.key))
    bucket.copy(DEST_BUCKET_NAME, item.key)
Thanks - I use a slightly modified version, where I only copy files that don't exist or are a different size, and check on the destination if the key exists in the source. I found this a bit quicker for readying the test environment:
def botoSyncPath(path):
    """
    Sync keys in specified path from source bucket to target bucket.
    """
    try:
        conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        srcBucket = conn.get_bucket(AWS_SRC_BUCKET)
        destBucket = conn.get_bucket(AWS_DEST_BUCKET)

        for key in srcBucket.list(path):
            destKey = destBucket.get_key(key.name)
            if not destKey or destKey.size != key.size:
                key.copy(AWS_DEST_BUCKET, key.name)

        for key in destBucket.list(path):
            srcKey = srcBucket.get_key(key.name)
            if not srcKey:
                key.delete()
    except:
        return False
    return True
I wrote a script that backs up an S3 bucket: https://github.com/roseperrone/aws-backup-rake-task
#!/usr/bin/env python
from boto.s3.connection import S3Connection
import re
import datetime
import sys
import time

def main():
    s3_ID = sys.argv[1]
    s3_key = sys.argv[2]
    src_bucket_name = sys.argv[3]
    num_backup_buckets = sys.argv[4]
    connection = S3Connection(s3_ID, s3_key)
    delete_oldest_backup_buckets(connection, num_backup_buckets)
    backup(connection, src_bucket_name)

def delete_oldest_backup_buckets(connection, num_backup_buckets):
    """Deletes the oldest backup buckets such that only the newest NUM_BACKUP_BUCKETS - 1 buckets remain."""
    buckets = connection.get_all_buckets()  # returns a list of bucket objects
    num_buckets = len(buckets)

    backup_bucket_names = []
    for bucket in buckets:
        if (re.search('backup-' + r'\d{4}-\d{2}-\d{2}', bucket.name)):
            backup_bucket_names.append(bucket.name)

    backup_bucket_names.sort(key=lambda x: datetime.datetime.strptime(x[len('backup-'):17], '%Y-%m-%d').date())

    # The buckets are sorted latest to earliest, so we want to keep the last NUM_BACKUP_BUCKETS - 1
    delete = len(backup_bucket_names) - (int(num_backup_buckets) - 1)
    if delete <= 0:
        return

    for i in range(0, delete):
        print 'Deleting the backup bucket, ' + backup_bucket_names[i]
        connection.delete_bucket(backup_bucket_names[i])

def backup(connection, src_bucket_name):
    now = datetime.datetime.now()
    # the month and day must be zero-filled
    new_backup_bucket_name = 'backup-' + str('%02d' % now.year) + '-' + str('%02d' % now.month) + '-' + str('%02d' % now.day)
    print "Creating new bucket " + new_backup_bucket_name
    new_backup_bucket = connection.create_bucket(new_backup_bucket_name)
    copy_bucket(src_bucket_name, new_backup_bucket_name, connection)

def copy_bucket(src_bucket_name, dst_bucket_name, connection, maximum_keys = 100):
    src_bucket = connection.get_bucket(src_bucket_name)
    dst_bucket = connection.get_bucket(dst_bucket_name)

    result_marker = ''
    while True:
        keys = src_bucket.get_all_keys(max_keys = maximum_keys, marker = result_marker)
        for k in keys:
            print 'Copying ' + k.key + ' from ' + src_bucket_name + ' to ' + dst_bucket_name
            t0 = time.clock()
            dst_bucket.copy_key(k.key, src_bucket_name, k.key)
            print time.clock() - t0, ' seconds'
        if len(keys) < maximum_keys:
            print 'Done backing up.'
            break
        result_marker = keys[maximum_keys - 1].key

if __name__ == '__main__':
    main()
I use this in a rake task (for a Rails app):
desc "Back up a file onto S3"
task :backup do
S3ID = "*****"
S3KEY = "*****"
SRCBUCKET = "primary-mzgd"
NUM_BACKUP_BUCKETS = 2
Dir.chdir("#{Rails.root}/lib/tasks")
system "./do_backup.py #{S3ID} #{S3KEY} #{SRCBUCKET} #{NUM_BACKUP_BUCKETS}"
end
mdahlman's code didn't work for me, but this command copies all the files in bucket1 to a new folder (the command also creates this new folder) in bucket2.
cp --recursive --include=file_prefix* s3://bucket1/ s3://bucket2/new_folder_name/
s3cmd won't cp with only prefixes or wildcards, but you can script the behavior with 's3cmd ls sourceBucket' and awk to extract the object name. Then use 's3cmd cp sourceBucket/name destBucket' to copy each object name in the list.
I use these batch files in a DOS box on Windows:
s3list.bat
s3cmd ls %1 | gawk "/s3/{ print \"\\"\"\"substr($0,index($0,\"s3://\"))\"\\"\"\"; }"
s3copy.bat
#for /F "delims=" %%s in ('s3list %1') do #s3cmd cp %%s %2
You can also use s3funnel which uses multi-threading:
https://github.com/neelakanta/s3funnel
example (without the access key or secret key parameters shown):
s3funnel source-bucket-name list | s3funnel dest-bucket-name copy --source-bucket source-bucket-name --threads=10