Looping through all folders in FTP directory - python

I found some code online, and modified it to list all folders in an FTP directory. I have all the folders listed, with the code below.
import ftplib
from ftplib import FTP
ftp = FTP()
import datetime
filenames = []
data = []
ftp = ftplib.FTP('ftp.something.com', 'u_name', 'pswd')
def get_dirs_ftp(folder=""):
contents = ftp.nlst(folder)
folders = []
for item in contents:
if "." not in item:
folders.append(item)
return folders
def get_all_dirs_ftp(folder=""):
dirs = []
new_dirs = []
new_dirs = get_dirs_ftp(folder)
while len(new_dirs) > 0:
for dir in new_dirs:
dirs.append(dir)
old_dirs = new_dirs[:]
new_dirs = []
for dir in old_dirs:
for new_dir in get_dirs_ftp(dir):
new_dirs.append(new_dir)
dirs.sort()
return dirs
allfiles = []
all_dirs = get_all_dirs_ftp()
Using the code above, I confirmed that the hierarchy is correct. Now, I am trying to loop through this list of folders and subfolders, and drill down to the files in each. This is where the problem occurs. Here's the rest of my code.
for dir in all_dirs:
ftp.cwd(dir)
ftp.retrlines('LIST')
filenames = []
ftp.retrlines('NLST', filenames.append)
# writes file name and modified date and file size, to dataframe
#data = []
for filename in filenames:
filename
modifiedTimeFtp = datetime.datetime.strptime(datetime[4:], "%Y%m%d%H%M%S").strftime("%d %b %Y %H:%M:%S")
size = ftp.size(filename)
filesize = "{:.2f}".format(size/(1024)) + 'kb'
finaldata = (str(filename) + '|' + str(modifiedTimeFtp) + '|' + str(filesize))
allfiles.append(finaldata,'\n')
Now, when I run this section of code, I get this error: TypeError: 'module' object is not subscriptable
I'm thinking that the problem lies in this range.
ftp.cwd(dir)
ftp.retrlines('LIST')
filenames = []
ftp.retrlines('NLST', filenames.append)
That's my guess, but I don't know for sure. Is there an easy way to get this working? I almost feel like this is mission impossible, because the FTP folder that I'm querying is pretty massive, and I'm guessing there can be all kinds of timeouts, or whatever, while the task is running. All I'm trying to do is get the file name, file date/time modified, and file size. Thanks for the look.

Here is the final, working, version.
import ftplib
from ftplib import FTP
ftp = FTP()
from datetime import datetime
filenames = []
data = []
ftp = ftplib.FTP('ftp.anything.com', 'u_name', 'ps_wd')
def get_dirs_ftp(folder=""):
contents = ftp.nlst(folder)
folders = []
for item in contents:
if "." not in item:
folders.append(item)
return folders
def get_all_dirs_ftp(folder=""):
dirs = []
new_dirs = []
new_dirs = get_dirs_ftp(folder)
while len(new_dirs) > 0:
for dir in new_dirs:
dirs.append(dir)
old_dirs = new_dirs[:]
new_dirs = []
for dir in old_dirs:
for new_dir in get_dirs_ftp(dir):
new_dirs.append(new_dir)
dirs.sort()
return dirs
#allfiles = []
# get parent and child folders in directory
all_dirs = get_all_dirs_ftp()
# create a list to append metadata
dir_list = []
for dir in all_dirs:
ftp.cwd('/'+dir+'/')
print(dir)
dir_list.append(dir)
ftp.dir(dir_list.append)
len(dir_list)
# you probably want to dump the results to a file...
outF = open('C:/your_path/filenames.csv', 'w')
for line in dir_list:
# write line to output file
outF.write(line)
outF.write("\n")
outF.close()
print('Done!!')

You should take a look at the stacktrace for the actual line that is causing the error.
From your code the fault appears to be this line:
modifiedTimeFtp = datetime.datetime.strptime(datetime[4:], "%Y%m%d%H%M%S").strftime("%d %b %Y %H:%M:%S")
datetime here appears to be the module, but you probably want to refer to some variable with the date, so datetime[4:] is causing the error.

Related

Creating a multidimensional list of similarly named files with different extensions

I have a directory of files that follows this file naming pattern:
alice_01.mov
alice_01.mp4
alice_02.mp4
bob_01.avi
My goal is to find all files at a given path and create a "multidimensional" list of them where each sublist is the unique name of the file (without extension) and then a list of extensions, like so:
resulting_list = [
['alice_01', ['mov','mp4']],
['alice_02', ['mp4']],
['bob_01', ['avi']]
]
I have gotten this far:
import os
path = "user_files/"
def user_files(path):
files = []
for file in os.listdir(path):
files.append(file)
return files
file_array = []
for file in user_files(path):
file_name = file.split(".")[0]
file_ext = file.split(".")[1]
if file_name not in (sublist[0] for sublist in file_array):
file_array.append([file_name,[file_ext]])
else:
file_array[file_array.index(file_name)].append([file_name,[file_ext]])
print(file_array)
My problem is in the else condition but I'm struggling to get it right.
Any help is appreciated.
Here's how you can do it using a dict to store the results:
filenames = [
"alice_01.mov",
"alice_01.mp4",
"alice_02.mp4",
"bob_01.avi",
]
file_dict = {}
for file in filenames:
file_name, file_ext = file.split(".")[0:2]
file_dict.setdefault(file_name, []).append(file_ext)
print(file_dict)
Result:
{'alice_01': ['mov', 'mp4'], 'alice_02': ['mp4'], 'bob_01': ['avi']}
UPDATE: The code above doesn't handle special cases, so here's a slightly more robust version.
from pprint import pprint
filenames = [
"alice_01.mov",
"alice_01.mp4",
"alice_02.mp4",
"bob_01.avi",
"john_007.json.xz",
"john_007.json.txt.xz",
"john_007.json.txt.zip",
"tom_and_jerry",
"tom_and_jerry.dat",
]
file_dict = {}
for file in filenames:
parts = file.split(".")
if len(parts) > 1:
file_name = ".".join(parts[0:-1])
file_ext = parts[-1]
else:
file_name = parts[0]
file_ext = ""
file_dict.setdefault(file_name, []).append(file_ext)
pprint(file_dict)
Result:
{'alice_01': ['mov', 'mp4'],
'alice_02': ['mp4'],
'bob_01': ['avi'],
'john_007.json': ['xz'],
'john_007.json.txt': ['xz', 'zip'],
'tom_and_jerry': ['', 'dat']}

How to copy only non-duplicate files whilst maintaining folder structure?

I am trying to find duplicates between two folders and copy only unique image files to the 'dest' folder. I can copy all the non-dupes using the code below, however it doesn't maintain the source directory structure. I think OS.walk returns 3 tuples, but they aren't linked so not sure how to re-construct the sub dir?
Example:
import shutil, os
from difPy import dif
source = input('Input source folder:')
dest = input('Input backup \ destination folder:')
ext = ('.jpg','.jpeg','.gif','.JPG','.JPEG','.GIF')
search = dif(source, dest)
result = search.result
result
dupes = []
srcfiles = []
filecount = []
failed = []
removed = []
for i in result.values():
dupes.append(i['location'])
for dirpath, subdirs, files in os.walk(source):
for x in files:
if x.endswith(ext):
srcfiles.append(os.path.join(dirpath, x))
for f in srcfiles:
if f not in dupes:
shutil.copy(f, dest)
print('File copied successfully - '+f)
filecount.append(f)
else:
print('File not copied successfully !!!! - '+f)
failed.append(f)
I have also tried using the shutil.copytree function with an ignore list, however it requires a new folder and can't get the ignore list function to work
shutil.copytree example:
for i in result.values():
df = []
df.append(i['filename'])
def ignorelist(source, df):
return [f for f in df if os.path.isfile(os.path.join(source, f))]
shutil.copytree(source, destnew, ignore=ignorelist)
This function ignorelist should do the trick:
import shutil, os
from difPy import dif
source = input('Input source folder:')
dest = input('Input backup \ destination folder:')
ext = ('.jpg','.jpeg','.gif')
search = dif(source, dest)
dupes = [value['location'] for value in search.result.values()]
def ignorelist(source, files):
return [file for file in files
if (os.path.isfile(os.path.join(source, file))
and (os.path.join(source, file) in dupes
or not file.lower().endswith(ext)))]
shutil.copytree(source, dest, ignore=ignorelist)
And the other "more manual" way would be
import shutil, os
from difPy import dif
source = input('Input source folder:').rstrip('/\\')
dest = input('Input backup \ destination folder:').rstrip('/\\')
ext = ('.jpg','.jpeg','.gif')
search = dif(source, dest)
dupes = [value['location'] for value in search.result.values()]
srcfiles = []
copied = []
failed = []
skipped = []
for dirpath, subdirs, files in os.walk(source):
for file in files:
if file.lower().endswith(ext):
srcfile = os.path.join(dirpath,file)
srcfiles.append(srcfile)
if srcfile in dupes:
print('File not copied (duplicate) - '+srcfile)
skipped.append(srcfile)
else:
try:
destfile = os.path.join(dest,srcfile[len(source)+1:])
os.makedirs(os.path.dirname(destfile), exist_ok=True)
shutil.copy(srcfile,destfile)
print('File copied successfully - '+srcfile)
copied.append(srcfile)
except Exception as err:
print('File not copied (error %s) - %s' % (str(err),srcfile))
failed.append(f)
I have changed some variable names to make them more descriptive. And what you call failed is really just a list of files that are not copied because they are duplicates rather than files whose copying was attempted but failed.
import shutil, os
from difPy import dif
source = input('Input source folder: ')
dest = input('Input backup \ destination folder: ')
# Remove trailing path separators if they exist:
if source.endswith(('/', '\\')):
source = source[:-1]
if dest.endswith(('/', '\\')):
dest = dest[:-1]
# Use the correct path separator to
# ensure correct matching with dif results:
if os.sep == '/':
source = source.replace('\\', os.sep)
elif os.sep == '\\':
source = source.replace('/', os.sep)
source_directory_length = len(source) + 1
ext = ('.jpg','.jpeg','.gif','.JPG','.JPEG','.GIF')
search = dif(source, dest)
result = search.result
# Set comprehension:
dupes = {duplicate['location'] for duplicate in result.values()}
copied = []
not_copied = []
for dirpath, subdirs, files in os.walk(source):
for file in files:
if file.endswith(ext):
source_path = os.path.join(dirpath, file)
if source_path not in dupes:
# get subdirectory of source directory that this file is in:
file_length = len(file) + 1
# Get subdirectory relative to the source directory:
subdirectory = source_path[source_directory_length:-file_length]
if subdirectory:
dest_directory = os.path.join(dest, subdirectory)
# ensure directory exists:
os.makedirs(dest_directory, exist_ok=True)
else:
dest_directory = dest
dest_path = os.path.join(dest_directory, file)
shutil.copy(source_path, dest_path)
print('File copied successfully -', source_path)
copied.append(source_path)
else:
print('File not copied -', source_path)
not_copied.append(source_path)

Create folders based on filenames

I have a folder with some 1500 excel files . The format of each file is something like this:
0d20170101abcd.xlsx
1d20170101ef.xlsx
0d20170104g.xlsx
0d20170109hijkl.xlsx
1d20170109mno.xlsx
0d20170110pqr.xlsx
The first character of the file name is either '0' or '1' followed by 'd' followed by the date when the file was created followed by customer id(abcd,ef,g,hijkl,mno,pqr).The customer id has no fixed length and it can vary.
I want to create folders for each unique date(folder name should be date) and move the files with the same date into a single folder .
So for the above example , 4 folders (20170101,20170104,20170109,20170110) has to be created with files with same dates copied into their respective folders.
I want to know if there is any way to do this in python ? Sorry for not posting any sample code because I have no idea as to how to start.
Try this out:
import os
import re
root_path = 'test'
def main():
# Keep track of directories already created
created_dirs = []
# Go through all stuff in the directory
file_names = os.listdir(root_path)
for file_name in file_names:
process_file(file_name, created_dirs)
def process_file(file_name, created_dirs):
file_path = os.path.join(root_path, file_name)
# Check if it's not itself a directory - safe guard
if os.path.isfile(file_path):
file_date, user_id, file_ext = get_file_info(file_name)
# Check we could parse the infos of the file
if file_date is not None \
and user_id is not None \
and file_ext is not None:
# Make sure we haven't already created the directory
if file_date not in created_dirs:
create_dir(file_date)
created_dirs.append(file_date)
# Move the file and rename it
os.rename(
file_path,
os.path.join(root_path, file_date, '{}.{}'.format(user_id, file_ext)))
print file_date, user_id
def create_dir(dir_name):
dir_path = os.path.join(root_path, dir_name)
if not os.path.exists(dir_path) or not os.path.isdir(dir_path):
os.mkdir(dir_path)
def get_file_info(file_name):
match = re.search(r'[01]d(\d{8})([\w+-]+)\.(\w+)', file_name)
if match:
return match.group(1), match.group(2), match.group(3)
return None, None, None
if __name__ == '__main__':
main()
Note that depending on the names of your files, you might want to change (in the future) the regex I use, i.e. [01]d(\d{8})([\w+-]+) (you can play with it and see details about how to read it here)...
Check this code.
import os
files = list(x for x in os.listdir('.') if x.is_file())
for i in files:
d = i[2:10] #get data from filename
n = i[10:] #get new filename
if os.path.isdir(i[2:10]):
os.rename(os.getcwd()+i,os.getcwd()+d+"/"+i)
else:
os.mkdir(os.getcwd()+i)
os.rename(os.getcwd()+i,os.getcwd()+d+"/"+i)
Here's is the repl link
Try this out :
import os, shutil
filepath = "your_file_path"
files = list(x for x in os.listdir(filepath) if x.endswith(".xlsx"))
dates = list(set(x[2:10] for x in files))
for j in dates:
os.makedirs(filepath + j)
for i in files:
cid = i[10:]
for j in dates:
if j in i:
os.rename(filepath+i,cid)
shutil.copy2(filepath+cid, filepath+j)

Python file sort

So i wrote this to help me sort files in a bunch of different folders, it works by taking the first file from each folder and creating a folder for it then the 2nd file from each folder and does the same and so on, but when ever I run the code nothing happens can someone help.
import os, sys
path = "\Users\mac\Desktop\soliddd sort\dir"
fdir = os.listdir(path)
f = len(fdir[0])
array = [[] for i in xrange(f)]
def setArray(c, i, j):
array[j][i] = c[j]
def chooseFile(j):
for i in fdir:
setArray(fdir[i], i, j)
def makedir(f, fdir):
for i in f:
folder = r"\Users\mac\Desktop\soliddd sort\dir"+str(i)
if not os.path.exists(folder):
os.makedirs(folder)
for j in fdir:
with open(os.path.join(folder, array[i][j], 'wb')) as temp:
temp.write(buff)
folder.close()
def main():
for j in f:
chooseFile(j)
makedir(f, fdir)
A tiny example on how you can go about sorting files into folders, as I couldn't understand why you were randomly selecting files from different locations. This example gets all the file names, and then lets you select a common filename to sort into a new folder
def sort_files(base_dir="C:\sorted_files")
import os, shutil
folder_list = [
'folder_path_a',
'folder_path_b',
]
file_list = []
for folder in folder_list:
file_list.extend([os.path.join(folder, file_name) for file_name in os.listdir(folder)])
key = raw_input("Group files with <text> in the filename: ")
matching_files = [ file_name for file_name in file_list if key in file_name ]
for file_name in matching_files:
shutil.move(file_name, os.path.join(os.path.join(base_dir,key), os.path.basename(file_name)))

List all the files in all subdirectories from an FTP using Python

I'm new to Python and I'm trying to list all the files in all the sub-directories from an FTP.
The FTP, as usual, is in this format.
A
B
C
Subdirectories :
AA
BB
CC
I could list the directories ['A', 'B', 'C'] using ftp.nlist(). I'd like to get ['AA', 'BB', 'CC'] as my output. I've tried and looked up a lot to find a solution/hint to do this.
I know this is a bit old, but an answer here could have saved me a bit of effort, so here it is. I'm a bit of an amateur, so this is probably not the most efficient way, but here is a program I wrote to get all directories on an FTP server. It will list all directories no matter how far they are down the tree.
from ftplib import FTP
def get_dirs_ftp(folder=""):
contents = ftp.nlst(folder)
folders = []
for item in contents:
if "." not in item:
folders.append(item)
return folders
def get_all_dirs_ftp(folder=""):
dirs = []
new_dirs = []
new_dirs = get_dirs_ftp(folder)
while len(new_dirs) > 0:
for dir in new_dirs:
dirs.append(dir)
old_dirs = new_dirs[:]
new_dirs = []
for dir in old_dirs:
for new_dir in get_dirs_ftp(dir):
new_dirs.append(new_dir)
dirs.sort()
return dirs
host ="your host"
user = "user"
password = "password"
print("Connecting to {}".format(host))
ftp = FTP(host)
ftp.login(user, password)
print("Connected to {}".format(host))
print("Getting directory listing from {}".format(host))
all_dirs = get_all_dirs_ftp()
print("***PRINTING ALL DIRECTORIES***")
for dir in all_dirs:
print(dir)
I wrote a similar solution to Ed Kern, but using the "mlsd" command.
Since Ed Kern`s code would cause an error for files without a filename extension. Using mlsd this error is avoided. Note that very old FTP servers might not have the mlsd command.
from ftplib import FTP
def get_items_mlsd(folder):
filedatas = []
for file_data in ftp.mlsd(folder):
filedatas.append(file_data)
return filedatas
def get_all_dirs_ftp(folder=""):
items = []
new_items = []
new_items = get_items_mlsd(folder)
while len(new_items) > 0:
old_dirs = new_items
new_items = []
for file_data in old_dirs:
file_name, meta = file_data
file_type = meta.get("type")
if file_type != "dir":
items.append(file_name)
else:
news = get_items_mlsd(file_name)
for new in news:
file_name1 , meta = new
file_type = meta.get("type")
if file_type == "dir":
new = list(new)
directory = new[0]
new[0] = file_name + "/" + directory
new = tuple(new)
new_items.append(new)
else:
file_name1 = file_name + "/" + file_name1
items.append(file_name1)
items.sort()
return items
host = "host"
user = "user_name"
password = "pw"
print("Connecting to {}".format(host))
ftp = FTP(host)
ftp.login(user, password)
print("Connected to {}".format(host))
print("Getting file listing from {}".format(host))
all_items = get_all_dirs_ftp()
print("***PRINTING ALL ITEMS***")
with open('ftp_files.txt', 'w') as f:
for dir in all_items:
print(dir)

Categories