Retrieve file from FTP and directly write into a zip archive - Python

I want to download files from an FTP server and archive them locally in a (zip) archive.
It is well known how to download files and save them individually:
import ftplib

remote = ftplib.FTP(ftp_server)
remote.login(username, password)
for filename in file_list:
    remote.retrbinary("RETR " + filename, open(filename, 'wb').write)
remote.quit()
It is also well known how to add files to an archive:
import zipfile

archive = zipfile.ZipFile(archive_file, 'w')
archive.write(filename)
archive.close()
But it seems not possible to use both at the same time:
remote.retrbinary("RETR " + filename, archive.write(filename))
This leads to a FileNotFoundError: archive.write(filename) is evaluated immediately rather than passed as a callback, and filename has not been saved to a local (temporary) directory in between.
Is there a way to send the file stream from FTP directly into a zip archive? Or would it be more efficient to download all the files first, add them to the archive, and then delete them? I would like to keep hard-disk I/O as low as possible.

Download the file to memory and use ZipFile.writestr:
import ftplib
import zipfile
from io import BytesIO
# ...
archive = zipfile.ZipFile(archive_file, "w")
for filename in file_list:
    flo = BytesIO()
    ftp.retrbinary('RETR ' + filename, flo.write)
    archive.writestr(filename, flo.getvalue())
archive.close()
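If the files are large, you can avoid buffering each one fully in memory: since Python 3.6, ZipFile.open accepts mode "w", so the FTP callback can write straight into the archive entry. A minimal sketch, reusing the names from the answer above:

import ftplib
import zipfile

# Assumes Python 3.6+ and the same ftp, archive_file and file_list names
# as above; each download is streamed chunk by chunk into the
# corresponding archive entry.
with zipfile.ZipFile(archive_file, "w") as archive:
    for filename in file_list:
        with archive.open(filename, "w") as dest:
            ftp.retrbinary('RETR ' + filename, dest.write)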

Related

Create a zip file locally and write files from S3

I am creating a zip file on my local machine and would like to write files from S3 into it. So far I'm unable to do it. Here's what I have in the meantime.
import os
import zipfile
from fs import open_fs

fs = open_fs(os.getenv('s3_sample_folder'))
file_names = file_names()  # list of file names
with zipfile.ZipFile('zipfile.zip', mode='w') as zf:
    for file in file_names:
        with fs.open('/' + file, 'rb') as remote_file:
            content = remote_file.read()
            zf.write(content, basename(content))
The ZipFile.write method accepts a file name, not file content. You should use the ZipFile.writestr method instead to write file content to the zip file:
zf.writestr(file, content)
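Putting it together, the corrected loop would look something like this (keeping the question's hypothetical file_names() helper):

import os
import zipfile
from fs import open_fs

fs = open_fs(os.getenv('s3_sample_folder'))
file_names = file_names()  # hypothetical helper returning the list of file names
with zipfile.ZipFile('zipfile.zip', mode='w') as zf:
    for file in file_names:
        with fs.open('/' + file, 'rb') as remote_file:
            # writestr takes the archive name and the content bytes
            zf.writestr(file, remote_file.read())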
Since you are using PyFilesystem, you can open an S3 filesystem and a Zip filesystem, then use copy_file to copy between them.
Something like the following should work:
import os
from fs import open_fs
from fs.copy import copy_file
with open_fs(os.getenv('s3_sample_folder')) as s3_fs:
    with open_fs('zip://zipfile.zip', create=True) as zip_fs:
        for filename in file_names():
            copy_file(s3_fs, filename, zip_fs, filename)

Efficiently zipping files using Python

I need to zip directories:
directory -> directory.zip.
It should be possible to easily open that file on Windows, but creating it should be as fast as possible, something like tar.
The original directory may then be deleted. What would be the best option for that? The only reason for "zipping" the directory is to be able to download it over HTTP as one file.
You can use the built-in zipfile module. In particular ZIP_STORED disables compression.
For extra performance, you could also send the generated zip output directly into the HTTP response, without first creating a file on disk or a buffer in memory.
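For example, a minimal sketch (with a hypothetical file path) of a stored, uncompressed archive:

import zipfile

# ZIP_STORED writes entries without compressing them, which is about as
# fast as tar-ing the directory and still opens natively on Windows.
with zipfile.ZipFile("directory.zip", "w", zipfile.ZIP_STORED) as zf:
    zf.write("directory/some_file.txt")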
For zipping directories I have always used the zipfile module, like so:

import os
import zipfile

def zip(src, dst):
    zf = zipfile.ZipFile("%s.zip" % (dst), "w", zipfile.ZIP_DEFLATED)
    abs_src = os.path.abspath(src)
    for dirname, subdirs, files in os.walk(src):
        for filename in files:
            absname = os.path.abspath(os.path.join(dirname, filename))
            arcname = absname[len(abs_src) + 1:]
            print('zipping %s as %s' % (os.path.join(dirname, filename), arcname))
            zf.write(absname, arcname)
    zf.close()
Then call it like this:
zip(str(source), str(destination))

gzip multiple files in Python

I have to compress a lot of XML files and group them by the date in the file name. For clarification: a parser collects information from each XML file and then moves it to a backup folder. My code needs to gzip the files according to the date in the filename and group those files in a compressed .gz file.
Please find the code below:
import os
import re
import gzip
import shutil
import time

timestr = time.strftime("%Y%m%d%H%M")
# open the log as a file object (print's file= argument needs one)
logfile = open('D:\\Coleta\\log_compactador_xml_tar' + timestr + '.log', 'a')
ptm_dir = "D:\\PTM\\monitored_programs\\"
count_files_mdc = 0
count_files_3gpp = 0
count_tar = 0

for subdir, dirs, files in os.walk(ptm_dir):
    for file in files:
        path = os.path.join(subdir, file)
        try:
            backup_files_dir = path.split(sep='\\')[4]
            parser_id = path.split(sep='\\')[3]
            if re.match('backup_files_*', backup_files_dir):
                if file.endswith('xml'):
                    # print(time.strftime("%Y-%m-%d %H:%M:%S"), path)
                    data_arq = file[1:14]
                    if parser_id in ('parser-924'):
                        gzip_filename_mdc = os.path.join(subdir, 'E4G_PM_MDC_IP51_' + timestr + '_' + data_arq)
                        with open(path, 'r') as f_in, gzip.open(gzip_filename_mdc + ".gz", 'at') as f_out_mdc:
                            shutil.copyfileobj(f_in, f_out_mdc)
                            count_files_mdc += 1
                        print(time.strftime("%Y-%m-%d %H:%M:%S"), "Compressing file MDC: ", path)
                        os.remove(path)
        except PermissionError:
            print(time.strftime("%Y-%m-%d %H:%M:%S"), "Permission error on file:", path, file=logfile)
        except IndexError:
            print(time.strftime("%Y-%m-%d %H:%M:%S"), "IndexError: ", path, file=logfile)
As far as I can see, it creates a stream of data, compresses it, and writes it to a new file with the specified filename. However, instead of grouping each XML file independently inside the ".gz" file, it creates inside the gzip file one big file (one big stream of data?) with the same name as the output gzip file, but without any extension. After the files are compressed, it is not possible to uncompress the big file generated inside the gzip output file. Does anyone know where the problem in my code is?
PS: I have edited the code for readability purposes.
Not sure whether the solution is still needed, but I will just leave it here for anyone who faces the same issue.
The root cause is that gzip compresses a single byte stream and has no notion of multiple members: appending several files to one .gz simply concatenates their streams, which decompress back into one big file. To keep the files separate you need an archive format on top of the compression, and you can create one in Python using tarfile; the code is quite simple:

import tarfile

with tarfile.open(filename, mode="w:gz") as archive:
    archive.add(name=name_of_file_to_add, recursive=True)

In this case name_of_file_to_add can be a directory, in which case tarfile will add it recursively with all its contents.
If you need to add files without a containing directory, a simple for loop with calls to add will do (the recursive flag is not required in this case), as sketched below.
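A minimal sketch of that loop, assuming a hypothetical xml_paths list of files that share the same date:

import os
import tarfile

# xml_paths is a hypothetical list of XML files sharing one date;
# the archive name follows the pattern used in the question.
with tarfile.open("E4G_PM_MDC_IP51_201801010000.tar.gz", mode="w:gz") as archive:
    for xml_path in xml_paths:
        # arcname stores only the base name, so no directory structure
        # is recorded inside the archive
        archive.add(xml_path, arcname=os.path.basename(xml_path))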

Unable to remove zipped file after unzipping

I'm attempting to remove a zipped file after unzipping the contents on Windows. The contents can be stored in a folder structure inside the zip. I'm using the with statement and thought this would close the file-like object (the source variable) and the zip file. I've removed the lines of code relating to saving the source file.
import zipfile
import os

zipped_file = r'D:\test.zip'

with zipfile.ZipFile(zipped_file) as zip_file:
    for member in zip_file.namelist():
        filename = os.path.basename(member)
        if not filename:
            continue
        source = zip_file.open(member)

os.remove(zipped_file)
The error returned is:
WindowsError: [Error 32] The process cannot access the file because it is being used by another process: 'D:\\test.zip'
I've tried:
looping over the os.remove line in case it's a slight timing issue
Using close explicitly instead of the with statement
Attempting it on the local C drive and a mapped D drive
Instead of passing a string to the ZipFile constructor, you can pass it a file-like object:
import zipfile
import os

zipped_file = r'D:\test.zip'

# open the zip in binary mode; ZipFile expects a binary file-like object
with open(zipped_file, mode="rb") as file:
    zip_file = zipfile.ZipFile(file)
    for member in zip_file.namelist():
        filename = os.path.basename(member)
        if not filename:
            continue
        source = zip_file.open(member)

os.remove(zipped_file)
You are opening files inside the zip, which creates a file lock on the whole zip file. Close the inner file first, via source.close() at the end of your loop:
import zipfile
import os

zipped_file = r'D:\test.zip'

with zipfile.ZipFile(zipped_file) as zip_file:
    for member in zip_file.namelist():
        filename = os.path.basename(member)
        if not filename:
            continue
        source = zip_file.open(member)
        source.close()

os.remove(zipped_file)
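Equivalently, you can open each member with a context manager, so the inner handle is closed even if the loop raises; a minimal sketch:

import zipfile
import os

zipped_file = r'D:\test.zip'

with zipfile.ZipFile(zipped_file) as zip_file:
    for member in zip_file.namelist():
        if not os.path.basename(member):
            continue
        # ZipExtFile is itself a context manager, so the inner handle
        # is closed automatically at the end of the block
        with zip_file.open(member) as source:
            data = source.read()

os.remove(zipped_file)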
Try to close the zipfile before removing.
You can also do it like this, which works pretty well:

import os
import shutil
import zipfile

fpath = 'C:/Users/dest_folder'
path = os.getcwd()

for file in os.listdir(path):
    if file.endswith(".zip"):
        dirs = os.path.join(path, file)
        if os.path.exists(fpath):
            shutil.rmtree(fpath)
        os.mkdir(fpath)
        with open(dirs, 'rb') as fileobj:
            z = zipfile.ZipFile(fileobj)
            z.extractall(fpath)
            z.close()
        os.remove(dirs)

Downloading from an FTP server using Python

I have a piece of code in Python to download files from an FTP server. The code downloads the very first file in the list of available days, but fails to download the second. What could be the problem?
import os, ftplib

destdir = 'D:\precipitation\dl'
ftp = ftplib.FTP('ftp.itc.nl')
ftp.login('anonymous', '')
ftp.cwd('pub/mpe/msg')
available_days = ['summsgmpe_20100101.zip', 'summsgmpe_20100102.zip', 'summsgmpe_20100103.zip', 'summsgmpe_20100104.zip', 'summsgmpe_20100105.zip', 'summsgmpe_20100106.zip', 'summsgmpe_20100107.zip', 'summsgmpe_20100108.zip']
hdfs = list()
for day in available_days:
    file = available_days[available_days.index(day)]
    print 'file=', file
    local_file = os.path.join(destdir, file)
    ftp.retrbinary('RETR %s' % file, open(local_file, 'wb').write)
    hdfs.append(os.path.abspath(local_file))
    ftp.cwd('..')
ftp.quit()
Remove your call to ftp.cwd(..)
That's moving up a directory for each iteration of the list, instead of staying in the correct folder where the files are.
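For reference, a minimal corrected sketch (with the cwd call dropped and the local file handle closed explicitly):

import os
import ftplib

destdir = r'D:\precipitation\dl'
available_days = ['summsgmpe_20100101.zip', 'summsgmpe_20100102.zip']  # etc., as in the question

ftp = ftplib.FTP('ftp.itc.nl')
ftp.login('anonymous', '')
ftp.cwd('pub/mpe/msg')

hdfs = []
for filename in available_days:
    local_file = os.path.join(destdir, filename)
    # stay in pub/mpe/msg for every download; no cwd('..') needed
    with open(local_file, 'wb') as fp:
        ftp.retrbinary('RETR %s' % filename, fp.write)
    hdfs.append(os.path.abspath(local_file))
ftp.quit()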
