I'm trying to extract files from a zip file using Python 2.7.1 (on Windows, fyi) and each of my attempts shows extracted files with Modified Date = time of extraction (which is incorrect).
import os,zipfile
# Extract every member of C:\_TEMP\test.zip into C:\_TEMP\, one at a time.
# NOTE: ZipFile.extract() does not restore the modification time stored in
# the archive, so the extracted files carry the time of extraction.
outDirectory = 'C:\\_TEMP\\'
inFile = 'test.zip'
# The with-blocks close both the file handle and the archive even when an
# extraction fails part-way through.
with open(os.path.join(outDirectory, inFile), 'rb') as fh:
    with zipfile.ZipFile(fh) as z:
        for name in z.namelist():
            z.extract(name, outDirectory)
I also tried using the .extractall method, with the same results.
import os,zipfile
# Same extraction as a single extractall() call; modification times are
# still not restored by the zipfile module.
outDirectory = 'C:\\_TEMP\\'
inFile = 'test.zip'
# Context manager so the archive is closed once extraction is finished.
with zipfile.ZipFile(os.path.join(outDirectory, inFile)) as zFile:
    zFile.extractall(outDirectory)
Can anyone tell me what I'm doing wrong?
I'd like to think this is possible without having to post-correct the modified time per How do I change the file creation date of a Windows file?.
Well, it does take a little post-processing, but it's not that bad:
import os
import zipfile
import time
# Extract C:\TEMP\test.zip by hand and stamp every extracted file with the
# modification time recorded in the archive.
outDirectory = 'C:\\TEMP\\'
inFile = 'test.zip'
with open(os.path.join(outDirectory, inFile), 'rb') as fh:
    with zipfile.ZipFile(fh) as z:
        for f in z.infolist():
            name = os.path.join(outDirectory, f.filename)
            if f.filename.endswith('/'):
                # Directory entry: there is no data to write, only a folder
                # to create.
                if not os.path.isdir(name):
                    os.makedirs(name)
                continue
            # Archives do not always carry explicit entries for parent
            # folders, so make sure the parent exists before opening.
            parent = os.path.dirname(name)
            if parent and not os.path.isdir(parent):
                os.makedirs(parent)
            with open(name, 'wb') as outFile:
                outFile.write(z.read(f))
            # date_time is (Y, M, D, h, m, s); pad it to the 9-tuple that
            # mktime() expects. The trailing -1 lets the C library decide
            # whether DST applies. Done after the file is closed so the
            # write cannot overwrite the timestamp.
            date_time = time.mktime(f.date_time + (0, 0, -1))
            os.utime(name, (date_time, date_time))
Okay, maybe it is that bad.
Based on Jia103's answer, I have developed a function (using Python 2.7.14) which preserves directory and file dates AFTER everything has been extracted. This isolates any ugliness in the function, and you can also use zipfile.ZipFile.extractall() or whatever zip extract method you want:
import time
import zipfile
import os
# Restores the timestamps of zipfile contents.
def RestoreTimestampsOfZipContents(zipname, extract_dir):
    """Stamp already-extracted items with the times stored in the archive.

    Call this after the archive has been extracted into ``extract_dir``.

    Args:
        zipname: path of the zip archive that was extracted.
        extract_dir: directory the archive was extracted into.

    Raises:
        OSError: if a member of the archive is missing under ``extract_dir``.
    """
    # Only the member list is needed, so read it and close the archive
    # straight away instead of leaving the handle open.
    with zipfile.ZipFile(zipname, 'r') as archive:
        members = archive.infolist()
    for f in members:
        # path to this extracted f-item
        fullpath = os.path.join(extract_dir, f.filename)
        # date_time is a 6-tuple; pad it to the 9-tuple mktime() expects
        # (the trailing -1 lets the C library decide about DST)
        date_time = time.mktime(f.date_time + (0, 0, -1))
        # set both access and modification time
        os.utime(fullpath, (date_time, date_time))
To preserve dates, just call this function after your extract is done.
Here's an example, from a script I wrote to zip/unzip game save directories:
# Extract the save archive into gamedir, then restore the stored timestamps.
with zipfile.ZipFile(zipname, 'r') as z:
    print('I have opened zipfile %s, ready to extract into %s'
          % (zipname, gamedir))
    try:
        os.makedirs(gamedir)
    except OSError:
        # Most of the time dir already exists; any other failure is real
        # and must not be hidden.
        if not os.path.isdir(gamedir):
            raise
    z.extractall(gamedir)
RestoreTimestampsOfZipContents(zipname, gamedir)  #<-- USED
print('%s zip extract done' % GameName[game])
Thanks everyone for your previous answers!
Based on Ethan Fuman's answer, I have developed this version (using Python 2.6.6) which is a little more concise:
# Extract archive.zip into the current working directory and give every
# extracted item the modification time stored in the archive.
zf = ZipFile('archive.zip', 'r')
# try/finally rather than a with-block: ZipFile only became a context
# manager in Python 2.7, and this snippet targets 2.6.
try:
    for zi in zf.infolist():
        # extract() returns the path it actually created, which can differ
        # from zi.filename once the member name has been normalised.
        extracted = zf.extract(zi)
        date_time = time.mktime(zi.date_time + (0, 0, -1))
        os.utime(extracted, (date_time, date_time))
finally:
    zf.close()
This extracts to the current working directory and uses the ZipFile.extract() method to write the data instead of creating the file itself.
Based on Ber's answer, I have developed this version (using Python 2.7.11), which also accounts for directory mod dates.
from os import path, utime
from sys import exit
from time import mktime
from zipfile import ZipFile
def unzip(zipfile, outDirectory):
    """Extract an archive and restore file and directory modification times.

    Args:
        zipfile: path of the zip archive to read.
        outDirectory: directory to extract into.
    """
    dirs = {}
    with ZipFile(zipfile, 'r') as z:
        for f in z.infolist():
            # extract() returns the path it really created, so use that
            # instead of re-deriving it from the raw member name
            name = z.extract(f, outDirectory)
            # still need to adjust the dt o/w item will have the current dt
            date_time = mktime(f.date_time + (0, 0, -1))
            if path.isdir(name):
                # changes to dir dt will have no effect right now since
                # files are being created inside of it; hold the dt and
                # apply it later
                dirs[name] = date_time
            else:
                utime(name, (date_time, date_time))
    # done creating files, now update dir dt
    for name, date_time in dirs.items():
        utime(name, (date_time, date_time))


if __name__ == "__main__":
    unzip('archive.zip', 'out')
    exit(0)
Since directories are being modified as the extracted files are being created inside them, there appears to be no point in setting their dates with os.utime until after the extraction has completed, so this version caches the directory names and their timestamps till the very end.
Related
My code was working just fine before adding the hash function. I was getting the list of all folders and files in my directory in the Pretty Table. Once I added the hash function, I got maybe 5 of the files in that directory with hashes in the table. I am not sure where I have gone wrong. Please forgive me, I am new to this. We are not learning to code from scratch, but have to modify existing codes to function the way we need it to.
# Python Standard Libaries
import os #file system methode
import hashlib #hashing function
import sys #system methods
import time #time conversions
# Python 3rd Party Libraries
from prettytable import PrettyTable # pip install prettytable
# Local Functions
def GetFileMetaData(fileName):
    """Obtain file system metadata for a single path.

    Args:
        fileName: path of the file to inspect.

    Returns:
        A 4-tuple ``(success, errInfo, fileSize, macTimeList)``.
        On success: ``(True, None, size_in_bytes, [modified, accessed,
        created])`` with the times as epoch seconds.
        On failure: ``(False, error_text, None, None)``.
    """
    try:
        metaData = os.stat(fileName)    # Use the stat method to obtain meta data
    except (OSError, ValueError) as err:
        return False, str(err), None, None

    # Group the MAC times in Modified, Accessed, Created order. The caller
    # reads index 1 as "accessed" and index 2 as "created".
    # NOTE: st_ctime is the creation time on Windows, but the last metadata
    # change on Unix.
    macTimeList = [metaData.st_mtime, metaData.st_atime, metaData.st_ctime]
    return True, None, metaData.st_size, macTimeList
# Psuedo Constants
# Start of the Script
tbl = PrettyTable(['FilePath','FileSize','UTC-Modified', 'UTC-Accessed', 'UTC-Created', 'SHA-256 HASH'])

# file check: keep asking until an existing folder is entered
while True:
    targetFolder = input("Enter Target Folder: ")
    if os.path.isdir(targetFolder):
        break
    print("\nInvalid Folder ... Please Try Again")

print("Walking: ", targetFolder, "\n")
print()

for currentRoot, dirList, fileList in os.walk(targetFolder):
    for nextFile in fileList:
        absPath = os.path.abspath(os.path.join(currentRoot, nextFile))
        success, errInfo, fileSize, macList = GetFileMetaData(absPath)
        if not success:
            continue

        # convert to readable Greenwich time
        # NOTE(review): the indices assume GetFileMetaData returns the list
        # as [modified, accessed, created] -- verify against that function.
        modTime = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(macList[0]))
        accTime = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(macList[1]))
        creTime = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(macList[2]))

        # hashing function: feed the file to SHA-256 in chunks so large
        # files are never held in memory in one piece
        sha256Obj = hashlib.sha256()
        try:
            with open(absPath, 'rb') as target:
                for chunk in iter(lambda: target.read(65536), b''):
                    sha256Obj.update(chunk)
            hexDigest = sha256Obj.hexdigest()
        except OSError as err:
            # one unreadable file must not abort the whole walk
            hexDigest = "ERROR: " + str(err)

        # The row has to be added here, once per file, inside both loops;
        # at any shallower level only the last file of each pass is recorded.
        tbl.add_row( [ absPath, fileSize,modTime, accTime, creTime, hexDigest] )

tbl.align = "l" # align the columns left justified
# display the table
print (tbl.get_string(sortby="FileSize", reversesort=True))
print("\nScript-End\n")
I want to build a function that takes names from a CSV file and fills them into a Word document using the docx library. I also want to create an empty folder with os.makedirs(). The folder is created, but I can't get its path so that I can later join it with the Word document's file name and save the document in that folder.
here is my code:
import docx
import pandas as pd
from datetime import datetime
import os
from docx2pdf import convert
from pathlib import Path
def _set_paragraph_text(paragraph, text):
    """Replace the whole text of *paragraph*, keeping the first run's formatting."""
    paragraph.runs[0].text = text
    # The placeholder may be split over several runs; blank the others so
    # no leftover fragment remains.
    for extra_run in paragraph.runs[1:]:
        extra_run.text = ""


def auto_fill(x, y):
    """Create one filled-in .docx and .pdf per CSV row in a dated folder.

    Args:
        x: path of the CSV file; must have 'Name' and 'serial number' columns.
        y: path of the Word template containing the placeholder paragraphs.

    The output folder is named after today's date (MM-DD-YYYY) and is
    created under the current working directory.
    """
    # Drop incomplete rows, then renumber so no index label is stale.
    df = pd.read_csv(x).dropna(axis=0).reset_index(drop=True)

    strdate = datetime.now().date().strftime("%m-%d-%Y")
    # os.makedirs() returns None, so the folder path has to be built here
    # and kept for later use.
    newfile_path = os.path.join(os.getcwd(), strdate)
    os.makedirs(newfile_path, exist_ok=True)

    for _, row in df.iterrows():
        # Reload the template for every row: once a placeholder has been
        # overwritten it can no longer be found for the next trainee.
        targeted_doc = docx.Document(y)
        # NOTE(review): "tissue date" / "tserial number" are kept exactly as
        # written; they look like typos -- verify against the template text.
        values = {
            "Name of trainee": str(row['Name']),
            "tissue date": strdate,
            "tserial number": str(row['serial number']),
        }
        for paragraph in targeted_doc.paragraphs:
            if paragraph.text in values and paragraph.runs:
                _set_paragraph_text(paragraph, values[paragraph.text])

        name_of_file = f"{row['Name']}.docx"
        # Document.save() returns None; the saved path is the one passed in.
        completesave = os.path.join(newfile_path, name_of_file)
        targeted_doc.save(completesave)
        convert(completesave, os.path.join(newfile_path, f"{name_of_file}.pdf"))
auto_fill("database.csv","01.docx")
If I'm understanding what you're trying to accomplish, then just use the path variable you made earlier. Since you used os.makedirs(path), then the path to that would just be the path object.
If you don't change the location, you can use the same path. If you change the location, you can get the path from your script using os.getcwd() and join it with the path using os.path.join()
You can store the result of os.path.join(os.getcwd(), path) to a variable and use it later. You can compose that absolute path before creating the file, so you'll have the entire path
My goal is to take the contents of all text files in subfolders created today and move them to a single existing report.txt but I can't seem to find a good way to go about it. I'm not very experienced in coding so any help would be much appreciated. Here is what I have so far (I know it's rubbish):
# NOTE(review): fragment as posted -- block indentation is missing, and
# `today`, `drive`, `content` and `reportFile` are not defined in the
# visible code.
# NOTE(review): `getmtime` is compared as a bare name; presumably
# os.path.getmtime(<path>) was meant to be called per file -- confirm.
if getmtime == today:
# NOTE(review): glob.iglob() yields many paths, but open() takes a single
# path, so the matches would have to be looped over and opened one by one.
with open(glob.iglob(drive + "://CADIQ//CADIQ_JOBS//?????????????????????")) as f:
for line in f:
content += line
# Append the collected text to the report file.
with open(reportFile, "a") as f:
f.write(content)
Try this, based on How do I list all files of a directory?
import os, time
def last_mod_today(path):
    '''
    return True if the last modification time of path falls on today's
    local calendar date (same year, month and day), False otherwise
    '''
    t_s = time.localtime(os.path.getmtime(path))
    today = time.localtime()
    # the first three struct_time fields are (tm_year, tm_mon, tm_mday)
    return t_s[:3] == today[:3]
def name_to_path(d, x):
    '''
    return the normalised absolute path of entry x inside directory d,
    where d is taken relative to the current working directory
    '''
    return os.path.normpath(os.path.join(os.getcwd(), d, x))
def log_files(d):
    '''
    walk through the files directly inside d (no recursion) and append to
    log.txt the content of every file whose last modification date is today

    d is taken relative to the current working directory
    files that are not valid text are still logged: undecodable bytes are
    replaced instead of raising
    '''
    scand_dir = os.path.join(os.getcwd(), d)
    print(f"scanning {scand_dir}...")
    # the default keeps a missing directory from raising StopIteration
    (_, _, filenames) = next(os.walk(scand_dir), (None, [], []))
    log_path = os.path.normcase(os.path.abspath("log.txt"))
    with open("log.txt", 'a') as log:
        for f in filenames:
            full_path = name_to_path(d, f)
            # log.txt itself was just touched by opening it; copying it
            # into itself would duplicate earlier output on every run
            if os.path.normcase(full_path) == log_path:
                continue
            if not last_mod_today(full_path):
                continue
            with open(full_path, 'r', errors='replace') as todays_file:
                log.write('##############################\n')
                log.write(f"file : {full_path}\n")
                log.write(todays_file.read())
                log.write('\n')
                log.write('##############################\n')
#first scanning files in the current directory
(_, dirnames, _) = next(os.walk('./'))
log_files('./')
#then crawling through the subdirs (one level)
for d in dirnames:
    log_files(d)
I would start by creating a desired_date object, which is a datetime.date. You can then format that date into a string, which makes up the pattern you want to look for in your glob. The glob pattern doesn't care about the time, just the date.
from pathlib import Path
import datetime
# Build a glob pattern from the wanted date; the folder names are expected
# to look like 13.2.1_YY_MM_DD_<anything>.
desired_date = datetime.date(year=2020, month=12, day=22)
pattern = "13.2.1_" + desired_date.strftime("%y_%m_%d") + "_*"
for path in Path("path/to/folders").glob(pattern):
    # only folders are of interest; skip plain files that match the pattern
    if path.is_dir():
        print(path)
From there, you can visit each path, glob all text files in the current path, and accumulate the lines in each text file. Finally, write everything to one file.
import glob
# Concatenate every .txt file found one folder level down into report.txt.
with open('report.txt', 'wb') as rep:
    for txt_path in glob.glob('./*/*.txt'):  # u can change as per your directory
        # Open the path exactly as glob returned it: it already contains
        # the sub-folder, so stripping it would point at a missing file.
        with open(txt_path, 'rb') as f1:
            # Write each file straight through instead of growing one big
            # bytes object in memory.
            rep.write(f1.read())
Hope this helps so :)
Better try to read or write files in terms of bytes because sometimes there may be a chance of corrupting data.
I have been scouring the internet trying to find a pythonic (sp?!) way to process this data..
Every day we will receive a load of data in .dbf format (hopefully) - we then need to save this data as a shapefile.
Does anyone have any links or any suggestions as to my process?
To append the file's creation_date to its name, you need to obtain the creation date with os.stat() and then rename the file with os.rename(). You can format the date string with date.strftime().
import datetime, os
# Rename the file to "<name>-YYYY-MM-DD", using the date taken from st_ctime.
filename = 'original.ext'
fileinfo = os.stat(filename)
creation_date = datetime.date.fromtimestamp(fileinfo.st_ctime)
# Build the target name first so the rename call stays short.
dated_name = '-'.join([filename, creation_date.strftime('%Y-%m-%d')])
os.rename(filename, dated_name)
Off the top of my head:
import os
import datetime
# Insert the st_ctime date (YYYYMMDD) between the file's stem and extension.
myfile = "test.txt"
stem, extension = os.path.splitext(myfile)
creationdate = os.stat(myfile).st_ctime
timestamp = datetime.datetime.fromtimestamp(creationdate)
datestr = timestamp.strftime("%Y%m%d")
os.rename(myfile, stem + datestr + extension)
renames test.txt to test20110221.txt.
It was in model builder all along!
# (generated by ArcGIS/ModelBuilder)
# Usage: DBF2SHAPEFILE <XY_Table> <Y_Field> <X_Field> <Output_Feature_Class>
# ---------------------------------------------------------------------------
# Import system modules
import sys, string, os, arcgisscripting, datetime
# Appends the date taken from st_ctime to the name of one hard-coded file
# (D:/test.txt); the date is placed after the extension.
# NOTE(review): only this single file is renamed here, not every shapefile
# in a folder -- confirm that is what is wanted.
filename = 'D:/test.txt'
fileinfo = os.stat(filename)
creation_date = datetime.date.fromtimestamp(fileinfo.st_ctime)
os.rename(filename, filename + '-' + creation_date.strftime('%Y-%m-%d'))
# Create the Geoprocessor object
gp = arcgisscripting.create()
# Load required toolboxes...
gp.AddToolbox("C:/Program Files/ArcGIS/ArcToolbox/Toolboxes/Data Management Tools.tbx")
# Script arguments (see the Usage line): table, Y field, X field, output
XY_Table = sys.argv[1]
Y_Field = sys.argv[2]
X_Field = sys.argv[3]
Output_Feature_Class = sys.argv[4]
# Local variables...
# NOTE(review): the layer name is an empty string and is passed to both
# geoprocessor calls below -- presumably ModelBuilder left it blank; verify
# that an empty name is accepted.
Layer_Name_or_Table_View = ""
# Process: Make XY Event Layer...
# Note the argument order: X_Field is passed before Y_Field here, although
# the command line supplies Y first.
gp.MakeXYEventLayer_management(XY_Table, X_Field, Y_Field, Layer_Name_or_Table_View, "")
# Process: Copy Features...
gp.CopyFeatures_management(Layer_Name_or_Table_View, Output_Feature_Class, "", "0", "0", "0")
If you wanted to do it without using ArcGIS, you could use OGR's python bindings or the ogr2ogr utility through a subprocess. You could use the utility through a windows batch file, which would be a lot faster than calling the arc process for every file if you have many to do...
As you know it's not a question of changing the extension, there is a specific format required.
I know how to extract a zip archive using Python, but how exactly do I display the progress of that extraction in a percentage?
I suggest using tqdm, you can install it using pip like so:
pip install tqdm
Then, you can use it directly like so:
>>> from tqdm import tqdm
>>>
>>> with zipfile.ZipFile(some_source) as zf:
... for member in tqdm(zf.infolist(), desc='Extracting '):
... try:
... zf.extract(member, target_path)
... except zipfile.error as e:
... pass
This will produce something like so:
Extracting : 100%|██████████| 60.0k/60.0k [14:56<00:00, 66.9File/s]
The extract method doesn't provide a callback for this, so one would have to use getinfo to get the uncompressed size, then open the entry, read from it in blocks, write each block to the place you want the file to go, and update the percentage. One would also have to restore the mtime if that is wanted. An example:
import zipfile
# Copy one archive entry to target_name block by block, reporting progress
# after every block that is written.
with zipfile.ZipFile(some_source) as z:
    entry_info = z.getinfo(entry_name)
    offset = 0
    # 'wb': archive members are bytes; text mode would corrupt them on
    # Windows and refuses bytes altogether on Python 3.
    with z.open(entry_name) as i, open(target_name, 'wb') as o:
        while True:
            b = i.read(block_size)
            # an empty read means end of data ('' on Python 2, b'' on 3)
            if not b:
                break
            o.write(b)
            offset += len(b)
            set_percentage(float(offset) / float(entry_info.file_size) * 100.)
    if entry_info.file_size == 0:
        # empty entry: nothing was read, so report completion directly
        # instead of dividing by zero
        set_percentage(100.)
set_attributes_from(entry_info)
this extracts entry_name to target_name
most of this is also done by shutil.copyfileobj but it doesn't have a call back for progress either
the source of the ZipFile.extract method calls _extract_member uses:
source = self.open(member, pwd=pwd)
target = file(targetpath, "wb")
shutil.copyfileobj(source, target)
source.close()
target.close()
where member has been converted from a name to a ZipInfo object by getinfo(member) if it wasn't already a ZipInfo object
Sorry, a bit late seeing this. I had a similar problem, needing an equivalent to zipfile.ZipFile.extractall. If you have tqdm>=4.40.0 (which I released over a year ago), then:
from os import fspath
from pathlib import Path
from shutil import copyfileobj
from zipfile import ZipFile
from tqdm.auto import tqdm # could use from tqdm.gui import tqdm
from tqdm.utils import CallbackIOWrapper
def extractall(fzip, dest, desc="Extracting"):
    """zipfile.ZipFile(fzip).extractall(dest) with a byte-level progress bar.

    Args:
        fzip: path (or file object) of the zip archive.
        dest: directory to extract into; ``~`` is expanded.
        desc: label shown in front of the progress bar.
    """
    dest = Path(dest).expanduser()
    with ZipFile(fzip) as zipf, tqdm(
        desc=desc, unit="B", unit_scale=True, unit_divisor=1024,
        total=sum(i.file_size for i in zipf.infolist()),
    ) as pbar:
        for i in zipf.infolist():
            if not i.file_size:
                # directory or empty file: nothing to count, so let
                # ZipFile.extract() create it
                zipf.extract(i, fspath(dest))
                continue
            # NOTE(review): unlike ZipFile.extract(), the member name is
            # used as-is here (no sanitising of absolute or ".." paths);
            # only use this on archives you trust.
            target = dest / i.filename
            # Archives do not always contain entries for parent folders.
            target.parent.mkdir(parents=True, exist_ok=True)
            with zipf.open(i) as fi, open(fspath(target), "wb") as fo:
                # every read of the wrapped stream advances the bar
                copyfileobj(CallbackIOWrapper(pbar.update, fi), fo)
For the lazy, below is a self-contained working example based on Dan D's answer. Tested on Python 3.10.6. Not optimized, but works.
In this example, the assumption is that the target "test" directory exists, but you can of course create it in the extract function.
The advantage of Dan's answer over most of the answers I've seen for this topic is that showing progress each time a file from the archive is processed does not achieve the goal if the archive consists of very large files.
import zipfile
import os
from pathlib import Path
def extract(zip_path, target_path):
    """Extract an archive, printing per-file progress as a percentage.

    Each entry name is printed, followed by the running percentage of that
    entry after every block written.

    Args:
        zip_path: path of the zip archive to read.
        target_path: directory to extract into.
    """
    block_size = 8192
    with zipfile.ZipFile(zip_path) as z:
        for entry_info in z.infolist():
            entry_name = entry_info.filename
            print(entry_name)
            destination = Path(f"{target_path}/{entry_name}")
            if entry_name.endswith('/'):
                # directory entry: create it so empty folders survive
                destination.mkdir(parents=True, exist_ok=True)
                continue
            destination.parent.mkdir(parents=True, exist_ok=True)
            offset = 0
            with z.open(entry_info) as i, open(destination, 'wb') as o:
                while True:
                    b = i.read(block_size)
                    if not b:
                        break
                    o.write(b)
                    offset += len(b)
                    print(float(offset) / float(entry_info.file_size) * 100.)
            if entry_info.file_size == 0:
                # empty file: report completion directly rather than
                # computing 0 / 0
                print(100.)
extract("test.zip", "test")
import zipfile
# Extract srcZipFile.zip member by member into the distZipFile folder,
# printing the percentage of members done.
srcZipFile = 'srcZipFile.zip'
distZipFile = 'distZipFile'
with zipfile.ZipFile(srcZipFile) as zf:
    filesList = zf.namelist()
    # Count from 1 and report after each extraction so the last member
    # prints 100 instead of stopping short of it.
    for idx, file in enumerate(filesList, start=1):
        zf.extract(file, distZipFile)
        percent = round((idx / len(filesList))*100)
        print(percent)
# no explicit close(): the with-block has already closed the archive