How to modify file content using python? - python

The following code reads a file and uses the syntax tree to append a full stop to each docstring in that file. How can I save the changes back to the file? I understand that the present code does not change the content of the original file, only the local variables that hold it. Can you suggest changes and, if possible, point to a learning resource as well?
astcode.py
import ast
import sys
import os
filename = sys.argv[1]

# Node types whose first body statement may be a docstring.
_DOCSTRING_OWNERS = (
    ast.Module,
    ast.ClassDef,
    ast.FunctionDef,
    ast.AsyncFunctionDef,
)


def _with_full_stop(text):
    """Return *text* ending in a full stop.

    Blank text and text that already ends in '.' are returned unchanged.
    Trailing whitespace (e.g. the newline + indent that closes a multi-line
    docstring) is kept after the inserted full stop.
    """
    stripped = text.rstrip()
    if not stripped or stripped.endswith('.'):
        return text
    return stripped + '.' + text[len(stripped):]


def _docstring_node(owner):
    """Return the ast.Constant holding *owner*'s docstring, or None."""
    if not owner.body:
        return None
    first = owner.body[0]
    if (isinstance(first, ast.Expr)
            and isinstance(first.value, ast.Constant)
            and isinstance(first.value.value, str)):
        return first.value
    return None


with open(filename) as fd:
    file_contents = fd.read()
module = ast.parse(file_contents)

changed = False
for owner in ast.walk(module):
    if not isinstance(owner, _DOCSTRING_OWNERS):
        continue
    constant = _docstring_node(owner)
    if constant is None:
        continue
    updated = _with_full_stop(constant.value)
    if updated != constant.value:
        constant.value = updated
        changed = True
    print(constant.value)

if changed:
    # ast.unparse (Python 3.9+) regenerates the source from the tree, so
    # comments and the original formatting are not preserved.
    with open(filename, 'w') as fd:
        fd.write(ast.unparse(module) + '\n')
Example
testfile.py
def readDictionaryFile(dictionary_filename):
    """readDictionaryfile doc string"""
    return []


def readTextFile(text_filename):
    """readTextfile doc string"""
    return []
$ python3 astcode.py testfile.py
Expected
testfile.py
def readDictionaryFile(dictionary_filename):
    """readDictionaryfile doc string."""
    return []


def readTextFile(text_filename):
    """readTextfile doc string."""
    return []
Note: Fullstop(.) appended.

Looking at the documentation link, I notice there's a NodeVisitor and NodeTransformer with a code example. I looked at how they unparse a function def, and it's basically the same as you've done in your original question, so I used that.
# https://docs.python.org/3/library/ast.html#ast.NodeTransformer
class MyDocstringTransformer(ast.NodeTransformer):
    """Append a full stop to function docstrings that do not end with one.

    Handles both ``def`` and ``async def`` functions, including nested ones.
    Blank docstrings and docstrings already ending in '.' are left untouched.
    """

    def visit_FunctionDef(self, node):
        """Fix *node*'s docstring in place and return the node."""
        # Recurse first so functions nested inside this one are visited too.
        self.generic_visit(node)
        if not node.body:
            return node
        first = node.body[0]
        if (isinstance(first, ast.Expr)
                and isinstance(first.value, ast.Constant)
                and isinstance(first.value.value, str)):
            docstring = first.value.value
            stripped = docstring.rstrip()
            if stripped and not stripped.endswith('.'):
                # Insert before any trailing whitespace so multi-line
                # docstrings keep their closing newline/indent.
                first.value.value = (
                    stripped + '.' + docstring[len(stripped):]
                )
        return node

    # Coroutine definitions carry docstrings the same way.
    visit_AsyncFunctionDef = visit_FunctionDef
Using python 3.9's ast module gets us https://docs.python.org/3/library/ast.html#ast.unparse which is about as close as we can get to changing the ast node and then rewriting the original file.
# Parse the source text, run the docstring transformer over the tree and print
# the regenerated source. ast.unparse is only available on Python 3.9+.
# NOTE(review): file_contents is assumed to hold the text of the target file,
# as read in the question's script - confirm before reuse.
tree = ast.parse(file_contents)
new_tree = MyDocstringTransformer().visit(tree)
print(ast.unparse(new_tree))
Instead of just overwriting to the same filename, you may want to write to a temp file, then let the OS attempt to delete the old file and rename the temp file to the old name, thus performing the replace in the OS.

Related

How to update a file inside a folder in a zipfile without unzipping the zip in Python? [duplicate]

I have archive.zip with two files: hello.txt and world.txt
I want to overwrite hello.txt file with new one with that code:
import zipfile
# Append hello.txt to the archive; the context manager closes the archive even
# if write() raises. Appending does not replace an existing member of the same
# name - a second 'hello.txt' entry is added.
with zipfile.ZipFile('archive.zip', 'a') as z:
    z.write('hello.txt')
but it won't overwrite file, somehow it creates another instance of hello.txt — take a look at winzip screenshot:
Since there is no smth like zipfile.remove(), what's the best way to handle this problem?
There's no way to do that with python zipfile module. You have to create a new zip file and recompress everything again from the first file, plus the new modified file.
Below is some code to do just that. But note that it isn't efficient, since it decompresses and then recompresses all data.
import tempfile
import zipfile
import shutil
import os
def remove_from_zip(zipfname, *filenames):
    """Rewrite the archive *zipfname* without the members named in *filenames*.

    Every remaining member is decompressed and recompressed into a temporary
    archive, which then replaces the original file. Names that are not in the
    archive are ignored.
    """
    excluded = set(filenames)
    tempdir = tempfile.mkdtemp()
    try:
        tempname = os.path.join(tempdir, 'new.zip')
        with zipfile.ZipFile(zipfname, 'r') as zipread, \
                zipfile.ZipFile(tempname, 'w') as zipwrite:
            for item in zipread.infolist():
                if item.filename in excluded:
                    continue
                # Read via the ZipInfo object rather than the name so that
                # duplicate entries each keep their own data.
                zipwrite.writestr(item, zipread.read(item))
        # Both archives are closed at this point, so the move is safe.
        shutil.move(tempname, zipfname)
    finally:
        shutil.rmtree(tempdir)
Usage:
# Drop the stale member first, then append the new version of the file.
remove_from_zip('archive.zip', 'hello.txt')
with zipfile.ZipFile('archive.zip', 'a') as z:
    z.write('hello.txt')
Building on nosklo's answer.
UpdateableZipFile: a class that inherits from ZipFile and maintains the same interface, but adds the ability to overwrite files (via writestr or write) and to remove files.
import os
import shutil
import tempfile
from zipfile import ZipFile, ZIP_STORED, ZipInfo
class UpdateableZipFile(ZipFile):
    """
    Add delete (via remove_file) and update (via writestr and write methods)
    To enable update features use UpdateableZipFile with the 'with statement',
    Upon __exit__ (if updates were applied) a new zip file will override the exiting one with the updates
    """

    class DeleteMarker(object):
        """Sentinel stored in ``_replace`` for members scheduled for removal."""

    def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=False):
        # Init base
        super().__init__(file, mode=mode, compression=compression,
                         allowZip64=allowZip64)
        # Maps member name -> temp file with replacement data, or DeleteMarker
        self._replace = {}
        # Whether the with statement was entered
        self._allow_updates = False

    def _replacement_file(self, name):
        """Return the temp file buffering the replacement for *name*.

        The file is created on first use only, so repeated writes to the same
        member do not leak a fresh temp file per call. A pending DeleteMarker
        is superseded by a new temp file.
        """
        current = self._replace.get(name)
        if current is None or isinstance(current, self.DeleteMarker):
            current = self._replace[name] = tempfile.TemporaryFile()
        return current

    def writestr(self, zinfo_or_arcname, bytes, compress_type=None):
        """Write *bytes* to a member, buffering it if the member already exists.

        Existing members are only replaced when the archive is used as a
        context manager; otherwise this behaves like ``ZipFile.writestr``.
        """
        if isinstance(zinfo_or_arcname, ZipInfo):
            name = zinfo_or_arcname.filename
        else:
            name = zinfo_or_arcname
        if self._allow_updates and name in self.namelist():
            # The temp file is binary; encode text the way ZipFile.writestr does.
            data = bytes.encode("utf-8") if isinstance(bytes, str) else bytes
            self._replacement_file(name).write(data)
        else:
            super().writestr(zinfo_or_arcname, bytes,
                             compress_type=compress_type)

    def write(self, filename, arcname=None, compress_type=None):
        """Add the file *filename*, buffering it if *arcname* already exists.

        Existing members are only replaced when the archive is used as a
        context manager; otherwise this behaves like ``ZipFile.write``.
        """
        arcname = arcname or filename
        if self._allow_updates and arcname in self.namelist():
            with open(filename, "rb") as source:
                shutil.copyfileobj(source, self._replacement_file(arcname))
        else:
            super().write(filename, arcname=arcname,
                          compress_type=compress_type)

    def __enter__(self):
        # Allow updates
        self._allow_updates = True
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        try:
            # Close the archive first so the on-disk file is complete
            super().__exit__(exc_type, exc_val, exc_tb)
            if self._replace:
                self._rebuild_zip()
        finally:
            # In case rebuild zip failed,
            # be sure to still release all the temp files
            self._close_all_temp_files()
            self._allow_updates = False

    def _close_all_temp_files(self):
        """Close every pending temp file and forget all pending changes."""
        for pending in self._replace.values():
            if hasattr(pending, 'close'):
                pending.close()
        self._replace.clear()

    def remove_file(self, path):
        """Schedule the member *path* for removal when the archive is closed."""
        previous = self._replace.get(path)
        # Release a buffered replacement that the deletion supersedes.
        if hasattr(previous, 'close'):
            previous.close()
        self._replace[path] = self.DeleteMarker()

    def _rebuild_zip(self):
        """Rewrite the archive on disk, applying replacements and deletions."""
        tempdir = tempfile.mkdtemp()
        try:
            temp_zip_path = os.path.join(tempdir, 'new.zip')
            with ZipFile(self.filename, 'r') as zip_read:
                # Create new zip with assigned properties
                with ZipFile(temp_zip_path, 'w', compression=self.compression,
                             allowZip64=self._allowZip64) as zip_write:
                    for item in zip_read.infolist():
                        # Check if the file should be replaced / or deleted
                        replacement = self._replace.pop(item.filename, None)
                        # If marked for deletion, do not copy it over
                        if isinstance(replacement, self.DeleteMarker):
                            continue
                        if replacement is not None:
                            # Use the buffered data, then close the temp file
                            # (which also deletes it)
                            replacement.seek(0)
                            data = replacement.read()
                            replacement.close()
                        else:
                            data = zip_read.read(item)
                        zip_write.writestr(item, data)
            # Override the archive with the updated one
            shutil.move(temp_zip_path, self.filename)
        finally:
            shutil.rmtree(tempdir)
usage example:
# Raw string: "\T" is not a valid escape sequence in a normal string literal.
with UpdateableZipFile(r"C:\Temp\Test2.docx", "a") as o:
    # Overwrite a file with a string
    o.writestr("word/document.xml", "Some data")
    # exclude an existing file from the zip
    o.remove_file("word/fontTable.xml")
    # Write a new file (with no conflict) to the zip
    o.writestr("new_file", "more data")
    # Overwrite a file with a file
    o.write(r"C:\Temp\example.png", "word/settings.xml")
Based on this answer, here's a quick and dirty way to monkey-patch the stock zipfile module to support file deletion (while we wait for the feature to be accepted into python:main):
from zipfile import ZipFile, ZipInfo
from operator import attrgetter
import functools
def enable_zip_remove(func):
    """Decorator: make sure ``ZipFile`` has a ``remove`` method, then call *func*.

    If ``ZipFile`` has no ``remove`` attribute, one is monkey-patched onto the
    class the first time the wrapped function runs. The patch relies on
    ZipFile internals (``fp``, ``start_dir``, ``filelist``, ``NameToInfo``,
    ``_didModify``, ``_writing``).
    """

    def _zipfile_remove_member(self, member):
        # get a sorted filelist by header offset, in case the dir order
        # doesn't match the actual entry order
        fp = self.fp
        entry_offset = 0
        filelist = sorted(self.filelist, key=attrgetter('header_offset'))
        last_index = len(filelist) - 1
        for i, info in enumerate(filelist):
            # skip everything stored before the target member
            if info.header_offset < member.header_offset:
                continue

            # total size of the entry: up to the next entry's header, or up to
            # the central directory for the last entry
            if i == last_index:
                entry_size = self.start_dir - info.header_offset
            else:
                entry_size = filelist[i + 1].header_offset - info.header_offset

            # found the member: later entries shift back by its size
            if member == info:
                entry_offset = entry_size
                continue

            # Move entry
            # read the actual entry data
            fp.seek(info.header_offset)
            entry_data = fp.read(entry_size)

            # update the header
            info.header_offset -= entry_offset

            # write the entry to the new position
            fp.seek(info.header_offset)
            fp.write(entry_data)
            fp.flush()

        # update state
        self.start_dir -= entry_offset
        self.filelist.remove(member)
        del self.NameToInfo[member.filename]
        self._didModify = True

        # seek to the start of the central dir
        fp.seek(self.start_dir)

    def zipfile_remove(self, member):
        """Remove a file from the archive. The archive must be open with mode 'a'"""
        if self.mode != 'a':
            raise RuntimeError("remove() requires mode 'a'")
        if not self.fp:
            raise ValueError(
                "Attempt to write to ZIP archive that was already closed")
        if self._writing:
            raise ValueError(
                "Can't write to ZIP archive while an open writing handle exists."
            )

        # Make sure we have an info object
        if isinstance(member, ZipInfo):
            # 'member' is already an info object
            zinfo = member
        else:
            # get the info object
            zinfo = self.getinfo(member)

        return self._zipfile_remove_member(zinfo)

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Patch lazily, and only when ZipFile has no remove() of its own.
        if not hasattr(ZipFile, "remove"):
            setattr(ZipFile, "_zipfile_remove_member", _zipfile_remove_member)
            setattr(ZipFile, "remove", zipfile_remove)
        return func(*args, **kwargs)

    return wrapper
Usage:
@enable_zip_remove
def replace_zip_file():
    """Replace hello.txt inside archive.zip with the copy on disk."""
    with ZipFile("archive.zip", "a") as z:
        z.remove("hello.txt")
        z.write("hello.txt")
P.S. NSFW
My solution is similar to the other answers but uses SQLite to manage the intermediate files and provides __getitem__, __setitem__ and __delitem__ for an easy interface.
By default the db is in-memory but you can provide a temp file path if you have a zip larger than available memory.
And of course SQLite is built into Python and faster than the file system
import sqlite3
import subprocess
import zipfile
from pathlib import Path
from sql import CREATE_TABLE, DELETE_FILE, INSERT_FILE, SELECT_CONTENT
class EditableZip:
    """Intended to make editing files inside zip archive easy, this class is capable of loading files
    from a zip file into a sqlite database, facilitates editing/removing/adding files, and saving
    to a zip.

    The database can be in-memory (default) or in a temporary on disk file if
    temp_db_path is provided.
    If an on-disk file is used, EditableZip.close can be called to remove the file or EditableZip
    can be used as a context manager.

    If auto_save is set to True and an initial zip_path was provided then the file will
    be overwritten when EditableZip closes. If you wish to save to a different file,
    or no zip_path is used in instantiation, auto_save can take a file path.

    Files can be added by item assignment

        with EditableZip(auto_save="example.zip") as ez:
            ez["thing.txt"] = "stuff"
            # empty dir
            ez["empty/"] = None

    Assignment accepts Non-text files as bytes.

    EditableZip is subscriptable. If the subscript is a path in the db, the data will be returned.
    EditableZip.files can be used to iterate over files in the db.
    """

    def __init__(
        self,
        zip_path: None | str | Path = None,
        temp_db_path: None | Path = None,
        auto_save: bool | str | Path = False,
    ):
        self.temp_db_path, self.auto_save, self.file_path = (
            temp_db_path,
            auto_save,
            zip_path,
        )
        self.db = sqlite3.connect(
            str(temp_db_path if temp_db_path is not None else ":memory:")
        )
        self.db.execute(CREATE_TABLE)
        if self.file_path:
            self.load(self.file_path)

    @property
    def files(self):
        "Returns a generator of all file paths in the database."
        # fetchall() snapshots the rows, so callers may assign or delete
        # entries while iterating.
        rows = self.db.execute("SELECT file_path FROM files").fetchall()
        return (row[0] for row in rows)

    def load(self, zip_path: str | Path) -> None:
        "Add all files from zip at zip_path to db."
        with zipfile.ZipFile(zip_path, mode="r") as archive:
            for item in archive.infolist():
                # Directory entries (names ending in "/") are stored with
                # None as their content.
                self[item.filename] = None if item.is_dir() else archive.read(item)

    def save(self, zip_path: None | str | Path = None) -> str | Path:
        """Save all files from db to zip at zip_path.

        Falls back to the path given at construction when zip_path is None.
        Raises ValueError when neither path is available. Returns the path
        that was written.
        """
        zip_path = self.file_path if zip_path is None else zip_path
        if zip_path is None:
            raise ValueError("no zip_path given and no initial zip_path to fall back on")
        with zipfile.ZipFile(zip_path, "w") as archive:
            for file in self.files:
                if file_data := self.fetch(file):
                    archive.writestr(file, file_data)
                else:
                    # No content: directory entry or empty file.
                    archive.writestr(zipfile.ZipInfo(file), "")
        return zip_path

    def close(self):
        "Auto save if applicable and close + remove db."
        if self.auto_save:
            self.save(
                zip_path=self.auto_save
                if isinstance(self.auto_save, (str, Path))
                else None
            )
        self.db.close()
        if isinstance(self.temp_db_path, Path):
            self.temp_db_path.unlink(missing_ok=True)

    def fetch(self, file_path: str) -> bytes | None:
        "Get content of db file for file_path, or None if there is no such row."
        row = self.db.execute(SELECT_CONTENT, {"file_path": file_path}).fetchone()
        return None if row is None else row[0]

    def __getitem__(self, key):
        """Return the content stored for key.

        Bytes are decoded as UTF-8 text when possible; content that is not
        valid UTF-8 is returned as raw bytes instead of raising.
        """
        result = self.fetch(key)
        if isinstance(result, bytes):
            try:
                return result.decode("utf-8")
            except UnicodeDecodeError:
                return result
        return result

    def __setitem__(self, file_path, content: str | bytes | None):
        # Text is stored UTF-8 encoded; bytes and None are stored as given.
        if isinstance(content, str):
            content = content.encode("utf-8")
        self.db.execute(
            INSERT_FILE,
            {"file_path": file_path, "file_content": content},
        )

    def __delitem__(self, file_path):
        self.db.execute(DELETE_FILE, {"file_path": file_path})

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
if __name__ == "__main__":
    # A use case: editing epub files.
    # File source:
    # https://archiveofourown.org/downloads/13795605/Victoria%20Potter%20and%20the.epub?updated_at=1650231615
    file_path = Path("Victoria Potter and the.epub")
    # with_name keeps the suffix intact even if the stem itself contains dots.
    new_file = file_path.with_name(file_path.stem + "- lowercase" + file_path.suffix)

    # Create a copy of the epub with all letters lowercase
    with EditableZip(zip_path=file_path, auto_save=new_file) as ez:
        for file in ez.files:
            if Path(file).suffix in [".html", ".xhtml"]:
                ez[file] = ez[file].lower()
Reference: Delete file from zipfile with the ZipFile Module
In short,
You can take the code from https://github.com/python/cpython/blob/659eb048cc9cac73c46349eb29845bc5cd630f09/Lib/zipfile.py and create a separate file from it. After that just reference it from your project instead of built-in python library: import myproject.zipfile as zipfile.
Usage:
with zipfile.ZipFile(f"archive.zip", "a") as z:
z.remove(f"firstfile.txt")

read a dict into self

I have a class that is doing a lot of stuff. In the end, it saves everything into a pickle. When I rerun this class I want to read the pickle instead of doing everything again. Unfortunately, the variable is always empty if I unpickle. Why is that so?
import pandas as pd
# Question snippet: a class that tries to replace itself with a cached pickle.
# NOTE(review): incomplete as posted (no body after `else:`, missing ')').
class Test:
# NOTE(review): `self` is absent from the parameter list although it is used below.
def __init__(path, value):
# path points to a .txt file, but it is in the same folder as the pickle
data_path, data = os.path.split(path)
# NOTE(review): `name` is not defined in this snippet, and the closing
# parenthesis of os.path.join(...) is missing on the next line.
pickle_path = os.path.join(data_path, name.split('.')[1] + '.pickle'
if os.path.isfile(pickle_path):
# Rebinding `self` only changes the local name; the instance returned by
# Test(...) is not affected, so the loaded data is discarded.
# NOTE(review): this reads `path`, not the `pickle_path` computed above.
self = pd.read_pickle(path)
else:
# do a ton of stuff and save it as a pickle afterwards
variable = Test(path, value)
In this case variable is empty if I read from pickle but correct if I do all the stuff...
If I want to cache some calculation results I will load/dump the object outside the class, something like,
# Load the cached object when the pickle exists, otherwise compute it.
# NOTE(review): data_path and name are assumed to be defined by the caller.
pickle_path = os.path.join(data_path, name.split('.')[1] + '.pickle')
if os.path.isfile(pickle_path):
    with open(pickle_path, 'rb') as f:
        variable = pickle.load(f)  # use cached results
else:
    variable = Test()  # do all the calculations

Can I configure python to have matlab like print?

Can I configure python to have matlab like print, so that when I just have a function
returnObject()
that it simply prints that object without me having to type print around it? I assume this is not easy, but something like if an object does not get bound by some other var it should get printed, so that this would work.
a = 5 #prints nothing
b = getObject() #prints nothing
a #prints 5
b #prints getObject()
getObject() #prints the object
If you use an ipython notebook individual cells work like this. But you can only view one object per cell by typing the objects name. To see multiple objects you'd need to call print, or use lots of cells.
You could write a script to modify the original script based on a set of rules that define what to print, then run the modified script.
A basic script to do this would be:
import keyword

# Wrap the contents of main.py in a main() function written to modified.py,
# turning every line that consists of a single bare name into print(<name>).
with open('main.py', 'r') as f, open('modified.py', 'w') as p:
    p.write('def main(): \n')
    for line in f:
        temp = line if line.endswith('\n') else line + '\n'
        stripped = line.strip()
        # A lone identifier (but not a keyword such as `pass` or `return`)
        # is an expression whose value should be shown.
        if stripped.isidentifier() and not keyword.iskeyword(stripped):
            indent = line[:len(line) - len(line.lstrip())]
            temp = indent + 'print(' + stripped + ')\n'
        p.write('\t' + temp)
    # Keeps main() syntactically valid when main.py is empty.
    p.write('\tpass\n')

from modified import main
main()
The script main.py would then look like this:
x = 236
x
output:
236
Idea is as follows: parse AST of Python code, replace every expression with call to print and content of expression as argument and then run the modified version. I'm not sure whether it works with every code, but you might try. Save it as matlab.py and run your code as python3 -m matlab file.py.
#!/usr/bin/env python3
import ast
import os
import sys
class PrintAdder(ast.NodeTransformer):
    """Wrap every expression statement in a call to ``print``.

    Expression statements that are already a direct ``print(...)`` call are
    left unchanged.
    """

    def add_print(self, node):
        """Return a new ``print(<node.value>)`` statement replacing *node*."""
        print_func = ast.Name(id="print", ctx=ast.Load())
        print_call = ast.Call(func=print_func, args=[node.value], keywords=[])
        print_statement = ast.Expr(value=print_call)
        # Keep the original position so tracebacks point at the right line.
        return ast.copy_location(print_statement, node)

    def visit_Expr(self, node):
        value = node.value
        # Only a plain name has `.id`; calls such as `obj.method()` or
        # `(lambda: 1)()` have a different kind of node as `func`.
        if (isinstance(value, ast.Call)
                and isinstance(value.func, ast.Name)
                and value.func.id == 'print'):
            return node
        return self.add_print(node)
def main():
    """Read a script (a file argument or stdin), add prints, and execute it."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('infile', type=argparse.FileType(), nargs='?', default='-')
    args = parser.parse_args()

    with args.infile as infile:
        code = infile.read()
        file_name = args.infile.name

    tree = ast.parse(code, file_name, 'exec')
    tree = PrintAdder().visit(tree)
    tree = ast.fix_missing_locations(tree)
    bytecode = compile(tree, file_name, 'exec')
    # Execute in a fresh module-like namespace: a bare exec() inside a
    # function stores the script's names in a throwaway locals dict, so
    # functions defined by the script cannot see its other top-level names.
    exec(bytecode, {'__name__': '__main__'})


if __name__ == '__main__':
    main()

How to assign the elements of a list as file names in python?

I am trying to assign the elements of a list as names for some files that live in a directory, so far I created a function that recover the name of a each file from a directory and returns them in a list:
def retrive(directory_path):
    """Return the sorted base names of the ``*.pdf`` files in *directory_path*.

    The list is also printed, as in the original snippet.
    """
    pattern = os.path.join(directory_path, '*.pdf')
    # basename handles the platform's path separator, unlike split('/').
    path_names = [os.path.basename(filename)
                  for filename in sorted(glob.glob(pattern))]
    print(path_names)
    return path_names
The above function returns in a list the names of each file, then I am writing the files into another directory as follows:
path = os.path.join(new_dir_path, "list%d.txt" % i)
# This is the path of each new file:
# print(path)
# The handle is named out_file so it is not mistaken for a file name.
with codecs.open(path, "w", encoding='utf8') as out_file:
    for item in [a_list]:
        out_file.write(item+"\n")
Finally, my question is: how can I assign as a name of each file, each element of path_names?, something like this line:
path = os.path.join(new_dir_path, "list%d.txt" % i)
I also tried to use the format() function. However I still cant assign the the correct name to each file.
Here's the full script:
def transform_directoy(input_directory, output_directory):
    """Extract the text of every PDF in *input_directory* into numbered files.

    The PDFs are processed in sorted order; the text of the i-th one is
    written to ``list<i>.txt`` in *output_directory*, UTF-8 encoded and
    followed by a newline.
    """
    import codecs
    import glob
    import os
    from tika import parser

    all_texts = []
    for filename in sorted(glob.glob(os.path.join(input_directory, '*.pdf'))):
        parsed = parser.from_file(filename)
        all_texts.append(parsed['content'])

    for i, text in enumerate(all_texts):
        path = os.path.join(output_directory, "list%d.txt" % i)
        # The handle gets its own name so it does not shadow `filename`.
        with codecs.open(path, "w", encoding='utf8') as out_file:
            out_file.write(text + "\n")
The desired output will consist of the actual names of each processed file.
You’re almost there:
for path_name in path_names:
    path = os.path.join(new_dir_path, "list%s.txt" % path_name)
    # This is the path of each new file:
    # print(path)
    with codecs.open(path, "w", encoding='utf8') as f:
        for item in [a_list]:
            f.write(item+"\n")
Update based on updated code sample. You are using different loops here, and that is not ideal unless you are doing processing in between the two loops. Since I am going to keep that structure, we are going to have to make sure to associate each block of content with the original filename. The best structure for that is a dict, and in case order is important, we use an OrderedDict. Now, when we’re looping over the filename, content pairs in the OrderedDict we’ll want to change the extension of the file to match the new file type. Luckily, python has some nice utilities for file/path manipulation in the os.path module. os.path.basename can be used to strip off the directory from a file and os.path.splitext will strip off an extension from a filename. We use both of those to get just the filename without the extension and then append .txt to designate the new file type. Putting it all together, we get :
def transform_directoy(input_directory, output_directory):
    """Extract the text of every PDF in *input_directory* into ``.txt`` files.

    Each output file is named after its source PDF with the extension
    replaced by ``.txt`` and is written to *output_directory*, UTF-8 encoded
    and followed by a newline.
    """
    import codecs
    import glob
    import os
    from collections import OrderedDict
    from tika import parser

    # Maps the PDF's base name to its extracted text, in processing order.
    all_texts = OrderedDict()
    for filename in sorted(glob.glob(os.path.join(input_directory, '*.pdf'))):
        parsed = parser.from_file(filename)
        all_texts[os.path.basename(filename)] = parsed['content']

    for original_filename, text in all_texts.items():
        new_filename, _ = os.path.splitext(original_filename)
        new_filename += '.txt'
        path = os.path.join(output_directory, new_filename)

        # Print out the name of the file we are processing
        print('Transforming %s => %s' % (original_filename, path,))
        with codecs.open(path, "w", encoding='utf8') as out_file:
            out_file.write(text + "\n")
Second update: OP asked how I would write this code if this was all that there was, so here goes:
# move imports to top of file: PEP 8
import codecs, glob, os
from tika import parser
def transform_directoy(input_directory, output_directory):
    """Extract the text of every PDF in *input_directory* into ``.txt`` files.

    Each output file is named after its source PDF with the extension
    replaced by ``.txt`` and is written to *output_directory*, UTF-8 encoded
    and followed by a newline.
    """
    for filename in sorted(glob.glob(os.path.join(input_directory, '*.pdf'))):
        parsed = parser.from_file(filename)
        parsed_content = parsed['content']
        original_filename = os.path.basename(filename)
        new_filename, _ = os.path.splitext(original_filename)
        new_filename += '.txt'
        path = os.path.join(output_directory, new_filename)

        # Print out the name of the file we are processing
        print('Transforming %s => %s' % (original_filename, path,))

        # no need for a second loop since we can piggy back off the first loop
        # (the handle gets its own name so the loop variable is not shadowed)
        with codecs.open(path, "w", encoding='utf8') as out_file:
            # No need for a for loop here since our list only has one item
            out_file.write(parsed_content)
            out_file.write("\n")

LRU cache on hard drive python

I want to be able to decorate a function as you would do with functools.lru_cache, however, I want the results to be cached on the hard drive and not in memory. Looking around, I get a feeling this is a solved problem, and I was wondering if anyone could point me in the right direction (or at least give me a few more keywords to try googling)
I don't know if this will help or if it matters, but the function is computing images from unique filenames.
Here's some code to get you started:
from pathlib import Path
import pickle
import hashlib
import os
class LRU_Cache:
    """Disk-backed least-recently-used cache around a function.

    Each distinct tuple of positional arguments maps to one pickle file in
    *directory*. A cache hit refreshes the file's modification time; after a
    miss, the file with the oldest modification time is removed once the
    directory holds more than *maxsize* files.

    NOTE: results are restored with pickle, so the cache directory must only
    contain files written by trusted code.
    """

    def __init__(self, directory, original_function, maxsize=10):
        self.directory = directory
        self.original_function = original_function
        self.maxsize = maxsize
        # An existing directory is fine; other failures (e.g. permissions)
        # surface here instead of at the first call.
        os.makedirs(directory, exist_ok=True)

    def __call__(self, *args):
        """Return the cached result for *args*, computing it on a miss."""
        filename = hashlib.sha1(pickle.dumps(args)).hexdigest()
        fullname = os.path.join(self.directory, filename)
        try:
            with open(fullname, 'rb') as f:
                value = pickle.load(f)
        except FileNotFoundError:
            pass
        else:
            # Mark the entry as recently used.
            Path(fullname).touch()
            return value

        value = self.original_function(*args)
        with open(fullname, 'wb') as f:
            pickle.dump(value, f)

        fullnames = [os.path.join(self.directory, name)
                     for name in os.listdir(self.directory)]
        if len(fullnames) > self.maxsize:
            # Prefer evicting an older entry over the one just written, which
            # could otherwise lose a modification-time tie.
            candidates = [fn for fn in fullnames if fn != fullname] or fullnames
            oldest = min(candidates, key=lambda fn: os.stat(fn).st_mtime)
            os.remove(oldest)
        return value
It hashes the arguments to create a unique filename for each function call. The function's return value is pickled to that file.
Cache hits unpickle the stored result and update the file modification time.
If the cache directory exceeds a target size, the oldest cache file is removed.
Use it like this:
def square(x):
    """Return x squared; prints '!' so that real (uncached) calls are visible."""
    print('!')
    return x ** 2
sqr = LRU_Cache('square_cache', square, 10)
Now call sqr normally and results will be cached to disk.

Categories