How to read from a text file compressed with 7z? - python

In Python 2.7, I would like to read, line by line, from a csv (text) file that is 7z-compressed. I don't want to decompress the entire (large) file, but to stream the lines.
I tried pylzma.decompressobj() unsuccessfully. I get a data error. Note that this code doesn't yet read line by line:
    input_filename = r"testing.csv.7z"
    with open(input_filename, 'rb') as infile:
        obj = pylzma.decompressobj()
        o = open('decompressed.raw', 'wb')
        while True:
            tmp = infile.read(1)
            if not tmp:
                break
            o.write(obj.decompress(tmp))
        o.close()
Output:

    o.write(obj.decompress(tmp))
    ValueError: data error during decompression

This will allow you to iterate the lines. It's partially derived from some code I found in an answer to another question.
At this point in time (pylzma-0.5.0), the py7zlib module doesn't implement an API that would allow archive members to be read as a stream of bytes or characters: its ArchiveFile class only provides a read() function that decompresses and returns the uncompressed data in a member all at once. Given that, about the best that can be done is to return bytes or lines iteratively via a Python generator, using that all-at-once result as a buffer.
The following does the latter, but may not help if the problem is that the archive member file itself is huge.
The code below should work in Python 3.x as well as 2.7.
    import io
    import os

    import py7zlib


    class SevenZFileError(py7zlib.ArchiveError):
        pass


    class SevenZFile(object):
        @classmethod
        def is_7zfile(cls, filepath):
            """ Determine if filepath points to a valid 7z archive. """
            is7z = False
            fp = None
            try:
                fp = open(filepath, 'rb')
                archive = py7zlib.Archive7z(fp)
                _ = len(archive.getnames())
                is7z = True
            finally:
                if fp:
                    fp.close()
            return is7z

        def __init__(self, filepath):
            fp = open(filepath, 'rb')
            self.filepath = filepath
            self.archive = py7zlib.Archive7z(fp)

        def __contains__(self, name):
            return name in self.archive.getnames()

        def readlines(self, name, newline=''):
            r""" Iterator of lines from named archive member.

            `newline` controls how line endings are handled.

            It can be None, '', '\n', '\r', and '\r\n' and works the same way
            as it does in StringIO. Note however that the default value is
            different and is to enable universal newlines mode, but line
            endings are returned untranslated.
            """
            archivefile = self.archive.getmember(name)
            if not archivefile:
                raise SevenZFileError('archive member %r not found in %r' %
                                      (name, self.filepath))

            # Decompress entire member and return its contents iteratively.
            data = archivefile.read().decode()
            for line in io.StringIO(data, newline=newline):
                yield line


    if __name__ == '__main__':
        import csv

        if SevenZFile.is_7zfile('testing.csv.7z'):
            sevenZfile = SevenZFile('testing.csv.7z')
            if 'testing.csv' not in sevenZfile:
                print('testing.csv is not a member of testing.csv.7z')
            else:
                reader = csv.reader(sevenZfile.readlines('testing.csv'))
                for row in reader:
                    print(', '.join(row))

If you were using Python 3.3+, you might be able to do this using the lzma module which was added to the standard library in that version.
See: lzma Examples
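For example, if the data were a plain .xz stream rather than a .7z container (the lzma module reads .xz and raw LZMA streams, not 7z archives), a minimal sketch of line-by-line streaming could look like this:

    import lzma

    # works for .xz / raw LZMA streams only, not for .7z archives
    with lzma.open('testing.csv.xz', 'rt') as f:
        for line in f:
            print(line, end='')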

If you can use Python 3, there is a useful library, py7zr, which supports partial 7zip decompression, as below:
    import py7zr
    import re

    filter_pattern = re.compile(r'<your/target/file_and_directories/regex/expression>')
    with py7zr.SevenZipFile('archive.7z', 'r') as archive:
        allfiles = archive.getnames()
        selective_files = [f for f in allfiles if filter_pattern.match(f)]
        archive.extract(targets=selective_files)
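If you would rather get a member's contents in memory instead of extracting to disk, py7zr's read() can return the selected members as file-like objects; a rough sketch (the member name testing.csv is an assumption):

    import py7zr

    with py7zr.SevenZipFile('archive.7z', 'r') as archive:
        # read() returns a dict mapping member names to BytesIO objects
        data = archive.read(targets=['testing.csv'])
        for line in data['testing.csv'].read().decode().splitlines():
            print(line)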


Reading a binary file Python (pickle) [duplicate]

I created some data and stored it several times like this:

    with open('filename', 'a') as f:
        pickle.dump(data, f)

Every time, the size of the file increased, but when I open the file

    with open('filename', 'rb') as f:
        x = pickle.load(f)

I can only see the data from the last time. How can I correctly read the file?
Pickle serializes a single object at a time, and reads back a single object; the pickled data is recorded in sequence in the file.
If you simply call pickle.load you should be reading the first object serialized into the file (not the last one as you've written).
After deserializing the first object, the file pointer is at the beginning of the next object; if you simply call pickle.load again, it will read that next object. Do that until the end of the file.
    import pickle

    objects = []
    with open("myfile", "rb") as openfile:
        while True:
            try:
                objects.append(pickle.load(openfile))
            except EOFError:
                break
There is a read_pickle function as part of pandas 0.22+:

    import pandas as pd

    obj = pd.read_pickle(r'filepath')
The following is an example of how you might write and read a pickle file. Note that if you keep appending pickle data to the file, you will need to continue reading from the file until you find what you want or an exception is generated by reaching the end of the file. That is what the last function does.
    import os
    import pickle

    PICKLE_FILE = 'pickle.dat'


    def main():
        # append data to the pickle file
        add_to_pickle(PICKLE_FILE, 123)
        add_to_pickle(PICKLE_FILE, 'Hello')
        add_to_pickle(PICKLE_FILE, None)
        add_to_pickle(PICKLE_FILE, b'World')
        add_to_pickle(PICKLE_FILE, 456.789)

        # load & show all stored objects
        for item in read_from_pickle(PICKLE_FILE):
            print(repr(item))

        os.remove(PICKLE_FILE)


    def add_to_pickle(path, item):
        with open(path, 'ab') as file:
            pickle.dump(item, file, pickle.HIGHEST_PROTOCOL)


    def read_from_pickle(path):
        with open(path, 'rb') as file:
            try:
                while True:
                    yield pickle.load(file)
            except EOFError:
                pass


    if __name__ == '__main__':
        main()
I developed a software tool that opens (most) pickle files directly in your browser (nothing is transferred, so it's 100% private):
https://pickleviewer.com/ (formerly)
Now it's hosted here: https://fire-6dcaa-273213.web.app/
Edit: available here if you want to host it somewhere: https://github.com/ch-hristov/Pickle-viewer
Feel free to host this somewhere.

How to edit a file in python without a temporary sacrificial file? [duplicate]

I'm using Python, and would like to insert a string into a text file without deleting or copying the file. How can I do that?
Unfortunately there is no way to insert into the middle of a file without rewriting it. As previous posters have indicated, you can append to a file or overwrite part of it using seek, but if you want to add stuff at the beginning or the middle, you'll have to rewrite it.
This is an operating system thing, not a Python thing. It is the same in all languages.
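To make the overwrite case concrete, here is a minimal sketch (the offset and bytes are arbitrary): writing at a seek position replaces bytes in place, it never shifts the rest of the file over.

    with open("foo.txt", "r+b") as f:
        f.seek(10)       # jump to byte offset 10
        f.write(b"XYZ")  # overwrites exactly 3 bytes; nothing is inserted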
What I usually do is read from the file, make the modifications, and write them out to a new file called myfile.txt.tmp or something like that. This is better than reading the whole file into memory, because the file may be too large for that. Once the temporary file is completed, I rename it to the same name as the original file.
This is a good, safe way to do it, because if the file write crashes or aborts for any reason, you still have your untouched original file.
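A minimal sketch of that pattern, assuming Python 3.3+ for os.replace and a placeholder transformation:

    import os
    import tempfile

    src = "myfile.txt"  # placeholder input file
    dirname = os.path.dirname(os.path.abspath(src))

    # write the modified content to a temp file in the same directory
    with open(src) as infile, \
            tempfile.NamedTemporaryFile('w', dir=dirname, delete=False) as outfile:
        for line in infile:
            outfile.write(line.replace('old', 'new'))

    # swap the finished temp file in place of the original
    os.replace(outfile.name, src)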
Depends on what you want to do. To append, you can open the file with "a":

    with open("foo.txt", "a") as f:
        f.write("new line\n")

If you want to prepend something, you have to read from the file first:

    with open("foo.txt", "r+") as f:
        old = f.read()  # read everything in the file
        f.seek(0)  # rewind
        f.write("new line\n" + old)  # write the new line before
The fileinput module of the Python standard library will rewrite a file in place if you use the inplace=1 parameter:

    import sys
    import fileinput

    # replace all occurrences of 'sit' with 'SIT' and insert a line after the 5th
    for i, line in enumerate(fileinput.input('lorem_ipsum.txt', inplace=1)):
        sys.stdout.write(line.replace('sit', 'SIT'))  # replace 'sit' and write
        if i == 4:
            sys.stdout.write('\n')  # write a blank line after the 5th line
Rewriting a file in place is often done by saving the old copy with a modified name. Unix folks add a ~ to mark the old one. Windows folks do all kinds of things -- add .bak or .old -- or rename the file entirely or put the ~ on the front of the name.
    import shutil

    shutil.move(aFile, aFile + "~")

    destination = open(aFile, "w")
    source = open(aFile + "~", "r")
    for line in source:
        destination.write(line)
        if <some condition>:
            destination.write(<some additional line> + "\n")
    source.close()
    destination.close()
Instead of shutil, you can use the following:

    import os

    os.rename(aFile, aFile + "~")
Python's mmap module will allow you to insert into a file. The following sample shows how it can be done in Unix (Windows mmap may be different). Note that this does not handle all error conditions and you might corrupt or lose the original file. Also, this won't handle unicode strings.
    import os
    from mmap import mmap

    def insert(filename, data, pos):
        if len(data) < 1:
            # nothing to insert
            return

        f = open(filename, 'r+')
        m = mmap(f.fileno(), os.path.getsize(filename))
        origSize = m.size()

        # clamp the position to the file bounds (or this could be an error)
        if pos > origSize:
            pos = origSize
        elif pos < 0:
            pos = 0

        # grow the map, shift the tail right, then drop the new data in the gap
        m.resize(origSize + len(data))
        m[pos + len(data):] = m[pos:origSize]
        m[pos:pos + len(data)] = data
        m.close()
        f.close()
It is also possible to do this without mmap, with files opened in 'r+' mode, but it is less convenient and less efficient, as you'd have to read and temporarily store the contents of the file from the insertion position to EOF, which might be huge.
As mentioned by Adam, you have to take your system limitations into consideration before deciding on an approach: do you have enough memory to read the whole file in, replace parts of it, and rewrite it?
If you're dealing with a small file or have no memory issues, this might help:

Option 1)

Read the entire file into memory, do a regex substitution on the entire line or part of it, and replace it with that line plus the extra line. You will need to make sure that the 'middle line' is unique in the file, or, if you have timestamps on each line, this should be pretty reliable.

    import re

    # open file with r+b (allow write and binary mode)
    f = open("file.log", 'r+b')
    # read entire content of file into memory
    f_content = f.read()
    # basically match middle line and replace it with itself and the extra line
    f_content = re.sub(r'(middle line)', r'\1\nnew line', f_content)
    # return pointer to top of file so we can re-write the content with replaced string
    f.seek(0)
    # clear file content
    f.truncate()
    # re-write the content with the updated content
    f.write(f_content)
    # close file
    f.close()
Option 2)

Figure out the middle line and replace it with that line plus the extra line.

    # open file with r+b (allow write and binary mode)
    f = open("file.log", 'r+b')
    # get array of lines
    f_content = f.readlines()
    # get middle line (integer division, so this also works on Python 3)
    middle_line = len(f_content) // 2
    # overwrite middle line
    f_content[middle_line] += "\nnew line"
    # return pointer to top of file so we can re-write the content with replaced string
    f.seek(0)
    # clear file content
    f.truncate()
    # re-write the content with the updated content
    f.write(''.join(f_content))
    # close file
    f.close()
I wrote a small class for doing this cleanly.

    import tempfile


    class FileModifierError(Exception):
        pass


    class FileModifier(object):

        def __init__(self, fname):
            self.__write_dict = {}
            self.__filename = fname
            self.__tempfile = tempfile.TemporaryFile()
            with open(fname, 'rb') as fp:
                for line in fp:
                    self.__tempfile.write(line)
            self.__tempfile.seek(0)

        def write(self, s, line_number='END'):
            if line_number != 'END' and not isinstance(line_number, (int, float)):
                raise FileModifierError("Line number %s is not a valid number" % line_number)
            try:
                self.__write_dict[line_number].append(s)
            except KeyError:
                self.__write_dict[line_number] = [s]

        def writeline(self, s, line_number='END'):
            self.write('%s\n' % s, line_number)

        def writelines(self, s, line_number='END'):
            for ln in s:
                self.writeline(ln, line_number)

        def __popline(self, index, fp):
            try:
                ilines = self.__write_dict.pop(index)
                for line in ilines:
                    fp.write(line)
            except KeyError:
                pass

        def close(self):
            self.__exit__(None, None, None)

        def __enter__(self):
            return self

        def __exit__(self, type, value, traceback):
            with open(self.__filename, 'w') as fp:
                for index, line in enumerate(self.__tempfile.readlines()):
                    self.__popline(index, fp)
                    fp.write(line)
                for index in sorted(self.__write_dict):
                    for line in self.__write_dict[index]:
                        fp.write(line)
            self.__tempfile.close()
Then you can use it this way:

    with FileModifier(filename) as fp:
        fp.writeline("String 1", 0)
        fp.writeline("String 2", 20)
        fp.writeline("String 3")  # to write at the end of the file
If you know some unix, you could try the following:

Note: $ means the command prompt.

Say you have a file my_data.txt with content as such:

    $ cat my_data.txt
    This is a data file
    with all of my data in it.

Then using the os module you can run the usual sed commands:

    import os

    # identifiers used are:
    my_data_file = "my_data.txt"
    command = "sed -i 's/all/none/' " + my_data_file

    # execute the command
    os.system(command)
If you aren't aware of sed, check it out, it is extremely useful.

Unified opening of .csv and .csv.gz files

In Python 2.7, I would like to open a file and do some manipulations with it. The problem is that I do not know beforehand if it has a .csv or a .csv.gz extension. If I knew it was .csv, I would do
    with open(filename, "r") as f_in:
        do something

If I knew it was .csv.gz, I could say

    import gzip
    with gzip.open(filename, "r") as f_in:
        do something
I am curious if there is a way to avoid repetition after figuring out the file extension:

    def find_ext(filename):
        return filename.split(".")[-1]

    ext = find_ext(filename)
    if ext == "csv":
        with open(filename, "r") as f_in:
            do something
    elif ext == "gz":
        import gzip
        with gzip.open(filename, "r") as f_in:
            do something
I wouldn't bother looking at the file extension: files get renamed or use non-standard variations.
Instead, open the raw file and examine the header. If it begins with two 32-bit words 0x00088b1f and 0, it is a gzip file.
    import struct

    f = open(filename, 'rb')
    v = f.read(8)
    # unpack both little-endian 32-bit words at once; calling unpack('I', v)
    # on all 8 bytes would raise struct.error
    v1, v2 = struct.unpack('<II', v)
    if v1 == 0x00088b1f and v2 == 0:
        pass  # it is gzip
    import gzip
    import mimetypes

    # decide first, then open: a dict-based one-liner would eagerly open
    # the file twice (once via gzip.open, once via open)
    def smart_open(fn):
        if mimetypes.guess_type(fn)[1] == "gzip":
            return gzip.open(fn)
        return open(fn)

    # usage:
    f = smart_open("test.csv.gz")
    f = smart_open("test.csv")
Since different libraries are used for the different cases, a check of the extension is required. But one can go with a construction like:

    try:
        ...
    except AnError:
        ...
    else:
        ...
    finally:
        ...

Anyway, one type of construction (if/else) or the other (try/finally) is necessary, since the file-opening process is not the same in the two cases.
To avoid repetition, use an approach like this (it is pseudocode; a concrete version follows below):

    def readcsv(file):
        ...

    def readgzip(file):
        ...

    if csv:
        readcsv(file)
    elif gzip:
        readgzip(file)
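Making that concrete, a minimal sketch (the helper name open_maybe_gz is mine; under Python 2.7 both gzip.GzipFile and plain file objects support iteration and the with statement, so the caller never has to care which one it got):

    import gzip

    def open_maybe_gz(filename):
        if filename.endswith(".gz"):
            return gzip.open(filename, "r")
        return open(filename, "r")

    with open_maybe_gz("data.csv.gz") as f_in:
        for line in f_in:
            pass  # do something with line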

function for loading both strings and files on disk?

I have a design question. I have a function loadImage() for loading an image file. Currently it accepts a string which is a file path. But I also want to be able to load files which are not on physical disk, e.g. generated procedurally. I could have it accept a string, but then how could it know whether the string is a file path or file data? I could add an extra boolean argument to specify that, but that doesn't sound very clean. Any ideas?
It's something like this now:
    def loadImage(filepath):
        file = open(filepath, 'rb')
        data = file.read()
        # do stuff with data

The other version would be

    def loadImage(data):
        # do stuff with data
How can this function accept both 'filepath' and 'data' and guess which one it got?
You can change your loadImage function to expect an opened file-like object, such as:

    def load_image(f):
        data = f.read()
        # do stuff with data

...and then have it called from two functions, one of which expects a path and the other a string that contains the data:

    from StringIO import StringIO

    def load_image_from_path(path):
        with open(path, 'rb') as f:
            load_image(f)

    def load_image_from_string(s):
        sio = StringIO(s)
        try:
            load_image(sio)
        finally:
            sio.close()
How about just creating two functions, loadImageFromString and loadImageFromFile?
This being Python, you can easily distinguish between a filename and a data string. I would do something like this:

    import os.path as P
    from StringIO import StringIO

    def load_image(im):
        fin = None
        if P.isfile(im):
            fin = open(im, 'rb')
        else:
            fin = StringIO(im)
        # Read from fin like you would from any open file object
Other ways to do it would be a try block instead of using os.path, but the essence of the approach remains the same.
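That try-based variant might look something like this (a sketch, assuming a failure to open means the argument was the data itself):

    from StringIO import StringIO

    def load_image(im):
        # try to treat the argument as a path first; if opening fails,
        # fall back to treating it as the image data itself
        try:
            fin = open(im, 'rb')
        except (IOError, TypeError):
            fin = StringIO(im)
        # Read from fin like you would from any open file object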

Using ConfigParser to read a file without section name

I am using ConfigParser to read the runtime configuration of a script.
I would like to have the flexibility of not providing a section name (some scripts are simple enough not to need a 'section'). ConfigParser will throw a NoSectionError exception and will not accept the file.
How can I make ConfigParser simply retrieve the (key, value) tuples of a config file without section names?
For instance:

    key1=val1
    key2:val2
I would rather not write to the config file.
You can do this in a single line of code.
In Python 3, prepend a fake section header to your config file data and pass it to read_string():

    from configparser import ConfigParser

    parser = ConfigParser()
    with open("foo.conf") as stream:
        parser.read_string("[top]\n" + stream.read())  # This line does the trick.
You could also use itertools.chain() to simulate a section header for read_file(). This might be more memory-efficient than the above approach, which might be helpful if you have large config files in a constrained runtime environment.

    from configparser import ConfigParser
    from itertools import chain

    parser = ConfigParser()
    with open("foo.conf") as lines:
        lines = chain(("[top]",), lines)  # This line does the trick.
        parser.read_file(lines)
In Python 2, prepend a fake section header to your config file data, wrap the result in a StringIO object, and pass it to readfp():

    from ConfigParser import ConfigParser
    from StringIO import StringIO

    parser = ConfigParser()
    with open("foo.conf") as stream:
        stream = StringIO("[top]\n" + stream.read())  # This line does the trick.
        parser.readfp(stream)
With any of these approaches, your config settings will be available in parser.items('top').
You could use StringIO in Python 3 as well, perhaps for compatibility with both old and new Python interpreters, but note that it now lives in the io package and that readfp() is deprecated in favor of read_file().
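In that case the Python 2 snippet above translates almost mechanically; a sketch:

    from configparser import ConfigParser
    from io import StringIO

    parser = ConfigParser()
    with open("foo.conf") as stream:
        stream = StringIO("[top]\n" + stream.read())
        parser.read_file(stream)  # read_file() replaces the deprecated readfp()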
Alternatively, you might consider using a TOML parser instead of ConfigParser.
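For example, with the tomllib module added in Python 3.11, top-level key/value pairs need no section at all, though TOML requires quoted string values, so the file format changes slightly; a sketch:

    import tomllib  # Python 3.11+; earlier versions can use the third-party tomli

    # foo.toml contains top-level pairs such as: key1 = "val1"
    with open("foo.toml", "rb") as f:
        settings = tomllib.load(f)

    print(settings["key1"])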
Alex Martelli provided a solution for using ConfigParser to parse .properties files (which are apparently section-less config files).
His solution is a file-like wrapper that will automagically insert a dummy section heading to satisfy ConfigParser's requirements.
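The gist of such a wrapper, as a sketch rather than Martelli's exact code: ConfigParser's readfp() pulls input via readline(), so only that one method needs faking.

    class FakeSectionHead(object):
        # yield a dummy section header line first, then delegate every
        # subsequent readline() call to the real file object
        def __init__(self, fp, section='asection'):
            self.fp = fp
            self.sechead = '[%s]\n' % section

        def readline(self):
            if self.sechead:
                try:
                    return self.sechead
                finally:
                    self.sechead = None
            return self.fp.readline()

    # usage: parser.readfp(FakeSectionHead(open('foo.conf')))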
Enlightened by this answer by jterrace, I came up with this solution:

Read the entire file into a string
Prefix it with a default section name
Use StringIO to mimic a file-like object

    import ConfigParser
    import StringIO

    ini_str = '[root]\n' + open(ini_path, 'r').read()
    ini_fp = StringIO.StringIO(ini_str)
    config = ConfigParser.RawConfigParser()
    config.readfp(ini_fp)
EDIT for future googlers: as of Python 3.4+, readfp is deprecated and StringIO is not needed anymore. Instead, we can use read_string directly (in Python 3 the module is called configparser):

    import configparser

    with open('config_file') as f:
        file_content = '[dummy_section]\n' + f.read()

    config_parser = configparser.RawConfigParser()
    config_parser.read_string(file_content)
You can use the ConfigObj library to do that simply: http://www.voidspace.org.uk/python/configobj.html

Updated: find the latest code here.

If you are under Debian/Ubuntu, you can install this module using your package manager:

    apt-get install python-configobj

An example of use:

    from configobj import ConfigObj

    config = ConfigObj('myConfigFile.ini')
    config.get('key1')  # You will get val1
    config.get('key2')  # You will get val2
The easiest way to do this is to use Python's CSV parser, in my opinion. Here's a read/write function pair demonstrating this approach, as well as a test driver. This should work, provided the values are not allowed to be multi-line. :)
    import csv
    import operator


    def read_properties(filename):
        """ Reads a given properties file with each line of the format
            key=value. Returns a dictionary containing the pairs.

        Keyword arguments:
            filename -- the name of the file to be read
        """
        result = {}
        with open(filename, "rb") as csvfile:
            reader = csv.reader(csvfile, delimiter='=', escapechar='\\',
                                quoting=csv.QUOTE_NONE)
            for row in reader:
                if len(row) != 2:
                    raise csv.Error("Too many fields on row with contents: " + str(row))
                result[row[0]] = row[1]
        return result


    def write_properties(filename, dictionary):
        """ Writes the provided dictionary in key-sorted order to a properties
            file with each line of the format key=value.

        Keyword arguments:
            filename -- the name of the file to be written
            dictionary -- a dictionary containing the key/value pairs.
        """
        with open(filename, "wb") as csvfile:
            writer = csv.writer(csvfile, delimiter='=', escapechar='\\',
                                quoting=csv.QUOTE_NONE)
            for key, value in sorted(dictionary.items(), key=operator.itemgetter(0)):
                writer.writerow([key, value])


    def main():
        data = {
            "Hello": "5+5=10",
            "World": "Snausage",
            "Awesome": "Possum",
        }

        filename = "test.properties"
        write_properties(filename, data)
        newdata = read_properties(filename)

        print "Read in: "
        print newdata
        print

        contents = ""
        with open(filename, 'rb') as propfile:
            contents = propfile.read()
        print "File contents:"
        print contents

        print ["Failure!", "Success!"][data == newdata]
        return


    if __name__ == '__main__':
        main()
Having run into this problem myself, I wrote a complete wrapper for ConfigParser (the version in Python 2) that can transparently read and write files without sections, based on Alex Martelli's approach linked in the accepted answer. It should be a drop-in replacement for any usage of ConfigParser. Posting it in case anyone in need of it finds this page.
    import ConfigParser
    import StringIO


    class SectionlessConfigParser(ConfigParser.RawConfigParser):
        """
        Extends ConfigParser to allow files without sections.

        This is done by wrapping read files and prepending them with a
        placeholder section, which defaults to '__config__'.
        """

        def __init__(self, *args, **kwargs):
            default_section = kwargs.pop('default_section', None)
            ConfigParser.RawConfigParser.__init__(self, *args, **kwargs)

            self._default_section = None
            self.set_default_section(default_section or '__config__')

        def get_default_section(self):
            return self._default_section

        def set_default_section(self, section):
            self.add_section(section)

            # move all values from the previous default section to the new one
            try:
                default_section_items = self.items(self._default_section)
                self.remove_section(self._default_section)
            except ConfigParser.NoSectionError:
                pass
            else:
                for (key, value) in default_section_items:
                    self.set(section, key, value)

            self._default_section = section

        def read(self, filenames):
            if isinstance(filenames, basestring):
                filenames = [filenames]

            read_ok = []
            for filename in filenames:
                try:
                    with open(filename) as fp:
                        self.readfp(fp)
                except IOError:
                    continue
                else:
                    read_ok.append(filename)

            return read_ok

        def readfp(self, fp, *args, **kwargs):
            stream = StringIO.StringIO()

            try:
                stream.name = fp.name
            except AttributeError:
                pass

            stream.write('[' + self._default_section + ']\n')
            stream.write(fp.read())
            stream.seek(0, 0)

            return ConfigParser.RawConfigParser.readfp(self, stream, *args,
                                                       **kwargs)

        def write(self, fp):
            # Write the items from the default section manually and then remove
            # them from the data. They'll be re-added later.
            default_section_items = []
            try:
                default_section_items = self.items(self._default_section)
                self.remove_section(self._default_section)

                for (key, value) in default_section_items:
                    fp.write("{0} = {1}\n".format(key, value))

                fp.write("\n")
            except ConfigParser.NoSectionError:
                pass

            ConfigParser.RawConfigParser.write(self, fp)

            self.add_section(self._default_section)
            for (key, value) in default_section_items:
                self.set(self._default_section, key, value)
Blueicefield's answer mentioned configobj, but the original lib only supports Python 2. It now has a Python 3+ compatible port:
https://github.com/DiffSK/configobj
The APIs haven't changed; see its docs.
