Files added to tarfile come back as empty files - python

I am trying to add a file to a gzipped tarfile in python
import tarfile
# create test file
with open("testfile.txt", "w") as f:
f.write("TESTTESTTEST")
# create archive
with tarfile.open("archfile.tar.gz", "x:gz") as archive:
with open("testfile.txt", 'rb') as f:
archive.addfile(tarfile.TarInfo("testfile.txt"), f)
# read test file out of archive
with tarfile.open("archfile.tar.gz", "r:gz") as archive:
print(archive.extractfile("testfile.txt").read())
The result is b'' - an empty bytestring.
The file is not empty - if I try to read the file using the following code:
with open("testfile.txt", 'rb') as f:
print(f.read())
... I get b'TESTTESTTEST'
Is there something obvious I am missing? My end goal is to add the string in memory using f = io.StringIO('TESTTESTTEST')
I also tried removing the :gz and I see the same problem with a raw tar archive.
For additional info - I'm using Python 3 in a jupyter session on Windows 10. I see the same problem in Windows/Python 3.5.2/PyCharm.

I hit a similar problem. The documentation says that when you call tar.addfile it will write TarInfo.size bytes from the given file. That means that you have to either create the TarInfo with the file size or use tar.add() instead of tar.addfile:
# create archive V1
with tarfile.open("archfile.tar.gz", "x:gz") as archive:
with open("testfile.txt", 'rb') as f:
info = archive.gettarinfo("testfile.txt")
archive.addfile(info, f)
# create archive V2
with tarfile.open("archfile.tar.gz", "x:gz") as archive:
archive.add("testfile.txt")
# create archive V3
with tarfile.open("archfile.tar.gz", "w:gz") as archive:
with io.BytesIO(b"TESTTESTTEST") as f:
info = tarfile.TarInfo("testfile.txt")
f.seek(0, io.SEEK_END)
info.size = f.tell()
f.seek(0, io.SEEK_SET)
archive.addfile(info, f)

You can use the StringIO module (io.BytesIO on Python 3) to write the content as a file object to the tar file.
Sample:
import io
import tarfile

# Read the payload once so its exact byte length is known up front;
# addfile() copies exactly TarInfo.size bytes from the file object.
with open("testfile.txt", 'rb') as f:
    data = f.read()

info = tarfile.TarInfo(name="testfile.txt")
info.size = len(data)

# "w:gz" actually gzip-compresses the archive so it matches the .tar.gz name,
# and the context manager closes the archive even if addfile() raises.
with tarfile.open("archfile.tar.gz", "w:gz") as tar:
    tar.addfile(tarinfo=info, fileobj=io.BytesIO(data))

Not a perfect answer but I managed to work around this with zipfile.
import zipfile
import io
# create archive
with zipfile.ZipFile("archfile.zip", "w") as archive:
with io.StringIO("TESTTESTTEST") as f:
archive.writestr("1234.txt", f.read())
# read test file out of archive
with zipfile.ZipFile("archfile.zip", "r") as archive:
print(archive.read("1234.txt"))
produces b'TESTTESTTEST'

Related

os.write() appends file instead of overwriting, but O_APPEND isn't used [duplicate]

I have the following code:
import re
#open the xml file for reading:
file = open('path/test.xml','r+')
#convert to string:
data = file.read()
file.write(re.sub(r"<string>ABC</string>(\s+)<string>(.*)</string>",r"<xyz>ABC</xyz>\1<xyz>\2</xyz>",data))
file.close()
where I'd like to replace the old content that's in the file with the new content. However, when I execute my code, the file "test.xml" is appended to, i.e. I have the old content followed by the new "replaced" content. What can I do in order to delete the old stuff and only keep the new?
You need to seek to the beginning of the file before writing and then use file.truncate() if you want to do an in-place replace:
import re
myfile = "path/test.xml"
with open(myfile, "r+") as f:
data = f.read()
f.seek(0)
f.write(re.sub(r"<string>ABC</string>(\s+)<string>(.*)</string>", r"<xyz>ABC</xyz>\1<xyz>\2</xyz>", data))
f.truncate()
The other way is to read the file then open it again with open(myfile, 'w'):
with open(myfile, "r") as f:
data = f.read()
with open(myfile, "w") as f:
f.write(re.sub(r"<string>ABC</string>(\s+)<string>(.*)</string>", r"<xyz>ABC</xyz>\1<xyz>\2</xyz>", data))
Neither truncate nor open(..., 'w') will change the inode number of the file (I tested twice, once with Ubuntu 12.04 NFS and once with ext4).
By the way, this is not really related to Python. The interpreter calls the corresponding low level API. The method truncate() works the same in the C programming language: See http://man7.org/linux/man-pages/man2/truncate.2.html
file='path/test.xml'
with open(file, 'w') as filetowrite:
filetowrite.write('new content')
Open the file in 'w' mode; this replaces its current text, so the file is saved with only the new contents.
Using truncate(), the solution could be
import re
#open the xml file for reading:
with open('path/test.xml','r+') as f:
#convert to string:
data = f.read()
f.seek(0)
f.write(re.sub(r"<string>ABC</string>(\s+)<string>(.*)</string>",r"<xyz>ABC</xyz>\1<xyz>\2</xyz>",data))
f.truncate()
import os#must import this library
if os.path.exists('TwitterDB.csv'):
os.remove('TwitterDB.csv') #this deletes the file
else:
print("The file does not exist")#add this to prevent errors
I had a similar problem, and instead of overwriting my existing file using the different 'modes', I just deleted the file before using it again, so that it would be as if I was appending to a new file on each run of my code.
See "How to Replace String in File": it works in a simple way and is an answer that uses replace:
# Context managers close both files even if reading or writing raises.
with open("data.txt", "rt") as fin, open("out.txt", "wt") as fout:
    for line in fin:
        # Copy each line to the output with the misspelling corrected.
        fout.write(line.replace('pyton', 'python'))
in my case the following code did the trick
with open("output.json", "w+") as outfile: #using w+ mode to create file if it not exists. and overwrite the existing content
json.dump(result_plot, outfile)
Using python3 pathlib library:
import re
from pathlib import Path
import shutil
shutil.copy2("/tmp/test.xml", "/tmp/test.xml.bak") # create backup
filepath = Path("/tmp/test.xml")
content = filepath.read_text()
filepath.write_text(re.sub(r"<string>ABC</string>(\s+)<string>(.*)</string>",r"<xyz>ABC</xyz>\1<xyz>\2</xyz>", content))
Similar method using different approach to backups:
import re
from pathlib import Path

filepath = Path("/tmp/test.xml")
backup = filepath.with_suffix('.bak')
filepath.rename(backup) # different approach to backups
# After the rename the original path no longer exists, so the content has
# to be read from the backup; write_text() then recreates the original file.
content = backup.read_text()
filepath.write_text(re.sub(r"<string>ABC</string>(\s+)<string>(.*)</string>",r"<xyz>ABC</xyz>\1<xyz>\2</xyz>", content))

Python tarfile compress an object in memory

I am trying to use tarfile to add a file in memory and then write it back to disk, but the issue I am having is that in my final output, when I extract the newly created tar.gz file, I get an empty file. What am I doing wrong in my code?
import tarfile
import io
with open('logo.png', 'rb') as f:
data = f.read()
fh = io.BytesIO()
with tarfile.open(fileobj=fh, mode='w:gz') as tar:
info = tarfile.TarInfo('some.png')
tar.addfile(info, data)
with open('/tmp/test/test.tar.gz', 'wb') as f:
f.write(fh.getvalue())
I also tried doing tar.addfile(info, fh.write(data)), but that just creates a corrupted tar file.
TarFile.addfile() takes a file-like object.
When the documentation says:
tarinfo.size bytes are read from it and added to the archive.
It means that tarinfo.size is used to determine how many bytes to read. Therefore, you need to set tarinfo.size appropriately.
The only thing you need to do is read the data from the source, count the length, then load that data into a BytesIO object:
E.g.
import tarfile
import io
with open('logo.png', 'rb') as f:
data = f.read()
source_f = io.BytesIO(initial_bytes=data)
fh = io.BytesIO()
with tarfile.open(fileobj=fh, mode='w:gz') as tar:
info = tarfile.TarInfo('logo.png')
info.size = len(data)
tar.addfile(info, source_f)
with open('test.tar.gz', 'wb') as f:
f.write(fh.getvalue())
or a more memory efficient way, seek the source file:
fh = io.BytesIO()
# Keep the source under its own name and inside a context manager: reusing
# one variable for both files would close the output twice and leak the source.
with open('logo.png', 'rb') as source_f:
    source_f.seek(0, io.SEEK_END) # go to the end
    source_len = source_f.tell()
    source_f.seek(0)
    with tarfile.open(fileobj=fh, mode='w:gz') as tar:
        info = tarfile.TarInfo('logo.png')
        # addfile() copies exactly info.size bytes from the source.
        info.size = source_len
        tar.addfile(info, source_f)
# The archive is complete only once the tarfile context has exited.
with open('test.tar.gz', 'wb') as out_f:
    out_f.write(fh.getvalue())

how to open a gz file and save the file as txt in python

I have a gz file, how can I unzip the file and save the content to a txt in python?
I imported gzip already
file_path = gzip.open(file_name, 'rb')
How about opening a second file and writing to it?
import gzip

# gzip.open(..., 'rb') yields bytes, so the destination must be opened in
# binary mode too; writing bytes to a text-mode file raises TypeError.
with gzip.open('file.txt.gz', 'rb') as f, open('file.txt', 'wb') as f_out:
    f_out.write(f.read())
Gzip's open method should open the file in such a way that its contents can be read like a normal file:
import gzip

#Define the file's location
file_path = "/path/to/file.gz"

#Open the file and read its decompressed contents (bytes, because of "rb")
with gzip.open(file_path, "rb") as file:
    file_content = file.read()

#Save the new txt file; binary mode is required because file_content is bytes
txt_file_name = "txtFile.txt"
with open(txt_file_name, "wb") as file:
    file.write(file_content)

How to unzip gz file using Python

I need to extract a gz file that I have downloaded from an FTP site to a local Windows file server. I have the variables set for the local path of the file, and I know it can be used by the gzip module.
How can I do this? The file inside the GZ file is an XML file.
import gzip
import shutil
with gzip.open('file.txt.gz', 'rb') as f_in:
with open('file.txt', 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
From the documentation:
import gzip
with gzip.open('file.txt.gz', 'rb') as f:
file_content = f.read()
Maybe you want pass it to pandas also.
with gzip.open('features_train.csv.gz') as f:
features_train = pd.read_csv(f)
features_train.head()
from sh import gunzip
gunzip('/tmp/file1.gz')
Not an exact answer because you're using xml data and there is currently no pd.read_xml() function (as of v0.23.4), but pandas (starting with v0.21.0) can uncompress the file for you! Thanks Wes!
import pandas as pd
import os
fn = '../data/file_to_load.json.gz'
print(os.path.isfile(fn))
df = pd.read_json(fn, lines=True, compression='gzip')
df.tail()
If you are parsing the file after unzipping it, don't forget to use the decode() method; it is necessary when you open a file as binary.
import gzip

# The path has to be a string; a bare file.gz would be an attribute lookup
# on an undefined name.
with gzip.open('file.gz', 'rb') as f:
    for line in f:
        # Lines come back as bytes in 'rb' mode, so decode before printing.
        print(line.decode().strip())
It is very simple.. Here you go !!
import gzip

#path_to_file_to_be_extracted
ip = "sample.gzip"
#output file to be filled; both files are closed automatically by the
#context manager, and the output encoding matches the decode() below
with gzip.open(ip, "rb") as ip_byte, open("output_file", "w", encoding="utf-8") as op:
    op.write(ip_byte.read().decode("utf-8"))
You can use gzip.decompress() to do it:
read input file using rb mode;
open output file using w mode and utf8 encoding;
gzip.decompress() input bytes;
decode what you get to str.
write str to output file.
import gzip
import shutil


def decompress(infile, tofile):
    """Decompress the gzip file *infile* into the UTF-8 text file *tofile*.

    The data is streamed in chunks instead of being loaded into memory at
    once. ``newline=''`` on both sides disables newline translation, so the
    decompressed text is written with its original line endings.

    Raises:
        OSError: if *infile* cannot be read or is not valid gzip data.
        UnicodeDecodeError: if the decompressed data is not valid UTF-8.
    """
    with gzip.open(infile, 'rt', encoding='utf-8', newline='') as inf, \
            open(tofile, 'w', encoding='utf8', newline='') as tof:
        shutil.copyfileobj(inf, tof)
If you have the gzip (and gunzip) programs installed on your computer a simple way is to call that command from python:
import os
filename = 'file.txt.gz'
os.system('gunzip ' + filename)
optionally, if you want to preserve the original file, use
os.system('gunzip --keep ' + filename)
if you have a linux environment it is very easy to unzip using the command gunzip.
go to the file folder and give as below
gunzip file-name

How to run an executable file contained in a text file(hex of exe)

I have a txt file containing hex of an exe. I read that file in python, but failed to run that .exe file.
Any kind of help will be appreciated...
thanx
import binascii


def _dump_hex(prompt, out_name):
    """Prompt for a file path, write that file's hex dump to out_name, and print it."""
    path = input(prompt)
    with open(path, 'rb') as f:
        hex_bytes = binascii.hexlify(f.read())
    # hexlify() returns bytes; str() on it would write the literal b'...'
    # wrapper into the file, so decode to plain ASCII hex digits instead.
    with open(out_name, 'w') as bucket:
        bucket.write(hex_bytes.decode('ascii'))
    print(hex_bytes)


def getExeFile():
    """Ask for an exe path and store its hex dump in f1.txt."""
    _dump_hex("Enter an exe file name(path):", "f1.txt")


def getNonExeFile():
    """Ask for a non-exe path and store its hex dump in f2.txt."""
    _dump_hex("Enter a non-exe file name(path):", "f2.txt")


getExeFile()
getNonExeFile()
print("End")
Dump it to a temporary file, change its permissions so it's executable, and run it in a subprocess.
Example:
from os import chmod
from subprocess import check_call
from tempfile import NamedTemporaryFile

# delete=False keeps the file on disk after the with-block closes it.
with NamedTemporaryFile(delete=False) as f:
    # NOTE(review): get_hex_from_file is not defined in this snippet; it is
    # assumed to return the raw executable bytes (e.g. binascii.unhexlify of
    # the stored hex text) - confirm before use.
    f.write(get_hex_from_file("mydata.dat"))

# Permissions are set with chmod (chown changes the owner), and octal
# literals need the 0o prefix on Python 3.
chmod(f.name, 0o755)
# Run only after the file is closed, so the data is flushed and the file is
# no longer open for writing.
check_call(f.name)
Of course I'm making the assumption here that you are doing this on some kind of UNIX machine and that "EXE" in this case means some kind of ELF/A.OUT/COFF executable! -- Nevertheless, the same principles and code (with some tweaking) would probably work on other platforms, e.g. Windows.
See:
tempfile.NamedTemporaryFile
subprocess.check_call
os.chown
Python 3.4:
This is my code that creates simple txt files containing the hex of an exe and of a txt file.
Now I want my program to take that hex file and run the exe file and open the txt file.
import binascii


def _dump_hex(prompt, out_name):
    """Prompt for a file path and write that file's hex dump to out_name."""
    path = input(prompt)
    with open(path, 'rb') as f:
        hex_bytes = binascii.hexlify(f.read())
    # hexlify() returns bytes; str() on it would write the literal b'...'
    # wrapper into the file, so decode to plain ASCII hex digits instead.
    with open(out_name, 'w') as bucket:
        bucket.write(hex_bytes.decode('ascii'))


def getExeFile():
    """Ask for an exe path and store its hex dump in f1.txt."""
    _dump_hex("Enter an exe file name(path):", "f1.txt")


def getNonExeFile():
    """Ask for a non-exe path and store its hex dump in f2.txt."""
    _dump_hex("Enter a non-exe file name(path):", "f2.txt")


getExeFile()
getNonExeFile()
print("End")
Thanx...!

Categories