I want to read a bz2 file in a server, decompress it and read with csv parser, but I still have the error;
myfile = bz2.BZ2File(bio.read(), "rb")
TypeError: file() argument 1 must be encoded string without NULL bytes, not str
import paramiko
from config import config
import bz2
import csv
import StringIO
from io import BytesIO
ssh = paramiko.SSHClient()
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh.connect(config.get('mrc_ssh', 'host'), username=config.get('mrc_ssh', 'user'))
sftp_client = ssh.open_sftp()
_file = sftp_client.open('/home/myfile.bz2')
bio = BytesIO(_file.read())
print bio
myfile = bz2.BZ2File(bio.read(), "rb")
reader = csv.DictReader(myfile)
for row in reader:
print row
bz2.BZ2File takes a filename as the first argument. Not actual data.
Either use (if you can store locally the file):
myfile = bz2.BZ2File('/home/myfile.bz2', "rb")
Or use the one-shot decompression function bz2.decompress
uncompressed_data = bz2.decompress(bio.read())
Related
I have the below code in which I am trying to hash an image and set the output in a text file, later I am encrypting the txt file and I need to delete the old version of it. However, I am receiving the following error "The process cannot access the file because it is being used by another process". I have tried to use time.sleep(3) to delay the code but I am still receiving the same error.
or should I be using .close() and how to set it?
can someone please advise?
import os
import hashlib
import logging
import time
import cryptography
from cryptography.fernet import Fernet
key = Fernet.generate_key()
file = open('key.key', 'wb') # Open the file as wb to write bytes
file.write(key) # The key is type bytes still
file.close()
file = open('key.key', 'rb') # Open the file as wb to read bytes
key = file.read() # The key will be type bytes
file.close()
logging.basicConfig(filename='InitializationHash.txt', level=logging.INFO,
format='%(message)s')
def hash_image(filepath):
with open(filepath, 'rb') as f:
file_bytes = f.read()
hash_text = hashlib.sha256(file_bytes).hexdigest()
logging.info(hash_text)
def get_one_image(file_Name):
filepath = os.path.abspath(file_Name)
hash_image(filepath)
if __name__ == '__main__':
get_one_image("initializationlandmark.png")
input_file = 'InitializationHash.txt'
output_file = 'InitializationHash.encrypted'
with open(input_file, 'rb') as f:
data = f.read() # Read the bytes of the input file
fernet = Fernet(key)
encrypted = fernet.encrypt(data)
with open(output_file, 'wb') as f:
f.write(encrypted) # Write the encrypted bytes to the output file
os.remove("InitializationHash.txt")
I am trying to use tarfile to add a file in memory and then writing it back to disk, but the issue i am having is that in my final output, when i extract the newly created tar.gz file, i am getting an empty file. What am I doing wrong in my code?
import tarfile
import io
with open('logo.png', 'rb') as f:
data = f.read()
fh = io.BytesIO()
with tarfile.open(fileobj=fh, mode='w:gz') as tar:
info = tarfile.TarInfo('some.png')
tar.addfile(info, data)
with open('/tmp/test/test.tar.gz', 'wb') as f:
f.write(fh.getvalue())
I also tried doing tar.addfile(info, fh.write(data)), but that just creates a corrupted tar file.
TarFile.addfile() takes a file-like object.
When the documentation says:
tarinfo.size bytes are read from it and added to the archive.
It means that tarinfo.size is used to determine how many bytes to read. Therefore, you need set tarinfo.size appropriately.
The only thing you need to do is read the data from the source, count the length, then load that data into a BytesIO object:
E.g.
import tarfile
import io
with open('logo.png', 'rb') as f:
data = f.read()
source_f = io.BytesIO(initial_bytes=data)
fh = io.BytesIO()
with tarfile.open(fileobj=fh, mode='w:gz') as tar:
info = tarfile.TarInfo('logo.png')
info.size = len(data)
tar.addfile(info, source_f)
with open('test.tar.gz', 'wb') as f:
f.write(fh.getvalue())
or a more memory efficient way, seek the source file:
f = open('logo.png', 'rb')
f.seek(0,2) # go to the end
source_len = f.tell()
f.seek(0)
fh = io.BytesIO()
with tarfile.open(fileobj=fh, mode='w:gz') as tar:
info = tarfile.TarInfo('logo.png')
info.size = source_len
tar.addfile(info, f)
with open('test.tar.gz', 'wb') as f:
f.write(fh.getvalue())
f.close()
I am working on a script that fetches a zip file from a URL using tje request library. That zip file contains a csv file. I'm trying to read that csv file without saving it. But while parsing it's giving me this error: _csv.Error: iterator should return strings, not bytes (did you open the file in text mode?)
import csv
import requests
from io import BytesIO, StringIO
from zipfile import ZipFile
response = requests.get(url)
zip_file = ZipFile(BytesIO(response.content))
files = zip_file.namelist()
with zip_file.open(files[0]) as csvfile:
csvreader = csv.reader(csvfile)
# _csv.Error: iterator should return strings, not bytes (did you open the file in text mode?)
for row in csvreader:
print(row)
Try this:
import pandas as pd
import requests
from io import BytesIO, StringIO
from zipfile import ZipFile
response = requests.get(url)
zip_file = ZipFile(BytesIO(response.content))
files = zip_file.namelist()
with zip_file.open(files[0]) as csvfile:
print(pd.read_csv(csvfile, encoding='utf8', sep=","))
As #Aran-Fey alluded to:
import zipfile
import csv
import io
with open('/path/to/archive.zip', 'r') as f:
with zipfile.ZipFile(f) as zf:
csv_filename = zf.namelist()[0] # see namelist() for the list of files in the archive
with zf.open(csv_filename) as csv_f:
csv_f_as_text = io.TextIOWrapper(csv_f)
reader = csv.reader(csv_f_as_text)
csv.reader (and csv.DictReader) require a file-like object opened in text mode. Normally this is not a problem when simply open(...)ing file in 'r' mode, as the Python 3 docs say, text mode is the default: "The default mode is 'r' (open for reading text, synonym of 'rt')". But if you try rt with open on a ZipFile, you'll see an error that: ZipFile.open() requires mode "r" or "w":
with zf.open(csv_filename, 'rt') as csv_f:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
...
ValueError: open() requires mode "r" or "w"
That's what io.TextIOWrapper is for -- for wrapping byte streams to be readable as text, decoding them on the fly.
I want to get the content of a remote file with fabric, without creating a temporary file.
from StringIO import StringIO
from fabric.api import get
fd = StringIO()
get(remote_path, fd)
content=fd.getvalue()
With Python 3 (and fabric3), I get this fatal error when using io.StringIO: string argument expected, got 'bytes', apparently because Paramiko writes to the file-like object with bytes. So I switched to using io.BytesIO and it works:
from io import BytesIO
def _read_file(file_path, encoding='utf-8'):
io_obj = BytesIO()
get(file_path, io_obj)
return io_obj.getvalue().decode(encoding)
import tempfile
from fabric.api import get
with tempfile.TemporaryFile() as fd:
get(remote_path, fd)
fd.seek(0)
content=fd.read()
See: http://docs.python.org/2/library/tempfile.html#tempfile.TemporaryFile
and: http://docs.fabfile.org/en/latest/api/core/operations.html#fabric.operations.get
I'm trying to figure how to use mmap with a gzip compressed file. Is that even possible ?
import mmap
import os
import gzip
filename = r'C:\temp\data.gz'
file = gzip.open(filename, "rb+")
size = os.path.getsize(filename)
file = mmap.mmap(file.fileno(), size)
print file.read(8)
The output data is compressed.
You can do easilly. Indeed the gzip module gets as optional argument a file-like object.
import mmap
import gzip
filename = "a.gz"
handle = open(filename, "rb")
mapped = mmap.mmap(handle.fileno(), 0, access=mmap.ACCESS_READ)
gzfile = gzip.GzipFile(mode="r", fileobj=mapped)
print gzfile.read()
The same applies to tarfile module:
import sys
import mmap
import tarfile
f = open(sys.argv[1], 'rb')
fo = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
tf = tarfile.open(mode='r:gz', fileobj=fo)
print tf.getnames()
Well, not the way you want.
mmap() can be used to access the gzipped file if the compressed data is what you want.
mmap() is a system call for mapping disk blocks into RAM almost as if you were adding swap.
You can't map the uncompressed data into RAM with mmap() as it is not on the disk.