urlopen trouble while trying to download a gzip file - python

I am going to use the wiktionary dump for the purpose of POS tagging. Somehow it gets stuck when downloading. Here is my code:
# Python 2 script: fetch the gzipped Wiktionary title dump and tokenize it with NLTK.
import nltk
from urllib import urlopen
from collections import Counter
import gzip
url = 'http://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-all-titles-in-ns0.gz'
# NOTE(review): gzip.open() expects a filename, but it receives the downloaded
# payload here; this is the call that raises the TypeError in the traceback.
fStream = gzip.open(urlopen(url).read(), 'rb')
dictFile = fStream.read()
fStream.close()
# NOTE(review): dictFile is the value returned by read(), yet it is called like a function.
text = nltk.Text(word.lower() for word in dictFile())
tokens = nltk.word_tokenize(text)
Here is the error I get:
Traceback (most recent call last):
File "~/dir1/dir1/wikt.py", line 15, in <module>
fStream = gzip.open(urlopen(url).read(), 'rb')
File "/usr/lib/python2.7/gzip.py", line 34, in open
return GzipFile(filename, mode, compresslevel)
File "/usr/lib/python2.7/gzip.py", line 89, in __init__
fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
TypeError: file() argument 1 must be encoded string without NULL bytes, not str
Process finished with exit code 1

You are passing the downloaded data to gzip.open(), which expects to be passed a filename instead.
The code then tries to open a filename named by the gzipped data, and fails.
Either save the URL data to a file, then use gzip.open() on that, or decompress the gzipped data using the zlib module instead. 'Saving' the data can be as easy as using a StringIO.StringIO() in-memory file object:
# Python 2: download the gzipped dump and expose it through an in-memory file
# object, because GzipFile needs something file-like rather than raw data.
import gzip
from StringIO import StringIO
from urllib import urlopen

url = 'http://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-all-titles-in-ns0.gz'
# Pull the whole compressed payload into memory first ...
compressed = urlopen(url).read()
# ... then wrap it so gzip can read from it as if it were a file.
inmemory = StringIO(compressed)
fStream = gzip.GzipFile(mode='rb', fileobj=inmemory)

Related

wave write function not working, what am I doing wrong?

I am trying to halve the existing sampling rate of a folder full of .wav files. This is the only way I have found to do it but it is not working. The read part works just fine up until f.close(), then the wave.write part causes the error.
# Goal: halve the frame rate of every .wav file in the current directory.
import wave
import contextlib
import os
for file_name in os.listdir(os.getcwd()):
if file_name.endswith(".wav"):
# First pass: read the current frame rate.
with contextlib.closing(wave.open(file_name, 'rb')) as f:
rate = f.getframerate()
new_rate = rate/2
f.close()
# Second pass: reopen the same file for writing. Only the frame rate is set
# before the writer is closed, which is where the wave.Error below is raised.
with contextlib.closing(wave.open(file_name, 'wb')) as f:
rate = f.setframerate(new_rate)
This is the output when I run it.
Traceback (most recent call last):
File "C:\Users\hsash\OneDrive\Desktop\used AR1-20210513T223533Z-001 - Copy (2)\sounds\python code.py", line 36, in <module>
rate = f.setframerate(new_rate)
File "C:\Users\hsash\AppData\Local\Programs\Python\Python39\lib\contextlib.py", line 303, in __exit__
self.thing.close()
File "C:\Users\hsash\AppData\Local\Programs\Python\Python39\lib\wave.py", line 444, in close
self._ensure_header_written(0)
File "C:\Users\hsash\AppData\Local\Programs\Python\Python39\lib\wave.py", line 462, in _ensure_header_written
raise Error('# channels not specified')
wave.Error: # channels not specified
The error message says it directly: the number of channels is not specified. When you open a wave file for writing, Python sets all of the header fields to zero, irrespective of the current state of the file.
In order to make sure that the other fields are saved you need to copy them over from the old file when you read it the first time.
In the snippet below I'm using getparams and setparams to copy the header fields over and I'm using readframes and writeframes to copy the wave data.
import wave
import contextlib
import os

# Halve the frame rate recorded in the header of every .wav file in the
# current directory. The sample data itself is copied back unchanged.
for file_name in os.listdir(os.getcwd()):
    if not file_name.endswith(".wav"):
        continue

    # Read everything needed from the file before it is reopened for writing.
    # contextlib.closing() closes the reader on exit, so no explicit close()
    # call is needed inside the block.
    with contextlib.closing(wave.open(file_name, 'rb')) as reader:
        params = reader.getparams()  # full header, so no field is left unset
        rate = reader.getframerate()
        data = reader.readframes(reader.getnframes())

    # May be a float; setframerate() rounds non-integral values.
    new_rate = rate / 2

    with contextlib.closing(wave.open(file_name, 'wb')) as writer:
        writer.setparams(params)       # restore channels, sample width, etc.
        writer.setframerate(new_rate)  # then override only the frame rate
        writer.writeframes(data)

How to un-tar in-memory data using Python3?

I've got some tar data in bytes, and want to read it without writing it to the file system.
Writing it to the file system works:
# Write the in-memory tar bytes to disk so the tar command can unpack them.
with open('out.tar', 'wb') as f:
f.write(data)
then, in the shell: tar -xzvf out.tar
But the following errors:
import tarfile
# NOTE(review): the first positional argument of tarfile.open() is `name`
# (see the traceback), so the bytes end up being treated as a file name.
tarfile.open(data, 'r')
'''
File ".../lib/python3.7/tarfile.py", line 1591, in open
return func(name, filemode, fileobj, **kwargs)
File ".../lib/python3.7/tarfile.py", line 1638, in gzopen
fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
File ".../lib/python3.7/gzip.py", line 163, in __init__
fileobj = self.myfileobj = builtins.open(fil
'''
what is the right way to read the tar in memory?
Update
The following works:
from io import BytesIO
# Wrap the raw bytes in a file-like object and hand it over as `fileobj`.
# `mode` is passed by keyword because a positional argument may not follow a
# keyword argument.
tarfile.open(fileobj=BytesIO(data), mode='r')
Why?
tarfile.open is supposed to be able to work with bytes. Converting the bytes to a file-like object myself and then telling tarfile.open to use the file-like object works, but why is the transformation necessary? When does the raw bytes-based API work vs. not work?
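tarfile.open() treats its first positional argument as a file name, so raw bytes have to be wrapped in a file-like object and passed as fileobj. You can then read each member's data from the byte stream: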
import tarfile
from io import BytesIO

# `data` holds the raw bytes of the archive (not a file name). BytesIO turns
# them into a file-like object that tarfile can read through `fileobj`.
with tarfile.open(fileobj=BytesIO(data)) as tar:
    for tar_file in tar:
        # Only regular files have content to extract; skip other member types.
        if tar_file.isfile():
            inner_data = tar.extractfile(tar_file).read().decode('utf-8')

How to read utf-8 encoded JSON file (locally/from internet) in a string in Kodi?

I followed Kodi simple video plugin tutorial which works as expected with JSON string embedded into main.py file.
The tutorial mentions that the JSON string can be obtained by other means from other sources (file, internet).
My attempt to read utf-8 JSON file into the string was unsuccessful so far.
Initially I tried the following approach
# First attempt: read the JSON file as UTF-8 text and parse it.
import json
import codecs
fname = 'c:/temp/iptv/video_data.json'
# NOTE(review): the `encoding` keyword is what the TypeError in the log below rejects.
with open(fname, encoding='utf-8') as f:
data = f.read()
VIDEO = json.loads(data)
This produced the following error in the log file:
2020-05-09 11:11:29.327 T:19428 ERROR: EXCEPTION Thrown (PythonToCppException) : -->Python callback/script returned the following error<--
- NOTE: IGNORING THIS CAN LEAD TO MEMORY LEAKS!
Error Type: <type 'exceptions.TypeError'>
Error Contents: 'encoding' is an invalid keyword argument for this function
Traceback (most recent call last):
File "C:\Users\Alex Fox\AppData\Roaming\Kodi\addons\plugin.video.example\main.py", line 27, in <module>
with open(fname, encoding='utf-8') as f:
TypeError: 'encoding' is an invalid keyword argument for this function
-->End of Python script error report<--
I investigated the problem and found that Kodi 18.4 uses python27.dll and I assume that this library is somehow accessed from Kodi.
I substituted the code above with following code snippet into main.py file
# Second attempt: read raw bytes and decode them explicitly instead of
# passing an encoding keyword to open().
import json
import codecs
fname = 'c:/temp/iptv/video_data.json'
with open(fname, 'rb') as f:
# NOTE(review): `bytes` is also the name of a builtin; it is rebound here.
bytes = f.read()
data = bytes.decode('utf-8')
# NOTE(review): presumably the decoded (unicode) strings produced here are
# what urlencode() fails on in the log below -- TODO confirm.
VIDEOS = json.loads(data)
On plugin's run it produces following error record in log file
2020-05-09 16:54:17.024 T:5700 ERROR: EXCEPTION Thrown (PythonToCppException) : -->Python callback/script returned the following error<--
- NOTE: IGNORING THIS CAN LEAD TO MEMORY LEAKS!
Error Type: <type 'exceptions.UnicodeEncodeError'>
Error Contents: 'ascii' codec can't encode characters in position 1-4: ordinal not in range(128)
Traceback (most recent call last):
File "C:\Users\Alex Fox\AppData\Roaming\Kodi\addons\plugin.video.example\main.py", line 239, in <module>
router(sys.argv[2][1:])
File "C:\Users\Alex Fox\AppData\Roaming\Kodi\addons\plugin.video.example\main.py", line 233, in router
list_categories()
File "C:\Users\Alex Fox\AppData\Roaming\Kodi\addons\plugin.video.animatron\main.py", line 137, in list_categories
url = get_url(action='listing', category=category)
File "C:\Users\Alex Fox\AppData\Roaming\Kodi\addons\plugin.video.animatron\main.py", line 66, in get_url
return '{0}?{1}'.format(_url, urlencode(kwargs))
File "C:\bin\Portable\Kodi\system\python\Lib\urllib.py", line 1343, in urlencode
v = quote_plus(str(v))
UnicodeEncodeError: 'ascii' codec can't encode characters in position 1-4: ordinal not in range(128)
-->End of Python script error report<--
While at the same time following code outside KODI (tested with Python 3.8.2)
# Same logic run outside Kodi: parse the file, dump the result, list its top-level keys.
import json
import codecs
fname = 'c:/temp/iptv/animation.json'
with open(fname, 'rb') as f:
# NOTE(review): `bytes` and `str` below rebind builtin names in this script.
bytes = f.read()
str = bytes.decode('utf-8')
VIDEOS = json.loads(str)
print(json.dumps(VIDEOS, indent=4))
for key in VIDEOS.keys():
print(key)
outputs JSON dump and VIDEO dictionary keys properly.
What is a proper way to read utf-8 encoded JSON file/internet into a string in Kodi?
Solution for obtaining data from web server:
Download the script.module.requests-2.22.0.zip file and install it in Kodi (install from zip file).
Then add into addon.xml file <import addon="script.module.requests" version="2.22.0"/>
<requires>
<import addon="xbmc.python" version="2.25.0"/>
<import addon="script.module.requests" version="2.22.0"/>
</requires>
Now in plugin's main.py file insert following code snippet
import json
import requests

url = 'http://iptv.server.com/series.json'
VIDEOS = {}

# Fetch the JSON document and parse the raw response body.
html = requests.get(url)
DATA = json.loads(html.content)

# Rebuild the mapping with UTF-8 encoded strings for the title keys and for
# each episode's 'name' and 'genre' values.
for title, episodes in DATA.items():
    title_utf8 = title.encode('utf-8')
    encoded = VIDEOS[title_utf8] = []
    for episode in episodes:
        for field in ('name', 'genre'):
            episode[field] = episode[field].encode('utf-8')
        encoded.append(episode)
Launch Kodi and test the plugin.
Next code snippet is for reading file from local storage
import json

fname = 'c:/temp/iptv/animation.json'
VIDEOS = {}

# Parse the file contents directly; no intermediate variable is bound, so the
# builtin `str` stays available to the rest of the module.
with open(fname, 'r') as f:
    DATA = json.loads(f.read())

# Rebuild the mapping with UTF-8 encoded strings for the title keys and for
# each episode's 'name' and 'genre' values.
for title in DATA.keys():
    title_utf8 = title.encode('utf-8')
    VIDEOS[title_utf8] = []
    for episode in DATA[title]:
        episode['name'] = episode['name'].encode('utf-8')
        episode['genre'] = episode['genre'].encode('utf-8')
        VIDEOS[title_utf8].append(episode)

Use codecs to read file with correct encoding: TypeError

I need to read from a file, line by line. I also need to make sure the encoding is handled correctly.
I wrote the following code:
#!/bin/bash
# Sum the decoded length of every line in the file.
import codecs
filename = "something.x10"
# NOTE(review): the file is opened in text mode ('r'), while the reader below
# is a codecs StreamReader wrapped around it.
f = open(filename, 'r')
fEncoded = codecs.getreader("ISO-8859-15")(f)
totalLength = 0
for line in fEncoded:
totalLength+=len(line)
# NOTE(review): totalLength is an int and is concatenated to a str here.
print("Total Length is "+totalLength)
This code does not work on all files, on some files I get a
Traceback (most recent call last):
File "test.py", line 11, in <module>
for line in fEncoded:
File "/usr/lib/python3.2/codecs.py", line 623, in __next__
line = self.readline()
File "/usr/lib/python3.2/codecs.py", line 536, in readline
data = self.read(readsize, firstline=True)
File "/usr/lib/python3.2/codecs.py", line 480, in read
data = self.bytebuffer + newdata
TypeError: can't concat bytes to str
I'm using Python 3.3 and the script must work with this Python version.
What am I doing wrong? I was not able to find out which files work and which do not; even some plain ASCII files fail.
You are opening the file in non-binary mode. If you read from it, you get a string decoded according to your default encoding (http://docs.python.org/3/library/functions.html?highlight=open%20builtin#open).
codec's StreamReader needs a bytestream (http://docs.python.org/3/library/codecs#codecs.StreamReader)
So this should work:
import codecs

filename = "something.x10"

# Open in binary mode: the codecs StreamReader expects a byte stream and does
# the decoding itself. The with-block closes the file when the loop is done.
with open(filename, 'rb') as f:
    f_decoded = codecs.getreader("ISO-8859-15")(f)
    total_length = 0
    for line in f_decoded:
        total_length += len(line)

# total_length is an int, so convert it before concatenating.
print("Total Length is " + str(total_length))
or you can use the encoding parameter on open:
# Text mode with an explicit encoding: open() does the decoding while reading.
f_decoded = open(filename, mode='r', encoding='ISO-8859-15')
The reader returns decoded data, so I fixed your variable name. Also, consider pep8 as a guide for formatting and coding style.

tmpfile and gzip combination problem

I have problem with this code:
# Decode the base64 payload into a temporary file, then read it back through gzip.
# NOTE(review): 'wrb' is not a valid mode string (see the answer below).
file = tempfile.TemporaryFile(mode='wrb')
file.write(base64.b64decode(data))
file.flush()
os.fsync(file)
# file.seek(0)
# NOTE(review): with the seek above disabled, the file position is presumably
# still at the end of what was just written -- TODO confirm.
f = gzip.GzipFile(mode='rb', fileobj=file)
print f.read()
I don't know why it doesn't print anything. If I uncomment file.seek, then the following error occurs:
File "/usr/lib/python2.5/gzip.py", line 263, in _read
self._read_gzip_header()
File "/usr/lib/python2.5/gzip.py", line 162, in _read_gzip_header
magic = self.fileobj.read(2)
IOError: [Errno 9] Bad file descriptor
Just for information this version works fine:
# Same payload written to a named file, closed, then reopened by name through gzip.
x = open("test.gzip", 'wb')
x.write(base64.b64decode(data))
x.close()
f = gzip.GzipFile('test.gzip', 'rb')
print f.read()
EDIT: For wrb problem. It doesn't give me an error when initialize it. Python 2.5.2.
>>> t = tempfile.TemporaryFile(mode="wrb")
>>> t.write("test")
>>> t.seek(0)
>>> t.read()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
IOError: [Errno 9] Bad file descriptor
'wrb' is not a valid mode.
This works fine:
import gzip
import tempfile

# NOTE: on Python 2.5 the `with` statement additionally requires
# `from __future__ import with_statement` at the top of the file.
# 'w+b' opens the temporary file for both writing and reading in binary mode.
with tempfile.TemporaryFile(mode='w+b') as f:
    f.write(data.decode('base64'))
    f.flush()
    # Rewind so GzipFile starts reading at the beginning of the written data.
    f.seek(0)
    gzf = gzip.GzipFile(fileobj=f, mode='rb')
    print(gzf.read())
Some tips:
You can't .seek(0) or .read() a GzipFile opened in wrb, wb or w+b mode. The GzipFile class's __init__ decides between READ and WRITE by looking only at the first character of the mode (so it sets itself to WRITE for wrb).
When doing f = gzip.GzipFile(mode='rb', fileobj=file) your real file is file not f, I understood that after reading GzipFile class definition.
A working example for me was:
import gzip
from tempfile import NamedTemporaryFile

with NamedTemporaryFile(mode='w+b', delete=True, suffix='.txt.gz', prefix='f') as t_file:
    # Compress into the temporary file. The GzipFile is closed before t_file
    # is rewound and read back below.
    compressor = gzip.GzipFile(fileobj=t_file, mode='wb')
    compressor.write('SOMETHING HERE')
    compressor.close()
    # Rewind so the compressed bytes can be read back from the start.
    t_file.seek(0)
    # Do something here with your t_file, maybe send it to an external storage or:
    print(t_file.read())
I hope this can be useful for someone out there, took a lot of my time to make it work.

Categories