I have a url to live audio recording that I'm trying to transcribe using Google Speech to Text API. I am using an example code from the Cloud Speech to Text API. However, the problem is that when I pass the live url I do not receive any output. Below is the relevant portion of my code. Any help would be greatly appreciated!
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
import io
import os
import time
import requests
import numpy as np
from urllib.request import urlopen
from datetime import datetime
from datetime import timedelta
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= "app_creds.json"
def get_stream(url='streamurl', duration=60, chunk_size=8000):
    """Capture raw bytes from a live stream for a bounded amount of time.

    Args:
        url: Address of the stream, handed to ``urlopen``.
        duration: Maximum number of seconds to keep reading.
        chunk_size: Number of bytes requested per ``read`` call.

    Returns:
        bytes: Every chunk that was read, concatenated in arrival order.
        Empty when the time budget is zero or the stream delivered nothing.
    """
    chunks = []
    # The context manager closes the connection even if a read fails.
    with urlopen(url) as stream:
        deadline = datetime.now() + timedelta(seconds=duration)
        while datetime.now() < deadline:
            data = stream.read(chunk_size)
            if not data:
                # An empty read means the remote end closed the stream;
                # looping further would only spin until the deadline.
                break
            chunks.append(data)
    return b''.join(chunks)
def transcribe_streaming():
    """Transcribe the audio returned by ``get_stream`` and print each result.

    The captured audio is sent to the Speech API as a single streaming
    request and every recognition alternative is printed as it arrives.
    """
    client = speech.SpeechClient()
    content = get_stream()

    # In practice, stream should be a generator yielding chunks of audio data.
    stream = [content]
    # Not called ``requests`` so the imported ``requests`` module is not
    # shadowed inside this function.
    request_iter = (types.StreamingRecognizeRequest(audio_content=chunk)
                    for chunk in stream)

    # Single channel, 16 kHz, linear 16-bit PCM. The audio delivered by the
    # stream must really use this encoding or no transcript is produced.
    # NOTE(review): language_code is assumed to be 'en-US' -- confirm.
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US')
    streaming_config = types.StreamingRecognitionConfig(config=config)

    # streaming_recognize returns a generator.
    responses = client.streaming_recognize(streaming_config, request_iter)

    for response in responses:
        # Once the transcription has settled, the first result will contain the
        # is_final result. The other results will be for subsequent portions of
        # the audio.
        for result in response.results:
            print('Finished: {}'.format(result.is_final))
            print('Stability: {}'.format(result.stability))
            alternatives = result.alternatives
            # The alternatives are ordered from most likely to least.
            for alternative in alternatives:
                print('Confidence: {}'.format(alternative.confidence))
                print(u'Transcript: {}'.format(alternative.transcript))
When sending audio to the Google Speech service, make sure that the service object setup matches the audio encoding. In your particular case
config = types.RecognitionConfig(
corresponds to single channel, 16KHz, linear 16 bit PCM encoding. See the list of other supported encodings if you need to transcribe audio in different formats.
A part of my code I used a while back, I don't know if that may help:
def live_recognize_loop(self):
"""Keep streaming microphone audio to the recognizer while self.recording is true."""
# NOTE(review): this excerpt has lost its indentation and several lines
# (the `try:` header and most `except` bodies are not visible), so it is
# documented as-is rather than rewritten.
client = self.client
# Passed to the audio generator so it can check self.recording itself.
def is_running():
return self.recording
# Outer loop: each pass opens a new microphone stream and a new streaming
# request; the inner `break # Start over` statements return here.
while self.recording:
with MicrophoneStream(RATE, CHUNK) as stream:
audio_generator = stream.generator(is_running)
requests = (types.StreamingRecognizeRequest(audio_content=content) for content in audio_generator)
responses = client.streaming_recognize(client.custom_streaming_config, requests)
# An explicit iterator lets each response be pulled with next() below.
responses_iterator = iter(responses)
while self.recording:
# NOTE(review): presumably wrapped in a `try:` that is missing from this excerpt.
response = next(responses_iterator)
except StopIteration:
except OutOfRange:
# Exception 400 - Exceeded maximum allowed stream duration of 65 seconds.
break # Start over
except ServiceUnavailable as e:
# Exception 503 - Getting metadata from plugin failed
self.log("{0} - NOT RECOGNIZED - {1}\n".format(self.getDate(), e))
except ResourceExhausted as e:
except GoogleAPICallError as e:
# Only the first result and its first alternative are inspected.
if response.results:
result = response.results[0]
if result.alternatives:
transcript = result.alternatives[0].transcript
if not result.is_final:
# print(transcript)
#print("\t\t FINAL: %s" % transcript)
break # Start over
MicrophoneStream class
from __future__ import division
import pyaudio
from six.moves import queue
class MicrophoneStream(object):
    """Opens a recording stream as a generator yielding the audio chunks.

    Use it as a context manager: entering opens the microphone, leaving
    closes it and unblocks any consumer of ``generator``.
    """

    def __init__(self, rate, chunk):
        """Store the sample rate and the frames-per-buffer chunk size."""
        self._rate = rate
        self._chunk = chunk

        # Create a thread-safe buffer of audio data
        self._buff = queue.Queue()
        self.closed = True

    def __enter__(self):
        self._audio_interface = pyaudio.PyAudio()
        self._audio_stream = self._audio_interface.open(
            format=pyaudio.paInt16,
            # The API currently only supports 1-channel (mono) audio
            channels=1, rate=self._rate,
            input=True, frames_per_buffer=self._chunk,
            # Run the audio stream asynchronously to fill the buffer object.
            # This is necessary so that the input device's buffer doesn't
            # overflow while the calling thread makes network requests, etc.
            stream_callback=self._fill_buffer,
        )

        self.closed = False
        return self

    def __exit__(self, type, value, traceback):
        self._audio_stream.stop_stream()
        self._audio_stream.close()
        self.closed = True
        # Signal the generator to terminate so that the client's
        # streaming_recognize method will not block the process termination.
        self._buff.put(None)
        self._audio_interface.terminate()

    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
        """Continuously collect data from the audio stream, into the buffer."""
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def generator(self, is_running=None):
        """Yield the buffered audio, coalescing all chunks currently queued.

        Args:
            is_running: Optional callable; when it returns a false value the
                generator stops after the chunk it is currently waiting for.

        Yields:
            bytes: One or more queued chunks joined together.
        """
        while not self.closed:
            # Use a blocking get() to ensure there's at least one chunk of
            # data, and stop iteration if the chunk is None, indicating the
            # end of the audio stream.
            chunk = self._buff.get()
            if callable(is_running) and not is_running():
                return
            if chunk is None:
                return
            data = [chunk]

            # Now consume whatever other data's still buffered.
            while True:
                try:
                    chunk = self._buff.get(block=False)
                except queue.Empty:
                    break
                if chunk is None:
                    return
                data.append(chunk)

            yield b''.join(data)
Try using:
import urllib
urllib.urlretrieve ("http://www.example.com/songs/mp3.mp3", "mp3.mp3")
(for Python 3+ use import urllib.request and urllib.request.urlretrieve)
I have a URL: "https://findicons.com/files/icons/2787/beautiful_flat_icons/128/running.png"
I want to fetch the image and write it to a file, so I wrote the following code:
import urllib.request
web = urllib.request.urlopen(iturl)
itdata = web.read()
f = open(str(cou) + '.png', "wb")
cou = cou + 1
My question is: if I have many URLs to download, how can I implement this with Tornado coroutines?
This isn't the entire code, just something I came up with in 5 mins but it should give you enough information to satisfy your requirements. If you have any questions or further explanation is required, please let me know.
from tornado import gen, httpclient, ioloop
def main():
client = httpclient.AsyncHTTPClient()
response = yield client.fetch(
follow_redirects = True)
def download_image(response):
    """Write the body of a fetched response to a file in the working directory.

    The file name is the last path segment of the request URL.

    Args:
        response: Object exposing ``request.url`` (str) and ``buffer`` (a
            binary file-like object positioned at the start of the body).
    """
    buffer_size = 1024
    filename = response.request.url.split('/')[-1]  # this is not always reliable
    # 'wb' rather than 'ab': appending to a stale file with the same name
    # would produce a corrupt image.
    with open(filename, 'wb') as img:
        while True:
            chunk = response.buffer.read(buffer_size)
            # The buffer yields bytes, so compare by truthiness; `chunk == ''`
            # is never true for b'' on Python 3 and the loop would not end.
            if not chunk:
                break
            img.write(chunk)
Tornado Issue #1616
Examples of RequestHandler
in an attempt to learn multithreaded file download I wrote this piece of cake:
import urllib2
import os
import sys
import time
import threading
urls = ["http://broadcast.lds.org/churchmusic/MP3/1/2/nowords/271.mp3",
url = urls[1]
def downloadFile(url, saveTo=None):
file_name = url.split('/')[-1]
if not saveTo:
saveTo = '/Users/userName/Desktop'
u = urllib2.urlopen(url)
except urllib2.URLError , er:
print("%s" % er.reason)
f = open(os.path.join(saveTo, file_name), 'wb')
meta = u.info()
file_size = int(meta.getheaders("Content-Length")[0])
print "Downloading: %s Bytes: %s" % (file_name, file_size)
file_size_dl = 0
block_sz = 8192
while True:
buffer = u.read(block_sz)
if not buffer:
file_size_dl += len(buffer)
status = r"%10d [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size)
status = status + chr(8)*(len(status)+1)
sys.stdout.write('%s\r' % status)
if file_size_dl == file_size:
print r"Download Completed %s%% for file %s, saved to %s" % (file_size_dl * 100. / file_size, file_name, saveTo,)
def synchronusDownload():
urls_saveTo = {urls[0]: None, urls[1]: None, urls[2]: None}
for url, saveTo in urls_saveTo.iteritems():
th = threading.Thread(target=downloadFile, args=(url, saveTo), name="%s_Download_Thread" % os.path.basename(url))
but it seems like for the initiation of the second download it waits for the first thread and then goes to download the next file, as printed in shell too.
my plan was to begin all downloads simultaneously and print the updated progress of the files getting downloaded.
Any help will be greatly appreciated.
This is a common problem and here are the steps typically taken:
1.) use Queue.Queue to create a queue of all the urls you would like to visit.
2.) Create a class that inherits from threading.Thread. It should have a run method that grabs a url from the queue and gets the data.
3.) Create a pool of threads based on your class to be "workers"
4.) Don't exit the program until queue.join() has been completed
Your functions are actually running in parallel. You can verify this by printing at the start of each function - 3 outputs will be printed as soon as your program is started.
What's happening is your first two files are so small that they are completely downloaded before the scheduler switches threads. Try setting bigger files in your list:
urls = [
Program output:
Downloading: 100MB.zip Bytes: 104857600
Downloading: 20MB.zip Bytes: 20971520
Downloading: 50MB.zip Bytes: 52428800
Download Completed 100.0% for file 20MB.zip, saved to .
Download Completed 100.0% for file 50MB.zip, saved to .
Download Completed 100.0% for file 100MB.zip, saved to .
I am uploading a large file using the Python requests package, and I can't find any way to give data back about the progress of the upload. I have seen a number of progress meters for downloading a file, but these will not work for a file upload.
The ideal solution would be some sort of callback method such as:
def progress(percent):
print percent
r = requests.post(URL, files={'f':hugeFileHandle}, callback=progress)
Thanks in advance for your help :)
requests doesn't support upload streaming e.g.:
import os
import sys
import requests # pip install requests
class upload_in_chunks(object):
    """Iterable over a file's bytes that reports read progress on stderr.

    ``len()`` returns the file size captured at construction time.
    """

    def __init__(self, filename, chunksize=1 << 13):
        """Remember the file to read and how many bytes to read per chunk."""
        self.filename = filename
        self.chunksize = chunksize
        self.totalsize = os.path.getsize(filename)
        self.readsofar = 0

    def __iter__(self):
        """Yield the file chunk by chunk, writing a percentage to stderr."""
        # Start from zero on every pass so iterating again (e.g. on a retry)
        # does not report more than 100 %.
        self.readsofar = 0
        with open(self.filename, 'rb') as file:
            while True:
                data = file.read(self.chunksize)
                if not data:
                    # End of file: finish the progress line and stop.
                    sys.stderr.write("\n")
                    break
                self.readsofar += len(data)
                percent = self.readsofar * 1e2 / self.totalsize
                sys.stderr.write("\r{percent:3.0f}%".format(percent=percent))
                yield data

    def __len__(self):
        return self.totalsize
# XXX fails
r = requests.post("http://httpbin.org/post",
data=upload_in_chunks(__file__, chunksize=10))
btw, if you don't need to report progress; you could use memory-mapped file to upload large file.
To workaround it, you could create a file adaptor similar to the one from
urllib2 POST progress monitoring:
class IterableToFileAdapter(object):
    """Expose an iterable of byte chunks through a file-like ``read`` method.

    ``len()`` returns ``len(iterable)`` as captured at construction time.
    """

    def __init__(self, iterable):
        self.iterator = iter(iterable)
        self.length = len(iterable)
        # Bytes already pulled from the iterator but not yet handed out.
        self._pending = b''

    def read(self, size=-1):
        """Return the next piece of data, never longer than ``size`` bytes.

        Args:
            size: Maximum number of bytes to return. A negative value or
                ``None`` returns the next available chunk whole.

        Returns:
            bytes: Leftover bytes from a previous oversized chunk if any,
            otherwise the next chunk from the iterable; ``b''`` once the
            iterable is exhausted.
        """
        data = self._pending or next(self.iterator, b'')
        if size is None or size < 0 or len(data) <= size:
            self._pending = b''
            return data
        # The chunk is larger than requested: hand out `size` bytes and keep
        # the remainder for the next call.
        self._pending = data[size:]
        return data[:size]

    def __len__(self):
        return self.length
it = upload_in_chunks(__file__, 10)
r = requests.post("http://httpbin.org/post", data=IterableToFileAdapter(it))
# pretty print
import json
json.dump(r.json, sys.stdout, indent=4, ensure_ascii=False)
I recommend using a package named requests-toolbelt, which makes monitoring the uploaded bytes very easy, like this:
from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor
import requests
def my_callback(monitor):
# Your callback function
print monitor.bytes_read
e = MultipartEncoder(
fields={'field0': 'value', 'field1': 'value',
'field2': ('filename', open('file.py', 'rb'), 'text/plain')}
m = MultipartEncoderMonitor(e, my_callback)
r = requests.post('http://httpbin.org/post', data=m,
headers={'Content-Type': m.content_type})
And you may want to read this to show a progress bar.
I got it working with the code from here: Simple file upload progressbar in PyQt.
I changed it a bit, to use BytesIO instead of StringIO.
class CancelledError(Exception):
    """Error raised when an upload is cancelled.

    The message is kept on ``msg`` and is what both ``str()`` and ``repr()``
    return.
    """

    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg

    def __str__(self):
        return self.msg

    # repr() deliberately reuses the plain message.
    __repr__ = __str__
class BufferReader(BytesIO):
    """In-memory byte stream that reports read progress through a callback."""

    def __init__(self, buf=b'', callback=None, cb_args=(), cb_kwargs=None):
        """Wrap ``buf`` and remember the progress callback.

        Args:
            buf: The bytes to serve.
            callback: Optional callable invoked after every ``read``.
            cb_args: Positional arguments passed to ``callback``.
            cb_kwargs: Extra keyword arguments passed to ``callback``;
                ``size`` and ``progress`` are added on every call.
        """
        self._callback = callback
        self._cb_args = cb_args
        # Copy so the caller's dict (or a shared default) is never mutated
        # by the size/progress update in read().
        self._cb_kwargs = dict(cb_kwargs) if cb_kwargs else {}
        self._progress = 0
        self._len = len(buf)
        BytesIO.__init__(self, buf)

    def __len__(self):
        return self._len

    def read(self, n=-1):
        """Read up to ``n`` bytes and notify the callback of the new progress.

        Raises:
            CancelledError: If the callback raises an exception.
        """
        chunk = BytesIO.read(self, n)
        self._progress += int(len(chunk))
        self._cb_kwargs.update({
            'size': self._len,
            'progress': self._progress
        })
        if self._callback:
            try:
                self._callback(*self._cb_args, **self._cb_kwargs)
            except Exception:  # catches exception from the callback
                raise CancelledError('The upload was cancelled.')
        return chunk
def progress(size=None, progress=None):
print("{0} / {1}".format(size, progress))
files = {"upfile": ("file.bin", open("file.bin", 'rb').read())}
(data, ctype) = requests.packages.urllib3.filepost.encode_multipart_formdata(files)
headers = {
"Content-Type": ctype
body = BufferReader(data, progress)
requests.post(url, data=body, headers=headers)
The trick is, to generate data and header from the files list manually, using encode_multipart_formdata() from urllib3
I know this is an old question, but I couldn't find an easy answer anywhere else, so hopefully this will help somebody else:
import requests
# Import the tqdm callable itself: a plain `import tqdm` binds the module,
# and calling a module raises "TypeError: 'module' object is not callable".
from tqdm import tqdm

with open(file_name, 'rb') as f:
    # readlines() loads the whole file into a list of lines; tqdm then
    # advances the bar once per line as requests consumes the iterable.
    r = requests.post(url, data=tqdm(f.readlines()))
This solution uses requests_toolbelt and tqdm both well maintained and popular libraries.
from pathlib import Path
from tqdm import tqdm
import requests
from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor
def upload_file(upload_url, fields, filepath):
    """POST a file as multipart form data while showing a tqdm progress bar.

    Args:
        upload_url: URL that receives the POST request.
        fields: Mapping of additional form fields. It is copied, so the
            caller's mapping is left untouched.
        filepath: Path of the file to upload.

    Returns:
        The response object returned by ``requests.post``.
    """
    path = Path(filepath)
    total_size = path.stat().st_size
    filename = path.name

    # NOTE(review): total is the size of the file on disk, while the monitor
    # counts bytes of the encoded multipart body; the bar may therefore end
    # slightly above its total -- confirm this is acceptable.
    with tqdm(
        desc=filename,
        total=total_size,
        unit="B",
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        with open(filepath, "rb") as f:
            form_fields = dict(fields)
            form_fields["file"] = ("filename", f)
            e = MultipartEncoder(fields=form_fields)
            # bytes_read is cumulative, tqdm.update() expects an increment.
            m = MultipartEncoderMonitor(
                e, lambda monitor: bar.update(monitor.bytes_read - bar.n)
            )
            headers = {"Content-Type": m.content_type}
            return requests.post(upload_url, data=m, headers=headers)
Example usage
upload_url = 'https://uploadurl'
fields = {
"field1": value1,
"field2": value2
filepath = '97a6fce8_owners_2018_Van Zandt.csv'
upload_file(upload_url, fields, filepath)
Usually you would build a streaming data source (a generator) that reads the file in chunks and reports its progress along the way (see kennethreitz/requests#663). This does not work with the requests file API, because requests doesn't support streaming uploads (see kennethreitz/requests#295) – a file to upload needs to be completely in memory before it starts being processed.
but requests can stream content from a generator as J.F. Sebastian has proven before, but this generator needs to generate the complete datastream including the multipart encoding and boundaries. This is where poster comes to play.
poster is originally written to be used with pythons urllib2 and supports streaming generation of multipart requests, providing progress indication as it goes along. Posters Homepage provides examples of using it together with urllib2 but you really don’t want to use urllib2. Check out this example-code on how to to HTTP Basic Authentication with urllib2. Horrrrrrrrible.
So we really want to use poster together with requests to do file uploads with tracked progress. And here is how:
# load requests-module, a streamlined http-client lib
import requests
# load posters encode-function
from poster.encode import multipart_encode
# an adapter which makes the multipart-generator issued by poster accessable to requests
# based upon code from http://stackoverflow.com/a/13911048/1659732
class IterableToFileAdapter(object):
    """Expose poster's multipart generator through a file-like ``read`` method.

    ``len()`` returns the ``total`` attribute of the wrapped iterable.
    """

    def __init__(self, iterable):
        self.iterator = iter(iterable)
        self.length = iterable.total
        # Bytes already pulled from the iterator but not yet handed out.
        self._pending = b''

    def read(self, size=-1):
        """Return the next piece of data, never longer than ``size`` bytes.

        Args:
            size: Maximum number of bytes to return. A negative value or
                ``None`` returns the next available chunk whole.

        Returns:
            bytes: Leftover bytes from a previous oversized chunk if any,
            otherwise the next chunk from the iterable; ``b''`` once the
            iterable is exhausted.
        """
        data = self._pending or next(self.iterator, b'')
        if size is None or size < 0 or len(data) <= size:
            self._pending = b''
            return data
        # The chunk is larger than requested: hand out `size` bytes and keep
        # the remainder for the next call.
        self._pending = data[size:]
        return data[:size]

    def __len__(self):
        return self.length
# define a helper function simulating the interface of posters multipart_encode()-function
# but wrapping its generator with the file-like adapter
def multipart_encode_for_requests(params, boundary=None, cb=None):
datagen, headers = multipart_encode(params, boundary, cb)
return IterableToFileAdapter(datagen), headers
# this is your progress callback
def progress(param, current, total):
    """Print one progress line for the multipart parameter being sent.

    Args:
        param: The parameter currently being encoded; calls with a false
            value are ignored.
        current: Number of bytes sent so far.
        total: Total number of bytes to send.
    """
    if not param:
        return

    # check out http://tcd.netinf.eu/doc/classnilib_1_1encode_1_1MultipartParam.html
    # for a complete list of the properties param provides to you
    # A zero total would divide by zero; an empty payload counts as complete.
    percent = float(current) / float(total) * 100 if total else 100.0
    print("{0} ({1}) - {2:d}/{3:d} - {4:.2f}%".format(param.name, param.filename, current, total, percent))
# generate headers and gata-generator an a requests-compatible format
# and provide our progress-callback
datagen, headers = multipart_encode_for_requests({
"input_file": open('recordings/really-large.mp4', "rb"),
"another_input_file": open('recordings/even-larger.mp4', "rb"),
"field": "value",
"another_field": "another_value",
}, cb=progress)
# use the requests-lib to issue a post-request with out data attached
r = requests.post(
auth=('user', 'password'),
# show response-code and -body
print r, r.text
My upload server doesn't support chunked transfer encoding, so I came up with this solution. It is basically just a wrapper around Python's IOBase that allows tqdm.wrapattr to work seamlessly.
import io
import os
from typing import Iterable, Union

import requests
from tqdm import tqdm
from tqdm.utils import CallbackIOWrapper
class UploadChunksIterator(Iterable):
    """
    This is an interface between python requests and tqdm.
    Make tqdm to be accessed just like IOBase for requests lib.

    Iterating yields successive ``chunk_size`` reads from ``file`` until a
    read comes back empty; ``len()`` reports ``total_size``.
    """

    def __init__(
        self,
        file: "Union[io.BufferedReader, CallbackIOWrapper]",
        total_size: int,
        chunk_size: int = 16 * 1024,
    ):  # default chunk size is 16 KiB
        # The annotation on `file` is a string so it is not evaluated when
        # the class is defined.
        self.file = file
        self.chunk_size = chunk_size
        self.total_size = total_size

    def __iter__(self):
        return self

    def __next__(self):
        data = self.file.read(self.chunk_size)
        if not data:
            raise StopIteration
        return data

    # we dont retrive len from io.BufferedReader because CallbackIOWrapper only has read() method.
    def __len__(self):
        return self.total_size
fp = "data/mydata.mp4"
s3url = "example.com"
_quiet = False
with open(fp, "rb") as f:
total_size = os.fstat(f.fileno()).st_size
if not _quiet:
f = tqdm.wrapattr(f, "read", desc=hv, miniters=1, total=total_size, ascii=True)
with f as f_iter:
res = requests.put(
data=UploadChunksIterator(f_iter, total_size=total_size),
Making #jfs' answer better in terms of an informative progress bar.
import math
import os
import requests
import sys
class ProgressUpload:
    """Iterable over a file's bytes that prints an upload progress line.

    Progress goes to stderr in a unit (B, KB, MB or GB) chosen from the file
    size; ``len()`` returns the file size in bytes.
    """

    def __init__(self, filename, chunk_size=1250):
        """Record the file to read and pick the display unit.

        Args:
            filename: Path of the file to upload.
            chunk_size: Number of bytes yielded per iteration step.
        """
        self.filename = filename
        self.chunk_size = chunk_size
        self.file_size = os.path.getsize(filename)
        self.size_read = 0
        # Pick the largest power of 1000 that fits the size, capped at GB.
        # Integer comparisons are used so an empty file never reaches log(0).
        exponent = 0
        while exponent < 9 and self.file_size >= 10 ** (exponent + 3):
            exponent += 3
        self.unit = {0: 'B', 3: 'KB', 6: 'MB', 9: 'GB'}[exponent]
        self.divisor = 10 ** exponent

    def __iter__(self):
        """Yield the file in chunks, refreshing the progress text in between."""
        # Restart the counter so a second pass does not exceed 100 %.
        self.size_read = 0
        progress_str = f'0 / {self.file_size / self.divisor:.2f} {self.unit} (0 %)'
        sys.stderr.write(f'\rUploading {self.filename}: {progress_str}')
        with open(self.filename, 'rb') as f:
            for chunk in iter(lambda: f.read(self.chunk_size), b''):
                self.size_read += len(chunk)
                yield chunk
                # Backspace over the previous text before writing the new one.
                sys.stderr.write('\b' * len(progress_str))
                percentage = self.size_read / self.file_size * 100
                completed_str = f'{self.size_read / self.divisor:.2f}'
                to_complete_str = f'{self.file_size / self.divisor:.2f} {self.unit}'
                progress_str = f'{completed_str} / {to_complete_str} ({percentage:.2f} %)'
                sys.stderr.write(progress_str)

    def __len__(self):
        return self.file_size
# sample usage
requests.post(upload_url, data=ProgressUpload('file_path'))
The key is the __len__ method. Without it, I was getting connection closed errors. That's the only reason you can't just use tqdm + iter to get a simple progress bar.
My python code that works great. Credit : twine
import sys
import tqdm
import requests
import requests_toolbelt
class ProgressBar(tqdm.tqdm):
    """tqdm bar that can be moved to an absolute position."""

    def update_to(self, n: int) -> None:
        """Advance the bar by the difference between ``n`` and its counter."""
        increment = n - self.n
        self.update(increment)
with open("test.zip", "rb") as fp:
data_to_send = []
session = requests.session()
("files", ("test.zip", fp))
encoder = requests_toolbelt.MultipartEncoder(data_to_send)
with ProgressBar(
) as bar:
monitor = requests_toolbelt.MultipartEncoderMonitor(
encoder, lambda monitor: bar.update_to(monitor.bytes_read)
r = session.post(
headers={"Content-Type": monitor.content_type},
I want to get many pages from a website, like
curl "http://farmsubsidy.org/DE/browse?page=[0000-3603]" -o "de.#1"
but get the pages' data in python, not disk files.
Can someone please post pycurl code to do this,
or fast urllib2 (not one-at-a-time) if that's possible,
or else say "forget it, curl is faster and more robust" ? Thanks
So you have two problems; let me address both in one example. Note that pycurl already does the concurrent (not one-at-a-time) fetching for you, without any extra work on your part.
#! /usr/bin/env python
import sys, select, time
import pycurl,StringIO
c1 = pycurl.Curl()
c2 = pycurl.Curl()
c3 = pycurl.Curl()
c1.setopt(c1.URL, "http://www.python.org")
c2.setopt(c2.URL, "http://curl.haxx.se")
c3.setopt(c3.URL, "http://slashdot.org")
s1 = StringIO.StringIO()
s2 = StringIO.StringIO()
s3 = StringIO.StringIO()
c1.setopt(c1.WRITEFUNCTION, s1.write)
c2.setopt(c2.WRITEFUNCTION, s2.write)
c3.setopt(c3.WRITEFUNCTION, s3.write)
m = pycurl.CurlMulti()
# Number of seconds to wait for a timeout to happen
# Stir the state machine into action
while 1:
ret, num_handles = m.perform()
if ret != pycurl.E_CALL_MULTI_PERFORM:
# Keep going until all the connections have terminated
while num_handles:
# The select method uses fdset internally to determine which file descriptors
# to check.
while 1:
ret, num_handles = m.perform()
if ret != pycurl.E_CALL_MULTI_PERFORM:
# Cleanup
print "http://www.python.org is ",s1.getvalue()
print "http://curl.haxx.se is ",s2.getvalue()
print "http://slashdot.org is ",s3.getvalue()
Finally, these code is mainly based on an example on the pycurl site =.=
may be you should really read doc. ppl spend huge time on it.
here is a solution based on urllib2 and threads.
import urllib2
from threading import Thread
BASE_URL = 'http://farmsubsidy.org/DE/browse?page='
NUM_RANGE = range(0000, 3603)
def main():
for nums in split_seq(NUM_RANGE, THREADS):
t = Spider(BASE_URL, nums)
def split_seq(seq, num_pieces):
    """Yield ``num_pieces`` contiguous slices that together cover ``seq``.

    Slice lengths differ by at most one; the longer slices come first.

    Args:
        seq: A sliceable sequence with a length (list, tuple, range, ...).
        num_pieces: Number of slices to produce.
    """
    start = 0
    # range() instead of xrange(): xrange does not exist on Python 3.
    for i in range(num_pieces):
        # The stride slice seq[i::num_pieces] has exactly the length the
        # i-th piece must have for an even split.
        stop = start + len(seq[i::num_pieces])
        yield seq[start:stop]
        start = stop
class Spider(Thread):
    """Thread that fetches ``base_url + num`` for each of its numbers."""

    def __init__(self, base_url, nums):
        """Store the URL prefix and the page numbers this thread handles."""
        # Thread.__init__ must run before the thread can be started.
        Thread.__init__(self)
        self.base_url = base_url
        self.nums = nums

    def run(self):
        """Download every assigned page in order and print its body."""
        for num in self.nums:
            url = '%s%s' % (self.base_url, num)
            data = urllib2.urlopen(url).read()
            print(data)
if __name__ == '__main__':
You can just put that into a bash script inside a for loop.
However you may have better success at parsing each page using python.
You will be able to get at the exact data and save it at the same time into a db.
If you want to crawl a website using python, you should have a look to scrapy http://scrapy.org
Using BeautifulSoup4 and requests -
Grab head page:
page = Soup(requests.get(url='http://rootpage.htm').text)
Create an array of requests:
from requests import async
requests = [async.get(url.get('href')) for url in page('a')]
responses = async.map(requests)
[dosomething(response.text) for response in responses]
Requests requires gevent to do this btw.
I can recommend the async module of human_curl.
Here is an example:
from urlparse import urljoin
from datetime import datetime
from human_curl.async import AsyncClient
from human_curl.utils import stdout_debug
def success_callback(response, **kwargs):
    """Print a notice, the response and the request it belongs to.

    Args:
        response: The response object; its ``request`` attribute is printed
            alongside it.
        **kwargs: Accepted and ignored.
    """
    print("success callback")
    print(response, response.request)
def fail_callback(request, opener, **kwargs):
    """Print a notice followed by the failed request and its opener.

    Args:
        request: The request that failed.
        opener: The opener object passed along with the failure.
        **kwargs: Accepted and ignored.
    """
    print("fail callback")
    print(request, opener)
with AsyncClient(success_callback=success_callback,
fail_callback=fail_callback) as async_client:
for x in xrange(10000):
async_client.get('http://google.com/', params=(("x", str(x)),)
async_client.get('http://google.com/', params=(("x", str(x)),),
success_callback=success_callback, fail_callback=fail_callback)
Usage is very simple: when a page loads successfully or fails, async_client calls your callback. You can also specify the number of parallel connections.