How to display progress of downloading a YouTube video using pytube? - python

I've been on multiple forums and I've read the docs. I still don't understand how to make this work.
How do I show the progress while the video is being downloaded? Do I need to provide parameters? I see a lot of people writing yt = YouTube(url, on_progress) without parentheses or arguments after on_progress, so I'm very confused.
I don't know what file_handle should be, I also don't know where I can get the bytes_remaining.
Thank you in advance
def on_progress(stream, chunk, file_handle, bytes_remaining):
    """Print how much of the download has completed, as a percentage.

    This is the four-parameter signature posted in the question; the
    answer below explains that the registered callback receives only
    ``stream``, ``chunk`` and ``bytes_remaining``.
    """
    total_size = stream.filesize
    bytes_downloaded = total_size - bytes_remaining
    percentage_of_completion = bytes_downloaded / total_size * 100
    print(percentage_of_completion)
def main():
    """Fetch a video, print its details and download it (question code)."""
    chunk_size = 1024
    url = "https://www.youtube.com/watch?v=GceNsojnMf0"
    yt = YouTube(url)
    video = yt.streams.get_highest_resolution()
    # NOTE: this line is what the question is about and is kept as posted.
    # It *calls* on_progress once and registers its return value (None)
    # instead of registering the function itself.
    yt.register_on_progress_callback(on_progress(video, chunk_size, 'D:\\Videos', video.filesize))
    print(f"Fetching \"{video.title}\"..")
    print(f"Fetching successful\n")
    print(f"Information: \n"
          f"File size: {round(video.filesize * 0.000001, 2)} MegaBytes\n"
          f"Highest Resolution: {video.resolution}\n"
          f"Author: {yt.author}")
    print("Views: {:,}\n".format(yt.views))
    print(f"Downloading \"{video.title}\"..")
    video.download('D:\\Videos')

The method register_on_progress_callback() of the YouTube object needs the callback function itself, not the result of calling it. You also need to update the parameters of your on_progress function: only three arguments are passed to the registered callback (stream, chunk and bytes_remaining). You can update your code like this:
def on_progress(stream, chunk, bytes_remaining):
    """Progress callback: print the completed share of the download in percent.

    Args:
        stream: the stream being downloaded; only ``filesize`` is read.
        chunk: the data chunk that was just received (unused).
        bytes_remaining: number of bytes still to be downloaded.
    """
    total_size = stream.filesize
    bytes_downloaded = total_size - bytes_remaining
    percentage_of_completion = bytes_downloaded / total_size * 100
    print(percentage_of_completion)
def main():
    """Fetch a video, print its details and download it with live progress."""
    url = "https://www.youtube.com/watch?v=GceNsojnMf0"
    yt = YouTube(url)
    video = yt.streams.get_highest_resolution()
    # Register the function object itself (no parentheses); it is invoked
    # by the library while the download runs.
    yt.register_on_progress_callback(on_progress)
    print(f"Fetching \"{video.title}\"..")
    print("Fetching successful\n")
    print(f"Information: \n"
          f"File size: {round(video.filesize * 0.000001, 2)} MegaBytes\n"
          f"Highest Resolution: {video.resolution}\n"
          f"Author: {yt.author}")
    print("Views: {:,}\n".format(yt.views))
    print(f"Downloading \"{video.title}\"..")
    video.download('D:\\Videos')


if __name__ == "__main__":
    main()

Related

Streaming video files using Flask

Please help me to understand one moment.
I am trying to make Flask stream an .mp4 video. I know that I can use Response(generator_function()),
but that does not allow jumping to a specific minute while watching the video in the browser.
So I am trying to use the Range header. Here is how I tried it:
app = Flask(__name__)
def get_chunk(byte1=None, byte2=None):
    """Yield the bytes of ``try2.mp4`` in pieces of at most 1 MiB.

    Args:
        byte1: offset of the first byte to send; ``None`` means start at 0
            and send the whole file.
        byte2: offset at which to stop (exclusive); a falsy value means
            "up to the end of the file". Only honoured when ``byte1`` is
            given.

    The file is read piece by piece instead of being loaded into memory
    in one go, which matters for large videos.
    """
    path = 'try2.mp4'
    yield_size = 1024 * 1024
    start = 0
    stop = os.path.getsize(path)
    if byte1 is not None:
        start = byte1
        if byte2:
            stop = byte2
    with open(path, 'rb') as f:
        f.seek(start)
        remaining = stop - start
        while remaining > 0:
            data = f.read(min(yield_size, remaining))
            if not data:
                # Requested range runs past the end of the file.
                break
            remaining -= len(data)
            yield data
@app.route('/')
def get_file():
    """Serve ``try2.mp4``, honouring a ``Range`` request header if present.

    Without a usable ``Range`` header the whole file is streamed with
    status 200. Otherwise the requested window (at most 1 MiB when the
    range is open-ended) is returned with status 206 and a matching
    ``Content-Range`` header.
    """
    filesize = os.path.getsize('try2.mp4')
    range_header = flask_request.headers.get('Range', None)
    match = re.search(r'(\d+)-(\d*)', range_header) if range_header else None
    if match is None:
        return Response(
            get_chunk(),
            status=200, mimetype='video/mp4'
        )

    first, last = match.groups()
    byte1 = int(first)
    if byte1 >= filesize:
        # Nothing to send: the range starts beyond the end of the file.
        return Response(
            status=416,
            headers={'Content-Range': 'bytes */{0}'.format(filesize)}
        )
    # byte2 is the offset of the LAST byte to send (inclusive).
    byte2 = int(last) if last else byte1 + 1024 * 1024
    byte2 = min(byte2, filesize - 1)

    resp = Response(
        # get_chunk stops *before* its second argument, hence the + 1.
        get_chunk(byte1, byte2 + 1),
        status=206, mimetype='video/mp4',
        content_type='video/mp4',
        direct_passthrough=True
    )
    # Content-Range is "first-last/total", not "first-length/total".
    resp.headers.add('Content-Range',
                     'bytes {0}-{1}/{2}'
                     .format(byte1,
                             byte2,
                             filesize))
    return resp
@app.after_request
def after_request(response):
    """Add an ``Accept-Ranges: bytes`` header to every response."""
    response.headers.add('Accept-Ranges', 'bytes')
    return response
get_chunk yields chunks from byte1 to byte2 if these bytes are specified, and from 0 to filesize otherwise (chunk size = 1 MB).
But it does not work.
I see that the browser first sends a request that gets a <200> status, and then one that gets <206>. Please advise me how to make it work.
On development server you need to enable threaded=True for video stream to work correctly.
Updated:
import os
import re
...
@app.after_request
def after_request(response):
    """Add an ``Accept-Ranges: bytes`` header to every response."""
    response.headers.add('Accept-Ranges', 'bytes')
    return response
def get_chunk(byte1=None, byte2=None):
    """Read one byte range of ``try2.mp4``.

    Args:
        byte1: offset of the first byte; ``None`` or an offset outside the
            file falls back to 0.
        byte2: offset of the last byte (inclusive); ``None`` means "to the
            end of the file". Values past the end are clamped.

    Returns:
        ``(chunk, start, length, file_size)`` where ``length`` is the
        number of bytes in ``chunk``.
    """
    full_path = "try2.mp4"
    file_size = os.stat(full_path).st_size

    start = 0
    if byte1 is not None and 0 <= byte1 < file_size:
        start = byte1

    last = file_size - 1
    if byte2 is not None:
        last = min(byte2, last)
    # Derive the length from the clamped offsets so the caller's
    # Content-Range always matches the bytes actually returned.
    length = max(last - start + 1, 0)

    with open(full_path, 'rb') as f:
        f.seek(start)
        chunk = f.read(length)
    return chunk, start, length, file_size
@app.route('/video')
def get_file():
    """Return the byte range of the video requested via the ``Range`` header.

    A missing or unparseable header is treated as "from byte 0 to the end".
    The response is always sent with status 206 and a ``Content-Range``
    header built from what ``get_chunk`` reports.
    """
    range_header = request.headers.get('Range', None)
    byte1, byte2 = 0, None
    if range_header:
        match = re.search(r'(\d+)-(\d*)', range_header)
        # The pattern does not match e.g. suffix ranges ("bytes=-500");
        # fall back to the defaults instead of failing on None.
        if match:
            first, last = match.groups()
            byte1 = int(first)
            if last:
                byte2 = int(last)

    chunk, start, length, file_size = get_chunk(byte1, byte2)
    resp = Response(chunk, 206, mimetype='video/mp4',
                    content_type='video/mp4', direct_passthrough=True)
    resp.headers.add('Content-Range', 'bytes {0}-{1}/{2}'.format(start, start + length - 1, file_size))
    return resp
if __name__ == '__main__':
    # threaded=True lets the development server handle requests concurrently.
    app.run(threaded=True)
Okay, this might be coming late, but here is a simplified version I wrote. It is the same concept as above, but better and simpler, I think.
import os
import re
from flask import render_template, request, Blueprint, current_app, send_file
core = Blueprint("core", __name__)
# your request handlers go here, registered with @core.route()
@core.route("/")
def home():
    """Render the index page."""
    return render_template("index.html")
@core.route("/video", methods=["GET"])
def video():
    """Serve one small slice of ``media/test.mp4`` per ranged request.

    Responds 400 when the ``Range`` header is missing or cannot be parsed,
    416 when the range starts beyond the end of the file, and otherwise
    206 with the bytes ``start`` .. ``end`` (inclusive).
    """
    headers = request.headers
    if "range" not in headers:
        return current_app.response_class(status=400)

    video_path = os.path.abspath(os.path.join("media", "test.mp4"))
    size = os.stat(video_path).st_size
    chunk_size = 10**3

    # Take only the start offset. Stripping every non-digit would glue the
    # start and end offsets together ("bytes=0-100" -> 0100 -> 100).
    match = re.search(r"(\d+)-", headers["range"])
    if match is None:
        return current_app.response_class(status=400)
    start = int(match.group(1))
    if start >= size:
        return current_app.response_class(
            status=416, headers={"Content-Range": f"bytes */{size}"}
        )

    end = min(start + chunk_size, size - 1)
    content_length = end - start + 1

    with open(video_path, "rb") as f:
        f.seek(start)
        # Read exactly the advertised number of bytes, not `end` bytes.
        chunk = f.read(content_length)

    headers = {
        "Content-Range": f"bytes {start}-{end}/{size}",
        "Accept-Ranges": "bytes",
        "Content-Length": str(content_length),
        "Content-Type": "video/mp4",
    }
    return current_app.response_class(chunk, 206, headers)

How to add progress bar?

Is there a way to add progress bar in pytube? I don't know how to use the following method:
pytube.Stream().on_progress(chunk, file_handler, bytes_remaining)
My code:
from pytube import YouTube
# from pytube import Stream
from general import append_to_file
def downloader(video_link, down_dir=None):
    """Download a video and, if available, its English captions as .srt.

    Args:
        video_link: URL of the video.
        down_dir: directory passed to ``download``; when ``None`` the
            library's default location is used.

    Any exception is reported on stdout and appended to the 'debug' file
    via ``append_to_file``; nothing is re-raised.
    """
    try:
        tube = YouTube(video_link)
        title = tube.title
        print("Now downloading, " + str(title))
        video = tube.streams.filter(progressive=True, file_extension='mp4').first()
        print('FileSize : ' + str(round(video.filesize/(1024*1024))) + 'MB')
        # print(tube.streams.filter(progressive=True, file_extension='mp4').first())
        # Stream(video).on_progress()
        if down_dir is not None:
            video.download(down_dir)
        else:
            video.download()
        print("Download complete, " + str(title))
        caption = tube.captions.get_by_language_code('en')
        if caption is not None:
            subtitle = caption.generate_srt_captions()
            # Context manager so the subtitle file is flushed and closed.
            with open(title + '.srt', 'w') as srt_file:
                srt_file.write(subtitle)
    except Exception as e:
        print("ErrorDownloadVideo | " + str(video_link))
        append_to_file('debug', format(e))
# FILESIZE print(tube.streams.filter(progressive=True, file_extension='mp4').first().filesize/(1024*1024))
You can also do like this without writing your own function.
code:
from pytube import YouTube
from pytube.cli import on_progress #this module contains the built in progress bar.
# Ask for the video URL on stdin.
link=input('enter url:')
# on_progress (imported from pytube.cli) is handed over as the progress callback.
yt=YouTube(link,on_progress_callback=on_progress)
# Take the first stream in the list and download it.
videos=yt.streams.first()
videos.download()
print("(:")
Call your progress function inside the Youtube class
yt = YouTube(video_link, on_progress_callback=progress_function)
This is your progress function
def progress_function(self, stream, chunk, file_handle, bytes_remaining):
    """Print the share of the file that is still to be downloaded.

    The value comes from ``self.percent`` applied to ``bytes_remaining``,
    so it counts down towards 0 as the download proceeds.
    """
    size = stream.filesize
    # One report per callback invocation. Looping here would never end,
    # because bytes_remaining cannot change within a single call.
    p = self.percent(bytes_remaining, size)
    print(str(p) + '%')
This computes the percentage converting the file size and the bytes remaining
def percent(self, tem, total):
    """Return ``tem`` as a percentage of ``total`` (float)."""
    perc = (float(tem) / float(total)) * float(100)
    return perc
The callback function takes three arguments, not four: stream, chunk and bytes_remaining.
I know this is already answered, but I came across this and for me, the progress was counting down from 100 to 0. Since I wanted to fill a progress bar with the percentage value, I couldn't use this.
So I came up with this solution:
def progress_func(self, stream, chunk, file_handle, bytes_remaining):
    """Update the progress bar with the completed share (0..100)."""
    size = self.video.filesize
    # Downloaded bytes over total, so the value counts up rather than down.
    progress = abs(bytes_remaining - size) / size * 100
    # NOTE(review): setValue is given an int here on the assumption that
    # the Qt progress bar expects an integer value - confirm.
    self.loadbar.setValue(int(progress))
The loadbar is my Progress Bar from PyQt5.
Hope this helps someone.
This is something interesting!
We can emulate the download animation of linux with the following code:
def progress_function(chunk, file_handle, bytes_remaining):
    """Draw a 50-character text progress bar on the current terminal line.

    Reads the total size from the module-level ``filesize``. The trailing
    carriage return moves the cursor back so the next call overwrites
    the bar in place.
    """
    global filesize
    current = ((filesize - bytes_remaining)/filesize)
    percent = ('{0:.1f}').format(current*100)
    progress = int(50*current)
    status = '█' * progress + '-' * (50 - progress)
    sys.stdout.write(' ↳ |{bar}| {percent}%\r'.format(bar=status, percent=percent))
    sys.stdout.flush()
yt_obj = YouTube('<<some youtube video URL>>', on_progress_callback=progress_function)
Output looks like:
↳ |██████████████████████████████████----------------| 68.4%
Have fun!!
Somewhat shorter option:
# Prints something like "15.555 % done..."
def progress_function(stream, chunk, file_handle, bytes_remaining):
    # Use the stream handed to the callback rather than a global that is
    # only assigned after the YouTube object has been created.
    print(round((1-bytes_remaining/stream.filesize)*100, 3), '% done...')


# The callback must exist before it is passed to YouTube().
yt = YouTube(video_link, on_progress_callback=progress_function)
video = yt.streams.first() # or whatever
You can, of course, limit the progress output, for instance, to values like 10, 20, 30%... - just surround the print statement with the required if-clause.
Here is a bit advanced version
def on_progress(vid, chunk, bytes_remaining):
    """Print percentage, total, downloaded and remaining size (in MB).

    Args:
        vid: the stream being downloaded; only ``filesize`` is read.
        chunk: the chunk just received (unused).
        bytes_remaining: number of bytes still to be downloaded.
    """
    def to_mb(num_bytes):
        # Bytes -> MB (1024 * 1024 bytes), one decimal place.
        return round((num_bytes / 1024) / 1024, 1)

    total_size = vid.filesize
    bytes_downloaded = total_size - bytes_remaining
    percentage_of_completion = round(bytes_downloaded / total_size * 100, 2)
    totalsz = to_mb(total_size)
    remain = to_mb(bytes_remaining)
    dwnd = to_mb(bytes_downloaded)
    print(f'Download Progress: {percentage_of_completion}%, Total Size:{totalsz} MB, Downloaded: {dwnd} MB, Remaining:{remain} MB')
yt.register_on_progress_callback(on_progress)
from pytube import Playlist
from pytube import YouTube
previousprogress = 0


def on_progress(stream, chunk, bytes_remaining):
    """Print the progress as a whole percentage, once per new value.

    The last printed value is kept in the module-level
    ``previousprogress`` so repeated callbacks within the same percent
    produce no output.
    """
    global previousprogress
    total_size = stream.filesize
    bytes_downloaded = total_size - bytes_remaining

    liveprogress = int(bytes_downloaded / total_size * 100)
    if liveprogress > previousprogress:
        previousprogress = liveprogress
        print(liveprogress)
yt = YouTube('https://www.youtube.com/watch?v=4zqKJBxRyuo&ab_channel=SleepEasyRelax-KeithSmith')
yt.register_on_progress_callback(on_progress)
yt.streams.filter(only_audio=True).first().download()
You can add progress bar like this. ignore silly type error (if any)
pytube.request.default_range_size = 1048576  # this is for chunk size, 1MB size


def progress_callback(stream, chunk, bytes_remaining):
    """Print the completed share of the download as a whole percentage."""
    size = stream.filesize
    progress = int(((size - bytes_remaining) / size) * 100)
    print(progress)
    # do call progress bar from GUI here


def complete_callback(stream, file_handle):
    """Report that the download has finished."""
    print("downloading finished")
    # progress bar stop call from GUI here


yt = YouTube(url)
# Register the callbacks BEFORE starting the download; callbacks
# registered afterwards are never invoked for it.
yt.register_on_progress_callback(progress_callback)
yt.register_on_complete_callback(complete_callback)
video = yt.streams.first()
video.download()  # pass the output directory here if needed

Python: How to calculate multithreaded download speed

I wrote a multi-threaded HTTP downloader. It can now download a file faster than a single-threaded downloader, and the MD5 sum is correct. However, the speed it shows is so high that I do not believe it is the true value.
The unit is not printed yet, but I am sure it is KB/s; please take a look at the code that does the measurement.
# Setup the slaver
def _download(self):
# Start download partital content when queue not empty
while not self.configer.down_queue.empty():
data_range = self.configer.down_queue.get()
headers = {
'Range': 'bytes={}-{}'.format(*data_range)
}
response = requests.get(
self.configer.url, stream = True,
headers = headers
)
start_point = data_range[0]
for bunch in response.iter_content(self.block_size):
_time = time.time()
with self.file_lock:
with open(
self.configer.path, 'r+b',
buffering = 1
) as f:
f.seek(start_point)
f.write(bunch)
f.flush()
start_point += self.block_size
self.worker_com.put((
threading.current_thread().name,
int(self.block_size / (time.time() - _time))
))
self.configer.down_queue.task_done()
# speed monitor
def speed_monitor(self):
    """Collect per-thread speed reports and redraw the status line.

    Runs while ``self.thread_list`` is non-empty. Each report taken from
    ``self.worker_com`` is stored in ``self.speed`` under the thread name;
    when no report is waiting the loop sleeps for 0.1 s and retries.
    """
    while len(self.thread_list)>0:
        try:
            info = self.worker_com.get_nowait()
            self.speed[info[0]] = info[1]
        except queue.Empty:
            time.sleep(0.1)
            continue
        # Backspaces move the cursor back so the line is overwritten.
        sys.stdout.write('\b'*64 + '{:10}'.format(self.total_speed)
            + ' thread num ' + '{:2}'.format(self.worker_count))
        sys.stdout.flush()
If you need more information, please visit my GitHub repository. I would appreciate it if you could point out my error. Thanks.

Downloading a large archive from AWS Glacier using Boto

I am trying to download a large archive (~ 1 TB) from Glacier using the Python package, Boto. The current method that I am using looks like this:
import os
import boto.glacier
import boto
import time
ACCESS_KEY_ID = 'XXXXX'
SECRET_ACCESS_KEY = 'XXXXX'
VAULT_NAME = 'XXXXX'
ARCHIVE_ID = 'XXXXX'
OUTPUT = 'XXXXX'
layer2 = boto.connect_glacier(aws_access_key_id = ACCESS_KEY_ID,
                              aws_secret_access_key = SECRET_ACCESS_KEY)
gv = layer2.get_vault(VAULT_NAME)
# Start a retrieval job for the archive and remember its id.
job = gv.retrieve_archive(ARCHIVE_ID)
job_id = job.id
# Poll every 10 seconds until the job reports completion.
while not job.completed:
    time.sleep(10)
    job = gv.get_job(job_id)
if job.completed:
    print("Downloading archive")
    job.download_to_file(OUTPUT)
The problem is that the job ID expires after 24 hours, which is not enough time to retrieve the entire archive. I will need to break the download into at least 4 pieces. How can I do this and write the output to a single file?
It seems that you can simply specify the chunk_size parameter when calling job.download_to_file like so :
if job.completed:
    print("Downloading archive")
    # Fetch the archive in pieces of 1 MiB.
    job.download_to_file(OUTPUT, chunk_size=1024*1024)
However, if you can't download all the chunks within the 24 hours, I don't think you can choose to download only the ones you missed using layer2.
First method
Using layer1 you can simply use the method get_job_output and specify the byte-range you want to download.
It would look like that :
import os

CHUNK_SIZE = 1024 * 1024

# Resume from however many bytes a previous run has already written.
file_size = os.path.getsize(OUTPUT) if os.path.exists(OUTPUT) else 0
if job.completed:
    print("Downloading archive")
    # Append: opening with 'wb' would truncate the file and throw away
    # everything downloaded so far.
    with open(OUTPUT, 'ab') as output_file:
        # NOTE(review): assumes the boto 2.x Job exposes the archive size
        # as `archive_size` - confirm against the installed version.
        while file_size < job.archive_size:
            # Byte ranges are inclusive at both ends, so the last byte of a
            # piece is start + size - 1 (otherwise pieces overlap by one).
            last_byte = min(file_size + CHUNK_SIZE, job.archive_size) - 1
            # NOTE(review): get_job_output belongs to the layer1 API and
            # returns a response object whose body is read with .read() -
            # confirm against the installed boto version.
            response = layer2.layer1.get_job_output(VAULT_NAME, job_id, (file_size, last_byte))
            data = response.read()
            if not data:
                break
            output_file.write(data)
            file_size += len(data)
With this script you should be able to rerun the script when it fails and continue to download your archive where you left it.
Second method
By digging in the boto code I found a "private" method in the Job class that you might also use : _download_byte_range. With this method you can still use layer2.
import os
import socket

CHUNK_SIZE = 1024 * 1024

# Resume from however many bytes a previous run has already written.
file_size = os.path.getsize(OUTPUT) if os.path.exists(OUTPUT) else 0
if job.completed:
    print("Downloading archive")
    # Append: opening with 'wb' would truncate the file and throw away
    # everything downloaded so far.
    with open(OUTPUT, 'ab') as output_file:
        # NOTE(review): assumes the boto 2.x Job exposes the archive size
        # as `archive_size` - confirm against the installed version.
        while file_size < job.archive_size:
            # Byte ranges are inclusive at both ends.
            last_byte = min(file_size + CHUNK_SIZE, job.archive_size) - 1
            # NOTE(review): private boto API. In boto 2.x it is believed to
            # take (byte_range, retry_exceptions) and to return
            # (data, tree_hash) - confirm before relying on it.
            data, _tree_hash = job._download_byte_range(
                (file_size, last_byte), (socket.error,))
            if not data:
                break
            output_file.write(data)
            file_size += len(data)
You have to add the region_name in your boto.connect_glacier function as the following :
layer2 = boto.connect_glacier(aws_access_key_id = ACCESS_KEY_ID,
aws_secret_access_key = SECRET_ACCESS_KEY,
region_name = 'your region name')

Live recognition with Python and Pocketsphinx

I have recently been working with pocket sphinx in python. I have successfully got the
example below to work recognising a recorded wav.
#!/usr/bin/env python
import sys,os
def decodeSpeech(hmmd,lmdir,dictp,wavfile):
    """
    Decodes a speech file

    Args:
        hmmd: passed to the decoder as ``hmm``.
        lmdir: passed to the decoder as ``lm``.
        dictp: passed to the decoder as ``dict``.
        wavfile: path of the WAV file to decode.

    Returns the first element of the decoder's hypothesis.
    Raises ImportError when pocketsphinx/sphinxbase cannot be imported.
    """
    try:
        import pocketsphinx as ps
        import sphinxbase
    except ImportError:
        print("""Pocket sphinx and sphixbase is not installed
in your system. Please install it with package manager.
""")
        # Continuing would only fail later with a NameError on `ps`.
        raise
    speechRec = ps.Decoder(hmm = hmmd, lm = lmdir, dict = dictp)
    with open(wavfile,'rb') as wavFile:
        # Skip the first 44 bytes (presumably the WAV header).
        wavFile.seek(44)
        speechRec.decode_raw(wavFile)
    result = speechRec.get_hyp()
    return result[0]
if __name__ == "__main__":
    hmdir = "/home/jaganadhg/Desktop/Docs_New/kgisl/model/hmm/wsj1"
    lmd = "/home/jaganadhg/Desktop/Docs_New/kgisl/model/lm/wsj/wlist5o.3e-7.vp.tg.lm.DMP"
    dictd = "/home/jaganadhg/Desktop/Docs_New/kgisl/model/lm/wsj/wlist5o.dic"
    wavfile = "/home/jaganadhg/Desktop/Docs_New/kgisl/sa1.wav"
    recognised = decodeSpeech(hmdir,lmd,dictd,wavfile)
    print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
    print(recognised)
    print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
The problem is how can I do real time speech recognition from a microphone? In
a while loop with a if statement so that if a set word is recognised from the microphone
a function can be called?
The code for realtime recognition looks like this:
config = Decoder.default_config()
config.set_string('-hmm', path.join(MODELDIR, 'en-us/en-us'))
config.set_string('-lm', path.join(MODELDIR, 'en-us/en-us.lm.bin'))
config.set_string('-dict', path.join(MODELDIR, 'en-us/cmudict-en-us.dict'))
config.set_string('-logfn', '/dev/null')
decoder = Decoder(config)

import pyaudio
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
stream.start_stream()

in_speech_bf = False
decoder.start_utt()
while True:
    buf = stream.read(1024)
    if buf:
        decoder.process_raw(buf, False, False)
        # React only when the decoder's in-speech state flips.
        if decoder.get_in_speech() != in_speech_bf:
            in_speech_bf = decoder.get_in_speech()
            if not in_speech_bf:
                # Speech just ended: close the utterance, report it and
                # start listening for the next one.
                decoder.end_utt()
                hypothesis = decoder.hyp()
                # hyp() may yield nothing when no words were recognised.
                if hypothesis is not None:
                    print('Result: ' + hypothesis.hypstr)
                decoder.start_utt()
    else:
        break
decoder.end_utt()
You can also use gstreamer python bindings in pocketsphinx, check livedemo.py
Try this. Pocketsphinx is now a GStreamer plugin.
This is code I found on the internet; I've modified a few things, but it recognises words very poorly and slowly.
Could you help me improve it? It runs on Ubuntu 16.04 LTS.
I do not know much about programming.
I look forward to your help.
# -*- encoding: utf-8 -*-
#!/usr/bin/env python
from pocketsphinx.pocketsphinx import *
from sphinxbase.sphinxbase import *
import os
import pyaudio
import wave
import audioop
from collections import deque
import time
import math;import Mic
"""
Written by Sophie Li, 2016
http://blog.justsophie.com/python-speech-to-text-with-pocketsphinx/
"""
class SpeechDetector:
    """Record phrases from the microphone and decode them with pocketsphinx.

    ``run`` listens until a decoded phrase contains both "no" and "one".
    """

    def __init__(self):
        # Microphone stream config.
        self.CHUNK = 1024  # CHUNKS of bytes to read each time from mic
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.RATE = 16000

        self.SILENCE_LIMIT = 1  # Silence limit in seconds. The max amount of seconds where
                                # only silence is recorded. When this time passes the
                                # recording finishes and the file is decoded

        self.PREV_AUDIO = 0.5  # Previous audio (in seconds) to prepend. When noise
                               # is detected, how much of previously recorded audio is
                               # prepended. This helps to prevent chopping the beginning
                               # of the phrase.

        self.THRESHOLD = 4500
        self.num_phrases = -1

        # These will need to be modified according to where the pocketsphinx folder is
        MODELDIR = "/home/l/Desktop/pocketsphinx/model/en-us"

        # Create a decoder with certain model
        config = Decoder.default_config()
        # Relative components: an absolute second argument would make
        # os.path.join discard MODELDIR altogether.
        config.set_string('-hmm', os.path.join(MODELDIR, 'en-us/'))
        config.set_string('-lm', os.path.join(MODELDIR, 'en-us.lm.bin'))
        config.set_string('-dict', os.path.join(MODELDIR, 'cmudict-en-us.dict'))
        config.set_string('-keyphrase', 'no one')
        config.set_float('-kws_threshold', 1e+20)

        # Creates decoder object for streaming data.
        self.decoder = Decoder(config)

    def setup_mic(self, num_samples=50):
        """ Gets average audio intensity of your mic sound. You can use it to get
            average intensities while you're talking and/or silent. The average
            is the avg of the .2 of the largest intensities recorded.

            Sets self.THRESHOLD to 3500 when that average is below 3000,
            otherwise to the average plus 100.
        """
        p = pyaudio.PyAudio()
        try:
            stream = p.open(format=self.FORMAT,
                            channels=self.CHANNELS,
                            rate=self.RATE,
                            input=True,
                            frames_per_buffer=self.CHUNK)
            try:
                # NOTE(review): width 4 is passed to audioop although the
                # stream format is 16-bit; kept as-is because the thresholds
                # were chosen against these values - confirm.
                values = [math.sqrt(abs(audioop.avg(stream.read(self.CHUNK), 4)))
                          for _ in range(num_samples)]
            finally:
                stream.close()
        finally:
            p.terminate()

        values = sorted(values, reverse=True)
        # At least one sample, so small num_samples cannot divide by zero.
        top_n = max(int(num_samples * 0.2), 1)
        r = sum(values[:top_n]) / top_n

        if r < 3000:
            self.THRESHOLD = 3500
        else:
            self.THRESHOLD = r + 100

    def save_speech(self, data, p):
        """
        Saves mic data to temporary WAV file. Returns filename of saved
        file
        """
        filename = 'output_'+str(int(time.time()))
        # The chunks read from the stream are bytes, so join them as bytes.
        data = b''.join(data)
        # writes data to WAV file
        wf = wave.open(filename + '.wav', 'wb')
        try:
            wf.setnchannels(1)
            wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
            wf.setframerate(16000)  # TODO make this value a function parameter?
            wf.writeframes(data)
        finally:
            wf.close()
        return filename + '.wav'

    def decode_phrase(self, wav_file):
        """Feed ``wav_file`` to the decoder and return the recognised words.

        The file is read in 1024-byte pieces as one utterance; the result
        is the list of ``word`` values of the decoder's segments.
        """
        self.decoder.start_utt()
        with open(wav_file, "rb") as stream:
            while True:
                buf = stream.read(1024)
                if not buf:
                    break
                self.decoder.process_raw(buf, False, False)
        self.decoder.end_utt()
        return [seg.word for seg in self.decoder.seg()]

    def run(self):
        """
        Listens to Microphone, extracts phrases from it and calls pocketsphinx
        to decode the sound
        """
        self.setup_mic()

        # Open stream
        p = pyaudio.PyAudio()
        stream = p.open(format=self.FORMAT,
                        channels=self.CHANNELS,
                        rate=self.RATE,
                        input=True,
                        frames_per_buffer=self.CHUNK)

        # Chunks per second; deque(maxlen=...) requires whole numbers.
        rel = self.RATE // self.CHUNK
        silence_len = int(self.SILENCE_LIMIT * rel)
        prev_len = int(self.PREV_AUDIO * rel)

        audio2send = []
        slid_win = deque(maxlen=silence_len)
        # Prepend audio from 0.5 seconds before noise was detected
        prev_audio = deque(maxlen=prev_len)
        started = False

        try:
            while True:
                cur_data = stream.read(self.CHUNK)
                slid_win.append(math.sqrt(abs(audioop.avg(cur_data, 4))))

                if any(x > self.THRESHOLD for x in slid_win):
                    if not started:
                        print("Bắt đầu ghi âm")
                        started = True
                    audio2send.append(cur_data)

                elif started:
                    print("Hoàn thành ghi âm")
                    filename = self.save_speech(list(prev_audio) + audio2send, p)
                    try:
                        r = self.decode_phrase(filename)
                    finally:
                        # Removes temp audio file
                        os.remove(filename)
                    print("RESULT:  " + str(r))

                    # Hot word "no one": stop once both words were heard.
                    if r.count("one") > 0 and r.count("no") > 0:
                        Mic.playaudiofromAudio().play("/home/l/Desktop/PROJECT/Audio/beep_hi.wav")
                        return

                    # Reset all
                    started = False
                    slid_win = deque(maxlen=silence_len)
                    prev_audio = deque(maxlen=prev_len)
                    audio2send = []
                    print("Chế độ nghe ...")

                else:
                    prev_audio.append(cur_data)
        finally:
            # Always release the audio device, including on the early return.
            print("* Hoàn thành nghe")
            stream.close()
            p.terminate()

Categories