Recording a YouTube live stream to a file in Python

I want to record a YouTube live stream and save it to a file using Python.
I tried the pytube library, but it appears to work only for regular videos, not for live streams.
I want to record the stream and save it to a file in a video format such as AVI.

A modification based on #wownis's answer.
(I tried that answer; however, it doesn't work as-is.)
# pip install m3u8
# pip install streamlink
import urllib.request  # urllib is part of the standard library, no install needed

import m3u8
import streamlink

def get_stream(url):
    """
    Get upload chunk url
    """
    streams = streamlink.streams(url)
    stream_url = streams["best"]
    m3u8_obj = m3u8.load(stream_url.args['url'])
    return m3u8_obj.segments[0]

def dl_stream(url, filename, chunks):
    """
    Download each chunk
    """
    pre_time_stamp = 0
    for i in range(chunks + 1):
        stream_segment = get_stream(url)
        cur_time_stamp = \
            stream_segment.program_date_time.strftime("%Y%m%d-%H%M%S")
        if pre_time_stamp == cur_time_stamp:
            pass
        else:
            print(cur_time_stamp)
            with open(filename + '_' + str(cur_time_stamp) + '.ts', 'ab+') as file:
                with urllib.request.urlopen(stream_segment.uri) as response:
                    file.write(response.read())
            pre_time_stamp = cur_time_stamp

url = "https://www.youtube.com/watch?v=2U3JnFbD-es"
dl_stream(url, "live", 15)
The output looks like this:
./
live_20200713-103739.ts
live_20200713-103744.ts
...
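Since this version writes one .ts file per chunk, here is a minimal sketch of stitching them back together afterwards (MPEG-TS segments can be concatenated at the byte level; the glob pattern and output name assume the filenames produced above):
import glob

# Concatenate the downloaded MPEG-TS chunks in timestamp order; the
# YYYYMMDD-HHMMSS names sort chronologically as plain strings.
with open('live_full.ts', 'wb') as merged:
    for part in sorted(glob.glob('live_*.ts')):
        with open(part, 'rb') as chunk:
            merged.write(chunk.read())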

I found a solution; here is my Python code:
import urllib.request

import m3u8
import streamlink

def record_stream(url, filename, iterations):
    last_part = 0
    for i in range(iterations + 1):
        streams = streamlink.streams(url)
        stream_url = streams["best"]
        print(stream_url.args['url'])
        m3u8_obj = m3u8.load(stream_url.args['url'])
        previous_part_time = last_part
        last_part = m3u8_obj.segments[-1].program_date_time
        if i >= 1:
            # Find how many segments are new since the previous iteration.
            for j in range(1, len(m3u8_obj.segments)):
                if m3u8_obj.segments[-j].program_date_time == previous_part_time:
                    break
            print(j)
            with open(filename + ".ts", "ab+") as file:
                for k in range(j - 1, 0, -1):  # renamed from i to avoid shadowing the outer loop variable
                    with urllib.request.urlopen(m3u8_obj.segments[-k].uri) as response:
                        file.write(response.read())

url = "https://www.youtube.com/watch?v=BgKGctL0u1U"
record_stream(url, "file", 10)
Here 10 means 10 iterations; if each chunk is about 2 s long, that records roughly 20 s of the stream.
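The result is an MPEG-TS file. Since the original question asked for a container such as AVI, here is a minimal sketch of remuxing the recorded .ts file with ffmpeg (this assumes ffmpeg is installed and on the PATH; the filenames are illustrative):
import subprocess

# Remux the MPEG-TS recording into an MP4 container without re-encoding;
# '-c copy' copies the audio/video streams as-is. Use 'file.avi' (and drop
# '-c copy' to force a re-encode) if an AVI container is really required.
subprocess.run(['ffmpeg', '-i', 'file.ts', '-c', 'copy', 'file.mp4'], check=True)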

Related

pytube (python) video stops playing after a few seconds

I have been using pytube to create my own YouTube video downloader. After the video finishes downloading and compiling, it plays for only a few seconds and then just displays a still image while the audio continues in the background.
These are the functions in the file "modules.py":
import pytube
from moviepy.editor import *
import os.path

def video(link):
    yt = pytube.YouTube(link)
    streamlist = []
    for stream in yt.streams.filter():
        streamlist.append(stream)
    finalstreamlist = []
    for i in streamlist:
        if i.resolution == "1080p" and i.mime_type == "video/mp4":
            finalstreamlist.append(i)
    stream = yt.streams.get_by_itag(finalstreamlist[0].itag)
    stream.download(r"C:\Users\pc\PycharmProjects\youtube")
    return [stream.title, yt.length]

def audio(link):
    yt = pytube.YouTube(link)
    streamlist = []
    for stream in yt.streams.filter():
        streamlist.append(stream)
    finalstreamlist = []
    for i in streamlist:
        if i.mime_type == "audio/mp4":
            finalstreamlist.append(i)
    stream = yt.streams.get_by_itag(finalstreamlist[0].itag)
    stream.download(r"C:\Users\pc\PycharmProjects\youtube", "Audio.mp4")
    return ["Audio.mp4", yt.length]

def mixer(video, audio, title):
    videoclip = VideoFileClip(video)
    audioclip = AudioFileClip(audio)
    videoclip2 = videoclip.set_audio(audioclip)
    videoclip2.write_videofile(title)
And this is the "main.py" file:
from modules import *
import time
link = "https://www.youtube.com/watch?v=CLk7A7HXhYQ"
vtitle = video(link)[0] + ".mp4"
atitle = audio(link)[0]
print("Files Downloaded")
time.sleep(1)
print("Compiling")
mixer(vtitle,atitle,vtitle)
print("FileDone")
I tried your code and it downloads the video file correctly, but the problem shows up when it mixes the video and audio.
I think the problem is that it writes the new video with the same name as the original video; moviepy probably doesn't load the old video into memory but keeps reading it from the file the whole time, and this creates a conflict.
I think you should write the new video under a new (temporary) filename and later rename it to the expected name, or download the video under a temporary name (i.e. video.mp4), as in the sketch below.
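A minimal sketch of that idea (mixer_safe is a hypothetical variant of the mixer function above; it assumes title ends in .mp4 and that the temporary file lands on the same filesystem, so os.replace is an atomic rename):
import os
from moviepy.editor import VideoFileClip, AudioFileClip

def mixer_safe(video, audio, title):
    # Write the mixed video under a temporary name first, so moviepy is
    # never asked to write to the same file it is still reading from.
    tmp = title + '.tmp.mp4'
    videoclip = VideoFileClip(video)
    audioclip = AudioFileClip(audio)
    videoclip.set_audio(audioclip).write_videofile(tmp)
    videoclip.close()
    audioclip.close()
    os.replace(tmp, title)  # rename to the expected name once writing is done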
My code which I used for tests:
import pytube
from moviepy.editor import *
import os

BASE = os.path.dirname(os.path.abspath(__file__))

def video(link):
    yt = pytube.YouTube(link)
    finalstreamlist = yt.streams.filter(resolution='1080p', mime_type='video/mp4')
    itag = finalstreamlist[0].itag
    print('video itag:', itag)
    stream = yt.streams.get_by_itag(itag)
    stream.download(os.path.join(BASE, 'youtube'), 'video.mp4')
    return [os.path.join(BASE, 'youtube/video.mp4'), stream.title, yt.length]

def audio(link):
    yt = pytube.YouTube(link)
    finalstreamlist = yt.streams.filter(mime_type='audio/mp4')  # filter audio streams
    itag = finalstreamlist[0].itag
    print('audio itag:', itag)
    stream = yt.streams.get_by_itag(itag)
    stream.download(os.path.join(BASE, 'youtube'), 'audio.mp4')
    return [os.path.join(BASE, 'youtube/audio.mp4'), stream.title, yt.length]

def mixer(video, audio, title):
    videoclip = VideoFileClip(video)
    audioclip = AudioFileClip(audio)
    new_videoclip = videoclip.set_audio(audioclip)
    new_videoclip.write_videofile(title)

# --- main ---

link = 'https://www.youtube.com/watch?v=CLk7A7HXhYQ'

print('Downloading')
v = video(link)
a = audio(link)
print('Downloaded')

print('Compiling')
output = os.path.join(BASE, v[1] + '.mp4')
mixer(v[0], a[0], output)
print('Compiled')

Using TQDM almost doubles the file size of my GET request

I've been writing code to download GRIB (weather) files off the internet for future use. Right now I'm only at the stage of downloading and writing into the right folder, but for some reason, when I use tqdm for a progress bar, the file size almost doubles. Without the progress bar the file size is fine.
With the following code I get a 2.3 MB file.
import datetime

import requests

fsearch = datetime.date.today().strftime('%Y%m%d00')
fname = datetime.date.today().strftime('%d-%m-%Y')

def sfc_pres():
    id = fsearch
    url = 'https://dd.weather.gc.ca/ensemble/geps/grib2/raw/00/000/CMC_geps-raw_PRES_SFC_0_latlon0p5x0p5_{0}_P000_allmbrs.grib2'.format(id)
    r = requests.get(url, allow_redirects=True)
    stat = r.status_code
    if stat == 200:
        print('Fichier trouvé, téléchargement')  # file found, downloading
    elif stat == 404:
        print('Fichier introuvable')  # file not found
        return
    id = fname
    with open(r'C:\Users\JM\Desktop\GRIB\Pression de surface 00UTC {0}.grib2'.format(id), 'wb') as f:
        f.write(r.content)
If I use tqdm for a progress bar like so, I get a 4.5 MB file.
import datetime

import requests
from tqdm import tqdm

fsearch = datetime.date.today().strftime('%Y%m%d00')
fname = datetime.date.today().strftime('%d-%m-%Y')

def sfc_pres():
    id = fsearch
    url = 'https://dd.weather.gc.ca/ensemble/geps/grib2/raw/00/000/CMC_geps-raw_PRES_SFC_0_latlon0p5x0p5_{0}_P000_allmbrs.grib2'.format(id)
    r = requests.get(url, allow_redirects=True)
    stat = r.status_code
    if stat == 200:
        print('Fichier trouvé, téléchargement')  # file found, downloading
    elif stat == 404:
        print('Fichier introuvable')  # file not found
        return
    total_size_in_bytes = int(r.headers.get('content-length', 0))
    block_size = 1024
    progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
    id = fname
    with open(r'C:\Users\JM\Desktop\GRIB\Pression de surface 00UTC {0}.grib2'.format(id), 'wb') as f:
        f.write(r.content)
        for data in r.iter_content(block_size):
            progress_bar.update(len(data))
            f.write(data)
    progress_bar.close()
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        print("Échec du téléchargement")  # download failed
My troubleshooting narrowed it down to the tqdm-related code, but I can't find out why...

If you're using r.iter_content, you shouldn't also call f.write(r.content): that writes the data twice (and you lose the streaming behavior you're trying to get).
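A minimal sketch of the download loop with that fix applied (note stream=True on the request so the body is actually streamed; the URL and output filename are illustrative):
import requests
from tqdm import tqdm

url = 'https://example.com/file.grib2'  # illustrative
r = requests.get(url, stream=True, allow_redirects=True)
total_size_in_bytes = int(r.headers.get('content-length', 0))
block_size = 1024

progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
with open('output.grib2', 'wb') as f:
    # Each chunk is written exactly once; no separate f.write(r.content).
    for data in r.iter_content(block_size):
        progress_bar.update(len(data))
        f.write(data)
progress_bar.close()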

How do I download videos from Pexels API?

I have this code that can pull images from Pexels, but I don't know how to change it to videos. I haven't seen anyone do this before, and any help is greatly appreciated. I tried switching all the photo tags to videos, but that didn't seem to work. I've also tried adding more libraries, but that doesn't seem to work either.
import argparse
import json
import os
import time

import requests
import tqdm
from pexels_api import API

PEXELS_API_KEY = os.environ['PEXELS_KEY']
MAX_IMAGES_PER_QUERY = 100
RESULTS_PER_PAGE = 10
PAGE_LIMIT = MAX_IMAGES_PER_QUERY / RESULTS_PER_PAGE

def get_sleep(t):
    def sleep():
        time.sleep(t)
    return sleep

def main(args):
    sleep = get_sleep(args.sleep)
    api = API(PEXELS_API_KEY)
    query = args.query
    page = 1
    counter = 0
    photos_dict = {}
    # Step 1: Getting urls and meta information
    while page <= PAGE_LIMIT:
        api.search(query, page=page, results_per_page=RESULTS_PER_PAGE)
        photos = api.get_entries()
        for photo in tqdm.tqdm(photos):
            photos_dict[photo.id] = vars(photo)['_Photo__photo']
            counter += 1
        if not api.has_next_page:
            break
        page += 1
        sleep()
    print(f"Finishing at page: {page}")
    print(f"Images were processed: {counter}")
    # Step 2: Downloading
    if photos_dict:
        os.makedirs(args.path, exist_ok=True)
        # Saving dict
        with open(os.path.join(args.path, f'{query}.json'), 'w') as fout:
            json.dump(photos_dict, fout)
        for val in tqdm.tqdm(photos_dict.values()):
            url = val['src'][args.resolution]
            fname = os.path.basename(val['src']['original'])
            image_path = os.path.join(args.path, fname)
            if not os.path.isfile(image_path):  # ignore if already downloaded
                response = requests.get(url, stream=True)
                with open(image_path, 'wb') as outfile:
                    outfile.write(response.content)
            else:
                print(f"File exists: {image_path}")

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--query', type=str, required=True)
    parser.add_argument('--path', type=str, default='./results_pexels')
    parser.add_argument('--resolution', choices=['original', 'large2x', 'large',
                                                 'medium', 'small', 'portrait',
                                                 'landscape', 'tiny'], default='original')
    parser.add_argument('--sleep', type=float, default=0.1)
    args = parser.parse_args()
    main(args)
Sorry for bumping into the question. I just faced a similar situation when downloading videos from Pexels using the Python API pexelsPy. This may be helpful:
I retrieved the IDs of the videos and then built the download URL, which has the following structure: "https://www.pexels.com/video/" + ID + "/download".
See the following example:
import random

import requests
from pexelsPy import API

def download_video(type_of_videos):
    video_tag = random.choice(type_of_videos)
    PEXELS_API = '-'  # please add your API key here
    api = API(PEXELS_API)
    # read_already_download_files and write_file are the author's own helpers
    retrieved_videos = read_already_download_files('downloaded_files.txt')
    video_found_flag = True
    num_page = 1
    while video_found_flag:
        api.search_videos(video_tag, page=num_page, results_per_page=10)
        videos = api.get_videos()
        for data in videos:
            if data.width > data.height:  # look for horizontal-orientation videos
                if data.url not in retrieved_videos:
                    # write_file('downloaded_files.txt', data.url)
                    url_video = 'https://www.pexels.com/video/' + str(data.id) + '/download'  # create the url with the video id
                    r = requests.get(url_video)
                    with open(data.url.split('/')[-2] + '.mp4', 'wb') as outfile:
                        outfile.write(r.content)
                    return data.url.split('/')[-2] + '.mp4'  # download the video
        num_page += 1
The download_video function takes an array of strings with several tags, e.g. ['happy', 'sad', 'relax'], and then randomly chooses one of those tags.
PEXELS_API should contain your API key.
read_already_download_files('downloaded_files.txt') retrieves the already-downloaded files, to check whether the file just found has been downloaded before.
from pypexels import PyPexels
import requests

api_key = 'api id'

# instantiate PyPexels object
py_pexel = PyPexels(api_key=api_key)

search_videos_page = py_pexel.videos_search(query="love", per_page=40)
# while True:
for video in search_videos_page.entries:
    print(video.id, video.user.get('name'), video.url)
    data_url = 'https://www.pexels.com/video/' + str(video.id) + '/download'
    r = requests.get(data_url)
    print(r.headers.get('content-type'))
    with open('sample.mp4', 'wb') as outfile:
        outfile.write(r.content)
    # if not search_videos_page.has_next:
    break
    # search_videos_page = search_videos_page.get_next_page()
I just tried to do the same. When I was looking for it, I wanted a simple example; all the other fancy stuff I was sure I could add myself. So I built upon inou's answer. The example shown is very basic: it requests one page with only 5 results, using the 'Tiger' tag in the search query. I download the first video using the id provided by the response and simply write it to the source folder. The API is provided by pexelsPy, and the request is executed with the standard requests package. To get access to the API, you need to create a key on the Pexels website (see here). Once you have your own API key, you should be able to simply substitute the example key shown and run the code as a test.
import pexelsPy
import requests
PEXELS_API = '16gv62567257256iu78krtuzwqsddudrtjberzabzwzjsrtgswnr'
api = pexelsPy.API(PEXELS_API)
api.search_videos('Tiger', page=1, results_per_page=5)
videos = api.get_videos()
url_video = 'https://www.pexels.com/video/' + str(videos[0].id) + '/download'
r = requests.get(url_video)
with open('test.mp4', 'wb') as outfile:
outfile.write(r.content)
You can download multiple videos with this code:
import pexelsPy
import requests

PEXELS_API = '-'
api = pexelsPy.API(PEXELS_API)
api.search_videos('nature', page=2, results_per_page=100, orientation='landscape')
videos = api.get_videos()

for i, video in enumerate(videos):
    url_video = 'https://www.pexels.com/video/' + str(video.id) + '/download'
    r = requests.get(url_video)
    with open(f'test_{i}.mp4', 'wb') as outfile:
        outfile.write(r.content)
This will download 100 videos, with each video being written to a separate file named test_0.mp4, test_1.mp4, ..., test_99.mp4.
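For large videos it may be worth streaming the response to disk instead of holding r.content in memory; here is a minimal sketch using requests' chunked iteration (the download URL pattern is the same one used above, everything else is illustrative):
import requests

def download_video_streamed(video_id, out_path, chunk_size=1024 * 1024):
    # Stream the response body to disk one chunk at a time, so the whole
    # video never has to fit in memory at once.
    url = 'https://www.pexels.com/video/' + str(video_id) + '/download'
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(out_path, 'wb') as outfile:
            for chunk in r.iter_content(chunk_size):
                outfile.write(chunk)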

StringIO class does not return expected results in Python 3

This code, which works in Python 2, fails in Python 3 with:
AttributeError: '_io.StringIO' object has no attribute 'name'
Here is the code:
!pip install warc3-wet

import warc
import requests
from contextlib import closing
from io import StringIO

def get_partial_warc_file(url, num_bytes=1024 * 10):
    with closing(requests.get(url, stream=True)) as r:
        buf = StringIO(r.raw.read(num_bytes).decode('utf-8'))
    return warc.WARCFile(fileobj=buf, compress=True)

urls = {
    'warc': 'https://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-data/CC-MAIN-2016-07/segments/1454701145519.33/warc/CC-MAIN-20160205193905-00000-ip-10-236-182-209.ec2.internal.warc.gz',
    'wat': 'https://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-data/CC-MAIN-2016-07/segments/1454701145519.33/wat/CC-MAIN-20160205193905-00000-ip-10-236-182-209.ec2.internal.warc.wat.gz',
    'wet': 'https://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-data/CC-MAIN-2016-07/segments/1454701145519.33/wet/CC-MAIN-20160205193905-00000-ip-10-236-182-209.ec2.internal.warc.wet.gz'
}

files = {file_type: get_partial_warc_file(url=url) for file_type, url in urls.items()}
And here is the source:
https://dmorgan.info/posts/common-crawl-python/
Update:
This code returns the metadata of the record; how do I read the news article itself?
aws s3 cp --no-sign-request s3://commoncrawl/crawl-data/CC-NEWS/2019/08/CC-NEWS-20190824001636-00982.warc.gz
import warc

var = 0
with warc.open("/tmp/CC-NEWS-20190824001636-00982.warc") as f:
    for record in f:
        if var > 1:
            break
        else:
            print(record.date, record.from_response, record.header, record.ip_address, record.offset, record.payload, record.type, record.url, record.write_to)
            var = var + 1
Here is the code that will return the news article source code along with its metadata.
# wget https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2016-07/segments/1454701145519.33/warc/CC-MAIN-20160205193905-00000-ip-10-236-182-209.ec2.internal.warc.gz
# gunzip CC-MAIN-20160205193905-00000-ip-10-236-182-209.ec2.internal.warc.gz
# !pip install warc3-wet
import warc

var = -10
with warc.open("CC-MAIN-20160205193905-00000-ip-10-236-182-209.ec2.internal.warc") as f:
    for record in f:
        if var > 1:
            break
        else:
            print(record.payload.read(), record.date, record.from_response, record.header, record.ip_address, record.offset, record.payload, record.type, record.url, record.write_to)
            var = var + 1
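If only the article content is wanted, here is a minimal sketch that skips the WARC bookkeeping records and prints just the response payloads (this assumes the same warc3-wet API used above; record.type and record.url are the attributes already printed in the snippets):
import warc

# Show the target URL and the start of the raw HTTP payload for the
# first few 'response' records, skipping warcinfo/request records.
with warc.open("CC-MAIN-20160205193905-00000-ip-10-236-182-209.ec2.internal.warc") as f:
    shown = 0
    for record in f:
        if record.type != 'response':
            continue
        print(record.url)
        print(record.payload.read()[:500])  # first 500 bytes of the response
        shown += 1
        if shown >= 3:
            break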

How can I use Python to time how long an mp3 takes to download from a website?

An mp3 is accessible via two different URLs, and I'm trying to use Python to figure out which URL is fastest to download from.
For example, I want to time how long https://cpx.podbean.com/mf/download/a6bxxa/LAF_15min_044_mindfulness.mp3 takes to download and compare that to how long http://cpx.podbean.com/mf/play/a6bxxa/LAF_15min_044_mindfulness.mp3 takes to download.
To download the mp3 I'm currently using:
urllib.request.urlretrieve(mp3_url, mp3_filename)
You could essentially do something like:
from datetime import datetime

starttime = datetime.now()
urllib.request.urlretrieve(mp3_url, mp3_filename)  # Whatever code you're using...
finishtime = datetime.now()
runtime = finishtime - starttime
print(runtime)
This will print a duration like 0:03:19.356798 in the format [hours]:[minutes]:[seconds.microseconds].
My bad... I didn't realize you're trying to figure out which link was the fastest. I have no clue how you're storing your mp3_url and mp3_filename elements, but try something like this (adjust accordingly):
import urllib.request
from datetime import datetime

mp3_list = {
    'file1.mp3': 'http://www.url1.com',
    'file2.mp3': 'http://www.url2.com',
    'file3.mp3': 'http://www.url3.com',
}

runtimes = []
# I'm not sure how or where you are storing mp3_url or mp3_filename, so
# you'll have to modify this accordingly... (the dict above maps
# filename -> url, so items() is unpacked in that order)
for mp3_filename, mp3_url in mp3_list.items():
    starttime = datetime.now()
    urllib.request.urlretrieve(mp3_url, mp3_filename)  # Whatever code you're using...
    finishtime = datetime.now()
    runtime = finishtime - starttime
    runtimes.append({'runtime': runtime, 'url': mp3_url, 'filename': mp3_filename})

fastest_mp3_url = sorted(runtimes, key=lambda k: k['runtime'])[0]['url']
fastest_mp3_filename = sorted(runtimes, key=lambda k: k['runtime'])[0]['filename']
print(fastest_mp3_url)
print(fastest_mp3_filename)
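As a side note, time.perf_counter is designed for measuring elapsed durations and avoids the quirks of subtracting wall-clock datetime values; a minimal sketch (the URL and filename are illustrative):
import time
import urllib.request

start = time.perf_counter()
urllib.request.urlretrieve('http://www.url1.com', 'file1.mp3')  # illustrative
elapsed = time.perf_counter() - start
print(f'download took {elapsed:.2f} s')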
It's simple; there are plenty of methods to do this (Python 3.x).
Using win64pyinstaller with progress:
from win64pyinstaller import install
install("your_url", "destination_folder_with_file_name")
Using urllib with progress, modifying PabloG's solution, which is in Python 2.x:
How to download a file over HTTP?
from sys import stdout
from urllib.request import urlopen

def _restart_line():
    stdout.write('\r')
    stdout.flush()

url = "your_url"
file_name = url.split('/')[-1]
u = urlopen(url)
f = open(file_name, 'wb')
meta = u.info()
file_size = int(meta.get("Content-Length"))
print(f"Downloading: {file_name} Bytes: {file_size}")

file_size_dl = 0
block_sz = 8192
while True:
    buffer = u.read(block_sz)
    if not buffer:
        break
    file_size_dl += len(buffer)
    f.write(buffer)
    status = f"done - {(file_size_dl / 1000000):.2f} MB, {(file_size_dl * 100 / file_size):.2f} %"
    status = status + chr(8) * (len(status) + 1)
    stdout.write(status)
    stdout.flush()
    _restart_line()
f.close()
There are more ways to do it; I hope you got your answer. Thank you!
