Im trying to get the Title and transcript of all videos of a playlist:
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi
import os
api_key = "*********************************"
#1.query API
rq = build("youtube", "v3", developerKey=api_key).playlistItems().list(
part="contentDetails, snippet",
playlistId="PL-osiE80TeTtoQCKZ03TU5fNfx2UY6U4p",
maxResults=50,
).execute()
#2.Create a list with video Ids and Titles
vid_ids = []
vid_title = []
for item in rq["items"]:
vid_ids.append(item["contentDetails"]["videoId"])
vid_title.append(item["snippet"]["title"])
#3.Get transcripts
srt = YouTubeTranscriptApi.get_transcripts(vid_ids)
print(srt)
But I get an error because one or more of those videos have no subtitles:
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=D2lwk1Ukgz0! This is most likely caused by:
Subtitles are disabled for this video
What would you code in python to avoid this error, and get the transcripts of at least the rest of the videos of the playlist? maybe an If Statement (if the video has no subtitles jump to the next one) or similar?
Thanks in advance.
Try and except should help here:
for id in vid_ids:
try:
srt = YouTubeTranscriptApi.get_transcripts(id)
except:
print(f"{id} doesn't have a transcript")
This will basically ignore exceptions and tell you which id doesn't have a transcript.
Just try one video id at a time to get its transcript using try/except and pay attention not to pass directly a video id but instead an array of one video id to YouTubeTranscriptApi.get_transcripts otherwise it doesn't work.
So change:
#3.Get transcripts
srt = YouTubeTranscriptApi.get_transcripts(vid_ids)
print(srt)
For:
#3.Get transcripts
srt = []
for vid_id in vid_ids:
try:
srt += [YouTubeTranscriptApi.get_transcripts([vid_id])]
except:
srt += [({vid_id => []}, [])]
print(srt)
Related
I am trying to implement ocr to my twitter monitor using tesseract. My questions are: how can I get images from the user and instantly run ocr. I am monitoring certain twitter accounts newest tweets, if a new comes in and is containing an url i am opening it in a browser, now I want to check if there is an image inside the tweet as well and print the content in the console. My code looks like this:
import tweepy
import re
import webbrowser
import time
import urllib
from datetime import datetime
# a bunch of access keys
keys = [(example_keys)]
# which key is in use right now
key_index = 0
test = 0
url_store = ''
# Function to extract url from newest tweet
def get_tweets(username, tweet_mode='extended'):
# Authorization to consumer key and consumer secret
auth = tweepy.OAuthHandler(keys[key_index][0], keys[key_index][1])
# Access to user's access key and access secret
auth.set_access_token(keys[key_index][2], keys[key_index][3])
# Calling api
api = tweepy.API(auth)
# try to get latest tweet until rate limit is reached
try:
# Get newest tweet from profile
tweets = api.user_timeline(screen_name=username, count=1)
tweet = [tweet.text for tweet in tweets][0]
print(tweet)
global url_store
# regex through tweet for url
url = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', str(tweet))
# check if url was found and isn't the same as the url from the last tweet
if (url!=[] and url[0]!=url_store):
# store url in variable
url_store=url[0]
# open the url in webbrowser
webbrowser.open(url[0])
# save the html dom to a text file
urllib.request.urlretrieve(url[0], "test.txt")
# when rate limit is reached
except tweepy.TweepError:
# select the next key from array
changeKeys()
# right now function always returns false
return False
def changeKeys():
global key_index
# increment key_index by 1 unless end of key array is reached -> start from the beginning
if key_index >= len(keys) - 1:
key_index = 0
else:
key_index += 1
def getIMG():
# Driver code
if __name__ == '__main__':
# boolean if url was found (right now its always false)
found=False
# never ending for loop
while not found:
# get tweets from specific twitter handle
found = get_tweets("Trump",)
time.sleep(0.02)
This is a great question. Your approach of using RegEx is the wrong way to look for images.
Each Tweet contains "Entities" - see https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/entities-object
You can use those to get the images directly from the tweets.
For example:
tweet.entities.urls
Will get you all the URLs in the Tweet.
I made a simple python script that posts a random youtube video and a quote to Facebook group(s).
The problem is, that it doesn't give Facebook the time to load the random video. To be more specific, at the moment the post looks like this:
But I want it to look like this:
My current code looks like this (I omitted sensitive data):
""" Song of the day script """
import facebook
import os
from pyquery import PyQuery
import requests
import random
class Sofy(object):
GROUPS = ["123", "123"]
FB_ACCESS_TOKEN = "123accesstoken"
PLAYLISTS = ["123youtubeplaylist"]
VIDEOS = []
def get_video(self):
req = requests.get("https://www.youtube.com/playlist?list={}".format(self.PLAYLISTS[0]))
pq = PyQuery(req.text)
for video in pq(".pl-video").items():
self.VIDEOS.append(video.attr("data-video-id"))
return "https://www.youtube.com/watch?v={}".format(random.choice(self.VIDEOS[-5:]))
def get_qoute(self):
pwd = os.path.dirname(os.path.realpath(__file__))
fx = pwd + '/quotes.txt'
lines = open(fx).read().splitlines()
return random.choice(lines)
def run(self):
quote = self.get_qoute()
video = self.get_video()
graph = facebook.GraphAPI(access_token=self.FB_ACCESS_TOKEN, version='2.2')
for group in self.GROUPS:
graph.put_object(group, "feed", message="{}\n Song of the day: {}".format(quote, video))
print "All done :)"
if __name__=='__main__':
sofy = Sofy()
sofy.run()
I tried doing this with Selenium but it didn't quote work as expected. Also, this way looks cleaner, but I can't figure out how to let youtube video load, I'm not even sure if it's possible?
It doesn't look like you're actually sharing the link correctly, looks like you're adding the URL into the 'message' parameter -
It should be attached correctly if you specify it in the 'link' parameter
How can I get song name from internet radio stream?
Python: Get name of shoutcast/internet radio station from url I looked here, but there is only getting name of radio station. But how to get name of the playing song? Here is stream link from where I want to get name of song. http://pool.cdn.lagardere.cz/fm-evropa2-128
How should I do it? Can you help me please?
To get the stream title, you need to request metadata. See shoutcast/icecast protocol description:
#!/usr/bin/env python
from __future__ import print_function
import re
import struct
import sys
try:
import urllib2
except ImportError: # Python 3
import urllib.request as urllib2
url = 'http://pool.cdn.lagardere.cz/fm-evropa2-128' # radio stream
encoding = 'latin1' # default: iso-8859-1 for mp3 and utf-8 for ogg streams
request = urllib2.Request(url, headers={'Icy-MetaData': 1}) # request metadata
response = urllib2.urlopen(request)
print(response.headers, file=sys.stderr)
metaint = int(response.headers['icy-metaint'])
for _ in range(10): # # title may be empty initially, try several times
response.read(metaint) # skip to metadata
metadata_length = struct.unpack('B', response.read(1))[0] * 16 # length byte
metadata = response.read(metadata_length).rstrip(b'\0')
print(metadata, file=sys.stderr)
# extract title from the metadata
m = re.search(br"StreamTitle='([^']*)';", metadata)
if m:
title = m.group(1)
if title:
break
else:
sys.exit('no title found')
print(title.decode(encoding, errors='replace'))
The stream title is empty in this case.
I want to get all video url's of a specific channel. I think json with python or java would be a good choice. I can get the newest video with the following code, but how can I get ALL video links (>500)?
import urllib, json
author = 'Youtube_Username'
inp = urllib.urlopen(r'http://gdata.youtube.com/feeds/api/videos?max-results=1&alt=json&orderby=published&author=' + author)
resp = json.load(inp)
inp.close()
first = resp['feed']['entry'][0]
print first['title'] # video title
print first['link'][0]['href'] #url
After the youtube API change, max k.'s answer does not work. As a replacement, the function below provides a list of the youtube videos in a given channel. Please note that you need an API Key for it to work.
import urllib
import json
def get_all_video_in_channel(channel_id):
api_key = YOUR API KEY
base_video_url = 'https://www.youtube.com/watch?v='
base_search_url = 'https://www.googleapis.com/youtube/v3/search?'
first_url = base_search_url+'key={}&channelId={}&part=snippet,id&order=date&maxResults=25'.format(api_key, channel_id)
video_links = []
url = first_url
while True:
inp = urllib.urlopen(url)
resp = json.load(inp)
for i in resp['items']:
if i['id']['kind'] == "youtube#video":
video_links.append(base_video_url + i['id']['videoId'])
try:
next_page_token = resp['nextPageToken']
url = first_url + '&pageToken={}'.format(next_page_token)
except:
break
return video_links
Short answer:
Here's a library That can help with that.
pip install scrapetube
import scrapetube
videos = scrapetube.get_channel("UC9-y-6csu5WGm29I7JiwpnA")
for video in videos:
print(video['videoId'])
Long answer:
The module mentioned above was created by me due to a lack of any other solutions. Here's what i tried:
Selenium. It worked but had three big drawbacks: 1. It requires a web browser and driver to be installed. 2. has big CPU and memory requirements. 3. can't handle big channels.
Using youtube-dl. Like this:
import youtube_dl
youtube_dl_options = {
'skip_download': True,
'ignoreerrors': True
}
with youtube_dl.YoutubeDL(youtube_dl_options) as ydl:
videos = ydl.extract_info(f'https://www.youtube.com/channel/{channel_id}/videos')
This also works for small channels, but for bigger ones i would get blocked by youtube for making so many requests in such a short time (because youtube-dl downloads more info for every video in the channel).
So i made the library scrapetube which uses the web API to get all the videos.
Increase max-results from 1 to however many you want, but beware they don't advise grabbing too many in one call and will limit you at 50 (https://developers.google.com/youtube/2.0/developers_guide_protocol_api_query_parameters).
Instead you could consider grabbing the data down in batches of 25, say, by changing the start-index until none came back.
EDIT: Here's the code for how I would do it
import urllib, json
author = 'Youtube_Username'
foundAll = False
ind = 1
videos = []
while not foundAll:
inp = urllib.urlopen(r'http://gdata.youtube.com/feeds/api/videos?start-index={0}&max-results=50&alt=json&orderby=published&author={1}'.format( ind, author ) )
try:
resp = json.load(inp)
inp.close()
returnedVideos = resp['feed']['entry']
for video in returnedVideos:
videos.append( video )
ind += 50
print len( videos )
if ( len( returnedVideos ) < 50 ):
foundAll = True
except:
#catch the case where the number of videos in the channel is a multiple of 50
print "error"
foundAll = True
for video in videos:
print video['title'] # video title
print video['link'][0]['href'] #url
Based on the code found here and at some other places, I've written a small script that does this. My script uses v3 of Youtube's API and does not hit against the 500 results limit that Google has set for searches.
The code is available over at GitHub: https://github.com/dsebastien/youtubeChannelVideosFinder
Independent way of doing things. No api, no rate limit.
import requests
username = "marquesbrownlee"
url = "https://www.youtube.com/user/username/videos"
page = requests.get(url).content
data = str(page).split(' ')
item = 'href="/watch?'
vids = [line.replace('href="', 'youtube.com') for line in data if item in line] # list of all videos listed twice
print(vids[0]) # index the latest video
This above code will scrap only limited number of video url's max upto 60. How to grab all the videos url which is present in the channel. Can you please suggest.
This above code snippet will display only the list of all the videos which is listed twice. Not all the video url's in the channel.
Using Selenium Chrome Driver:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
driverPath = ChromeDriverManager().install()
driver = webdriver.Chrome(driverPath)
url = 'https://www.youtube.com/howitshouldhaveended/videos'
driver.get(url)
height = driver.execute_script("return document.documentElement.scrollHeight")
previousHeight = -1
while previousHeight < height:
previousHeight = height
driver.execute_script(f'window.scrollTo(0,{height + 10000})')
time.sleep(1)
height = driver.execute_script("return document.documentElement.scrollHeight")
vidElements = driver.find_elements_by_id('thumbnail')
vid_urls = []
for v in vidElements:
vid_urls.append(v.get_attribute('href'))
This code has worked the few times I've tried it; however, you might need to tweak the sleep time, or add a way to recognize when the browser is still loading the extra information. It easily worked for me for getting a channel with 300+ videos, but it was having an issue with one that had 7000+ videos due to the time required to load the new videos on the browser becoming inconsistent.
I modified the script originally posted by dermasmid to fit my needs. This is the result:
import scrapetube
import sys
path = '_list.txt'
sys.stdout = open(path, 'w')
videos = scrapetube.get_channel("UC9-y-6csu5WGm29I7JiwpnA")
for video in videos:
print("https://www.youtube.com/watch?v="+str(video['videoId']))
# print(video['videoId'])
Basically it is saves all the URLs from the playlist into a "_list.txt" file. I am using this "_list.txt" file to download all the videos using the yt-dlp.exe. All the downloaded files have the .mp4 extension.
Now I do need to create another "_playlist.txt" file that contains all the FILENAMES coresponding to each URL from the "_List.txt".
For example, for: "https://www.youtube.com/watch?v=yG1m7oGZC48" to have "Apple M1 Ultra & NUMA - Computerphile.mp4" as output into the "_playlist.txt"
I do made some further improvements, to be able to add the channel URL into the console, print the result on screen and also into an external file called "_list.txt".
import scrapetube
import sys
path = '_list.txt'
print('**********************\n')
print("The result will be saved in '_list.txt' file.")
print("Enter Channel ID:")
# Prints the output in the console and into the '_list.txt' file.
class Logger:
def __init__(self, filename):
self.console = sys.stdout
self.file = open(filename, 'w')
def write(self, message):
self.console.write(message)
self.file.write(message)
def flush(self):
self.console.flush()
self.file.flush()
sys.stdout = Logger(path)
# Strip the: "https://www.youtube.com/channel/"
channel_id_input = input()
channel_id = channel_id_input.strip("https://www.youtube.com/channel/")
videos = scrapetube.get_channel(channel_id)
for video in videos:
print("https://www.youtube.com/watch?v="+str(video['videoId']))
# print(video['videoId'])
I have been unable to overcome this error while trying to add a video to my playlist using the youtube gdata python api.
gdata.service.RequestError: {'status':
400, 'body': 'Invalid request URI',
'reason': 'Bad Request'}
This seems to be the same error, but there are no solutions as yet. Any help guys?
import getpass
import gdata.youtube
import gdata.youtube.service
yt_service = gdata.youtube.service.YouTubeService()
# The YouTube API does not currently support HTTPS/SSL access.
yt_service.ssl = False
yt_service = gdata.youtube.service.YouTubeService()
yt_service.email = #myemail
yt_service.password = getpass.getpass()
yt_service.developer_key = #mykey
yt_service.source = #text
yt_service.client_id= #text
yt_service.ProgrammaticLogin()
feed = yt_service.GetYouTubePlaylistFeed(username='default')
# iterate through the feed as you would with any other
for entry in feed.entry:
if (entry.title.text == "test"):
lst = entry;
print entry.title.text, entry.id.text
custom_video_title = 'my test video on my test playlist'
custom_video_description = 'this is a test video on my test playlist'
video_id = 'Ncakifd_16k'
playlist_uri = lst.id.text
playlist_video_entry = yt_service.AddPlaylistVideoEntryToPlaylist(playlist_uri, video_id, custom_video_title, custom_video_description)
if isinstance(playlist_video_entry, gdata.youtube.YouTubePlaylistVideoEntry):
print 'Video added'
The confounding thing is that updating the playlist works, but adding a video does not.
playlist_entry_id = lst.id.text.split('/')[-1]
original_playlist_description = lst.description.text
updated_playlist = yt_service.UpdatePlaylist(playlist_entry_id,'test',original_playlist_description,playlist_private=False)
The video_id is not wrong because its the video from the sample code. What am I missing here? Somebody help!
Thanks.
Gdata seems to use v1 API. So, the relevant documentation is here: http://code.google.com/apis/youtube/1.0/developers_guide_protocol.html#Retrieving_a_playlist
This means, your "playlist_uri" should not take the value of "lst.id.text", but should take the "feedLink" element's "href" attribute in order to be used with "AddPlaylistVideoEntryToPlaylist"
Even if you happen to use v2 API, you should take the URI from the "content" element's "src" attribute as explained in the documentation, you get by substituting 2.0, in the above URL! (SO doesn't allow me to put two hyperlinks because i don't have enough reputations! :))