I am trying to scrape a YouTube channel and return the links for every video on the channel. However, when I print the links, I only get a few that have nothing to do with the videos. I suspect the videos are loaded by JavaScript, so is there a way to do this with BeautifulSoup at all, or will I have to use Selenium? Can somebody please help me and do some testing? Here is my code so far:
import sys

import requests
from bs4 import BeautifulSoup

# Fetch a channel's /videos page and print every <a> tag found in the HTML
# that the server returns.
print('scanning page...')
youtuber = 'memeulous'
result = requests.get('https://www.youtube.com/c/' + youtuber + '/videos')
status = result.status_code

# Validate the response before spending time parsing it.
if status != 200:
    print('invalid URL, status code: ' + str(status))
    # sys.exit() is the script-safe way to stop; quit() is a convenience
    # intended for interactive sessions.
    sys.exit(1)

print('valid URL, grabbing uploads...')
soup = BeautifulSoup(result.content, 'lxml')
links = soup.find_all('a')
print(links)
and here is my output:
scanning page...
valid URL, grabbing uploads...
[About, Press, Copyright, Contact us, Creators, Advertise, Developers, Terms, Privacy, Policy and Safety, How YouTube works, Test new features]
[Finished in 4.0s]
as you can see, no video links.
One way of doing this would be with the following code:
import requests

api_key = "PASTE_YOUR_API_KEY_HERE!"
yt_user = "memeulous"

# Look up the channel by username to find the id of its "uploads" playlist.
api_url = f"https://www.googleapis.com/youtube/v3/channels?part=contentDetails&forUsername={yt_user}&key={api_key}"
api_response = requests.get(api_url)
api_response.raise_for_status()  # surface HTTP-level failures immediately
response = api_response.json()

# A lookup that matches no channel has nothing under "items"; stop with a
# clear message instead of a bare KeyError/IndexError.
if not response.get("items"):
    raise SystemExit(f"No channel found for username {yt_user!r}")
playlist_id = response["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]

# Base URL for paging through the uploads playlist, 50 items per page.
channel_url = f"https://www.googleapis.com/youtube/v3/playlistItems?" \
              f"part=snippet%2CcontentDetails&maxResults=50&playlistId={playlist_id}&key={api_key}"
def get_video_ids(vid_data: dict) -> list:
    """Collect the ``videoId`` of every entry under ``vid_data["items"]``."""
    video_ids = []
    for item in vid_data["items"]:
        video_ids.append(item["contentDetails"]["videoId"])
    return video_ids
def build_links(vid_ids: list) -> list:
    """Turn each video id into its YouTube watch URL, preserving order."""
    links = []
    for video_id in vid_ids:
        links.append(f"https://www.youtube.com/watch?v={video_id}")
    return links
def get_all_links() -> list:
    """Return watch URLs for every video in the uploads playlist.

    Requests ``channel_url`` page by page, following ``nextPageToken`` until
    a response no longer carries one.

    Raises:
        requests.HTTPError: if any page request returns an error status.
    """
    all_links = []
    url = channel_url
    while True:
        page = requests.get(url)
        # Fail with the real HTTP error rather than a KeyError on "items".
        page.raise_for_status()
        res = page.json()
        all_links.extend(build_links(get_video_ids(res)))
        paging_token = res.get("nextPageToken")
        if paging_token is None:
            # No token means this was the last page.
            break
        url = f"{channel_url}&pageToken={paging_token}"
    return all_links
print(get_all_links())
This gets you all the video links (469) for the memeulous user.
['https://www.youtube.com/watch?v=4L8_isnyGfg', 'https://www.youtube.com/watch?v=ogpaiD2e-ss', 'https://www.youtube.com/watch?v=oH-nJe9XMN0', 'https://www.youtube.com/watch?v=kUcbKl4qe5g', ...
You can get the total video count from a playlistItems response (the `res` dict inside get_all_links, called videos_data here) like this:
print(f"Total videos: {videos_data['pageInfo']['totalResults']}")
I hope this helps and will get you started. All you need to do, is get the API key for the YouTube Data API.
Related
I have a list of twitter usernames. I need to get their number of followers. I used BS and requests. However, I've only received one account every time.
# Question script: print the follower count for the first 40 usernames in a CSV.
# NOTE(review): indentation was lost in this paste, so which statements sit
# inside the `for` loop cannot be determined from the text itself.
from bs4 import BeautifulSoup
import requests
import pandas as pd
purcsv = pd.read_csv('pureeng.csv', engine= 'python')
followers = purcsv['username']
# In a script the result of head() is discarded; this line has no visible effect.
followers.head(10)
handle = purcsv['username'][0:40]
# presumably `handle` is a pandas Series, so this builds one URL per username.
temp = ("https://twitter.com/"+handle)
temp = temp.tolist()
for url in temp:
page = requests.get(url)
# Everything below reads `page`; if these lines run after the loop rather than
# inside it, only the last fetched page is ever parsed.
bs = BeautifulSoup(page.text,'lxml')
follow_box = bs.find('li',{'class':'ProfileNav-item ProfileNav-item--followers'})
followers = follow_box.find('a').find('span',{'class':'ProfileNav-value'})
print("Number of followers: {} ".format(followers.get('data-count')))
That's because you are looping over the urls first and fetching the content for each in the same variable page here:
for url in temp:
page = requests.get(url)
So `page` only ever holds the last URL that was fetched. To solve this, process each page as soon as it has been fetched:
# Parse every profile page right after it is fetched and collect the counts.
followers_list = []
for url in temp:
    page = requests.get(url)
    bs = BeautifulSoup(page.text, "html.parser")
    follow_box = bs.find('li', {'class': 'ProfileNav-item ProfileNav-item--followers'})
    followers = follow_box.find('a').find('span', {'class': 'ProfileNav-value'})
    count = followers.get('data-count')
    print("Number of followers: {} ".format(count))
    followers_list.append(count)
print(followers_list)
here is a full example to verify
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Full example: fetch each profile, parse it immediately, collect follower counts.
purcsv = pd.read_csv('pureeng.csv')
handles = purcsv['username'][0:40].tolist()

followers_list = []
for handle in handles:
    url = "https://twitter.com/" + handle
    try:
        # A timeout keeps one unresponsive profile from stalling the whole run.
        page = requests.get(url, timeout=10)
    except requests.RequestException as e:
        print(f"Failed to fetch page for url {url} due to: {e}")
        continue

    bs = BeautifulSoup(page.text, "html.parser")
    # find() returns None when an element is missing; walk the chain step by
    # step so a page without the follower box is skipped, not a crash.
    follow_box = bs.find('li', {'class': 'ProfileNav-item ProfileNav-item--followers'})
    link = follow_box.find('a') if follow_box is not None else None
    followers = link.find('span', {'class': 'ProfileNav-value'}) if link is not None else None
    if followers is None:
        print(f"No follower count found for url {url}")
        continue

    count = followers.get('data-count')
    print("Number of followers: {} ".format(count))
    followers_list.append(count)
print(followers_list)
output:
Number of followers: 13714085
Number of followers: 4706511
['13714085', '4706511']
You may consider using async functions for fetching and processing those URLs if you have too many of them.
My problem is that I am trying to scrape data from a website with multiple pages, each of which is a separate blog post. But the current code only scrapes the data from the URL that I have set as the variable source.
source = requests.get('https://www.trenerklemen.si/objave/').text.
I would like to scrape text from other url but one part is dynamic and i have no idea how to access it
source = requests.get('https://www.trenerklemen.si/?????/').text
How would i get the ???? part if it changes. Thanks for the answers.
from bs4 import BeautifulSoup
import requests

# Scrape the first <article> on the posts page: headline, summary, video link.
source = requests.get('https://www.trenerklemen.si/objave/').text
soup = BeautifulSoup(source, 'lxml')
article = soup.find('article')

headline = article.h2.text
print(headline)

post_body = article.find('div', class_='post-content')
summary = post_body.p.text
print(summary)

player = article.find('iframe', class_='youtube-player')
video = player['src']
# The id is the fifth '/'-separated piece of the embed URL, minus any query string.
video_id = video.split('/')[4].split('?')[0]
yt_link = f'https://youtube.com/watch?v={video_id}'
print(yt_link)
from urllib.parse import urldefrag, urljoin

from bs4 import BeautifulSoup
import requests

# Crawl the site starting at the posts page: queue every same-site link that
# is found, and print headline/summary/video for each page with an <article>.
mainLink = "https://www.trenerklemen.si"
ALL_links = ["https://www.trenerklemen.si/objave/"]
seen_links = set(ALL_links)  # duplicate check on absolute URLs
counter = 0
while counter < len(ALL_links):
    page_url = ALL_links[counter]
    counter += 1
    try:
        source = requests.get(page_url).text
    except requests.RequestException as e:
        # One unreachable page must not end the whole crawl.
        print("Error: " + str(e))
        continue
    soup = BeautifulSoup(source, 'lxml')

    for link in soup.findAll('a'):
        href = link.get('href')
        if not href:
            continue  # anchor without a usable href
        # Resolve relative links against the current page and drop "#fragment"
        # parts. The previous code prepended "https://www." to a base that
        # already contained it, producing unusable URLs.
        LinkNow = urldefrag(urljoin(page_url, href)).url
        # startswith() keeps the crawl on this site; a substring test also
        # matched foreign URLs that merely mention it.
        if LinkNow.startswith(mainLink) and LinkNow not in seen_links:
            seen_links.add(LinkNow)
            ALL_links.append(LinkNow)

    # Best effort: pages without the expected article markup just report an error.
    try:
        article = soup.find('article')
        headline = article.h2.text
        print(headline)
        summary = article.find('div', class_='post-content').p.text
        print(summary)
        video = article.find('iframe', class_ ='youtube-player')['src']
        video_id = video.split('/')[4]
        video_id = video_id.split('?')[0]
        yt_link = f'https://youtube.com/watch?v={video_id}'
        print(yt_link)
    except Exception as e:
        print("Error: "+str(e))
Websites usually use sitemaps for search engines to be able to crawl the content. You can use the sitemap as a source of your links to scrape.
Sitemap for your website: https://www.trenerklemen.si/post-sitemap.xml
I've written some code in python to grab details from a torrent site. However, when I run my code I found the results as I expected. The only problem with this crawler is that it skips the content of first page [as the pagination urls start from 2] which I can't fix. Any help on this will be highly appreciable.
import requests
from lxml import html
page_link = "https://yts.ag/browse-movies"
b_link = "https://yts.ag"
def get_links(main_link):
    """Scrape every pagination target linked from ``main_link``.

    Each anchor under ``ul.tsc_pagination`` whose href contains "page" is
    prefixed with ``b_link`` and handed to ``movie_details``.
    """
    listing = html.fromstring(requests.get(main_link).text)
    for anchor in listing.cssselect('ul.tsc_pagination a'):
        href = anchor.attrib["href"]
        if "page" not in href:
            continue
        movie_details(b_link + href)
def movie_details(link):
    """Print title, year, rating and the first two caption headings per movie.

    One line is printed for every ``div.browse-movie-wrap`` on the page at
    ``link``.
    """
    page = html.fromstring(requests.get(link).text)
    for card in page.cssselect("div.browse-movie-wrap"):
        title = card.cssselect('div.browse-movie-bottom a.browse-movie-title')[0].text
        year = card.cssselect('div.browse-movie-year')[0].text
        rating = card.cssselect('figcaption.hidden-xs h4.rating')[0].text
        headings = card.cssselect('figcaption.hidden-xs h4')
        genre = headings[0].text
        genre1 = headings[1].text
        print(title, year, rating, genre, genre1)
get_links(page_link)
Why not just call the movie_details() function on the main_link before the loop ?
def get_links(main_link):
    """Scrape ``main_link`` itself, then every distinct pagination page it links to.

    The first page is handled explicitly because the pagination URLs start
    from page 2.
    """
    response = requests.get(main_link).text
    tree = html.fromstring(response)
    movie_details(main_link)

    seen = set()
    for item in tree.cssselect('ul.tsc_pagination a'):
        # .get() tolerates anchors without an href instead of raising KeyError.
        href = item.get("href", "")
        if "page" in href and href not in seen:
            # A pagination bar may link the same page more than once;
            # scrape each target only once.
            seen.add(href)
            movie_details(b_link + href)
Continuing previous work on crawling all news results for a query and returning the title and URL, I am refining the crawler to get results from all pages in Google News. The current code seems to return only the first page of Google News search results. I would be grateful to know how to get the results from all pages. Many thanks!
my codes below:
import requests
from bs4 import BeautifulSoup
import time
import datetime
from random import randint
import numpy as np
import pandas as pd
query2Google = input("What do you want from Google News?\n")
def QGN(query2Google):
    """Search Google News for an exact phrase and save one results page as CSV.

    Writes ``<query>_<date>_SearchNews.csv`` with the columns Title, URL and
    Brief, one row per result block found on the page.

    Args:
        query2Google: phrase to search for; it is wrapped in quotes for the query.
    """
    s = '"' + query2Google + '"'  # Keywords for query
    s = s.replace(" ", "+")
    date = str(datetime.datetime.now().date())  # timestamp
    filename = query2Google + "_" + date + "_" + 'SearchNews.csv'  # csv filename
    # URL for query of news results within one year (tbs=qdr:y)
    url = "http://www.google.com.sg/search?q=" + s + "&tbm=nws&tbs=qdr:y"
    time.sleep(randint(0, 2))  # waiting
    htmlpage = requests.get(url)
    print("Status code: " + str(htmlpage.status_code))
    soup = BeautifulSoup(htmlpage.text, 'lxml')

    url_prefix = '/url?q='
    rows = []
    for result_table in soup.findAll("div", {"class": "g"}):
        a_click = result_table.find("a")
        if a_click is None:
            continue  # result block without a link: nothing to record
        # decode_contents() returns text directly, so there is no "b'...'" repr
        # to peel off. str.strip() removes a *set of characters* from both
        # ends, which also ate genuine leading/trailing letters.
        title = a_click.decode_contents()
        href = str(a_click.get("href"))
        if href.startswith(url_prefix):
            # Remove the redirect prefix only; leave the rest of the URL intact.
            href = href[len(url_prefix):]
        brief_tag = result_table.find("div", {"class": "st"})
        brief = brief_tag.decode_contents() if brief_tag is not None else ""
        rows.append([title, href, brief])

    # Build the frame once from a list of rows instead of growing an array.
    df1 = pd.DataFrame(rows, columns=['Title', 'URL', 'Brief'])
    print("Search Crawl Done!")
    # to_csv opens and closes the file itself; no separate handle is needed.
    df1.to_csv(filename, index=False, encoding='utf-8')
    return
QGN(query2Google)
There used to be an AJAX API, but it's no longer available.
Still , you can modify your script with a for loop if you want to get a number of pages , or a while loop if you want to get all pages .
Example :
url = "http://www.google.com.sg/search?q="+s+"&tbm=nws&tbs=qdr:y&start="
pages = 10    # the number of pages you want to crawl #

# `start` is the value appended to the URL for each page (0, 10, 20, ...);
# the name avoids shadowing the builtin next().
for start in range(0, pages*10, 10):
    page = url + str(start)
    time.sleep(randint(1, 5))    # you may need longer than that #
    htmlpage = requests.get(page)    # you should add User-Agent and Referer #
    print("Status code: " + str(htmlpage.status_code))
    if htmlpage.status_code != 200:
        break    # something went wrong #
    soup = BeautifulSoup(htmlpage.text, 'lxml')

    # ... process response here ...

    next_page = soup.find('td', { 'class':'b', 'style':'text-align:left' })
    if next_page is None or next_page.a is None:
        break    # there are no more pages #
Keep in mind that google doesn't like bots , you might get a ban .
You could add 'User-Agent' and 'Referer' in headers to simulate a web browser , and use time.sleep(random.uniform(2, 6)) to simulate a human ... or use selenium.
You can also add &num=25 to the end of your query to get back a page with that number of results. In this example you'll get 25 Google results back.
I am screen scraping data using a web crawler and storing the results - (tweets from a twitter page) as separate html files for each user I'm crawling. I intend to later parse the html files and store the data into a database for analysis. However, I am having a bizarre problem.
When I run the following program - a small snippet from the overall crawler - I am able to get a separate html file for each follower:
import re
import urllib2
import twitter
start_follower = "NYTimesKrugman"
depth = 3
searched = set()
api = twitter.Api()
def crawl(follower, in_depth):
    """Append ``follower``'s name to their own .html file, then recurse.

    Recurses into at most five not-yet-searched friends with ``in_depth - 1``;
    does nothing once ``in_depth`` is 0 or less.
    """
    if in_depth <= 0:
        return
    searched.add(follower)
    directory = "C:\\Python28\\Followertest1\\" + follower + ".html"
    # Close the file before recursing so open handles do not pile up with depth.
    with open(directory, 'a') as output:
        output.write(follower)
        output.write('\n\n')
    users = api.GetFriends(follower)
    names = set([str(u.screen_name) for u in users])
    names -= searched
    for name in list(names)[0:5]:
        crawl(name, in_depth-1)
crawl(start_follower, depth)
for x in searched:
print x
print "Program is completed."
However, when I run the full crawler, I do not get a separate file for each follower:
import twitter
import urllib
from BeautifulSoup import BeautifulSoup
import re
import time
start_follower = "NYTimeskrugman"
depth = 2
searched = set()
api = twitter.Api()
def add_to_U(user):
    """Append ``user`` to the collection named ``U``.

    NOTE(review): ``U`` is not defined in the visible source; confirm it
    exists before relying on this helper.
    """
    U.append(user)
def site(follower):
    """Return the mobile Twitter URL (as a string) for the given username."""
    return "http://mobile.twitter.com/" + follower
def getPage(follower):
    """Open the follower's mobile Twitter page and return the response object."""
    return urllib.urlopen(site(follower))
def getSoup(response):
    """Read ``response`` in full and parse the markup with BeautifulSoup."""
    return BeautifulSoup(response.read())
def gettweets(soup, output):
    """Write each tweet block on the page to ``output``, blank-line separated.

    A tweet block is any ``div`` with class ``list-tweet``; its rendered
    contents are written as a string, followed by two newlines.
    """
    for tweet in soup.findAll('div', {'class' : "list-tweet"}):
        output.write(str(tweet.renderContents()))
        output.write('\n\n')
def are_more_tweets(soup):
    """Return True if the page links to a further page of tweets.

    Looks at every ``<a>`` that has an ``href`` and ``id="more_link"`` and
    checks whether its rendered contents contain the text "more".
    """
    # Both filters belong in the single attrs dict. The earlier call passed a
    # second dict keyed by the builtin ``id`` function as a separate
    # positional argument, so the id constraint was never applied.
    links = soup.findAll('a', {'href': True, 'id': 'more_link'})
    return any('more' in str(link.renderContents()) for link in links)
def getnewlink(soup):
    """Return the absolute URL of the next page of tweets, or None if there is none.

    The first ``<a>`` with an ``href`` and ``id="more_link"`` whose rendered
    contents contain "more" supplies the path.
    """
    # Both filters belong in the single attrs dict; a dict keyed by the
    # builtin ``id`` passed as an extra positional argument filters nothing.
    links = soup.findAll('a', {'href': True, 'id': 'more_link'})
    for link in links:
        # Substring test, matching how are_more_tweets decides that a "more"
        # link exists, so the two helpers cannot disagree.
        if 'more' in str(link.renderContents()):
            return 'http://mobile.twitter.com' + link['href']
    return None
def crawl(follower, in_depth): #main method of sorts
    """Save all of ``follower``'s tweets to their own file, then recurse.

    Writes the follower's name and every page of tweets to
    ``<follower>.html``, then crawls up to five not-yet-searched friends with
    ``in_depth - 1``. Does nothing once ``in_depth`` is 0 or less.
    """
    if in_depth <= 0:
        return
    searched.add(follower)
    directory = "C:\\Python28\\Followertest2\\" + follower + ".html"
    # The with-block closes the file even if a page fetch fails, and before
    # the recursion below opens further files.
    with open(directory, 'a') as output:
        output.write(follower)
        output.write('\n\n')
        soup = getSoup(getPage(follower))
        gettweets(soup, output)
        while are_more_tweets(soup):
            next_url = getnewlink(soup)
            if next_url is None:
                # getnewlink can come back without a link; stop paging
                # rather than hand None to urlopen.
                break
            soup = getSoup(urllib.urlopen(next_url))
            gettweets(soup, output)
    users = api.GetFriends(follower)
    names = set([str(u.screen_name) for u in users])
    names -= searched
    for name in list(names)[0:5]:
        print(name)
        crawl(name, in_depth - 1)
crawl(start_follower, depth)
print("Program done. Look at output file.")
More specifically, I seem to get a separate html file for about the first five followers and then no new files appear to be created. Any help would be appreciated!
The depth value is different between the snippet and the full code (you're only going to get one level of recursion in the full code). Also, you only grab the first five names from the followers list: for name in list(names)[0:5]: So you get six people total: the starting follower and their first five friends.