Getting youtube link element from source code - python

I am looking at http://www.bing.com/videos/search?q=kohli and trying to look up video URLs.
The anchor tag contains the YouTube link, but inside a dictionary-like attribute that I want to extract.
import urllib
import urllib2
from bs4 import BeautifulSoup

redditFile = urllib2.urlopen("http://www.bing.com/videos?q=" + urllib.quote_plus(word))
redditHtml = redditFile.read()
redditFile.close()
soup = BeautifulSoup(redditHtml)
productDivs = soup.findAll('div', attrs={'class': 'dg_u'})
for div in productDivs:
    print div.find('a')['vrhm']  # This element contains the youtube url but print does not display it
    if div.find('div', {"class": "vthumb", 'smturl': True}) is not None:
        print div.find('div', {"class": "vthumb", 'smturl': True})['smturl']  # this gives a link to the micro video
How can I get the YouTube link from the a tag's vrhm attribute?

You can use json.loads to load a dictionary from the JSON string.
The for loop can be modified as follows:
>>> productDivs = soup.findAll('div', attrs={'class': 'dg_u'})
>>> for div in productDivs:
...     a_dict = json.loads(div.a['vrhm'])
...     print a_dict['p']
https://www.youtube.com/watch?v=bWbrWI3PBss
https://www.youtube.com/watch?v=bWbrWI3PBss
https://www.youtube.com/watch?v=PbTx2Fjth-0
https://www.youtube.com/watch?v=pB1Kjx-eheY
..
..
What does it do?
div.a['vrhm'] extracts the vrhm attribute of the immediate a child of the div.
a_dict = json.loads(div.a['vrhm']) parses that JSON string and builds the dictionary a_dict.
print a_dict['p'] prints the value of the 'p' key, which holds the YouTube URL. a_dict is a regular Python dictionary, so use it as you usually would.
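For completeness, here is a minimal end-to-end sketch that guards against result tiles whose anchor is missing or does not carry the vrhm attribute. It assumes Bing still serves div.dg_u tiles with a JSON-encoded vrhm attribute whose 'p' key holds the video URL; if the markup has changed, the selectors will need adjusting.
import json
import requests
from bs4 import BeautifulSoup

# Assumption: the 'dg_u' class, the 'vrhm' attribute and its 'p' key are as described in the question.
html = requests.get("http://www.bing.com/videos/search", params={"q": "kohli"}).text
soup = BeautifulSoup(html, "html.parser")
for div in soup.find_all("div", class_="dg_u"):
    a = div.find("a")
    if a is None or not a.has_attr("vrhm"):
        continue  # skip tiles without the expected attribute
    try:
        print(json.loads(a["vrhm"]).get("p"))
    except ValueError:
        pass  # the attribute was not valid JSON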

Related

How can I extract a specific item attribute from an ebay listing using BeautifulSoup?

import requests
from bs4 import BeautifulSoup

def get_data(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

current_data = get_data(link)
x = current_data.find_all(text="Style Code:")
I'm trying to get the style code of a shoe from an eBay listing, but the problem is that it doesn't have a specific class or any other unique identifier, so I can't just use find() to get the data. Currently I search by text to find 'Style Code:', but how can I get to the next div? An example of a shoe product page would be this.
soup.select_one('span.ux-textspans:-soup-contains("Style Code:")').find_next('span').get_text(strip=True)
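Note that select_one returns None when nothing matches the selector (and the :-soup-contains pseudo-class needs a reasonably recent BeautifulSoup/soupsieve), so a guarded version of the same one-liner might look like this sketch:
label = soup.select_one('span.ux-textspans:-soup-contains("Style Code:")')
if label is not None:
    value = label.find_next('span')
    if value is not None:
        print(value.get_text(strip=True))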
Try this,
spans = soup.find_all('span', attrs={'class': 'ux-textspans'})
style_code = None
for idx, span in enumerate(spans):
    if span.text == 'Style Code:':
        style_code = spans[idx + 1].text
        break
print(style_code)
# 554724-371
Since there are lots of similar spans (all with the class 'ux-textspans'), you need to iterate through them and take the span that comes right after 'Style Code:'. A variation on the same idea is sketched below.
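As that variation, you could also locate the label span directly by its text and then step to the following span; this is only a sketch and assumes the label text matches 'Style Code:' exactly:
label = soup.find('span', class_='ux-textspans', string='Style Code:')
if label is not None:
    print(label.find_next('span').get_text(strip=True))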

Scrape href not working with python

I have copies of this very code that I am trying to reproduce, and every time I copy it line by line it doesn't work right. I am more than frustrated and can't seem to figure out where it is going wrong. What I am trying to do is go to a website and scrape the different ratings pages, which are labelled A, B, C, etc. Then I go to each page to pull the total number of pages it uses. I am trying to scrape the anchors inside <span class='letter-pages'> whose href is '/ratings/A/1' and so on. What am I doing wrong?
import requests
from bs4 import BeautifulSoup

url = "https://www.brightscope.com/ratings/"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

hrefs = []
ratings = []
ks = []
pages_scrape = []

for href in soup.findAll('a'):
    if 'href' in href.attrs:
        hrefs.append(href.attrs['href'])

for good_ratings in hrefs:
    if good_ratings.startswith('/ratings/'):
        ratings.append(url[:-9] + good_ratings)
    # elif good_ratings.startswith('/401k'):
    #     ks.append(url[:-9] + good_ratings)

del ratings[0]
del ratings[27:]
print(ratings)

for each_rating in ratings:
    page = requests.get(each_rating)
    soup = BeautifulSoup(page.text, 'html.parser')
    for href in soup.find('span', class_='letter-pages'):
        # Not working here
        pages_scrape.append(href.attrs['href'])
        # Will print all the anchor tags with hrefs if I remove the above comment.
        print(href)
You are trying to get the href prematurely: you are extracting the attribute directly from a span tag that has nested a tags, instead of from each of the a tags inside it.
for each_rating in ratings:
    page = requests.get(each_rating)
    soup = BeautifulSoup(page.text, 'html.parser')
    span = soup.find('span', class_='letter-pages')
    for a in span.find_all('a'):
        href = a.get('href')
        pages_scrape.append(href)
I didn't test this on all pages, but it worked for the first one. You pointed out that on some of the pages the content wasn't getting scraped, which is due to the span search returning None. To get around this you can do something like:
for each_rating in ratings:
    page = requests.get(each_rating)
    soup = BeautifulSoup(page.text, 'html.parser')
    span = soup.find('span', class_='letter-pages')
    if span:
        for a in span.find_all('a'):
            href = a.get('href')
            pages_scrape.append(href)
            print(href)
    else:
        print('span.letter-pages not found on ' + each_rating)
Depending on your use case you might want to do something different, but this will indicate to you which pages don't match your scraping model and need to be manually investigated.
You probably meant to do find_all instead of find -- so change
for href in soup.find('span', class_='letter-pages'):
to
for href in soup.find_all('span', class_='letter-pages'):
You want to be iterating over a list of tags, not a single tag. find gives you a single Tag object, and when you iterate over a single tag you get its children as NavigableString (and Tag) objects. find_all gives you the list of Tag objects you want.
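A minimal sketch of the difference, using a made-up HTML snippet (the markup here is illustrative only, not Brightscope's actual page):
from bs4 import BeautifulSoup

html = '<span class="letter-pages"><a href="/ratings/A/1">1</a><a href="/ratings/A/2">2</a></span>'
soup = BeautifulSoup(html, 'html.parser')

single = soup.find('span', class_='letter-pages')   # one Tag (or None if nothing matches)
for child in single:                                 # iterating a Tag walks its children
    print(type(child).__name__, child)

for span in soup.find_all('span', class_='letter-pages'):  # a list of Tags
    for a in span.find_all('a'):
        print(a['href'])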

why am I not able to print the html address and title from a html page source?

I am trying to get an article link and the title of that article from the HTML of a page. I tried using this code:
def get_ndtvsports_articles():
    ndtvsports_article_link = "https://sports.ndtv.com/cricket"
    r = requests.get(ndtvsports_article_link)
    ndtvsports_article_html = r.text
    soup = BeautifulSoup(ndtvsports_article_html, "html.parser")
    # print(soup.prettify())
    ndtvsports_items = soup.find_all("div", {"class": "post-title"})
    ndtvsports_article_dict = {}
    for div in ndtvsports_items:
        ndtvsports_article_dict[div.find('a')['title']] = div.find('a')['href']
    return ndtvsports_article_dict
I ended up getting this output:
<function get_ndtvsports_articles at 0x7f33fb762950>
This does not have a text entry
This does not have a text entry
This does not have a text entry
This does not have a text entry
This does not have a text entry
This does not have a text entry
I was expecting to get the href link and page title like this:
{'Mendis misses out on maiden double ton': 'http://www.cricbuzz.com/cricket-news/100130/bangladesh-vs-sri-lanka-1st-test-day-3-tea-kusal-mendis-dananjaya-de-silva-report'}
I suspect what you are doing is
print(get_ndtvsports_articles)
and NOT
print(get_ndtvsports_articles())
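In other words, referencing the function name without parentheses just prints the function object; you have to call the function to get its return value. A quick sketch of the intended usage:
articles = get_ndtvsports_articles()  # the parentheses actually call the function
print(articles)                       # prints the {title: href} dictionary it returns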

Python Beautifulsoup4 remove <span> tags

I am scraping information from a website using this line
offers = soup.find_all("span", "rcnt")
Which gives me this result:
[<span class="rcnt">8.668</span>]
And for some reason, when I tried to unwrap it, it gave me this:
[<span class="rcnt"></span>]
instead of 8.668.
How do I code this correctly?
Use .string or .renderContents() to get the value.
htmls = '<span class="rcnt">8.668</span>'
soup = BeautifulSoup(htmls)
offers = soup.find_all("span", "rcnt")
print offers[0].string ## this one is better
print offers[0].renderContents()
It is not clear from your description what code you are using to get (unwrap) the content. Here is what you do:
offers is a list. To get the content within the span elements you do:
elements = [tag.text for tag in offers]
elements will have the contents of all of the span tags in your HTML.
>>> html = '<span class="rcnt">8.668</span><span class="rcnt">5.7868</span>'
>>> soup = BeautifulSoup(html)
>>> offers = soup.find_all("span", "rcnt")
>>> elements = [tag.text for tag in offers]
>>> elements
[u'8.668', u'5.7868']
Just use .string to retrieve the value inside any HTML tag.
html = '<span class="rcnt">8.668</span>'
soup = BeautifulSoup(html)
offers = soup.find_all('span', attrs={"class": "rcnt"})
find_all returns a list of all the matching span tags. Now you can use the .string attribute to retrieve the string inside each span tag:
for i in range(0, len(offers)):
    print offers[i].string

Using beautifulsoup to scrape <h2> tag

I am scraping website data using Beautiful Soup. I want the text value ("My name is nick") from the following element, but I have searched a lot on Google and can't find a solution that works for my case.
news_panel = soup.findAll('div', {'class': 'menuNewsPanel_MenuNews1'})
for news in news_panel:
    temp = news.find('h2')
    print temp
Output:
<h2 class="menuNewsHl2_MenuNews1">My name is nick</h2>
But I want output like this: My name is nick
Just grab the text attribute:
>>> soup = BeautifulSoup('''<h2 class="menuNewsHl2_MenuNews1">My name is nick</h2>''')
>>> soup.text
u'My name is nick'
Your error is probably occurring because you don't have that specific tag in your input string.
Check if temp is not None
news_panel = soup.findAll('div', {'class': 'menuNewsPanel_MenuNews1'})
for news in news_panel:
    temp = news.find('h2')
    if temp:
        print temp.text
or put your print statement in a try ... except block
news_panel = soup.findAll('div', {'class': 'menuNewsPanel_MenuNews1'})
for news in news_panel:
    try:
        print news.find('h2').text
    except AttributeError:
        continue
Try using this:
all_string = soup.find_all("h2")[0].get_text()
