Extract rating from google search results - python

I am trying to extract Google search results using the Google API in Python. I am able to extract the URL, link, title and snippet, but I also want to extract the rating that is displayed in the Google search results.
Below is the code i am using:
#Google Search Function
def google_search(search_term, api_key, cse_id, **kwargs):
    """Run a Custom Search query and return the result items.

    Args:
        search_term: Query string, sent as ``q``.
        api_key: Developer key passed to ``build``.
        cse_id: Custom search engine id, sent as ``cx``.
        **kwargs: Extra parameters forwarded to ``cse().list`` (e.g. ``num``).
            ``start`` and ``hq`` default to ``1`` and ``'company reviews'``
            and may be overridden here.

    Returns:
        The list stored under the response's ``items`` key, or an empty list
        when the response has no ``items`` key.
    """
    # Apply the defaults through setdefault so that a caller-supplied
    # start/hq does not collide with a hard-coded keyword argument
    # (which would raise TypeError: multiple values for keyword argument).
    kwargs.setdefault("start", 1)
    kwargs.setdefault("hq", "company reviews")
    service = build("customsearch", "v1", developerKey=api_key)
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
    # Avoid a KeyError when the response carries no "items" entry.
    return res.get("items", [])
results = google_search('Swiggy', my_api_key, my_cse_id, num=10)
print(results[2]["title"])
print(results[2]["link"])
print(results[2]["displayLink"])
print(results[2]["snippet"])
I can see that the first search result, when searching "swiggy company review" on Google, shows a rating of 3.7, but I don't know how to extract that information. Can anyone please suggest a solution?
Thanks in advance

Since Google API has been deprecated, this can easily be done by scraping the page with BeautifulSoup's CSS selector methods, select() (for multiple elements) / select_one() (for a specific element), among other techniques.
Code and full example:
from bs4 import BeautifulSoup
import requests, lxml, json
# Browser-like User-Agent sent with the request instead of the library default.
headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
# timeout keeps the script from waiting forever on an unresponsive server.
response = requests.get(
    'https://www.google.com/search?q=swiggy company review',
    headers=headers,
    timeout=30).text
soup = BeautifulSoup(response, 'lxml')

# Selects just one Review element (using converted xPath to CSS selector):
# review = soup.select_one('#rso > div:nth-of-type(1) > div > div > div:nth-of-type(2) > div > span:nth-of-type(1)').text
# print(review)

# Selects just one Vote element (using converted xPath to CSS selector):
# votes = soup.select_one('#rso > div:nth-of-type(1) > div > div > div:nth-of-type(2) > div > span:nth-of-type(2)').text
# print(votes)

data = []

# Selects multiple Vote elements:
for result in soup.select('.uo4vr'):
    rating_tag = result.select_one('.uo4vr g-review-stars+ span')
    votes_tag = result.select_one('.uo4vr span+ span')
    # select_one returns None when nothing matches; skip entries without a
    # rating line instead of failing on `.text`.
    if rating_tag is None or votes_tag is None:
        continue

    # The rating is the part after the first colon; skip text with no colon
    # rather than raising IndexError.
    rating_parts = rating_tag.text.split(':')
    if len(rating_parts) < 2:
        continue

    data.append({
        "Rating": rating_parts[1].strip(),
        "Votes/Reviews": votes_tag.text.split(' ')[0],
    })

print(json.dumps(data, indent=2))
Output:
[
{
"Rating": "4",
"Votes/Reviews": "1,219"
},
{
"Rating": "4",
"Votes/Reviews": "1,090"
},
{
"Rating": "3.8",
"Votes/Reviews": "46"
},
{
"Rating": "3.8",
"Votes/Reviews": "260"
},
{
"Rating": "4.1",
"Votes/Reviews": "1,047"
},
{
"Rating": "3.3",
"Votes/Reviews": "47"
},
{
"Rating": "1.5",
"Votes/Reviews": "114"
}
]
Alternatively, you can use Google Organic Results API from SerpApi. It's a paid API with a free trial.
Code to integrate:
from serpapi import GoogleSearch
import os, json
# Search parameters; the API key is read from the API_KEY environment variable.
params = {
    "engine": "google",
    "q": "swiggy company review",
    "api_key": os.getenv("API_KEY"),
}

search = GoogleSearch(params)
results = search.get_dict()

# For extracting single elements:
# rating = results['organic_results'][0]['rich_snippet']['top']['detected_extensions']['rating']
# print(f"Rating: {rating}")
# votes = results['organic_results'][0]['rich_snippet']['top']['detected_extensions']['votes']
# print(f"Votes: {votes}")


def _detected_extension(organic_result, key):
    """Return one rich-snippet detected extension of a result, or None if absent."""
    try:
        return organic_result['rich_snippet']['top']['detected_extensions'][key]
    except (KeyError, TypeError):
        # KeyError: a level of the path is missing.
        # TypeError: a level exists but is not subscriptable by key.
        return None


# For extracting multiple elements:
data = []

for organic_result in results['organic_results']:
    data.append({
        "Title": organic_result['title'],
        "Rating": _detected_extension(organic_result, 'rating'),
        "Votes": _detected_extension(organic_result, 'votes'),
        "Reviews": _detected_extension(organic_result, 'reviews'),
    })

print(json.dumps(data, indent=2))
Output:
[
{
"Title": "Swiggy Reviews | Glassdoor",
"Rating": 4,
"Votes": 1219,
"Reviews": null
},
{
"Title": "Ride.Swiggy: 254 Employee Reviews | Indeed.com",
"Rating": null,
"Votes": null,
"Reviews": null
},
{
"Title": "Working at Swiggy | Glassdoor",
"Rating": 4,
"Votes": 1090,
"Reviews": null
}
]
Disclaimer, I work for SerpApi.

Related

Web scraping through API - Python

I'm trying to web scrape a web site through python.
URL = "https://www.boerse-frankfurt.de/bond/xs0216072230"
With the code below, I am getting no result, it shows this in output : {}
Code is below :
import requests
url = (
"https://api.boerse-frankfurt.de/v1/data/master_data_bond?isin=XS0216072230"
)
headers = {
"X-Client-TraceId": "d87b41992f6161c09e875c525c70ffcf",
"X-Security": "d361b3c92e9c50a248e85a12849f8eee",
"Client-Date": "2022-08-25T09:07:36.196Z",
}
data = requests.get(url, headers=headers).json()
print(data)
It should print :
{
"isin": "XS0216072230",
"type": {
"originalValue": "25",
"translations": {
"de": "(Industrie-) und Bankschuldverschreibungen",
"en": "Industrial and bank bonds",
},
},
"market": {
"originalValue": "OPEN",
"translations": {"de": "Freiverkehr", "en": "Open Market"},
Any help would be appreciated, I am avoiding Selenium approach for this at the moment.
Thanks in advance.
URL must have some data. https://api.boerse-frankfurt.de/v1/data/master_data_bond?isin=XS0216072230 this url is Empty
This works for me
import requests
# API endpoint for the bond master data; the bond is selected by the isin query parameter.
url = (
"https://api.boerse-frankfurt.de/v1/data/master_data_bond?isin=XS0216072230"
)
# Headers sent with the GET request below.
# NOTE(review): client-date, x-client-traceid and x-security look like
# per-request values and presumably need to be refreshed — confirm.
header = {
"authority":"api.boerse-frankfurt.de",
"method":"GET",
"path":"/v1/data/master_data_bond?isin=XS0216072230",
"scheme":"https",
"accept":"application/json, text/plain, */*",
"accept-encoding":"gzip, deflate, br",
"accept-language":"en-US,en;q=0.6",
"client-date":"2022-08-26T18:35:26.470Z",
"origin":"https://www.boerse-frankfurt.de",
"referer":"https://www.boerse-frankfurt.de/",
"x-client-traceid":"21eb43fb86f0065542ba9a34b7f2fa93",
"x-security":"14407a81ab4670847d3d55b0d74a3aea",
}
# Fetch the endpoint and decode the response body as JSON.
data = requests.get(url, headers=header).json()
print(data)
But I think you might need to update x-client-traceid, client-date, and x-security regularly.

I'm trying to web scrape ebay using python and BeautifulSoup, but I'm getting a list index out of range error

As in the title, I'm trying to write a Ebay web-scrape program, yet when I try to find the price, it creates a list error, yet it works for getting the product name.
The url is: https://www.ebay.com.au/sch/i.html?_from=R40&_nkw=switch&_sacat=0&_pgn=1
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
**Open Collection**
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close
grabs each products
containers = page_soup.findAll("div", {"class" : "s-item__wrapper clearfix"})
filename = "EbayWebscraping.csv"
f = open(filename, "w")
headers = "product_name, quality"
for container in containers:
title_container = container.findAll('h3', {'class' : 's-item__title'} )
product_name = title_container[0].text
#Where the problem is#
price_container = container.findAll('span', {'class' : 's-item__price'})
price = price_container[0].text
print('Product: ' + product_name)
print('Price: ' + price)
If you inspect containers, you will see that the element at index 0 has no product or price info, so you can start from index 1. You can also use try-except instead.
import requests
from bs4 import BeautifulSoup
# Download the search page and parse it.
search_url = "https://www.ebay.com.au/sch/i.html?_from=R40&_nkw=switch&_sacat=0&_pgn=1"
page = requests.get(search_url)
soup = BeautifulSoup(page.text, "lxml")

# Collect every product wrapper, then drop the first one, which carries no
# product or price information.
all_wrappers = soup.findAll("div", {"class" : "s-item__wrapper clearfix"})
containers = all_wrappers[1:]

for container in containers:
    # Print the product title followed by its price.
    title_tag = container.find('h3', {'class' : 's-item__title'} )
    print(title_tag.text)
    price_tag = container.find("span", class_="s-item__price")
    print(price_tag.text)
Output:
30 in 1 Game Collection Nintendo Switch Brand New Sealed
AU $47.00
Street Fighter 30th Anniversary Collection Nintendo Switch Brand New Sealed
AU $47.00
For Nintendo Switch Case ZUSLAB Clear Slim Soft Heavy Duty Shockproof Cover
AU $9.99 to AU $16.95
.....
You can also check if the selector is present before doing further processing:
if container.findAll('span', {'class' : 's-item__price'}):
# do something
You also don't need to access the [0] index; .text works directly on the result of find(). Additionally, there's no need to use findAll, since you are already extracting data from each container, which holds the title and price elements inside. Think of the container as a matryoshka doll if that makes more sense.
You just have to call text and price selectors e.g:
containers = page_soup.findAll("div", {"class" : "s-item__wrapper clearfix"})
for container in containers:
product_name = container.find('h3', {'class' : 's-item__title'}).text
price = container.find('span', {'class' : 's-item__price'}).text
Code that paginates through all pages and example in online IDE.
from bs4 import BeautifulSoup
import requests, json, lxml
# https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
}

params = {
    '_nkw': 'switch',  # search query
    '_pgn': 1          # page number
}

data = []

# Walk the result pages until there is no "next" pagination element.
while True:
    page = requests.get('https://www.ebay.com/sch/i.html', params=params, headers=headers, timeout=30)
    soup = BeautifulSoup(page.text, 'lxml')

    print(f"Extracting page: {params['_pgn']}")
    print("-" * 10)

    for product in soup.select(".s-item__info"):
        title_tag = product.select_one(".s-item__title span")
        price_tag = product.select_one(".s-item__price")
        link_tag = product.select_one(".s-item__link")

        # select_one returns None when a card lacks the element; skip such
        # cards rather than failing on `.text` / `["href"]`.
        if title_tag is None or price_tag is None or link_tag is None:
            continue

        data.append({
            "title" : title_tag.text,
            "price" : price_tag.text,
            "link" : link_tag["href"]
        })

    if soup.select_one(".pagination__next"):
        params['_pgn'] += 1
    else:
        break

print(json.dumps(data, indent=2, ensure_ascii=False))
Example output
Extracting page: 1
----------
[
{
"title": "Shop on eBay",
"price": "$20.00",
"link": "https://ebay.com/itm/123456?hash=item28caef0a3a:g:E3kAAOSwlGJiMikD&amdata=enc%3AAQAHAAAAsJoWXGf0hxNZspTmhb8%2FTJCCurAWCHuXJ2Xi3S9cwXL6BX04zSEiVaDMCvsUbApftgXEAHGJU1ZGugZO%2FnW1U7Gb6vgoL%2BmXlqCbLkwoZfF3AUAK8YvJ5B4%2BnhFA7ID4dxpYs4jjExEnN5SR2g1mQe7QtLkmGt%2FZ%2FbH2W62cXPuKbf550ExbnBPO2QJyZTXYCuw5KVkMdFMDuoB4p3FwJKcSPzez5kyQyVjyiIq6PB2q%7Ctkp%3ABlBMULq7kqyXYA"
},
{
"title": "Jabra Elite 7 Pro - Black Certified Refurbished",
"price": "$82.99",
"link": "https://www.ebay.com/itm/165621993671?epid=12050643207&hash=item268fd710c7:g:gMwAAOSwx8Bi9Fwg&amdata=enc%3AAQAHAAAA4NGq89JefbLJPItXeQ93BWhuE9Wt3pRHvU92HE2wiaGKAUlhQ8hDKu9iP2m5gdNQc8t8ujFSUwXJSyCxrnjh9qaxVXN0s0V7clbWiZTPr7Co3AwECpNLit29NfC%2BXbQxEv7kePJokjM9wnHv%2BAamoTlPl0K8BHa0S3FVrb7IUn9s%2FmvdzTiGUd4DHYNdIEQeFNK7zqB8%2BlWrukvfUz62JemzooE1UYtLbCtQwfIDP1F2GbOL4DoRwHXynUtpduYPA8TX6qZOv8eL44j4hNnP6%2BjGBaDGCReJ6ld13xxhYEUf%7Ctkp%3ABFBM3qnT0f5g"
},
{
"title": "New Listingnintendo switch bundle ",
"price": "$225.00",
"link": "https://www.ebay.com/itm/354344900745?hash=item52809a1889:g:egsAAOSw-qZjUQl-&amdata=enc%3AAQAHAAAA4MkbjLSYGoCVhjI%2BBE%2F1cIoqAfUyH73WJdSL7XugI%2BMtaCzRdexKqk3SnxM3PT5yMHSrChuJdcLC6ESDVvNs2j01yTzx8Cl9i9CQbV89Gp9tzPQNIaBGkVwSh989DJ4lmSmCKywnPQ9yLQqY3fz96kBJbbZwGd63yks4tTuZOiNcAl7PTriDOrVNHF%2FUXm3s18tajQeqtrZxW4pb8nWa5%2FtdmrwDphxTKmA9sONVXfKX5oFujpDxrwswe%2FgoJi2XGjGqe06ruHbzH295EHuRLUv4Tn0R2Kf7CKaman2IEpPo%7Ctkp%3ABFBM3qnT0f5g"
},
# ...
]
As an alternative, you can use Ebay Organic Results API from SerpApi. It's a paid API with a free plan that handles blocks and parsing on their backend.
Example code that paginates through all pages:
from serpapi import EbaySearch
import os, json
params = {
    "api_key": os.getenv("API_KEY"),  # serpapi api key
    "engine": "ebay",                 # search engine
    "ebay_domain": "ebay.com",        # ebay domain
    "_nkw": "switch",                 # search query
    "_pgn": 1,                        # page number (comma was missing here: SyntaxError)
    "LH_Sold": "1"                    # shows sold items
}

search = EbaySearch(params)           # where data extraction happens

page_num = 0  # number of pages fetched so far, printed as progress

data = []

while True:
    results = search.get_dict()       # JSON -> Python dict

    # Stop and report when the response carries an error message.
    if "error" in results:
        print(results["error"])
        break

    for organic_result in results.get("organic_results", []):
        link = organic_result.get("link")
        price = organic_result.get("price")

        data.append({
            "price" : price,
            "link" : link
        })

    page_num += 1
    print(page_num)

    # Advance to the next page only while the response advertises one.
    if "next" in results.get("pagination", {}):
        params['_pgn'] += 1
    else:
        break

print(json.dumps(data, indent=2))
Output:
[
{
"price": {
"raw": "$70.00",
"extracted": 70.0
},
"link": "https://www.ebay.com/itm/334599074264?hash=item4de7a8b1d8:g:Vy4AAOSwLLNjUK2i&amdata=enc%3AAQAHAAAAkKM1u%2BmETRpbgLxiKL9uymVFiae4NU2iJa00z6qQK4lyzoe477sEDhhVVjF39BDTAOJQ4PLP%2BoXj1xf5wH8Ja5v1oAmO%2FNRlSFlTK80FlnQkHpIYswiG%2BNH44f98M5LWkwgeOb%2FRVc9uU6Ep9HYaV9JV39LZFRiOJLOGgFvoRxLD4731y0VuzM%2BcPXThX7aXtA%3D%3D%7Ctkp%3ABk9SR4KOv9H-YA"
},
{
"price": {
"raw": "$169.95",
"extracted": 169.95
},
"link": "https://www.ebay.com/itm/185625421454?epid=4050657390&hash=item2b3823268e:g:WrIAAOSwPKdjPfvK&amdata=enc%3AAQAHAAAAoBkI9bwtrhJH9mDVPkHzYgem23XBXWHO%2FghvdNjkqq2RX%2BCoy33RIc%2FxXg%2BHWp0Y5jUL9%2BOfnpKyRshkZTRttODPLt%2Fu0VIfjunwr%2F6r9lKHiZ9w%2FnaITM0BTU0FeU1gKw2dERJwDKrzgCPNc%2FStsq0BdCUYNxQeLG4I1ezDBYZSseUv96U33wRLz%2BJ94pP6UgnCp2nj4oX3qFujBLsvG%2F8%3D%7Ctkp%3ABk9SR4KOv9H-YA"
},
# ...
]
Disclaimer, I work for SerpApi.

Scraping Ebay, working until I use it in sold items

I will use this code to explain my doubt:
Using the url without sold filter
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
url = "https://www.ebay.es/sch/i.html?_from=R40&_trksid=p2334524.m570.l1313&_nkw=iphone+x&_sacat=0&LH_TitleDesc=0&_udlo=400&LH_Auction=1&_osacat=0&_odkw=Pok%C3%A9mon+card+Charizard+4%2F102&rt=nc"
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
results = soup.find_all("div", {"class": "s-item__info clearfix"})
print(len(results))
Output: 12
Then I use the url where there are only sold items, I check the html and the class is the same.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
url = "https://www.ebay.es/sch/i.html?_from=R40&_nkw=iphone+x&_sacat=0&LH_TitleDesc=0&_udlo=400&LH_Auction=1&rt=nc&LH_Sold=1&LH_Complete=1"
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
results = soup.find_all("div", {"class": "s-item__info clearfix"})
print(len(results))
Output: 0
I tried different classes but I can never obtain anything.
Thanks.
It was a captcha problem. Thanks!
There are several reasons why the output will be empty.
This is often because the site may think it is being accessed by a bot: the default user-agent in the requests library is python-requests. This can be prevented by passing your actual User-Agent in the "headers". This seems to be the reason why you get a CAPTCHA.
The next step, if passing a User-Agent didn't work, would be to rotate user-agents, for example, to switch between PC, mobile, and tablet, as well as between browsers, e.g. Chrome, Firefox, Safari, Edge and so on.
Also if passing request headers is not enough. That's when you can try using proxies (ideally residential) in combination with request headers.
An additional step is to use CAPTCHA solver, for example, 2captcha. It allows bypassing all possible CAPTCHAs depending on the target website.
Check the code using BeautifulSoup in online IDE.
from bs4 import BeautifulSoup
import requests, json, lxml
# https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36",
}

params = {
    '_nkw': 'iphone_x',     # search query
    'LH_Sold': '1',         # shows sold items
    '_pgn': 1               # page number
}

data = []
limit = 10                  # page limit (if needed)

# Walk the result pages until the page limit is reached or there is no
# "next" pagination element.
while True:
    page = requests.get('https://www.ebay.es/sch/i.html', params=params, headers=headers, timeout=30)
    soup = BeautifulSoup(page.text, 'lxml')

    print(f"Extracting page: {params['_pgn']}")
    print("-" * 10)

    for product in soup.select(".s-item__info"):
        title_tag = product.select_one(".s-item__title span")
        price_tag = product.select_one(".s-item__price")

        # select_one returns None when a card lacks the element; skip such
        # cards rather than failing on `.text`.
        if title_tag is None or price_tag is None:
            continue

        data.append({
            "title" : title_tag.text,
            "price" : price_tag.text
        })

    if params['_pgn'] == limit:
        break

    if soup.select_one(".pagination__next"):
        params['_pgn'] += 1
    else:
        break

print(json.dumps(data, indent=2, ensure_ascii=False))
Example output:
[
{
"title": "Apple iPhone X 64 GB y 256 GB Grado A++ Desbloqueado - Excelente Estado Todos los Colores",
"price": "234,52 EUR"
},
{
"title": "Funda de silicona a prueba de golpes para iPhone 11 Pro Max 14Pro 8 7 SE 2022 colores",
"price": "4,56 EUR"
},
{
"title": "Apple iPhone X 64 GB 256 GB gris plateado sin contrato COMO NUEVO SIN MANCHA Wow ",
"price": "377,00 EUR a 409,00 EUR"
},
{
"title": "Funda transparente de silicona completa a prueba de golpes para iPhone 11 12 13 14 PRO MAX Mini X XR 8",
"price": "1,13 EUR a 4,06 EUR"
},
{
"title": "Apple iPhone X - 256 GB - Plateado (Desbloqueado) (Leer descripción) FA1065",
"price": "163,88 EUR"
},
other results ...
]
You can also use the official eBay Finding API, which has a limit of 5000 requests per day, or a third-party API like Ebay Organic Results API from SerpApi. It's a paid API with a free plan that handles blocks and parsing on their backend.
Example code with pagination:
from serpapi import EbaySearch
import json
params = {
    "api_key": "...",                 # serpapi key, https://serpapi.com/manage-api-key
    "engine": "ebay",                 # search engine
    "ebay_domain": "ebay.es",         # ebay domain
    "_nkw": "iphone_x",               # search query
    "LH_Sold": "1",                   # shows sold items
    "_pgn": 1                         # page number
}

search = EbaySearch(params)           # where data extraction happens

page_num = 0
data = []

# Fetch page after page until an error is reported or no next page exists.
while True:
    results = search.get_dict()       # JSON -> Python dict

    if "error" in results:
        print(results["error"])
        break

    # Keep only the title and price of every organic result on this page.
    data.extend(
        {"title" : item.get("title"), "price" : item.get("price")}
        for item in results.get("organic_results", [])
    )

    page_num += 1
    print(page_num)

    if "next" not in results.get("pagination", {}):
        break
    params['_pgn'] += 1

print(json.dumps(data, indent=2))
Output:
[
{
"title": "Apple iPhone X (10) Desbloqueado 64 GB/256 GB Gris espacial/Plateado - Excelente Estado",
"price": {
"raw": "297,34 EUR",
"extracted": 297.34
}
},
{
"title": "Nuevo anuncioApple iPhone X - 64GB - Bianco (Sbloccato).",
"price": {
"raw": "340,00 EUR",
"extracted": 340.0
}
},
{
"title": "Apple iPhone X - 256GB - Factory Unlocked - Good Condition",
"price": {
"raw": "230,80 EUR",
"extracted": 230.8
}
},
other results ...
]

google search web scraping class= not same as on browser

I am trying to grab video panel in google result
for example I am searching ---> "great+castles" <--
and in that search result, it has a panel that contains videos
when I scrape it I get HTML but with different values of attributes
I am not able to grab video panel
text="great+castles"
url = f'https://google.com/search?q={text}'
response = requests.get(url)
print(url)
soup = BeautifulSoup(response.text,'html.parser')
a=soup.findAll('div',{'id':'main'})
a
I do get output response but attributes are not same as on google chrome
Firstly, you can always write that HTML response in HTML file and check what actually you're getting by opening in the browser.
Secondly, you cannot scrape data from google that easily, you need proxies for that but even with elite proxies you may face number of challenges like reCaptcha etc.
You have 2 options to check source code returned by requests:
Save response as html file locally and open it in browser.
Get Scrapy framework and use view(response) from Scrapy Shell. The Scrapy option is handy but requires installation of the framework that can be an overkill for a one-time project.
There is also another(more robust) way to get results from Google Search by using [Google Search API][2] from SerpApi. It's a paid API with a free plan.
For example, your request will return handy json including the inline video section:
> "inline_videos":
> [
> {
> "position":
> 1,
> "title":
> "A Thousand Years of European Castles",
> "link":
> "https://www.youtube.com/watch?v=uXSFt-zey84",
> "thumbnail":
> "https://i.ytimg.com/vi/uXSFt-zey84/mqdefault.jpg?sqp=-oaymwEECHwQRg&rs=AMzJL3n1trdIa7_n5X-kJf8pq70OYoY47w",
> "channel":
> "Best Documentary",
> "duration":
> "53:59",
> "platform":
> "YouTube",
> "date":
> "Jan 25, 2022"
> },
> {
> "position":
> 2,
> "title":
> "The Most Beautiful Castles in the World",
> "link":
> "https://www.youtube.com/watch?v=ln-v2ibnWHU",
> "thumbnail":
> "https://i.ytimg.com/vi/ln-v2ibnWHU/mqdefault.jpg?sqp=-oaymwEECHwQRg&rs=AMzJL3kHM2n3_vkRLM_stMr0XuiFs5uaCQ",
> "channel":
> "Luxury Homes",
> "duration":
> "4:58",
> "platform":
> "YouTube",
> "date":
> "Mar 29, 2020"
> },
> {
> "position":
> 3,
> "title":
> "Great Castles of Europe: Neuschwanstein (Part 1 of 3)",
> "link":
> "https://www.youtube.com/watch?v=R_uFzANW2Xo",
> "thumbnail":
> "https://i.ytimg.com/vi/R_uFzANW2Xo/mqdefault.jpg?sqp=-oaymwEECHwQRg&rs=AMzJL3nYdSY5YW2QU1pijXo3xx7ObrILdg",
> "channel":
> "trakehnen",
> "duration":
> "8:51",
> "platform":
> "YouTube",
> "date":
> "Sep 24, 2009"
> }
> ],
Disclaimer, I work for SerpApi.
.
You can scrape Google Search Video Panel Results using BeautifulSoup web scraping library.
To get to the tab we need, you need to register it in the parameters, like this:
# These URL params are taken from the actual Google search URL
# and transformed to a more readable format
params = {
"q": "great castles", # query
"tbm" : "vid", # video panel
"gl": "us", # country of the search
"hl": "en" # language of the search
}
To get the required data, you need to get a "container", which is a CSS selector called class selector that contains all the information about video results i.e title, link, channel name and so on.
In our case, this is the "video-voyager" selector which contains data about the title, channel name, video link, description and so on.
Have a look at the SelectorGadget Chrome extension to easily pick selectors by clicking on the desired element in your browser (it does not always work perfectly if the website is rendered via JavaScript).
Check code in online IDE.
from bs4 import BeautifulSoup
import requests, lxml, json
# Browser-like User-Agent passed as request headers.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36"
}
# Query-string parameters for the Google search request.
params = {
"q": "great castles", # query
"tbm" : "vid", # video panel
"gl": "us", # country of the search
"hl": "en" # language of the search
}
# By default it scrapes the video page results, but either part can be turned off.
def scrape_google_videos(inline_videos=False, video_page=True):
    """Print Google video results for the module-level ``params`` query as JSON.

    Args:
        inline_videos: When true, request the search page without the
            ``tbm`` parameter and print the inline video entries.
        video_page: When true, request the search page with the module-level
            ``params`` unchanged and print the video page entries.

    Returns:
        None. Results are printed, not returned.
    """
    if inline_videos:
        data_inline_video = []

        # Build a copy without "tbm" instead of popping it from the shared
        # module-level dict: popping would also strip tbm=vid from the
        # video-page request below and from every later call.
        inline_params = {key: value for key, value in params.items() if key != "tbm"}

        html = requests.get("https://www.google.com/search", headers=headers, params=inline_params, timeout=30)
        soup = BeautifulSoup(html.text, "lxml")

        print("Inline video data:\n")

        for result in soup.select(".WZIVy"):
            title = result.select_one(".cHaqb").text
            platform = result.select_one("cite").text
            chanel = result.select_one(".pcJO7e span").text.replace(" · ", "")
            date = result.select_one(".hMJ0yc span").text

            data_inline_video.append({
                "title" : title,
                "platform" : platform,
                "chanel" : chanel,
                "date" : date
            })

        print(json.dumps(data_inline_video, indent=2, ensure_ascii=False))

    if video_page:
        data_video_panel = []

        html = requests.get("https://www.google.com/search", headers=headers, params=params, timeout=30)
        soup = BeautifulSoup(html.text, "lxml")

        print("Video panel data:\n")

        for products in soup.select("video-voyager"):
            title = products.select_one(".DKV0Md").text
            description = products.select_one(".Uroaid").text
            link = products.select_one(".ct3b9e a")["href"]
            chanel = products.select_one(".Zg1NU+ span").text
            duration = products.select_one(".J1mWY div").text
            date = products.select_one(".P7xzyf span span").text

            data_video_panel.append({
                "title" : title,
                "description" : description,
                "link" : link,
                "chanel" : chanel,
                "duration" : duration,
                "date" : date
            })

        print(json.dumps(data_video_panel, indent=2, ensure_ascii=False))
scrape_google_videos(video_page=True, inline_videos=False)
Inline video data:
[
{
"title": "A Thousand Years of European Castles",
"platform": "YouTube",
"chanel": "Best Documentary",
"date": "Jan 25, 2022"
},
{
"title": "MOST BEAUTIFUL Castles on Earth",
"platform": "YouTube",
"chanel": "Top Fives",
"date": "Feb 2, 2022"
},
{
"title": "Great Castles of Europe: Neuschwanstein (Part 1 of 3)",
"platform": "YouTube",
"chanel": "trakehnen",
"date": "Sep 24, 2009"
}
]

Why does soup only shows half of the chart I'm scraping?

I'm scraping from a google search but I can only get the first row of a two row chart on the right-hand side.
The search query is:
https://www.google.com/search?q=kegerators
I've noticed that doing an inspect element doesn't really work as beautifulsoup seems to extract a different code.
The code I have is:
htmltext=br.open(query).read()
soup=BeautifulSoup(htmltext)
search = soup.findAll("div", attrs={ "class" : "_cf" })
print search
Upon looking at the code (basically looking for "b>$" - as I know I should see 8 of those) I only get 4, which happen to be the top row of the chart.
These is the result of the search:
[<div class="_cf" style="overflow:hidden"><span class="_vf" style="height:86px;width:86px"><span class="_uf"></span><img class="_wf" src="http://t3.gstatic.com/shopping?q=tbn:ANd9GcRY5NBoY-anFlJUYExmil81vJG5i1nw6LqVu64lSjw8tSPBUEdh3JaiFix-gfSKMGtE2ZwX8w&usqp=CAc"/></span><div style="height:2.4em;overflow:hidden">EdgeStar Ultra Low Temp F...</div><div><b>$599.00</b></div><div><cite style="white-space:nowrap">Kegerator</cite></div></div>, <div class="_cf" style="overflow:hidden"><span class="_vf" style="height:86px;width:86px"><span class="_uf"></span><img class="_wf" src="http://t3.gstatic.com/shopping?q=tbn:ANd9GcRS4iCsD4EDV37Rg1kZf0nxFK3bYgYaWC-bxMv-ISg4dI8m-COU3ZHCZGs3FdJBK3npkpoE&usqp=CAc"/></span><div style="height:2.4em;overflow:hidden">Kegco K199SS‑2 D...</div><div><b>$539.99</b></div><div><cite style="white-space:nowrap">BeverageFa...</cite></div></div>, <div class="_cf" style="overflow:hidden"><span class="_vf" style="height:86px;width:86px"><span class="_uf"></span><img class="_wf" src="http://t2.gstatic.com/shopping?q=tbn:ANd9GcSkf6-jVZt34pd_6QyqZGre06VxszvFZX70-wUOEDRhEFhorX_Yek0oyr-5jvk8FNpj2KWusQ&usqp=CAc"/></span><div style="height:2.4em;overflow:hidden">EdgeStar Ultra Low Temp F...</div><div><b>$499.00</b></div><div><cite style="white-space:nowrap">Compact Ap...</cite></div></div>, <div class="_cf" style="overflow:hidden"><span class="_vf" style="height:86px;width:86px"><span class="_uf"></span><img class="_wf" src="http://t1.gstatic.com/shopping?q=tbn:ANd9GcTf56EQ6DVbOk02D7cLgVmlurU-2gNrhD6a74MnzQBWg1W290DTYQuj0sSUxQEbxo1XO6pB&usqp=CAc"/></span><div style="height:2.4em;overflow:hidden">FunTime Black Kegge...</div><div><b>$399.99</b></div><div><cite style="white-space:nowrap">Max Tool</cite></div></div>]
Is Google doing something strange here?
The reason why results might differ is that Google displays different results on each request, e.g. sometimes it could get 10 shopping results, sometimes 7 or 4.
Specifying gl (country, e.g: us), hl (language, e.g: en) query params could get exact or close to the exact result that you see in your browser.
Also, don't forget to specify a user-agent, otherwise, Google will block your requests eventually.
Code and example in the online IDE:
import requests
from bs4 import BeautifulSoup
headers = {
    # Adjacent string literals are concatenated as-is, so the first part
    # needs its trailing space to keep "537.36 (KHTML" separated.
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    "q": "buy coffe",  # intentional grammatical error to display right side shopping results
    "hl": "en",
    "gl": "us"
}

# timeout keeps the script from waiting forever on an unresponsive server.
response = requests.get("https://www.google.com/search", headers=headers, params=params, timeout=30)
soup = BeautifulSoup(response.text, 'html.parser')

# scrapes both top and right side shopping results
for result in soup.select('.pla-hovercard-content-ellip'):
    title = result.select_one('.pymv4e').text
    link = result.select_one('.pla-hovercard-content-ellip a.tkXAec')['href']
    ad_link = f"https://www.googleadservices.com/pagead{result.select_one('.pla-hovercard-content-ellip a')['href']}"
    price = result.select_one('.qptdjc').text

    # Rating is optional. TypeError: the element is missing (None is not
    # subscriptable); KeyError: the element has no aria-label attribute.
    # NOTE(review): the chained replace() calls glue the remaining pieces of
    # the label together (the sample output shows "4.85") — confirm intended.
    try:
        rating = result.select_one('.Fam1ne.tPhRLe')["aria-label"].replace("Rated ", "").replace(" out of ", "").replace(",", "")
    except (TypeError, KeyError):
        rating = None

    # Review count is optional. AttributeError: the element is missing.
    try:
        reviews = result.select_one('.GhQXkc').text.replace("(", "").replace(")", "")
    except AttributeError:
        reviews = None

    source = result.select_one('.zPEcBd.LnPkof').text.strip()

    print(f'{title}\n{link}\n{ad_link}\n{price}\n{rating}\n{reviews}\n{source}\n')
-------------
'''
MUD\WTR | Mushroom Coffee Replacement, 90 servings
https://mudwtr.com/collections/shop/products/90-serving-bag
https://www.googleadservices.com/pagead/aclk?sa=l&ai=DChcSEwj5p8u-2rzyAhV2yJQJHfzhBoUYABAHGgJ5bQ&sig=AOD64_3NGBzLzkTv61K7kSrD2f9AREHH_g&ctype=5&q=&ved=2ahUKEwji7MK-2rzyAhWaaM0KHcnaDDcQ9aACegQIAhBo&adurl=
$125.00
4.85
1k+
mudwtr.com
...
'''
Alternatively, you can do the same thing using Google Inline Shopping API from SerpApi. It's a paid API with a free plan.
The difference is that everything is already extracted, and all that needs to be done is just to iterate over structured JSON.
Code to integrate:
import json, os
from serpapi import GoogleSearch
# Search parameters; the API key is read from the API_KEY environment variable.
api_key = os.getenv("API_KEY")
params = {"api_key": api_key, "engine": "google", "q": "buy coffe", "hl": "en", "gl": "us"}

search = GoogleSearch(params)
results = search.get_dict()

# Print every inline shopping result as pretty-printed JSON.
shopping_results = results['shopping_results']
for entry in shopping_results:
    pretty = json.dumps(entry, indent=2, ensure_ascii=False)
    print(pretty)
--------
'''
{
"position": 1,
"block_position": "right",
"title": "Maxwell House Original Roast | 48oz",
"price": "$10.49",
"extracted_price": 10.49,
"link": "https://www.google.com/aclk?sa=l&ai=DChcSEwiGn8aT2rzyAhXgyZQJHZHdBJMYABAEGgJ5bQ&ae=2&sig=AOD64_0jBjdUIMeqJvrXYxn4NGcpwCYrJQ&ctype=5&q=&ved=2ahUKEwiOxLmT2rzyAhWiFVkFHWMNAaEQ5bgDegQIAhBa&adurl=",
"source": "Boxed",
"rating": 4.6,
"reviews": 2000,
"thumbnail": "https://serpapi.com/searches/611e1b2cfdca3e6a1c9335e6/images/e4ae7f31164ec52021f1c04d8be4e4bda2138b1acd12c868052125eb86ead292.png"
}
...
'''
P.S - I wrote a blog post about this topic that you can find here.
Disclaimer, I work for SerpApi.

Categories