Scrape Google Scholar Security Page - python

I have a string like this:
url = 'http://scholar.google.pl/citations?view_op\x3dsearch_authors\x26hl\x3dpl\x26oe\x3dLatin2\x26mauthors\x3dlabel:security\x26after_author\x3drukAAOJ8__8J\x26astart\x3d10'
I wish to convert it to this:
converted_url = 'https://scholar.google.pl/citations?view_op=search_authors&hl=en&mauthors=label:security&after_author=rukAAOJ8__8J&astart=10'
I have tried this:
converted_url = url.decode('utf-8')
However, this error is thrown:
AttributeError: 'str' object has no attribute 'decode'

You can let requests handle the URL encoding for you by passing the query parameters as a dict.
Note: the after_author URL parameter is a next-page token, so if you request the exact URL you provided, the returned HTML will not be what you expect, because after_author changes on every request. In my case, for example, it was uB8AAEFN__8J, while in your URL it's rukAAOJ8__8J.
To make it work, you need to parse the next-page token from the first page, which leads to the second page, and so on. For example:
# from my other answer:
# https://github.com/dimitryzub/stackoverflow-answers-archive/blob/main/answers/scrape_all_scholar_profiles_bs4.py
import re

params = {
    "view_op": "search_authors",
    "mauthors": "valve",
    "hl": "pl",
    "astart": 0
}

authors_is_present = True
while authors_is_present:
    # ... fetch the results page with requests and build `soup` (BeautifulSoup) from it here ...

    # if the next page button is present -> update the next page token and increment to the next page
    # if the next page button is not present -> exit the while loop
    if soup.select_one("button.gs_btnPR")["onclick"]:
        params["after_author"] = re.search(r"after_author\\x3d(.*)\\x26", str(soup.select_one("button.gs_btnPR")["onclick"])).group(1)  # -> XB0HAMS9__8J
        params["astart"] += 10
    else:
        authors_is_present = False
Code and example to extract profile data (also runnable in the online IDE):
from parsel import Selector
import requests, json

# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
params = {
    "q": "label:security",
    "hl": "pl",
    "view_op": "search_authors"
}

# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.87 Safari/537.36",
}

html = requests.get("https://scholar.google.pl/citations", params=params, headers=headers, timeout=30)
selector = Selector(html.text)

profiles = []

for profile in selector.css(".gs_ai_chpr"):
    profile_name = profile.css(".gs_ai_name a::text").get()
    profile_link = f'https://scholar.google.com{profile.css(".gs_ai_name a::attr(href)").get()}'
    profile_email = profile.css(".gs_ai_eml::text").get()
    profile_interests = profile.css(".gs_ai_one_int::text").getall()

    profiles.append({
        "profile_name": profile_name,
        "profile_link": profile_link,
        "profile_email": profile_email,
        "profile_interests": profile_interests
    })

print(json.dumps(profiles, indent=2))
Alternatively, you can achieve the same thing using the Google Scholar Profiles API from SerpApi. It's a paid API with a free plan.
The difference is that you don't need to figure out how to extract the data, bypass blocks from search engines, scale the number of requests, and so on.
Example code to integrate:
from serpapi import GoogleSearch
import os, json

params = {
    "api_key": os.getenv("API_KEY"),        # SerpApi API key
    "engine": "google_scholar_profiles",    # SerpApi profiles parsing engine
    "hl": "pl",                             # language
    "mauthors": "label:security"            # search query
}

search = GoogleSearch(params)
results = search.get_dict()

for profile in results["profiles"]:
    print(json.dumps(profile, indent=2))

# part of the output:
'''
{
  "name": "Johnson Thomas",
  "link": "https://scholar.google.com/citations?hl=pl&user=eKLr0EgAAAAJ",
  "serpapi_link": "https://serpapi.com/search.json?author_id=eKLr0EgAAAAJ&engine=google_scholar_author&hl=pl",
  "author_id": "eKLr0EgAAAAJ",
  "affiliations": "Professor of Computer Science, Oklahoma State University",
  "email": "Zweryfikowany adres z cs.okstate.edu",
  "cited_by": 159999,
  "interests": [
    {
      "title": "Security",
      "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=pl&mauthors=label%3Asecurity",
      "link": "https://scholar.google.com/citations?hl=pl&view_op=search_authors&mauthors=label:security"
    },
    {
      "title": "cloud computing",
      "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=pl&mauthors=label%3Acloud_computing",
      "link": "https://scholar.google.com/citations?hl=pl&view_op=search_authors&mauthors=label:cloud_computing"
    },
    {
      "title": "big data",
      "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=pl&mauthors=label%3Abig_data",
      "link": "https://scholar.google.com/citations?hl=pl&view_op=search_authors&mauthors=label:big_data"
    }
  ],
  "thumbnail": "https://scholar.google.com/citations/images/avatar_scholar_56.png"
}
'''
Disclaimer, I work for SerpApi.

decode() is used to convert bytes into a string, but your url is already a string, not bytes.
You can use encode() to convert the string into bytes and then decode() with the unicode_escape codec to get the correct string.
(I use the r prefix to simulate text with this problem; without the prefix the url would not need converting.)
url = r'http://scholar.google.pl/citations?view_op\x3dsearch_authors\x26hl\x3dpl\x26oe\x3dLatin2\x26mauthors\x3dlabel:security\x26after_author\x3drukAAOJ8__8J\x26astart\x3d10'
print(url)
url = url.encode('utf-8').decode('unicode_escape')
print(url)
result:
http://scholar.google.pl/citations?view_op\x3dsearch_authors\x26hl\x3dpl\x26oe\x3dLatin2\x26mauthors\x3dlabel:security\x26after_author\x3drukAAOJ8__8J\x26astart\x3d10
http://scholar.google.pl/citations?view_op=search_authors&hl=pl&oe=Latin2&mauthors=label:security&after_author=rukAAOJ8__8J&astart=10
BTW: first check print(url); maybe you already have the correct url but are using the wrong method to display it. The Python shell displays results without print() using repr(), which shows some characters as escape codes to make the encoding of the text visible (utf-8, iso-8859-1, win-1250, latin-1, etc.).
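For example, comparing print() with the shell's repr() output makes it obvious whether the backslash sequences are really part of the data or just how the string is displayed (a minimal, hypothetical illustration):
url = r'citations?view_op\x3dsearch_authors'   # literal backslash sequences in the data
print(url)          # citations?view_op\x3dsearch_authors  -> the escapes really are in the string
print(repr(url))    # 'citations?view_op\\x3dsearch_authors'

clean = 'citations?view_op=search_authors'     # a normal string with a real '='
print(clean)        # citations?view_op=search_authors
print(repr(clean))  # 'citations?view_op=search_authors'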

Related

for a in soup.find_all("div",{"class":"rg_meta"}): NOT working

I hope this question finds you in the best of health and spirit. I use this Python script for image scraping, but this loop is not working. Can anyone suggest a solution? Here is the script:
from bs4 import BeautifulSoup
import requests
import re
import urllib.request as urllib2
import os
import argparse
import sys
import json

# adapted from http://stackoverflow.com/questions/20716842/python-download-images-from-google-image-search

def get_soup(url, header):
    return BeautifulSoup(urllib2.urlopen(urllib2.Request(url, headers=header)), 'html.parser')

def main(args):
    parser = argparse.ArgumentParser(description='Scrape Google images')
    parser.add_argument('-s', '--search', default='bananas', type=str, help='search term')
    parser.add_argument('-n', '--num_images', default=10, type=int, help='num images to save')
    parser.add_argument('-d', '--directory', default='/Users/gene/Downloads/', type=str, help='save directory')
    args = parser.parse_args()
    query = args.search  # raw_input(args.search)
    max_images = args.num_images
    save_directory = args.directory
    image_type = "Action"
    query = input()
    query = query.split()
    query = '+'.join(query)
    url = "https://www.google.co.in/search?q=" + query + "&source=lnms&tbm=isch"
    header = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
    soup = get_soup(url, header)
    ActualImages = []  # contains the link for Large original images, type of image
    for a in soup.find_all("div", {"class": "rg_meta"}):
        link, Type = json.loads(a.text)["ou"], json.loads(a.text)["ity"]
        ActualImages.append((link, Type))
    for i, (img, Type) in enumerate(ActualImages[0:max_images]):
        try:
            req = urllib2.Request(img, headers={'User-Agent': header})
            raw_img = urllib2.urlopen(req).read()
            if len(Type) == 0:
                f = open(os.path.join(save_directory, "img" + "_" + str(i) + ".jpg"), 'wb')
            else:
                f = open(os.path.join(save_directory, "img" + "_" + str(i) + "." + Type), 'wb')
            f.write(raw_img)
            f.close()
        except Exception as e:
            print("could not load : " + img)
            print(e)

if __name__ == '__main__':
    from sys import argv
    try:
        main(argv)
    except KeyboardInterrupt:
        pass
    sys.exit()
To scrape Google Images data, especially images in their original resolution, you need to get them from the inline JSON using regular expressions. Alternatively, you can extract the data with browser automation such as selenium; however, scraping speed would be a lot slower compared to the regex approach.
To find out whether inline JSON is present, you can start by searching the page source (Ctrl+U) for the title of the first image on the page. If the matches are inside the <script> elements, then most likely this is inline JSON, and we can extract the data from there.
We can find both thumbnails and full-resolution images. First we match the thumbnails with regular expressions, and then the original-resolution images:
# https://regex101.com/r/SxwJsW/1
matched_google_images_thumbnails = ", ".join(
    re.findall(r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]',
               str(matched_google_image_data))).split(", ")

thumbnails = [bytes(bytes(thumbnail, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for thumbnail in matched_google_images_thumbnails]

# removing previously matched thumbnails for easier full resolution image matches.
removed_matched_google_images_thumbnails = re.sub(
    r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]', "", str(matched_google_image_data))

# https://regex101.com/r/fXjfb1/4
# https://stackoverflow.com/a/19821774/15164646
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]", removed_matched_google_images_thumbnails)
Check code in online IDE.
import requests, re, json, lxml
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
}

google_images = []

params = {
    "q": "tower",    # search query
    "tbm": "isch",   # image results
    "hl": "en",      # language of the search
    "gl": "us"       # country where the search comes from
}

html = requests.get("https://google.com/search", params=params, headers=headers, timeout=30)
soup = BeautifulSoup(html.text, "lxml")

all_script_tags = soup.select("script")

# https://regex101.com/r/RPIbXK/1
matched_images_data = "".join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

matched_images_data_fix = json.dumps(matched_images_data)
matched_images_data_json = json.loads(matched_images_data_fix)

# https://regex101.com/r/NRKEmV/1
matched_google_image_data = re.findall(r'\"b-GRID_STATE0\"(.*)sideChannel:\s?{}}', matched_images_data_json)

# https://regex101.com/r/SxwJsW/1
matched_google_images_thumbnails = ", ".join(
    re.findall(r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]',
               str(matched_google_image_data))).split(", ")

thumbnails = [bytes(bytes(thumbnail, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for thumbnail in matched_google_images_thumbnails]

# removing previously matched thumbnails for easier full resolution image matches.
removed_matched_google_images_thumbnails = re.sub(
    r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]', "", str(matched_google_image_data))

# https://regex101.com/r/fXjfb1/4
# https://stackoverflow.com/a/19821774/15164646
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]", removed_matched_google_images_thumbnails)

full_res_images = [
    bytes(bytes(img, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for img in matched_google_full_resolution_images
]

for index, (metadata, thumbnail, original) in enumerate(zip(soup.select('.isv-r.PNCib.MSM1fd.BUooTd'), thumbnails, full_res_images), start=1):
    google_images.append({
        "title": metadata.select_one(".VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb")["title"],
        "link": metadata.select_one(".VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb")["href"],
        "source": metadata.select_one(".fxgdke").text,
        "thumbnail": thumbnail,
        "original": original
    })

print(json.dumps(google_images, indent=2, ensure_ascii=False))
Example output:
[
  {
    "title": "Eiffel Tower - Wikipedia",
    "link": "https://en.wikipedia.org/wiki/Eiffel_Tower",
    "source": "Wikipedia",
    "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTsuYzf9os1Qb1ssPO6fWn-5Jm6ASDXAxUFYG6eJfvmehywH-tJEXDW0t7XLR3-i8cNd-0&usqp=CAU",
    "original": "https://upload.wikimedia.org/wikipedia/commons/thumb/8/85/Tour_Eiffel_Wikimedia_Commons_%28cropped%29.jpg/640px-Tour_Eiffel_Wikimedia_Commons_%28cropped%29.jpg"
  },
  {
    "title": "tower | architecture | Britannica",
    "link": "https://www.britannica.com/technology/tower",
    "source": "Encyclopedia Britannica",
    "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR8EsWofNiFTe6alwRlwXVR64RdWTG2fuBQ0z1FX4tg3HbL7Mxxvz6GnG1rGZQA8glVNA4&usqp=CAU",
    "original": "https://cdn.britannica.com/51/94351-050-86B70FE1/Leaning-Tower-of-Pisa-Italy.jpg"
  },
  {
    "title": "Tower - Wikipedia",
    "link": "https://en.wikipedia.org/wiki/Tower",
    "source": "Wikipedia",
    "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT3L9LA0VamqmevhCtkrHZvM9MlBf9EjtTT7KhyzRP3zi3BmuCOmn0QFQG42xFfWljcsho&usqp=CAU",
    "original": "https://upload.wikimedia.org/wikipedia/commons/3/3e/Tokyo_Sky_Tree_2012.JPG"
  },
  # ...
]
As an alternative, you can use the Google Images API from SerpApi. It's a paid API with a free plan.
The difference is that it bypasses blocks (including CAPTCHA) from Google, and there is no need to create and maintain the parser yourself.
Simple code example:
from serpapi import GoogleSearch
import os, json

image_results = []

# search query parameters
params = {
    "engine": "google",               # search engine. Google, Bing, Yahoo, Naver, Baidu...
    "q": "tower",                     # search query
    "tbm": "isch",                    # image results
    "num": "100",                     # number of images per page
    "ijn": 0,                         # page number: 0 -> first page, 1 -> second...
    "api_key": os.getenv("API_KEY")   # your serpapi api key
    # other query parameters: hl (lang), gl (country), etc.
}

search = GoogleSearch(params)         # where data extraction happens

images_is_present = True
while images_is_present:
    results = search.get_dict()       # JSON -> Python dictionary

    # checks for "Google hasn't returned any results for this query."
    if "error" not in results:
        for image in results["images_results"]:
            if image["original"] not in image_results:
                image_results.append(image["original"])

        # update to the next page
        params["ijn"] += 1
    else:
        print(results["error"])
        images_is_present = False

print(json.dumps(image_results, indent=2))
Output:
[
"https://cdn.rt.emap.com/wp-content/uploads/sites/4/2022/08/10084135/shutterstock-woods-bagot-rough-site-for-leadenhall-tower.jpg",
"https://dynamic-media-cdn.tripadvisor.com/media/photo-o/1c/60/ff/c5/ambuluwawa-tower-is-the.jpg?w=1200&h=-1&s=1",
"https://cdn11.bigcommerce.com/s-bf3bb/product_images/uploaded_images/find-your-nearest-cell-tower-in-five-minutes-or-less.jpeg",
"https://s3.amazonaws.com/reuniontower/Reunion-Tower-Exterior-Skyline.jpg",
"https://assets2.rockpapershotgun.com/minecraft-avengers-tower.jpg/BROK/resize/1920x1920%3E/format/jpg/quality/80/minecraft-avengers-tower.jpg",
"https://images.adsttc.com/media/images/52ab/5834/e8e4/4e0f/3700/002e/large_jpg/PERTAMINA_1_Tower_from_Roundabout.jpg?1386960835",
"https://awoiaf.westeros.org/images/7/78/The_tower_of_joy_by_henning.jpg",
"https://eu-assets.simpleview-europe.com/plymouth2016/imageresizer/?image=%2Fdmsimgs%2Fsmeatontower3_606363908.PNG&action=ProductDetailNew",
# ...
]
There's a Scrape and download Google Images with Python blog post if you need a little bit more code explanation.

How to exclude particular websites in Bing Web Search API result?

I'm using Bing Web Search API to search for some info in Bing.com. However, for some search queries, I'm getting websites from search, which are not relevant for my purposes.
For example, if the search query is books to read this summer sometimes Bing returns youtube.com or amazon.com as a result. However, I don't want to have these kinds of websites in my search results. So, I want to block these kinds of websites before the search starts.
Here is my sample code:
params = {
    "mkt": "en-US",
    "setLang": "en-US",
    "textDecorations": False,
    "textFormat": "raw",
    "responseFilter": "Webpages",
    "q": "books to read this summer",
    "count": 20
}
headers = {
    "Ocp-Apim-Subscription-Key": MY_APY_KEY_HERE,
    "Accept": "application/json",
    "Retry-After": "1",
}
response = requests.get(WEB_SEARCH, headers=headers, params=params, timeout=15)
response.raise_for_status()
search_data = response.json()
The search_data variable contains the links I want to block, and I'd rather not iterate over the response object. In particular, I want Bing not to include youtube and amazon in the results.
Any help will be appreciated.
Edited: WEB_SEARCH is the web search endpoint
After digging through the Bing Web Search documentation, I found an answer. Here it is.
LINKS_TO_EXCLUDE = ["-site:youtube.com", "-site:amazon.com"]

def bing_data(user_query):
    params = {
        "mkt": "en-US",
        "setLang": "en-US",
        "textDecorations": False,
        "textFormat": "raw",
        "responseFilter": "Webpages",
        "count": 30
    }
    headers = {
        "Ocp-Apim-Subscription-Key": MY_APY_KEY_HERE,
        "Accept": "application/json",
        "Retry-After": "1",
    }

    # This is the line I needed: append the -site: exclusions to the query
    params["q"] = user_query + " " + " ".join(LINKS_TO_EXCLUDE)

    response = requests.get(WEB_SEARCH_ENDPOINT, headers=headers, params=params)
    response.raise_for_status()
    search_data = response.json()
    return search_data
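So, for example, calling the function with the original query sends Bing the query string with the exclusion operators appended (a small usage sketch; bing_data and LINKS_TO_EXCLUDE are from the snippet above):
search_data = bing_data("books to read this summer")
# the query actually sent to Bing becomes:
#   books to read this summer -site:youtube.com -site:amazon.com
# so youtube.com and amazon.com pages are excluded before the search runs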

How can I find the url of an image from Google Images (or bing) using python (and preferably BS4)?

I've tried using the inspect element on google chrome to find the image links and I see the following:
<img class="mimg rms_img" style="color: rgb(192, 54, 11);" height="185" width="297" alt="Image result for pizza" id="emb48403F0A" src="https://th.bing.com/th/id/OIP.uEcdCNY9nFhqWqbz4B0mFQHaEo?w=297&h=185&c=7&o=5&dpr=1.5&pid=1.7" data-thhnrepbd="1" data-bm="186">
When I try to search for this element and others like it using: soup.findAll("img",{"class": "mimg rms_img"}) I get nothing.
Am I doing something wrong or is this just not the best way to go about it?
To extract both the thumbnail and the original-size image URLs, you need to:
Find all <script> tags.
Match and extract image URLs via regex.
Iterate over found matches and decode them.
Code and full example in the online IDE (it's straightforward, try to read it slowly):
import requests, lxml, re, json
from bs4 import BeautifulSoup

headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    "q": "pexels cat",
    "tbm": "isch",
    "hl": "en",
    "ijn": "0",
}

html = requests.get("https://www.google.com/search", params=params, headers=headers)
soup = BeautifulSoup(html.text, 'lxml')

def get_images_data():
    # these steps could be refactored into something more compact
    all_script_tags = soup.select('script')

    # https://regex101.com/r/48UZhY/4
    matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

    # https://kodlogs.com/34776/json-decoder-jsondecodeerror-expecting-property-name-enclosed-in-double-quotes
    # if you try to json.loads() without json.dumps() it will throw an error:
    # "Expecting property name enclosed in double quotes"
    matched_images_data_fix = json.dumps(matched_images_data)
    matched_images_data_json = json.loads(matched_images_data_fix)

    # https://regex101.com/r/pdZOnW/3
    matched_google_image_data = re.findall(r'\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",', matched_images_data_json)

    # https://regex101.com/r/NnRg27/1
    matched_google_images_thumbnails = ', '.join(
        re.findall(r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]',
                   str(matched_google_image_data))).split(', ')

    print('Google Image Thumbnails:')  # in order
    for fixed_google_image_thumbnail in matched_google_images_thumbnails:
        # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
        google_image_thumbnail_not_fixed = bytes(fixed_google_image_thumbnail, 'ascii').decode('unicode-escape')

        # after the first decoding, Unicode characters are still present. After the second pass, they are decoded.
        google_image_thumbnail = bytes(google_image_thumbnail_not_fixed, 'ascii').decode('unicode-escape')
        print(google_image_thumbnail)

    # removing previously matched thumbnails for easier full resolution image matches.
    removed_matched_google_images_thumbnails = re.sub(
        r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]', '', str(matched_google_image_data))

    # https://regex101.com/r/fXjfb1/4
    # https://stackoverflow.com/a/19821774/15164646
    matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
                                                       removed_matched_google_images_thumbnails)

    print('\nGoogle Full Resolution Images:')  # in order
    for fixed_full_res_image in matched_google_full_resolution_images:
        # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
        original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
        original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')
        print(original_size_img)

get_images_data()

--------------
'''
Google Image Thumbnails:
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSb48h3zks_bf6y7HnZGyGPn3s2TAHKKm_7kzxufi5nzbouJcQderHqoEoOZ4SpOuPDjfw&usqp=CAU
...

Google Full Resolution Images:
https://images.pexels.com/photos/1170986/pexels-photo-1170986.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
...
'''
Alternatively, you can do it by using the Google Images API from SerpApi. It's a paid API with a free plan.
Essentially, one of the main differences is that you don't need to dig into the source code of the page and extract data via regex; all that needs to be done is to iterate over the structured JSON and grab what you want. Check out the playground.
Code to integrate:
import os, json  # json for pretty output
from serpapi import GoogleSearch

def get_google_images():
    params = {
        "api_key": os.getenv("API_KEY"),
        "engine": "google",
        "q": "pexels cat",
        "tbm": "isch"
    }

    search = GoogleSearch(params)
    results = search.get_dict()

    print(json.dumps(results['images_results'], indent=2, ensure_ascii=False))

get_google_images()

----------
'''
...
{
  "position": 60, # img number
  "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRt-tXSZMBNLLX8MhavbBNkKmjJ7wNXxtdr5Q&usqp=CAU",
  "source": "pexels.com",
  "title": "1,000+ Best Cats Videos · 100% Free Download · Pexels Stock Videos",
  "link": "https://www.pexels.com/search/videos/cats/",
  "original": "https://images.pexels.com/videos/855282/free-video-855282.jpg?auto=compress&cs=tinysrgb&dpr=1&w=500",
  "is_product": false
}
...
'''
P.S - I wrote a more in-depth blog post with GIFs and screenshots about how to scrape Google Images.
Disclaimer, I work for SerpApi.
Here is how you can do it with another search engine, DuckDuckGo:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

search_query = 'what you want to find'
num_images = 10
driver_location = '/put/location/of/your/driver/here'

# setting up the driver
ser = Service(driver_location)
op = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=ser, options=op)

# searching the query
driver.get(f'https://duckduckgo.com/?q={search_query}&kl=us-en&ia=web')

# going to the Images section
ba = driver.find_element(By.XPATH, "//a[@class='zcm__link js-zci-link js-zci-link--images']")
ba.click()

# getting the image URLs
for result in driver.find_elements(By.CSS_SELECTOR, '.js-images-link')[0:0+num_images]:
    imageURL = result.get_attribute('data-id')
    print(f'{imageURL}\n')

driver.quit()

Using Python to search google and store the websites into variables

I'm looking for a way to search Google with Python and store each resulting website in a list. I'm looking for something like the example code below.
search=input('->')
results=google.search((search),(10))
print results
In this case I want it to search Google for whatever is in the variable "search", with 10 being the number of results I want to store in the variable, and finally to put them on the screen with "print results".
I would appreciate any help or anything similar to what I want. Thanks.
As mentioned above, Google does provide an API for performing searches (https://developers.google.com/custom-search/json-api/v1/overview), and, as mentioned, depending on what you are trying to accomplish it can get quite expensive. Another option is to scrape the Google results page. Below is an example I created using Beautiful Soup (https://www.crummy.com/software/BeautifulSoup/bs4/doc/#) to scrape the Google results.
import urllib2
import xml.etree.ElementTree
from bs4 import BeautifulSoup  # install using 'pip install beautifulsoup4'

'''
Since spaces will not work in url parameters, the spaces have to be converted into '+'
ex) "example text" -> "example+text"
'''
def spacesToPluses(string):
    words = string.split(" ")
    convertedString = ""
    for i in range(0, len(words)):
        convertedString += words[i] + "+"
    return convertedString[0:len(convertedString)-1]

'''
Opens the url with the parameter included and reads it as a string
'''
def getRawGoogleResponse(url):
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
    headers = {'User-Agent': user_agent, }  # Required for google to allow url request
    request = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(request)
    rawResponse = response.read()
    return rawResponse

'''
Takes in the raw string representation and converts it into an easier to navigate object (Beautiful Soup)
'''
def getParsedGoogleResponse(url):
    rawResponse = getRawGoogleResponse(url)
    fullPage = BeautifulSoup(rawResponse, 'html.parser')
    return fullPage

'''
Finds all of the urls on a single page
'''
def getGoogleResultsOnPage(fullPage):
    searchResultContainers = fullPage.find_all("h3", {"class": "r"})  # the results are contained in an h3 element with the class 'r'
    pageUrls = []
    for container in searchResultContainers:  # get each link in the container
        fullUrl = container.find('a')['href']
        beginningOfUrl = fullUrl.index('http')
        pageUrls.append(fullUrl[beginningOfUrl:])  # Chops off the extra bits google adds to the url
    return pageUrls

'''
Returns number of pages (max of 10)
'''
def getNumPages(basePage):
    navTable = basePage.find("table", {"id": "nav"})  # The nav table contains the number of pages (up to 10)
    pageNumbers = navTable.find_all("a", {"class": "fl"})
    lastPageNumber = int(pageNumbers[len(pageNumbers)-2].text)
    return lastPageNumber

'''
Loops through pages gathering urls from each page
'''
def getAllGoogleSearchResults(search, numResults):
    baseUrl = "https://www.google.com/search?q=" + spacesToPluses(search)
    basePage = getParsedGoogleResponse(baseUrl)
    numPages = getNumPages(basePage)
    allUrls = []
    for i in range(0, numPages):
        completeUrl = baseUrl + "&start=" + str(i * 10)  # google uses the parameter 'start' to represent the result to start at (10 urls per page)
        page = getParsedGoogleResponse(completeUrl)
        for url in getGoogleResultsOnPage(page):
            allUrls.append(url)
    return allUrls[0:numResults]  # return just the number of results

def main():
    print(getAllGoogleSearchResults("even another test", 1))

main()
The solution works for the first 10 pages (or however many exist, up to 10) of Google results. The URLs are returned as a list of strings. The response is fetched with urllib2, so this example is Python 2; on Python 3 you would use urllib.request or the requests library instead. Hope this helps.
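For reference, here is a minimal Python 3 sketch of the same fetch-and-parse step using requests instead of urllib2 (the selectors above may still need updating, since Google's markup changes over time):
import requests
from bs4 import BeautifulSoup

def getParsedGoogleResponsePy3(url):
    # Python 3 equivalent of getRawGoogleResponse + getParsedGoogleResponse above
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
    response = requests.get(url, headers={'User-Agent': user_agent}, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

page = getParsedGoogleResponsePy3("https://www.google.com/search?q=even+another+test")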
A Google search page returns a maximum of 10 results by default; the num parameter in the params dict is responsible for this:
params = {
    "q": query,   # query
    "hl": "en",   # language
    "gl": "us",   # country of the search, US -> USA
    "start": 0,   # first page of results (increments by 10 per page)
    # "num": 100  # parameter defines the maximum number of results to return.
}
To get more data, you can paginate through all pages with an infinite while loop. Pagination is possible as long as the next-page button exists (determined by the presence of a button selector on the page, in our case the CSS selector .d6cvqb a[id=pnnext]). You then increase the value of params["start"] by 10 to access the next page; if the button is not present, you exit the while loop:
if soup.select_one('.d6cvqb a[id=pnnext]'):
    params["start"] += 10
else:
    break
You also need to pay attention to the fact that most sites, including Google, do not like being scraped, and the request might be blocked if you use requests as-is, because the default user-agent of the requests library is python-requests. An additional step could be to rotate the user-agent, for example switching between PC, mobile, and tablet, as well as between browsers, e.g. Chrome, Firefox, Safari, Edge, and so on, as in the sketch below.
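A minimal sketch of rotating the User-Agent per request (the strings below are just illustrative examples, not an exhaustive list):
import random
import requests

# a small illustrative pool of desktop and mobile user-agent strings
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15",
    "Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Mobile Safari/537.36",
]

headers = {"User-Agent": random.choice(user_agents)}  # pick a different one on each request
html = requests.get("https://www.google.com/search", params={"q": "auto", "hl": "en", "gl": "us"}, headers=headers, timeout=30)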
Check code in online IDE.
from bs4 import BeautifulSoup
import requests, json, lxml

query = input("Input your query: ")  # for example: "auto"

# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
params = {
    "q": query,   # query
    "hl": "en",   # language
    "gl": "us",   # country of the search, US -> USA
    "start": 0,   # first page of results (increments by 10 per page)
    # "num": 100  # parameter defines the maximum number of results to return.
}

# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}

page_num = 0
website_data = []

while True:
    page_num += 1
    print(f"page: {page_num}")

    html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
    soup = BeautifulSoup(html.text, 'lxml')

    for result in soup.select(".tF2Cxc"):
        title = f'Title: {result.select_one("h3").text}'
        link = f'Link: {result.select_one("a")["href"]}'

        website_data.append({
            "title": title,
            "link": link,
        })

    if soup.select_one('.d6cvqb a[id=pnnext]'):
        params["start"] += 10
    else:
        break

print(json.dumps(website_data, indent=2, ensure_ascii=False))
Example output:
[
{
"title": "Title: Show Your Auto",
"link": "Link: http://www.showyourauto.com/vehicles/388/2002-ford-saleen-mustang-s281-extreme-speedster"
},
{
"title": "Title: Global Competition in the Auto Parts Industry: Hearings ...",
"link": "Link: https://books.google.com/books?id=dm7bjDjkrRQC&pg=PA2&lpg=PA2&dq=auto&source=bl&ots=sIf4ELozPN&sig=ACfU3U3xea1-cJl9hiQe8cpac2KLrIF20g&hl=en&sa=X&ved=2ahUKEwjWn7ukv6P7AhU3nGoFHSRxABY4jgIQ6AF6BAgEEAM"
},
{
"title": "Title: Issues relating to the domestic auto industry: hearings ...",
"link": "Link: https://books.google.com/books?id=fHX_MJobx3EC&pg=PA79&lpg=PA79&dq=auto&source=bl&ots=jcrwR-jwck&sig=ACfU3U0p0Wn6f-RU11U8Z0GtqMjTKd44ww&hl=en&sa=X&ved=2ahUKEwjWn7ukv6P7AhU3nGoFHSRxABY4jgIQ6AF6BAgaEAM"
},
# ...
]
You can also use the Google Search Engine Results API from SerpApi. It's a paid API with a free plan.
The difference is that it bypasses blocks (including CAPTCHA) from Google, and there is no need to create and maintain the parser yourself.
Code example:
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import json, os

query = input("Input your query: ")  # for example: "auto"

params = {
    "api_key": os.getenv("API_KEY"),  # serpapi key
    "engine": "google",               # serpapi parser engine
    "q": query,                       # search query
    "num": "100"                      # number of results per page (100 per page in this case)
    # other search parameters: https://serpapi.com/search-api#api-parameters
}

search = GoogleSearch(params)  # where data extraction happens

organic_results_data = []
page_num = 0

while True:
    results = search.get_dict()  # JSON -> Python dictionary

    page_num += 1

    for result in results["organic_results"]:
        organic_results_data.append({
            "page_num": page_num,
            "title": result.get("title"),
            "link": result.get("link"),
            "displayed_link": result.get("displayed_link"),
        })

    if "next_link" in results.get("serpapi_pagination", []):
        search.params_dict.update(dict(parse_qsl(urlsplit(results.get("serpapi_pagination").get("next_link")).query)))
    else:
        break

print(json.dumps(organic_results_data, indent=2, ensure_ascii=False))
Output:
[
  {
    "page_num": 4,
    "title": "San Francisco's JFK Drive to Remain Closed to Cars",
    "link": "https://www.nbcbayarea.com/decision-2022/jfk-drive-san-francisco-election/3073470/",
    "displayed_link": "https://www.nbcbayarea.com › decision-2022 › jfk-driv..."
  },
  {
    "page_num": 4,
    "title": "Self-Driving Cruise Cars Are Expanding to Most of SF, Says ...",
    "link": "https://sfstandard.com/business/self-driving-cruise-cars-are-expanding-to-most-of-sf-says-ceo/",
    "displayed_link": "https://sfstandard.com › business › self-driving-cruise-c..."
  },
  # ...
]

Why does soup only shows half of the chart I'm scraping?

I'm scraping from a google search but I can only get the first row of a two row chart on the right-hand side.
The search query is:
https://www.google.com/search?q=kegerators
I've noticed that using inspect element doesn't really help, as BeautifulSoup seems to extract different code.
The code I have is:
htmltext=br.open(query).read()
soup=BeautifulSoup(htmltext)
search = soup.findAll("div", attrs={ "class" : "_cf" })
print search
Upon looking at the code (basically looking for "b>$" - as I know I should see 8 of those) I only get 4, which happen to be the top row of the chart.
These is the result of the search:
[<div class="_cf" style="overflow:hidden"><span class="_vf" style="height:86px;width:86px"><span class="_uf"></span><img class="_wf" src="http://t3.gstatic.com/shopping?q=tbn:ANd9GcRY5NBoY-anFlJUYExmil81vJG5i1nw6LqVu64lSjw8tSPBUEdh3JaiFix-gfSKMGtE2ZwX8w&usqp=CAc"/></span><div style="height:2.4em;overflow:hidden">EdgeStar Ultra Low Temp F...</div><div><b>$599.00</b></div><div><cite style="white-space:nowrap">Kegerator</cite></div></div>, <div class="_cf" style="overflow:hidden"><span class="_vf" style="height:86px;width:86px"><span class="_uf"></span><img class="_wf" src="http://t3.gstatic.com/shopping?q=tbn:ANd9GcRS4iCsD4EDV37Rg1kZf0nxFK3bYgYaWC-bxMv-ISg4dI8m-COU3ZHCZGs3FdJBK3npkpoE&usqp=CAc"/></span><div style="height:2.4em;overflow:hidden">Kegco K199SS‑2 D...</div><div><b>$539.99</b></div><div><cite style="white-space:nowrap">BeverageFa...</cite></div></div>, <div class="_cf" style="overflow:hidden"><span class="_vf" style="height:86px;width:86px"><span class="_uf"></span><img class="_wf" src="http://t2.gstatic.com/shopping?q=tbn:ANd9GcSkf6-jVZt34pd_6QyqZGre06VxszvFZX70-wUOEDRhEFhorX_Yek0oyr-5jvk8FNpj2KWusQ&usqp=CAc"/></span><div style="height:2.4em;overflow:hidden">EdgeStar Ultra Low Temp F...</div><div><b>$499.00</b></div><div><cite style="white-space:nowrap">Compact Ap...</cite></div></div>, <div class="_cf" style="overflow:hidden"><span class="_vf" style="height:86px;width:86px"><span class="_uf"></span><img class="_wf" src="http://t1.gstatic.com/shopping?q=tbn:ANd9GcTf56EQ6DVbOk02D7cLgVmlurU-2gNrhD6a74MnzQBWg1W290DTYQuj0sSUxQEbxo1XO6pB&usqp=CAc"/></span><div style="height:2.4em;overflow:hidden">FunTime Black Kegge...</div><div><b>$399.99</b></div><div><cite style="white-space:nowrap">Max Tool</cite></div></div>]
Is Google doing something strange here?
The reason why results might differ is that Google displays different results on each request, e.g. sometimes it could get 10 shopping results, sometimes 7 or 4.
Specifying the gl (country, e.g. us) and hl (language, e.g. en) query params can get you the exact result, or close to the exact result, that you see in your browser.
Also, don't forget to specify a user-agent, otherwise, Google will block your requests eventually.
Code and example in the online IDE:
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    "q": "buy coffe",  # intentional grammatical error to display right side shopping results
    "hl": "en",
    "gl": "us"
}

response = requests.get("https://www.google.com/search", headers=headers, params=params)
soup = BeautifulSoup(response.text, 'html.parser')

# scrapes both top and right side shopping results
for result in soup.select('.pla-hovercard-content-ellip'):
    title = result.select_one('.pymv4e').text
    link = result.select_one('.pla-hovercard-content-ellip a.tkXAec')['href']
    ad_link = f"https://www.googleadservices.com/pagead{result.select_one('.pla-hovercard-content-ellip a')['href']}"
    price = result.select_one('.qptdjc').text
    try:
        rating = result.select_one('.Fam1ne.tPhRLe')["aria-label"].replace("Rated ", "").replace(" out of ", "").replace(",", "")
    except:
        rating = None
    try:
        reviews = result.select_one('.GhQXkc').text.replace("(", "").replace(")", "")
    except:
        reviews = None

    source = result.select_one('.zPEcBd.LnPkof').text.strip()

    print(f'{title}\n{link}\n{ad_link}\n{price}\n{rating}\n{reviews}\n{source}\n')
-------------
'''
MUD\WTR | Mushroom Coffee Replacement, 90 servings
https://mudwtr.com/collections/shop/products/90-serving-bag
https://www.googleadservices.com/pagead/aclk?sa=l&ai=DChcSEwj5p8u-2rzyAhV2yJQJHfzhBoUYABAHGgJ5bQ&sig=AOD64_3NGBzLzkTv61K7kSrD2f9AREHH_g&ctype=5&q=&ved=2ahUKEwji7MK-2rzyAhWaaM0KHcnaDDcQ9aACegQIAhBo&adurl=
$125.00
4.85
1k+
mudwtr.com
...
'''
Alternatively, you can do the same thing using Google Inline Shopping API from SerpApi. It's a paid API with a free plan.
The difference is that everything is already extracted, and all that needs to be done is just to iterate over structured JSON.
Code to integrate:
import json, os
from serpapi import GoogleSearch

params = {
    "api_key": os.getenv("API_KEY"),
    "engine": "google",
    "q": "buy coffe",
    "hl": "en",
    "gl": "us",
}

search = GoogleSearch(params)
results = search.get_dict()

for result in results['shopping_results']:
    print(json.dumps(result, indent=2, ensure_ascii=False))

--------
'''
{
  "position": 1,
  "block_position": "right",
  "title": "Maxwell House Original Roast | 48oz",
  "price": "$10.49",
  "extracted_price": 10.49,
  "link": "https://www.google.com/aclk?sa=l&ai=DChcSEwiGn8aT2rzyAhXgyZQJHZHdBJMYABAEGgJ5bQ&ae=2&sig=AOD64_0jBjdUIMeqJvrXYxn4NGcpwCYrJQ&ctype=5&q=&ved=2ahUKEwiOxLmT2rzyAhWiFVkFHWMNAaEQ5bgDegQIAhBa&adurl=",
  "source": "Boxed",
  "rating": 4.6,
  "reviews": 2000,
  "thumbnail": "https://serpapi.com/searches/611e1b2cfdca3e6a1c9335e6/images/e4ae7f31164ec52021f1c04d8be4e4bda2138b1acd12c868052125eb86ead292.png"
}
...
'''
P.S - I wrote a blog post about this topic that you can find here.
Disclaimer, I work for SerpApi.
