Beautiful Soup image scraper problems - python

I get the following traceback:
Traceback (most recent call last):
File "/home/ro/image_scrape_test.py", line 20, in <module>
soup = BeautifulSoup(searched, "lxml")
File "/usr/local/lib/python3.4/dist-packages/bs4/__init__.py", line 176, in __init__
elif len(markup) <= 256:
TypeError: object of type 'NoneType' has no len()
This is my code so far:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import urllib
#searches google images
driver = webdriver.Firefox()
google_images = ("https://www.google.com/search?site=imghp&tbm=isch source=hp&biw=1366&bih=648&q=")
search_term = input("what is your search term")
searched = driver.get("{0}{1}".format(google_images, search_term))
def savepic(url):
    uri = ("/home/ro/image scrape/images/download.jpg")
    if url != "":
        urllib.urlretrieve(url, uri)
soup = BeautifulSoup(searched, "lxml")
soup1 = soup.content
images = soup1.find_all("a")
for image in images:
    savepic(image)
I'm starting out, so I'd appreciate any tips on how I can improve my code.
Thank you!

driver.get() loads a webpage in the browser and returns None, which is why the searched variable ends up holding None.
You probably meant to get the .page_source instead:
soup = BeautifulSoup(driver.page_source, "lxml")
Two additional points here:
you don't actually need BeautifulSoup here - you can locate the desired images with selenium using, for instance, driver.find_elements_by_tag_name()
I have not tested your code, but I think you would need to add additional Explicit Waits to make selenium wait for the page to load
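For illustration, here is a minimal sketch of such an explicit wait, reusing the imports from the question (the search URL and the assumption that the results load into <img> tags are mine, not part of the original answer):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.get("https://www.google.com/search?tbm=isch&q=cats")

# wait up to 10 seconds for at least one <img> element to be present
images = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.TAG_NAME, "img"))
)
for image in images:
    print(image.get_attribute("src"))

driver.quit()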

searched is None. Apparently, the url you are using is invalid.

You can scrape Google Images using only the beautifulsoup and requests libraries; selenium is not required.
For example, if you only want to extract thumbnail images (small resolution), you can pass the "content-type": "image/png" query param (solution found from MendelG), and it will return thumbnail image links.
import requests
from bs4 import BeautifulSoup
params = {
    "q": "batman wallpaper",
    "tbm": "isch",
    "content-type": "image/png",
}
html = requests.get("https://www.google.com/search", params=params)
soup = BeautifulSoup(html.text, 'html.parser')
for img in soup.select("img"):
    print(img["src"])
# https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQAxU74QyJ8jn8Qq0ZK3ur_GkxjICcvmiC30DWnk03DEsi7YUgS8XXksdyybXY&s
# https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRh5Fhah5gT9msG7vhXeQzAziS17Jp1HE_wE5O00113DtE2rJztgvxwRSonAno&s
# ...
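To actually save those thumbnails, as the original question intended, here is a minimal sketch using requests; the images/ folder name and the skipping of data: URIs are my assumptions:
import os
import requests
from bs4 import BeautifulSoup

params = {"q": "batman wallpaper", "tbm": "isch", "content-type": "image/png"}
html = requests.get("https://www.google.com/search", params=params)
soup = BeautifulSoup(html.text, "html.parser")

os.makedirs("images", exist_ok=True)  # example output folder

for index, img in enumerate(soup.select("img")):
    src = img.get("src", "")
    if not src.startswith("http"):
        continue  # skip inline data: URIs and empty src attributes
    with open(f"images/thumbnail_{index}.jpg", "wb") as f:
        f.write(requests.get(src).content)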
To scrape the full-res image URL with requests and beautifulsoup you need to scrape data from the page source code via regex.
Find all <script> tags:
soup.select('script')
Match images data via regex:
matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))
Match desired images (full res size) via regex:
# https://kodlogs.com/34776/json-decoder-jsondecodeerror-expecting-property-name-enclosed-in-double-quotes
# if you try to json.loads() without json.dumps() it will throw an error:
# "Expecting property name enclosed in double quotes"
matched_images_data_fix = json.dumps(matched_images_data)
matched_images_data_json = json.loads(matched_images_data_fix)
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
                                                   matched_images_data_json)
Extract and decode them using bytes() and decode():
for fixed_full_res_image in matched_google_full_resolution_images:
    original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
    original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')
Code and full example in the online IDE that also downloads images to a folder:
import requests, lxml, re, json
from bs4 import BeautifulSoup

headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    "q": "pexels cat",
    "tbm": "isch",
    "hl": "en",
    "ijn": "0",
}

html = requests.get("https://www.google.com/search", params=params, headers=headers)
soup = BeautifulSoup(html.text, 'lxml')

def get_images_data():
    print('\nGoogle Images Metadata:')
    for google_image in soup.select('.isv-r.PNCib.MSM1fd.BUooTd'):
        title = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')['title']
        source = google_image.select_one('.fxgdke').text
        link = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')['href']
        print(f'{title}\n{source}\n{link}\n')

    # these steps could be refactored into something more compact
    all_script_tags = soup.select('script')

    # https://regex101.com/r/48UZhY/4
    matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

    # https://kodlogs.com/34776/json-decoder-jsondecodeerror-expecting-property-name-enclosed-in-double-quotes
    # if you try to json.loads() without json.dumps() it will throw an error:
    # "Expecting property name enclosed in double quotes"
    matched_images_data_fix = json.dumps(matched_images_data)
    matched_images_data_json = json.loads(matched_images_data_fix)

    # https://regex101.com/r/pdZOnW/3
    matched_google_image_data = re.findall(r'\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",', matched_images_data_json)

    # https://regex101.com/r/NnRg27/1
    matched_google_images_thumbnails = ', '.join(
        re.findall(r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]',
                   str(matched_google_image_data))).split(', ')

    print('Google Image Thumbnails:')  # in order
    for fixed_google_image_thumbnail in matched_google_images_thumbnails:
        # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
        google_image_thumbnail_not_fixed = bytes(fixed_google_image_thumbnail, 'ascii').decode('unicode-escape')
        # after the first decoding, Unicode characters are still present; after the second, they are decoded
        google_image_thumbnail = bytes(google_image_thumbnail_not_fixed, 'ascii').decode('unicode-escape')
        print(google_image_thumbnail)

    # removing previously matched thumbnails for easier full resolution image matches
    removed_matched_google_images_thumbnails = re.sub(
        r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]', '', str(matched_google_image_data))

    # https://regex101.com/r/fXjfb1/4
    # https://stackoverflow.com/a/19821774/15164646
    matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
                                                       removed_matched_google_images_thumbnails)

    print('\nDownloading Google Full Resolution Images:')  # in order
    for index, fixed_full_res_image in enumerate(matched_google_full_resolution_images):
        # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
        original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
        original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')
        print(original_size_img)

get_images_data()
-------------
'''
Google Images Metadata:
9,000+ Best Cat Photos · 100% Free Download · Pexels Stock Photos
pexels.com
https://www.pexels.com/search/cat/
...
Google Image Thumbnails:
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR2cZsuRkkLWXOIsl9BZzbeaCcI0qav7nenDvvqi-YSm4nVJZYyljRsJZv6N5vS8hMNU_w&usqp=CAU
...
Full Resolution Images:
https://images.pexels.com/photos/1170986/pexels-photo-1170986.jpeg?cs=srgb&dl=pexels-evg-culture-1170986.jpg&fm=jpg
https://images.pexels.com/photos/3777622/pexels-photo-3777622.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
...
'''
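The intro above mentions that the full example also downloads images to a folder, but the snippet here only prints the URLs. Below is a minimal sketch of that download step, assuming the decoded full-resolution URLs have been collected into a list; the downloaded_images/ folder name is just an example:
import os
import requests

def download_images(image_urls, folder='downloaded_images'):
    os.makedirs(folder, exist_ok=True)
    for index, image_url in enumerate(image_urls):
        try:
            image_bytes = requests.get(image_url, timeout=10).content
            with open(os.path.join(folder, f'image_{index}.jpg'), 'wb') as f:
                f.write(image_bytes)
        except requests.RequestException:
            pass  # some links may be dead or block scripted requests; skip them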
Alternatively, you can achieve the same thing by using Google Images API from SerpApi. It's a paid API with a free plan.
The difference in your case is that you don't have to deal with regex to match and extract the needed data from the source code of the page; instead, you only need to iterate over structured JSON and get what you want, faster.
Code to integrate:
import os, json # json for pretty output
from serpapi import GoogleSearch

def get_google_images():
    params = {
        "api_key": os.getenv("API_KEY"),
        "engine": "google",
        "q": "pexels cat",
        "tbm": "isch"
    }

    search = GoogleSearch(params)
    results = search.get_dict()

    print(json.dumps(results['images_results'], indent=2, ensure_ascii=False))

get_google_images()
---------------
'''
[
  ...
  {
    "position": 100, # img number
    "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRR1FCGhFsr_qZoxPvQBDjVn17e_8bA5PB8mg&usqp=CAU",
    "source": "pexels.com",
    "title": "Close-up of Cat · Free Stock Photo",
    "link": "https://www.pexels.com/photo/close-up-of-cat-320014/",
    "original": "https://images.pexels.com/photos/2612982/pexels-photo-2612982.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500",
    "is_product": false
  }
]
'''
P.S - I wrote a more in-depth blog post about how to scrape Google Images, and how to reduce the chance of being blocked while web scraping search engines.
Disclaimer, I work for SerpApi.

Related

Trouble scraping weather data from Google

I'm writing a program that will scrape wind speed and direction data from Google. I've seen other results online where it works out fine, but for some reason, it's not working out for me. I am specifically interested in scraping the elements with "img" tags. Here is my code:
import requests
import bs4
import geocoder
lat, long = 40.776903698619975, -74.45007646247723
base_url = r"https://www.google.com/search?q="
geoc = geocoder.osm([lat, long], method='reverse').json["raw"]["address"]
search_query = geoc["state"] + " " + geoc["country"] + " wind conditions"
lowest_admin_levels = ("municipality", "town", "city", "county")
level_found = False
for level in lowest_admin_levels:
    try:
        search_query = geoc[level] + " " + search_query
        level_found = True
        break
    except KeyError:
        continue
url = base_url + search_query.replace(" ", "+")
print(url)
page = requests.get(url)
soup = bs4.BeautifulSoup(page.content, 'html.parser')
print(soup.find_all('img'))
The lat/long variables could be any coordinates, those are just examples. soup.find_all('img') returns just one "img" element, when in reality, the page has multiple "img"s containing arrows rotated according to the wind direction, which you can see in this link https://www.google.com/search?q=Morris+Township+New+Jersey+United+States+wind+conditions. Thank you!
As the comment already says, Google loads the images dynamically using JavaScript. The requests library and Beautiful Soup cannot get those JavaScript-loaded images; that's why you need Selenium.
Installation
pip install selenium
pip install webdriver-manager
Solution
import geocoder
# New imports
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
lat, long = 40.776903698619975, -74.45007646247723
BASE_URL = r"https://www.google.com/search?q="
geoc = geocoder.osm([lat, long], method='reverse').json["raw"]["address"]
search_query = geoc["state"] + " " + geoc["country"] + " wind conditions"
lowest_admin_levels = ("municipality", "town", "city", "county")
for level in lowest_admin_levels:
    try:
        search_query = geoc[level] + " " + search_query
        break
    except KeyError:
        continue
url = BASE_URL + search_query.replace(" ", "+")
chrome_options = Options()
# The options make the browser headless, so you don't see it
# comment out those two lines to see whats happening
chrome_options.add_argument("--headless")
chrome_options.add_argument("--window-size=1920x1080")
driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options) # You could specify the path to the chrome driver instead
driver.get(url)
time.sleep(2)
imgs = driver.find_elements_by_tag_name('img') # all the image tags
for img in imgs:
    image_source = img.get_attribute('src') # The src of the img tag
    print(image_source)
When you remove the headless option, you will see what selenium “sees”. Using Selenium, you can also click around on the website and interact with it, as you would as a normal user.
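For example, a small sketch of typing into Google's search box and submitting it, reusing the driver from the code above (the input[name='q'] selector and the extra sleep are my assumptions, not part of the original answer):
# type a new query into Google's search box and submit it
search_box = driver.find_element_by_css_selector("input[name='q']")
search_box.clear()
search_box.send_keys("wind conditions tomorrow")
search_box.submit()
time.sleep(2)  # crude wait; an explicit wait would be more robust
print(driver.title)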
This doesn't require geocoder or selenium. Check out the SelectorGadget Chrome extension to visually grab CSS selectors by clicking on the desired element.
Also, you can get the wind direction from the same element, e.g. class='wob_t' -> aria-label:
<span class="wob_t" style="display:inline;text-align:right" aria-label="8 km/h From northwest Tuesday 10:00">8 km/h</span>
Which is the same as in the <img> element (look at alt):
<img src="//ssl.gstatic.com/m/images/weather/wind_unselected.svg" alt="8 km/h From northwest" style="transform-origin:50% 50%;transform:rotate(408deg);width:16px" aria-hidden="true" data-atf="1" data-frt="0" class="">
Code and full example that scrapes more in the online IDE:
from bs4 import BeautifulSoup
import requests, lxml
headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    "q": "london weather",
    "hl": "en",
    "gl": "us"
}
response = requests.get('https://www.google.com/search', headers=headers, params=params).text
soup = BeautifulSoup(response, 'lxml')
for weather_result in soup.select('.wob_noe .wob_hw'):
    try:
        wind_speed = weather_result.select_one('.wob_t').text
        '''
        extracts the element's aria-label, splits the string by a SPACE, grabs indexes 2 and 3
        ([2:4]), and joins them with a SPACE. Or just use regex instead.
        Example:
        7 mph From northwest Sunday 9:00 AM ---> From northwest
        '''
        wind_direction = ' '.join(weather_result.select_one('.wob_t')['aria-label'].split(' ')[2:4])
        print(f"{wind_speed}\n{wind_direction}\n")
    except:
        pass  # or None instead
----------
'''
8 mph
From northeast
11 mph
From east
9 mph
From northeast
...
'''
Alternatively, you can use Google Direct Answer Box API from SerpApi. It's a paid API with a free plan.
Essentially, you don't have to figure out the extraction part of the process, bypass blocks from Google, or maintain the parser over time; all that really needs to be done is to iterate over structured JSON and use whatever you need from it.
Code to integrate:
from serpapi import GoogleSearch
import os, json
params = {
    "engine": "google",
    "q": "london weather",
    "api_key": os.getenv("API_KEY"),
    "hl": "en",
    "gl": "us",
}
search = GoogleSearch(params)
results = search.get_dict()
forecast = results['answer_box']['forecast']
print(json.dumps(forecast, indent=2))
----------
'''
[
  {
    "day": "Tuesday",
    "weather": "Partly cloudy",
    "temperature": {
      "high": "72",
      "low": "57"
    },
    "thumbnail": "https://ssl.gstatic.com/onebox/weather/48/partly_cloudy.png"
  }
  ...
]
Disclaimer, I work for SerpApi.

Query Google Images with a copyright-free filter using requests/urllib

I am trying to query images with a copyright-free filter. Even though the URL leads to the right settings in my code, for some reason the page read by both urllib and requests is the first few images without the copyright-free and size filters. If anyone can help with this, I would greatly appreciate it.
code:
#%%
import requests
import urllib.request
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
url = 'https://google.com/search?q='
input = 'cat'
#string: tbm=isch --> means image search
#tbs=isz:m --> size medium
#il:cl --> copy right free(i think)
url = url+input+'&tbm=isch&tbs=isz:m%2Cil:cl'
print(url)
html = urlopen(Request(url, headers={'User-Agent': 'Google Chrome'}))
'''with urllib.request.urlopen(url) as response:
    html = response.read()
    print(html)'''
#print(str(r.content))
soup = BeautifulSoup(html.read(),'html.parser')
#print(soup.prettify)
#using soup to find all img tags
results = soup.find_all('img')
str_result = str(results)
print(str_result)
lst_result = str_result.split(',')
#trying to get the very first link for the images with the appropriate settings
link = lst_result[4].split(' ')[4].split('"')[1]
#print(link)
# writing into the appropriate testing file, to be changed
file = open('.img1.png','wb')
get_img = requests.get(link)
file.write(get_img.content)
file.close()
import requests
import re, json

extentions = ['jpg', 'jpeg', 'png', 'gif', 'svg']

# determine the image extension (not guaranteed, some links lack the extension)
def extention(url):
    # or use the "imghdr" package to determine the extension
    for ext in extentions:
        if url.endswith(f'.{ext}'):
            return ext
    return '.UNKNOWN'

URL = 'https://google.com/search'
params = {
    'q': 'cat', # search term
    'tbm': 'isch',
    'tbs': 'isz:m,il:cl'
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.3'
}

r = requests.get(URL, params=params, headers=headers)
html = r.text

matches = re.findall(r'data:\[(?!\])(.*?)],\ sideChannel', html)  # the data lives in a script, not in <img> elements (initially)
data = json.loads(f'[{matches[1]}]')

images = []
for image in data[31][0][12][2]:  # the data structure may change some day, but it is consistent between requests as of now
    if type(image) is list:
        try:
            images.append(image[1][3][0])
        except:
            pass

images = list(dict.fromkeys(images))  # remove duplicate links

# retrieve and save the first image's data
print(images[0])
imgdata = requests.get(images[0], headers=headers).content
with open(f'img.{extention(images[0])}', 'wb') as file:
    file.write(imgdata)
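As the comment above suggests, the imghdr package can guess the type from the downloaded bytes instead of the URL. A small sketch of that idea, reusing images[0] and headers from the snippet above (note that imghdr is deprecated since Python 3.11 and removed in 3.13):
import imghdr  # deprecated since Python 3.11, removed in 3.13

def extension_from_bytes(image_bytes):
    # guess the image type from the file signature rather than the URL
    detected = imghdr.what(None, h=image_bytes)  # e.g. 'jpeg', 'png', or None
    return detected or 'UNKNOWN'

imgdata = requests.get(images[0], headers=headers).content
with open(f'img.{extension_from_bytes(imgdata)}', 'wb') as file:
    file.write(imgdata)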

Beautiful soup returns empty array

I'm using Beautiful Soup to find the first hit from a Google search.
Looking for "Stack Overflow", it should find https://www.stackoverflow.com
The code is mainly taken from here. However, it suddenly stopped working, with results[0] being index out of range.
print results[0] IndexError: list index out of range
I suspect it's a cache problem as it was working fine and then stopped without changing the code. I've also rebooted and cleared the cache but still no results.
#!/usr/bin/python
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import requests
import webbrowser # for webbrowser, duh!
import re

#------------------------------------------------
def write_it(s, f):
    # w for over write
    file = open(f, "w")
    file.write(s)
    file.close()

#------------------------------------------------
def URL_encode_space(s):
    return re.sub(r"\s", "%20", s)

#------------------------------------------------
def URL_decode_space(s):
    return re.sub(r"%20", " ", s)

#------------------------------------------------
urlBase = "https://google.com"
searchRequest = "Stack Overflow"
print searchRequest

searchRequest = URL_encode_space(searchRequest)

# String literal for HTML quote
q = "%22" # is a "
numOfResults = 10

myURL = urlBase + "/search?q=" + q + searchRequest + q + "&num={" + str(numOfResults) + "}"
page = requests.get(myURL)

soup = BeautifulSoup(page.text, "html.parser")
links = soup.findAll("a")

results = []
for link in links:
    link_href = link.get('href')
    if "url?q=" in link_href and not "webcache" in link_href:
        print (link.get('href').split("?q=")[1].split("&sa=U")[0])
        results.append(link.get('href').split("?q=")[1].split("&sa=U")[0])

print results[0]

# open web browser?
webbrowser.open(myURL)
I can obviously check the 'len(results)' to remove the error but that doesn't explain why it no longer works.
As people said above, it isn't clear what exactly caused the problem.
Make sure you're using a user-agent.
I took this code from my other answer (scraping headings, summary, and links from google search results).
Code and full example:
from bs4 import BeautifulSoup
import requests
import json

headers = {
    'User-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

html = requests.get('https://www.google.com/search?q=java&oq=java',
                    headers=headers).text
soup = BeautifulSoup(html, 'lxml')

summary = []

for container in soup.findAll('div', class_='tF2Cxc'):
    heading = container.find('h3', class_='LC20lb DKV0Md').text
    article_summary = container.find('span', class_='aCOpRe').text
    link = container.find('a')['href']

    summary.append({
        'Heading': heading,
        'Article Summary': article_summary,
        'Link': link,
    })

print(json.dumps(summary, indent=2, ensure_ascii=False))
Alternatively, you can use Google Organic Results API from SerpApi to get these results.
It's a paid API with a free trial.
Part of JSON:
{
  "position": 1,
  "title": "Java | Oracle",
  "link": "https://www.java.com/",
  "displayed_link": "https://www.java.com",
  "snippet": "Java Download. » What is Java? » Need Help? » Uninstall. About Java. Go Java Java Training Java + Greenfoot Oracle Code One Oracle Academy for ..."
}
Code to integrate:
import os
from serpapi import GoogleSearch

params = {
    "engine": "google",
    "q": "stackoverflow",
    "api_key": os.getenv("API_KEY"),
}

search = GoogleSearch(params)
results = search.get_dict()

for result in results["organic_results"]:
    print(f"Link: {result['link']}")
Output:
Link: https://stackoverflow.com/
Link: https://en.wikipedia.org/wiki/Stack_Overflow
Link: https://stackoverflow.blog/
Link: https://stackoverflow.blog/podcast/
Link: https://www.linkedin.com/company/stack-overflow
Link: https://www.crunchbase.com/organization/stack-overflow
Disclaimer, I work for SerpApi.

Trying to search for images using Google Search, error 400

I keep getting this error: urllib.error.HTTPError: HTTP Error 400: Bad Request
I believe it may have something to do with the links, since when I put them in (and replace the {}), I receive the same error, but I don't know which links are correct.
(Python 3.6, Anaconda)
import os
import urllib.request as ulib
from bs4 import BeautifulSoup as Soup
import json
url_a = 'https://www.google.com/search?ei=1m7NWePfFYaGmQG51q7IBg&hl=en&q={}'
url_b = '\&tbm=isch&ved=0ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ&start={}'
url_c = '\&yv=2&vet=10ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ.1m7NWePfFYaGmQG51q7IBg'
url_d = '\.i&ijn=1&asearch=ichunk&async=_id:rg_s,_pms:s'
url_base = ''.join((url_a, url_b, url_c, url_d))
headers = {'User-Agent': 'Chrome/69.0.3497.100'}
def get_links(search_name):
    search_name = search_name.replace(' ', '+')
    url = url_base.format(search_name, 0)
    request = ulib.Request(url, data=None, headers=headers)
    json_string = ulib.urlopen(request).read()
    page = json.loads(json_string)
    new_soup = Soup(page[1][1], 'lxml')
    images = new_soup.find_all('img')
    links = [image['src'] for image in images]
    return links

if __name__ == '__main__':
    search_name = 'Thumbs up'
    links = get_links(search_name)
    for link in links:
        print(link)
I think you have a bunch of params you don't need.
Try this simpler URL for image searching:
https://www.google.com/search?q={KEY_WORD}&tbm=isch
For example:
https://www.google.com/search?q=apples&tbm=isch
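For illustration, a minimal sketch of requesting that simpler URL with urllib, in the style of the question's code (whether Google returns parseable <img> tags to a scripted client is not guaranteed):
import urllib.request as ulib
from bs4 import BeautifulSoup as Soup

search_term = 'apples'
url = f'https://www.google.com/search?q={search_term}&tbm=isch'
request = ulib.Request(url, headers={'User-Agent': 'Chrome/69.0.3497.100'})
html = ulib.urlopen(request).read()

soup = Soup(html, 'lxml')
for image in soup.find_all('img'):
    print(image.get('src'))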
I think the problem is in asearch=ichunk&async=_id:rg_s,_pms:s, which cannot be used with search; if I remove them, it works:
import os
import urllib.request as ulib
from bs4 import BeautifulSoup as Soup
import json
url_a = 'https://www.google.com/search?ei=1m7NWePfFYaGmQG51q7IBg&hl=en&q=a+mouse'
url_b = '\&tbm=isch&ved=0ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ&start={}'
url_c = '\&yv=2&vet=10ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ.1m7NWePfFYaGmQG51q7IBg'
url_d = '\.i&ijn=1'
url_base = ''.join((url_a, url_b, url_c, url_d))
print(url_base)
headers = {'User-Agent': 'Chrome/69.0.3497.100'}
def get_links(search_name):
    search_name = search_name.replace(' ', '+')
    url = url_base.format(search_name, 0)
    request = ulib.Request(url, data=None, headers=headers)
    json_string = ulib.urlopen(request).read()
    print(json_string)
    page = json.loads(json_string)
    new_soup = Soup(page[1][1], 'lxml')
    images = new_soup.find_all('img')
    links = [image['src'] for image in images]
    return links

if __name__ == '__main__':
    search_name = 'Thumbs up'
    links = get_links(search_name)
    for link in links:
        print(link)
I'm not really sure what you were trying to do by parsing JSON data with beautifulsoup, since it can't do that. Instead, you can parse the <script> tags that might contain JSON data via the re module and then iterate over the parsed JSON string.
Have a look at the requests library. You can get easier-to-read code by adding only the needed query parameters (already mentioned by LeKhan9) in, say, a params (dict) variable and then passing it into requests.get(), just like you did with headers, like so:
params = {
    "q": "minecraft lasagna skin",
    "tbm": "isch",
    "ijn": "0", # batch of 100 images
}

requests.get(URL, params=params)
Code and full example in the online IDE that scrapes suggested search results at the top as well (try to read step-by-step, it's pretty straightforward):
import requests, lxml, re, json
from bs4 import BeautifulSoup

headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    "q": "minecraft lasagna skin",
    "tbm": "isch",
    "ijn": "0",
}

html = requests.get("https://www.google.com/search", params=params, headers=headers)
soup = BeautifulSoup(html.text, 'lxml')

print('\nGoogle Images Metadata:')
for google_image in soup.select('.isv-r.PNCib.MSM1fd.BUooTd'):
    title = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')['title']
    source = google_image.select_one('.fxgdke').text
    link = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')['href']
    print(f'{title}\n{source}\n{link}\n')

# these steps could be refactored into something more compact
all_script_tags = soup.select('script')

# https://regex101.com/r/48UZhY/4
matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

# https://kodlogs.com/34776/json-decoder-jsondecodeerror-expecting-property-name-enclosed-in-double-quotes
# if you try to json.loads() without json.dumps() it will throw an error:
# "Expecting property name enclosed in double quotes"
matched_images_data_fix = json.dumps(matched_images_data)
matched_images_data_json = json.loads(matched_images_data_fix)

# https://regex101.com/r/pdZOnW/3
matched_google_image_data = re.findall(r'\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",', matched_images_data_json)

# https://regex101.com/r/NnRg27/1
matched_google_images_thumbnails = ', '.join(
    re.findall(r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]',
               str(matched_google_image_data))).split(', ')

print('Google Image Thumbnails:')  # in order
for fixed_google_image_thumbnail in matched_google_images_thumbnails:
    # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
    google_image_thumbnail_not_fixed = bytes(fixed_google_image_thumbnail, 'ascii').decode('unicode-escape')
    # after the first decoding, Unicode characters are still present; after the second, they are decoded
    google_image_thumbnail = bytes(google_image_thumbnail_not_fixed, 'ascii').decode('unicode-escape')
    print(google_image_thumbnail)

# removing previously matched thumbnails for easier full resolution image matches
removed_matched_google_images_thumbnails = re.sub(
    r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]', '', str(matched_google_image_data))

# https://regex101.com/r/fXjfb1/4
# https://stackoverflow.com/a/19821774/15164646
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
                                                   removed_matched_google_images_thumbnails)

print('\nGoogle Full Resolution Images:')  # in order
for fixed_full_res_image in matched_google_full_resolution_images:
    # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
    original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
    original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')
    print(original_size_img)
----------------
'''
Google Images Metadata:
Lasagna Minecraft Skins | Planet Minecraft Community
planetminecraft.com
https://www.planetminecraft.com/skins/tag/lasagna/
...
Google Image Thumbnails:
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSPttXb_7ClNBirfv2Beh4aOBjlc-7Jw_kY8pZ4DrkbAavZcJEtz8djo_9iqdnatiG6Krw&usqp=CAU
...
Google Full Resolution Images:
https://static.planetminecraft.com/files/resource_media/preview/skinLasagnaman_minecraft_skin-6204972.jpg
...
'''
Alternatively, you can achieve this using Google Images API from SerpApi. It's a paid API with a free plan.
The biggest and most noticeable difference is that you only need to iterate over structured JSON with already parsed data, without having to figure out why something isn't parsing properly. Check out the playground.
Code to integrate:
import os, json # json for pretty output
from serpapi import GoogleSearch

params = {
    "api_key": os.getenv("API_KEY"),
    "engine": "google",
    "q": "minecraft shaders 8k photo",
    "tbm": "isch"
}

search = GoogleSearch(params)
results = search.get_dict()

print(json.dumps(results['suggested_searches'], indent=2, ensure_ascii=False))
print(json.dumps(results['images_results'], indent=2, ensure_ascii=False))
-----------
# same output as above but in JSON format
I wrote a blog post on how to scrape Google Images in a bit more detail.
Disclaimer, I work for SerpApi.

Get author names and URLs for a tag from Google Scholar

I wish to write to a CSV file a list of all authors, with their URLs, who class themselves under a specific tag on Google Scholar. For example, if we were to take 'security', I would want this output:
author          url
Howon Kim       https://scholar.google.pl/citations?user=YUoJP-oAAAAJ&hl=pl
Adrian Perrig   https://scholar.google.pl/citations?user=n-Oret4AAAAJ&hl=pl
...             ...
I have written this code, which prints each author's name:
# -*- coding: utf-8 -*-
import urllib.request
import csv
from bs4 import BeautifulSoup
url = "http://scholar.google.pl/citations?view_op=search_authors&hl=pl&mauthors=label:security"
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, 'lxml')
mydivs = soup.findAll("h3", { "class" : "gsc_1usr_name"})
outputFile = open('sample.csv', 'w', newline='')
outputWriter = csv.writer(outputFile)
for each in mydivs:
    for anchor in each.find_all('a'):
        print(anchor.text)
However, this only does it for the first page. Instead, I would like to go through every page. How can I do this?
I'm not going to write the code for you, but I'll give you an outline of how you can do it.
Look at the bottom of the page. See the next button? Search for it; the containing div has an id of gsc_authors_bottom_pag, which should be easy to find. I'd do this with selenium: find the next button (right arrow), click it, wait for the page to load, scrape, and repeat. Handle the edge cases (running out of pages, etc.).
If the after_author=* bit didn't change in the URL, you could just increment the url start parameter, but unless you want to try to crack that code (unlikely), just click the next button. A rough sketch of that loop is shown below.
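A minimal, untested sketch of the outlined approach (the gsc_authors_bottom_pag id and the gsc_1usr_name class come from this thread; treating the last button in that div as "next" and checking is_enabled() are my assumptions):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.get("http://scholar.google.pl/citations?view_op=search_authors&hl=pl&mauthors=label:security")

while True:
    # wait for the author name links on the current page, then scrape them
    anchors = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "h3.gsc_1usr_name a"))
    )
    for anchor in anchors:
        print(anchor.text, anchor.get_attribute("href"))

    # the "next" arrow lives inside the div with id gsc_authors_bottom_pag
    buttons = driver.find_elements(By.CSS_SELECTOR, "#gsc_authors_bottom_pag button")
    if not buttons or not buttons[-1].is_enabled():
        break  # out of pages
    buttons[-1].click()

driver.quit()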
This page uses <button> instead of <a> for the link to the next/previous page.
The button for the next page has aria-label="Następna" ("Next" in Polish).
There are two buttons to the next page, but you can use either of them.
The button has JavaScript code that redirects to the new page:
window.location=url_to_next_page
but it is plain text, so you can use slicing to get only the url:
import urllib.request
from bs4 import BeautifulSoup
url = "http://scholar.google.pl/citations?view_op=search_authors&hl=pl&mauthors=label:security"
while True:
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page, 'lxml')

    # ... do something on page ...

    # find buttons to next page
    buttons = soup.findAll("button", {"aria-label": "Następna"})

    # exit if no buttons
    if not buttons:
        break

    on_click = buttons[0].get('onclick')
    print('javascript:', on_click)

    # add `domain` and remove `window.location='` and `'` at the end
    url = 'http://scholar.google.pl' + on_click[17:-1]

    # converting some escape codes to chars
    url = url.encode('utf-8').decode('unicode_escape')
    print('url:', url)
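For completeness, the "... do something on page ..." placeholder above could reuse the extraction from the question, e.g. (a sketch; the gsc_1usr_name class comes from the question and may have changed since):
# inside the while loop, in place of "... do something on page ...":
for heading in soup.findAll("h3", {"class": "gsc_1usr_name"}):
    for anchor in heading.find_all('a'):
        # hrefs are relative, e.g. /citations?user=...
        print(anchor.text, 'http://scholar.google.pl' + anchor.get('href', ''))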
BTW: if you speak Polish, you can visit Python Poland or Python: pierwsze kroki on Facebook.
Since furas has already answered how to loop through all pages, this is a complementary answer to his. The script below scrapes much more than your question asks and writes the results to a .csv file.
Code and example in online IDE:
from bs4 import BeautifulSoup
import requests, lxml, os, csv

headers = {
    'User-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

def get_profiles_to_csv():
    html = requests.get('http://scholar.google.pl/citations?view_op=search_authors&hl=pl&mauthors=label:security', headers=headers).text
    soup = BeautifulSoup(html, 'lxml')

    # creating the CSV file
    with open('awesome_file.csv', mode='w') as csv_file:
        # defining column names
        fieldnames = ['Author', 'URL']
        # defining the .csv writer
        # https://docs.python.org/3/library/csv.html#csv.DictWriter
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        # writing (creating) columns
        writer.writeheader()

        # collecting scraped data
        author_data = []

        # selecting the container where all the data is located
        for result in soup.select('.gs_ai_chpr'):
            name = result.select_one('.gs_ai_name a').text
            link = result.select_one('.gs_ai_name a')['href']

            # https://stackoverflow.com/a/6633693/15164646
            # id = link
            # id_identifer = 'user='
            # before_keyword, keyword, after_keyword = id.partition(id_identifer)
            # author_id = after_keyword
            # affiliations = result.select_one('.gs_ai_aff').text
            # email = result.select_one('.gs_ai_eml').text
            # try:
            #     interests = result.select_one('.gs_ai_one_int').text
            # except:
            #     interests = None
            # "Cited by 107390" = getting the text string -> splitting by a space -> ['Cited', 'by', '107390'] and taking the [2] index, which is the number
            # cited_by = result.select_one('.gs_ai_cby').text.split(' ')[2]

            # because we use a csv.DictWriter(), we convert to the required format;
            # dict() keys should be exactly the same as fieldnames, otherwise it will throw an error
            author_data.append({
                'Author': name,
                'URL': f'https://scholar.google.com{link}',
            })

        # iterating over the collected author data and writing it to the .csv
        for data in author_data:
            writer.writerow(data)
            # print(f'{name}\nhttps://scholar.google.com{link}\n{author_id}\n{affiliations}\n{email}\n{interests}\n{cited_by}\n')

get_profiles_to_csv()
# output from created csv:
'''
Author,URL
Johnson Thomas,https://scholar.google.com/citations?hl=pl&user=eKLr0EgAAAAJ
Martin Abadi,https://scholar.google.com/citations?hl=pl&user=vWTI60AAAAAJ
Adrian Perrig,https://scholar.google.com/citations?hl=pl&user=n-Oret4AAAAJ
Vern Paxson,https://scholar.google.com/citations?hl=pl&user=HvwPRJ0AAAAJ
Frans Kaashoek,https://scholar.google.com/citations?hl=pl&user=YCoLskoAAAAJ
Mihir Bellare,https://scholar.google.com/citations?hl=pl&user=2pW1g5IAAAAJ
Matei Zaharia,https://scholar.google.com/citations?hl=pl&user=I1EvjZsAAAAJ
John A. Clark,https://scholar.google.com/citations?hl=pl&user=xu3n6owAAAAJ
Helen J. Wang,https://scholar.google.com/citations?hl=pl&user=qhu-DxwAAAAJ
Zhu Han,https://scholar.google.com/citations?hl=pl&user=ty7wIXoAAAAJ
'''
Alternatively, you can do the same thing using Google Scholar Profiles API from SerpApi. It's a paid API with a free plan.
Code to integrate:
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import csv, os

def get_profiles_to_csv():
    with open('awesome_serpapi_file_pagination.csv', mode='w') as csv_file:
        fieldnames = ['Author', 'URL']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()

        params = {
            "api_key": os.getenv("API_KEY"),
            "engine": "google_scholar_profiles",
            "mauthors": "label:security"
        }
        search = GoogleSearch(params)

        while True:
            results = search.get_dict()

            try:
                for result in results['profiles']:
                    name = result['name']
                    link = result['link']
                    writer.writerow({'Author': name, 'URL': link})
            except:
                print('Done')
                break

            # stop when there is no next page to paginate to
            if 'pagination' not in results or 'next' not in results['pagination']:
                break

            search.params_dict.update(dict(parse_qsl(urlsplit(results["pagination"]["next"]).query)))

get_profiles_to_csv()
# part of the output from created csv:
'''
Author,URL
Johnson Thomas,https://scholar.google.com/citations?hl=en&user=eKLr0EgAAAAJ
Martin Abadi,https://scholar.google.com/citations?hl=en&user=vWTI60AAAAAJ
Adrian Perrig,https://scholar.google.com/citations?hl=en&user=n-Oret4AAAAJ
Vern Paxson,https://scholar.google.com/citations?hl=en&user=HvwPRJ0AAAAJ
Frans Kaashoek,https://scholar.google.com/citations?hl=en&user=YCoLskoAAAAJ
Mihir Bellare,https://scholar.google.com/citations?hl=en&user=2pW1g5IAAAAJ
Matei Zaharia,https://scholar.google.com/citations?hl=en&user=I1EvjZsAAAAJ
John A. Clark,https://scholar.google.com/citations?hl=en&user=xu3n6owAAAAJ
Helen J. Wang,https://scholar.google.com/citations?hl=en&user=qhu-DxwAAAAJ
Zhu Han,https://scholar.google.com/citations?hl=en&user=ty7wIXoAAAAJ
'''
Disclaimer, I work for SerpApi.
