Beautiful soup returns empty array - python

I'm using beautiful soup to find the first hit from a google search.
Looking for "Stack Overflow" it should find https://www.stackoverflow.com
The code is mainly taken from here However, it suddenly stopped working with results[0] being index out of range.
print results[0] IndexError: list index out of range
I suspect it's a cache problem as it was working fine and then stopped without changing the code. I've also rebooted and cleared the cache but still no results.
#!/usr/bin/python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import webbrowser # for webrowser, duh!
import re
#------------------------------------------------
def write_it(s, f):
# w for over write
file = open(f, "w")
file.write(s)
file.close()
#------------------------------------------------
def URL_encode_space(s):
return re.sub(r"\s", "%20", s)
#------------------------------------------------
def URL_decode_space(s):
return re.sub(r"%20", " ", s)
#------------------------------------------------
urlBase = "https://google.com"
searchRequest = "Stack Overflow"
print searchRequest
searchRequest = URL_encode_space(searchRequest)
# String literal for HTML quote
q = "%22" # is a "
numOfResults = 10
myURL = urlBase + "/search?q=" + q + searchRequest + q + "&num={" + str(numOfResults) + "}"
page = requests.get(myURL)
soup = BeautifulSoup(page.text, "html.parser")
links = soup.findAll("a")
results = []
for link in links:
link_href = link.get('href')
if "url?q=" in link_href and not "webcache" in link_href:
print (link.get('href').split("?q=")[1].split("&sa=U")[0])
results.append(link.get('href').split("?q=")[1].split("&sa=U")[0])
print results[0]
# open web browser?
webbrowser.open(myURL)
I can obviously check the 'len(results)' to remove the error but that doesn't explain why it no longer works.

Just like people said above it doesn't clear what could cause the problem.
Make sure you're using user agent.
I took this code from my other answer (scraping headings, summary, and links from google search results).
Code and full example:
from bs4 import BeautifulSoup
import requests
import json
headers = {
'User-agent':
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
html = requests.get('https://www.google.com/search?q=java&oq=java',
headers=headers).text
soup = BeautifulSoup(html, 'lxml')
summary = []
for container in soup.findAll('div', class_='tF2Cxc'):
heading = container.find('h3', class_='LC20lb DKV0Md').text
article_summary = container.find('span', class_='aCOpRe').text
link = container.find('a')['href']
summary.append({
'Heading': heading,
'Article Summary': article_summary,
'Link': link,
})
print(json.dumps(summary, indent=2, ensure_ascii=False))
Alternatively, you can use Google Organic Results API from SerpApi to get these results.
It's a paid API with a free trial.
Part of JSON:
{
"position": 1,
"title": "Java | Oracle",
"link": "https://www.java.com/",
"displayed_link": "https://www.java.com",
"snippet": "Java Download. » What is Java? » Need Help? » Uninstall. About Java. Go Java Java Training Java + Greenfoot Oracle Code One Oracle Academy for ..."
}
Code to integrate:
import os
from serpapi import GoogleSearch
params = {
"engine": "google",
"q": "stackoverflow",
"api_key": os.getenv("API_KEY"),
}
search = GoogleSearch(params)
results = search.get_dict()
for result in results["organic_results"]:
print(f"Link: {result['link']}")
Output:
Link: https://stackoverflow.com/
Link: https://en.wikipedia.org/wiki/Stack_Overflow
Link: https://stackoverflow.blog/
Link: https://stackoverflow.blog/podcast/
Link: https://www.linkedin.com/company/stack-overflow
Link: https://www.crunchbase.com/organization/stack-overflow
Disclaimer, I work for SerpApi.

Related

Trouble scraping weather data from Google

I'm writing a program that will scrape wind speed and direction data from Google. I've seen other results online where it works out fine, but for some reason, it's not working out for me. I am specifically interested in scraping the elements with "img" tags. Here is my code:
import requests
import bs4
import geocoder
lat, long = 40.776903698619975, -74.45007646247723
base_url = r"https://www.google.com/search?q="
geoc = geocoder.osm([lat, long], method='reverse').json["raw"]["address"]
search_query = geoc["state"] + " " + geoc["country"] + " wind conditions"
lowest_admin_levels = ("municipality", "town", "city", "county")
level_found = False
for level in lowest_admin_levels:
try:
search_query = geoc[level] + " " + search_query
level_found = True
break
except KeyError:
continue
url = base_url + search_query.replace(" ", "+")
print(url)
page = requests.get(url)
soup = bs4.BeautifulSoup(page.content, 'html.parser')
print(soup.find_all('img'))
The lat/long variables could be any coordinates, those are just examples. soup.find_all('img') returns just one "img" element, when in reality, the page has multiple "img"s containing arrows rotated according to the wind direction, which you can see in this link https://www.google.com/search?q=Morris+Township+New+Jersey+United+States+wind+conditions. Thank you!
As the comment already says, Google loads the images dynamically using JavaScript. The requests library and Beautiful soup are not able to get those JavaScript loaded images. That's why you need Selenium, to get those images.
Installation
pip install selenium
pip install webdriver-manager
Solution
import geocoder
# New imports
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
lat, long = 40.776903698619975, -74.45007646247723
BASE_URL = r"https://www.google.com/search?q="
geoc = geocoder.osm([lat, long], method='reverse').json["raw"]["address"]
search_query = geoc["state"] + " " + geoc["country"] + " wind conditions"
lowest_admin_levels = ("municipality", "town", "city", "county")
for level in lowest_admin_levels:
try:
search_query = geoc[level] + " " + search_query
break
except KeyError:
continue
url = BASE_URL + search_query.replace(" ", "+")
chrome_options = Options()
# The options make the browser headless, so you don't see it
# comment out those two lines to see whats happening
chrome_options.add_argument("--headless")
chrome_options.add_argument("--window-size=1920x1080")
driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options) # You could specify the path to the chrome driver instead
driver.get(url)
time.sleep(2)
imgs = driver.find_elements_by_tag_name('img') # all the image tags
for img in imgs:
image_source = img.get_attribute('src') # The src of the img tag
print(image_source)
When you remove the headless option, you will see what selenium “sees”. Using Selenium, you can also click around on the website and interact with it, as you would as a normal user.
It's doesn't require geocoder nor selenium. Check out SelectorGadget Chrome extension to visually grab CSS selectors by clicking on the desired element.
Also, you can get wind direction from the same element, e.g. class='wob_t' -> area-label:
<span class="wob_t" style="display:inline;text-align:right" aria-label="8 km/h From northwest Tuesday 10:00">8 km/h</span>
Which is the same as in the <img> element (look at alt):
<img src="//ssl.gstatic.com/m/images/weather/wind_unselected.svg" alt="8 km/h From northwest" style="transform-origin:50% 50%;transform:rotate(408deg);width:16px" aria-hidden="true" data-atf="1" data-frt="0" class="">
Code and full example that scrapes more in the online IDE:
from bs4 import BeautifulSoup
import requests, lxml
headers = {
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
params = {
"q": "london weather",
"hl": "en",
"gl": "us"
}
response = requests.get('https://www.google.com/search', headers=headers, params=params).text
soup = BeautifulSoup(response, 'lxml')
for weather_result in soup.select('.wob_noe .wob_hw'):
try:
wind_speed = weather_result.select_one('.wob_t').text
'''
extracts elements, splits the string by a SPACE, and grabs 2nd and 4th index,
and then joins via SPACE. Or just use regex instead.
Example:
7 mph From northwest Sunday 9:00 AM ---> From northeast
'''
wind_direction = ' '.join(weather_result.select_one('.wob_t')['aria-label'].split(' ')[2:4])
print(f"{wind_speed}\n{wind_direction}\n")
except:
pass # or None instead
----------
'''
8 mph
From northeast
11 mph
From east
9 mph
From northeast
...
'''
Alternatively, you can use Google Direct Answer Box API from SerpApi. It's a paid API with a free plan.
Essentially, you don't need to figure out extracting part of the process and all that really needs to be done is just to iterate over structured JSON and use whatever you need from it (apart from bypass blocks from Google or maintain the parser over time).
Code to integrate:
from serpapi import GoogleSearch
import os, json
params = {
"engine": "google",
"q": "london weather",
"api_key": os.getenv("API_KEY"),
"hl": "en",
"gl": "us",
}
search = GoogleSearch(params)
results = search.get_dict()
forecast = results['answer_box']['forecast']
print(json.dumps(forecast, indent=2))
----------
'''
[
{
"day": "Tuesday",
"weather": "Partly cloudy",
"temperature": {
"high": "72",
"low": "57"
},
"thumbnail": "https://ssl.gstatic.com/onebox/weather/48/partly_cloudy.png"
}
...
]
'''
Disclaimer, I work for SerpApi.

Tag of Google news title for beautiful soup

I am trying to extract the result of a search from Google news (vaccine for example) and provide some sentiment analysis based on the headline collected.
So far, I can't seem to find the correct tag to collect the headlines.
Here is my code:
from textblob import TextBlob
import requests
from bs4 import BeautifulSoup
class Analysis:
def __init__(self, term):
self.term = term
self.subjectivity = 0
self.sentiment = 0
self.url = 'https://www.google.com/search?q={0}&source=lnms&tbm=nws'.format(self.term)
def run (self):
response = requests.get(self.url)
print(response.text)
soup = BeautifulSoup(response.text, 'html.parser')
headline_results = soup.find_all('div', class_="phYMDf nDgy9d")
for h in headline_results:
blob = TextBlob(h.get_text())
self.sentiment += blob.sentiment.polarity / len(headline_results)
self.subjectivity += blob.sentiment.subjectivity / len(headline_results)
a = Analysis('Vaccine')
a.run()
print(a.term, 'Subjectivity: ', a.subjectivity, 'Sentiment: ' , a.sentiment)
The result are always 0 for the sentiment and 0 for the subjectivity. I feel like the issue is with the class_="phYMDf nDgy9d".
If you browse into that link, you are going to see the finished state of page but requests.get does not exeute or load any more data other than the page you request. Luckily there is some data and you can scrape that. I suggest you to use html prettifier services like codebeautify to get better understanding about what the page structure is.
Also if you see classes like phYMDf nDgy9d be sure to avoid finding with them. They are minified versions of classes so at any moment if they change a part of the CSS code, the class you are looking for is going to get a new name.
What I did is probably overkill but, I managed to dig down to scrape specific parts and your code works now.
When you look at the prettier version of requested html file, necessary contents are in a div with an id of main shown above. Then it's children are starting with a div element Google Search, continuing with a style element and after one empty div element, there are post div elements. The last two elements in that children list are footer and script elements. We can cut these off with [3:-2] and then under that tree we have pure data (pretty much). If you check the remaining part of the code after the posts variable, you can understand it I think.
Here is the code:
from textblob import TextBlob
import requests, re
from bs4 import BeautifulSoup
from pprint import pprint
class Analysis:
def __init__(self, term):
self.term = term
self.subjectivity = 0
self.sentiment = 0
self.url = 'https://www.google.com/search?q={0}&source=lnms&tbm=nws'.format(self.term)
def run (self):
response = requests.get(self.url)
#print(response.text)
soup = BeautifulSoup(response.text, 'html.parser')
mainDiv = soup.find("div", {"id": "main"})
posts = [i for i in mainDiv.children][3:-2]
news = []
for post in posts:
reg = re.compile(r"^/url.*")
cursor = post.findAll("a", {"href": reg})
postData = {}
postData["headline"] = cursor[0].find("div").get_text()
postData["source"] = cursor[0].findAll("div")[1].get_text()
postData["timeAgo"] = cursor[1].next_sibling.find("span").get_text()
postData["description"] = cursor[1].next_sibling.find("span").parent.get_text().split("· ")[1]
news.append(postData)
pprint(news)
for h in news:
blob = TextBlob(h["headline"] + " "+ h["description"])
self.sentiment += blob.sentiment.polarity / len(news)
self.subjectivity += blob.sentiment.subjectivity / len(news)
a = Analysis('Vaccine')
a.run()
print(a.term, 'Subjectivity: ', a.subjectivity, 'Sentiment: ' , a.sentiment)
A few outputs:
[{'description': 'It comes after US health officials said last week they had '
'started a trial to evaluate a possible vaccine in Seattle. '
'The Chinese effort began on...',
'headline': 'China embarks on clinical trial for virus vaccine',
'source': 'The Star Online',
'timeAgo': '5 saat önce'},
{'description': 'Hanneke Schuitemaker, who is leading a team working on a '
'Covid-19 vaccine, tells of the latest developments and what '
'needs to be done now.',
'headline': 'Vaccine scientist: ‘Everything is so new in dealing with this '
'coronavirus’',
'source': 'The Guardian',
'timeAgo': '20 saat önce'},
.
.
.
Vaccine Subjectivity: 0.34522727272727277 Sentiment: 0.14404040404040402
[{'description': '10 Cool Tech Gadgets To Survive Working From Home. From '
'Wi-Fi and cell phone signal boosters, to noise-cancelling '
'headphones and gadgets...',
'headline': '10 Cool Tech Gadgets To Survive Working From Home',
'source': 'CRN',
'timeAgo': '2 gün önce'},
{'description': 'Over the past few years, smart home products have dominated '
'the gadget space, with goods ranging from innovative updates '
'to the items we...',
'headline': '6 Smart Home Gadgets That Are Actually Worth Owning',
'source': 'Entrepreneur',
'timeAgo': '2 hafta önce'},
.
.
.
Home Gadgets Subjectivity: 0.48007305194805205 Sentiment: 0.3114683441558441
I used headlines and description data to do the operations but you can play with that if you want. You have the data now :)
use this
headline_results = soup.find_all('div', {'class' : 'BNeawe vvjwJb AP7Wnd'})
you already printed the response.text, if you want to find the specific data please search from the response.text result
Try to use select() instead. CSS selectors are more flexible. CSS selectors reference.
Have a look at SelectorGadget Chrome extension to grab CSS selectors by clicking on the desired element in your browser.
If you want to get all titles and so on, then you are looking for this container:
soup.select('.dbsr')
Make sure to pass user-agent, because Google might block your requests eventually and you'll receive a different HTML thus empty output. Check what is your user-agent
Pass user-agent:
headers = {
"User-agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
requests.get("YOUR_URL", headers=headers)
I'm not sure what exactly are you trying to do but a solution from Guven Degirmenci is a bit overkill as he mentioned, with slicing, regex, doing something in div#main. It's much simpler.
Code and example in the online IDE:
from textblob import TextBlob
import requests
from bs4 import BeautifulSoup
headers = {
"User-agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
class Analysis:
def __init__(self, term):
self.term = term
self.subjectivity = 0
self.sentiment = 0
self.url = f"https://www.google.com/search?q={self.term}&tbm=nws"
def run (self):
response = requests.get(self.url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
news_data = []
for result in soup.select('.dbsr'):
title = result.select_one('.nDgy9d').text
link = result.a['href']
source = result.select_one('.WF4CUc').text
snippet = result.select_one('.Y3v8qd').text
date_published = result.select_one('.WG9SHc span').text
news_data.append({
"title": title,
"link": link,
"source": source,
"snippet": snippet,
"date_published": date_published
})
for h in news_data:
blob = TextBlob(f"{h['title']} {h['snippet']}")
self.sentiment += blob.sentiment.polarity / len(news_data)
self.subjectivity += blob.sentiment.subjectivity / len(news_data)
a = Analysis("Lasagna")
a.run()
print(a.term, "Subjectivity: ", a.subjectivity, "Sentiment: " , a.sentiment)
# Vaccine Subjectivity: 0.3255952380952381 Sentiment: 0.05113636363636363
# Lasagna Subjectivity: 0.36556818181818185 Sentiment: 0.25386093073593075
Alternatively, you can achieve the same thing by using Google News Results API from SerpApi. It's a paid API with a free plan.
The difference in your case is that you don't have to maintain the parser, figure out how to parse certain elements or figuring out why something isn't working as it should, and understand how to bypass blocks from Google. All that needs to be done is to iterate over structured JSON and get what you want fast.
Code integrated with your example:
from textblob import TextBlob
import os
from serpapi import GoogleSearch
class Analysis:
def __init__(self, term):
self.term = term
self.subjectivity = 0
self.sentiment = 0
self.url = f"https://www.google.com/search"
def run (self):
params = {
"engine": "google",
"tbm": "nws",
"q": self.url,
"api_key": os.getenv("API_KEY"),
}
search = GoogleSearch(params)
results = search.get_dict()
news_data = []
for result in results['news_results']:
title = result['title']
link = result['link']
snippet = result['snippet']
source = result['source']
date_published = result['date']
news_data.append({
"title": title,
"link": link,
"source": source,
"snippet": snippet,
"date_published": date_published
})
for h in news_data:
blob = TextBlob(f"{h['title']} {h['snippet']}")
self.sentiment += blob.sentiment.polarity / len(news_data)
self.subjectivity += blob.sentiment.subjectivity / len(news_data)
a = Analysis("Vaccine")
a.run()
print(a.term, "Subjectivity: ", a.subjectivity, "Sentiment: " , a.sentiment)
# Vaccine Subjectivity: 0.30957251082251086 Sentiment: 0.06277056277056277
# Lasagna Subjectivity: 0.30957251082251086 Sentiment: 0.06277056277056277
P.S - I wrote a bit more detailed blog post about how to scrape Google News.
Disclaimer, I work for SerpApi.

Trying to search for images using Google Search, error 400

I keep getting this error:urllib.error.HTTPError: HTTP Error 400: Bad Request
I believe it may have something to do with the links, since when I put them in (and replace the {}) I receive the same error, but I don't know which links are correct/
(Python 3.6, Anaconda)
import os
import urllib.request as ulib
from bs4 import BeautifulSoup as Soup
import json
url_a = 'https://www.google.com/search?ei=1m7NWePfFYaGmQG51q7IBg&hl=en&q={}'
url_b = '\&tbm=isch&ved=0ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ&start={}'
url_c = '\&yv=2&vet=10ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ.1m7NWePfFYaGmQG51q7IBg'
url_d = '\.i&ijn=1&asearch=ichunk&async=_id:rg_s,_pms:s'
url_base = ''.join((url_a, url_b, url_c, url_d))
headers = {'User-Agent': 'Chrome/69.0.3497.100'}
def get_links(search_name):
search_name = search_name.replace(' ', '+')
url = url_base.format(search_name, 0)
request = ulib.Request(url, data=None, headers=headers)
json_string = ulib.urlopen(request).read()
page = json.loads(json_string)
new_soup = Soup(page[1][1], 'lxml')
images = new_soup.find_all('img')
links = [image['src'] for image in images]
return links
if __name__ == '__main__':
search_name = 'Thumbs up'
links = get_links(search_name)
for link in links:
print(link)
I think you have a bunch of params you don't need
Try this simpler URL for image searching:
https://www.google.com/search?q={KEY_WORD}&tbm=isch
For example:
https://www.google.com/search?q=apples&tbm=isch
I think the problem is in asearch=ichunk&async=_id:rg_s,_pms:s which cannot be used with search, if i remove them it works:
import os
import urllib.request as ulib
from bs4 import BeautifulSoup as Soup
import json
url_a = 'https://www.google.com/search?ei=1m7NWePfFYaGmQG51q7IBg&hl=en&q=a+mouse'
url_b = '\&tbm=isch&ved=0ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ&start={}'
url_c = '\&yv=2&vet=10ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ.1m7NWePfFYaGmQG51q7IBg'
url_d = '\.i&ijn=1'
url_base = ''.join((url_a, url_b, url_c, url_d))
print(url_base);
headers = {'User-Agent': 'Chrome/69.0.3497.100'}
def get_links(search_name):
search_name = search_name.replace(' ', '+')
url = url_base.format(search_name, 0)
request = ulib.Request(url, data=None, headers=headers)
json_string = ulib.urlopen(request).read()
print(json_string)
page = json.loads(json_string)
new_soup = Soup(page[1][1], 'lxml')
images = new_soup.find_all('img')
links = [image['src'] for image in images]
return links
if __name__ == '__main__':
search_name = 'Thumbs up'
links = get_links(search_name)
for link in links:
print(link)
I'm not really sure what you were trying to do by scraping JSON data with beautifulsoup since it can't do it. Instead, you can prase <script> tags that might contain JSON data via re module and then iterate over parsed JSON string.
Have a look at requsets library. You can get a more easy to read code by only adding needed query parameters (already mentioned by LeKhan9) in say, params (dict) variable and then pass it into request.get() just like you did with headers like so:
params = {
"q": "minecraft lasagna skin",
"tbm": "isch",
"ijn": "0", # batch of 100 images
}
request.get(URL, params=params)
Code and full example in the online IDE that scrapes suggested search results at the top as well (try to read step-by-step, it's pretty straightforward):
import requests, lxml, re, json
from bs4 import BeautifulSoup
headers = {
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
params = {
"q": "minecraft lasagna skin",
"tbm": "isch",
"ijn": "0",
}
html = requests.get("https://www.google.com/search", params=params, headers=headers)
soup = BeautifulSoup(html.text, 'lxml')
print('\nGoogle Images Metadata:')
for google_image in soup.select('.isv-r.PNCib.MSM1fd.BUooTd'):
title = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')['title']
source = google_image.select_one('.fxgdke').text
link = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')['href']
print(f'{title}\n{source}\n{link}\n')
# this steps could be refactored to a more compact
all_script_tags = soup.select('script')
# # https://regex101.com/r/48UZhY/4
matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))
# https://kodlogs.com/34776/json-decoder-jsondecodeerror-expecting-property-name-enclosed-in-double-quotes
# if you try to json.loads() without json.dumps it will throw an error:
# "Expecting property name enclosed in double quotes"
matched_images_data_fix = json.dumps(matched_images_data)
matched_images_data_json = json.loads(matched_images_data_fix)
# https://regex101.com/r/pdZOnW/3
matched_google_image_data = re.findall(r'\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",', matched_images_data_json)
# https://regex101.com/r/NnRg27/1
matched_google_images_thumbnails = ', '.join(
re.findall(r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]',
str(matched_google_image_data))).split(', ')
print('Google Image Thumbnails:') # in order
for fixed_google_image_thumbnail in matched_google_images_thumbnails:
# https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
google_image_thumbnail_not_fixed = bytes(fixed_google_image_thumbnail, 'ascii').decode('unicode-escape')
# after first decoding, Unicode characters are still present. After the second iteration, they were decoded.
google_image_thumbnail = bytes(google_image_thumbnail_not_fixed, 'ascii').decode('unicode-escape')
print(google_image_thumbnail)
# removing previously matched thumbnails for easier full resolution image matches.
removed_matched_google_images_thumbnails = re.sub(
r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]', '', str(matched_google_image_data))
# https://regex101.com/r/fXjfb1/4
# https://stackoverflow.com/a/19821774/15164646
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
removed_matched_google_images_thumbnails)
print('\nGoogle Full Resolution Images:') # in order
for fixed_full_res_image in matched_google_full_resolution_images:
# https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')
print(original_size_img)
----------------
'''
Google Images Metadata:
Lasagna Minecraft Skins | Planet Minecraft Community
planetminecraft.com
https://www.planetminecraft.com/skins/tag/lasagna/
...
Google Image Thumbnails:
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSPttXb_7ClNBirfv2Beh4aOBjlc-7Jw_kY8pZ4DrkbAavZcJEtz8djo_9iqdnatiG6Krw&usqp=CAU
...
Google Full Resolution Images:
https://static.planetminecraft.com/files/resource_media/preview/skinLasagnaman_minecraft_skin-6204972.jpg
...
'''
Alternatively, you can achieve this using Google Images API from SerpApi. It's a paid API with a free plan.
The biggest and noticeable difference is that you only need to iterate over structured JSON with already parsed data without the need to figure why something isn't parsing properly. Check out the playground.
Code to integrate:
import os, json # json for pretty output
from serpapi import GoogleSearch
params = {
"api_key": os.getenv("API_KEY"),
"engine": "google",
"q": "minecraft shaders 8k photo",
"tbm": "isch"
}
search = GoogleSearch(params)
results = search.get_dict()
print(json.dumps(results['suggested_searches'], indent=2, ensure_ascii=False))
print(json.dumps(results['images_results'], indent=2, ensure_ascii=False)
-----------
# same output as above but in JSON format
I wrote a blog post on how to scrape Google Images in a bit more detailed way.
Dislaimer, I work for SerpApi.

Beautiful Soup image scraper problems

I get the following traceback:
Traceback (most recent call last):
File "/home/ro/image_scrape_test.py", line 20, in <module>
soup = BeautifulSoup(searched, "lxml")
File "/usr/local/lib/python3.4/dist-packages/bs4/__init__.py", line 176, in __init__
elif len(markup) <= 256:
TypeError: object of type 'NoneType' has no len()
This is my code so far:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import urllib
#searches google images
driver = webdriver.Firefox()
google_images = ("https://www.google.com/search?site=imghp&tbm=isch source=hp&biw=1366&bih=648&q=")
search_term = input("what is your search term")
searched = driver.get("{0}{1}".format(google_images, search_term))
def savepic(url):
uri = ("/home/ro/image scrape/images/download.jpg")
if url != "":
urllib.urlretrieve(url, uri)
soup = BeautifulSoup(searched, "lxml")
soup1 = soup.content
images = soup1.find_all("a")
for image in images:
savepic(image)
I'm starting out so i'd appreciate any tips on how I can improve my code.
Thankyou
driver.get() loads a webpage in the browser and returns None which makes the searched variable to have a None value.
You probably meant to get the .page_source instead:
soup = BeautifulSoup(driver.page_source, "lxml")
Two additional points here:
you don't actually need BeautifulSoup here - you can locate the desired images with selenium using, for instance, driver.find_elements_by_tag_name()
I have not tested your code, but I think you would need to add additional Explicit Waits to make selenium wait for the page to load
searched is None. Apparently, the url you are using is invalid.
You can scrape Google images by only using the beautifulsoup and requests library, selenium is not required.
For example, if you only want to extract thumbnail images (small resolution size), you can pass "content-type": "image/png" query param (solution found from MendelG) and it will return thumbnail image links.
import requests
from bs4 import BeautifulSoup
params = {
"q": "batman wallpaper",
"tbm": "isch",
"content-type": "image/png",
}
html = requests.get("https://www.google.com/search", params=params)
soup = BeautifulSoup(html.text, 'html.parser')
for img in soup.select("img"):
print(img["src"])
# https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQAxU74QyJ8jn8Qq0ZK3ur_GkxjICcvmiC30DWnk03DEsi7YUgS8XXksdyybXY&s
# https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRh5Fhah5gT9msG7vhXeQzAziS17Jp1HE_wE5O00113DtE2rJztgvxwRSonAno&s
# ...
To scrape the full-res image URL with requests and beautifulsoup you need to scrape data from the page source code via regex.
Find all <script> tags:
soup.select('script')
Match images data via regex:
matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))
Match desired images (full res size) via regex:
# https://kodlogs.com/34776/json-decoder-jsondecodeerror-expecting-property-name-enclosed-in-double-quotes
# if you try to json.loads() without json.dumps() it will throw an error:
# "Expecting property name enclosed in double quotes"
matched_images_data_fix = json.dumps(matched_images_data)
matched_images_data_json = json.loads(matched_images_data_fix)
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
matched_images_data_json)
Extract and decode them using bytes() and decode():
for fixed_full_res_image in matched_google_full_resolution_images:
original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')
Code and full example in the online IDE that also downloads images to a folder:
import requests, lxml, re, json
from bs4 import BeautifulSoup
headers = {
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
params = {
"q": "pexels cat",
"tbm": "isch",
"hl": "en",
"ijn": "0",
}
html = requests.get("https://www.google.com/search", params=params, headers=headers)
soup = BeautifulSoup(html.text, 'lxml')
def get_images_data():
print('\nGoogle Images Metadata:')
for google_image in soup.select('.isv-r.PNCib.MSM1fd.BUooTd'):
title = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')['title']
source = google_image.select_one('.fxgdke').text
link = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')['href']
print(f'{title}\n{source}\n{link}\n')
# this steps could be refactored to a more compact
all_script_tags = soup.select('script')
# # https://regex101.com/r/48UZhY/4
matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))
# https://kodlogs.com/34776/json-decoder-jsondecodeerror-expecting-property-name-enclosed-in-double-quotes
# if you try to json.loads() without json.dumps it will throw an error:
# "Expecting property name enclosed in double quotes"
matched_images_data_fix = json.dumps(matched_images_data)
matched_images_data_json = json.loads(matched_images_data_fix)
# https://regex101.com/r/pdZOnW/3
matched_google_image_data = re.findall(r'\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",', matched_images_data_json)
# https://regex101.com/r/NnRg27/1
matched_google_images_thumbnails = ', '.join(
re.findall(r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]',
str(matched_google_image_data))).split(', ')
print('Google Image Thumbnails:') # in order
for fixed_google_image_thumbnail in matched_google_images_thumbnails:
# https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
google_image_thumbnail_not_fixed = bytes(fixed_google_image_thumbnail, 'ascii').decode('unicode-escape')
# after first decoding, Unicode characters are still present. After the second iteration, they were decoded.
google_image_thumbnail = bytes(google_image_thumbnail_not_fixed, 'ascii').decode('unicode-escape')
print(google_image_thumbnail)
# removing previously matched thumbnails for easier full resolution image matches.
removed_matched_google_images_thumbnails = re.sub(
r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]', '', str(matched_google_image_data))
# https://regex101.com/r/fXjfb1/4
# https://stackoverflow.com/a/19821774/15164646
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
removed_matched_google_images_thumbnails)
print('\nDownloading Google Full Resolution Images:') # in order
for index, fixed_full_res_image in enumerate(matched_google_full_resolution_images):
# https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')
print(original_size_img)
get_images_data()
-------------
'''
Google Images Metadata:
9,000+ Best Cat Photos · 100% Free Download · Pexels Stock Photos
pexels.com
https://www.pexels.com/search/cat/
...
Google Image Thumbnails:
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR2cZsuRkkLWXOIsl9BZzbeaCcI0qav7nenDvvqi-YSm4nVJZYyljRsJZv6N5vS8hMNU_w&usqp=CAU
...
Full Resolution Images:
https://images.pexels.com/photos/1170986/pexels-photo-1170986.jpeg?cs=srgb&dl=pexels-evg-culture-1170986.jpg&fm=jpg
https://images.pexels.com/photos/3777622/pexels-photo-3777622.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
...
'''
Alternatively, you can achieve the same thing by using Google Images API from SerpApi. It's a paid API with a free plan.
The difference in your case is that you don't have to deal with regex to match and extract needed data from the source code of the page, instead, you only need to iterate over structured JSON and get what you want faster.
Code to integrate:
import os, json # json for pretty output
from serpapi import GoogleSearch
def get_google_images():
params = {
"api_key": os.getenv("API_KEY"),
"engine": "google",
"q": "pexels cat",
"tbm": "isch"
}
search = GoogleSearch(params)
results = search.get_dict()
print(json.dumps(results['images_results'], indent=2, ensure_ascii=False))
get_google_images()
---------------
'''
[
...
{
"position": 100, # img number
"thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRR1FCGhFsr_qZoxPvQBDjVn17e_8bA5PB8mg&usqp=CAU",
"source": "pexels.com",
"title": "Close-up of Cat · Free Stock Photo",
"link": "https://www.pexels.com/photo/close-up-of-cat-320014/",
"original": "https://images.pexels.com/photos/2612982/pexels-photo-2612982.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500",
"is_product": false
}
]
'''
P.S - I wrote a more in-depth blog post about how to scrape Google Images, and how to reduce the chance of being blocked while web scraping search engines.
Disclaimer, I work for SerpApi.

Pandas: Write all re.search results to csv from BeautifulSoup

I have these beginnings of a Python pandas script that searches for values in on Google and grabs any PDF links it can find on the first page.
I have two questions, listed below.
import pandas as pd
from bs4 import BeautifulSoup
import urllib2
import re
df = pd.DataFrame(["Shakespeare", "Beowulf"], columns=["Search"])
print "Searching for PDFs ..."
hdr = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
"Accept-Encoding": "none",
"Accept-Language": "en-US,en;q=0.8",
"Connection": "keep-alive"}
def crawl(search):
google = "http://www.google.com/search?q="
url = google + search + "+" + "PDF"
req = urllib2.Request(url, headers=hdr)
pdf_links = None
placeholder = None #just a column placeholder
try:
page = urllib2.urlopen(req).read()
soup = BeautifulSoup(page)
cite = soup.find_all("cite", attrs={"class":"_Rm"})
for link in cite:
all_links = re.search(r".+", link.text).group().encode("utf-8")
if all_links.endswith(".pdf"):
pdf_links = re.search(r"(.+)pdf$", all_links).group()
print pdf_links
except urllib2.HTTPError, e:
print e.fp.read()
return pd.Series([pdf_links, placeholder])
df[["PDF links", "Placeholder"]] = df["Search"].apply(crawl)
df.to_csv(FileName, index=False, delimiter=",")
The results from print pdf_links will be:
davidlucking.com/documents/Shakespeare-Complete%20Works.pdf
sparks.eserver.org/books/shakespeare-tempest.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
calhoun.k12.il.us/teachers/wdeffenbaugh/.../Shakespeare%20Sonnets.pdf
www.yorku.ca/inpar/Beowulf_Child.pdf
www.yorku.ca/inpar/Beowulf_Child.pdf
https://is.muni.cz/el/1441/.../2._Beowulf.pdf
https://is.muni.cz/el/1441/.../2._Beowulf.pdf
https://is.muni.cz/el/1441/.../2._Beowulf.pdf
https://is.muni.cz/el/1441/.../2._Beowulf.pdf
www.penguin.com/static/pdf/.../beowulf.pdf
www.neshaminy.org/cms/lib6/.../380/text.pdf
www.neshaminy.org/cms/lib6/.../380/text.pdf
sparks.eserver.org/books/beowulf.pdf
And the csv output will look like:
Search PDF Links
Shakespeare calhoun.k12.il.us/teachers/wdeffenbaugh/.../Shakespeare%20Sonnets.pdf
Beowulf sparks.eserver.org/books/beowulf.pdf
Questions:
Is there a way to write all of the results as rows to the csv instead of
just the bottom one? And if possible, include the value in Search for each row that corresponds to "Shakespeare" or "Beowulf"?
How can I write out the full pdf links without long links automatically abbreviating with "..."?
This will get you all the proper pdf links using soup.find_all("a",href=True) and save them in a Dataframe and to a csv:
hdr = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
"Accept-Encoding": "none",
"Accept-Language": "en-US,en;q=0.8",
"Connection": "keep-alive"}
def crawl(columns=None, *search):
df = pd.DataFrame(columns= columns)
for term in search:
google = "http://www.google.com/search?q="
url = google + term + "+" + "PDF"
req = urllib2.Request(url, headers=hdr)
try:
page = urllib2.urlopen(req).read()
soup = BeautifulSoup(page)
pdfs = []
links = soup.find_all("a",href=True)
for link in links:
lk = link["href"]
if lk.endswith(".pdf"):
pdfs.append((term, lk))
df2 = pd.DataFrame(pdfs, columns=columns)
df = df.append(df2, ignore_index=True)
except urllib2.HTTPError, e:
print e.fp.read()
return df
df = crawl(["Search", "PDF link"],"Shakespeare","Beowulf")
df.to_csv("out.csv",index=False)
out.csv:
Search,PDF link
Shakespeare,http://davidlucking.com/documents/Shakespeare-Complete%20Works.pdf
Shakespeare,http://www.w3.org/People/maxf/XSLideMaker/hamlet.pdf
Shakespeare,http://sparks.eserver.org/books/shakespeare-tempest.pdf
Shakespeare,https://phillipkay.files.wordpress.com/2011/07/william-shakespeare-plays.pdf
Shakespeare,http://www.artsvivants.ca/pdf/eth/activities/shakespeare_overview.pdf
Shakespeare,http://triggs.djvu.org/djvu-editions.com/SHAKESPEARE/SONNETS/Download.pdf
Beowulf,http://www.yorku.ca/inpar/Beowulf_Child.pdf
Beowulf,https://is.muni.cz/el/1441/podzim2013/AJ2RC_STAL/2._Beowulf.pdf
Beowulf,http://teacherweb.com/IL/Steinmetz/MottramM/Beowulf---Seamus-Heaney.pdf
Beowulf,http://www.penguin.com/static/pdf/teachersguides/beowulf.pdf
Beowulf,http://www.neshaminy.org/cms/lib6/PA01000466/Centricity/Domain/380/text.pdf
Beowulf,http://www.sparknotes.com/free-pdfs/uscellular/download/beowulf.pdf
To get PDF links, you're looking for these selectors:
for result in soup.select('.tF2Cxc'):
# check if PDF is present via according CSS class OR use try/except instead
if result.select_one('.ZGwO7'):
pdf_file = result.select_one('.yuRUbf a')['href']
CSS selectors reference. Have a look at SelectorGadget Chrome extension to grab CSS selectors by clicking on the desired element in your browser.
To save them to CSV, you're looking for this:
# store all links from a for loop
pdfs = []
# create PDF Link column and append PDF links from a pdfs list()
df = pd.DataFrame({'PDF Link': pdfs})
# save to csv and delete default pandas index column. Done!
df.to_csv('PDFs.csv', index=False)
Code and example in the online IDE (also shows how to save locally):
from bs4 import BeautifulSoup
import requests, lxml
import pandas as pd
headers = {
'User-agent':
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
params = {
"q": "best lasagna recipe:pdf"
}
html = requests.get('https://www.google.com/search', headers=headers, params=params)
soup = BeautifulSoup(html.text, 'lxml')
pdfs = []
for result in soup.select('.tF2Cxc'):
# check if PDF is present via according CSS class
if result.select_one('.ZGwO7'):
pdf_file = result.select_one('.yuRUbf a')['href']
pdfs.append(pdf_file)
# creates PDF Link column and appends PDF links from a pdfs list()
df = pd.DataFrame({'PDF Link': pdfs})
df.to_csv('Bs4_PDFs.csv', index=False)
-----------
# from CSV
'''
PDF Link
http://www.bakersedge.com/PDF/Lasagna.pdf
http://greatgreens.ca/recipes/Recipe%20-%20Worlds%20Best%20Lasagna.pdf
https://liparifoods.com/wp-content/uploads/2015/10/lipari-foods-holiday-recipes.pdf
...
'''
Alternatively, you can achieve the same thing by using Google Organic Results API from SerpApi. It's a paid API with a free plan.
The difference in your case is that rather than creating everything from scratch, figuring out why certain things don't work as expected, and then maintain it over time, all that you need to do is to iterate over structured JSON and get the data you want. It might be also more readable and quickly understand what's going on inside the code.
Code to integrate with your example:
from serpapi import GoogleSearch
import os
import pandas as pd
params = {
"api_key": os.getenv("API_KEY"),
"engine": "google",
"q": "best lasagna recipe:pdf",
"hl": "en"
}
search = GoogleSearch(params)
results = search.get_dict()
pdfs = []
# iterate over organic results and check if .pdf file type exists in link
for result in results['organic_results']:
if '.pdf' in result['link']:
pdf_file = result['link']
pdfs.append(pdf_file)
df = pd.DataFrame({'PDF Link': pdfs})
df.to_csv('SerpApi_PDFs.csv', index=False)
-----------
# from CSV
'''
PDF Link
http://www.bakersedge.com/PDF/Lasagna.pdf
http://greatgreens.ca/recipes/Recipe%20-%20Worlds%20Best%20Lasagna.pdf
https://liparifoods.com/wp-content/uploads/2015/10/lipari-foods-holiday-recipes.pdf
...
'''
Disclaimer, I work for SerpApi.

Categories