Not able to scrape with BeautifulSoup - Python

I am trying to scrape the image and news URL from this website. The tags I have defined are:
root_tag = ["div", {"class": "ngp_col ngp_col-bottom-gutter-2 ngp_col-md-6 ngp_col-lg-4"}]
image_tag = ["div", {"class": "low-rez-image"}, "url"]
news_tag = ["a", {"": ""}, "href"]
and url is the page URL. My code for scraping the website is:
ua1 = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
ua2 = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome'
headers = {'User-Agent': ua2,
           'Accept': 'text/html,application/xhtml+xml,application/xml;'
                     'q=0.9,image/webp,*/*;q=0.8'}
session = requests.Session()
response = session.get(url, headers=headers)
webContent = response.content
bs = BeautifulSoup(webContent, 'lxml')
all_tab_data = bs.findAll(root_tag[0], root_tag[1])

result = []
for div in all_tab_data:
    try:
        news_url = None
        news_url = div.find(news_tag[0], news_tag[1]).get(news_tag[2])
    except Exception as e:
        news_url = None
    try:
        image_url = None
        div_img = str(div)
        match = re.search(r"(http(s?):)([/|.|\w|\s|-])*\.(?:jpg|gif|png|jpeg)", div_img)
        if match != None:
            image_url = str(match.group(0))
        else:
            image_url = div.find(image_tag[0], image_tag[1]).get(image_tag[2])
    except Exception as e:
        image_url = None
        pass
    result.append([news_url, image_url])
I debugged the code and found that all_tab_data is empty, even though I am choosing the correct root_tag, so I don't know what I am doing wrong.

The content is loaded from a JSON endpoint.
You can get all the image URLs this way:
import requests
url = "https://www.nationalgeographic.com/magazine/_jcr_content/content/promo-carousel.promo-carousel.json"
data = requests.get(url).json()
for item in data:
    for sub_item in item['promo_carousel']:
        p_img = sub_item['promo_image']
        if p_img is not None:
            print(p_img['image']['uri'])
Output:
https://www.nationalgeographic.com/content/dam/animals/2020/09/african-cheetah-snow/african-cheetah-snow-2.jpg
https://www.nationalgeographic.com/content/dam/animals/2020/09/wallaby-atrazine/wallaby-og-a0xh8r-01.jpg
https://www.nationalgeographic.com/content/dam/animals/2020/09/elephant-tuberculosis/r40bfj.jpg
https://www.nationalgeographic.com/content/dam/animals/2020/08/handfish/01-handfish-minden_90392182.jpg
https://www.nationalgeographic.com/content/dam/science/2020/09/08/cal-fire-update/california-fire-palley-mm9468_200905_000229.jpg
https://www.nationalgeographic.com/content/dam/science/2020/09/11/face-mask-recognition/20200901_002_out_mp4_00_00_03_18_still003.jpg
https://www.nationalgeographic.com/content/dam/science/2020/09/10/winds-fires-california/winds-fires-california-2019.jpg
https://www.nationalgeographic.com/content/dam/science/2020/09/10/fire-air-quality/fire-air-pollution-20253854760329.jpg
https://www.nationalgeographic.com/content/dam/science/2020/09/02/autopsy/mm9412_200717_000522.jpg
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/10/departments/explore/stellar-map-milky-way-og.png
https://www.nationalgeographic.com/content/dam/science/2020/07/31/vaccine/vaccine_20209514426186.jpg
https://www.nationalgeographic.com/content/dam/archaeologyandhistory/rights-exempt/history-magazine/2020/09-10/metric-system/og-french-metric-system.jpg
https://www.nationalgeographic.com/content/dam/archaeologyandhistory/rights-exempt/OG/red-terror-explainer-og.jpg
https://www.nationalgeographic.com/content/dam/archaeologyandhistory/rights-exempt/OG/promo-medieval-pandemic.jpg
https://www.nationalgeographic.com/content/dam/archaeologyandhistory/2020/09/Asian-American-COVID/og_asianamerican.jpg
https://www.nationalgeographic.com/content/dam/archaeologyandhistory/2020/08/goodbye-hong-kong/19-hong-kong-security-law-china.jpg
https://www.nationalgeographic.com/content/dam/travel/commercial/2020/samsung/wyoming/samsung-wyoming-mountain.jpg
https://www.nationalgeographic.com/content/dam/travel/2020-digital/kissing-tourism-sites/gettyimages-3332297.jpg
https://www.nationalgeographic.com/content/dam/travel/2020-digital/thinking-about-traveling/nationalgeographic_1085186.jpg
https://www.nationalgeographic.com/content/dam/science/commercial/2019/domestic/wyss-foundation/wyss-foundation_cfn_natgeo-image-collection_1971120.jpg
https://www.nationalgeographic.com/content/dam/travel/2020-digital/least-visited-US-national-parks/nationalgeographic_2466315.jpg
EDIT: To get title and article data use this:
for item in data:
    for sub_item in item['promo_carousel']:
        print(f"{sub_item['components'][0]['title']['text']}"
              f"\n{sub_item['uri']}")
        p_img = sub_item['promo_image']
        if p_img is not None:
            print(f"{p_img['image']['uri']}")
        print("-" * len(sub_item['uri']))
Prints (shortened for brevity):
Rare photographs show African cheetahs in snowstorm
https://www.nationalgeographic.com/animals/2020/09/cheetahs-snow-south-africa/
https://www.nationalgeographic.com/content/dam/animals/2020/09/african-cheetah-snow/african-cheetah-snow-2.jpg
------------------------------------------------------------------------------
Wallabies exposed to common weed killer have reproductive abnormalities
https://www.nationalgeographic.com/animals/2020/09/wallaby-sexual-development-impaired-by-atrazine-herbicide/
https://www.nationalgeographic.com/content/dam/animals/2020/09/wallaby-atrazine/wallaby-og-a0xh8r-01.jpg
-------------------------------------------------------------------------------------------------------------
...

Another solution:
import json
import requests
from bs4 import BeautifulSoup
url = 'https://www.nationalgeographic.com/magazine/'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
data = json.loads( soup.select_one('[data-pestle-module="Lead"] [data-pestle-options]').string )
# to print all data uncomment next line:
# print(json.dumps(data, indent=4))
for b in data['body']:
    if 'multilayout_promo_beta' not in b:
        continue
    for s in b['multilayout_promo_beta']['stories']:
        if not s.get('lead_media'):
            continue
        if 'immersive_lead' not in s['lead_media']:
            print(s['components'][0]['title']['text'])
            print(s['lead_media']['image']['uri'])
        else:
            print(s['lead_media']['immersive_lead']['title'])
            print(s['lead_media']['immersive_lead']['lead_media']['image']['uri'])
        print(s['uri'])
        print('-' * 80)
Prints:
America’s neglected hiking trails are more popular than ever—but they’re struggling
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/10/us-hiking-trails/us-hiking-trails-campfire-valley.jpg
https://www.nationalgeographic.com/magazine/2020/10/america-long-neglected-hiking-trails-are-more-popular-than-ever-but-they-are-struggling-feature/
--------------------------------------------------------------------------------
The heroic effort in the Amazon to save one of the world’s largest eagles
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/10/saving-largest-eagle/harpy-eagles-brazil-14a.jpg
https://www.nationalgeographic.com/animals/2020/04/saving-worlds-largest-eagle/
--------------------------------------------------------------------------------
The robot revolution has arrived
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/09/rise-of-the-machines/mm8612_190408_00122-3.jpg
https://www.nationalgeographic.com/magazine/2020/09/the-robot-revolution-has-arrived-feature/
--------------------------------------------------------------------------------
They may look goofy, but ostriches are nobody’s fool
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/09/ostriches/ostriches-standing-tall-male-beach.jpg
https://www.nationalgeographic.com/magazine/2020/09/they-may-look-goofy-but-ostriches-are-nobodys-fool-feature/
--------------------------------------------------------------------------------
The Great Lakes depend on ice. This winter, they barely froze.
https://www.nationalgeographic.com/content/dam/science/2020/03/19/no-ice/year-with-no-ice-sacka-46.jpg
https://www.nationalgeographic.com/science/2020/03/great-lakes-depend-on-winter-ice-low-cover/
--------------------------------------------------------------------------------
‘I put my camera to my face and cried.’ Documenting a COVID-19 hot spot
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/10/departments/coronavirus/departements-detroit-singer-funeral.jpg
https://www.nationalgeographic.com/magazine/2020/10/danny-wilcox-frazier-on-photographing-covid-19-in-detroit/
--------------------------------------------------------------------------------
COVID-19’s impact on the animal kingdom—so far
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/10/departments/coronavirus/departments-covid-animals-tiger.jpg
https://www.nationalgeographic.com/magazine/2020/10/covid-19s-impact-on-the-animal-kingdom-so-far/
--------------------------------------------------------------------------------
To prevent the next deadly disease, we must stop harming nature
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/09/departments/coronavirus/departments-coronavirus-coral-reef.jpg
https://www.nationalgeographic.com/magazine/2020/09/pristine-seas-enric-sala-we-must-stop-harming-nature-to-prevent-deadly-disease-coronavirus/
--------------------------------------------------------------------------------
Beyond masks and gloves—here’s how the pros handle dangerous microbes
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/09/departments/coronavirus/tool-kit-covid-testing.jpg
https://www.nationalgeographic.com/magazine/2020/09/beyond-masks-and-gloves-here-is-how-the-pros-handle-dangerous-microbes/
--------------------------------------------------------------------------------
NASA sent a map to space to help aliens find Earth. Now it needs an update.
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/10/departments/explore/departments-stellar-map-galaxy.jpg
https://www.nationalgeographic.com/magazine/2020/10/nasa-sent-a-map-to-space-to-help-aliens-find-earth-now-it-needs-an-update/
--------------------------------------------------------------------------------
This archaeologist hunts DNA from prehistoric diseases
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/10/departments/coronavirus/departments-genius-rifkin.jpg
https://www.nationalgeographic.com/magazine/2020/10/archaeologist-riaan-rifkin-hunts-dna-from-prehistoric-diseases/
--------------------------------------------------------------------------------
See the ingenious cameras used to photograph elusive animals
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/10/departments/explore/departments-artifact-crittercam-wooden-fin.jpg
https://www.nationalgeographic.com/magazine/2020/10/see-the-ingenious-crittercams-used-to-photograph-elusive-animals/
--------------------------------------------------------------------------------
Popsicles and belly rubs: The joys of watching a panda grow up
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/09/departments/explore/explore-essay-panda-stretching.jpg
https://www.nationalgeographic.com/magazine/2020/09/popsicles-and-belly-rubs-the-joys-of-watching-panda-bei-bei-grow-up/
--------------------------------------------------------------------------------

Related

Web-scrape. BeautifulSoup. Multiple Pages. How on earth would you do that?

Hi, I am a newbie to programming, so I spent 4 days trying to learn Python. I invented some new swear words too.
I was particularly interested in trying as an exercise some web-scraping to learn something new and get some exposure to see how it all works.
This is what I came up with. See code at end. It works (to a degree)
But what's missing?
This website has pagination on it, in this case 11 pages' worth. How would you go about adding to this script and getting Python to look at those other pages too and carry out the same scrape, i.e. scrape page 1, page 2, 3 ... 11 and post the results to a CSV?
https://www.organicwine.com.au/vegan/?pgnum=1
https://www.organicwine.com.au/vegan/?pgnum=2
https://www.organicwine.com.au/vegan/?pgnum=3
https://www.organicwine.com.au/vegan/?pgnum=4
https://www.organicwine.com.au/vegan/?pgnum=5
https://www.organicwine.com.au/vegan/?pgnum=6
https://www.organicwine.com.au/vegan/?pgnum=7
8, 9,10, and 11
On these pages the images are actually thumbnails, something like 251px by 251px.
How would you go about extending this script so that, while you are at it, it follows the links to the detailed product page, captures the image link from there (where the images are 1600px by 1600px), and posts those links to the CSV?
https://www.organicwine.com.au/mercer-wines-preservative-free-shiraz-2020
When we have identified those links, let's also download those larger images to a folder.
CSV writer: I also don't understand line 58,
for i in range(23)
How would I know how many products there were without counting them? (i.e. there are 24 products on page one)
So this is what I want to learn how to do. Not asking for much (he says sarcastically). I could pay someone on Upwork to do it, but where's the fun in that? And that does not teach me how to 'fish'.
Where is a good place to learn Python? A master class on web scraping? It seems to be trial and error, blog posts, and wherever you can pick up bits of information to piece it all together.
Maybe I need a mentor.
I wish there had been someone I could have reached out to, to tell me what BeautifulSoup was all about. I worked it out by trial and error and mostly guessing; no real understanding of it, but it just works.
Anyway, any help in pulling this all together to produce a decent script would be greatly appreciated.
Hopefully there is someone out there who would not mind helping me.
Apologies to OrganicWine for using their website as a learning tool. I do not wish to cause any harm or be a nuisance to the site.
Thank you in advance
John
code:
import requests
import csv
from bs4 import BeautifulSoup
URL = "https://www.organicwine.com.au/vegan/?pgnum=1"
response = requests.get(URL)
website_html = response.text
soup = BeautifulSoup(website_html, "html.parser")
product_title = soup.find_all('div', class_="caption")
# print(product_title)
winename = []
for wine in product_title:
    winetext = wine.a.text
    winename.append(winetext)
    print(f'''Wine Name: {winetext}''')
# print(f'''\nWine Name: {winename}\n''')

product_price = soup.find_all('div', class_='wrap-thumb-mob')
# print(product_price.text)
price = []
for wine in product_price:
    wineprice = wine.span.text
    price.append(wineprice)
    print(f'''Wine Price: {wineprice}''')
# print(f'''\nWine Price: {price}\n''')

image = []
product_image_link = soup.find_all('div', class_='thumbnail-image')
# print(product_image_link)
for imagelink in product_image_link:
    wineimagelink = imagelink.a['href']
    image.append(wineimagelink)
    # image.append(imagelink)
    print(f'''Wine Image Link: {wineimagelink}''')
# print(f'''\nWine Image: {image}\n''')

# """ writing data to CSV """
# open OrganicWine2.csv file in "write" mode
# newline stops a blank line appearing in csv
with open('OrganicWine2.csv', 'w', newline='') as file:
    # create a "writer" object
    writer = csv.writer(file, delimiter=',')
    # use "writer" obj to write
    # you should give a "list"
    writer.writerow(["Wine Name", "Wine Price", "Wine Image Link"])
    for i in range(23):
        writer.writerow([
            winename[i],
            price[i],
            image[i],
        ])
In this case, to do pagination, instead of for i in range(1, 100), which is a hardcoded way of paging, it's better to use a while loop to dynamically paginate through all available pages.
The while True loop keeps running as long as a transition to the next page is possible; here it checks for the presence of the next-page button, which the CSS selector ".fa-chevron-right" matches:
if soup.select_one(".fa-chevron-right"):
params["pgnum"] += 1 # go to the next page
else:
break
To extract the full-size image, an additional request is required; the CSS selector ".main-image a" matches the full-size image link:
full_image_html = requests.get(link, headers=headers, timeout=30)
image_soup = BeautifulSoup(full_image_html.text, "lxml")
try:
    original_image = f'https://www.organicwine.com.au{image_soup.select_one(".main-image a")["href"]}'
except:
    original_image = None
An additional step to avoid being blocked is to rotate user-agents. Ideally, it would be even better to use residential proxies together with a random user-agent.
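For example, here is a minimal sketch of user-agent rotation (not part of the original answer; the get_page helper and the header strings below are just illustrative examples), which picks a random User-Agent for each request:
import random

import requests

# a small pool of desktop user-agent strings (arbitrary examples)
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0",
]

def get_page(url, **kwargs):
    # pick a different user-agent for every request
    headers = {"User-Agent": random.choice(user_agents)}
    return requests.get(url, headers=headers, timeout=30, **kwargs)

page = get_page("https://www.organicwine.com.au/vegan/?", params={"pgnum": 1})
print(page.status_code)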
pandas can be used to extract data in CSV format:
pd.DataFrame(data=data).to_csv("<csv_file_name>.csv", index=False)
For a quick and easy way to find CSS selectors, you can use the SelectorGadget Chrome extension (it doesn't always work perfectly if the website is rendered via JavaScript).
Check the full code, with pagination and saving the information to CSV, in the online IDE.
from bs4 import BeautifulSoup
import requests, json, lxml
import pandas as pd
# https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
}
params = {
    'pgnum': 1  # page number, 1 by default
}

data = []

while True:
    page = requests.get(
        "https://www.organicwine.com.au/vegan/?",
        params=params,
        headers=headers,
        timeout=30,
    )
    soup = BeautifulSoup(page.text, "lxml")

    print(f"Extracting page: {params['pgnum']}")

    for products in soup.select(".price-btn-conts"):
        try:
            title = products.select_one(".new-h3").text
        except:
            title = None
        try:
            price = products.select_one(".price").text.strip()
        except:
            price = None
        try:
            snippet = products.select_one(".price-btn-conts p a").text
        except:
            snippet = None
        try:
            link = products.select_one(".new-h3 a")["href"]
        except:
            link = None

        # an additional request is needed to extract the full-size image
        full_image_html = requests.get(link, headers=headers, timeout=30)
        image_soup = BeautifulSoup(full_image_html.text, "lxml")
        try:
            original_image = f'https://www.organicwine.com.au{image_soup.select_one(".main-image a")["href"]}'
        except:
            original_image = None

        data.append(
            {
                "title": title,
                "price": price,
                "snippet": snippet,
                "link": link,
                "original_image": original_image
            }
        )

    if soup.select_one(".fa-chevron-right"):
        params["pgnum"] += 1
    else:
        break

# save to CSV (requires pandas: import pandas as pd)
pd.DataFrame(data=data).to_csv("<csv_file_name>.csv", index=False)

print(json.dumps(data, indent=2, ensure_ascii=False))
Example output:
[
  {
    "title": "Yangarra McLaren Vale GSM 2016",
    "price": "$29.78 in a straight 12\nor $34.99 each",
    "snippet": "The Yangarra GSM is a careful blending of Grenache, Shiraz and Mourvèdre in which the composition varies from year to year, conveying the traditional estate blends of the southern Rhône. The backbone of the wine comes fr...",
    "link": "https://www.organicwine.com.au/yangarra-mclaren-vale-gsm-2016",
    "original_image": "https://www.organicwine.com.au/assets/full/YG_GSM_16.png?20211110083637"
  },
  {
    "title": "Yangarra Old Vine Grenache 2020",
    "price": "$37.64 in a straight 12\nor $41.99 each",
    "snippet": "Produced from the fruit of dry grown bush vines planted high up in the Estate's elevated vineyards in deep sandy soils. These venerated vines date from 1946 and produce a wine that is complex, perfumed and elegant with a...",
    "link": "https://www.organicwine.com.au/yangarra-old-vine-grenache-2020",
    "original_image": "https://www.organicwine.com.au/assets/full/YG_GRE_20.jpg?20210710165951"
  },
  # ...
]
Create the URL by putting the page number in it, then put the rest of your code into a for loop, and you can use len(winename) to count how many results you have. You should do the writing outside the for loop. Here's your code with those changes:
import requests
import csv
from bs4 import BeautifulSoup
num_pages = 11

result = []
for pgnum in range(num_pages):
    url = f"https://www.organicwine.com.au/vegan/?pgnum={pgnum+1}"
    response = requests.get(url)
    website_html = response.text
    soup = BeautifulSoup(website_html, "html.parser")

    product_title = soup.find_all("div", class_="caption")
    winename = []
    for wine in product_title:
        winetext = wine.a.text
        winename.append(winetext)

    product_price = soup.find_all("div", class_="wrap-thumb-mob")
    price = []
    for wine in product_price:
        wineprice = wine.span.text
        price.append(wineprice)

    image = []
    product_image_link = soup.find_all("div", class_="thumbnail-image")
    for imagelink in product_image_link:
        winelink = imagelink.a["href"]
        response = requests.get(winelink)
        wine_page_soup = BeautifulSoup(response.text, "html.parser")
        main_image = wine_page_soup.find("a", class_="fancybox")
        image.append(main_image['href'])

    for i in range(len(winename)):
        result.append([winename[i], price[i], image[i]])

with open("/tmp/OrganicWine2.csv", "w", newline="") as file:
    writer = csv.writer(file, delimiter=",")
    writer.writerow(["Wine Name", "Wine Price", "Wine Image Link"])
    writer.writerows(result)
And here's how I would rewrite your code to accomplish this task. It's more Pythonic (you should basically never write range(len(something)); there's always a cleaner way), and it doesn't require knowing how many pages of results there are:
import csv
import time

import requests
from bs4 import BeautifulSoup

data = []
# Try opening 100 pages at most, in case the scraping code is broken,
# which can happen because websites change.
for pgnum in range(1, 100):
    url = f"https://www.organicwine.com.au/vegan/?pgnum={pgnum}"
    response = requests.get(url)
    website_html = response.text
    soup = BeautifulSoup(website_html, "html.parser")

    search_results = soup.find_all("div", class_="thumbnail")
    for search_result in search_results:
        name = search_result.find("div", class_="caption").a.text
        price = search_result.find("p", class_="price").span.text

        # link to the product's page
        link = search_result.find("div", class_="thumbnail-image").a["href"]

        # get the full resolution product image
        response = requests.get(link)
        time.sleep(1)  # rate limit
        wine_page_soup = BeautifulSoup(response.text, "html.parser")
        main_image = wine_page_soup.find("a", class_="fancybox")
        image_url = main_image["href"]

        # or you can just "guess" it from the thumbnail's URL
        # thumbnail = search_result.find("div", class_="thumbnail-image").a.img['src']
        # image_url = thumbnail.replace('/thumbL/', '/full/')

        data.append([name, price, link, image_url])

    # if there's no "next page" button or no search results on the current page,
    # stop scraping
    if not soup.find("i", class_="fa-chevron-right") or not search_results:
        break

    # rate limit
    time.sleep(1)

with open("/tmp/OrganicWine3.csv", "w", newline="") as file:
    writer = csv.writer(file, delimiter=",")
    writer.writerow(["Wine Name", "Wine Price", "Wine Link", "Wine Image Link"])
    writer.writerows(data)
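The question also asked about downloading the larger images to a folder, which neither version above does. Here is a minimal sketch (the download_image helper and the "images" folder name are my own illustration, not part of either script) that saves each image_url found in the loop:
import os
from urllib.parse import urlparse

import requests

def download_image(image_url, folder="images"):
    # create the target folder on first use
    os.makedirs(folder, exist_ok=True)
    # use the last part of the URL path as the file name
    filename = os.path.basename(urlparse(image_url).path)
    response = requests.get(image_url, timeout=30)
    response.raise_for_status()
    with open(os.path.join(folder, filename), "wb") as f:
        f.write(response.content)
    return filename

# e.g. inside the scraping loop, right after image_url is found:
# download_image(image_url)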

Facing issues while web scraping

I am trying to extract reviews from Glassdoor; however, I am facing issues. Please see my code below:
import requests
from bs4 import BeautifulSoup
headers = requests.utils.default_headers()
headers.update({
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
})
url = requests.get("https://www.glassdoor.co.in/Reviews/The-Wonderful-Company-Reviews-E1005987.htm?sort.sortType=RD&sort.ascending=false&countryRedirect=true", headers=headers)
urlContent = BeautifulSoup(url.content, "lxml")
print(urlContent)

review = urlContent.find_all('a', class_='reviewLink')
review
title = []
for i in range(0, len(review)):
    title.append(review[i].get_text())
title

rating = urlContent.find_all('div', class_='v2__EIReviewsRatingsStylesV2__ratingNum v2__EIReviewsRatingsStylesV2__small')
score = []
for i in range(0, len(rating)):
    score.append(rating[i].get_text())

rev_pros = urlContent.find_all("span", {"data-test": "pros"})
pros = []
for i in range(0, len(rev_pros)):
    pros.append(rev_pros[i].get_text())
pros

rev_cons = urlContent.find_all("span", {"data-test": "cons"})
cons = []
for i in range(0, len(rev_cons)):
    cons.append(rev_cons[i].get_text())
cons

advse = urlContent.find_all("span", {"data-test": "advice-management"})
advse
advise = []
for i in range(0, len(advse)):
    advise.append(advse[i].get_text())
advise

location = urlContent.find_all('span', class_='authorLocation')
location
job_location = []
for i in range(0, len(location)):
    job_location.append(location[i].get_text())
job_location

import pandas as pd
df = pd.DataFrame()
df['Review Title'] = title
df['Overall Score'] = score
df['Pros'] = pros
df['Cons'] = cons
df['Jobs_Location'] = job_location
df['Advise to Mgmt'] = advise
Here I am facing two challenges:
1. Unable to extract anything for 'advse' (used for 'Advice to Management').
2. Getting an error when I use 'Job Location' as a column in the data frame (ValueError: Length of values does not match length of index). My finding for this error was that there were ten rows for the other columns, but fewer rows for 'Job Location' because the location is not disclosed in some reviews.
Can anybody help me with this? Thanks in advance.
A better approach would be to find a <div> that encloses each of the reviews and then extract all the information needed from it before moving to the next. This would make it easier to deal with the case where information is missing in some reviews.
For example:
import requests
from bs4 import BeautifulSoup
import pandas as pd
headers = requests.utils.default_headers()
headers.update({
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
})
url = requests.get("https://www.glassdoor.co.in/Reviews/The-Wonderful-Company-Reviews-E1005987.htm?sort.sortType=RD&sort.ascending=false&countryRedirect=true", headers=headers)
urlContent = BeautifulSoup(url.content, "lxml")

get_text = lambda x: x.get_text(strip=True) if x else ""

entries = []
for entry in urlContent.find_all('div', class_='row mt'):
    review = entry.find('a', class_="reviewLink")
    rating = entry.find('div', class_='v2__EIReviewsRatingsStylesV2__ratingNum v2__EIReviewsRatingsStylesV2__small')
    rev_pros = entry.find("span", {"data-test": "pros"})
    rev_cons = entry.find("span", {"data-test": "cons"})
    location = entry.find('span', class_='authorLocation')
    advice = entry.find("span", {"data-test": "advice-management"})

    entries.append([
        get_text(review),
        get_text(rating),
        get_text(rev_pros),
        get_text(rev_cons),
        get_text(location),
        get_text(advice)
    ])

columns = ['Review Title', 'Overall Score', 'Pros', 'Cons', 'Jobs_Location', 'Advise to Mgmt']
df = pd.DataFrame(entries, columns=columns)
print(df)
The get_text() function ensures that if nothing was returned (i.e. None) then an empty string is returned.
You will need to improve your logic for extracting the advice. The information for the whole page is held inside <script> tags, and one of them holds the JSON data. The advice information is not moved into the HTML until a user clicks on it, so it would need to be extracted from the JSON. If this approach is used, it would also make sense to extract all of the other information directly from the JSON as well.
To do this, locate all the <script> tags and determine which contains the reviews. Convert the JSON into a Python data structure (using the JSON library). Now locate the reviews, for example:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
headers = requests.utils.default_headers()
headers.update({
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
})
url = requests.get("https://www.glassdoor.co.in/Reviews/The-Wonderful-Company-Reviews-E1005987.htm?sort.sortType=RD&sort.ascending=false&countryRedirect=true", headers=headers)
urlContent = BeautifulSoup(url.content, "lxml")

entries = []
for script in urlContent.find_all('script'):
    text = script.text
    if "appCache" in text:
        # extract the JSON from the script tag
        data = json.loads(text[text.find('{'): text.rfind('}') + 1])
        # go through all keys in the dictionary and pick those containing reviews
        for key, value in data['apolloState'].items():
            if ".reviews." in key and "links" not in key:
                location = value['location']
                city = location['id'] if location else None
                entries.append([
                    value['summary'],
                    value['ratingOverall'],
                    value['pros'],
                    value['cons'],
                    city,
                    value['advice']
                ])

columns = ['Review Title', 'Overall Score', 'Pros', 'Cons', 'Jobs_Location', 'Advise to Mgmt']
df = pd.DataFrame(entries, columns=columns)
print(df)
This would give you a dataframe as follows:
Review Title Overall Score Pros Cons Jobs_Location Advise to Mgmt
0 Upper management n... 3 Great benefits, lo... Career advancement... City:1146821 Listen to your emp...
1 Sales 2 Good atmosphere lo... Drive was very far... None None
2 As an organization... 2 Free water and goo... Not a lot of diver... None None
3 Great place to grow 4 If your direct man... Owners are heavily... City:1146821 None
4 Great Company 5 Great leadership, ... To grow and move u... City:1146821 None
5 Lots of opportunit... 5 This is a fast pac... There's a sense of... City:1146821 Continue listening...
6 Interesting work i... 3 Working with great... High workload and ... None None
7 Wonderful 5 This company care... The drive, but we ... City:1146577 Continue growing y...
8 Horrendous 1 The pay was fairly... Culture of abuse a... City:1146821 Upper management l...
9 Upper Leadership a... 1 Strong Company, fu... You don't have a f... City:1146577 You get rid of fol...
It would help if you added print(data) to see the whole structure of the data being returned. The only issue with this approach is that a further lookup would be needed to convert the city ID into an actual location. That information is also contained in the JSON.
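As a rough sketch of that extra lookup (this assumes, which print(data) would confirm or refute, that data['apolloState'] also contains entries keyed by those same city IDs, e.g. 'City:1146821', each holding a 'name' field; adjust the key names to whatever the JSON actually contains):
# hypothetical: map city IDs such as "City:1146821" to readable names,
# assuming such entries exist in data['apolloState'] with a 'name' field
city_names = {
    key: value.get('name')
    for key, value in data['apolloState'].items()
    if key.startswith('City:')
}

# then, when building each row, replace the raw ID with the name, e.g.:
# city = city_names.get(location['id']) if location else None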

Tag of Google News title for Beautiful Soup

I am trying to extract the results of a search from Google News ("vaccine", for example) and provide some sentiment analysis based on the headlines collected.
So far, I can't seem to find the correct tag to collect the headlines.
Here is my code:
from textblob import TextBlob
import requests
from bs4 import BeautifulSoup
class Analysis:
    def __init__(self, term):
        self.term = term
        self.subjectivity = 0
        self.sentiment = 0
        self.url = 'https://www.google.com/search?q={0}&source=lnms&tbm=nws'.format(self.term)

    def run(self):
        response = requests.get(self.url)
        print(response.text)
        soup = BeautifulSoup(response.text, 'html.parser')
        headline_results = soup.find_all('div', class_="phYMDf nDgy9d")
        for h in headline_results:
            blob = TextBlob(h.get_text())
            self.sentiment += blob.sentiment.polarity / len(headline_results)
            self.subjectivity += blob.sentiment.subjectivity / len(headline_results)

a = Analysis('Vaccine')
a.run()
print(a.term, 'Subjectivity: ', a.subjectivity, 'Sentiment: ', a.sentiment)
The results are always 0 for the sentiment and 0 for the subjectivity. I feel like the issue is with class_="phYMDf nDgy9d".
If you browse to that link, you see the finished state of the page, but requests.get does not execute JavaScript or load any data beyond the page you request. Luckily there is some data in the response and you can scrape that. I suggest you use an HTML prettifier service like codebeautify to get a better understanding of the page structure.
Also, if you see classes like phYMDf nDgy9d, be sure to avoid searching with them. They are minified class names, so the moment part of the CSS code changes, the class you are looking for gets a new name.
What I did is probably overkill, but I managed to dig down to scrape the specific parts, and your code works now.
When you look at the prettified version of the requested HTML file, the necessary contents are in a div with an id of main. Its children start with a div element for Google Search, continue with a style element, and after one empty div element there are the post div elements. The last two elements in that children list are footer and script elements. We can cut these off with [3:-2], and then under that tree we have pure data (pretty much). If you check the remaining part of the code after the posts variable, you can understand it, I think.
Here is the code:
from textblob import TextBlob
import requests, re
from bs4 import BeautifulSoup
from pprint import pprint
class Analysis:
    def __init__(self, term):
        self.term = term
        self.subjectivity = 0
        self.sentiment = 0
        self.url = 'https://www.google.com/search?q={0}&source=lnms&tbm=nws'.format(self.term)

    def run(self):
        response = requests.get(self.url)
        #print(response.text)
        soup = BeautifulSoup(response.text, 'html.parser')
        mainDiv = soup.find("div", {"id": "main"})
        posts = [i for i in mainDiv.children][3:-2]
        news = []
        for post in posts:
            reg = re.compile(r"^/url.*")
            cursor = post.findAll("a", {"href": reg})
            postData = {}
            postData["headline"] = cursor[0].find("div").get_text()
            postData["source"] = cursor[0].findAll("div")[1].get_text()
            postData["timeAgo"] = cursor[1].next_sibling.find("span").get_text()
            postData["description"] = cursor[1].next_sibling.find("span").parent.get_text().split("· ")[1]
            news.append(postData)
        pprint(news)
        for h in news:
            blob = TextBlob(h["headline"] + " " + h["description"])
            self.sentiment += blob.sentiment.polarity / len(news)
            self.subjectivity += blob.sentiment.subjectivity / len(news)

a = Analysis('Vaccine')
a.run()
print(a.term, 'Subjectivity: ', a.subjectivity, 'Sentiment: ', a.sentiment)
A few outputs:
[{'description': 'It comes after US health officials said last week they had '
'started a trial to evaluate a possible vaccine in Seattle. '
'The Chinese effort began on...',
'headline': 'China embarks on clinical trial for virus vaccine',
'source': 'The Star Online',
'timeAgo': '5 saat önce'},
{'description': 'Hanneke Schuitemaker, who is leading a team working on a '
'Covid-19 vaccine, tells of the latest developments and what '
'needs to be done now.',
'headline': 'Vaccine scientist: ‘Everything is so new in dealing with this '
'coronavirus’',
'source': 'The Guardian',
'timeAgo': '20 saat önce'},
.
.
.
Vaccine Subjectivity: 0.34522727272727277 Sentiment: 0.14404040404040402
[{'description': '10 Cool Tech Gadgets To Survive Working From Home. From '
'Wi-Fi and cell phone signal boosters, to noise-cancelling '
'headphones and gadgets...',
'headline': '10 Cool Tech Gadgets To Survive Working From Home',
'source': 'CRN',
'timeAgo': '2 gün önce'},
{'description': 'Over the past few years, smart home products have dominated '
'the gadget space, with goods ranging from innovative updates '
'to the items we...',
'headline': '6 Smart Home Gadgets That Are Actually Worth Owning',
'source': 'Entrepreneur',
'timeAgo': '2 hafta önce'},
.
.
.
Home Gadgets Subjectivity: 0.48007305194805205 Sentiment: 0.3114683441558441
I used headlines and description data to do the operations but you can play with that if you want. You have the data now :)
Use this:
headline_results = soup.find_all('div', {'class': 'BNeawe vvjwJb AP7Wnd'})
You already printed response.text; if you want to find specific data, search within the response.text output.
Try to use select() instead. CSS selectors are more flexible; see the CSS selectors reference.
Have a look at the SelectorGadget Chrome extension to grab CSS selectors by clicking on the desired element in your browser.
If you want to get all titles and so on, then you are looking for this container:
soup.select('.dbsr')
Make sure to pass a user-agent, because Google might block your requests eventually and you'll receive different HTML, and thus empty output. Check what your user-agent is.
Pass user-agent:
headers = {
    "User-agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
requests.get("YOUR_URL", headers=headers)
I'm not sure exactly what you are trying to do, but the solution from Guven Degirmenci is a bit overkill, as he mentioned, with slicing, regex, and digging into div#main. It's much simpler.
Code and example in the online IDE:
from textblob import TextBlob
import requests
from bs4 import BeautifulSoup
headers = {
    "User-agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

class Analysis:
    def __init__(self, term):
        self.term = term
        self.subjectivity = 0
        self.sentiment = 0
        self.url = f"https://www.google.com/search?q={self.term}&tbm=nws"

    def run(self):
        response = requests.get(self.url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")

        news_data = []
        for result in soup.select('.dbsr'):
            title = result.select_one('.nDgy9d').text
            link = result.a['href']
            source = result.select_one('.WF4CUc').text
            snippet = result.select_one('.Y3v8qd').text
            date_published = result.select_one('.WG9SHc span').text

            news_data.append({
                "title": title,
                "link": link,
                "source": source,
                "snippet": snippet,
                "date_published": date_published
            })

        for h in news_data:
            blob = TextBlob(f"{h['title']} {h['snippet']}")
            self.sentiment += blob.sentiment.polarity / len(news_data)
            self.subjectivity += blob.sentiment.subjectivity / len(news_data)

a = Analysis("Lasagna")
a.run()
print(a.term, "Subjectivity: ", a.subjectivity, "Sentiment: ", a.sentiment)

# Vaccine Subjectivity: 0.3255952380952381 Sentiment: 0.05113636363636363
# Lasagna Subjectivity: 0.36556818181818185 Sentiment: 0.25386093073593075
Alternatively, you can achieve the same thing by using Google News Results API from SerpApi. It's a paid API with a free plan.
The difference in your case is that you don't have to maintain the parser, figure out how to parse certain elements, or work out why something isn't working as it should, and you don't need to understand how to bypass blocks from Google. All that needs to be done is to iterate over structured JSON and quickly get what you want.
Code integrated with your example:
from textblob import TextBlob
import os
from serpapi import GoogleSearch
class Analysis:
    def __init__(self, term):
        self.term = term
        self.subjectivity = 0
        self.sentiment = 0
        self.url = "https://www.google.com/search"

    def run(self):
        params = {
            "engine": "google",
            "tbm": "nws",
            "q": self.term,
            "api_key": os.getenv("API_KEY"),
        }

        search = GoogleSearch(params)
        results = search.get_dict()

        news_data = []
        for result in results['news_results']:
            title = result['title']
            link = result['link']
            snippet = result['snippet']
            source = result['source']
            date_published = result['date']

            news_data.append({
                "title": title,
                "link": link,
                "source": source,
                "snippet": snippet,
                "date_published": date_published
            })

        for h in news_data:
            blob = TextBlob(f"{h['title']} {h['snippet']}")
            self.sentiment += blob.sentiment.polarity / len(news_data)
            self.subjectivity += blob.sentiment.subjectivity / len(news_data)

a = Analysis("Vaccine")
a.run()
print(a.term, "Subjectivity: ", a.subjectivity, "Sentiment: ", a.sentiment)

# Vaccine Subjectivity: 0.30957251082251086 Sentiment: 0.06277056277056277
# Lasagna Subjectivity: 0.30957251082251086 Sentiment: 0.06277056277056277
P.S - I wrote a bit more detailed blog post about how to scrape Google News.
Disclaimer, I work for SerpApi.

How to fix 'KeyError' error in BeautifulSoup

I'm learning how to use BeautifulSoup and I'm trying to read the weather from Google. I'm using this URL.
I'm getting a 'KeyError: "id"' error on the line:
if span.attrs["id"] == "wob_tm":
What does this mean and how can I solve this problem?
I got the same error specifying a different attribute, "class", so I thought it might have just been a problem with the term "class", but I'm still receiving the error no matter what I use.
# Creates a list containing all appearances of the 'span' tag.
# The weather value is located within a span tag.
spans = soup.find_all("span")
for span in spans:
    if span.attrs["id"] == "wob_tm":
        print(span.content)
I expect the output to be the integer value of the weather but when I run the code I just get:
"KeyError: 'id'"
Some span tags don't have that attribute at all, so they give you the error when you try and access that. You could just refine your search:
spans = soup.find_all('span', {'id': 'wob_tm'})
This would find only objects that match. You can then just print them all:
for span in spans:
    print(span.content)
Although the rest of the answers are legit, none will work in this case, because the temperature content is probably loaded using JavaScript, so the spans you're looking for won't be found. Instead you can use Selenium, which works for sure, i.e.:
from selenium import webdriver
driver = webdriver.Chrome('chromedriver.exe')
driver.get("https://www.google.co.uk/search?sxsrf=ACYBGNSfZJRq-EqvQ7rSC0oFZW-FiL-S-Q%3A1571602469929&source=hp&ei=JcCsXb-ANoK4kwWgtK_4DQ&q=what%27s+the+weather+today&oq=whats+the+weather+&gs_l=psy-ab.3.0.0i10i70i256j0i10j0j0i10l3j0l3j0i10.663.2962..4144...0.0..0.82.1251.19......0....1..gws-wiz.....10..35i362i39j35i39j0i131.AWESAgn5njA")
temp = driver.find_element_by_id('wob_tm').text
print(temp)
The problem is that there is no 'id' key in the 'attrs' dictionary. The code below will handle this case.
spans = soup.find_all("span")
for span in spans:
if span.attrs.get("id") == "wob_tm":
print(span.content)
else:
print('not wob_tm')
Weather data is not rendered with JavaScript as Kostas Charitidis mentioned.
You don't need to specify <span> element, and more over you don't need to use find_all()/findAll()/select() since you're looking just for one element that doesn't repeat anywhere else. Use select_one() instead:
soup.select_one('#wob_tm').text
# prints temperature
You can also use try/except if you want to return None:
try:
    temperature = soup.select_one('#wob_tm').text
except:
    temperature = None
An if statement always costs you something, while it's nearly free to set up a try/except block. But when an exception actually occurs, its cost is much higher.
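As a rough illustration of that trade-off (timings will vary by machine; this simply times a dictionary lookup both ways with timeit):
import timeit

attrs = {"class": "some-class"}  # no "id" key, like many of those <span> tags

# checking with an if/in test first: a small, fixed cost on every lookup
if_version = timeit.timeit(
    'attrs["id"] if "id" in attrs else None',
    globals={"attrs": attrs},
    number=1_000_000,
)

# try/except: nearly free to set up, but expensive each time the exception is raised
try_version = timeit.timeit(
    """
try:
    attrs["id"]
except KeyError:
    pass
""",
    globals={"attrs": attrs},
    number=1_000_000,
)

print(f"if/in check: {if_version:.3f}s, try/except (always raising): {try_version:.3f}s")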
The next problem that might cause that error is that no user-agent is specified, so Google will eventually block your request and you'll receive completely different HTML. I already answered a question about what a user-agent is.
Code and full example in the online IDE:
from bs4 import BeautifulSoup
import requests, lxml
headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    "q": "london weather",
    "hl": "en",
    "gl": "us"
}

response = requests.get('https://www.google.com/search', headers=headers, params=params)
soup = BeautifulSoup(response.text, 'lxml')

weather_condition = soup.select_one('#wob_dc').text
temperature = soup.select_one('#wob_tm').text
precipitation = soup.select_one('#wob_pp').text
humidity = soup.select_one('#wob_hm').text
wind = soup.select_one('#wob_ws').text
current_time = soup.select_one('#wob_dts').text

print(f'Weather condition: {weather_condition}\n'
      f'Temperature: {temperature}°F\n'
      f'Precipitation: {precipitation}\n'
      f'Humidity: {humidity}\n'
      f'Wind speed: {wind}\n'
      f'Current time: {current_time}\n')
----
'''
Weather condition: Mostly cloudy
Temperature: 60°F
Precipitation: 3%
Humidity: 77%
Wind speed: 3 mph
Current time: Friday 7:00 AM
'''
Alternatively, you can achieve this by using the Google Direct Answer Box API from SerpApi. It's a paid API with a free plan.
The difference in your case is that you don't have to figure out how to extract elements, since it's already done for the end user, and there's no need to maintain a parser over time. All that needs to be done is to iterate over structured JSON and get what you were looking for.
Code to integrate:
from serpapi import GoogleSearch
import os
params = {
    "engine": "google",
    "q": "london weather",
    "api_key": os.getenv("API_KEY"),
    "hl": "en",
    "gl": "us",
}

search = GoogleSearch(params)
results = search.get_dict()

loc = results['answer_box']['location']
weather_date = results['answer_box']['date']
weather = results['answer_box']['weather']
temp = results['answer_box']['temperature']
precipitation = results['answer_box']['precipitation']
humidity = results['answer_box']['humidity']
wind = results['answer_box']['wind']

print(f'{loc}\n{weather_date}\n{weather}\n{temp}°F\n{precipitation}\n{humidity}\n{wind}\n')
-------
'''
District 3
Friday
Mostly sunny
80°F
0%
52%
5 mph
'''
Disclaimer, I work for SerpApi.

How To Scrape Similar Classes With One Different Attribute

Searched around on SO, but couldn't find anything for this.
I'm scraping using beautifulsoup... This is the code I'm using which I found on SO:
for section in soup.findAll('div', attrs={'id': 'dmusic_tracklist_track_title_B00KHQOKGW'}):
    nextNode = section
    while True:
        nextNode = nextNode.nextSibling
        try:
            tag_name = nextNode.name
        except AttributeError:
            tag_name = ""
        if tag_name == "a":
            print nextNode.text()
        else:
            print "*****"
            break
If I went to this 50 Cent album (Animal Ambition: An Untamed Desire To Win) and wanted to scrape each song, how would I do so? The problem is that each song has a different ID associated with it, based on its product code. For example, here is the XPath of the first two songs' titles: //*[@id="dmusic_tracklist_track_title_B00KHQOKGW"]/div/a/text() and //*[@id="dmusic_tracklist_track_title_B00KHQOLWK"]/div/a/text().
You'll notice the end of the first id is B00KHQOKGW, while the second is B00KHQOLWK. Is there a way I can add a "wild card" to the end of the id to grab each of the songs no matter what product id is at the end? For example, something like id="dmusic_tracklist_track_title_*"... I replaced the product ID with a *.
Or can I use a div to target the title I want, like this? (I feel like this would be the best. It uses the div's class right above the title. There isn't any product ID in it):
for section in soup.findAll('div', attrs={'class': 'a-section a-spacing-none overflow_ellipsis'}):
    nextNode = section
    while True:
        nextNode = nextNode.nextSibling
        try:
            tag_name = nextNode.name
        except AttributeError:
            tag_name = ""
        if tag_name == "a":
            print nextNode.text()
        else:
            print "*****"
            break
You can pass a function as an id attribute value and check if it starts with dmusic_tracklist_track_title_:
from bs4 import BeautifulSoup
import requests
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.122 Safari/537.36'}
response = requests.get('http://www.amazon.com/dp/B00KHQOI8C/?tag=stackoverfl08-20', headers=headers)
soup = BeautifulSoup(response.content)
for song in soup.find_all(id=lambda x: x and x.startswith('dmusic_tracklist_track_title_')):
    print song.text.strip()
Prints:
Hold On [Explicit]
Don't Worry 'Bout It [feat. Yo Gotti] [Explicit]
Animal Ambition [Explicit]
Pilot [Explicit]
Smoke [feat. Trey Songz] [Explicit]
Everytime I Come Around [feat. Kidd Kidd] [Explicit]
Irregular Heartbeat [feat. Jadakiss] [Explicit]
Hustler [Explicit]
Twisted [feat. Mr. Probz] [Explicit]
Winners Circle [feat. Guordan Banks] [Explicit]
Chase The Paper [feat. Kidd Kidd] [Explicit]
Alternatively, you can pass a regular expression pattern as an attribute value:
import re
from bs4 import BeautifulSoup
import requests
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.122 Safari/537.36'}
response = requests.get('http://www.amazon.com/dp/B00KHQOI8C/?tag=stackoverfl08-20', headers=headers)
soup = BeautifulSoup(response.content)
for song in soup.find_all(id=re.compile('^dmusic_tracklist_track_title_\w+$')):
    print song.text.strip()
^dmusic_tracklist_track_title_\w+$ would match dmusic_tracklist_track_title_ followed by 1 or more "alphanumeric" (0-9a-zA-Z and _) characters.
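As a further option (a sketch, assuming a BeautifulSoup version whose select() supports the ^= "starts with" attribute selector), a CSS attribute selector expresses the same wildcard idea:
# same idea with a CSS attribute selector: match any id starting with the common prefix
for song in soup.select('div[id^="dmusic_tracklist_track_title_"]'):
    print(song.text.strip())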
