I am trying to scrape the title, description, and URL from a Google search page using BeautifulSoup and Python.
from selenium import webdriver  # the original omits the Selenium setup; assuming a Chrome driver
from bs4 import BeautifulSoup

driver = webdriver.Chrome()

query = input("Enter your value: ")
print("Search Term: " + query)  # query = 'Python'

links = []         # empty lists to capture the final results
titles = []
descriptions = []

# Number of Google result pages to fetch; each page contains 10 links
n_pages = 6
for page in range(1, n_pages):
    url = "http://www.google.com/search?q=" + query + "&start=" + str((page - 1) * 10)
    print("Link : " + url)
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    try:
        search = soup.find_all('div', class_="yuRUbf")
        for link in search:
            links.append(link.a['href'])
        description = soup.find_all('div', class_="VwiC3b yXK7lf MUxGbd yDYNvb lyLwlc lEBKkf")
        for d in description:
            descriptions.append(d.span.text)
        title = soup.find_all('div', class_="yuRUbf")
        for t in title:
            titles.append(t.h3.text)
    # Skip to the next page if an element is not present
    except:
        continue

print(links)
print(len(links))
print(descriptions)
print(len(descriptions))
print(titles)
print(len(titles))
The descriptions are getting stored in their list, but the links and titles lists are empty. I inspected the elements and I am using the correct classes, but I am still unable to get the data.
Can someone help me figure out what I am doing wrong?
Personally, I find working with many separate lists irritating and cumbersome when the content can be stored directly in a structured form. In any case, you can get the information in a more generic way, without selecting the dynamic classes:
for r in soup.select('#search a h3'):
    data.append({
        'title': r.text,
        'url': r.parent['href'],
        'desc': r.parent.parent.nextSibling.span.text if r.parent.parent.nextSibling.span else 'no desc'
    })
Example
from bs4 import BeautifulSoup
import requests

query = input("Enter your value: ")
print("Search Term: " + query)  # query = 'Python'

data = []
n_pages = 6
for page in range(1, n_pages):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    url = f'http://www.google.com/search?q={query}&start={(page - 1) * 10}'
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')

    for r in soup.select('#search a h3'):
        data.append({
            'title': r.text,
            'url': r.parent['href'],
            'desc': r.parent.parent.nextSibling.span.text if r.parent.parent.nextSibling.span else 'no desc'
        })

data
Output
[{'title': 'Welcome to Python.org',
'url': 'https://www.python.org/',
'desc': 'The official home of the Python Programming Language.'},
{'title': 'Python (Programmiersprache) - Wikipedia',
'url': 'https://de.wikipedia.org/wiki/Python_(Programmiersprache)',
'desc': 'Python ([ˈpʰaɪθn̩], [ ˈpʰaɪθɑn], auf Deutsch auch [ ˈpʰyːtɔn]) ist eine universelle, üblicherweise interpretierte, höhere Programmiersprache.'},
{'title': 'Pythons - Wikipedia',
'url': 'https://de.wikipedia.org/wiki/Pythons',
'desc': 'Die Pythons (Pythonidae; altgr. Πύθων Pythōn; Einzahl der, allgemeinsprachlich auch die Python) sind eine Familie von Schlangen aus der Überfamilie der\xa0...'},
{'title': 'Das Python-Tutorial — Das Python3.3-Tutorial auf Deutsch',
'url': 'https://py-tutorial-de.readthedocs.io/',
'desc': 'Python ist eine einfach zu lernende, aber mächtige Programmiersprache mit effizienten abstrakten Datenstrukturen und einem einfachen, aber effektiven Ansatz\xa0...'},...]
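Since the scraped rows already land in a list of dicts, persisting them is straightforward. A minimal sketch using the standard csv module; the fieldnames mirror the keys built above, and the output filename results.csv is just my choice:
import csv

# assumes `data` is the list of dicts built in the loop above
with open('results.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['title', 'url', 'desc'])
    writer.writeheader()
    writer.writerows(data)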
I am trying to scrape the image and news URLs from this website. The tags I have defined are
root_tag = ["div", {"class": "ngp_col ngp_col-bottom-gutter-2 ngp_col-md-6 ngp_col-lg-4"}]
image_tag = ["div", {"class": "low-rez-image"}, "url"]
news_tag = ["a", {"": ""}, "href"]
The variable url holds the site URL, and my code for scraping the website is:
import re
import requests
from bs4 import BeautifulSoup

ua1 = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
ua2 = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome'
headers = {'User-Agent': ua2,
           'Accept': 'text/html,application/xhtml+xml,application/xml;'
                     'q=0.9,image/webp,*/*;q=0.8'}

session = requests.Session()
response = session.get(url, headers=headers)
webContent = response.content
bs = BeautifulSoup(webContent, 'lxml')

all_tab_data = bs.findAll(root_tag[0], root_tag[1])
result = []
for div in all_tab_data:
    try:
        news_url = div.find(news_tag[0], news_tag[1]).get(news_tag[2])
    except Exception as e:
        news_url = None
    try:
        image_url = None
        div_img = str(div)
        match = re.search(r"(http(s?):)([/|.|\w|\s|-])*\.(?:jpg|gif|png|jpeg)", div_img)
        if match is not None:
            image_url = str(match.group(0))
        else:
            image_url = div.find(image_tag[0], image_tag[1]).get(image_tag[2])
    except Exception as e:
        image_url = None
    result.append([news_url, image_url])
I debugged the code and found that all_tab_data is empty, even though I am choosing the correct root_tag, so I don't know what I am doing wrong.
The content is loaded from a JSON endpoint.
You can get all the image urls this way:
import requests

url = "https://www.nationalgeographic.com/magazine/_jcr_content/content/promo-carousel.promo-carousel.json"
data = requests.get(url).json()

for item in data:
    for sub_item in item['promo_carousel']:
        p_img = sub_item['promo_image']
        if p_img is not None:
            print(p_img['image']['uri'])
Output:
https://www.nationalgeographic.com/content/dam/animals/2020/09/african-cheetah-snow/african-cheetah-snow-2.jpg
https://www.nationalgeographic.com/content/dam/animals/2020/09/wallaby-atrazine/wallaby-og-a0xh8r-01.jpg
https://www.nationalgeographic.com/content/dam/animals/2020/09/elephant-tuberculosis/r40bfj.jpg
https://www.nationalgeographic.com/content/dam/animals/2020/08/handfish/01-handfish-minden_90392182.jpg
https://www.nationalgeographic.com/content/dam/science/2020/09/08/cal-fire-update/california-fire-palley-mm9468_200905_000229.jpg
https://www.nationalgeographic.com/content/dam/science/2020/09/11/face-mask-recognition/20200901_002_out_mp4_00_00_03_18_still003.jpg
https://www.nationalgeographic.com/content/dam/science/2020/09/10/winds-fires-california/winds-fires-california-2019.jpg
https://www.nationalgeographic.com/content/dam/science/2020/09/10/fire-air-quality/fire-air-pollution-20253854760329.jpg
https://www.nationalgeographic.com/content/dam/science/2020/09/02/autopsy/mm9412_200717_000522.jpg
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/10/departments/explore/stellar-map-milky-way-og.png
https://www.nationalgeographic.com/content/dam/science/2020/07/31/vaccine/vaccine_20209514426186.jpg
https://www.nationalgeographic.com/content/dam/archaeologyandhistory/rights-exempt/history-magazine/2020/09-10/metric-system/og-french-metric-system.jpg
https://www.nationalgeographic.com/content/dam/archaeologyandhistory/rights-exempt/OG/red-terror-explainer-og.jpg
https://www.nationalgeographic.com/content/dam/archaeologyandhistory/rights-exempt/OG/promo-medieval-pandemic.jpg
https://www.nationalgeographic.com/content/dam/archaeologyandhistory/2020/09/Asian-American-COVID/og_asianamerican.jpg
https://www.nationalgeographic.com/content/dam/archaeologyandhistory/2020/08/goodbye-hong-kong/19-hong-kong-security-law-china.jpg
https://www.nationalgeographic.com/content/dam/travel/commercial/2020/samsung/wyoming/samsung-wyoming-mountain.jpg
https://www.nationalgeographic.com/content/dam/travel/2020-digital/kissing-tourism-sites/gettyimages-3332297.jpg
https://www.nationalgeographic.com/content/dam/travel/2020-digital/thinking-about-traveling/nationalgeographic_1085186.jpg
https://www.nationalgeographic.com/content/dam/science/commercial/2019/domestic/wyss-foundation/wyss-foundation_cfn_natgeo-image-collection_1971120.jpg
https://www.nationalgeographic.com/content/dam/travel/2020-digital/least-visited-US-national-parks/nationalgeographic_2466315.jpg
EDIT: To get the title and article data, use this:
for item in data:
    for sub_item in item['promo_carousel']:
        print(f"{sub_item['components'][0]['title']['text']}"
              f"\n{sub_item['uri']}")
        p_img = sub_item['promo_image']
        if p_img is not None:
            print(f"{p_img['image']['uri']}")
        print("-" * len(sub_item['uri']))
Prints (shortened for brevity):
Rare photographs show African cheetahs in snowstorm
https://www.nationalgeographic.com/animals/2020/09/cheetahs-snow-south-africa/
https://www.nationalgeographic.com/content/dam/animals/2020/09/african-cheetah-snow/african-cheetah-snow-2.jpg
------------------------------------------------------------------------------
Wallabies exposed to common weed killer have reproductive abnormalities
https://www.nationalgeographic.com/animals/2020/09/wallaby-sexual-development-impaired-by-atrazine-herbicide/
https://www.nationalgeographic.com/content/dam/animals/2020/09/wallaby-atrazine/wallaby-og-a0xh8r-01.jpg
-------------------------------------------------------------------------------------------------------------
...
Another solution:
import json
import requests
from bs4 import BeautifulSoup

url = 'https://www.nationalgeographic.com/magazine/'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

data = json.loads(soup.select_one('[data-pestle-module="Lead"] [data-pestle-options]').string)

# to print all data uncomment next line:
# print(json.dumps(data, indent=4))

for b in data['body']:
    if 'multilayout_promo_beta' not in b:
        continue
    for s in b['multilayout_promo_beta']['stories']:
        if not s.get('lead_media'):
            continue
        if 'immersive_lead' not in s['lead_media']:
            print(s['components'][0]['title']['text'])
            print(s['lead_media']['image']['uri'])
        else:
            print(s['lead_media']['immersive_lead']['title'])
            print(s['lead_media']['immersive_lead']['lead_media']['image']['uri'])
        print(s['uri'])
        print('-' * 80)
Prints:
America’s neglected hiking trails are more popular than ever—but they’re struggling
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/10/us-hiking-trails/us-hiking-trails-campfire-valley.jpg
https://www.nationalgeographic.com/magazine/2020/10/america-long-neglected-hiking-trails-are-more-popular-than-ever-but-they-are-struggling-feature/
--------------------------------------------------------------------------------
The heroic effort in the Amazon to save one of the world’s largest eagles
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/10/saving-largest-eagle/harpy-eagles-brazil-14a.jpg
https://www.nationalgeographic.com/animals/2020/04/saving-worlds-largest-eagle/
--------------------------------------------------------------------------------
The robot revolution has arrived
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/09/rise-of-the-machines/mm8612_190408_00122-3.jpg
https://www.nationalgeographic.com/magazine/2020/09/the-robot-revolution-has-arrived-feature/
--------------------------------------------------------------------------------
They may look goofy, but ostriches are nobody’s fool
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/09/ostriches/ostriches-standing-tall-male-beach.jpg
https://www.nationalgeographic.com/magazine/2020/09/they-may-look-goofy-but-ostriches-are-nobodys-fool-feature/
--------------------------------------------------------------------------------
The Great Lakes depend on ice. This winter, they barely froze.
https://www.nationalgeographic.com/content/dam/science/2020/03/19/no-ice/year-with-no-ice-sacka-46.jpg
https://www.nationalgeographic.com/science/2020/03/great-lakes-depend-on-winter-ice-low-cover/
--------------------------------------------------------------------------------
‘I put my camera to my face and cried.’ Documenting a COVID-19 hot spot
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/10/departments/coronavirus/departements-detroit-singer-funeral.jpg
https://www.nationalgeographic.com/magazine/2020/10/danny-wilcox-frazier-on-photographing-covid-19-in-detroit/
--------------------------------------------------------------------------------
COVID-19’s impact on the animal kingdom—so far
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/10/departments/coronavirus/departments-covid-animals-tiger.jpg
https://www.nationalgeographic.com/magazine/2020/10/covid-19s-impact-on-the-animal-kingdom-so-far/
--------------------------------------------------------------------------------
To prevent the next deadly disease, we must stop harming nature
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/09/departments/coronavirus/departments-coronavirus-coral-reef.jpg
https://www.nationalgeographic.com/magazine/2020/09/pristine-seas-enric-sala-we-must-stop-harming-nature-to-prevent-deadly-disease-coronavirus/
--------------------------------------------------------------------------------
Beyond masks and gloves—here’s how the pros handle dangerous microbes
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/09/departments/coronavirus/tool-kit-covid-testing.jpg
https://www.nationalgeographic.com/magazine/2020/09/beyond-masks-and-gloves-here-is-how-the-pros-handle-dangerous-microbes/
--------------------------------------------------------------------------------
NASA sent a map to space to help aliens find Earth. Now it needs an update.
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/10/departments/explore/departments-stellar-map-galaxy.jpg
https://www.nationalgeographic.com/magazine/2020/10/nasa-sent-a-map-to-space-to-help-aliens-find-earth-now-it-needs-an-update/
--------------------------------------------------------------------------------
This archaeologist hunts DNA from prehistoric diseases
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/10/departments/coronavirus/departments-genius-rifkin.jpg
https://www.nationalgeographic.com/magazine/2020/10/archaeologist-riaan-rifkin-hunts-dna-from-prehistoric-diseases/
--------------------------------------------------------------------------------
See the ingenious cameras used to photograph elusive animals
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/10/departments/explore/departments-artifact-crittercam-wooden-fin.jpg
https://www.nationalgeographic.com/magazine/2020/10/see-the-ingenious-crittercams-used-to-photograph-elusive-animals/
--------------------------------------------------------------------------------
Popsicles and belly rubs: The joys of watching a panda grow up
https://www.nationalgeographic.com/content/dam/magazine/rights-exempt/2020/09/departments/explore/explore-essay-panda-stretching.jpg
https://www.nationalgeographic.com/magazine/2020/09/popsicles-and-belly-rubs-the-joys-of-watching-panda-bei-bei-grow-up/
--------------------------------------------------------------------------------
I'm starting to work with Python again after 8 years. I'm trying to write a program with BeautifulSoup that takes an array argument. I pass the array argument medios to the function count_words, but it doesn't work. Is there a way to fix it, or to search for a word on multiple websites using BeautifulSoup?
import requests
from bs4 import BeautifulSoup

def count_words(url, the_word):
    r = requests.get(url, allow_redirects=False)
    soup = BeautifulSoup(r.content, 'lxml')
    words = soup.find(text=lambda text: text and the_word in text)
    # print(words)
    return len(words)

def main():
    url = 'https://www.nytimes.com/'
    medios = {
        'Los Angeles Times': ['http://www.latimes.com/'],
        'New York Times': ['http://www.nytimes.com/']
    }
    word = 'Trump'
    # count = count_words(url, word)
    cuenta = count_words(medios, word)
    # print('\n El Sitio: {}\n Contiene {} occurrencias de la palabra: {}'.format(url, count, word))
    print('\n La palabra: {} aparece {} occurrencias en el New York Times'.format(word, cuenta))

if __name__ == '__main__':
    main()
There are 3 problems here:
1. medios is a dict, so you have to loop through its keys and values before calling the function, which only accepts a URL string.
2. BeautifulSoup's find returns only the first matching element (or None if nothing matches), so calling len() on its result does not give an occurrence count. If you want to count occurrences of the word, use count on the page text instead.
3. You have to send a User-Agent header with the request, or you will get a 403 or 301 response.
import requests
from bs4 import BeautifulSoup

headers = {'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"}

def count_words(url, the_word):
    r = requests.get(url, headers=headers)
    return r.text.lower().count(the_word)

def main():
    medios = {
        'Los Angeles Times': ['http://www.latimes.com/'],
        'New York Times': ['http://www.nytimes.com/']
    }
    word = 'trump'
    for web_name, urls in medios.items():
        for url in urls:
            cuenta = count_words(url, word)
            print('La palabra: {} aparece {} occurrencias en el {}'.format(word, cuenta, web_name))

if __name__ == '__main__':
    main()
Output:
La palabra: trump aparece 47 occurrencias en el Los Angeles Times
La palabra: trump aparece 194 occurrencias en el New York Times
You are sending a dictionary to count_words(). You need to send the URLs in a loop, or else loop through the dictionary inside count_words().
Perhaps you meant:
cuenta = count_words(url, word)
Update your code to the following:
cuenta = 0
for key in medios:
    for url in medios[key]:
        cuenta += count_words(url, word)
Basically, you should pass a URL, not a dict; I am assuming you want to count the word across all the entries in medios.
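For completeness, a minimal runnable sketch of that idea. Note that I have replaced the body of count_words with a plain text count (the original find-based version did not return an occurrence count, as the other answer points out), and the User-Agent header and the final aggregate print are my own additions:
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}  # some sites return 403 without one

def count_words(url, the_word):
    r = requests.get(url, headers=headers, allow_redirects=False)
    soup = BeautifulSoup(r.content, 'lxml')
    # count occurrences of the word in the visible page text
    return soup.get_text().count(the_word)

medios = {
    'Los Angeles Times': ['http://www.latimes.com/'],
    'New York Times': ['http://www.nytimes.com/']
}
word = 'Trump'

cuenta = 0
for key in medios:
    for url in medios[key]:
        cuenta += count_words(url, word)

print('La palabra {} aparece {} veces en total'.format(word, cuenta))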
I'm new to BeautifulSoup in Python and I'm trying to extract certain information from a website; in detail, the url and the title.
I used BeautifulSoup to extract the JSON, which I successfully did, but I'm unsure about the next steps for getting the url and title.
I have not managed to extract the desired information yet. I hope you guys can help me out.
That is my logic so far:
import json
import requests
from bs4 import BeautifulSoup

session = requests.Session()
session.cookies.get_dict()
url = 'http://www.citydis.com/'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
response = session.get(url, headers=headers)
soup = BeautifulSoup(response.content, "html.parser")

metaConfig = soup.find("meta", property="configuration")
metaConfigTxt = metaConfig["content"]
csrf = json.loads(metaConfigTxt)["pageToken"]

jsonUrl = "https://www.citydis.com/s/results.json?&q=London&customerSearch=1&page=0"
headers.update({'X-Csrf-Token': csrf})
response = session.get(jsonUrl, headers=headers)
print(response.content)
And that is the output:
b'{"searchResults":{"customer":null,"signupUrl":"\\/signup\\/?pos=activityCard","isMobile":false,"tours":[{"tourId":5459,"title":"Ticket f\\u00fcr Coca-Cola London Eye 4D-Erlebnis","url":"https:\\/\\/www.getyourguide.de\\/london-l57\\/ohne-anstehen-edf-london-eye-4d-erlebnis-t5459\\/","price":{"original":"27,10\\u00a0\\u20ac","min":"27,10\\u00a0\\u20ac","type":"individual"},"horizontalImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-412120-70.jpg","horizontalAlternativeImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-412120-85.jpg","verticalImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-412120-92.jpg","mobileImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-412120-53.jpg","horizontalSlimImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-412120-67.jpg","highlightedDetailedImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-412120-91.jpg","smallDescription":"Sehen Sie London aus einer anderen Perspektive vom London Eye aus und genie\\u00dfen Sie beim neuen 4D-Erlebnis einen bahnbrechenden 3D-Film mit\\u2026","description":"Sehen Sie London aus einer anderen Perspektive vom London Eye aus und genie\\u00dfen Sie beim neuen 4D-Erlebnis einen bahnbrechenden 3D-Film mit spektakul\\u00e4ren Spezialeffekten, einschlie\\u00dflich Wind und Nebel. Genie\\u00dfen Sie au\\u00dferdem bevorzugten Einlass am Eingang.","isBestseller":false,"isFeatured":false,"languageIds":[],"hasDeal":false,"dealMaxPercentage":0,"isBoostedNewTour":false,"hasBanner":false,"hasRibbon":false,"priceTag":true,"detailsLink":false,"isCertifiedPartner":true,"hasFencedDiscountDeal":false,"hasFreeCancellation":false,"hasRating":true,"averageRating":"4,5","totalRating":1633,"totalRatingTitle":"1633 Bewertungen","averageRatingClass":"45","ratingLink":"","ratingStyleModifier":"","ratingStarsClasses":"","ratingTitle":"Bewertung: 4,5 von 5","hasDuration":true,"duration":"40 Minuten","displayAbstract":true,"displayDuration":true,"displayDate":false,"displayWishlist":false,"displayRemoveButton":false,"hasDiscountedRecommendation":false,"hideImage":false,"isSkipTheLine":false,"likelyToSellOutBadge":true,"isPromoted":false,"isSpecialOffer":false,"experiments":{"hasRatingsExperiment":false,"numericRatingLabel":"Basierend auf 1633 Bewertungen","verticalImageForPriceSegmentation":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-412120-150.jpg"},"id":"searchResults","activityCardVersion":"horizontal","limit":false,"likelyToSellOutExperiment":{"deviceDetector":{}},"hasNumericReviews":true,"resultSetPosition":0,"activityCardStyle":"plain","highlightedOrientation":"horizontal"},{"tourId":51268,"title":"Bustransfer: Flughafen Stansted - Stadtzentrum London","url":"https:\\/\\/www.getyourguide.de\\/london-l57\\/bustransfer-flughafen-stansted-stadtzentrum-london-t51268\\/","price":{"original":"9,43\\u00a0\\u20ac","min":"9,43\\u00a0\\u20ac","type":"individual"},"horizontalImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-451822-70.jpg","horizontalAlternativeImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-451822-85.jpg","verticalImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-451822-92.jpg","mobileImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-451822-53.jpg","horizontalSlimImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-451822-67.jpg","highlightedDetailedImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-451822-91.jpg","smallDescription":"Beginnen oder beenden Sie Ihren Aufenthalt in London mit dem praktischen 
Bustransfer zwischen dem Flughafen Stansted und dem Stadtzentrum London.\\u2026","description":"Beginnen oder beenden Sie Ihren Aufenthalt in London mit dem praktischen Bustransfer zwischen dem Flughafen Stansted und dem Stadtzentrum London. Sparen Sie sich die Fahrt mit \\u00f6ffentlichen Verkehrsmitteln und erreichen Sie London schnell und bequem.","isBestseller":false,"isFeatured":false,"languageIds":[],"hasDeal":false,"dealMaxPercentage":0,"isBoostedNewTour":false,"hasBanner":false,"hasRibbon":false,"priceTag":true,"detailsLink":false,"isCertifiedPartner":false,"hasFencedDiscountDeal":false,"hasFreeCancellation":true,"hasRating":true,"averageRating":"4,4","totalRating":541,"totalRatingTitle":"541 Bewertungen","averageRatingClass":"45","ratingLink":"","ratingStyleModifier":"","ratingStarsClasses":"","ratingTitle":"Bewertung: 4,4 von 5","hasDuration":true,"duration":"60 Minuten \\u2013 90 Minuten","displayAbstract":true,"displayDuration":true,"displayDate":false,"displayWishlist":false,"displayRemoveButton":false,"hasDiscountedRecommendation":false,"hideImage":false,"isSkipTheLine":false,"likelyToSellOutBadge":true,"isPromoted":false,"isSpecialOffer":false,"experiments":{"hasRatingsExperiment":false,"numericRatingLabel":"Basierend auf 541 Bewertungen","verticalImageForPriceSegmentation":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-451822-150.jpg"}
What I would like to get out is the title and url only. For example:
title":"Ticket f\\u00fcr Coca-Cola London Eye 4D-Erlebnis","url":"https:\\/\\/www.getyourguide.de\\/london-l57\\/ohne-anstehen-edf-london-eye-4d-erlebnis-t5459
Any feedback is much appreciated.
UPDATE
Thanks to the feedback, I was able to make progress.
I'm now able to get the desired result, but now I have the issue that I'm only getting one result back instead of all the available ones:
js_dict = json.loads(response.content.decode('utf-8'))

url = js_dict['searchResults']["tours"][0]["url"]
print(url)
title = js_dict['searchResults']["tours"][0]["title"]
print(title)
price = js_dict['searchResults']["tours"][0]["price"]["original"]
print(price)
The output is the following:
https://www.citydis.de/london-l57/ohne-anstehen-edf-london-eye-4d-erlebnis-t5459/
Ticket für Coca-Cola London Eye 4D-Erlebnis
27,10 €
I would like to get back all the titles, prices, and URLs of the sights that are in the JSON. I tried a for loop, but somehow it does not work.
Any feedback appreciated.
UPDATE 2
Found a solution:
jsonUrl = "https://www.citydis.com/s/results.json?&q=London&customerSearch=1&page=0"
headers.update({'X-Csrf-Token': csrf})
response = session.get(jsonUrl, headers=headers)
js_dict = json.loads(response.content.decode('utf-8'))

for item in js_dict:
    headers = js_dict['searchResults']["tours"]
    prices = js_dict['searchResults']["tours"]
    urls = js_dict['searchResults']["tours"]
    for title, price, url in zip(headers, prices, urls):
        title_final = title.get("title")
        url_final = url.get("url")
        price_final = price.get("price")["original"]
        print("Header: " + title_final + " | " + "Deeplink: " + url_final + " | " + "Price: " + price_final)
The response body, response.content, is indeed the JSON output. You can import the json module and parse the JSON with a statement like
js_dict = json.loads(response.content)
This will parse the JSON and produce a Python dictionary in js_dict. You can then use standard dictionary subscripting techniques to access and display the fields of interest.
Because this is such a common requirement, the response object has a json method that will do this decoding for you. You could, therefore, simply write
js_dict = response.json()
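Tying this back to the UPDATE 2 snippet: the three lists there all reference the same "tours" array, so a single loop over it is enough. A minimal sketch of the same output using response.json(), under the same assumptions about the JSON shape shown in the question:
js_dict = response.json()

# each entry in "tours" already carries its own title, url, and price
for tour in js_dict['searchResults']['tours']:
    print('Header: {} | Deeplink: {} | Price: {}'.format(
        tour['title'], tour['url'], tour['price']['original']))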
Searched around on SO, but couldn't find anything for this.
I'm scraping using BeautifulSoup... This is the code I'm using, which I found on SO:
for section in soup.findAll('div', attrs={'id': 'dmusic_tracklist_track_title_B00KHQOKGW'}):
    nextNode = section
    while True:
        nextNode = nextNode.nextSibling
        try:
            tag_name = nextNode.name
        except AttributeError:
            tag_name = ""
        if tag_name == "a":
            print(nextNode.text)
        else:
            print("*****")
            break
If I went to this 50 Cent album (Animal Ambition: An Untamed Desire To Win) and wanted to scrape each song, how would I do so? The problem is that each song has a different ID associated with it, based on its product code. For example, here are the XPaths of the first two songs' titles: //*[@id="dmusic_tracklist_track_title_B00KHQOKGW"]/div/a/text() and //*[@id="dmusic_tracklist_track_title_B00KHQOLWK"]/div/a/text().
You'll notice the first id ends in B00KHQOKGW, while the second ends in B00KHQOLWK. Is there a way I can add a wildcard to the end of the id to grab each of the songs, no matter what product ID is at the end? For example, something like id="dmusic_tracklist_track_title_*", where I replaced the product ID with a *.
Or can I use a div to target the title I want, like this? (I feel like this would be the best approach: it uses the div's class right above the title, and there isn't any product ID in it.)
for section in soup.findAll('div', attrs={'class': 'a-section a-spacing-none overflow_ellipsis'}):
    nextNode = section
    while True:
        nextNode = nextNode.nextSibling
        try:
            tag_name = nextNode.name
        except AttributeError:
            tag_name = ""
        if tag_name == "a":
            print(nextNode.text)
        else:
            print("*****")
            break
You can pass a function as an id attribute value and check if it starts with dmusic_tracklist_track_title_:
from bs4 import BeautifulSoup
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.122 Safari/537.36'}
response = requests.get('http://www.amazon.com/dp/B00KHQOI8C/?tag=stackoverfl08-20', headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

for song in soup.find_all(id=lambda x: x and x.startswith('dmusic_tracklist_track_title_')):
    print(song.text.strip())
Prints:
Hold On [Explicit]
Don't Worry 'Bout It [feat. Yo Gotti] [Explicit]
Animal Ambition [Explicit]
Pilot [Explicit]
Smoke [feat. Trey Songz] [Explicit]
Everytime I Come Around [feat. Kidd Kidd] [Explicit]
Irregular Heartbeat [feat. Jadakiss] [Explicit]
Hustler [Explicit]
Twisted [feat. Mr. Probz] [Explicit]
Winners Circle [feat. Guordan Banks] [Explicit]
Chase The Paper [feat. Kidd Kidd] [Explicit]
Alternatively, you can pass a regular expression pattern as an attribute value:
import re
from bs4 import BeautifulSoup
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.122 Safari/537.36'}
response = requests.get('http://www.amazon.com/dp/B00KHQOI8C/?tag=stackoverfl08-20', headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

for song in soup.find_all(id=re.compile(r'^dmusic_tracklist_track_title_\w+$')):
    print(song.text.strip())
^dmusic_tracklist_track_title_\w+$ would match dmusic_tracklist_track_title_ followed by 1 or more "word" characters (0-9, a-z, A-Z, and _).
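As a side note (an addition of mine, not part of the original answer): recent bs4 versions ship with the soupsieve CSS engine, so the same prefix match can also be written as a CSS attribute selector:
# [id^="..."] matches elements whose id starts with the given prefix
for song in soup.select('[id^="dmusic_tracklist_track_title_"]'):
    print(song.text.strip())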