I'm new to BeautifulSoup in Python and I'm trying to extract certain information from a website; specifically, the URL and the title.
I used BeautifulSoup to extract the JSON, which I did successfully, but I'm unsure about the next steps: how to get the URL and title.
I have not managed to extract the desired information yet. I hope you guys can help me out.
This is my logic so far:
import json
import requests
from bs4 import BeautifulSoup

session = requests.Session()
url = 'http://www.citydis.com/'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
response = session.get(url, headers=headers)
soup = BeautifulSoup(response.content, "html.parser")
metaConfig = soup.find("meta", property="configuration")
metaConfigTxt = metaConfig["content"]
csrf = json.loads(metaConfigTxt)["pageToken"]
jsonUrl = "https://www.citydis.com/s/results.json?&q=London& customerSearch=1&page=0"
headers.update({'X-Csrf-Token': csrf})
response = session.get(jsonUrl, headers=headers)
print(response.content)
And that is the output:
b'{"searchResults":{"customer":null,"signupUrl":"\\/signup\\/?pos=activityCard","isMobile":false,"tours":[{"tourId":5459,"title":"Ticket f\\u00fcr Coca-Cola London Eye 4D-Erlebnis","url":"https:\\/\\/www.getyourguide.de\\/london-l57\\/ohne-anstehen-edf-london-eye-4d-erlebnis-t5459\\/","price":{"original":"27,10\\u00a0\\u20ac","min":"27,10\\u00a0\\u20ac","type":"individual"},"horizontalImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-412120-70.jpg","horizontalAlternativeImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-412120-85.jpg","verticalImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-412120-92.jpg","mobileImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-412120-53.jpg","horizontalSlimImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-412120-67.jpg","highlightedDetailedImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-412120-91.jpg","smallDescription":"Sehen Sie London aus einer anderen Perspektive vom London Eye aus und genie\\u00dfen Sie beim neuen 4D-Erlebnis einen bahnbrechenden 3D-Film mit\\u2026","description":"Sehen Sie London aus einer anderen Perspektive vom London Eye aus und genie\\u00dfen Sie beim neuen 4D-Erlebnis einen bahnbrechenden 3D-Film mit spektakul\\u00e4ren Spezialeffekten, einschlie\\u00dflich Wind und Nebel. Genie\\u00dfen Sie au\\u00dferdem bevorzugten Einlass am Eingang.","isBestseller":false,"isFeatured":false,"languageIds":[],"hasDeal":false,"dealMaxPercentage":0,"isBoostedNewTour":false,"hasBanner":false,"hasRibbon":false,"priceTag":true,"detailsLink":false,"isCertifiedPartner":true,"hasFencedDiscountDeal":false,"hasFreeCancellation":false,"hasRating":true,"averageRating":"4,5","totalRating":1633,"totalRatingTitle":"1633 Bewertungen","averageRatingClass":"45","ratingLink":"","ratingStyleModifier":"","ratingStarsClasses":"","ratingTitle":"Bewertung: 4,5 von 5","hasDuration":true,"duration":"40 Minuten","displayAbstract":true,"displayDuration":true,"displayDate":false,"displayWishlist":false,"displayRemoveButton":false,"hasDiscountedRecommendation":false,"hideImage":false,"isSkipTheLine":false,"likelyToSellOutBadge":true,"isPromoted":false,"isSpecialOffer":false,"experiments":{"hasRatingsExperiment":false,"numericRatingLabel":"Basierend auf 1633 Bewertungen","verticalImageForPriceSegmentation":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-412120-150.jpg"},"id":"searchResults","activityCardVersion":"horizontal","limit":false,"likelyToSellOutExperiment":{"deviceDetector":{}},"hasNumericReviews":true,"resultSetPosition":0,"activityCardStyle":"plain","highlightedOrientation":"horizontal"},{"tourId":51268,"title":"Bustransfer: Flughafen Stansted - Stadtzentrum London","url":"https:\\/\\/www.getyourguide.de\\/london-l57\\/bustransfer-flughafen-stansted-stadtzentrum-london-t51268\\/","price":{"original":"9,43\\u00a0\\u20ac","min":"9,43\\u00a0\\u20ac","type":"individual"},"horizontalImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-451822-70.jpg","horizontalAlternativeImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-451822-85.jpg","verticalImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-451822-92.jpg","mobileImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-451822-53.jpg","horizontalSlimImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-451822-67.jpg","highlightedDetailedImageUrl":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-451822-91.jpg","smallDescription":"Beginnen oder beenden Sie Ihren Aufenthalt in London mit dem praktischen 
Bustransfer zwischen dem Flughafen Stansted und dem Stadtzentrum London.\\u2026","description":"Beginnen oder beenden Sie Ihren Aufenthalt in London mit dem praktischen Bustransfer zwischen dem Flughafen Stansted und dem Stadtzentrum London. Sparen Sie sich die Fahrt mit \\u00f6ffentlichen Verkehrsmitteln und erreichen Sie London schnell und bequem.","isBestseller":false,"isFeatured":false,"languageIds":[],"hasDeal":false,"dealMaxPercentage":0,"isBoostedNewTour":false,"hasBanner":false,"hasRibbon":false,"priceTag":true,"detailsLink":false,"isCertifiedPartner":false,"hasFencedDiscountDeal":false,"hasFreeCancellation":true,"hasRating":true,"averageRating":"4,4","totalRating":541,"totalRatingTitle":"541 Bewertungen","averageRatingClass":"45","ratingLink":"","ratingStyleModifier":"","ratingStarsClasses":"","ratingTitle":"Bewertung: 4,4 von 5","hasDuration":true,"duration":"60 Minuten \\u2013 90 Minuten","displayAbstract":true,"displayDuration":true,"displayDate":false,"displayWishlist":false,"displayRemoveButton":false,"hasDiscountedRecommendation":false,"hideImage":false,"isSkipTheLine":false,"likelyToSellOutBadge":true,"isPromoted":false,"isSpecialOffer":false,"experiments":{"hasRatingsExperiment":false,"numericRatingLabel":"Basierend auf 541 Bewertungen","verticalImageForPriceSegmentation":"https:\\/\\/cdn.getyourguide.com\\/img\\/tour_img-451822-150.jpg"}
What I would like to get out is the title and url only. For example:
title":"Ticket f\\u00fcr Coca-Cola London Eye 4D-Erlebnis","url":"https:\\/\\/www.getyourguide.de\\/london-l57\\/ohne-anstehen-edf-london-eye-4d-erlebnis-t5459
Any feedback much appreciated
UPDATE
Thanks to the feedback I was able to solve the problem.
I'm now able to get the desired result, but now I have the issue that I'm getting just one result back instead of all the available ones:
js_dict = json.loads(response.content.decode('utf-8'))
url = js_dict['searchResults']['tours'][0]['url']
print(url)
title = js_dict['searchResults']['tours'][0]['title']
print(title)
price = js_dict['searchResults']['tours'][0]['price']['original']
print(price)
The output is the following:
https://www.citydis.de/london-l57/ohne-anstehen-edf-london-eye-4d-erlebnis-t5459/
Ticket für Coca-Cola London Eye 4D-Erlebnis
27,10 €
I would like to get back all the titles, prices and URLs of the sightseeing activities in the JSON. I tried a for loop, but somehow it does not work.
Any feedback appreciated
UPDATE 2
Found a solution:
jsonUrl = "https://www.citydis.com/s/results.json?&q=London& customerSearch=1&page=0"
headers.update({'X-Csrf-Token': csrf})
response = session.get(jsonUrl, headers=headers)
js_dict = json.loads(response.content.decode('utf-8'))

# Iterate over every tour in the result set instead of indexing [0]
for tour in js_dict['searchResults']['tours']:
    title_final = tour.get('title')
    url_final = tour.get('url')
    price_final = tour.get('price')['original']
    print("Header: " + title_final + " | " + "Deeplink: " + url_final + " | " + "Price: " + price_final)
The byte string response.content is indeed the JSON output. You can import the json module and parse the JSON with a statement like
js_dict = json.loads(response.content)
This will parse the JSON and produce a Python dictionary in js_dict. You can then use standard dictionary subscripting techniques to access and display the fields of interest.
Because this is such a common requirement, the response object has a json method that will do this decoding for you. You could, therefore, simply write
js_dict = response.json()
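For example, a minimal sketch of the subscripting described above, using the key names visible in the JSON output shown in the question:

js_dict = response.json()
# The tours list sits under searchResults; index 0 is the first result
first_tour = js_dict['searchResults']['tours'][0]
print(first_tour['title'])
print(first_tour['url'])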
Related
Hello everyone, I'm trying to get all href links with Python by using this:
import requests
from bs4 import BeautifulSoup
url = 'https://rappel.conso.gouv.fr'
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
#Collecting links on rappel.gouv
def get_url(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

def extract(soup):
    results = soup.find_all('div', {'class' : 'product-content'})
    for item in results:
        item.find('a', {'class' : 'product-link'}).text.replace('','').strip()
        links = url + item.find('a', {'class' : 'product-link'})['href']
    return links

soup = get_url(url)
print(extract(soup))
I'm supposed to get 10 HTML links, as follows:
https://rappel.conso.gouv.fr/fiche-rappel/4571/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4572/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4573/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4575/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4569/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4565/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4568/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4570/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4567/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4558/Interne
It actually works when I put a print into the code, as follows:
def extract(soup):
    results = soup.find_all('div', {'class' : 'product-content'})
    for item in results:
        item.find('a', {'class' : 'product-link'}).text.replace('','').strip()
        links = url + item.find('a', {'class' : 'product-link'})['href']
        print(links)
    return
But I'm supposed to take all the links I get from this request and put them into a loop, so that I can get data from each of those 10 pages and store them in a database (which means there is more code to write after def extract(soup)).
I have tried to understand this with many tutorials, but I only ever get one HTML link or None.
You just need to build a list of links; in your code, the variable links is overwritten on each pass through the loop. Try this:
def extract(soup):
    results = soup.find_all('div', {'class' : 'product-content'})
    links = []
    for item in results:
        item.find('a', {'class' : 'product-link'}).text.replace('','').strip()
        links.append(url + item.find('a', {'class' : 'product-link'})['href'])
    return links
To print each link in the main code, after the functions:
soup = get_url(url)
linklist = extract(soup)
for url in linklist:
    print(url)
Your links variable is being rewritten inside the for loop.
You can create an empty list before the loop, then append the URL on each iteration.
import requests
from bs4 import BeautifulSoup

url = 'https://rappel.conso.gouv.fr'
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}

# Collecting links on rappel.gouv
def get_url(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

def extract(soup):
    results = soup.find_all('div', {'class' : 'product-content'})
    links = []
    for item in results:
        item.find('a', {'class' : 'product-link'}).text.replace('','').strip()
        links.append(url + item.find('a', {'class' : 'product-link'})['href'])
    return links

soup = get_url(url)
print(extract(soup))
To use the links from the page and iterate over each product's detail page, collect the links in a list and return it from the function.
Try to name your functions after what they return: get_url() is really more of a get_soup(), ...
Example
import requests
from bs4 import BeautifulSoup

url = 'https://rappel.conso.gouv.fr'
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}

def get_soup(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

def extract_product_urls(url):
    links = [url + x['href'] for x in get_soup(url).select('a.product-link')]
    return links

def extract_product_details(url):
    soup = get_soup(url)
    items = {}
    for x in soup.select('.product-desc li'):
        content = x.get_text('|', strip=True).split('|')
        items[content[0]] = content[1]
    return items

data = []
for link in extract_product_urls(url):
    data.append(extract_product_details(link))

data
Output
[{'Réf. Fiche\xa0:': '2021-11-0273',
'№ de Version\xa0:': '1',
'Origine de la fiche\xa0:': 'PLACE DU MARCHE PLACE DU MARCHE',
'Nature juridique du rappel\xa0:': 'Volontaire',
'Catégorie de produit': 'Alimentation',
'Sous-catégorie de produit': 'Lait et produits laitiers',
'Nom de la marque du produit': 'Toupargel',
'Noms des modèles ou références': 'BATONNETS GEANTS VANILLE AMANDES',
'Identification des produits': 'GTIN',
'Conditionnements': '292G',
'Date début/Fin de commercialisation': 'Du\r\n 11/07/2019\r\n au\r\n 18/09/2021',
'Température de conservation': 'Produit à conserver au congélateur',
'Marque de salubrité': 'EMB 35360C',
'Zone géographique de vente': 'France entière',
'Distributeurs': 'PLACE DU MARCHE',
'Motif du rappel': 'Nous tenons à vous informer, que suite à une alerte européenne concernant la présence potentielle d’oxyde d’éthylène à une teneur supérieure à la limite autorisée, et comme un grand nombre d’acteurs de la distribution, nous devons procéder au rappel',
'Risques encourus par le consommateur': 'Autres contaminants chimiques',
'Conduite à tenir par le consommateur': 'Ne plus consommer',
'Numéro de contact': '0805805910',
'Modalités de compensation': 'Remboursement',
'Date de fin de la procédure de rappel': 'samedi 26 février 2022'},
{'Réf. Fiche\xa0:': '2021-11-0274',
'№ de Version\xa0:': '1',
'Origine de la fiche\xa0:': 'PLACE DU MARCHE PLACE DU MARCHE',
'Nature juridique du rappel\xa0:': 'Volontaire',
'Catégorie de produit': 'Alimentation',
'Sous-catégorie de produit': 'Lait et produits laitiers',
'Nom de la marque du produit': 'Toupargel',
'Noms des modèles ou références': 'CREME GLACEE NOUGAT',
'Identification des produits': 'GTIN',
'Conditionnements': '469G',
'Date début/Fin de commercialisation': 'Du\r\n 28/06/2019\r\n au\r\n 10/10/2021',
'Température de conservation': 'Produit à conserver au congélateur',
'Marque de salubrité': 'EMB 35360C',
'Zone géographique de vente': 'France entière',
'Distributeurs': 'PLACE DU MARCHE',
'Motif du rappel': 'Nous tenons à vous informer, que suite à une alerte européenne concernant la présence potentielle d’oxyde d’éthylène à une teneur supérieure à la limite autorisée, et comme un grand nombre d’acteurs de la distribution, nous devons procéder au rappel',
'Risques encourus par le consommateur': 'Autres contaminants chimiques',
'Conduite à tenir par le consommateur': 'Ne plus consommer',
'Numéro de contact': '0805805910',
'Modalités de compensation': 'Remboursement',
'Date de fin de la procédure de rappel': 'samedi 26 février 2022'},...]
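The original question also mentions storing the results in a database. A minimal sketch under that assumption, using sqlite3 and storing each details dict from data as a JSON blob (the database file, table and column names here are hypothetical):

import json
import sqlite3

con = sqlite3.connect('rappels.db')  # hypothetical database file
con.execute('CREATE TABLE IF NOT EXISTS rappels (fiche TEXT, details TEXT)')
for item in data:
    # 'Réf. Fiche\xa0:' is the reference key visible in the output above
    con.execute('INSERT INTO rappels VALUES (?, ?)',
                (item.get('Réf. Fiche\xa0:'), json.dumps(item, ensure_ascii=False)))
con.commit()
con.close()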
I'm scraping with BeautifulSoup via Python on:
I would like to extract the title name "Don Ḳarlos: gresṭe drama der ṿelṭ".
I get this as output: "Don Ḳarlos: gresá¹e drama der ṿelá¹"
My code:
import requests
from lxml import html

resp = requests.get(url)
tree = html.fromstring(resp.content)
element = tree.xpath('/html/body/div[1]/div[7]/div[2]/text()')
t = element[1]
print(t)
the html:
<dt>Title</dt>Don Ḳarlos: gresṭe drama der ṿelṭ<dd>Additional title: Don Ḳarlos</dd>
Thanks
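No answer is recorded here, but the garbled characters look like UTF-8 bytes decoded with the wrong charset (Latin-1). A minimal sketch, assuming the fix is simply to force UTF-8 decoding before parsing (same xpath as above, with url pointing at the page in question):

import requests
from lxml import html

resp = requests.get(url)
resp.encoding = 'utf-8'             # force UTF-8 instead of the guessed charset
tree = html.fromstring(resp.text)   # parse the decoded text, not the raw bytes
element = tree.xpath('/html/body/div[1]/div[7]/div[2]/text()')
print(element[1])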
I am trying to scrape the title, description and URL from a Google search page using BeautifulSoup and Python.
from bs4 import BeautifulSoup

query = input("Enter your value: ")
print("Search Term:" + query)  # query = 'Python'

links = []  # Initiate empty lists to capture final results
titles = []
description_text = []

# Specify number of pages on google search; each page contains 10 links
n_pages = 6
for page in range(1, n_pages):
    url = "http://www.google.com/search?q=" + query + "&start=" + str((page - 1) * 10)
    print("Link : " + url)
    driver.get(url)  # driver is a Selenium WebDriver instance created earlier
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    try:
        search = soup.find_all('div', class_ = "yuRUbf")
        for link in search:
            links.append(link.a['href'])
        description = soup.find_all('div', class_ = "VwiC3b yXK7lf MUxGbd yDYNvb lyLwlc lEBKkf")
        for d in description:
            description_text.append(d.span.text)
        title = soup.find_all('div', class_ = "yuRUbf")
        for t in title:
            titles.append(t.h3.text)
    # Next loop if one element is not present
    except:
        continue

print(links)
print(len(links))
print(description_text)
print(len(description_text))
print(titles)
print(len(titles))
The descriptions are getting stored in a list; however, the links and titles lists are empty. I inspected the elements and I am using the correct classes, but I am still unable to get the data.
Can someone help me figure out what I am doing wrong?
Personally, I find working with many lists irritating and cumbersome when the content can be stored directly in a structured form. In any case, you can get the information without selecting the dynamic classes, in a more generic way:
for r in soup.select('#search a h3'):
    data.append({
        'title': r.text,
        'url': r.parent['href'],
        'desc': r.parent.parent.nextSibling.span.text if r.parent.parent.nextSibling.span else 'no desc'
    })
Example
from bs4 import BeautifulSoup
import requests

query = input("Enter your value: ")
print("Search Term:" + query)  # query = 'Python'

data = []
n_pages = 6
for page in range(1, n_pages):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    url = f'http://www.google.com/search?q={query}&start={str((page - 1) * 10)}'
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    for r in soup.select('#search a h3'):
        data.append({
            'title': r.text,
            'url': r.parent['href'],
            'desc': r.parent.parent.nextSibling.span.text if r.parent.parent.nextSibling.span else 'no desc'
        })

data
Output
[{'title': 'Welcome to Python.org',
'url': 'https://www.python.org/',
'desc': 'The official home of the Python Programming Language.'},
{'title': 'Python (Programmiersprache) - Wikipedia',
'url': 'https://de.wikipedia.org/wiki/Python_(Programmiersprache)',
'desc': 'Python ([ˈpʰaɪθn̩], [ ˈpʰaɪθɑn], auf Deutsch auch [ ˈpʰyːtɔn]) ist eine universelle, üblicherweise interpretierte, höhere Programmiersprache.'},
{'title': 'Pythons - Wikipedia',
'url': 'https://de.wikipedia.org/wiki/Pythons',
'desc': 'Die Pythons (Pythonidae; altgr. Πύθων Pythōn; Einzahl der, allgemeinsprachlich auch die Python) sind eine Familie von Schlangen aus der Überfamilie der\xa0...'},
{'title': 'Das Python-Tutorial — Das Python3.3-Tutorial auf Deutsch',
'url': 'https://py-tutorial-de.readthedocs.io/',
'desc': 'Python ist eine einfach zu lernende, aber mächtige Programmiersprache mit effizienten abstrakten Datenstrukturen und einem einfachen, aber effektiven Ansatz\xa0...'},...]
I am new to Python and I am trying to web-scrape this website. What I am trying to do is to get just the dates and the articles' titles from this website. I follow a procedure I found on SO, which is as follows:
from bs4 import BeautifulSoup
import requests
url = "https://www.ecb.europa.eu/press/inter/html/index.en.html"
res = requests.get(url)
soup = BeautifulSoup(res.text)
movies = soup.select(".title a , .date")
print(movies)
movies_titles = [title.text for title in movies]
movies_links = ["http://www.ecb.europa.eu"+ title["href"] for title in movies]
print(movies_titles)
print(movies_links)
I got .title a , .date using SelectorGadget on the URL I shared. However, print(movies) is empty. What am I doing wrong?
Can anyone help me?
Thanks!
The content is not part of index.en.html but is loaded in by JavaScript from
https://www.ecb.europa.eu/press/inter/date/2021/html/index_include.en.html
Also, you can't select pairs as far as I know, so you need to select the titles and dates separately:
titles = soup.select(".title a")
dates = soup.select(".date")
pairs = list(zip(titles, dates))
Then you can print them out like this:
movies_titles = [pair[0].text for pair in pairs]
print(movies_titles)
movies_links = ["http://www.ecb.europa.eu" + pair[0]["href"] for pair in pairs]
print(movies_links)
Result:
['Christine Lagarde:\xa0Interview with CNBC', 'Fabio Panetta:\xa0Interview with El País ', 'Isabel Schnabel:\xa0Interview with Der Spiegel', 'Philip R. Lane:\xa0Interview with CNBC', 'Frank Elderson:\xa0Q&A on Twitter', 'Isabel Schnabel:\xa0Interview with Les Echos ', 'Philip R. Lane:\xa0Interview with the Financial Times', 'Luis de Guindos:\xa0Interview with Público', 'Philip R. Lane:\xa0Interview with Expansión', 'Isabel Schnabel:\xa0Interview with LETA', 'Fabio Panetta:\xa0Interview with Der Spiegel', 'Christine Lagarde:\xa0Interview with Le Journal du Dimanche ', 'Philip R. Lane:\xa0Interview with Süddeutsche Zeitung', 'Isabel Schnabel:\xa0Interview with Deutschlandfunk', 'Philip R. Lane:\xa0Interview with SKAI TV', 'Isabel Schnabel:\xa0Interview with Der Standard']
['http://www.ecb.europa.eu/press/inter/date/2021/html/ecb.in210412~ccd1b7c9bf.en.html', 'http://www.ecb.europa.eu/press/inter/date/2021/html/ecb.in210411~44ade9c3b5.en.html', 'http://www.ecb.europa.eu/press/inter/date/2021/html/ecb.in210409~c8c348a12c.en.html', 'http://www.ecb.europa.eu/press/inter/date/2021/html/ecb.in210323~e4026c61d1.en.html', 'http://www.ecb.europa.eu/press/inter/date/2021/html/ecb.in210317_1~1d81212506.en.html', 'http://www.ecb.europa.eu/press/inter/date/2021/html/ecb.in210317~458636d643.en.html', 'http://www.ecb.europa.eu/press/inter/date/2021/html/ecb.in210316~930d09ce3c.en.html', 'http://www.ecb.europa.eu/press/inter/date/2021/html/ecb.in210302~c793ad7b68.en.html', 'http://www.ecb.europa.eu/press/inter/date/2021/html/ecb.in210226~79eba6f9fb.en.html', 'http://www.ecb.europa.eu/press/inter/date/2021/html/ecb.in210225~5f1be75a9f.en.html', 'http://www.ecb.europa.eu/press/inter/date/2021/html/ecb.in210209~af9c628e30.en.html', 'http://www.ecb.europa.eu/press/inter/date/2021/html/ecb.in210207~f6e34f3b90.en.html', 'http://www.ecb.europa.eu/press/inter/date/2021/html/ecb.in210131_1~650f5ce5f7.en.html', 'http://www.ecb.europa.eu/press/inter/date/2021/html/ecb.in210131~13d84cb9b2.en.html', 'http://www.ecb.europa.eu/press/inter/date/2021/html/ecb.in210127~9ad88eb038.en.html', 'http://www.ecb.europa.eu/press/inter/date/2021/html/ecb.in210112~1c3f989acd.en.html']
Full code:
from bs4 import BeautifulSoup
import requests
url = "https://www.ecb.europa.eu/press/inter/date/2021/html/index_include.en.html"
res = requests.get(url)
soup = BeautifulSoup(res.text)
titles = soup.select(".title a")
dates = soup.select(".date")
pairs = list(zip(titles, dates))
movies_titles = [pair[0].text for pair in pairs]
print(movies_titles)
movies_links = ["http://www.ecb.europa.eu" + pair[0]["href"] for pair in pairs]
print(movies_links)
I would recommend using Python Selenium.
Try something like this:
from selenium.webdriver import Chrome
from selenium.common.exceptions import NoSuchElementException

url = "https://www.ecb.europa.eu/press/inter/html/index.en.html"
browser = Chrome()
browser.get(url)

interviews = browser.find_elements_by_class_name('title')
links = []
for interview in interviews:
    try:
        anchor = interview.find_element_by_tag_name('a')
        link = anchor.get_attribute('href')
        links.append(link)
    except NoSuchElementException:
        pass
links will then contain the links to all the interviews. You can do something similar for the dates.
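For example, a minimal sketch for the dates, assuming they sit in elements with class name date (as the .date selector elsewhere in this thread suggests); browser is the Chrome instance from the snippet above:

dates = browser.find_elements_by_class_name('date')
date_texts = [date.text for date in dates]
print(date_texts)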
I want to create a program that takes an inputted list of German words and finds them in Reverso Context, along with appropriate usage examples for these words. After finding them, the inputted words will be deleted and the examples will be presented without these words. I tried doing this for one word:
import requests
from bs4 import BeautifulSoup

inp = input("Type a german word\n")
web = requests.get('https://context.reverso.net/translation/german-english/' + inp)
data = web.content
soup = BeautifulSoup(data, features="html.parser")
tag = soup.find_all("span", "text", "de")
a = 1
for i in tag:
    print(a, ".", i.text)
    a = a + 1
Please help me adapt this to the requirements I described.
On every iteration you will see the message "Please enter the word to collect the data or, if you want to end the process, enter the character 'e':". Alternatively, you can make a list of words and iterate over it; you get the same result. You can try this:
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT x.y; Win64; x64; rv:10.0) Gecko/20100101 Firefox/10.0 '}

mark = ""
while mark != 'e':
    inp = input("Please enter the word to collect the data or, if you want to end the process, enter the character 'e': ")
    mark = inp
    if mark == 'e':
        break
    s = requests.Session()
    url = f'https://context.reverso.net/translation/german-english/{inp}'
    web = s.get(url, headers=headers)
    soup = BeautifulSoup(web.text, "lxml")
    tag = soup.select("span.text")  # select() takes a CSS selector, not keyword filters
    a = 1
    for i in tag:
        if '\n' in i.text:
            print(a, ". ", i.text.strip())
            a = a + 1
    # print("Do You have any List of word?")
    print("." * 80)
Output will be:
1 . Join Reverso
2 .
3 . Facebook connect
4 . Google connect
5 . Zeigt die Anzahl der heute blockierten Ereignisse an.
6 . Displays the number of events that have been blocked today.
7 . In diesem Sinne werden wir heute die Entlastung verweigern.
8 . It is for this reason that we are today refusing to grant discharge.
9 . Die Agrarerzeugnisse sind heute ein wesentlicher Bestandteil der Verhandlungsrunden der Welthandelsorganisation.
10 . Agricultural products are now an integral part of the World Trade Organisation negotiating round.
11 . Das ist heute die wichtigste Frage.
12 . This is the pressing issue we now face.
13 . Sie wird in vergleichbaren Fällen heute anders vorgehen.
14 . It would take a different approach in comparable cases today.
15 . Kutschma regiert heute als allmächtiger Präsident.
16 . Today, Kuchma rules as an all-powerful president.
17 . Für mich verbleibt heute nur eine wesentliche Frage.
18 . In my view, there is only one important question left today.
19 . Die heute diskutierte Verordnung wird unsere Aktion fraglos verbessern helfen.
20 . The regulation we are debating today will undoubtedly contribute to improving our action.
and so on......
You can also try this:
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT x.y; Win64; x64; rv:10.0) Gecko/20100101 Firefox/10.0 '}

mark = ""
while mark != 'e':
    inp = input("Please enter the word to collect the data or, if you want to end the process, enter the character 'e': ")
    mark = inp
    if mark == 'e':
        break
    s = requests.Session()
    url = f'https://context.reverso.net/translation/german-english/{inp}'
    web = s.get(url, headers=headers)
    soup = BeautifulSoup(web.text, "lxml")
    sentences = [x.text.strip() for x in soup.find_all('span', {'class': 'text'}) if '\n' in x.text]
    print(sentences)
    print("." * 80)
You get the same result, as a list.
I implemented a Python wrapper for the Reverso Context API: https://github.com/flagist0/reverso_context_api
In your case, you can use it like this:
from itertools import islice
from reverso_context_api import Client

def get_samples(client, word, num=5):
    # There can be thousands of translation samples; this function requests
    # and returns only the needed amount of them
    iterator = client.get_translation_samples(word)
    return list(islice(iterator, num))

client = Client(source_lang="de", target_lang="en")
# call get_samples for each word in your list
print(get_samples(client, "Fortschritt"))
# Outputs:
# [('Überprüfen Sie den Fortschritt des Datenbank-Loaders im Prozessmanager.',
#   'Check the progress of the Database Loader in your Process Manager.'),
#  ('Status verfolgen auch den Fortschritt des Auftragsabschlussprozesses.',
#   'Statuses also track the progress of the job close process.'),
#  ('Kommissar Vitorino hatte das Abkommen als großen Fortschritt bezeichnet.',
#   "Commissioner Vitorino has described it as a 'major advance'."),
#  ('Dies ist deshalb schon ein großer Fortschritt.',
#   'This is, therefore, already a major advance.'),
#  ('Ich betrachte die Charta als akzeptablen Fortschritt.',
#   'I consider that the Charter of Fundamental Rights represents a valuable step forward.')]
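And for the full requirement, a minimal sketch that calls get_samples for each word in an input list, building on the snippet above (the words here are placeholders):

words = ["Fortschritt", "Entwicklung"]  # hypothetical input list
for word in words:
    print(word, get_samples(client, word, num=2))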