Website scraping javascript element with python - python

I scrape the page with bs4 and requests in Python. I want to get all the values shown in the span elements below.
My code returns an empty output.
That is my code:
# Question code: fetch the Yahoo Finance front page and print every quote
# rendered in a <span> carrying Yahoo's atomic CSS utility classes.
import requests
from bs4 import BeautifulSoup

url = 'https://finance.yahoo.com/?guccounter=1&guce_referrer=aHR0cHM6Ly93d3cuZ29vZ2xlLmNvbS8&guce_referrer_sig=AQAAAL27GM7owB-wouEznTgEc042sYEQEVDVrvFu5gPk62z1oKnTUhzN297s6vD5rzOVWHpoex7Zc8frVJe0saldAedZOe49BauM9YtLDhHtx6PMlH4ENmihvT2fgmlnqsAPFFqfC9aW1dF_NgBYi6lfREpk6uUwP7DnDhikzgEkYIUd'
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Mobile Safari/537.36',
    "Upgrade-Insecure-Requests": "1",
    "DNT": "1",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
# NOTE(review): with this mobile User-Agent Yahoo serves different markup, so
# no span matches these classes and nothing is printed — that is the question.
for quote_span in soup.find_all('span', class_='Trsdu(0.3s) Fz(s) Mt(4px) Mb(0px) Fw(b) D(ib)'):
    print(quote_span.text)

Change the User-Agent to obtain the correct values:
# Answer code: a desktop User-Agent makes Yahoo serve the desktop markup,
# which does contain the quote spans with these atomic CSS classes.
import requests
from bs4 import BeautifulSoup

url = "https://finance.yahoo.com/?guccounter=1&guce_referrer=aHR0cHM6Ly93d3cuZ29vZ2xlLmNvbS8&guce_referrer_sig=AQAAAL27GM7owB-wouEznTgEc042sYEQEVDVrvFu5gPk62z1oKnTUhzN297s6vD5rzOVWHpoex7Zc8frVJe0saldAedZOe49BauM9YtLDhHtx6PMlH4ENmihvT2fgmlnqsAPFFqfC9aW1dF_NgBYi6lfREpk6uUwP7DnDhikzgEkYIUd"
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0"}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, "html.parser")
for quote_span in soup.find_all("span", class_="Trsdu(0.3s) Fz(s) Mt(4px) Mb(0px) Fw(b) D(ib)"):
    print(quote_span.text)
Prints:
4,181.17
33,874.85
13,962.68

Related

Created list from BeautifulSoup contains multiplicate entries -- need a unique link list

I am trying to create a list containing all unique year links from a website (see below).
When I execute the append function it gives me a huge list containing dupli-multiplicate entries.
I need to get a list containing only the unique year links.
The website : https://www.epant.gr/apofaseis-gnomodotiseis/itemlist/category/78-2021.html
Code written so far :
# Question code: print every year-category link on the EPANT decisions page.
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
import re

URL = 'https://www.epant.gr/apofaseis-gnomodotiseis/itemlist/category/78-2021.html'
headers1 = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
    "X-Amzn-Trace-Id": "Root=1-61acac03-6279b8a6274777eb44d81aae",
    "X-Client-Data": "CJW2yQEIpLbJAQjEtskBCKmdygEIuevKAQjr8ssBCOaEzAEItoXMAQjLicwBCKyOzAEI3I7MARiOnssB",
}
page = requests.get(URL, headers=headers1)
soup = BeautifulSoup(page.content, "html.parser")

year = []
# The same category href appears in several menus, which is why the printed
# output contains duplicates — the problem the question is about.
for link in soup.find_all('a', href=lambda href: href and "category" in href):
    print(link.get('href'))
    # year.append(link.get('href'))
# print(year)
The desired result would look like this (but I need this in list format):
https://www.epant.gr/apofaseis-gnomodotiseis/itemlist/category/78-2021.html
/apofaseis-gnomodotiseis/itemlist/category/83-2022.html
/apofaseis-gnomodotiseis/itemlist/category/78-2021.html
/apofaseis-gnomodotiseis/itemlist/category/71-2020.html
/apofaseis-gnomodotiseis/itemlist/category/4-2019.html
/apofaseis-gnomodotiseis/itemlist/category/5-2018.html
/apofaseis-gnomodotiseis/itemlist/category/6-2017.html
/apofaseis-gnomodotiseis/itemlist/category/7-2016.html
/apofaseis-gnomodotiseis/itemlist/category/8-2015.html
/apofaseis-gnomodotiseis/itemlist/category/9-2014.html
/apofaseis-gnomodotiseis/itemlist/category/10-2013.html
/apofaseis-gnomodotiseis/itemlist/category/11-2012.html
/apofaseis-gnomodotiseis/itemlist/category/12-2011.html
/apofaseis-gnomodotiseis/itemlist/category/13-2010.html
/apofaseis-gnomodotiseis/itemlist/category/18-2009.html
/apofaseis-gnomodotiseis/itemlist/category/19-2008.html
/apofaseis-gnomodotiseis/itemlist/category/20-2007.html
/apofaseis-gnomodotiseis/itemlist/category/21-2006.html
/apofaseis-gnomodotiseis/itemlist/category/22-2005.html
/apofaseis-gnomodotiseis/itemlist/category/23-2004.html
/apofaseis-gnomodotiseis/itemlist/category/24-2003.html
/apofaseis-gnomodotiseis/itemlist/category/25-2002.html
/apofaseis-gnomodotiseis/itemlist/category/26-2001.html
/apofaseis-gnomodotiseis/itemlist/category/27-2000.html
/apofaseis-gnomodotiseis/itemlist/category/44-1999.html
/apofaseis-gnomodotiseis/itemlist/category/45-1998.html
/apofaseis-gnomodotiseis/itemlist/category/48-1997.html
/apofaseis-gnomodotiseis/itemlist/category/47-1996.html
/apofaseis-gnomodotiseis/itemlist/category/46-1995.html
/apofaseis-gnomodotiseis/itemlist/category/49-1994.html
Edit : I am Trying to create a case list for every year in year list :
Code :
# 1) Created an year list (year = [])
# 1) Build the year list (deduplicated category links)
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
import re

total_cases = []
# Url to scrape
URL = 'https://www.epant.gr/apofaseis-gnomodotiseis/itemlist/category/78-2021.html'
headers1 = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
    "X-Amzn-Trace-Id": "Root=1-61acac03-6279b8a6274777eb44d81aae",
    "X-Client-Data": "CJW2yQEIpLbJAQjEtskBCKmdygEIuevKAQjr8ssBCOaEzAEItoXMAQjLicwBCKyOzAEI3I7MARiOnssB",
}
page = requests.get(URL, headers=headers1)
soup = BeautifulSoup(page.content, "html.parser")

year = []
# Deduplicate while preserving encounter order.
for link in soup.find_all('a', href=lambda href: href and "category" in href):
    if link.get('href') not in year:
        year.append(link.get('href'))
print(year)

# 2) Build the case list ("apofasi" = decision links)
case = []
for link in soup.find_all('a', href=lambda href: href and "apofasi" in href):
    if link.get('href') not in case:
        case.append(link.get('href'))
print(case)

# Trying to create a case list for every year in the year list
# A) Fetch every year link in the year list
# NOTE(review): indentation was lost in the paste; the nesting below is the
# most plausible reading of the original — confirm against the notebook.
for year_link in year:
    headers1 = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
        "X-Amzn-Trace-Id": "Root=1-61acac03-6279b8a6274777eb44d81aae",
        "X-Client-Data": "CJW2yQEIpLbJAQjEtskBCKmdygEIuevKAQjr8ssBCOaEzAEItoXMAQjLicwBCKyOzAEI3I7MARiOnssB",
    }
    page = requests.get(year_link, headers=headers1)
    soup2 = BeautifulSoup(page.content, "html.parser")
    print(year)
    # B) Collect every case link for every case in a fixed year
    for case_link in case:
        total_cases.append(case_link)
# Get case link for every case for every year_link (element of year[])
???
EDIT 2 :
When I try to run the code you (HedgeHog) so kindly posted, it gives me this error:
--------------------------------------------------------------------------
FeatureNotFound Traceback (most recent call last)
C:\Users\ARISTE~1\AppData\Local\Temp/ipykernel_13944/1621925083.py in <module>
8 "X-Client-Data": "CJW2yQEIpLbJAQjEtskBCKmdygEIuevKAQjr8ssBCOaEzAEItoXMAQjLicwBCKyOzAEI3I7MARiOnssB" }
9 page = requests.get(URL, headers = headers)
---> 10 soup = BeautifulSoup(page.content,'lxml')
11
12 baseUrl = 'https://www.epant.gr'
~\Documents\conda\envs\conda\lib\site-packages\bs4\__init__.py in __init__(self, markup, features, builder, parse_only, from_encoding, exclude_encodings, element_classes, **kwargs)
243 builder_class = builder_registry.lookup(*features)
244 if builder_class is None:
--> 245 raise FeatureNotFound(
246 "Couldn't find a tree builder with the features you "
247 "requested: %s. Do you need to install a parser library?"
FeatureNotFound: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Any ideas? Thanks!
EDIT
Based on your question edits I would recommend using a dict instead of all these lists - the following example will create a data dictionary with years as keys, each entry holding its own URL and a list of case URLs.
Example
# Build {year: {'url': ..., 'cases': [...]}} by visiting each year-category
# page found on the starting page.
from bs4 import BeautifulSoup
import requests

URL = 'https://www.epant.gr/apofaseis-gnomodotiseis/itemlist/category/78-2021.html'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
    "X-Amzn-Trace-Id": "Root=1-61acac03-6279b8a6274777eb44d81aae",
    "X-Client-Data": "CJW2yQEIpLbJAQjEtskBCKmdygEIuevKAQjr8ssBCOaEzAEItoXMAQjLicwBCKyOzAEI3I7MARiOnssB",
}
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')

baseUrl = 'https://www.epant.gr'
data = {}
# a[href*=category]:has(span) selects only the menu entries, one per year.
for href in [x['href'] for x in soup.select('a[href*=category]:has(span)')]:
    page = requests.get(f'{baseUrl}{href}', headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    # The year is the trailing "-YYYY" of the filename, e.g. ".../78-2021.html".
    year_key = href.split('-')[-1].split('.')[0]
    data[year_key] = {
        'url': f'{baseUrl}{href}',
        'cases': [f'{baseUrl}{x["href"]}' for x in soup.select('h3 a')],
    }
data
Output
{'2022': {'url': 'https://www.epant.gr/apofaseis-gnomodotiseis/itemlist/category/83-2022.html',
'cases': []},
'2021': {'url': 'https://www.epant.gr/apofaseis-gnomodotiseis/itemlist/category/78-2021.html',
'cases': ['https://www.epant.gr/apofaseis-gnomodotiseis/item/1578-apofasi-749-2021.html',
'https://www.epant.gr/apofaseis-gnomodotiseis/item/1633-apofasi-743-2021.html',
'https://www.epant.gr/apofaseis-gnomodotiseis/item/1575-apofasi-738-2021.html',
'https://www.epant.gr/apofaseis-gnomodotiseis/item/1624-apofasi-737-2021.html',
'https://www.epant.gr/apofaseis-gnomodotiseis/item/1510-apofasi-735-2021.html',
'https://www.epant.gr/apofaseis-gnomodotiseis/item/1595-apofasi-733-2021.html',
'https://www.epant.gr/apofaseis-gnomodotiseis/item/1600-apofasi-732-2021.html',
'https://www.epant.gr/apofaseis-gnomodotiseis/item/1451-apofasi-730-2021.html',
'https://www.epant.gr/apofaseis-gnomodotiseis/item/1508-apofasi-728-2021.html',
'https://www.epant.gr/apofaseis-gnomodotiseis/item/1584-apofasi-727-2021.html',
'https://www.epant.gr/apofaseis-gnomodotiseis/item/1586-apofasi-726-2021.html',
'https://www.epant.gr/apofaseis-gnomodotiseis/item/1583-apofasi-725-2021.html']},...}
How to fix?
Just check if the link is not in your list of links - So it is True append it to your list:
if link.get('href') not in year:
year.append(link.get('href'))
Note
The desired result would look like this (but I need this in list
format)
This is not a list in the sense of data structure it is a printed version of each single element of a list.
Alternative
Example
# Collect each category href exactly once, preserving first-seen order.
from bs4 import BeautifulSoup
import requests

URL = 'https://www.epant.gr/apofaseis-gnomodotiseis/itemlist/category/78-2021.html'
headers1 = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
    "X-Amzn-Trace-Id": "Root=1-61acac03-6279b8a6274777eb44d81aae",
    "X-Client-Data": "CJW2yQEIpLbJAQjEtskBCKmdygEIuevKAQjr8ssBCOaEzAEItoXMAQjLicwBCKyOzAEI3I7MARiOnssB",
}
page = requests.get(URL, headers=headers1)
soup = BeautifulSoup(page.content, "html.parser")

year = []
for link in soup.find_all('a', href=lambda href: href and "category" in href):
    href = link.get('href')
    if href not in year:
        year.append(href)
print(year)
Output
['https://www.epant.gr/apofaseis-gnomodotiseis/itemlist/category/78-2021.html', '/apofaseis-gnomodotiseis/itemlist/category/83-2022.html', '/apofaseis-gnomodotiseis/itemlist/category/78-2021.html', '/apofaseis-gnomodotiseis/itemlist/category/71-2020.html', '/apofaseis-gnomodotiseis/itemlist/category/4-2019.html', '/apofaseis-gnomodotiseis/itemlist/category/5-2018.html', '/apofaseis-gnomodotiseis/itemlist/category/6-2017.html', '/apofaseis-gnomodotiseis/itemlist/category/7-2016.html', '/apofaseis-gnomodotiseis/itemlist/category/8-2015.html', '/apofaseis-gnomodotiseis/itemlist/category/9-2014.html', '/apofaseis-gnomodotiseis/itemlist/category/10-2013.html', '/apofaseis-gnomodotiseis/itemlist/category/11-2012.html', '/apofaseis-gnomodotiseis/itemlist/category/12-2011.html', '/apofaseis-gnomodotiseis/itemlist/category/13-2010.html', '/apofaseis-gnomodotiseis/itemlist/category/18-2009.html', '/apofaseis-gnomodotiseis/itemlist/category/19-2008.html', '/apofaseis-gnomodotiseis/itemlist/category/20-2007.html', '/apofaseis-gnomodotiseis/itemlist/category/21-2006.html', '/apofaseis-gnomodotiseis/itemlist/category/22-2005.html', '/apofaseis-gnomodotiseis/itemlist/category/23-2004.html', '/apofaseis-gnomodotiseis/itemlist/category/24-2003.html', '/apofaseis-gnomodotiseis/itemlist/category/25-2002.html', '/apofaseis-gnomodotiseis/itemlist/category/26-2001.html', '/apofaseis-gnomodotiseis/itemlist/category/27-2000.html', '/apofaseis-gnomodotiseis/itemlist/category/44-1999.html', '/apofaseis-gnomodotiseis/itemlist/category/45-1998.html', '/apofaseis-gnomodotiseis/itemlist/category/48-1997.html', '/apofaseis-gnomodotiseis/itemlist/category/47-1996.html', '/apofaseis-gnomodotiseis/itemlist/category/46-1995.html', '/apofaseis-gnomodotiseis/itemlist/category/49-1994.html']
Use a set as the intermediate storage for the HREFs then convert to a list later.
# Same idea using a set: duplicates are dropped automatically, and the set is
# converted to a list only for display.
from bs4 import BeautifulSoup
import requests

URL = 'https://www.epant.gr/apofaseis-gnomodotiseis/itemlist/category/78-2021.html'
headers1 = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
    "X-Amzn-Trace-Id": "Root=1-61acac03-6279b8a6274777eb44d81aae",
    "X-Client-Data": "CJW2yQEIpLbJAQjEtskBCKmdygEIuevKAQjr8ssBCOaEzAEItoXMAQjLicwBCKyOzAEI3I7MARiOnssB"}
page = requests.get(URL, headers=headers1)
soup = BeautifulSoup(page.content, "lxml")

year = {link.get('href')
        for link in soup.find_all('a', href=lambda href: href and "category" in href)}
print(list(year))

How to extract this dictionary using Beautiful Soup

I want to get to variable last element of dictionary (pasted below), it's in another dictionary "offers", and i have no clue how to extract it.
# Fetch the product page (s is a requests.Session created elsewhere — not
# shown in this snippet) and parse the HTML body.
html = s.get(url=url, headers=headers, verify=False, timeout=15)
soup = BeautifulSoup(html.text, 'html.parser')
# All JSON-LD <script> blocks on the page. NOTE(review): this is a list of Tag
# objects; the text still needs json.loads() before "offers" can be indexed.
products = soup.find_all('script', {'type': "application/ld+json"})
{"#context":"http://schema.org","#type":"Product","aggregateRating":{"#type":"AggregateRating","bestRating":5,"ratingValue":"4.8","ratingCount":11,"worstRating":3,"reviewCount":5},"brand":{"#type":"Brand","name":"New Balance"},"color":"white/red/biały","image":["https://img01.ztat.net/3"],"itemCondition":"http://schema.org/NewCondition","manufacturer":"New Balance","name":"550 UNISEX - Sneakersy niskie - white/red","offers":[{"#type":"Offer","availability":"http://schema.org/OutOfStock","price":"489","priceCurrency":"PLN","sku":"NE215O06U-A110001000","url":"/new-balance-550-unisex-sneakersy-niskie-whitered-ne215o06u-a11.html"},{"#type":"Offer","availability":"http://schema.org/OutOfStock","price":"489","priceCurrency":"PLN","sku":"NE215O06U-A110002000","url":"/new-balance-550-unisex-sneakersy-niskie-whitered-ne215o06u-a11.html"} (...)
As mentioned extract contents via BeautifulSoup decode the string with json.loads():
import json

# Raw JSON-LD payload copied from the page's <script type="application/ld+json">.
_raw_product_json = '{"#context":"http://schema.org","#type":"Product","aggregateRating":{"#type":"AggregateRating","bestRating":5,"ratingValue":"4.8","ratingCount":11,"worstRating":3,"reviewCount":5},"brand":{"#type":"Brand","name":"New Balance"},"color":"white/red/biały","image":["https://img01.ztat.net/3"],"itemCondition":"http://schema.org/NewCondition","manufacturer":"New Balance","name":"550 UNISEX - Sneakersy niskie - white/red","offers":[{"#type":"Offer","availability":"http://schema.org/OutOfStock","price":"489","priceCurrency":"PLN","sku":"NE215O06U-A110001000","url":"/new-balance-550-unisex-sneakersy-niskie-whitered-ne215o06u-a11.html"},{"#type":"Offer","availability":"http://schema.org/OutOfStock","price":"489","priceCurrency":"PLN","sku":"NE215O06U-A110002000","url":"/new-balance-550-unisex-sneakersy-niskie-whitered-ne215o06u-a11.html"}]}'
# Decode the string into a nested dict so fields like offers[-1] can be indexed.
products = json.loads(_raw_product_json)
To get the last element (dict) in offers:
products['offers'][-1]
Output:
{'#type': 'Offer',
'availability': 'http://schema.org/OutOfStock',
'price': '489',
'priceCurrency': 'PLN',
'sku': 'NE215O06U-A110002000',
'url': '/new-balance-550-unisex-sneakersy-niskie-whitered-ne215o06u-a11.html'}
Example
In your special case you also have to replace('&quot;', '"') first:
# Fetch the live Zalando product page and decode its JSON-LD payload.
from bs4 import BeautifulSoup
import requests, json
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
"X-Amzn-Trace-Id": "Root=1-61acac03-6279b8a6274777eb44d81aae",
"X-Client-Data": "CJW2yQEIpLbJAQjEtskBCKmdygEIuevKAQjr8ssBCOaEzAEItoXMAQjLicwBCKyOzAEI3I7MARiOnssB" }
html = requests.get('https://www.zalando.de/new-balance-550-unisex-sneaker-low-whitered-ne215o06u-a11.html', headers=headers)
soup = BeautifulSoup(html.content, 'lxml')
# NOTE(review): replace('"','"') is a no-op as written; the original answer
# almost certainly called .replace('&quot;','"') and the HTML entity was lost
# when the page was extracted — verify before relying on this line.
jsonData = json.loads(soup.select_one('script[type="application/ld+json"]').text.replace('"','"'))
# Last entry of the "offers" array (a dict with price/sku/url fields).
jsonData['offers'][-1]

unable to scrape website pages with unchanged url - python

I'm trying to get the names of all games within this website "https://slotcatalog.com/en/The-Best-Slots#anchorFltrList". To do so I'm using the following code:
# Question code: print the title of every provider card on page 1.
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
url = "https://slotcatalog.com/en/The-Best-Slots#anchorFltrList"
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')

data = []
table = soup.find_all('div', attrs={'class': 'providerCard'})
# BUG FIX: the original looped over range(0, len(table) - 1), which stops one
# short and silently skips the last provider card; iterating the result list
# directly covers every card (and handles the empty case for free).
for card in table:
    print(card.find('a')['title'])
and i get what i want.
I would like to replicate the same across all pages available on the website, but given that the url is not changing, I looked at the network (XHR) events happening on the page when clicking on a different page and I tried to send a request using the following code:
# Replays the site's AJAX pagination request; only "p" changes per iteration.
for page_no in range(1, 100):
    data = {
        "blck": "fltrGamesBlk",
        "ajax": "1",
        "lang": "end",
        "p": str(page_no),
        "translit": "The-Best-Slots",
        "tag": "TOP",
        "dt1": "",
        "dt2": "",
        "sorting": "SRANK",
        "cISO": "GB",
        "dt_period": "",
        "rtp_1": "50.00",
        "rtp_2": "100.00",
        "max_exp_1": "2.00",
        "max_exp_2": "250000.00",
        "min_bet_1": "0.01",
        "min_bet_2": "5.00",
        "max_bet_1": "3.00",
        "max_bet_2": "10000.00",
    }
    page = requests.post(
        'https://slotcatalog.com/index.php',
        data=data,
        headers={
            'Host': 'slotcatalog.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0',
        },
    )
    soup = BeautifulSoup(page.content, 'html.parser')
    # KeyError here means no providerCard div matched (see question text).
    for row in soup.find_all('div', attrs={'class': 'providerCard'}):
        print(row.find('a')['title'])
result: ("KeyError: 'title'") - meaning that it's not finding the class "providerCard".
Has the request to the website been done in the wrong way? If so, where should i change the code?
thanks in advance
Alright, so, you had a typo. XD It was this "lang":"end" from the payload but it should have been "lang": "en", among other things.
Anyhow, I've cleaned your code up a bit and it works as expected. You can keep looping for all the games, if you want.
import requests
from bs4 import BeautifulSoup

# Browser-like headers; "x-requested-with" marks the request as AJAX so the
# server returns the game-list fragment instead of the full page.
headers = {
    "referer": "https://slotcatalog.com/en/The-Best-Slots",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
    "x-requested-with": "XMLHttpRequest",
}
# Form payload captured from the site's pagination call; "p" is the page number.
payload = {
    "blck": "fltrGamesBlk",
    "ajax": "1",
    "lang": "en",
    "p": 1,
    "translit": "The-Best-Slots",
    "tag": "TOP",
    "dt1": "",
    "dt2": "",
    "sorting": "SRANK",
    "cISO": "EN",
    "dt_period": "",
    "rtp_1": "50.00",
    "rtp_2": "100.00",
    "max_exp_1": "2.00",
    "max_exp_2": "250000.00",
    "min_bet_1": "0.01",
    "min_bet_2": "5.00",
    "max_bet_1": "3.00",
    "max_bet_2": "10000.00",
}
page = requests.post("https://slotcatalog.com/index.php", data=payload, headers=headers)
soup = BeautifulSoup(page.content, "html.parser")
# One title per game link on the requested page.
titles = [anchor.get("title") for anchor in soup.find_all("a", {"class": "providerName"})]
print(titles)
Output (for page 1 only):
['Starburst', 'Bonanza', 'Rainbow Riches', 'Book of Dead', "Fishin' Frenzy", 'Wolf Gold', 'Twin Spin', 'Slingo Rainbow Riches', "Gonzo's Quest", "Gonzo's Quest Megaways", 'Eye of Horus (Reel Time Gaming)', 'Age of the Gods God of Storms', 'Lightning Roulette', 'Buffalo Blitz', "Fishin' Frenzy Megaways", 'Fluffy Favourites', 'Blue Wizard', 'Legacy of Dead', '9 Pots of Gold', 'Buffalo Blitz II', 'Cleopatra (IGT)', 'Quantum Roulette', 'Reel King Mega', 'Mega Moolah', '7s Deluxe', "Rainbow Riches Pick'n'Mix", "Shaman's Dream"]

Parse URL with python

I would like to parse the following URL :
Espacenet link
and I would like to obtain the URL corresponding to the text :
BATTERY PACK WITH A BUS BAR HAVING NOVEL STRUCTURE
I'm using python but I'm not really familiar with javascript.
How can I can get the job done ?
So far I've seen requests_html and I tried this code :
# Question code: try to render Espacenet's JavaScript-driven result list with
# requests_html, then parse the (non-rendered) response with BeautifulSoup.
from requests_html import HTMLSession
from bs4 import BeautifulSoup
publication_number_to_scrape = "EP2814089"
# NOTE(review): the query string already ends with "query=ep2814089", so the
# concatenation yields "...query=ep2814089EP2814089" — likely unintended.
url = "https://worldwide.espacenet.com/searchResults?ST=singleline&locale=fr_EP&submitted=true&DB=&query=ep2814089" + publication_number_to_scrape
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
headers = {'User-Agent': user_agent}
# create an HTML Session object
session = HTMLSession()
# Use the object above to connect to needed webpage
resp = session.get(url, headers=headers)
print(resp.content)
# Run JavaScript code on webpage
# NOTE(review): render() mutates resp.html in place and returns None, so html2
# is not usable; the soup below is still built from the UNrendered resp.content.
html2 = resp.html.render()
soup = BeautifulSoup(resp.content, 'html.parser')
print(soup)
and in the printed result, I've seen this part :
</li>
<li class="bendractive"><a accesskey="b" href="">Liste de résultats</a></li>
<li class="bendr"><a accesskey="c" class="ptn" href="/mydocumentslist?submitted=true&locale=fr_EP" id="menuPnStar">Ma liste de brevets (<span id="menuPnCount"></span>)</a></li>
<li class="bendr"><a accesskey="d" href="/queryHistory?locale=fr_EP">Historique des requêtes</a></li>
<li class="spacer"></li>
<li class="bendl"><a accesskey="e" href="/settings?locale=fr_EP">Paramètres</a></li>
<li class="bendl last">
<a accesskey="f" href="/help?locale=fr_EP&method=handleHelpTopic&topic=index">Aide</a>
</li>
My goal is to obtain the following URL from the results :
Wanted URL
My final goal is to get a list with the string of each document appearing in that URL:
I don't need the URLs of said documents, only the following list :
result = ['EP2814089 (A4)', 'EP2814089 (B1)', ....]
Use selenium from PyPI https://pypi.org/project/selenium/
and get the id of the element you're interested in, or its XPath.
In your case :
id=publicationId1 or //a[@id='publicationId1']
or xpath=(.//*[normalize-space(text()) and normalize-space(.)='|'])[5]/following::a[2]
I think this will do the job:
# Replays a captured browser request against Espacenet's search endpoint;
# cookies, headers and params were copied from the browser (curl-style export).
import requests
from bs4 import BeautifulSoup
# NOTE(review): the hard-coded JSESSIONID is session-bound and will expire; a
# fresh cookie is presumably needed for the request to keep working.
cookies = {
'JSESSIONID': '9ULYIsd9+RmCkgzGPoLdCWMP.espacenet_levelx_prod_1',
'org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE': 'fr_EP',
'menuCurrentSearch': '%2F%2Fworldwide.espacenet.com%2FsearchResults%3FDB%3D%26ST%3Dsingleline%26locale%3Dfr_EP%26query%3Dep2814089',
'currentUrl': 'https%3A%2F%2Fworldwide.espacenet.com%2FsearchResults%3FDB%3D%26ST%3Dsingleline%26locale%3Dfr_EP%26query%3Dep2814089',
'PGS': '10',
}
# Browser-identifying headers so the server returns the normal HTML page.
headers = {
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
'Sec-Fetch-User': '?1',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-Mode': 'navigate',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'tr,tr-TR;q=0.9',
}
# Query-string parameters for the single-line search form.
params = (
('DB', ''),
('ST', 'singleline'),
('locale', 'fr_EP'),
('query', 'ep2814089'),
)
response = requests.get('https://worldwide.espacenet.com/searchResults', headers=headers, params=params, cookies=cookies)
soup = BeautifulSoup(response.text, 'html.parser')

Encoding Error while scraping a website using Beautiful Soup

I am trying to scrape text from this website. It returns text like this:
डा. भà¥à¤·à¤¬à¤¹à¤¾à¤¦à¥à¤° थापालाठपà¥à¤¤à¥à¤°à¥à¤¶à¥à¤, à¤à¤®à¥à¤°à¤¿à¤à¤¾à¤®à¤¾ तà¥à¤à¤¶à¥à¤°à¥à¤à¥ निधन
instead of:
भारतीय विदेश सचिव गोखले आज नेपाल आउँदै.
Current Code:
headers = {
    'Connection': 'close',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
}

def get_url_soup(url):
    """Download *url* and parse the response body with the lxml parser.

    NOTE: parsing ``url_request.text`` lets requests guess the encoding, which
    is what produces the mojibake the question describes.
    """
    url_request = requests.get(url, headers=headers, allow_redirects=True)
    return BeautifulSoup(url_request.text, 'lxml')

soup = get_url_soup('https://www.onlinekhabar.com/2019/03/753522')
title_card = soup.find('div', {'class': 'nws__title--card'})
Using EncodingDetector:
from bs4.dammit import EncodingDetector

headers = {
    'Connection': 'close',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
}

def get_url_soup(url):
    """Fetch *url* and build a soup using the page's actual declared encoding."""
    url_request = requests.get(url, headers=headers, allow_redirects=True)
    # Trust the HTTP Content-Type charset only when one is explicitly declared.
    http_encoding = url_request.encoding if 'charset' in url_request.headers.get('content-type', '').lower() else None
    # Encoding declared inside the HTML itself (<meta charset=...>).
    html_encoding = EncodingDetector.find_declared_encoding(url_request.content, is_html=True)
    # Prefer the in-document declaration, falling back to the HTTP header.
    encoding = html_encoding or http_encoding
    # Parse the raw bytes with the detected encoding to avoid mojibake.
    return BeautifulSoup(url_request.content, 'lxml', from_encoding=encoding)

soup = get_url_soup('https://www.onlinekhabar.com/2019/03/753522')
title_card = soup.find('div', {'class': 'nws__title--card'})
print(title_card.text)
OUTPUT:
होमपेज /
समाचार /
राष्ट्रिय समाचार
भारतीय विदेश सचिव गोखले आज नेपाल आउँदै
प्रधानमन्त्रीलगायत शीर्ष नेतासँग भेट्ने
.
.
.

Categories