Web scraping using Python - python

I'm trying to get data for a list of companies (currently testing with only one) from a website. I'm not sure how to locate the score I want, because I can only find the formatting elements rather than the actual data. Could someone please help?
from selenium import webdriver
import time
from selenium.webdriver.support.select import Select

# Raw string avoids '\w' etc. being treated as escape sequences in the
# Windows path.  (Selenium 4 deprecated executable_path in favour of
# Service; kept here for compatibility with the original snippet.)
driver = webdriver.Chrome(executable_path=r'C:\webdrivers\chromedriver.exe')
driver.get('https://www.refinitiv.com/en/sustainable-finance/esg-scores')
driver.maximize_window()
time.sleep(1)

# XPath attribute tests use '@', not '#'.  The lookup itself can raise
# NoSuchElementException when the banner is absent, so it belongs inside
# the try block too.
try:
    cookie = driver.find_element(
        "xpath", '//button[@id="onetrust-accept-btn-handler"]')
    cookie.click()
except Exception:
    # Cookie banner may not be shown; best-effort dismissal only.
    pass

company_name = driver.find_element("id", 'searchInput-1')
company_name.click()
company_name.send_keys('Jumbo SA')
time.sleep(1)

search = driver.find_element(
    "xpath", '//button[@class="SearchInput-searchButton"]')
search.click()
time.sleep(2)

# find_elements returns WebElement objects; printing them shows only the
# session/element ids.  Read .text to get the visible score ("42").
for score_element in driver.find_elements("xpath", '//div[@class="fiscal-year"]'):
    print(score_element.text)
That's what I have so far. I want the number "42" to be returned in my results, but instead I get the following:
[<selenium.webdriver.remote.webelement.WebElement (session="bffa2fe80dd3785618b5c52d7087096d", element="62eaf2a8-d1a2-4741-8374-c0f970dfcbfe")>]
My issue is that the locator is not working.
//div[#class="fiscal-year"] = This part I think is wrong - but I am not sure what I need to pick from the website.
Website Screenshot

please use requests look at this example:
import requests

# Endpoint that returns the company-name -> RIC-code suggestion list as JSON.
url = "https://www.refinitiv.com/bin/esg/esgsearchsuggestions"

# A plain GET carries no body, so requests.get with no data argument is
# the idiomatic call (the original sent an empty payload explicitly).
response = requests.get(url)
print(response.text)
so this returns something like this:
[
{
"companyName": "GEK TERNA Holdings Real Estate Construction SA",
"ricCode": "HRMr.AT"
},
{
"companyName": "Mytilineos SA",
"ricCode": "MYTr.AT"
},
{
"companyName": "Hellenic Telecommunications Organization SA",
"ricCode": "OTEr.AT"
},
{
"companyName": "Jumbo SA",
"ricCode": "BABr.AT"
},
{
"companyName": "Folli Follie Commercial Manufacturing and Technical SA",
"ricCode": "HDFr.AT"
},
  ...
]
Here we can see the text and the code behind it so for Jumbo SA its BABr.AT. Now with this info lets get the data:
import requests

# Endpoint that returns the full ESG score breakdown for one company.
url = "https://www.refinitiv.com/bin/esg/esgsearchresult"
querystring = {"ricCode": "BABr.AT"}  # supply the company code
headers = {"cookie": "encaddr=NeVecfNa7%2FR1rLeYOqY57g%3D%3D"}

# GET with query parameters only; no request body is needed, so the
# empty payload from the original snippet is dropped.
response = requests.get(url, headers=headers, params=querystring)
print(response.text)
Now we see the response is in json:
{
"industryComparison": {
"industryType": "Specialty Retailers",
"scoreYear": "2020",
"rank": "162",
"totalIndustries": "281"
},
"esgScore": {
"TR.TRESGCommunity": {
"score": 24,
"weight": 0.13
},
"TR.TRESGInnovation": {
"score": 9,
"weight": 0.05
},
"TR.TRESGHumanRights": {
"score": 31,
"weight": 0.08
},
"TR.TRESGShareholders": {
"score": 98,
"weight": 0.08
},
"TR.SocialPillar": {
"score": 43,
"weight": 0.42999998
},
"TR.TRESGEmissions": {
"score": 19,
"weight": 0.08
},
"TR.TRESGManagement": {
"score": 47,
"weight": 0.26
},
"TR.GovernancePillar": {
"score": 53,
"weight": 0.38999998569488525
},
"TR.TRESG": {
"score": 42,
"weight": 1
},
"TR.TRESGWorkforce": {
"score": 52,
"weight": 0.1
},
"TR.EnvironmentPillar": {
"score": 20,
"weight": 0.19
},
"TR.TRESGResourceUse": {
"score": 30,
"weight": 0.06
},
"TR.TRESGProductResponsibility": {
"score": 62,
"weight": 0.12
},
"TR.TRESGCSRStrategy": {
"score": 17,
"weight": 0.05
}
}
}
Now you can get the data you want without using Selenium. This way it's faster, easier, and more reliable.
Please accept this as an answer.

Related

How to search for a specific key in a json file in python

I am solving a Python problem in which I have to request a JSON file with the current status of Bitcoin and fetch the current USD price from it. I am stuck on the part where I have to fetch the rate from the JSON file: it is nested deep inside several dictionaries that contain a lot of data, and I don't know how to get a specific key from them ...
here is my code sample (currently I just want to fetch the USD rate and print it)
import sys
import requests

# Validate the command-line argument before hitting the API.
if len(sys.argv) == 1:
    sys.exit("missing argument!")
try:
    # float() raises ValueError for non-numeric text, so the original
    # `type(float(sys.argv[1])) != float` test could never be true --
    # it would crash before the comparison ran.  EAFP handles it cleanly.
    amount = float(sys.argv[1])
except ValueError:
    sys.exit("argument is not number")

response = requests.get("https://api.coindesk.com/v1/bpi/currentprice.json")
o = response.json()

# Drill through the nested dicts: bpi -> USD -> rate.
# (The unused `from urllib import response` import was removed: it
# shadowed the `response` variable assigned above.)
print(o["bpi"]["USD"]["rate"])
and here an example of json file...
{
"time": {
"updated": "Aug 16, 2022 18:13:00 UTC",
"updatedISO": "2022-08-16T18:13:00+00:00",
"updateduk": "Aug 16, 2022 at 19:13 BST"
},
"disclaimer": "This data was produced from the CoinDesk Bitcoin Price Index (USD). Non-USD currency data converted using hourly conversion rate from openexchangerates.org",
"chartName": "Bitcoin",
"bpi": {
"USD": {
"code": "USD",
"symbol": "$",
"rate": "23,960.8828",
"description": "United States Dollar",
"rate_float": 23960.8828
},
"GBP": {
"code": "GBP",
"symbol": "£",
"rate": "20,021.5220",
"description": "British Pound Sterling",
"rate_float": 20021.522
},
"EUR": {
"code": "EUR",
"symbol": "€",
"rate": "23,341.3982",
"description": "Euro",
"rate_float": 23341.3982
}
}
}
you can use the keys as indices.
o["bpi"]["USD"]["rate"]

I need help navigating through the api in python

I need my little program to give me the current price that is listed in the api using python.
but the problem is I don't know how to get the price from the API.
This is what the api looks like:
{
"data": {
"1": {
"id": 1,
"name": "Bitcoin",
"symbol": "BTC",
"website_slug": "bitcoin",
"rank": 1,
"circulating_supply": 18353462,
"total_supply": 17418787,
"max_supply": 21000000,
"quotes": {
"USD": {
"price": 8856.88527092981,
"volume_24h": 7633930889.83218,
"market_cap": 165283232374.455,
"percentage_change_1h": 0.08,
"percentage_change_24h": 6.62,
"percentage_change_7d": 19.68
}
},
"last_updated": 1588256701
}
},
"metadata": {
"timestamp": 1588256701,
"num_cryptocurrencies": 392,
"error": null
}
}
And here is my python script:
import requests
from time import sleep


def getBitcoinPrice():
    """Return the current BTC price in USD from the alternative.me ticker.

    Returns None (after printing "Error") when the connection fails,
    matching the original snippet's fall-through behavior.
    """
    URL = 'https://api.alternative.me/v2/ticker/1/'
    try:
        r = requests.get(URL)
        # r.json is a method and must be called; the original passed the
        # bound method itself to json.loads.  The price sits under
        # data -> "1" -> quotes -> USD -> price (see the sample response).
        return float(r.json()['data']['1']['quotes']['USD']['price'])
    except requests.ConnectionError:
        print("Error")


while True:
    print(str(getBitcoinPrice()))
    sleep(10)

Error in SQL syntax after sending post request to web page form in python

I need to post some json data here: https://online.superpoistenie.sk/pzp/kalkulacka/
the webpage will proceed data and show results
Basicly you insert some info and the webpage will generate all possible prices from different brands for your insurance specificly car.
Basicly i want to
post json data to page
get back answer from page in json
best result will be if i will get name of company name of the offer and price
I am only interested in cars.
I tried to do this in Node.js with axios but didn't get far (I was new to Node.js). Now I am using Python and the requests library and I am getting a reasonable response — I just can't finish it.
Here is my code:
import requests
import json
import pprint

url = "https://online.superpoistenie.sk/pzp/kalkulacka/"
json_file = "C:/Users/GC/Desktop/web crawler poistovne/Json_Storage"
json_head = "/predvolba.json"
payload_path = json_file + json_head

# Use a context manager so the file handle is always closed, and
# json.load to parse straight from the file object.
with open(payload_path, 'r') as f:
    json_data = json.load(f)
pprint.pprint(json_data)

try:
    # headers must be a mapping (or omitted) -- the original passed "".
    # json= makes requests serialize the body and set the
    # application/json Content-Type; data= with a list of dicts cannot
    # be form-encoded and fails.
    r = requests.post(url, json=json_data)
    pprint.pprint(r.status_code)
except EnvironmentError as er:
    print('Decoding json failed')
    print(er)

r = requests.get(url)
print(r)  # will output code 200
r = requests.post(url, json=json_data)
print(r)  # will output code 200
print(r.content)
The output is an HTML page with error codes in it, meaning I messed something up in the JSON, but I don't know what. I want the HTML of the page shown after the form is submitted, and I want to filter that HTML down to the parts containing the company names and prices.
Thanks so much for any answer!
This is the JSON I am sending to the site:
[
{
"comp838":0
},
{
"comp839":null
},
{
"typ_vozidla":"1"
},
{
"len_pzp_check":true
},
{
"comp773":5
},
{
"comp771":1
},
{
"comp770":"motorka"
},
{
"comp769":"do+3%2C5+t"
},
{
"comp143":"BMW"
},
{
"comp144":"rad%2F%F8ada+2+Cabriolet"
},
{
"comp763": "BMW"
},
{
"comp772":"-vyberte-zo-zoznamu-"
},
{
"comp167":"Benz%EDn"
},
{
"comp678":"%28F23%29+%282014-%29+Benz%EDn&prevedenie=2-dv.%2C+4-m.%2C+kabriolet%2C+osob."
},
{
"comp600":"2-dv.%2C+4-m.%2C+kabrilet%2C+osob."
},
{
"comp677":"3.0+%282998+ccm%2C+250+kW%2C+turbo+29"
},
{
"comp147":"250"
},
{
"comp148":"2998"
},
{
"comp149":"2000"
},
{
"comp157":"1"
},
{
"comp779":"Adam"
},
{
"comp780":"Tilingerov"
},
{
"comp781":"Adam"
},
{
"comp782":"Tilingerov"
},
{
"comp158":"1982"
},
{
"comp150":"90028"
},
{
"comp151":"90028--Z%E1lesie%ESC%%EBA"
},
{
"comp212":"0947588699"
},
{
"comp211":"learningpotential2%40gmail.com"
},
{
"comp205":"1"
},
{
"comp286":"1"
},
{
"comp287":"1"
},
{
"comp810":"1"
},
{
"comp288":"0"
},
{
"comp591":"0"
},
{
"comp401":"Nie"
},
{
"comp806": "1"
},
{
"typ_vozidla":1
},
{
"len_pzp_check":true
},
{
"tel_pov":22
},
{
"odoslany_kasko":3
},
{
"scenar":1
},
{
"zmena":false
}
]

Website always hangs using python requests library

I am trying to use the python requests library to get the html from this url https://www.adidas.com/api/products/EF2302/availability?sitePath=us
However every time I run my code it hangs when making the get request
# Fetch the availability JSON for one product.
# NOTE(review): BASE_REQUEST_HEADER is defined elsewhere in the program --
# presumably a dict of browser-like headers (User-Agent etc.); confirm it
# is a mapping, since requests.get(headers=...) requires one.
header = BASE_REQUEST_HEADER
url = 'https://www.adidas.com/api/products/EF2302/availability?sitePath=us'
r = requests.get(url, headers = header)
I checked the network tab in chrome and copied all the headers used including user agent so that is not the issue. I was also able to load the page in chrome with javascript and cookies disabled.
This code works fine with other websites. I simply can't get a response from any of the Adidas sites (including https://www.adidas.com/us).
Any suggestions are greatly appreciated.
This site doesn't like the default User-Agent field supplied by requests, change it to Firefox/Chrome (I chose Firefox in my example), and you can read data successfully:
import requests
import json

# A browser-like User-Agent; the site drops requests' default UA string.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
url = 'https://www.adidas.com/api/products/EF2302/availability?sitePath=us'

r = requests.get(url, headers=headers)
# Response.json() parses the body directly; no need for json.loads(r.text).
# (The unused BeautifulSoup import was removed.)
json_data = r.json()
print(json.dumps(json_data, indent=4))
Prints:
{
"id": "EF2302",
"availability_status": "PREORDER",
"variation_list": [
{
"sku": "EF2302_530",
"availability": 15,
"availability_status": "PREORDER",
"size": "4",
"instock_date": "2018-08-16T00:00:00.000Z"
},
{
"sku": "EF2302_550",
"availability": 15,
"availability_status": "PREORDER",
"size": "5",
"instock_date": "2018-08-16T00:00:00.000Z"
},
{
"sku": "EF2302_570",
"availability": 15,
"availability_status": "PREORDER",
"size": "6",
"instock_date": "2018-08-16T00:00:00.000Z"
},
{
"sku": "EF2302_590",
"availability": 15,
"availability_status": "PREORDER",
"size": "7",
"instock_date": "2018-08-16T00:00:00.000Z"
},
{
"sku": "EF2302_610",
"availability": 15,
"availability_status": "PREORDER",
"size": "8",
"instock_date": "2018-08-16T00:00:00.000Z"
},
{
"sku": "EF2302_630",
"availability": 15,
"availability_status": "PREORDER",
"size": "9",
"instock_date": "2018-08-16T00:00:00.000Z"
},
{
"sku": "EF2302_650",
"availability": 15,
"availability_status": "PREORDER",
"size": "10",
"instock_date": "2018-08-16T00:00:00.000Z"
},
{
"sku": "EF2302_670",
"availability": 15,
"availability_status": "PREORDER",
"size": "11",
"instock_date": "2018-08-16T00:00:00.000Z"
},
{
"sku": "EF2302_690",
"availability": 15,
"availability_status": "PREORDER",
"size": "12",
"instock_date": "2018-08-16T00:00:00.000Z"
},
{
"sku": "EF2302_710",
"availability": 15,
"availability_status": "PREORDER",
"size": "13",
"instock_date": "2018-08-16T00:00:00.000Z"
}
]
}
One difference is the User-Agent field, which requests sets as
User-Agent: python-requests/2.18.4
Adidas may be just dropping these http requests to stop people abusing their system.
(btw, it also happens for just www.adidas.com)
I reproduced the issue and took a look at wireshark packet sniffer. It seems the http request is good and there is tcp acknowledgement but no http reply.

No response when query elasticsearch with python

I have some code to query specific strings in a field message as below:
"message": "Oct 29 11:38:46 1893 192.168.1.114 TCP_MISS/200 153925 GET http://www.pravda.ru/science/ - DIRECT/185.103.135.90 text/html"
Here is my code:
from elasticsearch import Elasticsearch
import json

client = Elasticsearch(['http://192.168.1.114:9200'])

# Full-text match query against the "message" field of the squid
# access-log index.
response = client.search(
    index="squidlog-2017.10.29",
    body={
        "query": {
            "match": {
                "message": 'GET'
            }
        }
    }
)

for hit in response['hits']['hits']:
    # print is a function in Python 3; the original used the Python 2
    # statement form, which is a syntax error under Python 3.
    print(json.dumps(hit['_source'], indent=4, sort_keys=True))
When I query with specific strings: GET with template above, everything is ok. But when I want to query something about url in message, I don't receive anything, like for the following query:
body={
"query": {
"match": {
"message": 'pravda'
}
}
}
Is there any problem with slashes in my message when I query? Anyone please give me an advice. Thanks.
You might consider using a different tokenizer, which will make the desired search possible. But let me explain why your query does not return you the result in the second case.
standard analyzer and tokenizer
By default standard analyzer consists of standard tokenizer, which will apparently keep the domain name not split by dots. You can try different analyzers and tokenizers with _analyze endpoint, like this:
GET _analyze
{
"text": "Oct 29 11:38:46 1893 192.168.1.114 TCP_MISS/200 153925 GET http://www.pravda.ru/science/ - DIRECT/185.103.135.90 text/html"
}
The response is a list of tokens that ElasticSearch will be using to represent this string while searching. Here it is:
{
"tokens": [
{
"token": "oct",
"start_offset": 0,
"end_offset": 3,
"type": "<ALPHANUM>",
"position": 0
}, ...
{
"token": "http",
"start_offset": 59,
"end_offset": 63,
"type": "<ALPHANUM>",
"position": 11
},
{
"token": "www.pravda.ru",
"start_offset": 66,
"end_offset": 79,
"type": "<ALPHANUM>",
"position": 12
},
{
"token": "science",
"start_offset": 80,
"end_offset": 87,
"type": "<ALPHANUM>",
"position": 13
}, ...
]
}
As you can see, "pravda" is not in the list of tokens, hence you cannot search for it. You can only search for the tokens that your analyzer emits.
Note that "pravda" is part of the domain name, which is analyzed as a single separate token: "www.pravda.ru".
lowercase tokenizer
If you use a different tokenizer — for instance, the lowercase tokenizer — it will emit pravda as a token and it will be possible to search for it:
GET _analyze
{
"tokenizer" : "lowercase",
"text": "Oct 29 11:38:46 1893 192.168.1.114 TCP_MISS/200 153925 GET http://www.pravda.ru/science/ - DIRECT/185.103.135.90 text/html"
}
And the list of tokens:
{
"tokens": [
{
"token": "oct",
"start_offset": 0,
"end_offset": 3,
"type": "word",
"position": 0
}, ...
{
"token": "http",
"start_offset": 59,
"end_offset": 63,
"type": "word",
"position": 4
},
{
"token": "www",
"start_offset": 66,
"end_offset": 69,
"type": "word",
"position": 5
},
{
"token": "pravda",
"start_offset": 70,
"end_offset": 76,
"type": "word",
"position": 6
},
{
"token": "ru",
"start_offset": 77,
"end_offset": 79,
"type": "word",
"position": 7
},
{
"token": "science",
"start_offset": 80,
"end_offset": 87,
"type": "word",
"position": 8
}, ...
]
}
How to define analyzer before indexing?
To be able to search for such tokens, you have to analyze them during the index phase differently. It means to define a different mapping with different analyzer. Like in this example:
PUT yet_another_index
{
"settings": {
"analysis": {
"analyzer": {
"my_custom_analyzer": {
"type": "custom",
"tokenizer": "lowercase"
}
}
}
},
"mappings": {
"my_type": {
"properties": {
"message": {
"type": "text",
"fields": {
"lowercased": {
"type": "text",
"analyzer": "my_custom_analyzer"
}
}
}
}
}
}
}
Here, we first define a custom analyzer with desired tokenizer, and then tell ElasticSearch to index our message field twice via fields feature: implicitly with default analyzer, and explicitly with my_custom_analyzer.
Now we are able to query for the desired token. Request to the original field will give no response:
POST yet_another_index/my_type/_search
{
"query": {
"match": {
"message": "pravda"
}
}
}
"hits": {
"total": 0,
"max_score": null,
"hits": []
}
But the query to the message.lowercased will succeed:
POST yet_another_index/my_type/_search
{
"query": {
"match": {
"message.lowercased": "pravda"
}
}
}
"hits": {
"total": 1,
"max_score": 0.25316024,
"hits": [
{
"_index": "yet_another_index",
"_type": "my_type",
"_id": "AV9u1qZmB9pi5Gaw0rj1",
"_score": 0.25316024,
"_source": {
"message": "Oct 29 11:38:46 1893 192.168.1.114 TCP_MISS/200 153925 GET http://www.pravda.ru/science/ - DIRECT/185.103.135.90 text/html"
}
}
]
}
There are plenty of options, this solution answers the example you provided. Check out different analyzers and tokenizers to find which one suits you more.
Hope that helps!

Categories