Scraping a table with BeautifulSoup and comparing the output with a value - Python

I am trying to scrape a webpage with hourly energy prices. I want to use the data for home automation: if the hourly price <= the baseload price, certain devices should be switched on via MQTT.
I managed to get the baseload price and the hourly prices from their column. However, the output from the column seems to be not one list but 24 separate lists. Is that correct? How do I fix this so that each hourly price can be compared with the baseload price?
import datetime
import pytz
import requests
from bs4 import BeautifulSoup as bs

today_utc = pytz.utc.localize(datetime.datetime.utcnow())
today = today_utc.astimezone(pytz.timezone("Europe/Amsterdam"))
text_today = today.strftime("%y-%m-%d")
print(today)
print(text_today)

yesterday = datetime.datetime.now(tz=pytz.timezone("Europe/Amsterdam")) - datetime.timedelta(1)
text_yesterday = yesterday.strftime("%y-%m-%d")
print(yesterday)
print(text_yesterday)

url_part1 = 'https://www.epexspot.com/en/market-data?market_area=NL&trading_date='
url_part2 = '&delivery_date='
url_part3 = '&underlying_year=&modality=Auction&sub_modality=DayAhead&technology=&product=60&data_mode=table&period=&production_period='
url_text = url_part1 + text_yesterday + url_part2 + text_today + url_part3
print(url_text)

html_text = requests.get(url_text).text
#print(html_text)
soup = bs(html_text, 'lxml')
#print(soup.prettify())

baseload = soup.find_all('div', class_='flex day-1')
for baseload_price in baseload:
    baseload_price = baseload_price.find('span').text.replace(' ', '')
    print(baseload_price)

table = soup.find_all('tr', {'class': "child"})
#print(table)
for columns in table:
    column3 = columns.find_all('td')[3:]
    #print(columns)
    column3_text = [td.text.strip() for td in column3]
    print(column3_text)

In the for columns in table loop you create a new column3_text list on every iteration, which is why you end up with 24 separate lists. If you intend column3_text to be a single list of the next 24 hourly prices, you can replace that loop with this:
column3_text = [column.find_all("td")[3].text.strip() for column in table]
Additionally, if you are going to be comparing the baseload price to the hourly prices, you'll want to convert the strings to floats or Decimals. :)
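For example, a minimal sketch of that conversion and comparison, reusing the variable names from the question (the exact number format on the page is an assumption here):
# Hypothetical continuation: convert the scraped strings to floats and compare
# each hourly price against the baseload price.
baseload_value = float(baseload_price)
hourly_prices = [float(price) for price in column3_text]

for hour, price in enumerate(hourly_prices):
    if price <= baseload_value:
        print(f"hour {hour}: {price} <= {baseload_value} -> switch on")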

You simply need to use join:
column3_text = "".join([td.text.strip() for td in column3])

If you want to compare the values, use pandas.
Here's how:
import datetime
import urllib.parse

import pandas as pd
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0",
}

today = datetime.datetime.today().strftime("%Y-%m-%d")
yesterday = (
    datetime.datetime.today() - datetime.timedelta(days=1)
).strftime("%Y-%m-%d")

url = "https://www.epexspot.com/en/market-data?"
data = {
    "market_area": "NL",
    "trading_date": yesterday,
    "delivery_date": today,
    "underlying_year": "",
    "modality": "Auction",
    "sub_modality": "DayAhead",
    "technology": "",
    "product": "60",
    "data_mode": "table",
    "period": "",
    "production_period": "",
}
query_url = f"{url}{urllib.parse.urlencode(data)}"

with requests.Session() as s:
    s.headers.update(headers)
    response = s.get(query_url).text

    baseload = (
        BeautifulSoup(response, "html.parser")
        .select_one(".day-1 > span:nth-child(1)")
        .text
    )
    print(f"Baseload: {baseload}")

    df = pd.concat(pd.read_html(response, flavor="lxml"), ignore_index=True)
    df.columns = range(df.shape[1])
    df = df.drop(df.columns[[4, 5, 6, 7]], axis=1)

    df['is_higher'] = df[[3]].apply(lambda x: (x >= float(baseload)), axis=1)
    df['price_diff'] = df[[3]].apply(lambda x: (x - float(baseload)), axis=1)

    df = df.set_axis(
        [
            "buy_volume",
            "sell_volume",
            "volume",
            "price",
            "is_higher",
            "price_diff",
        ],
        axis=1,
        copy=False,
    )

    df.insert(
        0,
        "hours",
        [
            f"0{value}:00 - {value + 1}:00" if value < 10
            else f"{value}:00 - {value + 1}:00"
            for value in range(0, 24)
        ],
    )
    print(df)
Output:
Baseload: 144.32
hours buy_volume sell_volume ... price is_higher price_diff
0 00:00 - 1:00 2052.2 3608.7 ... 124.47 False -19.85
1 01:00 - 2:00 2467.8 3408.9 ... 119.09 False -25.23
2 02:00 - 3:00 2536.8 3220.5 ... 116.32 False -28.00
3 03:00 - 4:00 2552.0 3206.5 ... 114.60 False -29.72
4 04:00 - 5:00 2524.4 3010.0 ... 115.07 False -29.25
5 05:00 - 6:00 2542.4 3342.7 ... 123.54 False -20.78
6 06:00 - 7:00 2891.2 3872.2 ... 145.42 True 1.10
7 07:00 - 8:00 3413.2 3811.0 ... 166.40 True 22.08
8 08:00 - 9:00 3399.4 3566.0 ... 168.00 True 23.68
9 09:00 - 10:00 2919.3 3159.4 ... 153.30 True 8.98
10 10:00 - 11:00 2680.2 3611.5 ... 143.35 False -0.97
11 11:00 - 12:00 2646.8 3722.3 ... 141.95 False -2.37
12 12:00 - 13:00 2606.4 3723.3 ... 141.96 False -2.36
13 13:00 - 14:00 2559.7 3232.3 ... 145.96 True 1.64
14 14:00 - 15:00 2544.9 3261.2 ... 155.00 True 10.68
15 15:00 - 16:00 2661.7 3428.0 ... 169.15 True 24.83
16 16:00 - 17:00 3072.2 3529.4 ... 173.36 True 29.04
17 17:00 - 18:00 3593.7 3091.4 ... 192.00 True 47.68
18 18:00 - 19:00 3169.0 3255.4 ... 182.86 True 38.54
19 19:00 - 20:00 2710.1 3630.3 ... 167.96 True 23.64
20 20:00 - 21:00 2896.3 3728.8 ... 147.17 True 2.85
21 21:00 - 22:00 3160.3 3639.2 ... 136.78 False -7.54
22 22:00 - 23:00 3506.2 3196.3 ... 119.90 False -24.42
23 23:00 - 24:00 3343.8 3414.1 ... 100.00 False -44.32
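If you then want to act on this comparison for the home-automation goal from the question, a hedged sketch with the paho-mqtt client could look like this (the broker address, topic and payloads are placeholders and not part of the original answer):
# Hypothetical follow-up: decide on/off per hour and publish it via MQTT.
# Assumes the DataFrame `df` and `baseload` from the code above, plus paho-mqtt
# (1.x style constructor; paho-mqtt 2.x also needs a CallbackAPIVersion argument).
import paho.mqtt.client as mqtt

client = mqtt.Client()
client.connect("192.168.1.10", 1883)  # placeholder broker address

for _, row in df.iterrows():
    # Question's rule: switch on when the hourly price is at or below the baseload.
    payload = "ON" if row["price"] <= float(baseload) else "OFF"
    client.publish("home/energy/cheap_hour", f"{row['hours']} {payload}")

client.disconnect()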

Related

Python Dataframe - Can't replace text with a number

I am working with a bicycle dataset. I want to replace the text values in the 'weather' column with numbers 1 to 4. This field is an object field. I tried all of the following approaches but none of them seems to work.
There is another field called 'season'. If I apply the same code to 'season', it works fine. Please help.
Sample data:
datetime season holiday workingday weather temp atemp humidity windspeed
0 5/10/2012 11:00 Summer NaN 1 Clear + Few clouds 21.32 25.000 48 35.0008
1 6/9/2012 7:00 Summer NaN 0 Clear + Few clouds 23.78 27.275 64 7.0015
2 3/6/2011 20:00 Spring NaN 0 Light Snow, Light Rain 11.48 12.120 100 27.9993
3 10/13/2011 11:00 Winter NaN 1 Mist + Cloudy 25.42 28.790 83 0.0000
4 6/2/2012 12:00 Summer NaN 0 Clear + Few clouds 25.42 31.060 43 23.9994
I tried the following; none of it worked on 'weather', but when I use the same code on the 'season' column it works fine.
test["weather"] = np.where(test["weather"]=="Clear + Few clouds", 1,
(np.where(test["weather"]=="Mist + Cloudy",2,(np.where(test["weather"]=="Light Snow, Light
Rain",3,(np.where(test["weather"]=="Heavy Rain + Thunderstorm",4,0)))))))
PE_weather = [
(train['weather'] == ' Clear + Few clouds '),
(train['weather'] =='Mist + Cloudy') ,
(train['weather'] >= 'Light Snow, Light Rain'),
(train['weather'] >= 'Heavy Rain + Thunderstorm')]
PE_weather_value = ['1', '2', '3','4']
train['Weather'] = np.select(PE_weather, PE_weather_value)
test.loc[test.weather =='Clear + Few clouds', 'weather']='1'
I suggest you make a dictionary to look up the corresponding values and then apply a lookup to the weather column.
weather_lookup = {
    'Clear + Few clouds': 1,
    'Mist + Cloudy': 2,
    'Light Snow, Light Rain': 3,
    'Heavy Rain + Thunderstorm': 4
}

def lookup(w):
    return weather_lookup.get(w, 0)

test['weather'] = test['weather'].apply(lookup)
Output:
datetime season holiday workingday weather temp atemp humidity windspeed
0 5/10/2012 11:00 Summer NaN 1 1 21.32 25.000 48 35.0008
1 6/9/2012 7:00 Summer NaN 0 1 23.78 27.275 64 7.0015
2 3/6/2011 20:00 Spring NaN 0 3 11.48 12.120 100 27.9993
3 10/13/2011 11:00 Winter NaN 1 2 25.42 28.790 83 0.0000
4 6/2/2012 12:00 Summer NaN 0 1 25.42 31.060 43 23.9994
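As a side note, the same lookup can also be written without a helper function by using Series.map; this is just an equivalent alternative, not part of the original answer:
# Map unknown values to NaN, then fill with 0 like the lookup() default above.
test['weather'] = test['weather'].map(weather_lookup).fillna(0).astype(int)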

How to speed up this python script with multiprocessing

I have a script that gets data from a dataframe, uses that data to make a request to a website, finds the exact href with the fuzzywuzzy module and then runs a function to scrape odds. I would like to speed this script up with the multiprocessing module; is that possible?
Date HomeTeam AwayTeam
0 Monday 6 December 2021 20:00 Everton Arsenal
1 Monday 6 December 2021 17:30 Empoli Udinese
2 Monday 6 December 2021 19:45 Cagliari Torino
3 Monday 6 December 2021 20:00 Getafe Athletic Bilbao
4 Monday 6 December 2021 15:00 Real Zaragoza Eibar
5 Monday 6 December 2021 17:15 Cartagena Tenerife
6 Monday 6 December 2021 20:00 Girona Leganes
7 Monday 6 December 2021 19:45 Niort Toulouse
8 Monday 6 December 2021 19:00 Jong Ajax FC Emmen
9 Monday 6 December 2021 19:00 Jong AZ Excelsior
Script
df = pd.read_excel(path)
dates = df.Date
hometeams = df.HomeTeam
awayteams = df.AwayTeam

matches_odds = list()
for i, (a, b, c) in enumerate(zip(dates, hometeams, awayteams)):
    try:
        r = requests.get(f'https://www.betexplorer.com/results/soccer/?year={a.split(" ")[3]}&month={monthToNum(a.split(" ")[2])}&day={a.split(" ")[1]}')
    except requests.exceptions.ConnectionError:
        sleep(10)
        r = requests.get(f'https://www.betexplorer.com/results/soccer/?year={a.split(" ")[3]}&month={monthToNum(a.split(" ")[2])}&day={a.split(" ")[1]}')
    soup = BeautifulSoup(r.text, 'html.parser')
    f = soup.find_all('td', class_="table-main__tt")
    for tag in f:
        match = fuzz.ratio(f'{b} - {c}', tag.find('a').text)
        hour = a.split(" ")[4]
        if hour.split(':')[0] == '23':
            act_hour = '00' + ':' + hour.split(':')[1]
        else:
            act_hour = str(int(hour.split(':')[0]) + 1) + ':' + hour.split(':')[1]
        if match > 70 and act_hour == tag.find('span').text:
            href_id = tag.find('a')['href']
            table = get_odds(href_id)
            matches_odds.append(table)
    print(i, ' of ', len(dates))
PS: The monthToNum function just replaces the month name with its number.
First, make a function out of your loop body, taking i, a, b and c as inputs and returning the tables it scrapes (appending to a global list from the workers would not propagate back to the parent process). Then create a multiprocessing.Pool and submit this function with the proper arguments to the pool; pool.map collects the returned results.
import multiprocessing

df = pd.read_excel(path)
dates = df.Date
hometeams = df.HomeTeam
awayteams = df.AwayTeam

def fetch(data):
    i, (a, b, c) = data
    matches_odds = []  # collect results locally and return them to the parent
    try:
        r = requests.get(f'https://www.betexplorer.com/results/soccer/?year={a.split(" ")[3]}&month={monthToNum(a.split(" ")[2])}&day={a.split(" ")[1]}')
    except requests.exceptions.ConnectionError:
        sleep(10)
        r = requests.get(f'https://www.betexplorer.com/results/soccer/?year={a.split(" ")[3]}&month={monthToNum(a.split(" ")[2])}&day={a.split(" ")[1]}')
    soup = BeautifulSoup(r.text, 'html.parser')
    f = soup.find_all('td', class_="table-main__tt")
    for tag in f:
        match = fuzz.ratio(f'{b} - {c}', tag.find('a').text)
        hour = a.split(" ")[4]
        if hour.split(':')[0] == '23':
            act_hour = '00' + ':' + hour.split(':')[1]
        else:
            act_hour = str(int(hour.split(':')[0]) + 1) + ':' + hour.split(':')[1]
        if match > 70 and act_hour == tag.find('span').text:
            href_id = tag.find('a')['href']
            table = get_odds(href_id)
            matches_odds.append(table)
    print(i, ' of ', len(dates))
    return matches_odds

if __name__ == '__main__':
    num_processes = 20
    with multiprocessing.Pool(num_processes) as pool:
        results = pool.map(fetch, enumerate(zip(dates, hometeams, awayteams)))
    # flatten the per-match lists returned by the workers
    matches_odds = [table for sub in results for table in sub]
Besides, multiprocessing is not the only way to improve the speed. Asynchronous programming can be used as well and is probably a better fit for this I/O-bound scenario, although multiprocessing does the job too (see the sketch below).
If you carefully read the Python multiprocessing documentation, this will be obvious.
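As a rough illustration of that asynchronous route, here is a minimal sketch using asyncio and aiohttp; it is an assumption on my part rather than part of the original answer, and it only shows the concurrent downloading, while parsing would reuse the BeautifulSoup code above:
# Minimal sketch, assuming the same betexplorer result-page URLs as above.
import asyncio
import aiohttp

async def fetch_page(session, url):
    # Download one results page and return its HTML.
    async with session.get(url) as resp:
        return await resp.text()

async def main(urls):
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_page(session, url) for url in urls]
        return await asyncio.gather(*tasks)

# pages = asyncio.run(main(list_of_result_page_urls))  # list_of_result_page_urls is hypothetical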

Python WebScraping FlashScore

I am using the following code to extract the outcome of the matches on FlashScore:
from requests_html import AsyncHTMLSession
from collections import defaultdict
import pandas as pd

url = 'https://www.flashscore.com/football/netherlands/eredivisie/results/'

asession = AsyncHTMLSession()

async def get_scores():
    r = await asession.get(url)
    await r.html.arender()
    return r

results = asession.run(get_scores)
results = results[0]

times = results.html.find("div.event__time")
home_teams = results.html.find("div.event__participant.event__participant--home")
scores = results.html.find("div.event__scores.fontBold")
away_teams = results.html.find("div.event__participant.event__participant--away")
event_part = results.html.find("div.event__part")

dict_res = defaultdict(list)
for ind in range(len(times)):
    dict_res['times'].append(times[ind].text)
    dict_res['home_teams'].append(home_teams[ind].text)
    dict_res['scores'].append(scores[ind].text)
    dict_res['away_teams'].append(away_teams[ind].text)
    dict_res['event_part'].append(event_part[ind].text)

df_res = pd.DataFrame(dict_res)
print(df_res)
This results in the following output:
times home_teams scores away_teams event_part
0 22.01. 20:00 Willem II 1 - 3 Zwolle (1 - 0)
1 17.01. 16:45 Ajax 1 - 0 Feyenoord (1 - 0)
2 17.01. 14:30 Groningen 2 - 2 Twente (0 - 2)
3 17.01. 14:30 Venlo 1 - 1 Heerenveen (0 - 0)
4 17.01. 12:15 Waalwijk 1 - 1 Willem II (1 - 0)
.. ... ... ... ... ...
101 25.10. 20:00 Den Haag 2 - 2 AZ Alkmaar (0 - 1)
102 25.10. 16:45 Waalwijk 2 - 2 Feyenoord (0 - 0)
103 25.10. 14:30 Sparta Rotterdam 1 - 1 Heracles (0 - 0)
104 25.10. 14:30 Vitesse 2 - 1 PSV (1 - 0)
105 25.10. 12:15 Sittard 1 - 3 Groningen (0 - 2)
[106 rows x 5 columns]
However, when you go to the website https://www.flashscore.com/football/netherlands/eredivisie/results/, there is a 'Show more matches' button at the bottom. The output above only contains the first batch of matches, not the additional ones that appear when you click 'Show more matches'. Is it possible to also extract this additional information?

Comparing time in Python

My current project is scraping weather data from websites for a calculation. Part of this calculation involves different logic depending on whether the current time is before or after noon.
import pandas as pd
from bs4 import BeautifulSoup
import requests
import numpy as np
# Arkansas State Plant Board Weather Web data
url1 = "http://170.94.200.136/weather/Inversion.aspx"
response1 = requests.get(url1)
soup1 = BeautifulSoup(response1.content)
table1 = soup1.find("table", id="MainContent_GridView1")
data1 = pd.read_html(str(table1),header=0)[0]
data1.columns = ['Station', 'Low Temp (F)', 'Time of Low', 'Current Temp (F)', 'Current Time', 'Wind Speed (MPH)', 'Wind Dir', 'High Temp (F)', 'Time Of High']
print(url1)
print(data1[0:4])
array1 = np.array(data1[0:4])
This is my code to bring in the data I need. However, I don't know how to compare the current time I request as a Unicode string to see if it is before or after noon. Can anyone help me with this?
Edit: some data from the current request
Station Low Temp (F) Time of Low Current Temp (F) Current Time \
0 Arkansas 69.0 5:19 AM 88.7 2:09 PM
1 Ashley 70.4 4:39 AM 91.2 2:14 PM
2 Bradley 69.4 4:09 AM 90.6 2:14 PM
3 Chicot -40.2 2:14 PM -40.2 2:14 PM
Wind Speed (MPH) Wind Dir High Temp (F) Time Of High
0 4.3 213 88.9 2:04 PM
1 4.1 172 91.2 2:14 PM
2 6.0 203 90.6 2:09 PM
3 2.2 201 -40.1 12:24 AM
Just check if the meridian is PM or AM.
time = "2:09 PM"
meridian = time.split(' ')[-1] # get just the meridian
before_noon = meridian == 'AM'
after_noon = meridian == 'PM'
You can do it like this:
t = pd.to_datetime(data1['Current Time'][0:1][0])
noon = pd.to_datetime("12:00 PM")
if t < noon:
    print("yes")
else:
    print("no")
>>> no
t
>>> Timestamp('2016-07-11 14:04:00')
noon
>>> Timestamp('2016-07-11 12:00:00')

data extraction and its summation using python

I have the following data in a text file called data.txt:
03/05/2016 11:00 50
03/05/2016 11:10 10
03/05/2016 11:20 30
03/05/2016 11:30 40
03/05/2016 11:40 40
03/05/2016 11:50 50
03/05/2016 11:60 70
03/05/2016 12:00 25
03/05/2016 12:10 69
03/05/2016 12:20 25
03/05/2016 12:30 59
03/05/2016 12:40 25
03/05/2016 12:50 29
03/05/2016 12:60 25
I want to perform a calculation such that I obtain the following end result:
03/05/2016 11:00 - 12:00 290
03/05/2016 12:00 - 13:00 257
where this result is stored in another text file, say data1.txt.
Here 290 is the sum of the values from 11:00 to 12:00 and 257 is the sum of the values from 12:00 to 13:00.
I want to write this code in Python 2.7.
How can I achieve this?
UPDATE:
import time
import datetime

while 1:
    final_sensorvalue = 0
    st_time = time.time()
    crntdatetime = 0.0
    while ((time.time() - st_time) < 600.0):
        sensorvalue = 10  # read sensor value
        final_sensorvalue = final_sensorvalue + sensorvalue
        time.sleep(2)
    f = open('data.txt', 'a')
    crntdatetime = datetime.datetime.now()
    timestamp = crntdatetime.strftime("%d/%m/%Y %H:%M")
    outstring = str(timestamp) + " " + str(final_sensorvalue) + "\n"
    print outstring
    f.write(outstring)
    f.close()
    time.sleep(2)
You could convert the lines to Counter objects where the key is the date & hour ('03/05/2016 11') and the value is the number as an int. Then you can add all the Counter objects together, sort the items and write them to a file:
from collections import Counter
import re

with open('test.txt') as f:
    res = sum((Counter({x.group(1): int(x.group(2))})
               for x in (re.search('(.*?):.*\s(\d+)', line) for line in f) if x),
              Counter())

with open('output.txt', 'w') as f:
    f.writelines('{0}:00 - {1}:00 {2}\n'.format(k, int(k.split()[-1]) + 1, v)
                 for k, v in sorted(res.iteritems()))
Contents of output.txt:
03/05/2016 11:00 - 12:00 290
03/05/2016 12:00 - 13:00 257
You can try it like this:
fo = open("data.txt", "r")
lines = fo.readlines()
#print lines
d = {}
for line in lines:
    l = line.split()
    if not l:  # skip any blank lines
        continue
    if int(l[1].split(":")[0]) != 23:
        time = l[1].split(":")[0] + ":00-" + str(int(l[1].split(":")[0]) + 1) + ":00"
    else:
        time = l[1].split(":")[0] + ":00-0:00"
    #key = l[0]+"_"+l[1].split(":")[0]
    key = l[0] + "_" + time
    if key in d:
        d[key] = int(d[key]) + int(l[2])
    else:
        d[key] = int(l[2])
print d
>>>
{'03/05/2016_11:00-12:00': 290, '03/05/2016_12:00-13:00': 257}
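Since the question asks for the result to end up in data1.txt, here is a small sketch of writing the dictionary out in roughly the requested format; the exact line layout is an assumption:
# Write the aggregated sums to data1.txt, one "date hh:00 - hh:00 total" line each.
out = open("data1.txt", "w")
for key in sorted(d):
    date, hours = key.split("_")
    out.write("%s %s %d\n" % (date, hours.replace("-", " - "), d[key]))
out.close()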
