How to web scrape from two websites in one script? - python

I am currently working on a model and need to gather information not just on game results
(from this link: https://www.hltv.org/stats/teams/matches/4991/fnatic?startDate=2019-01-01&endDate=2019-12-31),
but I would also like the script to open another link that is present in the HTML source. That link leads to a page with each match's detailed result
(as in who won which round, e.g. https://www.hltv.org/stats/matches/mapstatsid/89458/cr4zy-vs-fnatic?startDate=2019-01-01&endDate=2019-12-31&contextIds=4991&contextTypes=team). The main objective is to know who won the match (from the first link) and who won the first round of each individual match (from the second link). Is this possible? This is my current script:
import requests
import pandas as pd
from bs4 import BeautifulSoup

r = requests.get('https://www.hltv.org/stats/teams/maps/6665/Astralis')
soup = BeautifulSoup(r.text, 'html.parser')
results = soup.find_all('tr')

AstralisResults = []
for result in results[1:]:
    date = result.contents[1].text
    event = result.contents[3].text
    opponent = result.contents[7].text
    Map = result.contents[9].text
    Score = "'" + result.contents[11].text
    WinorLoss = result.contents[13].text
    AstralisResults.append((date, event, opponent, Map, Score, WinorLoss))

df5 = pd.DataFrame(AstralisResults, columns=['date', 'event', 'opponent', 'Map', 'Score', 'WinorLoss'])
df5.to_csv('AstralisResults.csv', index=False, encoding='utf-8')
So I would be looking for the following information:
Date | Opponent | Map | Score | Result | Round1Result |

Looks like the site blocks you if you scrape too fast, so I had to put in a time delay. There are ways to make this code more efficient, but overall, I think it gets what you asked for:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}

r = requests.get('https://www.hltv.org/stats/teams/matches/4991/fnatic?startDate=2019-01-01&endDate=2019-12-31', headers=headers)
print(r)
soup = BeautifulSoup(r.text, 'html.parser')
results = soup.find_all('tr')

df5 = pd.DataFrame()
cnt = 1
for result in results[1:]:
    print('%s of %s' % (cnt, len(results)-1))
    date = result.contents[1].text
    event = result.contents[3].text
    opponent = result.contents[7].text
    Map = result.contents[9].text
    Score = "'" + result.contents[11].text
    WinorLoss = result.contents[13].text

    round_results = result.find('td', {'class': 'time'})
    link = round_results.find('a')['href']
    r2 = requests.get('https://www.hltv.org' + link, headers=headers)
    soup2 = BeautifulSoup(r2.text, 'html.parser')

    round_history = soup2.find('div', {'class': 'standard-box round-history-con'})
    teams = round_history.find_all('img', {'class': 'round-history-team'})
    teams_list = [x['title'] for x in teams]

    rounds_winners = {}
    n = 1
    row = round_history.find('div', {'class': 'round-history-team-row'})
    for each in row.find_all('img', {'class': 'round-history-outcome'}):
        if 'emptyHistory' in each['src']:
            winner = teams_list[1]
            loser = teams_list[0]
        else:
            winner = teams_list[0]
            loser = teams_list[1]
        rounds_winners['Round%02dResult' % n] = winner
        n += 1

    round_row_df = pd.DataFrame.from_dict(rounds_winners, orient='index').T
    temp_df = pd.DataFrame([[date, event, opponent, Map, Score, WinorLoss]], columns=['date', 'event', 'opponent', 'Map', 'Score', 'WinorLoss'])
    temp_df = temp_df.merge(round_row_df, left_index=True, right_index=True)

    df5 = df5.append(temp_df, sort=True).reset_index(drop=True)
    time.sleep(.5)
    cnt += 1

df5 = df5[['date', 'event', 'opponent', 'Map', 'Score', 'WinorLoss', 'Round01Result']]
df5 = df5.rename(columns={'date': 'Date',
                          'event': 'Event',
                          'WinorLoss': 'Result',
                          'Round01Result': 'Round1Result'})
df5.to_csv('AstralisResults.csv', index=False, encoding='utf-8')
Output:
print (df5.head(10).to_string())
Date Event opponent Map Score Result Round1Result
0 20/07/19 Europe Minor - StarLadder Major 2019 CR4ZY Dust2 '13 - 16 L fnatic
1 20/07/19 Europe Minor - StarLadder Major 2019 CR4ZY Train '13 - 16 L fnatic
2 19/07/19 Europe Minor - StarLadder Major 2019 mousesports Inferno '8 - 16 L mousesports
3 19/07/19 Europe Minor - StarLadder Major 2019 mousesports Dust2 '13 - 16 L fnatic
4 17/07/19 Europe Minor - StarLadder Major 2019 North Train '16 - 9 W fnatic
5 17/07/19 Europe Minor - StarLadder Major 2019 North Nuke '16 - 2 W fnatic
6 17/07/19 Europe Minor - StarLadder Major 2019 Ancient Mirage '16 - 7 W fnatic
7 04/07/19 ESL One Cologne 2019 Vitality Overpass '17 - 19 L Vitality
8 04/07/19 ESL One Cologne 2019 Vitality Mirage '16 - 19 L fnatic
9 03/07/19 ESL One Cologne 2019 Astralis Nuke '6 - 16 L fnatic

Related

cannot scrape ratings

My issue is that I cannot use bs4 to scrape the sub-ratings in its reviews.
So far, I have discovered where these stars are, but their markup is the same regardless of the color (i.e., green or grey), and I need to be able to identify the color to identify the ratings, not just scrape the stars. Below is my code:
import requests
from bs4 import BeautifulSoup

url = 'https://www.glassdoor.com/Reviews/Walmart-Reviews-E715_P2.htm?filter.iso3Language=eng'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
com = soup.find(class_="ratingNumber mr-xsm")
com1 = soup.find(class_="gdReview")
com1_1 = com1.find(class_="content")
For getting the star rating breakdown (which seems to have no numeric display or meta value), I don't think there's any simple and straightforward method, since it's done by CSS in a style tag that is connected to the container element by a class.
You could use something like soup.select('style:-soup-contains(".css-1nuumx7")') (the css-1nuumx7 part is specific to the rating mentioned above), but :-soup-contains needs the html5lib parser and can be a bit slow, so it's better to figure out the data-emotion-css attribute of the style tag instead:
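To illustrate the idea before the full function, here is a minimal, self-contained sketch with made-up, simplified markup (the css-1nuumx7 hash and the div content are invented for the example): the container's css- class points at a style tag whose data-emotion-css attribute carries the same hash, and the green-gradient percentage inside it encodes the rating.
from bs4 import BeautifulSoup

# Hypothetical, simplified markup: a <style> tag keyed by data-emotion-css holds
# the gradient whose percentage encodes the star width of the matching container.
html = '''
<style data-emotion-css="1nuumx7">.css-1nuumx7{background:linear-gradient(90deg,#0caa41 70%,#dee0e3 70%)}</style>
<div class="css-1nuumx7">stars</div>
'''
soup_demo = BeautifulSoup(html, 'html.parser')
star_div = soup_demo.find('div')
demc = [c for c in star_div['class'] if c.startswith('css-')][0].replace('css-', '', 1)
style_text = soup_demo.select_one(f'style[data-emotion-css="{demc}"]').get_text()
percent = float(style_text.split('90deg,#0caa41 ', 1)[1].split('%')[0])
print(percent / 100 * 5)  # 70% of 5 stars -> 3.5
The function below does the same lookup, with error handling for pages where that pattern isn't present: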
def getDECstars(starCont, mSoup, outOf=5, isv=False):
    classList = starCont.get('class', [])
    if type(classList) != list: classList = [classList]
    classList = [str(c) for c in classList if str(c).startswith('css-')]
    if not classList:
        if isv: print('Stars container has no "css-" class')
        return None
    demc = classList[0].replace('css-', '', 1)
    demc_sel = f'style[data-emotion-css="{demc}"]'
    cssStyle = mSoup.select_one(demc_sel)
    if not cssStyle:
        if isv: print(f'Nothing found with selector {demc_sel}')
        return None
    cssStyle = cssStyle.get_text()
    errMsg = ''
    if '90deg,#0caa41 ' not in cssStyle: errMsg += 'No #0caa41'
    if '%' not in cssStyle.split('90deg,#0caa41 ', 1)[-1][:20]:
        errMsg += ' No %'
    if not errMsg:
        rPerc = cssStyle.split('90deg,#0caa41 ', 1)[-1]
        rPerc = rPerc.split('%')[0]
        try:
            rPerc = float(rPerc)
            if 0 <= rPerc <= 100:
                if type(outOf) == int and outOf > 0: rPerc = (rPerc/100)*outOf
                return float(f'{float(rPerc):.3}')
            errMsg = f'{demc_sel} --> "{rPerc}" is out of range'
        except: errMsg = f'{demc_sel} --> cannot convert to float "{rPerc}"'
    if isv: print(f'{demc_sel} --> unexpected format {errMsg}')
    return None
OR, if you don't care so much about why there's a missing rating:
def getDECstars(starCont, mSoup, outOf=5, isv=False):
    try:
        demc = [c for c in starCont.get('class', []) if c[:4]=='css-'][0].replace('css-', '', 1)
        demc_sel = f'style[data-emotion-css="{demc}"]'
        rPerc = float(mSoup.select_one(demc_sel).get_text().split('90deg,#0caa41 ', 1)[1].split('%')[0])
        return float(f'{(rPerc/100)*outOf if type(outOf) == int and outOf > 0 else rPerc:.3}')
    except: return None
Here's an example of how you might use it:
pcCon = 'div.px-std:has(h2 > a.reviewLink) + div.px-std'
pcDiv = f'{pcCon} div.v2__EIReviewDetailsV2__fullWidth'
refDict = {
    'rating_num': 'span.ratingNumber',
    'emp_status': 'div:has(> div > span.ratingNumber) + span',
    'header': 'h2 > a.reviewLink',
    'subheader': 'h2:has(> a.reviewLink) + span',
    'pros': f'{pcDiv}:first-of-type > p.pb',
    'cons': f'{pcDiv}:nth-of-type(2) > p.pb'
}
subRatSel = 'div:has(> .ratingNumber) ~ aside ul > li:has(div ~ div)'

empRevs = []
for r in soup.select('li[id^="empReview_"]'):
    rDet = {'reviewId': r.get('id')}
    for sr in r.select(subRatSel):
        k = sr.select_one('div:first-of-type').get_text(' ').strip()
        sval = getDECstars(sr.select_one('div:nth-of-type(2)'), soup)
        rDet[f'[rating] {k}'] = sval
    for k, sel in refDict.items():
        sval = r.select_one(sel)
        if sval: sval = sval.get_text(' ').strip()
        rDet[k] = sval
    empRevs.append(rDet)
If empRevs is viewed as a table:
| reviewId | [rating] Work/Life Balance | [rating] Culture & Values | [rating] Diversity & Inclusion | [rating] Career Opportunities | [rating] Compensation and Benefits | [rating] Senior Management | rating_num | emp_status | header | subheader | pros | cons |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| empReview_71400593 | 5 | 4 | 4 | 4 | 5 | 3 | 3 |  | great pay but bit of obnoxious enviornment | Nov 26, 2022 - Sales Associate/Cashier in Bensalem, PA | -Walmart's fair pay policy is ... | -some locations wont build emp... |
| empReview_70963705 | 3 | 3 | 2 | 2 | 2 | 2 | 2 | Former Employee | Walmart Employees Trained Thrown to the Wolves | Nov 10, 2022 - Data Entry | Getting a snack at break was e... | I worked at Walmart for a very... |
| empReview_71415031 | 4 | 4 | 4 | 4 | 4 | 4 | 5 | Current Employee, more than 1 year | Work | Nov 27, 2022 - Warehouse Associate in Springfield, GA | The money there is good during... | It can get stressful at times ... |
| empReview_69136451 | nan | nan | nan | nan | nan | nan | 4 | Current Employee | Walmart | Sep 16, 2022 - Sales Associate/Cashier | I'm a EXPERIENCED WORKER. I ✨... | In my opinion I believe that W... |
| empReview_71398525 | 4 | 3 | 4 | 3 | 4 | 3 | 4 | Current Employee | Depends heavily on your team | Nov 26, 2022 - Personal Digital Shopper | I have a generally excellent t... | Generally, departments are sho... |
| empReview_71227029 | 1 | 1 | 1 | 1 | 3 | 1 | 1 | Former Employee, less than 1 year | Managers are treated like a slave. | Nov 19, 2022 - Auto Care Center Manager (ACCM) in Cottonwood, AZ | Great if you like working with... | you only get to work in your a... |
| empReview_71329467 | 1 | 3 | 3 | 3 | 4 | 1 | 1 | Current Employee, more than 3 years | No more values | Nov 23, 2022 - GM Coach in Houston, TX | Pay compare to other retails a... | Walmart is not a bad company t... |
| empReview_71512609 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | Former Employee | Walmart midnight stocker | Nov 30, 2022 - Midnight Stocker in Taylor, MI | 2 paid 15 min breaks and 1 hou... | Honestly nothing that I can th... |
| empReview_70585957 | 3 | 4 | 4 | 4 | 4 | 4 | 4 | Former Employee | Lots of Opportunity | Oct 28, 2022 - Human Resources People Lead | Plenty of opportunities if one... | As with any job, management is... |
| empReview_71519435 | 3 | 4 | 4 | 5 | 4 | 4 | 5 | Current Employee, more than 3 years | Lot of work but worth it | Nov 30, 2022 - People Lead | I enjoy making associates live... | Sometimes an overwhelming amou... |
Markdown for the table above was printed with pandas:
erdf = pandas.DataFrame(empRevs).set_index('reviewId')
erdf['pros'] = [p[:30] + '...' if len(p) > 33 else p for p in erdf['pros']]
erdf['cons'] = [p[:30] + '...' if len(p) > 33 else p for p in erdf['cons']]
print(erdf.to_markdown())

How to speed up this python script with multiprocessing

I have a script that gets data from a dataframe, uses those data to make a request to a website, uses the fuzzywuzzy module to find the exact href, and then runs a function to scrape odds. I would like to speed up this script with the multiprocessing module. Is that possible?
Date HomeTeam AwayTeam
0 Monday 6 December 2021 20:00 Everton Arsenal
1 Monday 6 December 2021 17:30 Empoli Udinese
2 Monday 6 December 2021 19:45 Cagliari Torino
3 Monday 6 December 2021 20:00 Getafe Athletic Bilbao
4 Monday 6 December 2021 15:00 Real Zaragoza Eibar
5 Monday 6 December 2021 17:15 Cartagena Tenerife
6 Monday 6 December 2021 20:00 Girona Leganes
7 Monday 6 December 2021 19:45 Niort Toulouse
8 Monday 6 December 2021 19:00 Jong Ajax FC Emmen
9 Monday 6 December 2021 19:00 Jong AZ Excelsior
Script
df = pd.read_excel(path)
dates = df.Date
hometeams = df.HomeTeam
awayteams = df.AwayTeam
matches_odds = list()
for i, (a, b, c) in enumerate(zip(dates, hometeams, awayteams)):
    try:
        r = requests.get(f'https://www.betexplorer.com/results/soccer/?year={a.split(" ")[3]}&month={monthToNum(a.split(" ")[2])}&day={a.split(" ")[1]}')
    except requests.exceptions.ConnectionError:
        sleep(10)
        r = requests.get(f'https://www.betexplorer.com/results/soccer/?year={a.split(" ")[3]}&month={monthToNum(a.split(" ")[2])}&day={a.split(" ")[1]}')
    soup = BeautifulSoup(r.text, 'html.parser')
    f = soup.find_all('td', class_="table-main__tt")
    for tag in f:
        match = fuzz.ratio(f'{b} - {c}', tag.find('a').text)
        hour = a.split(" ")[4]
        if hour.split(':')[0] == '23':
            act_hour = '00' + ':' + hour.split(':')[1]
        else:
            act_hour = str(int(hour.split(':')[0]) + 1) + ':' + hour.split(':')[1]
        if match > 70 and act_hour == tag.find('span').text:
            href_id = tag.find('a')['href']
            table = get_odds(href_id)
            matches_odds.append(table)
    print(i, ' of ', len(dates))
PS: The monthToNum function just converts the month name to its number.
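The original monthToNum isn't shown; a plausible stand-in (just an assumption so the snippet above can run) could be:
# Hypothetical stand-in for the unshown monthToNum helper: maps an English
# month name to its 1-based number.
def monthToNum(name):
    months = ['January', 'February', 'March', 'April', 'May', 'June',
              'July', 'August', 'September', 'October', 'November', 'December']
    return months.index(name) + 1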
First, you make a function of your loop body with inputs i, a, b and c. Then, you create a multiprocessing.Pool and submit this function with the proper arguments (i, a, b, c) to the pool.
import multiprocessing

df = pd.read_excel(path)
dates = df.Date
hometeams = df.HomeTeam
awayteams = df.AwayTeam

def fetch(data):
    i, (a, b, c) = data
    # collect results locally; appending to a global list from a worker process
    # would not propagate back to the parent process
    local_odds = []
    try:
        r = requests.get(f'https://www.betexplorer.com/results/soccer/?year={a.split(" ")[3]}&month={monthToNum(a.split(" ")[2])}&day={a.split(" ")[1]}')
    except requests.exceptions.ConnectionError:
        sleep(10)
        r = requests.get(f'https://www.betexplorer.com/results/soccer/?year={a.split(" ")[3]}&month={monthToNum(a.split(" ")[2])}&day={a.split(" ")[1]}')
    soup = BeautifulSoup(r.text, 'html.parser')
    f = soup.find_all('td', class_="table-main__tt")
    for tag in f:
        match = fuzz.ratio(f'{b} - {c}', tag.find('a').text)
        hour = a.split(" ")[4]
        if hour.split(':')[0] == '23':
            act_hour = '00' + ':' + hour.split(':')[1]
        else:
            act_hour = str(int(hour.split(':')[0]) + 1) + ':' + hour.split(':')[1]
        if match > 70 and act_hour == tag.find('span').text:
            href_id = tag.find('a')['href']
            table = get_odds(href_id)
            local_odds.append(table)
    print(i, ' of ', len(dates))
    return local_odds

if __name__ == '__main__':
    num_processes = 20
    with multiprocessing.Pool(num_processes) as pool:
        results = pool.map(fetch, enumerate(zip(dates, hometeams, awayteams)))
    # flatten the per-match lists returned by the workers
    matches_odds = [t for sub in results for t in sub]
Besides, multiprocessing is not the only way to improve the speed. Asynchronous programming can be used as well and is probably better for this scenario, although multiprocessing does the job, too - just want to mention that.
If you read the Python multiprocessing documentation carefully, this pattern will be clear.
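Since the work here is almost entirely waiting on the network, a thread pool is a closely related alternative to the asynchronous approach mentioned above; a minimal sketch, reusing the fetch function, dates, hometeams and awayteams defined in the snippet above:
from concurrent.futures import ThreadPoolExecutor

# Threads avoid the process start-up and pickling overhead and are usually
# sufficient for network-bound scraping like this.
if __name__ == '__main__':
    with ThreadPoolExecutor(max_workers=20) as executor:
        results = list(executor.map(fetch, enumerate(zip(dates, hometeams, awayteams))))
    matches_odds = [t for sub in results for t in sub]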

Not clickable point (Selenium)

I'm trying to click the "Show more" button, but I can't.
Any help? Thank you very much.
from selenium import webdriver
import time
driver = webdriver.Chrome()
driver.get('https://www.scorespro.com/basketball/china/cba/results/')
time.sleep(2)
showmore = driver.find_element_by_link_text("Show more")
showmore.click()
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
Dividing the height by some number decreases the scroll distance, so the page stops where "Show more" is visible:
from selenium import webdriver
import time
driver = webdriver.Chrome()
driver.get('https://www.scorespro.com/basketball/china/cba/results/')
driver.execute_script("window.scrollTo(0, document.body.scrollHeight/1.35);")
time.sleep(2)
driver.find_element_by_class_name("show_more").click()
time.sleep(2)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
If you are after the tables, you don't need to use Selenium. You can pull the data straight away with requests and parse it with pandas.
To find the request, go to the page, right-click and choose 'Inspect' (or Shift-Ctrl-I). This opens a side panel; go to Network and XHR and browse those requests (click on Preview to see what each one returns). You may need to 1) reload the page and 2) click around the table. For example, once I clicked "show more" at the bottom of the table, the request popped up.
Once you find it, click on Headers and you'll see the url and payload:
import requests
import pandas as pd

url = 'https://www.scorespro.com/basketball/ajaxdata_more.php'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Mobile Safari/537.36'}

dfList = []
for season in ['2018-2019', '2019-2020']:
    continueLoop = True
    page = 1
    while continueLoop == True:
        print('%s Collecting Page: %s' % (season, page))
        payload = {
            'country': 'china',
            'comp': 'cba',
            'season': season,
            'status': 'results',
            'league': '',
            'page': '%s' % page}
        response = requests.get(url, headers=headers, params=payload)
        try:
            dfs = pd.read_html(response.text)
        except ValueError:
            print('No more tables found.')
            continueLoop = False
            continue  # nothing to extend on the last, empty page
        page += 1
        dfList.extend(dfs)

dfList_single = []
cols = ['Date', 'Final Time', 'Team', 'Final Score', 'Q1', 'Q2', 'Q3', 'Q4', 'OT', 'Half Time Score', 'Combined Total Score']
for each in dfList:
    each = each.loc[:, [0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12]]
    each.columns = cols
    teamA = each.iloc[0, :]
    teamB = each.iloc[1, 2:]
    temp_df = pd.concat([teamA, teamB], axis=0).to_frame().T
    dfList_single.append(temp_df)

df = pd.concat(dfList_single)
df = df.reset_index(drop=True)
Output:
print(df.head(10).to_string())
Date Final Time Team Final Score Q1 Q2 Q3 Q4 Half Time Score Combined Total Score
0 15.08.20 15:00 FT Guandong 123 26 28 41 28 54 238
1 15.08.20 15:00 FT Liaoning 115 29 18 39 29 47 238
2 13.08.20 15:00 FT Liaoning 115 34 24 23 34 58 228
3 13.08.20 15:00 FT Guandong 113 38 28 27 20 66 228
4 11.08.20 15:00 FT Guandong 110 25 30 25 30 55 198
5 11.08.20 15:00 FT Liaoning 88 24 26 23 15 50 198
6 08.08.20 15:00 FT Guandong 88 16 21 26 25 37 173
7 08.08.20 15:00 FT Beijing 85 13 24 20 28 37 173
8 07.08.20 15:00 FT Liaoning 119 22 40 29 28 62 232
9 07.08.20 15:00 FT Xinjiang 113 33 22 34 24 55 232
Two issues with your code:
First, you are trying to scroll after clicking, whereas it should be before. Second, you are using the height of the screen, which may work on one device but not on another if the size varies.
A better way is to scroll to the element itself and then click. See the code below; it worked fine:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(r'..\drivers\chromedriver')
driver.get("https://www.scorespro.com/basketball/china/cba/results/")
driver.maximize_window()
# I had to accept cookies on the page, hence the step below
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//a[text()='Agree']"))).click()
showMore = driver.find_element_by_link_text("Show more")
driver.execute_script("arguments[0].scrollIntoView();", showMore)
time.sleep(2)
showMore.click()
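A variation on the same idea, sketched with explicit waits instead of fixed time.sleep calls (it assumes the same page and the same "Show more" link text as above):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://www.scorespro.com/basketball/china/cba/results/")
# wait for the link, scroll it into view, then wait until it is clickable
show_more = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.LINK_TEXT, "Show more")))
driver.execute_script("arguments[0].scrollIntoView();", show_more)
WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.LINK_TEXT, "Show more"))).click()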

Python splitting strings and convert them to a list that notices empty fields

I spent the whole day trying to fix this problem but didn't find a solution, so I hope you can help me. I can already extract the data from the website, but I don't know how to split the list so that "500g" becomes "500, g". The problem is that on the website the quantity is sometimes just "1" and sometimes something like "1 1/2 kg". I then need to write it to a CSV file and load it into a MySQL database. What I want at the end is a CSV file with the columns: ingredient ID, ingredient, quantity, and the unit of the quantity, for example:
0, meat, 500, g. This is the code I already have to extract the data from the website:
import re
import csv
import requests
from bs4 import BeautifulSoup

urls_recipes = ['https://www.chefkoch.de/rezepte/3039711456645264/Ossobuco-a-la-Milanese.html']
mainurl = "https://www.chefkoch.de/rs/s0e1n1z1b0i1d1,2,3/Rezepte.html"
urls_urls = []
ingredients = []
menge = []

def read_recipes():
    for url, id2 in zip(urls_recipes, range(len(urls_recipes))):
        soup2 = BeautifulSoup(requests.get(url).content, "lxml")
        for ingredient in soup2.select('.td-left'):
            menge.append([*[re.sub(r'\s{2,}', ' ', ingredient.get_text(strip=True))]])
        for ingredient in soup2.select('.recipe-ingredients h3, .td-right'):
            if ingredient.name == 'h3':
                ingredients.append([id2, *[ingredient.get_text(strip=True)]])
            else:
                ingredients.append([id2, *[re.sub(r'\s{2,}', ' ', ingredient.get_text(strip=True))]])

read_recipes()
I hope you can help me. Thank you!
It appears that the strings containing fractions use the unicode symbols for 1/2 etc., so a good way of starting is to replace those by looking up the specific code point and passing it to str.replace(). Splitting the unit from the amount was easy for this example, since they are separated by a space, but it might be necessary to generalize this if you encounter other combinations.
The following code works for this specific example:
import re
import csv
import requests
import pandas as pd
from bs4 import BeautifulSoup

urls_recipes = ['https://www.chefkoch.de/rezepte/3039711456645264/Ossobuco-a-la-Milanese.html']
mainurl = "https://www.chefkoch.de/rs/s0e1n1z1b0i1d1,2,3/Rezepte.html"
urls_urls = []
ingredients = []
menge = []
einheit = []

for url, id2 in zip(urls_recipes, range(len(urls_recipes))):
    soup2 = BeautifulSoup(requests.get(url).content)
    for ingredient in soup2.select('.td-left'):
        # get rid of multiple spaces and replace the 1/2 unicode character
        raw_string = re.sub(r'\s{2,}', ' ', ingredient.get_text(strip=True)).replace(u'\u00BD', "0.5")
        # split into number and unit
        splitlist = raw_string.split(" ")
        menge.append(splitlist[0])
        if len(splitlist) == 2:
            einheit.append(splitlist[1])
        else:
            einheit.append('')
    for ingredient in soup2.select('.recipe-ingredients h3, .td-right'):
        if ingredient.name == 'h3':
            continue
        else:
            ingredients.append([id2, re.sub(r'\s{2,}', ' ', ingredient.get_text(strip=True))])

result = pd.DataFrame(ingredients, columns=["ID", "Ingredients"])
result.loc[:, "unit"] = einheit
result.loc[:, "amount"] = menge
Output:
>>> result
ID Ingredients unit amount
0 0 Beinscheibe(n), vom Rind, ca. 4 cm dick geschn... 4
1 0 Mehl etwas
2 0 Zwiebel(n) 1
3 0 Knoblauchzehe(n) 2
4 0 Karotte(n) 1
5 0 Lauchstange(n) 1
6 0 Staudensellerie 0.5
7 0 Tomate(n), geschält Dose 1
8 0 Tomatenmark EL 1
9 0 Rotwein zum Ablöschen
10 0 Rinderfond oder Fleischbrühe Liter 0.5
11 0 Olivenöl zum Braten
12 0 Gewürznelke(n) 2
13 0 Pimentkörner 10
14 0 Wacholderbeere(n) 5
15 0 Pfefferkörner
16 0 Salz
17 0 Pfeffer, schwarz, aus der Mühle
18 0 Thymian
19 0 Rosmarin
20 0 Zitrone(n), unbehandelt 1
21 0 Knoblauchzehe(n) 2
22 0 Blattpetersilie Bund 1
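As noted above, the splitting may need to be generalized if other quantity formats show up. One possible sketch that handles a few more unicode fraction characters (the FRACTIONS mapping and to_amount helper are my own names for illustration, not part of the site or the code above):
# Hedged sketch: extend the ½ replacement to other vulgar-fraction characters.
FRACTIONS = {
    '\u00BD': 0.5,   # ½
    '\u00BC': 0.25,  # ¼
    '\u00BE': 0.75,  # ¾
    '\u2153': 1/3,   # ⅓
}

def to_amount(text):
    """Convert a quantity string like '1 ½' or '500' to a float where possible."""
    total, numeric = 0.0, False
    for token in text.split():
        if token in FRACTIONS:
            total += FRACTIONS[token]
            numeric = True
        else:
            try:
                total += float(token.replace(',', '.'))
                numeric = True
            except ValueError:
                return text  # non-numeric quantity such as 'etwas'
    return total if numeric else text

print(to_amount('1 \u00BD'))  # 1.5
print(to_amount('500'))       # 500.0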

My year list doesn't work on BeautifulSoup. Why?

I'm a newbie learning BeautifulSoup. Could someone have a look at the following code? I'd like to scrape data from a website, so far without success. I'd like to create a dataframe with the sum of player arrivals per year and a column with the players' average age.
The dataframe I get repeats the same values (error screenshot omitted).
my code:
import pandas as pd
import requests
from bs4 import BeautifulSoup

anos_list = list(range(2005, 2018))
anos_lista = []
valor_contratos_lista = []
idade_média_lista = []

for ano_lista in anos_list:
    url = 'https://www.transfermarkt.com/flamengo-rio-de-janeiro/transfers/verein/614/saison_id/' + str(anos_list) + ''
    page = requests.get(url, headers={'User-Agent': 'Custom5'})
    soup = BeautifulSoup(page.text, 'html.parser')
    tag_list = soup.tfoot.find_all('td')
    valor = (tag_list[0].string)
    idade = (tag_list[1].string)
    ano = ano_lista
    valor_contratos_lista.append(valor)
    idade_media_lista.append(idade)
    anos_lista.append(ano)

flamengo_df = pd.DataFrame({'Ano': ano_lista,
                            'Despesa com contratações': valor_contratos_lista,
                            'Média de idade': idade_média_lista
                            })
flamengo_df.to_csv('flamengo.csv', encoding='utf-8')
Here's my approach:
Using Beautiful Soup + Regex:
import requests
from bs4 import BeautifulSoup
import re
import numpy as np

# Set min and max years as variables
min_year = 2005
max_year = 2019
year_range = list(range(min_year, max_year + 1))
base_url = 'https://www.transfermarkt.com/flamengo-rio-de-janeiro/transfers/verein/614/saison_id/'

# Begin iterating
records = []
for year in year_range:
    url = base_url + str(year)
    # get the page
    page = requests.get(url, headers={'User-Agent': 'Custom5'})
    soup = BeautifulSoup(page.text, 'html.parser')
    # I used the class of "responsive-table"
    tables = soup.find_all('div', {'class': 'responsive-table'})
    rows = tables[0].find_all('tr')
    cells = [row.find_all('td', {'class': 'zentriert'}) for row in rows]
    # get variable names:
    variables = [x.text for x in rows[0].find_all('th')]
    variables_values = {x: [] for x in variables}
    # get values
    for row in rows:
        values = [' '.join(x.text.split()) for x in row.find_all('td')]
        values = [x for x in values if x != '']
        if len(variables) < len(values):
            values.pop(4)
            values.pop(2)
        for k, v in zip(variables_values.keys(), values):
            variables_values[k].append(v)
    num_pattern = re.compile('[0-9,]+')
    to_float = lambda x: float(x) if x != '' else np.NAN
    get_nums = lambda x: to_float(''.join(num_pattern.findall(x)).replace(',', '.'))
    # Add values to an individual record
    rec = {
        'Url': url,
        'Year': year,
        'Total Transfers': len(variables_values['Player']),
        'Avg Age': np.mean([int(x) for x in variables_values['Age']]),
        'Avg Cost': np.nanmean([get_nums(x) for x in variables_values['Fee'] if ('loan' not in x)]),
        'Total Cost': np.nansum([get_nums(x) for x in variables_values['Fee'] if ('loan' not in x)]),
    }
    # Store record
    records.append(rec)
Thereafter, initialize dataframe:
Of note, some of the numbers represent millions and would need to be adjusted for.
import pandas as pd
# Drop the URL
df = pd.DataFrame(records, columns=['Year','Total Transfers','Avg Age','Avg Cost','Total Cost'])
Year Total Transfers Avg Age Avg Cost Total Cost
0 2005 26 22.038462 2.000000 2.00
1 2006 32 23.906250 240.660000 1203.30
2 2007 37 22.837838 462.750000 1851.00
3 2008 41 22.926829 217.750000 871.00
4 2009 31 23.419355 175.000000 350.00
5 2010 46 23.239130 225.763333 1354.58
6 2011 47 23.042553 340.600000 1703.00
7 2012 45 24.133333 345.820000 1037.46
8 2013 36 24.166667 207.166667 621.50
9 2014 37 24.189189 111.700000 335.10
10 2015 49 23.530612 413.312000 2066.56
11 2016 41 23.341463 241.500000 966.00
12 2017 31 24.000000 101.433333 304.30
13 2018 18 25.388889 123.055000 738.33
14 2019 10 25.300000 NaN 0.00
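Regarding the note above that some of the numbers represent millions: assuming the raw Transfermarkt fees carry a scale suffix such as "€2.50m" or "€450Th." (an assumption about the source strings, not something shown in the output), a small helper could normalise everything to a single unit before averaging, for example thousands:
import re

def fee_to_thousands(fee_text):
    """Hedged sketch: parse a fee string into thousands; None for free/loan/unknown fees."""
    m = re.search(r'([\d.,]+)\s*(m|Th\.)?', fee_text)
    if not m:
        return None
    value = float(m.group(1).replace(',', '.'))
    # "m" means millions, "Th." already means thousands (assumed formats)
    return value * 1000 if m.group(2) == 'm' else value

print(fee_to_thousands('€2.50m'))    # 2500.0
print(fee_to_thousands('€450Th.'))   # 450.0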
