Scrape data using beautifulsoup - python

I am extracting data from this page: https://www.aeafa.es/asociados.php. My code repeats the same name and surname in every entry, even though the name and surname are different for each entry.
import requests
import pandas as pd
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}
temp = []
wev = {}
for page in range(1, 5):
    r = requests.get(
        "https://www.aeafa.es/asociados.php?provinput=&_pagi_pg={page}".format(
            page=page
        ),
        headers=headers,
    )
    soup = BeautifulSoup(r.content, "lxml")
    details = soup.find('table', class_="table")
    for detail in details.find_all('tbody'):
        link = [up.text for up in detail.find_all("td")]
        name = link[0]
        wev['Nombre'] = name
        surname = link[1]
        wev["Apellidos"] = surname

    tag = soup.find_all("div", class_="col-md-8 col-sm-8")
    for pro in tag:
        data = [tup.text for tup in pro.find_all("p")]

        Dirección = data[2]
        Dirección = Dirección[12:]
        wev[" Dirección"] = Dirección

        Población = data[3]
        Población = Población[14:]
        wev[" Población"] = Población

        Provincia = data[4]
        Provincia = Provincia[14:]
        wev["Provincia "] = Provincia

        Teléfono = data[5]
        Teléfono = "+" + Teléfono[11:].replace(".", "")
        Teléfono = Teléfono.replace("-", '')
        wev[" Teléfono"] = Teléfono

        Email = data[6]
        Email = Email[10:]
        wev["Email"] = Email

        temp.append(wev)

df = pd.DataFrame(temp)
print(df)
It prints the same name and surname for each entry. How do I correct this? This is the output:
Nombre Apellidos
0 JUAN ARIAS BARTOLOMÉ
1 JUAN ARIAS BARTOLOM

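Part of what is going on here: a single wev dictionary is created before the loops and the very same object is appended to temp over and over, so every row of the final DataFrame ends up showing whatever values were written last. A minimal sketch of that behaviour, unrelated to scraping:

# Appending the same dict object repeatedly: every entry reflects the last write.
rows = []
shared = {}
for name in ["JUAN", "MARIA"]:
    shared["Nombre"] = name
    rows.append(shared)            # same object appended each time
print(rows)                        # [{'Nombre': 'MARIA'}, {'Nombre': 'MARIA'}]

# Creating a fresh dict inside the loop keeps each row independent.
rows = []
for name in ["JUAN", "MARIA"]:
    rows.append({"Nombre": name})
print(rows)                        # [{'Nombre': 'JUAN'}, {'Nombre': 'MARIA'}]
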
One approach would be to merge the separate name and surname details into the data from the about information. A test could also be added for when the last page is reached:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from unicodedata import normalize
import re

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}

page = 1
data1 = []
data2 = []

while True:
    print(f"Page {page}")
    r = requests.get(f"https://www.aeafa.es/asociados.php?provinput=&_pagi_pg={page}", headers=headers)
    page += 1
    soup = BeautifulSoup(r.content, "lxml")

    for pro in soup.find_all("div", class_="col-md-8 col-sm-8"):
        values = [re.sub(r'\s+', ' ', normalize('NFKD', p.get_text(strip=True))) for p in pro.find_all("p")]
        row = {'Sobre' : values[0][6:]}   # skip over the word Sobre
        for item in values[2:]:
            key, value = item.split(':', 1)
            row[key.strip()] = value.strip()
        row['Teléfono'] = row['Teléfono'].replace(".", "")
        data1.append(row)

    details = soup.find("table", class_="table").tbody
    for tr in details.find_all("tr"):
        data2.append([re.sub(r'\s+', ' ', normalize('NFKD', td.get_text(strip=True))) for td in tr.find_all("td")[:-1]])

    # Any more?
    ul = soup.find("ul", class_="pagination")
    last_li = ul.find_all("li")[-1]
    if last_li.text != "»":
        break

# Merge the name and surname from the second table
data = []
for d1, d2 in zip(data1, data2):
    data.append({'Nombre' : d2[0], 'Apellidos' : d2[1]} | d1)

df = pd.DataFrame(data)
print(df)
Giving you a dataframe starting:
Nombre Apellidos Sobre Dirección Población Provincia Teléfono E-mail Web
0 JUAN MARIANO MERCADO Juan Mariano Mercado Juan de Toledo, no 16, 1o B 30800 LORCA Murcia 968-471716 periagomer#hotmail.com
1 Ma. BELEN ABAD GARCIA Ma. Belen Abad Garcia Calle Constantino 33, 1o N 4700 EL EJIDO Almería 950487533 - 647936929 mariabelenabadgarcia#hotmail.com
2 JESÚS ABAD MUÑIZ Jesús Abad Muñiz Santiago, 15, 1o.- ctro. 47001 Valladolid 98.320.20.11 jabad#carlosgallegoabogados.es
3 Ma PALOMA ABAD TEJERINA Ma Paloma Abad Tejerina Poniente, 40 28036 Madrid 91.383.11.45 paloma#abadsaezabogados.com
4 GEMA ÁBALOS MUÑOZ Gema ábalos Muñoz Solarillo de Gracia, 4, 1o.- D 18002 Granada 639.317.297 3004#icagr.es
You could then use Pandas to make any further changes to the data structure. Note that the dictionary merge operator (|) used above requires Python 3.9 or later.
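If you are on an older Python than 3.9, the same merge can be written with dict unpacking. A one-line sketch of the drop-in replacement for the line inside the final loop:

# Equivalent of {'Nombre': d2[0], 'Apellidos': d2[1]} | d1 on Python < 3.9
data.append({**{'Nombre': d2[0], 'Apellidos': d2[1]}, **d1})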

Related

How to scrape a table from a page and create a multi-column dataframe with python?

The database on this website, https://aviation-safety.net/wikibase/, covers the years 1902 to 2022.
I am trying to scrape the table, narrative, probable cause and classification for every accident in the years 2015 and 2016: https://aviation-safety.net/database/dblist.php?Year=2015. With the code below I am able to scrape the table only:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures
import itertools
from random import randint
from time import sleep

def scraping(year):
    headers = {
        'accept':'*/*',
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }
    url = f'https://aviation-safety.net/database/dblist.php?Year={year}&sorteer=datekey&page=1'
    #sleep(randint(1,3))
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.text,'html.parser')
    page_container = soup.find('div',{'class':'pagenumbers'})
    pages = max([int(page['href'].split('=')[-1]) for page in page_container.find_all('a')])
    #info = []
    tl = []
    for page in range(1,pages+1):
        new_url = f'https://aviation-safety.net/database/dblist.php?Year={year}&lang=&page={page}'
        print(new_url)
        #sleep(randint(1,3))
        data = requests.get(new_url,headers=headers)
        soup = BeautifulSoup(data.text,'html.parser')
        table = soup.find('table')
        for index,row in enumerate(table.find_all('tr')):
            if index == 0:
                continue
            link_ = 'https://aviation-safety.net/'+row.find('a')['href']
            #sleep(randint(1,3))
            new_page = requests.get(link_, headers=headers)
            new_soup = BeautifulSoup(new_page.text, 'lxml')
            table1 = new_soup.find('table')
            for i in table1.find_all('tr'):
                title = i.text
                tl.append(title)
    df = pd.DataFrame(tl)
    df.columns = ['status']
    df.to_csv(f'{year}_aviation-safety_new.csv', encoding='utf-8-sig', index=False)

if __name__ == "__main__":
    START = 2015
    STOP = 2016
    years = [year for year in range(START,STOP+1)]
    print(f'Scraping {len(years)} years of data')
    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scraping,years)
But the data is not organized. The dataframe comes out as a single column of raw strings, whereas the desired outcome is one column per field.
It looks like the values of tl are strings, e.g. 'Status:Accident investigation report completed and information captured'.
Converting the list of strings into a pd.DataFrame gets you a single column with all the values in the list.
If you want to use the "name" of the string, e.g. Status as a column header, you'll need to separate it from the rest of the text.
# maxsplit of 1 so we don't accidentally split up the values, e.g. time
title, text = title.split(":", maxsplit=1)
This looks like
('Status', 'Accident investigation report completed and information captured')
Now we create a dictionary
row_dict[title] = text
Giving us
{'Status': 'Accident investigation report completed and information captured'}
We will add to this same dictionary in the last loop
# old
for i in table1.find_all('tr'):
    title = i.text
    tl.append(title)

# new
row_dict = {}
for i in table1.find_all('tr'):
    title = i.text
    title, text = title.split(":", maxsplit=1)
    row_dict[title] = text
After we've gathered all the data from the page, i.e. completed the row_dict loop, we append it to tl.
row_dict = {}
for i in table1.find_all('tr'):
    title = i.text
    title, text = title.split(":", maxsplit=1)
    row_dict[title] = text
tl.append(row_dict)
All together now
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures
import itertools
from random import randint
from time import sleep

def scraping(year):
    headers = {
        'accept':'*/*',
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }
    url = f'https://aviation-safety.net/database/dblist.php?Year={year}&sorteer=datekey&page=1'
    #sleep(randint(1,3))
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.text,'html.parser')
    page_container = soup.find('div',{'class':'pagenumbers'})
    pages = max([int(page['href'].split('=')[-1]) for page in page_container.find_all('a')])
    #info = []
    tl = []
    for page in range(1,pages+1):
        new_url = f'https://aviation-safety.net/database/dblist.php?Year={year}&lang=&page={page}'
        print(new_url)
        #sleep(randint(1,3))
        data = requests.get(new_url,headers=headers)
        soup = BeautifulSoup(data.text,'html.parser')
        table = soup.find('table')
        for index,row in enumerate(table.find_all('tr')):
            if index == 0:
                continue
            link_ = 'https://aviation-safety.net/'+row.find('a')['href']
            #sleep(randint(1,3))
            new_page = requests.get(link_, headers=headers)
            new_soup = BeautifulSoup(new_page.text, 'lxml')
            table1 = new_soup.find('table')

            # make changes here!!!!!!!
            row_dict = {}
            for i in table1.find_all('tr'):
                title = i.text
                title, text = title.split(":", maxsplit=1)
                row_dict[title] = text
            tl.append(row_dict)

    df = pd.DataFrame(tl)
    df.to_csv(f'{year}_aviation-safety_new.csv', encoding='utf-8-sig', index=False)

if __name__ == "__main__":
    START = 2015
    STOP = 2016
    years = [year for year in range(START,STOP+1)]
    print(f'Scraping {len(years)} years of data')
    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scraping,years)
The read_html() method offers convenient access to such datasets.
>>> url = "https://web.archive.org/web/20221027040903/https://aviation-safety.net/database/dblist.php?Year=2015"
>>>
>>> dfs = pd.read_html(url)
>>>
>>> df = dfs[1].drop(columns="operator").dropna(axis=1, how="all")
>>> df["date"] = pd.to_datetime(df.date.str.replace("??-", "01-", regex=False), format="%d-%b-%Y")
>>> df.set_index("date")
type registration fat. location cat
date
2015-01-02 Saab 340B G-LGNL 0 Stornoway Ai... A1
2015-01-03 Antonov An-26B-100 RA-26082 0 Magadan-Soko... A1
2015-01-04 Fokker 50 5Y-SIB 0 Nairobi-Jomo... A1
2015-01-08 Bombardier Challenger 300 PR-YOU 0 São Paulo-Co... O1
2015-01-09 Cessna 208B Grand Caravan 8R-GAB 0 Matthews Rid... A2
... ... ... ... ... ..
2015-06-11 Eclipse 500 N508JA 0 Sacramento-E... A2
2015-06-11 Hawker 800XP N497AG 0 Port Harcour... A1
2015-06-12 Boeing 737-33A VH-NLK 0 near Kosrae Airpo... I2
2015-06-15 Antonov An-2R RA-84553 0 Tatsinsky di... A1
2015-06-16 Boeing 737-322 (WL) LY-FLB 0 Aktau Airpor... O1
[100 rows x 5 columns]
It's hard to control the user-agent header, so either use a cooperative site, or do a bit of extra work with requests or curl to obtain the HTML text beforehand.
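For example, a minimal sketch of that extra work: fetch the page with requests (the User-Agent value below is only a placeholder) and hand the downloaded text to read_html(). The live site may still reject non-browser clients, which is why the archived copy was used above.

import io

import pandas as pd
import requests

# Fetch the page ourselves so we control the User-Agent header.
headers = {"User-Agent": "Mozilla/5.0"}  # placeholder browser-like value
url = "https://aviation-safety.net/database/dblist.php?Year=2015"
resp = requests.get(url, headers=headers)

# read_html then parses the tables out of the downloaded text.
dfs = pd.read_html(io.StringIO(resp.text))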

Find span element based on text written inside li Bs4 scraping

I want to find the text located in the <li>; if it exists I want to scrape the <span> text, and if it does not exist I will raise an exception. For example:
if 'Floor' is found, then scrape the span.
This is my code. It works fine, but it scrapes everything without any condition:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
# Set base url & headers :
baseurl = 'https://aqarmap.com.eg'
headers = {
    'User_Agent' :
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
}
test_link = 'https://aqarmap.com.eg/en/listing/3138984-for-rent-cairo-new-cairo-el-narges-el-narges-omarat'
r = requests.get(test_link , headers=headers)
soup = bs(r.content,'lxml')
title = soup.find('h1').text.replace('\n','')
loc = soup.find('span', {'property':'name'}).find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').text.replace('\n','')
sub_loc = soup.find('span', {'property':'name'}).find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').text.replace('\n','')
floor = soup.find('span' , class_='badge badge-default').text.replace('\n','')
room = soup.find('span' , class_='badge badge-default').find_next('span').text.replace('\n','')
baths = soup.find('span' , class_='badge badge-default').find_next('span').text.replace('\n','')
finish = soup.find('span' , class_='badge badge-default').find_next('span').find_next('span').find_next('span').text.replace('\n','')
view = soup.find('span' , class_='badge badge-default').find_next('span').find_next('span').find_next('span').find_next('span').text.replace('\n','')
area = soup.find('span' , class_='badge badge-default').find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').text.replace('\n','')
date = soup.find('span' , class_='badge badge-default').find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').text.replace('\n','')
price = soup.find('div' , class_='listing-price-content').find_next('span').text
print(title,loc,sub_loc,floor,room,baths,finish,view,area,date,price)
In general, it would be good to check if the tag you are looking for exists before applying the text method:
title = title.text.strip() if (title := soup.find('h1')) else None
To select tag by text and check if it exists, you can go with css selectors and -soup-contains():
floor = tag.text.strip() if (tag := soup.select_one('ul.list-group li:-soup-contains("Floor") span')) else None
The above works well for individual tags, but to make this generic and get rid of the long, confusing chains of find_next() calls, I would suggest the following: store the information for each listing in a dict and collect the dicts in a list. That way you are safe if you create a dataframe from the list and a property is missing; Pandas will fill the gap automatically with NaN.
data = {}
data['title'] = soup.find('h1').text.strip()
data['loc'] = soup.find('span', {'property':'name'}).find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').text.replace('\n','')
data['sub_loc'] = soup.find('span', {'property':'name'}).find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').text.replace('\n','')
data.update(dict([li.stripped_strings for li in soup.select('ul.list-group li')]))
Benefits: you can make adjustments easily, filter as you like, and export the results in a structured way.
Example
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

baseurl = 'https://aqarmap.com.eg'
headers = {
    'User_Agent' :
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
}
data = []

def scrape(test_link):
    r = requests.get(test_link , headers=headers)
    soup = bs(r.content,'lxml')
    data = {}
    data['title'] = soup.find('h1').text.strip()
    data['loc'] = soup.find('span', {'property':'name'}).find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').text.replace('\n','')
    data['sub_loc'] = soup.find('span', {'property':'name'}).find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').find_next('span').text.replace('\n','')
    data.update(dict([li.stripped_strings for li in soup.select('ul.list-group li')]))
    return data

urlList = ['https://aqarmap.com.eg/en/listing/3138984-for-rent-cairo-new-cairo-el-narges-el-narges-omarat',
           'https://aqarmap.com.eg/en/listing/3124476-for-rent-cairo-new-cairo-el-narges-el-narges-omarat?source=related-listing-source']

for url in urlList:
    data.append(scrape(url))

pd.DataFrame(data)
Output
title | loc | sub_loc | Floor | Room | Baths | Finish Type | Size (in meters) | Listing ID | Publish Date | Price | Seller Role | Payment Method | Price Per Meter | View
Furnished Apartment For rent in El Narges Omarat | El Narges | El Narges Omarat | 3 | 3 | 2 | SUPER_LUX | 180 M² | EG-3138984 | 09/01/2022 | 19,000 EGP | Agent | Cash | 106 EGP/M² | nan
Furnished Apartment For rent in El Narges Omarat | El Narges | El Narges Omarat | 2 | 2 | 2 | SUPER_LUX | 180 M² | EG-3124476 | 30/12/2021 | 19,000 EGP | Agent | Cash | 106 EGP/M² | Garden
You might find the following approach useful, particularly an alternative way to extract items from the "bread crumb" list at the top:
import requests
from bs4 import BeautifulSoup as bs

# Set base url & headers :
baseurl = 'https://aqarmap.com.eg'
headers = {
    'User_Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
}

test_link = 'https://aqarmap.com.eg/en/listing/3138984-for-rent-cairo-new-cairo-el-narges-el-narges-omarat'
r = requests.get(test_link , headers=headers)
soup = bs(r.content,'lxml')

# Store all available items in a dictionary
ul_list_group = soup.find('ul', class_='list-group')
data = {}
for li_item in ul_list_group.find_all('li'):
    items = list(li_item.stripped_strings)
    if len(items) == 2:
        data[items[0]] = items[1]

req_elements = [
    'Floor', 'Room', 'Baths',
    'Finish Type', 'Size (in meters)', 'Listing ID',
    'Publish Date', 'Price', 'Seller Role',
    'Payment Method', 'Price Per Meter',
    'Swimming pool',   # test case to show missing item
]

# Store all parts of the breadcrumb in a list (last 3 required)
ul_breadcrumb = soup.find('ul', class_='breadcrumb')
breadcrumb_data = [li.get_text(strip=True) for li in ul_breadcrumb.find_all('li')]
title = breadcrumb_data[-1]
sub_loc = breadcrumb_data[-2]
loc = breadcrumb_data[-3]

# Build a suitable row of data (give missing entries 'N/A')
row = [title, sub_loc, loc] + [data.get(element, 'N/A') for element in req_elements]
print(row)
Giving you:
['Furnished Apartment For rent in El Narges Omarat', 'El Narges Omarat', 'El Narges', '3', '3', '2', 'SUPER_LUX', '180 M²', 'EG-3138984', '09/01/2022', '19,000 EGP', 'Agent', 'Cash', '106 EGP/M²', 'N/A']
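If you scrape several listings this way, the rows can then be collected into a DataFrame. A small sketch, with column labels that are only illustrative:

import pandas as pd

# Column labels follow the order used when building each row above.
columns = ['Title', 'Sub location', 'Location'] + req_elements
df = pd.DataFrame([row], columns=columns)
print(df)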

Pagination not iterating over pages

I want to iterate over all pages from this url, https://www.iata.org/en/about/members/airline-list/, and dump the results in a .csv file.
How could a piece of code that iterates through the pages be added to the current code below?
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import Request

url = 'https://www.iata.org/en/about/members/airline-list/'
req = Request(url , headers = {
    'accept':'*/*',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})

data = []
while True:
    print(url)
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    data.append(pd.read_html(soup.select_one('table.datatable').prettify())[0])
    if soup.select_one('span.pagination-link.is-active + div a[href]'):
        url = soup.select_one('span.pagination-link.is-active + div a')['href']
    else:
        break

df = pd.concat(data)
df.to_csv('airline-list.csv',encoding='utf-8-sig',index=False)
Try this approach:
for i in range(1, 30):
    url = f'https://www.iata.org/en/about/members/airline-list/?page={i}&search=&ordering=Alphabetical'
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    data.append(pd.read_html(soup.select_one('table.datatable').prettify())[0])
To get data dynamically, use:
import pandas as pd
import requests
import bs4

url = 'https://www.iata.org/en/about/members/airline-list/?page={page}&search=&ordering=Alphabetical'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}

# Total number of pages
html = requests.get(url.format(page=1), headers=headers)
soup = bs4.BeautifulSoup(html.text)
pages = int(soup.find_all('a', {'class': 'pagination-link'})[-2].text)

data = []
for page in range(1, pages+1):
    # Pass headers to requests.get, not to str.format
    html = requests.get(url.format(page=page), headers=headers)
    data.append(pd.read_html(html.text)[0])

df = pd.concat(data)
Output:
>>> df
Airline Name IATA Designator 3 digit code ICAO code Country / Territory
0 ABX Air GB 832 ABX United States
1 Aegean Airlines A3 390 AEE Greece
2 Aer Lingus EI 53 EIN Ireland
3 Aero Republica P5 845 RPB Colombia
4 Aeroflot SU 555 AFL Russian Federation
.. ... ... ... ... ...
3 WestJet WS 838 WJA Canada
4 White coloured by you WI 97 WHT Portugal
5 Wideroe WF 701 WIF Norway
6 Xiamen Airlines MF 731 CXA China (People's Republic of)
7 YTO Cargo Airlines YG 860 HYT China (People's Republic of)
[288 rows x 5 columns]
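Since the goal was a .csv file, the concatenated frame can then be written out exactly as in the original code:

df.to_csv('airline-list.csv', encoding='utf-8-sig', index=False)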

How to append data in data frame using beautiful soup

import requests
from bs4 import BeautifulSoup
import pandas as pd

baseurl='https://locations.atipt.com/'
headers ={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}

r =requests.get('https://locations.atipt.com/al')
soup=BeautifulSoup(r.content, 'html.parser')
tra = soup.find_all('ul',class_='list-unstyled')

productlinks=[]
for links in tra:
    for link in links.find_all('a',href=True):
        comp=baseurl+link['href']
        productlinks.append(comp)

temp=[]
for link in productlinks:
    r =requests.get(link,headers=headers)
    soup=BeautifulSoup(r.content, 'html.parser')
    tag=soup.find_all('div',class_='listing content-card')
    for pro in tag:
        for tup in pro.find_all('p'):
            temp.append([text for text in tup.stripped_strings])

df = pd.DataFrame(temp)
print(df)
This is the output I get
9256 Parkway E Ste A
Birmingham,
Alabama 35206
but I don't know how to name the columns in the data frame. I would like to assign 'Address' to '9256 Parkway E Ste A', 'City' to 'Birmingham' and 'State' to 'Alabama 35206'. If that is possible, kindly help me with this.
temp=[]
for link in productlinks:
    r =requests.get(link,headers=headers)
    soup=BeautifulSoup(r.content, 'html.parser')
    tag=soup.find_all('div',class_='listing content-card')
    for pro in tag:
        data=[tup.text for tup in pro.find_all('p')]
        address="".join(data[:2])
        splitdata=data[2].split(",")
        city=splitdata[0]
        splitsecond=splitdata[-1].split("\xa0")
        state=splitsecond[0]
        postalcode=splitsecond[-1]
        temp.append([address,city,state,postalcode])

import pandas as pd
df=pd.DataFrame(temp,columns=["Address","City","State","Postalcode"])
df
Output:
Address City State Postalcode
0 634 1st Street NSte 100 Alabaster AL 35007
1 9256 Parkway ESte A Birmingham AL 35206
....
If you want to add the phone number as well, just add this statement after postalcode:
callNumber=pro.find("span",class_="directory-phone").get_text(strip=True).split("\n")[-1].lstrip()
and append it to the temp list.
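Putting that together, a short sketch of the extended loop body and column list (the "Phone" label is just a suggestion):

for pro in tag:
    # ... existing parsing of address, city, state and postalcode ...
    callNumber = pro.find("span", class_="directory-phone").get_text(strip=True).split("\n")[-1].lstrip()
    temp.append([address, city, state, postalcode, callNumber])

df = pd.DataFrame(temp, columns=["Address", "City", "State", "Postalcode", "Phone"])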

Beautifulsoup: activate web button and continue scraping on new page

I'm working on a university project and need to get data online. I would like to get some data from this website:
https://www.footballdatabase.eu/en/transfers/-/2020-10-03
For the 3rd of October I managed to get the first 19 rows, but there are 6 pages and I'm struggling to activate the button that loads the next page.
This is the html code for the button:
2
My code so far:
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {'User-Agent':
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

page = "https://www.footballdatabase.eu/en/transfers/-/2020-10-03"
pageTree = requests.get(page, headers=headers)
pageSoup = BeautifulSoup(pageTree.content, 'html.parser')

Players = pageSoup.find_all("span", {"class": "name"})
Team = pageSoup.find_all("span", {"class": "firstteam"})
Values = pageSoup.find_all("span", {"class": "transferamount"})
Values[0].text

PlayersList = []
TeamList = []
ValuesList = []
j=1
for i in range(0,20):
    PlayersList.append(Players[i].text)
    TeamList.append(Team[i].text)
    ValuesList.append(Values[i].text)
    j=j+1

df = pd.DataFrame({"Players":PlayersList,"Team":TeamList,"Values":ValuesList})
Thank you very much!
You can use the requests module to simulate the Ajax call. For example:
import requests
from bs4 import BeautifulSoup

data = {
    'date': '2020-10-03',
    'pid': 1,
    'page': 1,
    'filter': 'full',
}

url = 'https://www.footballdatabase.eu/ajax_transfers_show.php'

for data['page'] in range(1, 7):  # <--- adjust number of pages here.
    soup = BeautifulSoup(requests.post(url, data=data).content, 'html.parser')
    for line in soup.select('.line'):
        name = line.a.text
        first_team = line.select_one('.firstteam').a.text if line.select_one('.firstteam').a else 'Free'
        second_team = line.select_one('.secondteam').a.text if line.select_one('.secondteam').a else 'Free'
        amount = line.select_one('.transferamount').text
        print('{:<30} {:<20} {:<20} {}'.format(name, first_team, second_team, amount))
Prints:
Bruno Amione Belgrano  Hellas Vérone  1.7 M€
Ismael Gutierrez Betis Deportivo Atlético B 1 M€
Vitaly Janelt Bochum  Brentford  500 k€
Sven Ulreich Bayern Munich  Hambourg SV  500 k€
Salim Ali Al Hammadi Baniyas  Khor Fakkan  Prêt
Giovanni Alessandretti Ascoli U-20 Recanatese  Prêt
Gabriele Bellodi AC Milan U-20 Alessandria  Prêt
Louis Britton Bristol City B Torquay United  Prêt
Juan Brunetta Godoy Cruz  Parme  Prêt
Bobby Burns Barrow  Glentoran  Prêt
Bohdan Butko Shakhtar Donetsk  Lech Poznan  Prêt
Nicolò Casale Hellas Vérone  Empoli  Prêt
Alessio Da Cruz Parme  FC Groningue  Prêt
Dalbert Henrique Inter Milan  Rennes  Prêt
...and so on.
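If you would rather end up with a DataFrame than printed lines, a sketch that collects the same fields into a list of dicts (the column names here are my own choice) and reuses the url and data payload from the snippet above:

import pandas as pd
import requests
from bs4 import BeautifulSoup

rows = []
for data['page'] in range(1, 7):
    soup = BeautifulSoup(requests.post(url, data=data).content, 'html.parser')
    for line in soup.select('.line'):
        rows.append({
            'Player': line.a.text,
            'From': line.select_one('.firstteam').a.text if line.select_one('.firstteam').a else 'Free',
            'To': line.select_one('.secondteam').a.text if line.select_one('.secondteam').a else 'Free',
            'Value': line.select_one('.transferamount').text,
        })

df = pd.DataFrame(rows)
print(df)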
