I'm pretty new to Python and getting to know Beautiful Soup.
So I have this problem: I need to get data from an event company, specifically contact data. They have a main table with all the participant names and their locations, but to get the contact data (phone, email) you need to click each company name in the table, which opens a new window with the additional information. I'm looking for a way to get that info from the href and combine it with the data in the main table.
So far I can get the table and all the hrefs:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen
test_url = "https://standconstruction.messe-duesseldorf.de/vis/v1/en/hallindex/1.09?oid=2656&lang=2"
test_data = urlopen(test_url)
test_html = test_data.read()
test_data.close()
page_soup = soup(test_html, "html.parser")
test_table = page_soup.findAll("div", {"class": "exh-table-col"})
print(test_table)
As a result I get the whole table and this kind of info (example of one row), including the name, address and href:
<a class="flush" href="/vis/v1/en/exhibitors/aluminium2020.2661781?oid=2656&lang=2">
<h2 class="exh-table-item__name" itemprop="name">Aerospace Engineering Equipment (Suzhou) Co LTD</h2>
</a>
</div>, <div class="exh-table-col exh-table-col--address">
<span class=""><i class="fa fa-map-marker"></i> <span class="link-fix--text">Hall 9 / G57</span></span>
That's where my problem starts: I have no idea how to get the additional data from the href and combine it with the main data.
I would be very thankful for any possible solutions, or at least a tip on where I can find one.
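In case a sketch of the general pattern helps: each href in the table is a relative path, so it can be resolved against the site root, the detail page requested, and the contact fields read from there. A minimal sketch, assuming the detail pages mark the contact data with itemprop=telephone and itemprop=email (verify this against the real markup):

from urllib.parse import urljoin
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup

base_url = "https://standconstruction.messe-duesseldorf.de"

def get_contact(href):
    # resolve the relative href against the site root and open the detail page
    detail_html = urlopen(urljoin(base_url, href)).read()
    detail_soup = soup(detail_html, "html.parser")
    # assumption: the detail page marks the contact data with itemprop attributes
    phone = detail_soup.select_one("span[itemprop=telephone]")
    email = detail_soup.select_one("a[itemprop=email]")
    return (phone.text if phone else "N/A",
            email.text if email else "N/A")

# e.g. for the row shown above:
print(get_contact("/vis/v1/en/exhibitors/aluminium2020.2661781?oid=2656&lang=2"))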
Updating the question:
I need a table which contains the following columns:
1. Name; 2. Hall; 3. PDF; 4. Phone; 5. Email.
If you collect the data by hand, to get the phone and email you need to click the appropriate link for it to show.
I wanted to know if there is a way to extract the phone and email from those links and add them to the first three columns using Python.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
params = {
    "oid": "2656",
    "lang": "2"
}


def main(url):
    with requests.Session() as req:
        r = req.get(url, params=params)
        soup = BeautifulSoup(r.content, 'html.parser')
        target = soup.select("div.exh-table-item")
        names = [name.h2.text for name in target]
        hall = [hall.span.text.strip() for hall in target]
        pdf = [pdf.select_one("a.color--darkest")['href'] for pdf in target]
        links = [f"{url[:46]}{link.a['href']}" for link in target]
        phones = []
        emails = []
        for num, link in enumerate(links):
            print(f"Extracting {num + 1} of {len(links)}")
            r = req.get(link)
            soup = BeautifulSoup(r.content, 'html.parser')
            goal = soup.select_one("div[class^=push--bottom]")
            try:
                phone = goal.select_one("span[itemprop=telephone]").text
            except:
                phone = "N/A"
            try:
                email = goal.select_one("a[itemprop=email]").text
            except:
                email = "N/A"
            emails.append(email)
            phones.append(phone)
            sleep(1)
        df = pd.DataFrame(list(zip(names, hall, pdf, phones, emails)), columns=[
            "Name", "Hall", "PDF", "Phone", "Email"])
        print(df)
        df.to_csv("data.csv", index=False)


main("https://standconstruction.messe-duesseldorf.de/vis/v1/en/hallindex/1.09")
Output: a DataFrame printed to the console and written to data.csv with the columns Name, Hall, PDF, Phone and Email.
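One detail worth noting in the loop above: if a detail page has no div whose class starts with push--bottom, select_one returns None and the bare except turns that into "N/A" as well. A slightly more explicit sketch of that part of the loop (same result when the data is present):

            goal = soup.select_one("div[class^=push--bottom]")
            # select_one returns None when nothing matches, so guard each lookup explicitly
            phone_tag = goal.select_one("span[itemprop=telephone]") if goal else None
            email_tag = goal.select_one("a[itemprop=email]") if goal else None
            phone = phone_tag.text if phone_tag else "N/A"
            email = email_tag.text if email_tag else "N/A"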
I've struggled with this for days and am not sure what the issue could be. Basically, I'm trying to extract the profile box data of each link; going through the inspector, I thought I could pull the <p> tags and do so.
I'm new to this and trying to understand, but here's what I have thus far:
-- a script that (somewhat) successfully pulls the info for ONE link:
import requests
from bs4 import BeautifulSoup
# getting html
url = 'https://basketball.realgm.com/player/Darius-Adams/Summary/28720'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
container = soup.find('div', attrs={'class', 'main-container'})
playerinfo = container.find_all('p')
print(playerinfo)
I then also have a script that pulls all of the href attributes from multiple links:
from bs4 import BeautifulSoup
import requests
def get_links(url):
    links = []
    website = requests.get(url)
    website_text = website.text
    soup = BeautifulSoup(website_text)
    for link in soup.find_all('a'):
        links.append(link.get('href'))
    for link in links:
        print(link)
    print(len(links))
get_links('https://basketball.realgm.com/dleague/players/2022')
get_links('https://basketball.realgm.com/dleague/players/2021')
get_links('https://basketball.realgm.com/dleague/players/2020')
So basically, my goal is to combine these two and get one script that will pull all of the <p> tags from multiple URLs. I've been trying to do it, and I'm really not sure at all why this isn't working here:
from bs4 import BeautifulSoup
import requests
def get_profile(url):
    profiles = []
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'html.parser')
    container = soup.find('div', attrs={'class', 'main-container'})
    for profile in container.find_all('a'):
        profiles.append(profile.get('p'))
    for profile in profiles:
        print(profile)
get_profile('https://basketball.realgm.com/player/Darius-Adams/Summary/28720')
get_profile('https://basketball.realgm.com/player/Marial-Shayok/Summary/26697')
Again, I'm really new to web scraping with Python but any advice would be greatly appreciated. Ultimately, my end goal is to have a tool that can scrape this data in a clean way all at once.
(Player name, Current Team, Born, Birthplace, etc.). Maybe I'm doing it entirely wrong, but any guidance is welcome!
You need to combine your two scripts together and make requests for each player. Try the following approach. This searches for <td> tags that have the data-th=Player attribute:
import requests
from bs4 import BeautifulSoup
def get_links(url):
    data = []
    req_url = requests.get(url)
    soup = BeautifulSoup(req_url.content, "html.parser")

    for td in soup.find_all('td', {'data-th': 'Player'}):
        a_tag = td.a
        name = a_tag.text
        player_url = a_tag['href']
        print(f"Getting {name}")

        req_player_url = requests.get(f"https://basketball.realgm.com{player_url}")
        soup_player = BeautifulSoup(req_player_url.content, "html.parser")
        div_profile_box = soup_player.find("div", class_="profile-box")
        row = {"Name": name, "URL": player_url}

        for p in div_profile_box.find_all("p"):
            try:
                key, value = p.get_text(strip=True).split(':', 1)
                row[key.strip()] = value.strip()
            except:  # not all entries have values
                pass

        data.append(row)

    return data


urls = [
    'https://basketball.realgm.com/dleague/players/2022',
    'https://basketball.realgm.com/dleague/players/2021',
    'https://basketball.realgm.com/dleague/players/2020',
]

for url in urls:
    print(f"Getting: {url}")
    data = get_links(url)
    for entry in data:
        print(entry)
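If the end goal is one clean table rather than printed dicts, the rows returned by get_links drop straight into pandas; a small usage sketch (pandas is an extra dependency here, and players.csv is just an illustrative file name):

import pandas as pd

all_rows = []
for url in urls:
    all_rows.extend(get_links(url))

# one row per player; each profile field becomes a column, missing fields are left as NaN
df = pd.DataFrame(all_rows)
df.to_csv("players.csv", index=False)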
So this is my code:
from bs4 import BeautifulSoup
import requests
import time
URL = 'http://www.vn-meido.com/k1/index.php?board=17.0'
# loads page
r = requests.get(URL)
soup = BeautifulSoup(r.content, "html.parser")
# gets the newest book
book = soup.select_one('td[class^="subject windowbg2"]').text
while True:
    # reloads the page
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, "html.parser")

    # gets the newest book
    new_book = soup.select_one('td[class^="subject windowbg2"]').text

    # checks if a new book has been uploaded
    if book == new_book:
        print("no new book found")
    elif book != new_book:
        print(new_book)
        book = soup.select_one('td[class^="subject windowbg2"]').text

    # repeats after 30 seconds
    time.sleep(30)
But if you go to the website and have a look, you'll see I get the text of the newest book uploaded; I want to be able to separate the title and the author, but the title and author are in different elements that don't have a way to identify them (like a class or an ID). So if you can help, please do, thanks.
Assuming the HTML remains consistent across entries (I only checked a few), when new text is found under the pinned listings at the top (I assume this to be a new book) you need to extract the book URL and visit it; then you can use `:-soup-contains` to target the author and book title by their specific label text and `next_sibling` to get the required return values.
N.B. I have removed the while loop for the purposes of this answer. The additions to the elif are the important ones.
from bs4 import BeautifulSoup
import requests
URL = 'http://www.vn-meido.com/k1/index.php?board=17.0'
# loads page
r = requests.get(URL)
soup = BeautifulSoup(r.content, "html.parser")
# gets the newest book
book = '' # for testing altered this line
r = requests.get(URL)
soup = BeautifulSoup(r.content, "html.parser")
# gets the newest book
new_book = soup.select_one('td[class^="subject windowbg2"]').text
# checks if a new book has been uploaded
if book == new_book:
    print("no new book found")
elif book != new_book:
    print(new_book)
    new_book_url = soup.select_one('tr:not([class]) td:not([class*=stickybg]) ~ .subject a')['href']
    r = requests.get(new_book_url)
    soup = BeautifulSoup(r.content, "html.parser")
    for member in ['TITLE ', 'AUTHOR']:
        print(soup.select_one(f'strong:-soup-contains("{member}")').next_sibling.next_sibling)
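To put the polling back, the same check can be wrapped in the original while loop; a rough sketch along the lines of the question's code, reusing book, URL and the imports from the script above (the 30-second sleep is kept from the question):

import time

while True:
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, "html.parser")
    new_book = soup.select_one('td[class^="subject windowbg2"]').text
    if book != new_book:
        print(new_book)
        new_book_url = soup.select_one('tr:not([class]) td:not([class*=stickybg]) ~ .subject a')['href']
        r = requests.get(new_book_url)
        book_soup = BeautifulSoup(r.content, "html.parser")
        for member in ['TITLE ', 'AUTHOR']:
            print(book_soup.select_one(f'strong:-soup-contains("{member}")').next_sibling.next_sibling)
        book = new_book  # remember the latest book so it is only reported once
    else:
        print("no new book found")
    time.sleep(30)  # check again after 30 seconds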
I have a script which scrapes a website for the name, region and province of companies in Spain. There is another link within the html, which takes you to a page that contains the phone number, but when I try to even scrape the html, it prints "none". Is there a way that the script can automatically move to the page, scrape the number and match it with the company row?
import requests
from googlesearch import search
from bs4 import BeautifulSoup
for page in range(1,65):
url = "https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/{page}.html".format(page =page)
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")
lists = soup.select("div#simulacion_tabla ul")
#scrape the list
for lis in lists:
title = lis.find('li', class_="col1").text
location = lis.find('li', class_="col2").text
province = lis.find('li', class_="col3").text
link = lis.find('href', class_ ="col1")
info = [title, location, province, link]
print(info)
Alternatively, is there a way to do it with the googlesearch library?
Many thanks
The first URL is "https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/index.html", not "https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/1.html"; for this reason your script does not return output.
You can try it like this:
import requests
# from googlesearch import search
from bs4 import BeautifulSoup
baseurl = ["https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/index.html"]
urls = [f'https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/{i}.html' for i in range(2, 5)]
allurls = baseurl + urls
print(allurls)
for url in allurls:
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    lists = soup.select("div#simulacion_tabla ul")

    # scrape the list
    for lis in lists:
        title = lis.find('li', class_="col1").text
        location = lis.find('li', class_="col2").text
        province = lis.find('li', class_="col3").text
        link = lis.select("li.col1 a")[0]['href']
        info = [title, location, province, link]
        print(info)
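To actually pull the phone number as asked, each detail link can be requested inside the same loop. A rough sketch of the extra lines for the inner "for lis in lists:" loop; the selector for the phone field is a placeholder, since the structure of the detail pages isn't shown here, so it has to be checked against a real detail page:

        link = lis.select("li.col1 a")[0]['href']
        # if the href turns out to be relative, resolve it first (e.g. with urllib.parse.urljoin)
        detail = requests.get(link)
        detail_soup = BeautifulSoup(detail.content, "html.parser")
        phone_tag = detail_soup.select_one("div.phone")  # placeholder selector -- adjust after inspecting a detail page
        phone = phone_tag.get_text(strip=True) if phone_tag else "N/A"
        info = [title, location, province, link, phone]
        print(info)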
Please help.
I want to get all of the company names on each page, and there are 12 pages.
http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/1
http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/2
-- this website only changes the number.
So here is my code so far.
Can I just get the titles (company names) from all 12 pages?
Thank you in advance.
from bs4 import BeautifulSoup
import requests
maximum = 0
page = 1
URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/1'
response = requests.get(URL)
source = response.text
soup = BeautifulSoup(source, 'html.parser')
whole_source = ""
for page_number in range(1, maximum+1):
    URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/' + str(page_number)
    response = requests.get(URL)
    whole_source = whole_source + response.text

soup = BeautifulSoup(whole_source, 'html.parser')
find_company = soup.select("#content > div.wrap_analysis_data > div.public_con_box.public_list_wrap > ul > li:nth-child(13) > div > strong")

for company in find_company:
    print(company.text)
(Screenshots omitted: the output of one page, and the page source.)
So, you want to remove all the headers and get only the string of the company name?
Basically, you can use soup.findAll to find the list of companies, which appear in a format like this:
<strong class="company"><span>중소기업진흥공단</span></strong>
Then you use the .find function to extract information from the <span> tag:
<span>중소기업진흥공단</span>
After that, you use the .contents attribute to get the string from the <span> tag:
'중소기업진흥공단'
So you write a loop to do the same for each page, and make a list called company_list to store the results from each page and append them together.
Here's the code:
from bs4 import BeautifulSoup
import requests
maximum = 12
company_list = [] # List for result storing
for page_number in range(1, maximum+1):
    URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/{}'.format(page_number)
    response = requests.get(URL)
    print(page_number)
    whole_source = response.text
    soup = BeautifulSoup(whole_source, 'html.parser')
    for entry in soup.findAll('strong', attrs={'class': 'company'}):  # Finding all company names in the page
        company_list.append(entry.find('span').contents[0])  # Extracting name from the result
The company_list will give you all the company names you want
I figured it out eventually. Thank you for your answer though!
Here is my final code.
from urllib.request import urlopen
from bs4 import BeautifulSoup
company_list=[]
for n in range(12):
    url = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/{}'.format(n+1)
    webpage = urlopen(url)
    source = BeautifulSoup(webpage, 'html.parser', from_encoding='utf-8')
    companys = source.findAll('strong', {'class': 'company'})
    for company in companys:
        company_list.append(company.get_text().strip().replace('\n', '').replace('\t', '').replace('\r', ''))

file = open('company_name1.txt', 'w', encoding='utf-8')
for company in company_list:
    file.write(company + '\n')
file.close()
I'm trying to scrape information on greyhound races. For example, I want to scrape http://www.gbgb.org.uk/RaceCard.aspx?dogName=Hardwick%20Serena. This page shows all results for the dog Hardwick Serena, but it is split over several pages.
Inspecting the page, it shows under the 'next page' button:
<input type="submit" name="ctl00$ctl00$mainContent$cmscontent$DogRaceCard$lvDogRaceCard$ctl00$ctl03$ctl01$ctl12" value=" " title="Next Page" class="rgPageNext">.
I was hoping for an HTML link that I could use for the next iteration of the scrape, but no luck.
Further inspection, looking at the network traffic, shows that the browser sends a horribly long (hashed?) string for __VIEWSTATE, among others. Likely to protect the database?
I'm looking for a way to scrape all pages for one dog, either by iterating over all pages, or by increasing the page length to show 100+ lines on page 1. The underlying site is .aspx.
I'm using Python 3.5 and BeautifulSoup.
current code:
import requests
from bs4 import BeautifulSoup
url = 'http://www.gbgb.org.uk/RaceCard.aspx?dogName=Hardwick%20Serena'
with requests.session() as s:
    s.headers['user-agent'] = 'Mozilla/5.0'
    r = s.get(url)
    soup = BeautifulSoup(r.content, 'html5lib')
    target = 'ctl00$ctl00$mainContent$cmscontent$DogRaceCard$btnFilter_input'
    data = {tag['name']: tag['value']
            for tag in soup.select('input[name^=ctl00]') if tag.get('value')
            }
    state = {tag['name']: tag['value']
             for tag in soup.select('input[name^=__]')
             }
    data.update(state)
    numberpages = int(str(soup.find('div', 'rgWrap rgInfoPart')).split(' ')[-2].split('>')[1].split('<')[0])
    # for page in range(last_page + 1):
    for page in range(numberpages):
        data['__EVENTTARGET'] = target.format(page)
        # data['__VIEWSTATE'] = target.format(page)
        print(10)
        r = s.post(url, data=data)
        soup = BeautifulSoup(r.content, 'html5lib')
        tables = soup.findChildren('table')
        my_table = tables[9]
        rows = my_table.findChildren(['th', 'tr'])
        tabel = [[]]
        for i in range(len(rows)):
            cells = rows[i].findChildren('td')
            tabel.append([])
            for j in range(len(cells)):
                value = cells[j].string
                tabel[i].append(value)
        table = []
        for i in range(len(tabel)):
            if len(tabel[i]) == 16:
                del tabel[i][-2:]
                table.append(tabel[i])
In this case, for each page requested, a POST request is issued with the form-urlencoded parameters __EVENTTARGET and __VIEWSTATE:
__VIEWSTATE can be easily extracted from an input tag.
__EVENTTARGET is different for each page, and the value is passed to a javascript function for each page link, so you can extract it with a regex:
<a href="javascript:__doPostBack('ctl00$ctl00$mainContent$cmscontent$DogRaceCard$lvDogRaceCard$ctl00$ctl03$ctl01$ctl07','')">
<span>2</span>
</a>
The Python script:
from bs4 import BeautifulSoup
import requests
import re
# extract data from page
def extract_data(soup):
    tables = soup.find_all("div", {"class": "race-card"})[0].find_all("tbody")
    item_list = [
        (
            t[0].text.strip(),   # date
            t[1].text.strip(),   # dist
            t[2].text.strip(),   # TP
            t[3].text.strip(),   # StmHCP
            t[4].text.strip(),   # Fin
            t[5].text.strip(),   # By
            t[6].text.strip(),   # WinnerOr2nd
            t[7].text.strip(),   # Venue
            t[8].text.strip(),   # Remarks
            t[9].text.strip(),   # WinTime
            t[10].text.strip(),  # Going
            t[11].text.strip(),  # SP
            t[12].text.strip(),  # Class
            t[13].text.strip()   # CalcTm
        )
        for t in (t.find_all('td') for t in tables[1].find_all('tr'))
        if t
    ]
    print(item_list)
session = requests.Session()
url = 'http://www.gbgb.org.uk/RaceCard.aspx?dogName=Hardwick%20Serena'
response = session.get(url)
soup = BeautifulSoup(response.content, "html.parser")
# get view state value
view_state = soup.find_all("input", {"id":"__VIEWSTATE"})[0]["value"]
# get all event target values
event_target = soup.find_all("div", {"class":"rgNumPart"})[0]
event_target_list = [
    re.search('__doPostBack\(\'(.*)\',', t["href"]).group(1)
    for t in event_target.find_all('a')
]
# extract data for the 1st page
extract_data(soup)
# extract data for each page except the first
for link in event_target_list[1:]:
print("get page {0}".format(link))
post_data = {
'__EVENTTARGET': link,
'__VIEWSTATE': view_state
}
response = session.post(url, data=post_data)
soup = BeautifulSoup(response.content, "html.parser")
extract_data(soup)
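If the rows are needed as a file rather than console output, extract_data can return item_list instead of printing it, and the last part of the script (from the first-page extraction onward) can collect everything and write a CSV. A sketch of that variant; the column names simply mirror the comments in extract_data and hardwick_serena.csv is an arbitrary file name:

import csv

all_rows = list(extract_data(soup))  # first page (extract_data now returns item_list)
for link in event_target_list[1:]:
    post_data = {'__EVENTTARGET': link, '__VIEWSTATE': view_state}
    response = session.post(url, data=post_data)
    all_rows += extract_data(BeautifulSoup(response.content, "html.parser"))

with open("hardwick_serena.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Date", "Dist", "TP", "StmHCP", "Fin", "By", "WinnerOr2nd",
                     "Venue", "Remarks", "WinTime", "Going", "SP", "Class", "CalcTm"])
    writer.writerows(all_rows)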