Multiple Pages Web Scraping with Python and Beautiful Soup - python

I'm trying to write code to scrape some data from pages about hotels. The final information (name of the hotel and address) should be exported to CSV. The code works, but only on one page...
import requests
import pandas as pd
from bs4 import BeautifulSoup  # HTML data structure

# Fetch the first page of Krakow accommodation listings.
response = requests.get('https://e-turysta.pl/noclegi-krakow/')
soup = BeautifulSoup(response.content, 'html.parser')

# Renamed from `list` -- never shadow the builtin `list`.
container = soup.find(id='nav-lista-obiektow')
items = container.find_all(class_='et-list__details flex-grow-1 d-flex d-md-block flex-column')

# One name (nazwa) and address (adres) per listing.
nazwa_noclegu = [item.find(class_='h3 et-list__details__name').get_text() for item in items]
adres_noclegu = [item.find(class_='et-list__city').get_text() for item in items]

dane = pd.DataFrame(
    {
        'nazwa': nazwa_noclegu,
        'adres': adres_noclegu,
    }
)
print(dane)
dane.to_csv('noclegi.csv')
I tried a loop, but it doesn't work:
# NOTE(review): this is the broken attempt being asked about -- the URL
# string contains no '{}' placeholder, so .format(i+1) returns the same
# URL every time and all 22 iterations fetch page 1.
for i in range(22):
url = requests.get('https://e-turysta.pl/noclegi-krakow/'.format(i+1)).text
soup = BeautifulSoup(url, 'html.parser')
Any ideas?

URLs are different than the ones you use - you forgot ?page=.
And you have to use {} to add value to string
url = 'https://e-turysta.pl/noclegi-krakow/?page={}'.format(i+1)
or concatenate it
url = 'https://e-turysta.pl/noclegi-krakow/?page=' + str(i+1)
or use f-string
url = f'https://e-turysta.pl/noclegi-krakow/?page={i+1}'
EDIT: working code
import requests
from bs4 import BeautifulSoup # HTML data structure
import pandas as pd
def get_page_data(number):
    """Fetch one results page and return a list of [name, address] pairs."""
    print('number:', number)
    url = 'https://e-turysta.pl/noclegi-krakow/?page={}'.format(number)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    container = soup.find(id='nav-lista-obiektow')
    items = container.find_all(class_='et-list__details flex-grow-1 d-flex d-md-block flex-column')
    # Name and address stay grouped per listing, so a default value could
    # later be substituted when either field is missing.
    return [
        [
            item.find(class_='h3 et-list__details__name').get_text(strip=True),
            item.find(class_='et-list__city').get_text(strip=True),
        ]
        for item in items
    ]
# --- main ---
# Walk all 22 result pages, pooling every [name, address] row,
# then write one CSV with headers and no index column.
wszystkie_dane = []
for numer_strony in range(1, 23):
    wszystkie_dane.extend(get_page_data(numer_strony))

dane = pd.DataFrame(wszystkie_dane, columns=['nazwa', 'adres'])
dane.to_csv('noclegi.csv', index=False)

In your loop you use the .format() function, but you need to insert the brackets {} into the string you are formatting.
# NOTE(review): this builds '.../noclegi-krakow/1', '.../2', ... with no
# '?page=' query string -- verify the site actually serves paginated
# results under that path before relying on this variant.
for i in range(22):
url = requests.get('https://e-turysta.pl/noclegi-krakow/{}'.format(i+1)).text
soup = BeautifulSoup(url, 'html.parser')

Related

How to select one by one the element in web scraping using python

I want only h3[0] and h6[1], for example.
<div class="span16">
<h3>Shroot, Stephanie</h3>
<h6>Chemistry</h6>
<h6>December 2021</h6>
<p>Thesis or dissertation
<h3>Shroot</h3>
I use BeautifulSoup and a for loop to get the information.
url = line.strip()
# Bug fix: one request instead of two -- reuse the same response for
# both the body and the status code.
response = requests.get(url, headers=headers)
r_html = response.text
r_html_sc = response.status_code
soup = BeautifulSoup(r_html, "html.parser")
thesis_infos = soup.find('div', {"class": "span16"})
if thesis_infos is not None:
    thesis_infos_text = thesis_infos.text.strip()
else:
    # Bug fix: the fallback assigned `thesis_infos_1` (a typo), leaving
    # `thesis_infos_text` undefined on the miss path.
    thesis_infos_text = " "
print(thesis_infos_text)
# Bug fix: str has no .readlines(); splitlines() splits the block of text
# into lines. NOTE(review): indexes [0]/[2] assume author on the first
# line and date on the third -- verify against the real page layout.
thesis_infos_lines = thesis_infos_text.splitlines()
author1_1 = thesis_infos_lines[0]
year1_1 = thesis_infos_lines[2]
Edit:
The easiest way is probably to use BeautifulSoup, like so:
soup.find_all("h3")[0]
soup.find_all("h6")[1]
Here is a short example, filtering for links on google.com:
import requests  # fix: `import requests as requests` was a redundant alias
from bs4 import BeautifulSoup

# Example: fetch google.com and print the first link on the page.
html = requests.get("https://www.google.com").text
soup = BeautifulSoup(html, "html.parser")
# find_all is the modern bs4 name; findAll is only a legacy alias.
links = soup.find_all("a")
print(links[0])
Is this what you are looking for?
import re

# Sample HTML fragment to pull headings out of.
code = """
<div class="span16">
<h3>Shroot, Stephanie</h3>
<h6>Chemistry</h6>
<h6>December 2021</h6>
<p>Thesis or dissertation
<h3>Shroot</h3>
"""

# Pre-compile the two heading patterns, then collect every capture group.
_h3_pattern = re.compile(".*<h3>(.+)<\\/h3>")
_h6_pattern = re.compile(".*<h6>(.+)<\\/h6>")
h3_matches = _h3_pattern.findall(code)
h6_matches = _h6_pattern.findall(code)

# First <h3> and second <h6>.
print(h3_matches[0])
print(h6_matches[1])
output:
Shroot, Stephanie
December 2021
# Applies the regex approach to the live page: stringify the parsed
# <div class="span16"> and scan its raw markup for headings.
thesis_infos = soup.find('div',{"class":"span16"})
code = str(thesis_infos)
h3_matches = re.findall(".*<h3>(.+)<\\/h3>", code)
h6_matches = re.findall(".*<h6>(.+)<\\/h6>", code)
# First <h3> (the author) and second <h6> -- presumably the date, as in
# the sample fragment; verify on the real pages.
print(h3_matches[0])
print(h6_matches[1])

How to get the tokens in data-search-meta-sol

def extract(page):
    """Download one search-results page and return its parsed soup."""
    url = f'https://www.jobstreet.com.my/en/job-search/administrative-assistant-jobs/{page}/'
    response = requests.get(url)
    return BeautifulSoup(response.content, 'html.parser')
def transform(soup):
# NOTE(review): this long space-separated class list looks auto-generated
# and is likely to change between site builds -- a fragile selector.
jobs = soup.find_all('div', class_='sx2jih0 zcydq876 zcydq866 zcydq896 zcydq886 zcydq8n zcydq856 zcydq8f6 zcydq8eu')
for job in jobs[:29]:
# NOTE(review): find_all searches *descendants* only; if the
# data-search-sol-meta attribute sits on the job <div> itself (as the
# sample HTML below suggests), this inner loop matches nothing and
# nothing is ever appended -- the likely cause of the "None" results.
for token in job.find_all('div', attrs={'data-search-sol-meta': True}):
more_details = token.text.strip()
job_detail = {
'more details': more_details
}
joblist.append(job_detail)
joblist = []
dummy = 2  # number of result pages to fetch
for i in range(dummy):  # range(0, dummy, 1) simplified to range(dummy)
    c = extract(i + 1)  # site pages are 1-based
    transform(c)
    # `i` is already an int -- the int(i) cast was redundant.
    print(f'Progress Page: [{i + 1}/{dummy}]')
    time.sleep(4)  # be polite between requests
df = pd.DataFrame(joblist)
I want to scrape the tokens in those data-search-sol-meta attributes; how do I get them?
<div data-search-sol-meta="{"searchRequestToken":"62781aeb-4a14-43c9-b985-8be617cc1107","token":"0~62781aeb-4a14-43c9-b985-8be617cc1107","jobId":"jobstreet-my-job-5011156","section":"MAIN","sectionRank":1,"jobAdType":"ORGANIC","tags":{"mordor__flights":"mordor_80","jobstreet:userGroup":"BB","jobstreet:s_vi":"[CS]v1|314CC40D0D655F39-400007A66AC825EB[CE]"}}">
the results in the pd (more_details column) that I've got is just "None"
I would use a more robust css selector list i.e. not the dynamic classes. Be high enough in the DOM to be able to select both the attributes you want and then the job info. You can extract the attribute with the tokens and use json library to list separately.
import json
import requests
from bs4 import BeautifulSoup


def extract(page):
    """Fetch one JobStreet results page and parse it into a soup."""
    url = f"https://www.jobstreet.com.my/en/job-search/administrative-assistant-jobs/{page}/"
    page_response = requests.get(url)
    return BeautifulSoup(page_response.content, "html.parser")
def transform(soup):
    """Print each job card's title, its raw meta attribute, and the decoded tokens."""
    # Anchor on the stable data-automation attribute rather than the
    # generated class names, then take the divs that wrap an <article>.
    for job in soup.select("[data-automation=jobListing] > div:has(article)"):
        meta = job["data-search-sol-meta"]
        print(job.select_one("h1 span").text)
        print()
        print(meta)
        print()
        data = json.loads(meta)
        print("searchRequestToken: ", data["searchRequestToken"])
        print("token: ", data["token"])
        print()
# Fetch page 1 and dump its job tokens.
transform(extract(1))

How to list out all the h2, h3, and p tags then create a dataframe to store them

I had given a website to scrape all of the key items
But the output I got is only for one item using BeautifulSoup4. So I wonder if I need to use something like soup.find_all to extract all the key items in a list from the website.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
# NOTE(review): the URL literal was lost when this was posted -- `url=` is
# a syntax error as written. The answer below uses
# https://realpython.github.io/fake-jobs/.
url=
html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')
# find() returns only the FIRST matching container -- which is why only
# one job comes out; find_all() would return every match.
column= soup.find(class_ = re.compile('columns is-multiline'))
print(column.prettify())
position = column.h2.text
company = column.h3.text
city_state= column.find_all('p')[-2].text
print (position, company, city_state)
Thank you.
Try this:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd  # bug fix: pd was used below without being imported

url = 'https://realpython.github.io/fake-jobs/'
html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')

# find_all collects EVERY matching tag (find would stop at the first one).
positions = [pos.text for pos in soup.find_all('h2')]
companies = [com.text for com in soup.find_all('h3')]

# Split each "City, State" location once per tag instead of twice.
city_state0 = []
city_state1 = []
for p in soup.find_all('p', {'class': 'location'}):
    parts = p.text.split(',')
    city_state0.append(parts[0].strip())
    city_state1.append(parts[1].strip())

df = pd.DataFrame({
    'city_state1': city_state0,
    'city_state2': city_state1,
    'companies': companies,
    'positions': positions
})
print(df)
Output:
You need to use find_all to get all the elements like so. find only gets the first element.
titles = soup.find_all('h2', class_='title is-5')
companies = soup.find_all('h3', class_='subtitle is-6 company')
locations = soup.find_all('p', class_='location')
# Loop over locations and extract the city and state.
for location in locations:
    # Bug fix: `location` is a bs4 Tag, which has no .split(); split its
    # .text instead. Split once and reuse both halves.
    parts = location.text.split(', ')
    city = parts[0]
    state = parts[1]
    # NOTE(review): city/state are overwritten each iteration -- only the
    # last pair survives the loop; append to lists if all are needed.

Iterate Over URLs Using BeautifulSoup

I have written some code to gather URLs for each race course from https://www.horseracing.net/racecards. I have also written some code to scrape data from each race course page.
Each bit of code works as it should but I am having trouble creating a for loop to loop through all the race course URLs.
Here's the code to scrape the course URLs:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

todays_racecard_url = 'https://www.horseracing.net/racecards'
base_url = "https://www.horseracing.net"

reqs = requests.get(todays_racecard_url)
content = reqs.text
soup = BeautifulSoup(content, 'html.parser')

# Collect an absolute URL for every race course linked from an <h3>.
course_urls = []
for h in soup.find_all('h3'):  # find_all: modern name for legacy findAll
    a = h.find('a')
    # Explicit checks replace the old bare `except: pass`, which silently
    # swallowed every error -- including real bugs.
    if a is not None and 'href' in a.attrs:
        card_url = urljoin(base_url, a.get('href'))
        course_urls.append(card_url)

for card_url in course_urls:
    print(card_url)
And here's the code to scrape the pages:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

url = "https://www.horseracing.net/racecards/fontwell/13-05-21"
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")

# Removed the unused date/course/time accumulators -- nothing ever filled
# them, and `time` also shadowed the stdlib module name.
runner = []
tips = []
tipsters = []

runner_div = soup.find_all('div', class_='row-cell-right')
for container in runner_div:
    runner.append(container.h5.a.text)
    # Look each span up once instead of twice per row.
    tip_span = container.find('span', class_='tip-text number-tip')
    tips.append(tip_span.text if tip_span else '')
    tipster_span = container.find('span', class_='pointers-text currency-text')
    tipsters.append(tipster_span.text if tipster_span else '')

newspaper_tips = pd.DataFrame({
    'Runners': runner,
    'Tips': tips,
    'Tipsters': tipsters,
})
# Strip the " - " separator embedded in the tipster names.
newspaper_tips['Tipsters'] = newspaper_tips['Tipsters'].str.replace(' - ', '')
# mode='a', header=False: rows accumulate across repeated runs.
newspaper_tips.to_csv('NewspaperTips.csv', mode='a', header=False, index=False)
How do I join them to get the result I'm looking for?
It could be combined as follows:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

todays_racecard_url = 'https://www.horseracing.net/racecards'
base_url = "https://www.horseracing.net"

req = requests.get(todays_racecard_url)
soup_racecard = BeautifulSoup(req.content, 'html.parser')

# Accumulate plain rows and build the DataFrame once at the end --
# `df.loc[len(df)] = row` re-grows the frame on every insert (quadratic).
rows = []
for h in soup_racecard.find_all('h3'):
    a = h.find('a', href=True)  # only find tags with href present
    if a:
        url = urljoin(base_url, a['href'])
        print(url)
        results = requests.get(url)
        soup_url = BeautifulSoup(results.text, "html.parser")
        for container in soup_url.find_all('div', class_='row-cell-right'):
            runner_name = container.h5.a.text
            # Look each span up once, then take .text or ''.
            tip_span = container.find('span', class_='tip-text number-tip')
            tips_no = tip_span.text if tip_span else ''
            tipster_span = container.find('span', class_='pointers-text currency-text')
            tipster_names = tipster_span.text if tipster_span else ''
            rows.append([runner_name, tips_no, tipster_names])

df = pd.DataFrame(rows, columns=['Runners', 'Tips', 'Tipsters'])
df['Tipsters'] = df['Tipsters'].str.replace(' - ', '')
df.to_csv('NewspaperTips.csv', index=False)
Giving you a CSV starting:
Runners,Tips,Tipsters
Ajrad,2,NEWMARKET
Royal Tribute,1,The Times
Time Interval,1,Daily Mirror
Hemsworth,1,Daily Express
Ancient Times,,
Final Watch,,
Hala Joud,,
May Night,1,The Star
Tell'Em Nowt,,

How do I pull multiple values from html page using python?

I'm performing some data analysis for my own knowledge from NHL spread/betting odds information. I'm able to pull some information, but not the entire data set. I want to pull the list of games and the associated data into a pandas dataframe, but I haven't been able to perform the proper loop around the HTML tags. I've tried the findAll option and the xpath route. I'm not successful with either.
from bs4 import BeautifulSoup
import requests
page_link = 'https://www.thespread.com/nhl-hockey-public-betting-chart'
page_response = requests.get(page_link, timeout=5)
# here, we fetch the content from the url, using the requests library
page_content = BeautifulSoup(page_response.content, "html.parser")
# Take out the <div> of name and get its value
# NOTE(review): find() returns only the FIRST 'datarow' div -- that is why
# only one game comes back; find_all() is needed to get every row.
name_box = page_content.find('div', attrs={'class': 'datarow'})
name = name_box.text.strip()
print (name)
This script goes through each datarow and pulls out each item individually and then appends them into a pandas DataFrame.
from bs4 import BeautifulSoup
import requests
import pandas as pd
page_link = 'https://www.thespread.com/nhl-hockey-public-betting-chart'
page_response = requests.get(page_link, timeout=5)
# here, we fetch the content from the url, using the requests library
page_content = BeautifulSoup(page_response.content, "html.parser")
# Take out the <div> of name and get its value
tables = page_content.find_all('div', class_='datarow')
# Iterate through rows
rows = []
# Iterate through each datarow and pull out each home/away separately
for table in tables:
# Get time and date
# NOTE(review): the [1] / [-1] positions into .contents assume a fixed
# markup layout (specific child nodes at those slots); a markup change on
# the site silently shifts which child gets picked -- verify live.
time_and_date_tag = table.find_all('div', attrs={"class": "time"})[0].contents
date = time_and_date_tag[1]
time = time_and_date_tag[-1]
# Get teams
teams_tag = table.find_all('div', attrs={"class": "datacell teams"})[0].contents[-1].contents
home_team = teams_tag[1].text
away_team = teams_tag[-1].text
# Get opening
opening_tag = table.find_all('div', attrs={"class": "child-open"})[0].contents
home_open_value = opening_tag[1]
away_open_value = opening_tag[-1]
# Get current
current_tag = table.find_all('div', attrs={"class": "child-current"})[0].contents
home_current_value = current_tag[1]
away_current_value = current_tag[-1]
# Create list
rows.append([time, date, home_team, away_team,
home_open_value, away_open_value,
home_current_value, away_current_value])
columns = ['time', 'date', 'home_team', 'away_team',
'home_open', 'away_open',
'home_current', 'away_current']
print(pd.DataFrame(rows, columns=columns))
Here is my solution to your question.
import requests
from bs4 import BeautifulSoup

# Fetch the betting chart and print the text of every data row,
# one stripped block per game.
page_link = 'https://www.thespread.com/nhl-hockey-public-betting-chart'
response = requests.get(page_link, timeout=5)
chart = BeautifulSoup(response.content, "html.parser")

for cell in chart.find_all('div', attrs={'class': 'datarow'}):
    print(cell.text.strip())

Categories