Scrape all URLs of a webpage

Scrape all URLs of a webpage - python

I have the following url https://www.gbgb.org.uk/greyhound-profile/?greyhoundId=517801 where the last 6 digits is a unique identifier for a specific runner. I want to find all of the 6 digit unique identifiers on this page.
I've tried to scrape all urls on the page (code shown below), but unfortunately I only get a high-level summary. Rather than an in depth list which should contain >5000 runners. Im hoping to get a list/dataframe which shows:
https://www.gbgb.org.uk/greyhound-profile/?greyhoundId=517801
https://www.gbgb.org.uk/greyhound-profile/?greyhoundId=500000
https://www.gbgb.org.uk/greyhound-profile/?greyhoundId=500005
etc.
This is what i've been able to do so far. I appreciate any help!
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
req = Request("https://www.gbgb.org.uk//greyhound-profile//")
html_page = urlopen(req)
soup = BeautifulSoup(html_page, "lxml")
links = []
for link in soup.findAll('a'):
links.append(link.get('href'))
print(links)
Thanks for the help in advance!

The data is loaded dynamicall from the external API URL. You can use next example how to load the data (with the IDs):
import json
import requests
api_url = "https://api.gbgb.org.uk/api/results/dog/517801" # <-- 517801 is the ID from your URL in the question
params = {"page": "1", "itemsPerPage": "20", "race_type": "race"}
page = 1
while True:
params["page"] = page
data = requests.get(api_url, params=params).json()
# uncomment this to print all data:
# print(json.dumps(data, indent=4))
if not data["items"]:
break
for i in data["items"]:
print(
"{:<30} {}".format(
i.get("winnerOr2ndName", ""), i.get("winnerOr2ndId", "")
)
)
page += 1
Prints:
Ferndale Boom 534358
Laganore Mustang 543937
Tickity Kara 535237
Thor 511842
Ballyboughlewiss 519556
Beef Cakes 551323
Distant Millie 546674
Lissan Kels 525148
Rosstemple Marko 534276
Happy Harry 550042
Porthall Ella 550841
Southlodge Eden 531677
Effernogue Beef 547416
Faydas Truffle 528780
Johns Lass 538763
Faydas Truffle 528780
Toms Hero 543659
Affane Buzz 547555
Emkay Flyer 531456
Ballymac Tilly 492923
Kilcrea Duke 542178
Sporting Sultan 541880
Droopys Poet 542020
Shortwood Elle 527241
Rosstemple Marko 534276
Erics Bozo 541863
Swift Launch 536667
Longsearch 523017
Swift Launch 536667
Takemyhand 535023
Floral Print 527192
Rustys Aero 497270
Autumn Dapper 519528
Droopys Kiwi 511989
Deep Chest 520634
Newtack Henry 525511
Indian Nightmare 524636
Lady Mascara 528399
Tarsna Yankee 517373
Leathems Act 516918
Final Star 514015
Ascot Faye 500812
Ballymac Ernie 503569

you can convert the result content to a pandas dataframe then just use winnerOr2ndName and winnerOr2ndId columns
Example
import json
import requests
import pandas as pd
def get_items(dog_id):
url = f"https://api.gbgb.org.uk/api/results/dog/{dog_id}?page=-1"
params = {"page": "-1", "itemsPerPage": "20", "race_type": "race"}
response = requests.get(url, params=params).json()
MAX_PAGES = response["meta"]["pageCount"]
result = pd.DataFrame(pd.DataFrame(response["items"]).loc[:, ['winnerOr2ndName','winnerOr2ndId']].dropna())
result["winnerOr2ndId"] = result["winnerOr2ndId"].astype(int)
while int(params.get("page"))<MAX_PAGES:
params["page"] = str(int(params.get("page")) + 1)
response = requests.get(url, params=params).json()
new_items = pd.DataFrame(pd.DataFrame(response["items"]).loc[:, ['winnerOr2ndName','winnerOr2ndId']].dropna())
new_items["winnerOr2ndId"] = new_items["winnerOr2ndId"].astype(int)
result = pd.concat([result, new_items])
return result.drop_duplicates()
It would generate a dataframe looking like this:

Related

How to scrape stock data incorporating pagination next tag using python bs4?

The code can not get the next page, it only repeats in an infinite loop. I am using the example from oxylabs
Could you tell me what I'm doing wrong? Thank you.
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
url = 'https://hnx.vn/en-gb/cophieu-etfs/chung-khoan-ny.html'
while True:
response = requests.get(url)
soup = bs(response.content, "lxml")
symbols = soup.find_all('td', class_='STOCK_CODE' )
for s in symbols:
symbol = s.find('a').text
print(symbol)
next_page = soup.select_one('span', id = 'next')
if next_page:
next_url = next_page.get('href')
url = urljoin(url, next_url)
else:
break
print(url)

The information you want for the other pages is being returned via another call. You need to recreate that call (use your browser's network tools to see what is happening).
The request requires a token that is returned when the homepage is requested. This needs to be provided when requesting the other pages.
For example:
from bs4 import BeautifulSoup as bs
import requests
session = requests.Session()
req_homepage = session.get('https://hnx.vn/en-gb/cophieu-etfs/chung-khoan-ny.html')
soup_homepage = bs(req_homepage.content, "lxml")
for meta in soup_homepage.find_all('meta'):
if meta.get('name', None) == '__RequestVerificationToken':
token = meta['content']
data = {
"p_issearch" : 0,
"p_keysearch" : "",
"p_market_code" : "",
"p_orderby" : "STOCK_CODE",
"p_ordertype" : "ASC",
"p_currentpage" : 2,
"p_record_on_page" : 10,
}
headers = {
"Referer" : "https://hnx.vn/en-gb/cophieu-etfs/chung-khoan-ny.html",
"__RequestVerificationToken" : token,
"X-Requested-With" : "XMLHttpRequest",
}
for page in range(1, 4):
print(f"Page {page}")
data['p_currentpage'] = page
req = session.post('https://hnx.vn/ModuleIssuer/List/ListSearch_Datas', data=data, headers=headers)
json_content = req.json()['Content']
soup = bs(json_content, "lxml")
for td in soup.find_all('td', class_='STOCK_CODE'):
symbol = td.find('a').text
print(' ', symbol)
This would give you the following output:
Page 1
AAV
ACM
ADC
ALT
AMC
AME
AMV
API
APP
APS
Page 2
ARM
ART
ATS
BAB
BAX
BBS
BCC
BCF
BDB
BED
Page 3
BII
BKC
BLF
BNA
BPC
BSC
BST
BTS
BTW
BVS

Duplicated results while scraping the website

I have the following script which scraps all the information from a website.
However, when I run it, I get duplicated records of the blogs.
import requests
import pandas as pd
import numpy as np
from time import sleep
from random import randint
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import re
blog_topics = []
page = "https://www.bartonassociates.com/blog/"
soup = BeautifulSoup(requests.get(page).content, 'html.parser')
for link in soup.find_all(href=re.compile("/blog/tag")):
url = link.get('href')
if '/blog/tag/p' not in urlparse(link.get('href')).path:
blog_topics.append(url)
else:
pass
# VARIABLE TO DEFINE A RANGE BASED ON NO.OF PAGES
pages = np.arange(1)
# DEFINING CUSTOM VARIABLES
title_blognames_links_ = []
author_and_dates_ = []
# LOOP TO RETRIEVE TITLE, BLOG NAMES, LINKS, AUTHORS AND DATE PUBLISHED
for page in pages:
for blogs in blog_topics:
blog_url= blogs +'/p' + str(page)
sleep(randint(2,7))
soup = BeautifulSoup(requests.get(blog_url).content, 'html.parser')
#Information on title, blog names and their links
for h4 in soup.select("h4"):
for h2 in soup.select("h2"):
title_blognames_links_.append((h4.get_text(strip=True), h4.a["href"], h2.get_text(strip=True).replace('"',"")[11:]))
#Information of authors and dates
for tag in soup.find_all(class_="author"):
author_and_dates_.append(tag.get_text(strip=True))
I believe it has to do something with the pages = np.arange(1) range I have provided.
P.S. (1) was just a trail. I have tried (1,17),(1),(2)
Background: The maximum pages in a blog topic I have is 17, each topic
has 10 blogs each (approx)
What I am looking for, is to get all unique blog information from all the blog topics
Not sure what I am doing wrong here

To get all information from all topics you can first grab all topic links (you've done that in your code too) and then for each topic get all pages and all information (not other way around):
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = "https://www.bartonassociates.com/blog"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
topics = [
a["href"] for a in soup.select('h3:-soup-contains("Blog Topics") + ul a')
]
all_data = []
for t in topics:
while True:
soup = BeautifulSoup(requests.get(t).content, "html.parser")
topic_name = re.search(
r'"([^"]+)"', soup.select_one("h2").get_text(strip=True)
).group(1)
for entry in soup.select(".blog-entry"):
title = entry.h4.get_text(strip=True)
title = entry.h4.get_text(strip=True)
link = entry.a["href"]
tmp = entry.select_one(".author").get_text(strip=True)
if tmp:
author, date = map(
str.strip,
entry.select_one(".author").get_text(strip=True).split("|"),
)
else:
author, date = "N/A", "N/A"
all_data.append([topic_name, title, link, author, date])
print(topic_name, title, link, author, date, sep="\n")
print()
t = soup.select_one('a:-soup-contains("View More")')
if not t:
break
t = t["href"]
df = pd.DataFrame(
all_data, columns=["topic", "title", "link", "author", "date"]
)
print(df)
df.to_csv("data.csv", index=False)
Prints:
Healthcare News and Trends
DO vs. MD: What’s the Difference?
https://www.bartonassociates.com/blog/whats-the-difference-do-md
Tayla Holman September 09, 2021
Healthcare News and Trends
What is “The Great Resignation?”
https://www.bartonassociates.com/blog/what-is-the-great-resignation
Chris Keeley September 02, 2021
Healthcare News and Trends
CME Requirements for Physicians by State
https://www.bartonassociates.com/blog/cme-requirements-for-physicians-by-state
Teresa Otto, MD July 15, 2021
...and so on.
and saves data.csv (screenshot from LibreOffice):

How to get all products from a beautifulsoup page

I want to get all the products on this page:
nike.com.br/snkrs#estoque
My python code is this:
produtos = []
def aviso():
print("Started!")
request = requests.get("https://www.nike.com.br/snkrs#estoque")
soup = bs4(request.text, "html.parser")
links = soup.find_all("a", class_="btn", text="Comprar")
links_filtred = list(set(links))
for link in links_filtred:
if(produto not in produtos):
request = requests.get(f"{link['href']}")
soup = bs4(request.text, "html.parser")
produto = soup.find("div", class_="nome-preco-produto").get_text()
if(code_formated == ""):
code_formated = "\u200b"
print(f"Nome: {produto} Link: {link['href']}\n")
produtos.append(link["href"])
aviso()
Guys, this code gets the products from the page, but not all yesterday, I suspect that the content is dynamic, but how can I get them all with request and beautifulsoup? I don't want to use Selenium or an automation library, how do I do that? I don't want to have to change my code a lot because it's almost done, how do I do that?

DO NOT USE requests.get if you are dealing with the same HOST.
Reason: read-that
import requests
from bs4 import BeautifulSoup
import pandas as pd
def main(url):
allin = []
with requests.Session() as req:
for page in range(1, 6):
params = {
'p': page,
'demanda': 'true'
}
r = req.get(url, params=params)
soup = BeautifulSoup(r.text, 'lxml')
goal = [(x.find_next('h2').get_text(strip=True, separator=" "), x['href'])
for x in soup.select('.aspect-radio-box')]
allin.extend(goal)
df = pd.DataFrame(allin, columns=['Title', 'Url'])
print(df)
main('https://www.nike.com.br/Snkrs/Feed')
Output:
Title Url
0 Dunk High x Fragment design Black https://www.nike.com.br/dunk-high-x-fragment-d...
1 Dunk Low Infantil (16-26) City Market https://www.nike.com.br/dunk-low-infantil-16-2...
2 ISPA Flow 2020 Desert Sand https://www.nike.com.br/ispa-flow-2020-153-169...
3 ISPA Flow 2020 Pure Platinum https://www.nike.com.br/ispa-flow-2020-153-169...
4 Nike iSPA Men's Lightweight Packable Jacket https://www.nike.com.br/nike-ispa-153-169-211-...
.. ... ...
115 Air Jordan 1 Mid Hyper Royal https://www.nike.com.br/air-jordan-1-mid-153-1...
116 Dunk High Orange Blaze https://www.nike.com.br/dunk-high-153-169-211-...
117 Air Jordan 5 Stealth https://www.nike.com.br/air-jordan-5-153-169-2...
118 Air Jordan 3 Midnight Navy https://www.nike.com.br/air-jordan-3-153-169-2...
119 Air Max 90 Bacon https://www.nike.com.br/air-max-90-153-169-211...
[120 rows x 2 columns]

To get the data you can send a request to:
https://www.nike.com.br/Snkrs/Estoque?p=<PAGE>&demanda=true
where providing a page number between 1-5 to p= in the URL.
For example, to print the links, you can try:
import requests
from bs4 import BeautifulSoup
url = "https://www.nike.com.br/Snkrs/Estoque?p={page}&demanda=true"
for page in range(1, 6):
response = requests.get(url.format(page=page))
soup = BeautifulSoup(response.content, "html.parser")
print(soup.find_all("a", class_="btn", text="Comprar"))

Web-scrape. BeautifulSoup. Multiple Pages. How on earth would you do that?

Hi I am a Newbie to programming. So I spent 4 days trying to learn python. I evented some new swear words too.
I was particularly interested in trying as an exercise some web-scraping to learn something new and get some exposure to see how it all works.
This is what I came up with. See code at end. It works (to a degree)
But what's missing?
This website has pagination on it. In this case 11 pages worth.  How would you go about adding to this script and getting python to go look at those other pages too and carry out the same scrape. Ie scrape page one , scrape page 2, 3 ... 11 and post the results to a csv?
https://www.organicwine.com.au/vegan/?pgnum=1
https://www.organicwine.com.au/vegan/?pgnum=2
https://www.organicwine.com.au/vegan/?pgnum=3
https://www.organicwine.com.au/vegan/?pgnum=4
https://www.organicwine.com.au/vegan/?pgnum=5
https://www.organicwine.com.au/vegan/?pgnum=6
https://www.organicwine.com.au/vegan/?pgnum=7
8, 9,10, and 11
On these pages the images are actually a thumbnail images something like 251px by 251px.
How would you go about adding to this script to say. And whilst you are at it follow the links to the detailed product page and capture the image link from there where the images are 1600px by 1600px and post those links to CSV
https://www.organicwine.com.au/mercer-wines-preservative-free-shiraz-2020
When we have identified those links lets also download those larger images to a folder
CSV writer. Also I don't understand line 58
for i in range(23)
how would i know how many products there were without counting them (i.e. there is 24 products on page one)
So this is what I want to learn how to do. Not asking for much (he says sarcastically) I could pay someone on up-work to do it but where's the fun in that? and that does not teach me how to 'fish'.
Where is a good place to learn python? A master class on web-scraping. It seems to be trial and error and blog posts and where ever you can pick up bits of information to piece it all together.
Maybe I need a mentor.
I wish there had been someone I could have reached out to, to tell me what beautifulSoup was all about. worked it out by trial and error and mostly guessing. No understanding of it but it just works.
Anyway, any help in pulling this all together to produce a decent script would be greatly appreciated.
Hopefully there is someone out there who would not mind helping me.
Apologies to organicwine for using their website as a learning tool. I do not wish to cause any harm or be a nuisance to the site
Thank you in advance
John
code:
import requests
import csv
from bs4 import BeautifulSoup
URL = "https://www.organicwine.com.au/vegan/?pgnum=1"
response = requests.get(URL)
website_html = response.text
soup = BeautifulSoup(website_html, "html.parser")
product_title = soup.find_all('div', class_="caption")
# print(product_title)
winename = []
for wine in product_title:
winetext = wine.a.text
winename.append(winetext)
print(f'''Wine Name: {winetext}''')
# print(f'''\nWine Name: {winename}\n''')
product_price = soup.find_all('div', class_='wrap-thumb-mob')
# print(product_price.text)
price =[]
for wine in product_price:
wineprice = wine.span.text
price.append(wineprice)
print(f'''Wine Price: {wineprice}''')
# print(f'''\nWine Price: {price}\n''')
image =[]
product_image_link = (soup.find_all('div', class_='thumbnail-image'))
# print(product_image_link)
for imagelink in product_image_link:
wineimagelink = imagelink.a['href']
image.append(wineimagelink)
# image.append(imagelink)
print(f'''Wine Image Lin: {wineimagelink}''')
# print(f'''\nWine Image: {image}\n''')
#
#
# """ writing data to CSV """
# open OrganicWine2.csv file in "write" mode
# newline stops a blank line appearing in csv
with open('OrganicWine2.csv', 'w',newline='') as file:
# create a "writer" object
writer = csv.writer(file, delimiter=',')
# use "writer" obj to write
# you should give a "list"
writer.writerow(["Wine Name", "Wine Price", "Wine Image Link"])
for i in range(23):
writer.writerow([
winename[i],
price[i],
image[i],
])

In this case, to do pagination, instead of for i in range(1, 100) which is a hardcoded way of paging, it's better to use a while loop to dynamically paginate all possible pages.
"While" is an infinite loop and it will be executed until the transition to the next page is possible, in this case it will check for the presence of the button for the next page, for which the CSS selector ".fa-chevron-right" is responsible:
if soup.select_one(".fa-chevron-right"):
params["pgnum"] += 1 # go to the next page
else:
break
To extract the full size image an additional request is required, CSS selector ".main-image a" is responsible for full-size images:
full_image_html = requests.get(link, headers=headers, timeout=30)
image_soup = BeautifulSoup(full_image_html.text, "lxml")
try:
original_image = f'https://www.organicwine.com.au{image_soup.select_one(".main-image a")["href"]}'
except:
original_image = None
An additional step to avoid being blocked is to rotate user-agents. Ideally, it would be better to use residential proxies with random user-agent.
pandas can be used to extract data in CSV format:
pd.DataFrame(data=data).to_csv("<csv_file_name>.csv", index=False)
For a quick and easy search for CSS selectors, you can use the SelectorGadget Chrome extension (not always work perfectly if the website is rendered via JavaScript).
Check code with pagination and saving information to CSV in online IDE.
from bs4 import BeautifulSoup
import requests, json, lxml
import pandas as pd
# https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
}
params = {
'pgnum': 1 # number page by default
}
data = []
while True:
page = requests.get(
"https://www.organicwine.com.au/vegan/?",
params=params,
headers=headers,
timeout=30,
)
soup = BeautifulSoup(page.text, "lxml")
print(f"Extracting page: {params['pgnum']}")
for products in soup.select(".price-btn-conts"):
try:
title = products.select_one(".new-h3").text
except:
title = None
try:
price = products.select_one(".price").text.strip()
except:
price = None
try:
snippet = products.select_one(".price-btn-conts p a").text
except:
snippet = None
try:
link = products.select_one(".new-h3 a")["href"]
except:
link = None
# additional request is needed to extract full size image
full_image_html = requests.get(link, headers=headers, timeout=30)
image_soup = BeautifulSoup(full_image_html.text, "lxml")
try:
original_image = f'https://www.organicwine.com.au{image_soup.select_one(".main-image a")["href"]}'
except:
original_image = None
data.append(
{
"title": title,
"price": price,
"snippet": snippet,
"link": link,
"original_image": original_image
}
)
if soup.select_one(".fa-chevron-right"):
params["pgnum"] += 1
else:
break
# save to CSV (install, import pandas as pd)
pd.DataFrame(data=data).to_csv("<csv_file_name>.csv", index=False)
print(json.dumps(data, indent=2, ensure_ascii=False))
Example output:
[
{
"title": "Yangarra McLaren Vale GSM 2016",
"price": "$29.78 in a straight 12\nor $34.99 each",
"snippet": "The Yangarra GSM is a careful blending of Grenache, Shiraz and Mourvèdre in which the composition varies from year to year, conveying the traditional estate blends of the southern Rhône. The backbone of the wine comes fr...",
"link": "https://www.organicwine.com.au/yangarra-mclaren-vale-gsm-2016",
"original_image": "https://www.organicwine.com.au/assets/full/YG_GSM_16.png?20211110083637"
},
{
"title": "Yangarra Old Vine Grenache 2020",
"price": "$37.64 in a straight 12\nor $41.99 each",
"snippet": "Produced from the fruit of dry grown bush vines planted high up in the Estate's elevated vineyards in deep sandy soils. These venerated vines date from 1946 and produce a wine that is complex, perfumed and elegant with a...",
"link": "https://www.organicwine.com.au/yangarra-old-vine-grenache-2020",
"original_image": "https://www.organicwine.com.au/assets/full/YG_GRE_20.jpg?20210710165951"
},
#...
]

Create the URL by putting the page number in it, then put the rest of your code into a for loop and you can use len(winenames) to count how many results you have. You should do the writing outside the for loop. Here's your code with those changes:
import requests
import csv
from bs4 import BeautifulSoup
num_pages = 11
result = []
for pgnum in range(num_pages):
url = f"https://www.organicwine.com.au/vegan/?pgnum={pgnum+1}"
response = requests.get(url)
website_html = response.text
soup = BeautifulSoup(website_html, "html.parser")
product_title = soup.find_all("div", class_="caption")
winename = []
for wine in product_title:
winetext = wine.a.text
winename.append(winetext)
product_price = soup.find_all("div", class_="wrap-thumb-mob")
price = []
for wine in product_price:
wineprice = wine.span.text
price.append(wineprice)
image = []
product_image_link = soup.find_all("div", class_="thumbnail-image")
for imagelink in product_image_link:
winelink = imagelink.a["href"]
response = requests.get(winelink)
wine_page_soup = BeautifulSoup(response.text, "html.parser")
main_image = wine_page_soup.find("a", class_="fancybox")
image.append(main_image['href'])
for i in range(len(winename)):
result.append([winename[i], price[i], image[i]])
with open("/tmp/OrganicWine2.csv", "w", newline="") as file:
writer = csv.writer(file, delimiter=",")
writer.writerow(["Wine Name", "Wine Price", "Wine Image Link"])
writer.writerows(results)
And here's how I would rewrite your code to accomplish this task. It's more pythonic (you should basically never write range(len(something)), there's always a cleaner way) and it doesn't require knowing how many pages of results there are:
import csv
import itertools
import time
import requests
from bs4 import BeautifulSoup
data = []
# Try opening 100 pages at most, in case the scraping code is broken
# which can happen because websites change.
for pgnum in range(1, 100):
url = f"https://www.organicwine.com.au/vegan/?pgnum={pgnum}"
response = requests.get(url)
website_html = response.text
soup = BeautifulSoup(website_html, "html.parser")
search_results = soup.find_all("div", class_="thumbnail")
for search_result in search_results:
name = search_result.find("div", class_="caption").a.text
price = search_result.find("p", class_="price").span.text
# link to the product's page
link = search_result.find("div", class_="thumbnail-image").a["href"]
# get the full resolution product image
response = requests.get(link)
time.sleep(1) # rate limit
wine_page_soup = BeautifulSoup(response.text, "html.parser")
main_image = wine_page_soup.find("a", class_="fancybox")
image_url = main_image["href"]
# or you can just "guess" it from the thumbnail's URL
# thumbnail = search_result.find("div", class_="thumbnail-image").a.img['src']
# image_url = thumbnail.replace('/thumbL/', '/full/')
data.append([name, price, link, image_url])
# if there's no "next page" button or no search results on the current page,
# stop scraping
if not soup.find("i", class_="fa-chevron-right") or not search_results:
break
# rate limit
time.sleep(1)
with open("/tmp/OrganicWine3.csv", "w", newline="") as file:
writer = csv.writer(file, delimiter=",")
writer.writerow(["Wine Name", "Wine Price", "Wine Link", "Wine Image Link"])
writer.writerows(data)

How can I loop scraping data for multiple pages in a website using python and beautifulsoup4

I am trying to scrape data from the PGA.com website to get a table of all of the golf courses in the United States. In my CSV table I want to include the Name of the golf course ,Address ,Ownership ,Website , Phone number. With this data I would like to geocode it and place into a map and have a local copy on my computer
I utilized Python and Beautiful Soup4 to extract my data. I have reached as far to extract the data and import it into a CSV but I am now having a problem of scraping data from multiple pages on the PGA website. I want to extract ALL THE GOLF COURSES but my script is limited only to one page I want to loop it in away that it will capture all data for golf courses from all pages found in the PGA site. There are about 18000 gold courses and 900 pages to capture data
Attached below is my script. I need help on creating code that will capture ALL data from the PGA website and not just one site but multiple. In this manner it will provide me with all the data of gold courses in the United States.
Here is my script below:
import csv
import requests
from bs4 import BeautifulSoup
url = "http://www.pga.com/golf-courses/search?searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0"
r = requests.get(url)
soup = BeautifulSoup(r.content)
g_data1=soup.find_all("div",{"class":"views-field-nothing-1"})
g_data2=soup.find_all("div",{"class":"views-field-nothing"})
courses_list=[]
for item in g_data2:
try:
name=item.contents[1].find_all("div",{"class":"views-field-title"})[0].text
except:
name=''
try:
address1=item.contents[1].find_all("div",{"class":"views-field-address"})[0].text
except:
address1=''
try:
address2=item.contents[1].find_all("div",{"class":"views-field-city-state-zip"})[0].text
except:
address2=''
try:
website=item.contents[1].find_all("div",{"class":"views-field-website"})[0].text
except:
website=''
try:
Phonenumber=item.contents[1].find_all("div",{"class":"views-field-work-phone"})[0].text
except:
Phonenumber=''
course=[name,address1,address2,website,Phonenumber]
courses_list.append(course)
with open ('filename5.csv','wb') as file:
writer=csv.writer(file)
for row in courses_list:
writer.writerow(row)
#for item in g_data1:
#try:
#print item.contents[1].find_all("div",{"class":"views-field-counter"})[0].text
#except:
#pass
#try:
#print item.contents[1].find_all("div",{"class":"views-field-course-type"})[0].text
#except:
#pass
#for item in g_data2:
#try:
#print item.contents[1].find_all("div",{"class":"views-field-title"})[0].text
#except:
#pass
#try:
#print item.contents[1].find_all("div",{"class":"views-field-address"})[0].text
#except:
#pass
#try:
#print item.contents[1].find_all("div",{"class":"views-field-city-state-zip"})[0].text
#except:
#pass
This script only captures 20 at a time and I want to capture all in one script which account for 18000 golf courses and 900 pages to scrape form.

The PGA website's search have multiple pages, the url follows the pattern:
http://www.pga.com/golf-courses/search?page=1 # Additional info after page parameter here
this means you can read the content of the page, then change the value of page by 1, and read the the next page.... and so on.
import csv
import requests
from bs4 import BeautifulSoup
for i in range(907): # Number of pages plus one
url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
r = requests.get(url)
soup = BeautifulSoup(r.content)
# Your code for each individual page here

if you still read this post , you can try this code too....
from urllib.request import urlopen
from bs4 import BeautifulSoup
file = "Details.csv"
f = open(file, "w")
Headers = "Name,Address,City,Phone,Website\n"
f.write(Headers)
for page in range(1,5):
url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(page)
html = urlopen(url)
soup = BeautifulSoup(html,"html.parser")
Title = soup.find_all("div", {"class":"views-field-nothing"})
for i in Title:
try:
name = i.find("div", {"class":"views-field-title"}).get_text()
address = i.find("div", {"class":"views-field-address"}).get_text()
city = i.find("div", {"class":"views-field-city-state-zip"}).get_text()
phone = i.find("div", {"class":"views-field-work-phone"}).get_text()
website = i.find("div", {"class":"views-field-website"}).get_text()
print(name, address, city, phone, website)
f.write("{}".format(name).replace(",","|")+ ",{}".format(address)+ ",{}".format(city).replace(",", " ")+ ",{}".format(phone) + ",{}".format(website) + "\n")
except: AttributeError
f.close()
where it is written range(1,5) just change that with 0,to the last page , and you will get all details in CSV, i tried very hard to get your data in proper format but it's hard:).

You're putting a link to a single page, it's not going to iterate through each one on its own.
Page 1:
url = "http://www.pga.com/golf-courses/search?searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0"
Page 2:
http://www.pga.com/golf-courses/search?page=1&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0
Page 907:
http://www.pga.com/golf-courses/search?page=906&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0
Since you're running for page 1 you'll only get 20. You'll need to create a loop that'll run through each page.
You can start off by creating a function that does one page then iterate that function.
Right after the search? in the url, starting at page 2, page=1 begins increasing until page 907 where it's page=906.

I noticed that the first solution had a repetition of the first instance, that is because the 0 page and 1 page is the same page. This is resolved by specifying the start page in the range function. Example below...
for i in range(1, 907): #Number of pages plus one
url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
r = requests.get(url)
soup = BeautifulSoup(r.content, "html5lib") #Can use whichever parser you prefer
# Your code for each individual page here

Had this same exact problem and the solutions above did not work. I solved mine by accounting for cookies. A requests session helps. Create a session and it'll pull all the pages you need by inserting a cookie to all the numbered pages.
import csv
import requests
from bs4 import BeautifulSoup
url = "http://www.pga.com/golf-courses/search?searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0"
s = requests.Session()
r = s.get(url)

The PGA website has changed this question has been asked.
It seems they organize all courses by: State > City > Course
In light of this change and the popularity of this question, here's how I'd solve this problem today.
Step 1 - Import everything we'll need:
import time
import random
from gazpacho import Soup # https://github.com/maxhumber/gazpacho
from tqdm import tqdm # to keep track of progress
Step 2 - Scrape all the state URL endpoints:
URL = "https://www.pga.com"
def get_state_urls():
soup = Soup.get(URL + "/play")
a_tags = soup.find("ul", {"data-cy": "states"}, mode="first").find("a")
state_urls = [URL + a.attrs['href'] for a in a_tags]
return state_urls
state_urls = get_state_urls()
Step 3 - Write a function to scrape all the city links:
def get_state_cities(state_url):
soup = Soup.get(state_url)
a_tags = soup.find("ul", {"data-cy": "city-list"}).find("a")
state_cities = [URL + a.attrs['href'] for a in a_tags]
return state_cities
state_url = state_urls[0]
city_links = get_state_cities(state_url)
Step 4 - Write a function to scrape all of the courses:
def get_courses(city_link):
soup = Soup.get(city_link)
courses = soup.find("div", {"class": "MuiGrid-root MuiGrid-item MuiGrid-grid-xs-12 MuiGrid-grid-md-6"}, mode="all")
return courses
city_link = city_links[0]
courses = get_courses(city_link)
Step 5 - Write a function to parse all the useful info about a course:
def parse_course(course):
return {
"name": course.find("h5", mode="first").text,
"address": course.find("div", {'class': "jss332"}, mode="first").strip(),
"url": course.find("a", mode="first").attrs["href"]
}
course = courses[0]
parse_course(course)
Step 6 - Loop through everything and save:
all_courses = []
for state_url in tqdm(state_urls):
city_links = get_state_cities(state_url)
time.sleep(random.uniform(1, 10) / 10)
for city_link in city_links:
courses = get_courses(city_link)
time.sleep(random.uniform(1, 10) / 10)
for course in courses:
info = parse_course(course)
all_courses.append(info)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Scrape all URLs of a webpage - python

Related

How to scrape stock data incorporating pagination next tag using python bs4?

Duplicated results while scraping the website

How to get all products from a beautifulsoup page

Web-scrape. BeautifulSoup. Multiple Pages. How on earth would you do that?

How can I loop scraping data for multiple pages in a website using python and beautifulsoup4

Categories

Resources