Scraping website with BS4 // accessing class - python

I am trying to extract different information from websites with BeautifulSoup, such as the title of the product and the price.
I do that with different urls, looping through the urls with for...in.... Here, I'll just provide a snippet without the loop.
from bs4 import BeautifulSoup
import requests
import csv
# Product page to scrape.
url= 'https://www.mediamarkt.ch/fr/product/_lg-oled65gx6la-1991479.html'
# Download the HTML exactly as the server returns it (requests does not run JavaScript).
html_content = requests.get(url).text
soup = BeautifulSoup(html_content, "lxml")
# The price is read from the <meta property="product:price:amount"> tag.
price = soup.find('meta', property="product:price:amount")
# Three equivalent spellings of the same lookup: the first <div> with class "flix-model-name".
title = soup.find("div", {"class": "flix-model-name"})
title2 = soup.find('div', class_="flix-model-name")
title3 = soup.find("div", attrs={"class": "flix-model-name"})
print(price['content'])
# NOTE(review): find() returns None when no such <div> exists in the downloaded HTML;
# presumably the element is injected client-side -- confirm against the raw page source.
print(title)
print(title2)
print(title3)
So from this URL https://www.mediamarkt.ch/fr/product/_lg-oled65gx6la-1991479.html I want to extract the product number. The only place I can find it is in the div class="flix-model-name". However, I am totally unable to reach it. I tried different ways to access it (title, title2, title3), but the output is always None.
I am a bit of a beginner, so I guess I am probably missing something basic... If so, please pardon me for that.
Any help is welcome! Many thanks in advance!
just for info, with each url I thought of appending the data and write them on a CSV file like that:
# Build one "title,price" line per product page, then write them all to disk.
# `urls` (iterable of page URLs) and `data` (a list) must be defined beforehand.
for url in urls:
    html_content = requests.get(url).text
    soup = BeautifulSoup(html_content, "lxml")
    try:
        # title = YOUR VERY WELCOMED ANSWER
        prices = soup.find('meta', property="product:price:amount")
        data.append(title.text + ',' + prices['content'] + '\n')
    except (AttributeError, TypeError, KeyError):
        # A lookup returned None, or the tag has no "content" attribute:
        # skip this page and carry on with the next one.
        continue

# 'w' truncates any previous file; the context manager closes it even on error.
with open('database.csv', 'w') as file:
    file.writelines(data)
Many thanks in advance for your help!
David

Try below approach using python - requests simple, straightforward, reliable, fast and less code is required when it comes to requests. I have fetched the API URL from website itself after inspecting the network section of google chrome browser.
What exactly below script is doing:
First it will take the API URL, create the URL based on 2 dynamic parameters(product and category) and then do GET request to get the data.
After getting the data script will parse the JSON data using json.loads library.
Finally, it will iterate over the list of products one by one and print the details (Brand, Name, Product number and Unit price), which are divided into 2 categories: 'box1_ProductToProduct' and 'box2_KategorieTopseller'. In the same way you can add more details by looking into the API call.
import json
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
def _print_products(items):
    """Print brand, name, net unit price and product number of each item."""
    for item in items:
        print('-' * 100)
        print('Brand : ', item['brand'])
        print('Name : ', item['name'])
        print('Net Unit Price : ', item['netUnitPrice'])
        print('Product Number : ', item['product_nr'])
        print('-' * 100)


def scrap_product_details():
    """Fetch the recommendation JSON for one product and print both product lists.

    The URL is built from a product number and a category number; the response
    is expected to be a JSON list whose first element holds the
    'box1_ProductToProduct' list and whose second element holds the
    'box2_KategorieTopseller' list.
    """
    PRODUCT = 'MMCH1991479' #Product number
    CATEGORY = '680942' #Category number
    URL = 'https://www.mediamarkt.ch/rde_server/res/MMCH/recomm/product_detail/sid/WACXyEbIf3khlu6FcHlh1B1?product=' + PRODUCT + '&category=' + CATEGORY # dynamic URL
    # SECURITY NOTE: verify=False disables TLS certificate validation; kept as in
    # the original answer, but prefer verify=True outside of quick experiments.
    response = requests.get(URL, verify=False) #GET request to fetch the data
    result = json.loads(response.text) # Parse JSON data using json.loads
    _print_products(result[0]['box1_ProductToProduct'])
    _print_products(result[1]['box2_KategorieTopseller'])


scrap_product_details()

Related

How do I scrape "description" of movies in the IMDB website using BeautifulSoup?

I am using BeautifulSoup to scrape movies in the IMDB website. I was able to scrape name, genre, duration, rating of movies successfully. But I am not able to scrape description of the movies as when I am looking at the classes, it is "text-muted" and since this class is there multiple times holding other data such as rating, genre, duration. But since these data has inner classes also, so it was easier for me to scrape it but when it is coming to description, it does not have any inner class. So when pulling out data just using "text-muted" is giving other data also. How do I just get the description of the movies?
Attaching the code and screenshot for reference:
The sample code which I used to scrape genre is as follows:
# Every ".genre" element nested under a ".text-muted" element.
genre_tags = data.select(".text-muted .genre")
# Text of each tag with surrounding whitespace removed.
Genre = [tag.get_text().strip() for tag in genre_tags]
print(Genre)
In general, lxml is much better than beautifulsoup.
import requests
from lxml import html

url = "xxxx"
r = requests.get(url)
tree = html.fromstring(r.text)
# One <div class="lister-item mode-detail"> per listed movie.
rows = tree.xpath('//div[@class="lister-item mode-detail"]')
for row in rows:
    # The description is the text of the <p class="text-muted"> that follows
    # the ratings bar inside the same row.
    description = row.xpath('.//div[@class="ratings-bar"]/following-sibling::p[@class="text-muted"]/text()')[0].strip()
You can use this, :) , if helped you, UP my solution pls.. thks,
from bs4 import BeautifulSoup
from requests_html import HTMLSession
URL = 'https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm' #url of Most Popular Movies in IMDB
session = HTMLSession()  # one session reused for the chart and every movie page
PAGE = session.get(URL)
PAGE_BS4 = BeautifulSoup(PAGE.html.html,'html.parser')
MoviesObj = PAGE_BS4.find_all("tbody","lister-list") #get table body of Most Popular Movies
for cell in MoviesObj[0].find_all("td","titleColumn"):
    a = list(cell)[1]  # second child of the cell; used below as the movie link
    href = 'https://www.imdb.com'+a.get('href') #get each link for movie page
    moviepage = session.get(href) #request each page of movie
    moviepage = BeautifulSoup(moviepage.html.html,'html.parser')
    heading = list(moviepage.find_all('h1')[0].stripped_strings)
    title = heading[0] #parse title
    year = heading[2] #parse year
    try:
        score = list(moviepage.find_all('div','ratingValue')[0].stripped_strings)[0] #parse score if is available
    except IndexError:
        score = '-' #if score is not available '-' is filled
    description = list(moviepage.find_all('div','summary_text')[0].stripped_strings)[0] #parse description
    print(f'TITLE: {title} YEAR: {year} SCORE: {score}\nDESCRIPTION:{description}\n')
PRINT
Junior Saldanha
#UmSaldanha

How to scrape embedded links and tabular information

I'm trying to scrape information about the datasets available on this website.
I want to collect the URLs to the resources and at least the title of the dataset.
Using this resource as an example, I want to capture the URL embedded in "Go to resource" and the title listed in the table:
I have created a basic scraper, but it doesn't seem to work:
import requests
import csv
from bs4 import BeautifulSoup
site = requests.get('https://data.nsw.gov.au/data/dataset')
data_list = []
# Compare the status code by value; `is` tests object identity.
if site.status_code == 200:
    content = BeautifulSoup(site.content, 'html.parser')
    internals = content.select('.resource-url-analytics')
    for url in internals:
        # Read text and href from the element being visited in this iteration.
        title = url.get_text()
        link = url.get('href')
        # Keys must match the DictWriter fieldnames below.
        new_data = {"dataset": title, "link": link}
        data_list.append(new_data)

# newline='' lets the csv module control line endings.
with open('selector.csv', 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=["dataset", "link"], delimiter=';')
    writer.writeheader()
    for row in data_list:
        writer.writerow(row)
I would like to write the output to a CSV with columns for the URLs and the titles.
This is an example of the desired output
Greatly appreciative for any assistance
Have a look at the API for the datasets that will likely be the easiest way to do this.
In the meantime, here is how you can get the API links at id level from those pages and store the entire package info for all packages in one list, data_sets, and just the info of interest in another variable (results). Be sure to review the API documentation in case there is a better method - for example, it would be nice if ids could be submitted in batches rather than per id.
Answer below is taking advantage of the endpoint detailed in the documentation which is used to get a full JSON representation of a dataset, resource or other object
Taking the current first result on landing page of:
Vegetation of the Guyra 1:25000 map sheet VIS_ID 240.
We want the last child a of parent h3 with a parent having class .dataset-item. In the below, the spaces between selectors are descendant combinators.
.dataset-item h3 a:last-child
You can shorten this to h3 a:last-child for a small efficiency gain.
This relationship reliably selects all relevant links on page.
Continuing with this example, visiting that retrieved url for first listed item, we can find the id using api endpoint (which retrieves json related to this package), via an attribute=value selector with contains, *, operator. We know this particular api endpoint has a common string so we substring match on the href attribute value:
[href*="/api/3/action/package_show?id="]
The domain can vary and some retrieved links are relative so we have to test if relative and add the appropriate domain.
First page html for that match:
Notes:
data_sets is a list containing all the package data for each package and is extensive. I did this in case you are interested in looking at what is in those packages (besides reviewing the API documentation).
You can get total number of pages from soup object on a page via
num_pages = int(soup.select('[href^="/data/dataset?page="]')[-2].text)
You can alter the loop for less pages.
Session object is used for efficiency of re-using connection. I'm sure there are other improvements to be made. In particular I would look for any method which reduced the number of requests (why I mentioned looking for a batch id endpoint for example).
There can be none to more than one resource url within a returned package. See example here. You can edit code to handle this.
Python:
from bs4 import BeautifulSoup as bs
import requests
import csv
from urllib.parse import urlparse
json_api_links = []
data_sets = []
def get_links(s, url, css_selector):
    """Return the href of every element on `url` that matches `css_selector`.

    s            -- session-like object whose .get(url) returns a response
    url          -- page to download
    css_selector -- CSS selector of the link elements to collect

    Hrefs that start with '/' are prefixed with the scheme and host of `url`;
    all other hrefs are returned unchanged.
    """
    r = s.get(url)
    soup = bs(r.content, 'lxml')
    base = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))
    links = []
    for item in soup.select(css_selector):
        href = item['href']
        # startswith() is safe on an empty href, unlike href[0].
        links.append(base + href if href.startswith('/') else href)
    return links
results = []
#debug = []
with requests.Session() as s:
    for page in range(1,2): #you decide how many pages to loop
        links = get_links(s, 'https://data.nsw.gov.au/data/dataset?page={}'.format(page), '.dataset-item h3 a:last-child')
        for link in links:
            data = get_links(s, link, '[href*="/api/3/action/package_show?id="]')
            json_api_links.append(data)
            #debug.append((link, data))
    # Flatten the per-page lists, remove the substring 'opendata' and de-duplicate.
    resources = list(set([item.replace('opendata','') for sublist in json_api_links for item in sublist])) #can just leave as set
    for link in resources:
        try:
            r = s.get(link).json() #entire package info
            data_sets.append(r)
            title = r['result']['title'] #certain items
            if 'resources' in r['result']:
                urls = ' , '.join([item['url'] for item in r['result']['resources']])
            else:
                urls = 'N/A'
        except (requests.RequestException, ValueError, KeyError, TypeError):
            # Request failed, body was not JSON, or the expected keys are missing.
            title = 'N/A'
            urls = 'N/A'
        results.append((title, urls))

with open('data.csv','w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Title','Resource Url'])
    for row in results:
        w.writerow(row)
All pages
(very long running so consider threading/asyncio):
from bs4 import BeautifulSoup as bs
import requests
import csv
from urllib.parse import urlparse
json_api_links = []
data_sets = []
def get_links(s, url, css_selector):
    """Return the href of every element on `url` that matches `css_selector`.

    s            -- session-like object whose .get(url) returns a response
    url          -- page to download
    css_selector -- CSS selector of the link elements to collect

    Hrefs that start with '/' are prefixed with the scheme and host of `url`;
    all other hrefs are returned unchanged.
    """
    r = s.get(url)
    soup = bs(r.content, 'lxml')
    base = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))
    links = []
    for item in soup.select(css_selector):
        href = item['href']
        # startswith() is safe on an empty href, unlike href[0].
        links.append(base + href if href.startswith('/') else href)
    return links
results = []
#debug = []
with requests.Session() as s:
    # The landing page is fetched only to read the page count from its pagination links.
    r = s.get('https://data.nsw.gov.au/data/dataset')
    soup = bs(r.content, 'lxml')
    num_pages = int(soup.select('[href^="/data/dataset?page="]')[-2].text)
    # Every page (including the first) goes through get_links so that relative
    # hrefs are made absolute and no page is scraped twice.
    for page in range(1, num_pages + 1): #you decide how many pages to loop
        links = get_links(s, 'https://data.nsw.gov.au/data/dataset?page={}'.format(page), '.dataset-item h3 a:last-child')
        for link in links:
            data = get_links(s, link, '[href*="/api/3/action/package_show?id="]')
            json_api_links.append(data)
            #debug.append((link, data))
    # Flatten the per-page lists, remove the substring 'opendata' and de-duplicate.
    resources = list(set([item.replace('opendata','') for sublist in json_api_links for item in sublist])) #can just leave as set
    for link in resources:
        try:
            r = s.get(link).json() #entire package info
            data_sets.append(r)
            title = r['result']['title'] #certain items
            if 'resources' in r['result']:
                urls = ' , '.join([item['url'] for item in r['result']['resources']])
            else:
                urls = 'N/A'
        except (requests.RequestException, ValueError, KeyError, TypeError):
            # Request failed, body was not JSON, or the expected keys are missing.
            title = 'N/A'
            urls = 'N/A'
        results.append((title, urls))

with open('data.csv','w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Title','Resource Url'])
    for row in results:
        w.writerow(row)
For simplicity use selenium package:
from selenium import webdriver
import os
# initialise browser (chromedriver is expected in the current working directory)
browser = webdriver.Chrome(os.getcwd() + '/chromedriver')
browser.get('https://data.nsw.gov.au/data/dataset')
# find all elements by xpath (attribute tests use '@', e.g. @id)
get_elements = browser.find_elements_by_xpath('//*[@id="content"]/div/div/section/div/ul/li/div/h3/a[2]')
# collect (link text, href) pairs
data = [(item.text, item.get_attribute('href')) for item in get_elements]
Output:
('Vegetation of the Guyra 1:25000 map sheet VIS_ID 240', 'https://datasets.seed.nsw.gov.au/dataset/vegetation-of-the-guyra-1-25000-map-sheet-vis_id-2401ee52')
('State Vegetation Type Map: Riverina Region Version v1.2 - VIS_ID 4469', 'https://datasets.seed.nsw.gov.au/dataset/riverina-regional-native-vegetation-map-version-v1-0-vis_id-4449')
('Temperate Highland Peat Swamps on Sandstone (THPSS) spatial distribution maps...', 'https://datasets.seed.nsw.gov.au/dataset/temperate-highland-peat-swamps-on-sandstone-thpss-vegetation-maps-vis-ids-4480-to-4485')
('Environmental Planning Instrument - Flood', 'https://www.planningportal.nsw.gov.au/opendata/dataset/epi-flood')
and so on

python crawling beautifulsoup how to crawl several pages?

Please Help.
I want to get all the company names of each pages and they have 12 pages.
http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/1
http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/2
-- this website only changes the number.
So Here is my code so far.
Can I get just the title (company name) of 12 pages?
Thank you in advance.
from bs4 import BeautifulSoup
import requests
maximum = 0
page = 1
URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/1'
response = requests.get(URL)
source = response.text
soup = BeautifulSoup(source, 'html.parser')
whole_source = ""
# NOTE(review): with maximum = 0 this range is empty, so no page is downloaded here.
for page_number in range(1, maximum+1):
    URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/' + str(page_number)
    response = requests.get(URL)
    whole_source = whole_source + response.text
# Parse the concatenated HTML of all downloaded pages as one document.
soup = BeautifulSoup(whole_source, 'html.parser')
# NOTE(review): li:nth-child(13) restricts the match to the 13th list item only.
find_company = soup.select("#content > div.wrap_analysis_data > div.public_con_box.public_list_wrap > ul > li:nth-child(13) > div > strong")
for company in find_company:
    print(company.text)
---------Output of one page
---------page source :)
So, you want to remove all the headers and get only the string of the company name?
Basically, you can use the soup.findAll to find the list of company in the format like this:
<strong class="company"><span>중소기업진흥공단</span></strong>
Then you use the .find function to extract information from the <span> tag:
<span>중소기업진흥공단</span>
After that, you use .contents function to get the string from the <span> tag:
'중소기업진흥공단'
So you write a loop to do the same for each page, and make a list called company_list to store the results from each page and append them together.
Here's the code:
from bs4 import BeautifulSoup
import requests
maximum = 12
company_list = [] # List for result storing
for page_number in range(1, maximum+1):
    URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/{}'.format(page_number)
    response = requests.get(URL)
    print(page_number)
    whole_source = response.text
    soup = BeautifulSoup(whole_source, 'html.parser')
    for entry in soup.find_all('strong', attrs={'class': 'company'}): # Finding all company names in the page
        span = entry.find('span')
        # Skip entries without a (non-empty) <span> instead of crashing on None.
        if span is None or not span.contents:
            continue
        company_list.append(span.contents[0]) # Extracting name from the result
The company_list will give you all the company names you want
I figured it out eventually. Thank you for your answer though!
image : code captured in jupyter notebook
Here is my final code.
from urllib.request import urlopen
from bs4 import BeautifulSoup
company_list=[]
for n in range(12):
    url = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/{}'.format(n+1)
    webpage = urlopen(url)
    source = BeautifulSoup(webpage,'html.parser',from_encoding='utf-8')
    companys = source.findAll('strong',{'class':'company'})
    for company in companys:
        # Trim the text and drop any embedded newline, tab or carriage return.
        company_list.append(company.get_text().strip().replace('\n','').replace('\t','').replace('\r',''))

# One company name per line; the context manager closes the file even on error.
with open('company_name1.txt','w',encoding='utf-8') as file:
    for company in company_list:
        file.write(company+'\n')

Google news crawler flip pages

Continuing previous work to crawl all news results for a query and return the title and URL, I am refining the crawler to get all results from all pages in Google News. The current code seems to return only the first page of Google News search results. I would be grateful to know how to get the results from all pages. Many thanks!
my codes below:
import requests
from bs4 import BeautifulSoup
import time
import datetime
from random import randint
import numpy as np
import pandas as pd
query2Google = input("What do you want from Google News?\n")
def QGN(query2Google):
    """Search Google News for an exact phrase and save one results page as CSV.

    query2Google -- the search phrase; it is wrapped in double quotes and its
                    spaces are replaced by '+' to build the query string.

    Writes '<query>_<date>_SearchNews.csv' with the columns Title, URL and
    Brief, and returns None.
    """
    s = '"'+query2Google+'"' #Keywords for query
    s = s.replace(" ","+")
    date = str(datetime.datetime.now().date()) #timestamp
    filename =query2Google+"_"+date+"_"+'SearchNews.csv' #csv filename
    url = "http://www.google.com.sg/search?q="+s+"&tbm=nws&tbs=qdr:y" # URL for query of news results within one year and sort by date
    time.sleep(randint(0, 2))#waiting
    htmlpage = requests.get(url)
    print("Status code: "+ str(htmlpage.status_code))
    soup = BeautifulSoup(htmlpage.text,'lxml')
    redirect_prefix = '/url?q='
    rows = []
    for result_table in soup.findAll("div", {"class": "g"}):
        a_click = result_table.find("a")
        if a_click is None:
            continue  # result block without a link: nothing to record
        href = str(a_click.get("href"))
        # Remove the redirect prefix as a prefix; str.strip() would treat it as a
        # set of characters and also eat matching characters at the end of the URL.
        if href.startswith(redirect_prefix):
            href = href[len(redirect_prefix):]
        brief_tag = result_table.find("div", {"class": "st"})
        brief = brief_tag.decode_contents() if brief_tag is not None else ''
        # decode_contents() gives the inner markup as text, so there is no
        # b'...' wrapper to strip away afterwards.
        rows.append([a_click.decode_contents(), href, brief])
    df1 = pd.DataFrame(rows,columns=['Title','URL','Brief'])
    print("Search Crawl Done!")
    # to_csv opens and closes the file itself; no separate handle is needed.
    df1.to_csv(filename, index=False,encoding='utf-8')
    return
QGN(query2Google)
There used to be an AJAX API, but it's no longer available.
Still , you can modify your script with a for loop if you want to get a number of pages , or a while loop if you want to get all pages .
Example :
url = "http://www.google.com.sg/search?q="+s+"&tbm=nws&tbs=qdr:y&start="
pages = 10 # the number of pages you want to crawl #
# The start parameter advances in steps of 10: 0, 10, 20, ...
for start in range(0, pages*10, 10) :
    page = url + str(start)
    time.sleep(randint(1, 5)) # you may need longer than that #
    htmlpage = requests.get(page) # you should add User-Agent and Referer #
    print("Status code: " + str(htmlpage.status_code))
    if htmlpage.status_code != 200 :
        break # something went wrong #
    soup = BeautifulSoup(htmlpage.text, 'lxml')
    # ... process response here ...
    next_page = soup.find('td', { 'class':'b', 'style':'text-align:left' })
    if next_page is None or next_page.a is None :
        break # there are no more pages #
Keep in mind that google doesn't like bots , you might get a ban .
You could add 'User-Agent' and 'Referer' in headers to simulate a web browser , and use time.sleep(random.uniform(2, 6)) to simulate a human ... or use selenium.
You can also add &num=25 to the end of your query and you'll get back a webpage with that number of results. In this example you'll get 25 Google results back.

How can I loop scraping data for multiple pages in a website using python and beautifulsoup4

I am trying to scrape data from the PGA.com website to get a table of all of the golf courses in the United States. In my CSV table I want to include the Name of the golf course ,Address ,Ownership ,Website , Phone number. With this data I would like to geocode it and place into a map and have a local copy on my computer
I used Python and Beautiful Soup 4 to extract my data. I have got as far as extracting the data and importing it into a CSV, but I am now having a problem scraping data from multiple pages on the PGA website. I want to extract ALL THE GOLF COURSES, but my script is limited to only one page. I want to loop it in a way that it will capture all data for golf courses from all pages found on the PGA site. There are about 18000 golf courses and 900 pages to capture data from.
Attached below is my script. I need help creating code that will capture ALL data from the PGA website, not just from one page but from multiple pages. In this manner it will provide me with all the data for golf courses in the United States.
Here is my script below:
import csv
import requests
from bs4 import BeautifulSoup
def _field_text(item, css_class):
    """Return the text of the first <div class=css_class> under item.contents[1], or ''."""
    try:
        return item.contents[1].find_all("div",{"class":css_class})[0].text
    except (IndexError, AttributeError):
        # No second child, child is not a tag, or no matching <div>.
        return ''


url = "http://www.pga.com/golf-courses/search?searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0"
r = requests.get(url)
soup = BeautifulSoup(r.content)
g_data1=soup.find_all("div",{"class":"views-field-nothing-1"})
g_data2=soup.find_all("div",{"class":"views-field-nothing"})
courses_list=[]
for item in g_data2:
    name=_field_text(item,"views-field-title")
    address1=_field_text(item,"views-field-address")
    address2=_field_text(item,"views-field-city-state-zip")
    website=_field_text(item,"views-field-website")
    Phonenumber=_field_text(item,"views-field-work-phone")
    course=[name,address1,address2,website,Phonenumber]
    courses_list.append(course)

# NOTE: 'wb' is the mode the Python 2 csv module expects; on Python 3 open the
# file with open('filename5.csv', 'w', newline='') instead.
with open ('filename5.csv','wb') as file:
    writer=csv.writer(file)
    for row in courses_list:
        writer.writerow(row)
#for item in g_data1:
#try:
#print item.contents[1].find_all("div",{"class":"views-field-counter"})[0].text
#except:
#pass
#try:
#print item.contents[1].find_all("div",{"class":"views-field-course-type"})[0].text
#except:
#pass
#for item in g_data2:
#try:
#print item.contents[1].find_all("div",{"class":"views-field-title"})[0].text
#except:
#pass
#try:
#print item.contents[1].find_all("div",{"class":"views-field-address"})[0].text
#except:
#pass
#try:
#print item.contents[1].find_all("div",{"class":"views-field-city-state-zip"})[0].text
#except:
#pass
This script only captures 20 at a time, and I want to capture all of them in one script, which means about 18000 golf courses and 900 pages to scrape from.
The PGA website's search have multiple pages, the url follows the pattern:
http://www.pga.com/golf-courses/search?page=1 # Additional info after page parameter here
This means you can read the content of the page, then increase the value of page by 1, read the next page, and so on.
import csv
import requests
from bs4 import BeautifulSoup
for i in range(907): # Number of pages plus one
    # The page index is substituted into the "page" query parameter.
    url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    # Your code for each individual page here
if you still read this post , you can try this code too....
from urllib.request import urlopen
from bs4 import BeautifulSoup
file = "Details.csv"
Headers = "Name,Address,City,Phone,Website\n"
# The context manager closes the file even if a request or parse step fails.
with open(file, "w") as f:
    f.write(Headers)
    for page in range(1,5):
        url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(page)
        html = urlopen(url)
        soup = BeautifulSoup(html,"html.parser")
        Title = soup.find_all("div", {"class":"views-field-nothing"})
        for i in Title:
            try:
                name = i.find("div", {"class":"views-field-title"}).get_text()
                address = i.find("div", {"class":"views-field-address"}).get_text()
                city = i.find("div", {"class":"views-field-city-state-zip"}).get_text()
                phone = i.find("div", {"class":"views-field-work-phone"}).get_text()
                website = i.find("div", {"class":"views-field-website"}).get_text()
            except AttributeError:
                # find() returned None for one of the fields: skip this entry.
                continue
            print(name, address, city, phone, website)
            # Replace commas inside name and city BEFORE joining, so the column
            # separators themselves are never rewritten.
            fields = [name.replace(",","|"), address, city.replace(",", " "), phone, website]
            f.write(",".join(fields) + "\n")
where it is written range(1,5) just change that with 0,to the last page , and you will get all details in CSV, i tried very hard to get your data in proper format but it's hard:).
You're putting a link to a single page, it's not going to iterate through each one on its own.
Page 1:
url = "http://www.pga.com/golf-courses/search?searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0"
Page 2:
http://www.pga.com/golf-courses/search?page=1&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0
Page 907:
http://www.pga.com/golf-courses/search?page=906&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0
Since you're running for page 1 you'll only get 20. You'll need to create a loop that'll run through each page.
You can start off by creating a function that does one page then iterate that function.
Right after the search? in the url, starting at page 2, page=1 begins increasing until page 907 where it's page=906.
I noticed that the first solution had a repetition of the first instance, that is because the 0 page and 1 page is the same page. This is resolved by specifying the start page in the range function. Example below...
for i in range(1, 907): #Number of pages plus one
    # Starting at 1 avoids requesting page=0.
    url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html5lib") #Can use whichever parser you prefer
    # Your code for each individual page here
Had this same exact problem and the solutions above did not work. I solved mine by accounting for cookies. A requests session helps. Create a session and it'll pull all the pages you need by inserting a cookie to all the numbered pages.
import csv
import requests
from bs4 import BeautifulSoup
# Search URL without a page parameter.
url = "http://www.pga.com/golf-courses/search?searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0"
# A Session keeps cookies between requests made through it.
s = requests.Session()
r = s.get(url)
The PGA website has changed since this question was asked.
It seems they organize all courses by: State > City > Course
In light of this change and the popularity of this question, here's how I'd solve this problem today.
Step 1 - Import everything we'll need:
import time
import random
from gazpacho import Soup # https://github.com/maxhumber/gazpacho
from tqdm import tqdm # to keep track of progress
Step 2 - Scrape all the state URL endpoints:
URL = "https://www.pga.com"


def get_state_urls():
    """Return absolute URLs for every link inside the <ul data-cy="states"> on /play."""
    soup = Soup.get(URL + "/play")
    states = soup.find("ul", {"data-cy": "states"}, mode="first")
    # mode="all" always returns a list, even when only one <a> matches.
    a_tags = states.find("a", mode="all")
    state_urls = [URL + a.attrs['href'] for a in a_tags]
    return state_urls


state_urls = get_state_urls()
Step 3 - Write a function to scrape all the city links:
def get_state_cities(state_url):
    """Return absolute URLs for every link inside the <ul data-cy="city-list"> of a state page."""
    soup = Soup.get(state_url)
    # mode="first" matches how get_state_urls looks up its list element.
    city_list = soup.find("ul", {"data-cy": "city-list"}, mode="first")
    # mode="all" always returns a list, even when only one <a> matches.
    a_tags = city_list.find("a", mode="all")
    state_cities = [URL + a.attrs['href'] for a in a_tags]
    return state_cities


state_url = state_urls[0]
city_links = get_state_cities(state_url)
Step 4 - Write a function to scrape all of the courses:
def get_courses(city_link):
    """Return every course card <div> found on the page at `city_link` (as a list)."""
    soup = Soup.get(city_link)
    # Cards are matched by their grid-item class string.
    courses = soup.find("div", {"class": "MuiGrid-root MuiGrid-item MuiGrid-grid-xs-12 MuiGrid-grid-md-6"}, mode="all")
    return courses


city_link = city_links[0]
courses = get_courses(city_link)
Step 5 - Write a function to parse all the useful info about a course:
def parse_course(course):
    """Return a dict with the name, address and url read from one course element."""
    return {
        "name": course.find("h5", mode="first").text,
        # NOTE(review): "jss332" looks like a generated class name and may
        # change between site builds -- verify before a long crawl.
        "address": course.find("div", {'class': "jss332"}, mode="first").strip(),
        "url": course.find("a", mode="first").attrs["href"]
    }


course = courses[0]
parse_course(course)
Step 6 - Loop through everything and save:
all_courses = []
for state_url in tqdm(state_urls):
    city_links = get_state_cities(state_url)
    # Pause 0.1-1.0 s between requests.
    time.sleep(random.uniform(1, 10) / 10)
    for city_link in city_links:
        courses = get_courses(city_link)
        time.sleep(random.uniform(1, 10) / 10)
        for course in courses:
            info = parse_course(course)
            all_courses.append(info)

Categories