In the code below, I fill in a form and submit it on a website, then scrape the resulting data and write it to a CSV file (all of this works well). But there is a link on the result page with the text 'Later'. How can I click this link? I am using mechanize. I have checked a similar question, but it doesn't quite answer mine.
# import needed libraries
from mechanize import Browser
from datetime import datetime
from bs4 import BeautifulSoup
import csv
br = Browser()
# Ignore robots.txt
br.set_handle_robots(False)
# Google demands a user-agent that isn't a robot
br.addheaders = [('User-agent', 'Chrome')]
# Retrieve the Google home page, saving the response
br.open('http://fahrplan.sbb.ch/bin/query.exe/en')
# Enter the text input (This section should be automated to read multiple text input as shown in the question)
br.select_form(nr=6)
br.form["REQ0JourneyStopsS0G"] = 'Eisenstadt' # Origin train station (From)
br.form["REQ0JourneyStopsZ0G"] ='sarajevo' # Destination train station (To)
br.form["REQ0JourneyTime"] = '5:30' # Search Time
br.form["date"] = '18.01.17' # Search Date
# Get the search results
br.submit()
# get the response from mechanize Browser
soup = BeautifulSoup(br.response().read(), 'html.parser', from_encoding="utf-8")
trs = soup.select('table.hfs_overview tr')
# scrape the contents of the table to csv (This is not complete as I cannot write the duration column to the csv)
with open('out.csv', 'w') as f:
    for tr in trs:
        locations = tr.select('td.location')
        if len(locations) > 0:
            location = locations[0].contents[0].strip()
            prefix = tr.select('td.prefix')[0].contents[0].strip()
            time = tr.select('td.time')[0].contents[0].strip()
            #print tr.select('td.duration').contents[0].strip()
            durations = tr.select('td.duration')
            #print durations
            if len(durations) == 0:
                duration = ''
                #print("oops! There aren't any durations.")
            else:
                duration = durations[0].contents[0].strip()
            f.write("{},{},{},{}\n".format(location.encode('utf-8'), prefix, time, duration))
The HTML for the Later link looks like this:
<a accesskey="l" class="hafas-browse-link" href="http://fahrplan.sbb.ch/bin/query.exe/en?ld=std2.a&seqnr=1&ident=kv.047469247.1487285405&REQ0HafasScrollDir=1" id="hfs_linkLater" title="Search for later connections">Later</a>
You can find the url using:
In [22]: soup.find('a', text='Later')['href']
Out[22]: u'http://fahrplan.sbb.ch/bin/query.exe/en?ld=std2.a&seqnr=1&ident=kv.047469247.1487285405&REQ0HafasScrollDir=1'
To make the browser go to that link call br.open:
In [21]: br.open(soup.find('a', text='Later')['href'])
Out[21]: <response_seek_wrapper at 0x7f346a5da320 whose wrapped object = <closeable_response at 0x7f3469bee830 whose fp = <socket._fileobject object at 0x7f34697f26d0>>>
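To keep scraping after clicking, you can put this in a loop: parse the current page, find the Later link, open it with br.open, and rerun the same table-scraping code. A minimal sketch (how many extra pages to follow is an assumption here):
for _ in range(3):  # number of extra result pages to fetch is an assumption
    later = soup.find('a', text='Later')
    if later is None:
        break  # no "Later" link on this page, stop
    br.open(later['href'])  # follow the link
    soup = BeautifulSoup(br.response().read(), 'html.parser', from_encoding="utf-8")
    trs = soup.select('table.hfs_overview tr')
    # ... reuse the same row-scraping / csv-writing loop as above on trs ...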
I can't figure out how to get this script to read the actual documents behind the links it pulls; it won't bring back the text from the documents themselves. I also tried using iframe and src but was unsuccessful.
I have never tried anything like this before, so I'm a little stumped about what else I can do.
from bs4 import BeautifulSoup
from selenium import webdriver
from urllib.parse import urlparse, parse_qs
import io
import requests
from PyPDF2 import PdfReader
# specify the web driver path
driver = webdriver.Chrome("path/to/chromedriver")
# navigate to the website
url = "https://probaterecords.shelbyal.com/shelby/search.do?indexName=opr&templateName=Main&searchQuery=richard+wygle&lq=&searchType=1®ex=%5B%5Ea-zA-Z0-9_%5D®exwithspaces=%5B%5Ea-zA-Z0-9%5Cs_%5D®exwithasterisks=%5B%5E*a-zA-Z0-9%5Cs_%5D&sortBy=InstrumentFilter&desc=N&searchable=DisplayName%2CLastName%2CFirstName%2CInstrument%2CRecDate%2CPartyRole%2CDocTypeDesc%2CDocType%2CBook%2CPage%2CLot%2CBlock%2CTownship%2COther%2CFreeform%2COtherName&isPhoneticSearch=&q=richard+wygle&basicSortOrder=InstrumentFilter%7CN&Instrument=&Instrument_select=AND&RecDate=&RecDate=&RecDate_select=AND&LastName=&LastName_select=AND&searchkindLast=StartsLast&FirstName=&FirstName_select=OR&FirstName2=&FirstName2_select=AND&DocTypeDesc=&DocTypeDesc_select=AND&Book=&Book_select=AND&Page=&Page_select=AND&MAPBOOK=&MAPBOOK_select=AND&MAPPAGE=&MAPPAGE_select=AND&Lot%23=&Lot%23_select=AND&Lot=&Lot_select=AND&Block=&Block_select=AND&Section=&Section_select=AND&Township=&Township_select=AND&Range=&Range_select=AND&QT=&QT_select=AND&BQT=&BQT_select=AND&LegacyNum=&LegacyNum_select=AND&advancedSortOrder=InstrumentFilter%7CN"
driver.get(url)
# get the page source
html = driver.page_source
# parse the HTML
soup = BeautifulSoup(html, 'html.parser')
# find all the anchor tags with class "nocolor pphoto"
links = soup.select('a[class="nocolor pphoto"]')
# Create an empty dictionary to store the links
unique_links = {}
for link in links:
    href = link['href']
    if href.startswith('/shelby/search.do?indexName=shelbyimages&lq='):
        # construct the full link
        full_link = 'https://probaterecords.shelbyal.com' + href
        # parse the query parameters from the link
        parsed_url = urlparse(full_link)
        query_params = parse_qs(parsed_url.query)
        # extract the instrument number from the query parameters
        instrument_number = query_params['lq'][0]
        # Extract the document type
        options = soup.select('select[name="DocTypeDesc"] option')
        for option in options:
            # check if the option value contains "deed"
            if "deeds" in option.get_text().lower():
                doc_type = option.get_text()
        # add the link to the dictionary
        unique_links[instrument_number] = (full_link, doc_type)

# Iterate over the unique links
for instrument_number, link_info in unique_links.items():
    full_link, doc_type = link_info
    # Open the PDF file from the url
    response = requests.get(full_link)
    pdf_file = io.BytesIO(response.content)
    pdf_reader = PdfReader(pdf_file)
    # Get the number of pages
    pages = len(pdf_reader.pages)
    # Initialize a variable to store the text
    text = ""
    # Iterate over the pages
    for page in pdf_reader.pages:
        # Extract the text from the page
        text += page.extract_text()
    # Print the document type, instrument number and the text
    print("Document Type: ", doc_type)
    print("Instrument Number: ", instrument_number)
    print("Text: ", text)
I'm trying to scrape information about the datasets available on this website.
I want to collect the URLs to the resources and at least the title of the dataset.
Using this resource as an example, I want to capture the URL embedded in "Go to resource" and the title listed in the table:
I have created a basic scraper, but it doesn't seem to work:
import requests
import csv
from bs4 import BeautifulSoup

site = requests.get('https://data.nsw.gov.au/data/dataset')
data_list = []
if site.status_code is 200:
    content = BeautifulSoup(site.content, 'html.parser')
    internals = content.select('.resource-url-analytics')
    for url in internals:
        title = internals.select=('.resource-url-analytics')[0].get_text()
        link = internals.select=('.resource-url-analytics')[0].get('href')
        new_data = {"title": title, "link": link}
        data_list.append(new_data)
    with open('selector.csv', 'w') as file:
        writer = csv.DictWriter(file, fieldnames=["dataset", "link"], delimiter=';')
        writer.writeheader()
        for row in data_list:
            writer.writerow(row)
I would like to write the output to a CSV with columns for the URLs and the titles.
This is an example of the desired output
I would be grateful for any assistance.
Have a look at the API for the datasets; that will likely be the easiest way to do this.
In the meantime, here is how you can get the API links at id level from those pages and store the entire package info for all packages in one list, data_sets, with just the info of interest in another variable (results). Be sure to review the API documentation in case there is a better method - for example, it would be nice if ids could be submitted in batches rather than per id.
The answer below takes advantage of the package_show endpoint detailed in the documentation, which returns a full JSON representation of a dataset, resource or other object.
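As a hedged sketch of the batch idea mentioned above: if this portal exposes the standard CKAN package_search action (an assumption to verify against the documentation; the base url below is also an assumption), you could pull titles and resource urls in pages of results without visiting each dataset page:
import requests

# sketch only: assumes the standard CKAN package_search endpoint is available at this base url
base = 'https://data.nsw.gov.au/data/api/3/action/package_search'
rows, start, results = 100, 0, []
with requests.Session() as s:
    while True:
        r = s.get(base, params={'rows': rows, 'start': start}).json()
        packages = r['result']['results']
        if not packages:
            break
        for pkg in packages:
            urls = ' , '.join(res['url'] for res in pkg.get('resources', [])) or 'N/A'
            results.append((pkg['title'], urls))
        start += rows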
Taking the current first result on the landing page:
Vegetation of the Guyra 1:25000 map sheet VIS_ID 240.
We want the last child a of a parent h3 whose parent has the class .dataset-item. In the selector below, the spaces between selectors are descendant combinators.
.dataset-item h3 a:last-child
You can shorten this to h3 a:last-child for a small efficiency gain.
This relationship reliably selects all the relevant links on the page.
Continuing with this example and visiting the retrieved url for the first listed item, we can find the link to the api endpoint (which returns the json for this package) via an attribute = value selector with the contains (*) operator. We know this particular api endpoint has a common string, so we substring-match on the href attribute value:
[href*="/api/3/action/package_show?id="]
The domain can vary, and some retrieved links are relative, so we have to test whether a link is relative and add the appropriate domain.
Notes:
data_sets is a list containing all the package data for each package and is extensive. I did this in case you are interested in looking at what is in those packages (besides reviewing the API documentation).
You can get total number of pages from soup object on a page via
num_pages = int(soup.select('[href^="/data/dataset?page="]')[-2].text)
You can alter the loop for fewer pages.
A Session object is used for the efficiency of re-using the connection. I'm sure there are other improvements to be made; in particular, I would look for any method that reduces the number of requests (which is why I mentioned looking for a batch id endpoint, for example).
A returned package can contain anywhere from zero to several resource urls. See the example here. You can edit the code to handle this.
Python:
from bs4 import BeautifulSoup as bs
import requests
import csv
from urllib.parse import urlparse

json_api_links = []
data_sets = []

def get_links(s, url, css_selector):
    r = s.get(url)
    soup = bs(r.content, 'lxml')
    base = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))
    links = [base + item['href'] if item['href'][0] == '/' else item['href'] for item in soup.select(css_selector)]
    return links

results = []
#debug = []

with requests.Session() as s:

    for page in range(1, 2):  # you decide how many pages to loop
        links = get_links(s, 'https://data.nsw.gov.au/data/dataset?page={}'.format(page), '.dataset-item h3 a:last-child')

        for link in links:
            data = get_links(s, link, '[href*="/api/3/action/package_show?id="]')
            json_api_links.append(data)
            #debug.append((link, data))

    resources = list(set([item.replace('opendata', '') for sublist in json_api_links for item in sublist]))  # can just leave as set

    for link in resources:
        try:
            r = s.get(link).json()  # entire package info
            data_sets.append(r)
            title = r['result']['title']  # certain items
            if 'resources' in r['result']:
                urls = ' , '.join([item['url'] for item in r['result']['resources']])
            else:
                urls = 'N/A'
        except:
            title = 'N/A'
            urls = 'N/A'
        results.append((title, urls))

with open('data.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Title', 'Resource Url'])
    for row in results:
        w.writerow(row)
All pages
(very long-running, so consider threading/asyncio):
from bs4 import BeautifulSoup as bs
import requests
import csv
from urllib.parse import urlparse

json_api_links = []
data_sets = []

def get_links(s, url, css_selector):
    r = s.get(url)
    soup = bs(r.content, 'lxml')
    base = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))
    links = [base + item['href'] if item['href'][0] == '/' else item['href'] for item in soup.select(css_selector)]
    return links

results = []
#debug = []

with requests.Session() as s:

    r = s.get('https://data.nsw.gov.au/data/dataset')
    soup = bs(r.content, 'lxml')
    num_pages = int(soup.select('[href^="/data/dataset?page="]')[-2].text)
    links = [item['href'] for item in soup.select('.dataset-item h3 a:last-child')]

    for link in links:
        data = get_links(s, link, '[href*="/api/3/action/package_show?id="]')
        json_api_links.append(data)
        #debug.append((link, data))

    if num_pages > 1:
        for page in range(1, num_pages + 1):  # you decide how many pages to loop
            links = get_links(s, 'https://data.nsw.gov.au/data/dataset?page={}'.format(page), '.dataset-item h3 a:last-child')

            for link in links:
                data = get_links(s, link, '[href*="/api/3/action/package_show?id="]')
                json_api_links.append(data)
                #debug.append((link, data))

    resources = list(set([item.replace('opendata', '') for sublist in json_api_links for item in sublist]))  # can just leave as set

    for link in resources:
        try:
            r = s.get(link).json()  # entire package info
            data_sets.append(r)
            title = r['result']['title']  # certain items
            if 'resources' in r['result']:
                urls = ' , '.join([item['url'] for item in r['result']['resources']])
            else:
                urls = 'N/A'
        except:
            title = 'N/A'
            urls = 'N/A'
        results.append((title, urls))

with open('data.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Title', 'Resource Url'])
    for row in results:
        w.writerow(row)
For simplicity, use the selenium package:
from selenium import webdriver
import os
# initialise browser
browser = webdriver.Chrome(os.getcwd() + '/chromedriver')
browser.get('https://data.nsw.gov.au/data/dataset')
# find all elements by xpath
get_elements = browser.find_elements_by_xpath('//*[@id="content"]/div/div/section/div/ul/li/div/h3/a[2]')
# collect data
data = []
for item in get_elements:
    data.append((item.text, item.get_attribute('href')))
Output:
('Vegetation of the Guyra 1:25000 map sheet VIS_ID 240', 'https://datasets.seed.nsw.gov.au/dataset/vegetation-of-the-guyra-1-25000-map-sheet-vis_id-2401ee52')
('State Vegetation Type Map: Riverina Region Version v1.2 - VIS_ID 4469', 'https://datasets.seed.nsw.gov.au/dataset/riverina-regional-native-vegetation-map-version-v1-0-vis_id-4449')
('Temperate Highland Peat Swamps on Sandstone (THPSS) spatial distribution maps...', 'https://datasets.seed.nsw.gov.au/dataset/temperate-highland-peat-swamps-on-sandstone-thpss-vegetation-maps-vis-ids-4480-to-4485')
('Environmental Planning Instrument - Flood', 'https://www.planningportal.nsw.gov.au/opendata/dataset/epi-flood')
and so on
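To get the CSV the question asks for (columns for the title and url), the collected data list can then be written out with the csv module; the file name below is arbitrary:
import csv

# write the (title, href) pairs collected above to a CSV file
with open('datasets.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Title', 'Resource Url'])
    writer.writerows(data)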
I wish to write to a CSV file a list of all authors, with their URLs, who class themselves under a specific tag on Google Scholar. For example, if we were to take 'security', I would want this output:
author url
Howon Kim https://scholar.google.pl/citations?user=YUoJP-oAAAAJ&hl=pl
Adrian Perrig https://scholar.google.pl/citations?user=n-Oret4AAAAJ&hl=pl
... ...
I have written this code which prints each author's name
# -*- coding: utf-8 -*-
import urllib.request
import csv
from bs4 import BeautifulSoup
url = "http://scholar.google.pl/citations?view_op=search_authors&hl=pl&mauthors=label:security"
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, 'lxml')
mydivs = soup.findAll("h3", { "class" : "gsc_1usr_name"})
outputFile = open('sample.csv', 'w', newline='')
outputWriter = csv.writer(outputFile)
for each in mydivs:
    for anchor in each.find_all('a'):
        print(anchor.text)
However, this only does it for the first page. Instead, I would like to go through every page. How can I do this?
I'm not going to write the code for you, but I'll give you an outline of how you can do it.
Look at the bottom of the page. See the next button? Search for it; the containing div has an id of gsc_authors_bottom_pag, which should be easy to find. I'd do this with selenium: find the next button (on the right), click it, wait for the page to load, scrape, and repeat. Handle the edge cases (out of pages, etc.).
If the after_author=* bit didn't change in the url you could just increment the url start, but unless you want to try to crack that code (unlikely), just click the next button.
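A rough sketch of that outline (untested; it assumes the gsc_1usr_name class from your own code, that the buttons inside gsc_authors_bottom_pag are previous/next in that order, and that the next button is disabled on the last page):
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome()  # assumes chromedriver is available on PATH
driver.get("http://scholar.google.pl/citations?view_op=search_authors&hl=pl&mauthors=label:security")

authors = []
while True:
    soup = BeautifulSoup(driver.page_source, 'lxml')
    for h3 in soup.find_all("h3", {"class": "gsc_1usr_name"}):
        a = h3.find('a')
        authors.append((a.text, 'https://scholar.google.pl' + a['href']))
    # the paging buttons live in the div with id gsc_authors_bottom_pag
    buttons = driver.find_elements(By.CSS_SELECTOR, '#gsc_authors_bottom_pag button')
    if not buttons or not buttons[-1].is_enabled():
        break  # out of pages
    buttons[-1].click()  # assumed to be the "next" (right) button
    time.sleep(2)  # crude wait for the next page to load
driver.quit()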
This page uses <button> instead of <a> for the links to the next/previous page.
The button for the next page has aria-label="Następna" ("Next").
There are two buttons for the next page, but you can use either of them.
The button has JavaScript code to redirect to the new page:
window.location=url_to_next_page
but it is plain text, so you can use slicing to get only the url.
import urllib.request
from bs4 import BeautifulSoup

url = "http://scholar.google.pl/citations?view_op=search_authors&hl=pl&mauthors=label:security"

while True:
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page, 'lxml')

    # ... do something on page ...

    # find buttons to next page
    buttons = soup.findAll("button", {"aria-label": "Następna"})

    # exit if no buttons
    if not buttons:
        break

    on_click = buttons[0].get('onclick')
    print('javascript:', on_click)

    # add `domain` and remove `window.location='` and `'` at the end
    url = 'http://scholar.google.pl' + on_click[17:-1]

    # converting some codes to chars
    url = url.encode('utf-8').decode('unicode_escape')
    print('url:', url)
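As a sketch of what the "# ... do something on page ..." placeholder could be, you can reuse the gsc_1usr_name selector from your own code inside that loop (the profile hrefs are assumed to be relative, so the domain is prepended):
# inside the while loop, in place of "# ... do something on page ...":
for h3 in soup.findAll("h3", {"class": "gsc_1usr_name"}):
    anchor = h3.find('a')
    name = anchor.text
    profile_url = 'http://scholar.google.pl' + anchor['href']
    print(name, profile_url)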
BTW: if you speak Polish, you can visit Python Poland or Python: pierwsze kroki on Facebook.
Since furas has already answered how to loop through all pages, this is a complementary answer to his. The script below scrapes much more than your question asks for and writes it to a .csv file.
Code and example in online IDE:
from bs4 import BeautifulSoup
import requests, lxml, os, csv

headers = {
    'User-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

def get_profiles_to_csv():
    html = requests.get('http://scholar.google.pl/citations?view_op=search_authors&hl=pl&mauthors=label:security', headers=headers).text
    soup = BeautifulSoup(html, 'lxml')

    # creating CSV File
    with open('awesome_file.csv', mode='w') as csv_file:
        # defining column names
        fieldnames = ['Author', 'URL']
        # defining .csv writer
        # https://docs.python.org/3/library/csv.html#csv.DictWriter
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        # writing (creating) columns
        writer.writeheader()

        # collecting scraped data
        author_data = []

        # Selecting container where all data located
        for result in soup.select('.gs_ai_chpr'):
            name = result.select_one('.gs_ai_name a').text
            link = result.select_one('.gs_ai_name a')['href']

            # https://stackoverflow.com/a/6633693/15164646
            # id = link
            # id_identifer = 'user='
            # before_keyword, keyword, after_keyword = id.partition(id_identifer)
            # author_id = after_keyword
            # affiliations = result.select_one('.gs_ai_aff').text
            # email = result.select_one('.gs_ai_eml').text
            # try:
            #     interests = result.select_one('.gs_ai_one_int').text
            # except:
            #     interests = None
            # "Cited by 107390" = getting text string -> splitting by a space -> ['Cited', 'by', '21180'] and taking [2] index which is the number.
            # cited_by = result.select_one('.gs_ai_cby').text.split(' ')[2]

            # because we have a csv.DictWriter() we are converting to the required format
            # dict() keys should be exactly the same as fieldnames, otherwise it will throw an error
            author_data.append({
                'Author': name,
                'URL': f'https://scholar.google.com{link}',
            })

        # iterating over the author data list() of dict()s and writing it to the .csv
        for data in author_data:
            writer.writerow(data)

            # print(f'{name}\nhttps://scholar.google.com{link}\n{author_id}\n{affiliations}\n{email}\n{interests}\n{cited_by}\n')


get_profiles_to_csv()

# output from created csv:
'''
Author,URL
Johnson Thomas,https://scholar.google.com/citations?hl=pl&user=eKLr0EgAAAAJ
Martin Abadi,https://scholar.google.com/citations?hl=pl&user=vWTI60AAAAAJ
Adrian Perrig,https://scholar.google.com/citations?hl=pl&user=n-Oret4AAAAJ
Vern Paxson,https://scholar.google.com/citations?hl=pl&user=HvwPRJ0AAAAJ
Frans Kaashoek,https://scholar.google.com/citations?hl=pl&user=YCoLskoAAAAJ
Mihir Bellare,https://scholar.google.com/citations?hl=pl&user=2pW1g5IAAAAJ
Matei Zaharia,https://scholar.google.com/citations?hl=pl&user=I1EvjZsAAAAJ
John A. Clark,https://scholar.google.com/citations?hl=pl&user=xu3n6owAAAAJ
Helen J. Wang,https://scholar.google.com/citations?hl=pl&user=qhu-DxwAAAAJ
Zhu Han,https://scholar.google.com/citations?hl=pl&user=ty7wIXoAAAAJ
'''
Alternatively, you can do the same thing using Google Scholar Profiles API from SerpApi. It's a paid API with a free plan.
Code to integrate:
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import csv, os

def get_profiles_to_csv():
    with open('awesome_serpapi_file_pagination.csv', mode='w') as csv_file:
        fieldnames = ['Author', 'URL']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()

        params = {
            "api_key": os.getenv("API_KEY"),
            "engine": "google_scholar_profiles",
            "mauthors": "label:security"
        }
        search = GoogleSearch(params)

        while True:
            results = search.get_dict()

            try:
                for result in results['profiles']:
                    name = result['name']
                    link = result['link']
                    writer.writerow({'Author': name, 'URL': link})
            except:
                print('Done')
                break

            # stop when there is no next page to paginate to
            if 'pagination' not in results or 'next' not in results['pagination']:
                break

            search.params_dict.update(dict(parse_qsl(urlsplit(results["pagination"]["next"]).query)))


get_profiles_to_csv()
# part of the output from created csv:
'''
Author,URL
Johnson Thomas,https://scholar.google.com/citations?hl=en&user=eKLr0EgAAAAJ
Martin Abadi,https://scholar.google.com/citations?hl=en&user=vWTI60AAAAAJ
Adrian Perrig,https://scholar.google.com/citations?hl=en&user=n-Oret4AAAAJ
Vern Paxson,https://scholar.google.com/citations?hl=en&user=HvwPRJ0AAAAJ
Frans Kaashoek,https://scholar.google.com/citations?hl=en&user=YCoLskoAAAAJ
Mihir Bellare,https://scholar.google.com/citations?hl=en&user=2pW1g5IAAAAJ
Matei Zaharia,https://scholar.google.com/citations?hl=en&user=I1EvjZsAAAAJ
John A. Clark,https://scholar.google.com/citations?hl=en&user=xu3n6owAAAAJ
Helen J. Wang,https://scholar.google.com/citations?hl=en&user=qhu-DxwAAAAJ
Zhu Han,https://scholar.google.com/citations?hl=en&user=ty7wIXoAAAAJ
'''
Disclaimer, I work for SerpApi.
This is for Python 3.5.x
What I'm looking for is to find the header; the relevant piece of the HTML code is
<h3 class="title-link__title"><span class="title-link__text">News Here</span>
import urllib.request

with urllib.request.urlopen('http://www.bbc.co.uk/news') as r:
    HTML = r.read()

HTML = list(HTML)
for i in range(len(HTML)):
    HTML[i] = chr(HTML[i])
How can I extract just the header, as that's all I need? I'll try to provide more detail in any way I can.
Fetching information from webpages is called web scraping.
One of the best tools to do this job is the BeautifulSoup library.
from bs4 import BeautifulSoup
import urllib.request

# opening page
r = urllib.request.urlopen('http://www.bbc.co.uk/news').read()

# creating soup
soup = BeautifulSoup(r, 'html.parser')

# useful for understanding the layout of your page info
# print(soup.prettify())

# creating a ResultSet with all h3 tags that contain a class named 'title-link__title'
a = soup.findAll("h3", {"class": "title-link__title"})

# counting occurrences
len(a)
# result = 44

# get text of first header
a[0].text
# result = '\nMay v Leadsom to be next UK PM\n'

# get text of second header
a[1].text
# result = '\nVideo shows US police shooting aftermath\n'
I am trying to scrape data from the PGA.com website to get a table of all of the golf courses in the United States. In my CSV table I want to include the name of the golf course, address, ownership, website, and phone number. With this data I would like to geocode it, place it on a map, and have a local copy on my computer.
I used Python and Beautiful Soup 4 to extract my data. I have got as far as extracting the data and importing it into a CSV, but I am now having a problem scraping data from multiple pages on the PGA website. I want to extract ALL THE GOLF COURSES, but my script is limited to one page; I want to loop it so that it captures all the golf course data from all pages found on the PGA site. There are about 18000 golf courses and 900 pages of data to capture.
Attached below is my script. I need help creating code that will capture ALL the data from the PGA website, not just one page but all of them. That way it will provide me with all the data for golf courses in the United States.
Here is my script:
import csv
import requests
from bs4 import BeautifulSoup
url = "http://www.pga.com/golf-courses/search?searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0"
r = requests.get(url)
soup = BeautifulSoup(r.content)
g_data1=soup.find_all("div",{"class":"views-field-nothing-1"})
g_data2=soup.find_all("div",{"class":"views-field-nothing"})
courses_list=[]
for item in g_data2:
    try:
        name = item.contents[1].find_all("div", {"class": "views-field-title"})[0].text
    except:
        name = ''
    try:
        address1 = item.contents[1].find_all("div", {"class": "views-field-address"})[0].text
    except:
        address1 = ''
    try:
        address2 = item.contents[1].find_all("div", {"class": "views-field-city-state-zip"})[0].text
    except:
        address2 = ''
    try:
        website = item.contents[1].find_all("div", {"class": "views-field-website"})[0].text
    except:
        website = ''
    try:
        Phonenumber = item.contents[1].find_all("div", {"class": "views-field-work-phone"})[0].text
    except:
        Phonenumber = ''

    course = [name, address1, address2, website, Phonenumber]
    courses_list.append(course)

with open('filename5.csv', 'wb') as file:
    writer = csv.writer(file)
    for row in courses_list:
        writer.writerow(row)

#for item in g_data1:
    #try:
        #print item.contents[1].find_all("div",{"class":"views-field-counter"})[0].text
    #except:
        #pass
    #try:
        #print item.contents[1].find_all("div",{"class":"views-field-course-type"})[0].text
    #except:
        #pass

#for item in g_data2:
    #try:
        #print item.contents[1].find_all("div",{"class":"views-field-title"})[0].text
    #except:
        #pass
    #try:
        #print item.contents[1].find_all("div",{"class":"views-field-address"})[0].text
    #except:
        #pass
    #try:
        #print item.contents[1].find_all("div",{"class":"views-field-city-state-zip"})[0].text
    #except:
        #pass
This script only captures 20 at a time, and I want to capture everything in one script, which amounts to 18000 golf courses and 900 pages to scrape from.
The PGA website's search has multiple pages, and the url follows the pattern:
http://www.pga.com/golf-courses/search?page=1 # Additional info after page parameter here
This means you can read the content of the page, then increase the value of page by 1, read the next page, and so on.
import csv
import requests
from bs4 import BeautifulSoup
for i in range(907):  # Number of pages plus one
    url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.content)

    # Your code for each individual page here
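As a condensed sketch of how that placeholder could look when combined with the extraction from your script (it drops the .contents[1] hop and wraps the repeated try/except in a small helper, both assumptions that should behave the same on this markup), collecting every page into one CSV:
import csv
import requests
from bs4 import BeautifulSoup

def grab(item, cls):
    # small helper (not in the original script) for the repeated try/except pattern
    try:
        return item.find_all("div", {"class": cls})[0].text.strip()
    except Exception:
        return ''

courses_list = []
for i in range(907):  # number of result pages at the time of the question
    url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for item in soup.find_all("div", {"class": "views-field-nothing"}):
        courses_list.append([grab(item, "views-field-title"),
                             grab(item, "views-field-address"),
                             grab(item, "views-field-city-state-zip"),
                             grab(item, "views-field-website"),
                             grab(item, "views-field-work-phone")])

with open('filename5.csv', 'w', newline='') as f:
    csv.writer(f).writerows(courses_list)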
If you are still reading this post, you can try this code too:
from urllib.request import urlopen
from bs4 import BeautifulSoup

file = "Details.csv"
f = open(file, "w")
Headers = "Name,Address,City,Phone,Website\n"
f.write(Headers)

for page in range(1, 5):
    url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(page)
    html = urlopen(url)
    soup = BeautifulSoup(html, "html.parser")
    Title = soup.find_all("div", {"class": "views-field-nothing"})
    for i in Title:
        try:
            name = i.find("div", {"class": "views-field-title"}).get_text()
            address = i.find("div", {"class": "views-field-address"}).get_text()
            city = i.find("div", {"class": "views-field-city-state-zip"}).get_text()
            phone = i.find("div", {"class": "views-field-work-phone"}).get_text()
            website = i.find("div", {"class": "views-field-website"}).get_text()
            print(name, address, city, phone, website)
            f.write("{}".format(name).replace(",", "|") + ",{}".format(address) + ",{}".format(city).replace(",", " ") + ",{}".format(phone) + ",{}".format(website) + "\n")
        except AttributeError:
            pass

f.close()
Where it says range(1,5), just change that to run from 0 to the last page, and you will get all the details in the CSV. I tried very hard to get your data in a proper format, but it's hard :).
You're passing a link to a single page; it's not going to iterate through each one on its own.
Page 1:
url = "http://www.pga.com/golf-courses/search?searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0"
Page 2:
http://www.pga.com/golf-courses/search?page=1&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0
Page 907:
http://www.pga.com/golf-courses/search?page=906&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0
Since you're only requesting page 1, you'll only get 20 results. You'll need to create a loop that runs through each page.
You can start off by creating a function that does one page, then iterate that function.
Right after the search? in the url, the page parameter appears starting with the second page as page=1, and it keeps increasing until page 907, where it's page=906.
I noticed that the first solution repeated the first batch of results; that is because page 0 and page 1 are the same page. This is resolved by specifying the start page in the range function. Example below...
for i in range(1, 907):  # Number of pages plus one
    url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html5lib")  # Can use whichever parser you prefer

    # Your code for each individual page here
I had this same exact problem, and the solutions above did not work. I solved mine by accounting for cookies. A requests session helps: create a session, and it will pull all the pages you need by sending the cookie to all the numbered pages.
import csv
import requests
from bs4 import BeautifulSoup
url = "http://www.pga.com/golf-courses/search?searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0"
s = requests.Session()
r = s.get(url)
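From there, the numbered pages can be fetched with the same session; a short sketch (the per-page parsing is whatever you already have, here it just collects the result containers):
from bs4 import BeautifulSoup

all_items = []
for page in range(907):  # 907 pages at the time of the question
    page_url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(page)
    soup = BeautifulSoup(s.get(page_url).content, "html.parser")  # reuse the session s created above
    all_items.extend(soup.find_all("div", {"class": "views-field-nothing"}))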
The PGA website has changed since this question was asked.
It seems they organize all courses by: State > City > Course
In light of this change and the popularity of this question, here's how I'd solve this problem today.
Step 1 - Import everything we'll need:
import time
import random
from gazpacho import Soup # https://github.com/maxhumber/gazpacho
from tqdm import tqdm # to keep track of progress
Step 2 - Scrape all the state URL endpoints:
URL = "https://www.pga.com"
def get_state_urls():
    soup = Soup.get(URL + "/play")
    a_tags = soup.find("ul", {"data-cy": "states"}, mode="first").find("a")
    state_urls = [URL + a.attrs['href'] for a in a_tags]
    return state_urls

state_urls = get_state_urls()
Step 3 - Write a function to scrape all the city links:
def get_state_cities(state_url):
    soup = Soup.get(state_url)
    a_tags = soup.find("ul", {"data-cy": "city-list"}).find("a")
    state_cities = [URL + a.attrs['href'] for a in a_tags]
    return state_cities

state_url = state_urls[0]
city_links = get_state_cities(state_url)
Step 4 - Write a function to scrape all of the courses:
def get_courses(city_link):
    soup = Soup.get(city_link)
    courses = soup.find("div", {"class": "MuiGrid-root MuiGrid-item MuiGrid-grid-xs-12 MuiGrid-grid-md-6"}, mode="all")
    return courses

city_link = city_links[0]
courses = get_courses(city_link)
Step 5 - Write a function to parse all the useful info about a course:
def parse_course(course):
    return {
        "name": course.find("h5", mode="first").text,
        "address": course.find("div", {'class': "jss332"}, mode="first").strip(),
        "url": course.find("a", mode="first").attrs["href"]
    }

course = courses[0]
parse_course(course)
Step 6 - Loop through everything and save:
all_courses = []
for state_url in tqdm(state_urls):
    city_links = get_state_cities(state_url)
    time.sleep(random.uniform(1, 10) / 10)
    for city_link in city_links:
        courses = get_courses(city_link)
        time.sleep(random.uniform(1, 10) / 10)
        for course in courses:
            info = parse_course(course)
            all_courses.append(info)
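Since the question ultimately wants a local CSV copy, a minimal way to save all_courses at the end (the file name is arbitrary; the column names follow parse_course above):
import csv

# write the collected course dicts to a CSV file
with open("pga_courses.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["name", "address", "url"])
    writer.writeheader()
    writer.writerows(all_courses)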