I have created a website scraper which will scrape all info from yellow pages (for educational purposes)
def actual_yellow_pages_scrape(link, no, dir, gui, sel, ypfind, terminal, user, password, port, type):
    """Scrape one Yellow Pages search-results page into an .xls workbook.

    For every business on the page, fetch its detail page and record name,
    listing URL, website, email, phone and review count, then save the
    workbook as <dir>\\<ypfind><no>.xls and recurse into the "next page"
    link (if any) with an incremented file number.

    NOTE(review): parameter names `dir` and `type` shadow builtins; kept
    unchanged for interface compatibility with existing callers.
    Relies on module-level `requests`, `BeautifulSoup`, `xlwt`, `re`,
    and `REQUEST_HEADERS`.
    """
    print(link, no, dir, gui, sel, ypfind, terminal, user, password, port, type)
    base = "http://www.yellowpages.com"
    r = requests.get(link, headers=REQUEST_HEADERS)
    soup = BeautifulSoup(r.content, "html.parser")
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet(str(ypfind))
    count = 0  # row index in the sheet
    for i in soup.find_all(class_="business-name"):
        sheet.write(count, 0, str(i.text))
        sheet.write(count, 1, str(base + i.get("href")))
        r1 = requests.get(base + i.get("href"))
        soup1 = BeautifulSoup(r1.content, "html.parser")
        # Each field below may be absent; find() then returns None and the
        # attribute access raises AttributeError (the original used bare
        # `except:`, which also hid real bugs).
        website = soup1.find("a", class_="custom-link")
        try:
            print("Acquiring Website")
            sheet.write(count, 2, str(website.get("href")))
        except AttributeError:
            sheet.write(count, 2, "None")
        email = soup1.find("a", class_="email-business")
        try:
            print(email.get("href"))
            EMAIL = re.sub("mailto:", "", str(email.get("href")))
            sheet.write(count, 3, str(EMAIL))
        except AttributeError:
            sheet.write(count, 3, "None")
        phonetemp = soup1.find("div", class_="contact")
        try:
            phone = phonetemp.find("p")
            print(phone.text)
            sheet.write(count, 4, str(phone.text))
        except AttributeError:
            sheet.write(count, 4, "None")
        reviews = soup1.find(class_="count")
        try:
            print(reviews.text)
            sheet.write(count, 5, str(reviews.text))
        except AttributeError:
            sheet.write(count, 5, "None")
        count += 1
    # Save after every page; the recursive call below bumps the file number.
    save = dir + "\\" + ypfind + str(no) + ".xls"
    workbook.save(save)
    no += 1
    for i in soup.find_all("a", class_="next ajax-page"):
        print(i.get("href"))
        actual_yellow_pages_scrape(base + str(i.get("href")), no, dir, gui, sel,
                                   ypfind, terminal, user, password, port, type)
The code above is my portion of the scraper. I have set breakpoints at `soup` and inside the for loop, but not even a single line of the for loop gets executed, and no errors are thrown. I tried the same structure printing numbers from 1 to 10 and it works — but this does not. Why?
Thank you
The answer has been found.
I used a text visualizer to inspect what was in "r.content". I soupified it to get clean HTML, went through the HTML file, and finally found that the browser was reported as unsupported — so I removed the request headers, ran the code again, and finally got what I wanted.
Related
I'm trying to scrape yellow pages, my code is stuck in taking the first business of each page but skips every other business on the page. Ex. 1st company of page 1, 1st company of page2 etc.
I have no clue why it isn't first iterating through the `web_page` variable, then checking for additional pages, and finally reaching the closing condition and executing `break`.
If anyone can provide me with clues or help it would be highly appreciated!
web_page_results = []


def yellow_pages_scraper(search_term, location):
    """Scrape every Yellow Pages result page for `search_term` in `location`.

    Appends one dict per business (name, street_address, locality, phone,
    website) to the module-level `web_page_results` list and returns it.
    Paginates until no "next page" link is found. Relies on module-level
    `requests`, `headers`, and the BeautifulSoup alias `bs`.
    """
    page = 1
    while True:
        url = f'https://www.yellowpages.com/search?search_terms={search_term}&geo_location_terms={location}&page={page}'
        r = requests.get(url, headers=headers)
        soup = bs(r.content, 'html.parser')
        # FIX: iterate the individual listing cards ('srp-listing'), not the
        # single 'search-results organic' container. The container matches
        # only once per page, so business.find(...) only ever saw the first
        # listing on each page.
        web_page = soup.find_all('div', {'class': 'srp-listing'})
        for business in web_page:
            business_dict = {}
            try:
                business_dict['name'] = business.find('a', {'class': 'business-name'}).text
                print(f'{business_dict["name"]}')
            except AttributeError:
                business_dict['name'] = ''
            try:
                business_dict['street_address'] = business.find('div', {'class': 'street-address'}).text
            except AttributeError:
                business_dict['street_address'] = ''
            try:
                business_dict['locality'] = business.find('div', {'class': 'locality'}).text
            except AttributeError:
                business_dict['locality'] = ''
            try:
                business_dict['phone'] = business.find('div', {'class': 'phones phone primary'}).text
            except AttributeError:
                business_dict['phone'] = ''
            try:
                business_dict['website'] = business.find('a', {'class': 'track-visit-website'})['href']
            except (AttributeError, TypeError):
                # FIX: find() returning None makes None['href'] raise
                # TypeError, which the original AttributeError clause missed.
                business_dict['website'] = ''
            # list.append cannot realistically fail; the old try/except
            # around it ("saving not working") could never trigger.
            web_page_results.append(business_dict)
            print(web_page_results)
        # If this page has no "next page" button, stop and return the list.
        if not soup.find('a', {'class': 'next ajax-page'}):
            break
        page += 1
    return web_page_results
It's worth looking at this line;
web_page = soup.find_all('div', {'class':'search-results organic'})
When I go to the request url I can only find one instance of search-results organic on the page. You then go and iterate over the list (web_page), but there will only be 1 value in the list. So when you do the for loop;
for business in web_page:
you will always only do it once, due to the single item in the list and therefore only get the first result on the page.
You need to loop through the list of businesses on the page not the container holding the business listings. I recommend creating a list from class='srp-listing':
web_page = soup.find_all('div', {'class':'srp-listing'})
This should give you a list of all the businesses on the page. When you iterate over the new list of businesses you will go through more than just the one listing.
I have gotten this code off of a forum and I have installed all the dependencies but I get a few errors.
I dont know python very well so I decided to look it up, I was not able to fix it that way.
#all cats are yellow
from selenium import webdriver
from bs4 import BeautifulSoup
import time
#Quora Login Information
email=""
passy=""
# File With Questions Here
filey = "fileys.txt"
#Read File ,strip new lines ,return question list
def readFile(filey):
    """Read the question file `filey`, strip whitespace/newlines from each
    line, and return the resulting list of questions."""
    with open(filey, "r") as f:
        q = f.readlines()
    qlist = [x.strip() for x in q]
    # qlist = reversed(qlist)  # will reverse the question list if needed
    print(len(qlist), "Total Questions Loaded")  # was a Python 2 print statement
    return qlist
#Login to Quora
def login(email, passy):
    """Log in to Quora through the global selenium `driver`.

    Quora's login form uses polymorphic element ids, so we soup the page,
    recover the ids for the email box, password box and Login button, then
    type the credentials and click.
    """
    print("Logging in...")  # was a Python 2 print statement
    driver.get("http://quora.com")
    # Create a soup object and find all form_column classes.
    forms = BeautifulSoup(driver.page_source, "lxml").find_all(class_="form_column")
    # Iterate through forms; find the polymorphic id string and prepend a
    # hashtag (#) to build a css selector.
    for form in forms:
        try:
            # Email / password entry boxes.
            data = form.find("input")["name"]
            if data == "email":
                email_css = "#" + form.find("input")["id"]
            if data == "password":
                password_css = "#" + form.find("input")["id"]
        except (TypeError, KeyError):
            # find() returned None, or the input has no name/id attribute.
            pass
        try:
            # The Login button.
            data = form.find("input")["value"]
            if data == "Login":
                button_css = "#" + form.find("input")["id"]
        except (TypeError, KeyError):
            pass
    # NOTE(review): if the form markup changed and a selector was never
    # found, the *_css names are unbound here and NameError is raised —
    # consider validating before use.
    driver.find_element_by_css_selector(email_css).send_keys(email)
    driver.find_element_by_css_selector(password_css).send_keys(passy)
    time.sleep(2)
    driver.find_element_by_css_selector(button_css).click()
    time.sleep(2)
    # LOGIN FINISHED
# Create the question list.
qlist = readFile(filey)
# Create the webdriver.
driver = webdriver.Chrome()
# Total questions posted counter.
county = 0
# Iterate through qlist, asking questions until there are no more.
for question in qlist:
    try:
        print(question)  # was a Python 2 print statement
        driver.get("http://quora.com")
        soup = BeautifulSoup(driver.page_source, "lxml")
        # Find all text areas.
        blox = soup.find_all("textarea")
        # Find the polymorphic id string for the Ask Question entry field.
        for x in blox:
            try:
                placeholder = x["placeholder"]
                if "Ask or Search Quora" in placeholder:  # fix this later
                    askbar_css = "#" + x["id"]
                    print(askbar_css)
            except KeyError:
                # Text area without a placeholder attribute.
                pass
        askbutton = "#" + soup.find(class_="AskQuestionButton")["id"]  # fix this later
        # Type out the question.
        driver.find_element_by_css_selector(askbar_css).send_keys(question)
        # Wait for the ask button to become clickable.
        time.sleep(.2)  # fix later
        try:
            driver.find_element_by_css_selector(askbutton).click()
        except Exception:
            # Click failed; fix later.
            pass
        # Find the popup.
        # NOTE(review): soup.find() returns None rather than raising when the
        # class is absent, so this loop always breaks on its first pass —
        # confirm whether a real polling wait was intended here.
        while True:
            try:
                soup = BeautifulSoup(driver.page_source, "lxml")
                popExists = soup.find(class_="Modal AskQuestionModal")
                break
            except Exception:
                pass
        soup = BeautifulSoup(driver.page_source, "lxml")
        popup = "#" + soup.find(class_="submit_button modal_action")["id"]
        driver.find_element_by_css_selector(popup).click()
        # Poll up to ~1.7 s for the "You asked" confirmation popup.
        for x in range(0, 17):
            time.sleep(.1)
            try:
                soup = BeautifulSoup(driver.page_source, "lxml")
                popExists = soup.find(class_="PMsgContainer")  # found popup
                if "You asked" in str(popExists):  # big no no
                    county += 1
                    break
            except Exception:
                pass
        print("county=>", county)
    except Exception as e:  # was Python 2 `except Exception,e`
        print(e)
        print("ERROR")
So the code opens chrome and it loads quora, however it gets stuck on logging in and the script ends. I am also on a mac. I get the following error:
'NoneType' object has no attribute '__getitem__'
ERROR
I am new to web scraping so please forgive my ignorance.
I built a program to scrape Zillow, and everything has worked fine for the most part. My problem is I am using a proxy service called proxycrawl that easily allows me to integrate proxies into my program. This is done by placing https://api.proxycrawl.com/?token=xxx&url= before my actual URL. What I have noticed is that when the program clicks on an "a" tag, the URL changes to the example below:
Before:
Before Click
After:
After Click
Every 11 clicks through the program (or manually) results in the site changing to the proxycrawl site, where I get the 404 error. Any ideas?
# Browser open
print(".....Opening Browser.....")
Browser = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')
Browser.maximize_window()
# Browser page: proxy the Zillow search URL through proxycrawl. The URL must
# be percent-encoded exactly once — double-encoding makes proxycrawl return
# pc_status 602.
url = urllib.parse.quote_plus('https://www.zillow.com/homes/for_sale/Bakersfield-CA-93312/house,mobile,land,townhouse_type/97227_rid/35.4606,-119.037467,35.317856,-119.200888_rect/12_zm/0_mmm/')
Browser.get('https://api.proxycrawl.com/?token=xxx&url=' + url)
print("Opening Zillow")
time.sleep(10)
# FIX: XPath attribute tests use @class, not #class (the original text was
# garbled). The second-to-last pagination item holds the last page number.
last_page = int(Browser.find_element_by_xpath("""//ol[@class="zsg-pagination"]//li[last()-1]""").text)
# print(last_page)
page = 0
count = 0
csv_file = open('listings.csv', 'w')
fieldnames = ['address', 'price', 'zestimate', 'beds', 'baths', 'feet', 'desc', 'Type', 'year_built', 'heating', 'cooling', 'parking', 'lot',
              'days_on_market', 'pricepsqr', 'saves', 'interior', 'spaces_amenities', 'construction', 'exterior', 'parking1', 'mls', 'other']
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()
for i in range(last_page):
page = page + 1
n = 0
listings = Browser.find_elements_by_xpath("""//*[#id="search-results"]/ul/li""")
for i in range(len(listings)):
n = i + 1
listing_dict = {}
print("Scraping the listing number {0} on page {1}, the count is {2}".format(n, page, count))
if (count) % 11 == 0:
listings = Browser.find_elements_by_xpath('//*[#id="search-results"]/ul/li')
time.sleep(2)
try:
# Finds Listings
listings = Browser.find_elements_by_xpath("""//*[#id="search-results"]/ul/li""")
print("Looking Up listings")
# Opens Listing
listings[i].find_elements_by_tag_name('a')[0].click()
print("Opening Listing")
time.sleep(2)
# Opens "See More Tab"
Browser.find_element_by_partial_link_text('See More').click()
# Prepare for Scrape
time.sleep(2)
I did speak with proxycrawl, and they stated that the URL had to be encoded, which I did do with no luck. After encoding, I replied and got the following statement:
"You are sending your requests double encoded and your get a response of pc_status: 602. Those requests are failing and you should fix them. Please only encode the URLs once, encoding the URLs more than once will result in a failing request."
It looks like the page is trying to redirect you relatively.
In this specific use case, you could hack your way around the encoding issue by doing something similar to the following
# Example of a proxied URL after the site issued a relative redirect:
# https://api.proxycrawl.com/homes/for_sale/Test/one,two
x = driver.current_url
# Strip the proxy origin — the first 26 characters, i.e.
# len("https://api.proxycrawl.com") — leaving only the relative path:
# /homes/for_sale/Test/one,two
r = x[26:]
# base url = https://api.proxycrawl.com/?token=xxx&url=
# Re-wrap the relative path with the proxy prefix and navigate there again.
u = base_url + r
driver.get(u)
My goal is to scrape data from the PGA website to extract all the golf course locations in the USA. I aim to scrape from the 907 pages the name, address, ownership, phone number, and website.
I have created the script below but when the CSV is created it produces errors. The CSV file created from the script has data repetitions of the first few pages and the pages of the website. It does not give the whole data of the 907 pages.
How can I fix my script so that it will scrape all 907 pages and produce a CSV with all the golf courses listed on the PGA website?
Below is my script:
import csv
import requests
from bs4 import BeautifulSoup

# Open the output file exactly once, before the page loop: re-opening it in
# append mode inside the loop is what duplicated the first pages' rows.
# newline='' stops csv from emitting blank rows on Windows.
with open('PGA_Data.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    for i in range(907):  # number of pages plus one
        url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
        r = requests.get(url)
        soup = BeautifulSoup(r.content, "html.parser")  # explicit parser avoids a bs4 warning
        g_data2 = soup.find_all("div", {"class": "views-field-nothing"})
        courses_list = []
        for item in g_data2:
            # Each field may be missing for a given course; fall back to ''.
            try:
                name = item.contents[1].find_all("div", {"class": "views-field-title"})[0].text
            except (AttributeError, IndexError):
                name = ''
            try:
                address1 = item.contents[1].find_all("div", {"class": "views-field-address"})[0].text
            except (AttributeError, IndexError):
                address1 = ''
            try:
                address2 = item.contents[1].find_all("div", {"class": "views-field-city-state-zip"})[0].text
            except (AttributeError, IndexError):
                address2 = ''
            try:
                website = item.contents[1].find_all("div", {"class": "views-field-website"})[0].text
            except (AttributeError, IndexError):
                website = ''
            try:
                Phonenumber = item.contents[1].find_all("div", {"class": "views-field-work-phone"})[0].text
            except (AttributeError, IndexError):
                Phonenumber = ''
            course = [name, address1, address2, website, Phonenumber]
            courses_list.append(course)
        # Write this page's rows immediately, exactly once.
        for row in courses_list:
            writer.writerow(row)
Here is the code you want. It first parses the current page before going on to the next one. (There are some blank rows; I hope you can fix that yourself.)
import csv
import requests
from bs4 import BeautifulSoup
def encode(l):
    """Return str() of each item in `l` with non-ASCII chars replaced by a space.

    Fixed for Python 3: the original called str(i).encode('utf-8') and then
    ord() on each element, but iterating `bytes` yields ints in Python 3, so
    ord() failed. Operating on the text itself also replaces a multi-byte
    character with a single space instead of one space per byte.
    """
    out = []
    for item in l:
        text = str(item)
        # Replace non-ASCII characters with a single space — approach taken
        # from Martijn Pieters' answer:
        # http://stackoverflow.com/questions/20078816/replace-non-ascii-characters-with-a-single-space/20078869#20078869
        out.append(''.join(ch if ord(ch) < 128 else ' ' for ch in text))
    return out
courses_list = []
for i in range(5):  # number of pages plus one
    url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")  # explicit parser avoids a bs4 warning
    g_data2 = soup.find_all("div", {"class": "views-field-nothing"})
    for item in g_data2:
        # Each field may be missing for a given course; fall back to ''.
        try:
            name = item.contents[1].find_all("div", {"class": "views-field-title"})[0].text
        except (AttributeError, IndexError):
            name = ''
        try:
            address1 = item.contents[1].find_all("div", {"class": "views-field-address"})[0].text
        except (AttributeError, IndexError):
            address1 = ''
        try:
            address2 = item.contents[1].find_all("div", {"class": "views-field-city-state-zip"})[0].text
        except (AttributeError, IndexError):
            address2 = ''
        try:
            website = item.contents[1].find_all("div", {"class": "views-field-website"})[0].text
        except (AttributeError, IndexError):
            website = ''
        try:
            Phonenumber = item.contents[1].find_all("div", {"class": "views-field-work-phone"})[0].text
        except (AttributeError, IndexError):
            Phonenumber = ''
        course = [name, address1, address2, website, Phonenumber]
        courses_list.append(encode(course))
# Write everything once at the end. newline='' prevents the blank rows that
# csv emits on Windows when the file is opened in plain text mode.
with open('PGA_Data.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    for row in courses_list:
        writer.writerow(row)
EDIT: After the inevitable problems of Unicode encoding/decoding, I have modified the answer and it will (hopefully) work now. But see this: http://nedbatchelder.com/text/unipain.html
I am trying to scrape data from the PGA.com website to get a table of all of the golf courses in the United States. In my CSV table I want to include the Name of the golf course ,Address ,Ownership ,Website , Phone number. With this data I would like to geocode it and place into a map and have a local copy on my computer
I utilized Python and Beautiful Soup 4 to extract my data. I have gotten as far as extracting the data and importing it into a CSV, but I am now having a problem scraping data from multiple pages on the PGA website. I want to extract ALL THE GOLF COURSES, but my script is limited to one page. I want to loop it so that it captures all data for golf courses from all pages found on the PGA site. There are about 18000 golf courses and 900 pages of data to capture.
Attached below is my script. I need help creating code that will capture ALL the data from the PGA website — not just one page, but all of them. That way it will provide me with all the data on golf courses in the United States.
Here is my script below:
import csv
import requests
from bs4 import BeautifulSoup

# Scrape ONE results page of the PGA course search into filename5.csv.
# (Pagination is handled by varying the `page` query parameter; see below.)
url = "http://www.pga.com/golf-courses/search?searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0"
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")  # explicit parser avoids a bs4 warning
g_data1 = soup.find_all("div", {"class": "views-field-nothing-1"})  # unused below; kept for the commented-out experiments
g_data2 = soup.find_all("div", {"class": "views-field-nothing"})
courses_list = []
for item in g_data2:
    # Each field may be missing for a given course; fall back to ''.
    try:
        name = item.contents[1].find_all("div", {"class": "views-field-title"})[0].text
    except (AttributeError, IndexError):
        name = ''
    try:
        address1 = item.contents[1].find_all("div", {"class": "views-field-address"})[0].text
    except (AttributeError, IndexError):
        address1 = ''
    try:
        address2 = item.contents[1].find_all("div", {"class": "views-field-city-state-zip"})[0].text
    except (AttributeError, IndexError):
        address2 = ''
    try:
        website = item.contents[1].find_all("div", {"class": "views-field-website"})[0].text
    except (AttributeError, IndexError):
        website = ''
    try:
        Phonenumber = item.contents[1].find_all("div", {"class": "views-field-work-phone"})[0].text
    except (AttributeError, IndexError):
        Phonenumber = ''
    course = [name, address1, address2, website, Phonenumber]
    courses_list.append(course)
# FIX: 'wb' was Python 2; in Python 3 csv needs text mode with newline=''.
with open('filename5.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    for row in courses_list:
        writer.writerow(row)
#for item in g_data1:
#try:
#print item.contents[1].find_all("div",{"class":"views-field-counter"})[0].text
#except:
#pass
#try:
#print item.contents[1].find_all("div",{"class":"views-field-course-type"})[0].text
#except:
#pass
#for item in g_data2:
#try:
#print item.contents[1].find_all("div",{"class":"views-field-title"})[0].text
#except:
#pass
#try:
#print item.contents[1].find_all("div",{"class":"views-field-address"})[0].text
#except:
#pass
#try:
#print item.contents[1].find_all("div",{"class":"views-field-city-state-zip"})[0].text
#except:
#pass
This script only captures 20 at a time and I want to capture all in one script which account for 18000 golf courses and 900 pages to scrape form.
The PGA website's search have multiple pages, the url follows the pattern:
http://www.pga.com/golf-courses/search?page=1 # Additional info after page parameter here
This means you can read the content of the page, then increase the value of page by 1, read the next page, and so on.
import csv
import requests
from bs4 import BeautifulSoup
for i in range(907):  # Number of pages plus one
    # Each results page is addressed purely by the `page` query parameter.
    url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    # Your code for each individual page here
if you still read this post , you can try this code too....
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Scrape name/address/city/phone/website for each course into Details.csv.
file = "Details.csv"
Headers = "Name,Address,City,Phone,Website\n"
# `with` guarantees the file is closed even if the scraping raises.
with open(file, "w") as f:
    f.write(Headers)
    for page in range(1, 5):
        url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(page)
        html = urlopen(url)
        soup = BeautifulSoup(html, "html.parser")
        Title = soup.find_all("div", {"class": "views-field-nothing"})
        for i in Title:
            try:
                name = i.find("div", {"class": "views-field-title"}).get_text()
                address = i.find("div", {"class": "views-field-address"}).get_text()
                city = i.find("div", {"class": "views-field-city-state-zip"}).get_text()
                phone = i.find("div", {"class": "views-field-work-phone"}).get_text()
                website = i.find("div", {"class": "views-field-website"}).get_text()
                print(name, address, city, phone, website)
                f.write("{}".format(name).replace(",", "|") + ",{}".format(address) + ",{}".format(city).replace(",", " ") + ",{}".format(phone) + ",{}".format(website) + "\n")
            # FIX: the original `except: AttributeError` was a bare except
            # whose body merely evaluated the exception class name; catch
            # the missing-field case (find() -> None) properly instead.
            except AttributeError:
                pass
Where it says range(1,5), just change that to run from 0 to the last page, and you will get all the details in the CSV. I tried very hard to get your data into the proper format, but it's hard :).
You're putting a link to a single page, it's not going to iterate through each one on its own.
Page 1:
url = "http://www.pga.com/golf-courses/search?searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0"
Page 2:
http://www.pga.com/golf-courses/search?page=1&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0
Page 907:
http://www.pga.com/golf-courses/search?page=906&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0
Since you're running for page 1 you'll only get 20. You'll need to create a loop that'll run through each page.
You can start off by creating a function that does one page then iterate that function.
Right after the search? in the url, starting at page 2, page=1 begins increasing until page 907 where it's page=906.
I noticed that the first solution had a repetition of the first instance, that is because the 0 page and 1 page is the same page. This is resolved by specifying the start page in the range function. Example below...
for i in range(1, 907):  #Number of pages plus one
    # Start at page=1: page=0 and page=1 return the same first page, which
    # is what duplicated the first batch of results in the earlier solution.
    url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html5lib")  #Can use whichever parser you prefer
    # Your code for each individual page here
Had this same exact problem and the solutions above did not work. I solved mine by accounting for cookies. A requests session helps. Create a session and it'll pull all the pages you need by inserting a cookie to all the numbered pages.
import csv
import requests
from bs4 import BeautifulSoup
url = "http://www.pga.com/golf-courses/search?searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0"
# A Session persists cookies across requests, which the PGA site requires
# before it will serve the numbered result pages.
s = requests.Session()
r = s.get(url)
The PGA website has changed since this question was asked.
It seems they organize all courses by: State > City > Course
In light of this change and the popularity of this question, here's how I'd solve this problem today.
Step 1 - Import everything we'll need:
import time
import random
from gazpacho import Soup # https://github.com/maxhumber/gazpacho
from tqdm import tqdm # to keep track of progress
Step 2 - Scrape all the state URL endpoints:
URL = "https://www.pga.com"

def get_state_urls():
    """Return an absolute URL for every state linked from the /play page."""
    landing = Soup.get(URL + "/play")
    state_list = landing.find("ul", {"data-cy": "states"}, mode="first")
    return [URL + anchor.attrs["href"] for anchor in state_list.find("a")]

state_urls = get_state_urls()
Step 3 - Write a function to scrape all the city links:
def get_state_cities(state_url):
    """Return an absolute URL for every city listed on a state page."""
    state_page = Soup.get(state_url)
    city_list = state_page.find("ul", {"data-cy": "city-list"})
    return [URL + anchor.attrs["href"] for anchor in city_list.find("a")]

state_url = state_urls[0]
city_links = get_state_cities(state_url)
Step 4 - Write a function to scrape all of the courses:
def get_courses(city_link):
    """Return every course card element found on a city page."""
    city_page = Soup.get(city_link)
    card_classes = "MuiGrid-root MuiGrid-item MuiGrid-grid-xs-12 MuiGrid-grid-md-6"
    return city_page.find("div", {"class": card_classes}, mode="all")

city_link = city_links[0]
courses = get_courses(city_link)
Step 5 - Write a function to parse all the useful info about a course:
def parse_course(course):
    """Pull the name, address, and detail-page URL out of one course card."""
    name = course.find("h5", mode="first").text
    address = course.find("div", {'class': "jss332"}, mode="first").strip()
    detail_url = course.find("a", mode="first").attrs["href"]
    return {"name": name, "address": address, "url": detail_url}

course = courses[0]
parse_course(course)
Step 6 - Loop through everything and save:
# Walk every state -> city -> course, sleeping briefly between requests to
# stay polite to the server. tqdm shows progress over the states.
all_courses = []
for state_url in tqdm(state_urls):
    city_links = get_state_cities(state_url)
    time.sleep(random.uniform(1, 10) / 10)
    for city_link in city_links:
        courses = get_courses(city_link)
        time.sleep(random.uniform(1, 10) / 10)
        all_courses.extend(parse_course(course) for course in courses)