URL changes while using proxy and Selenium - python

I am new to web scraping so please forgive my ignorance.
I built a program to scrape Zillow, and everything has worked fine for the most part. My problem is I am using a proxy service called proxycrawl that easily allows me to integrate proxies into my program. This is done by placing https://api.proxycrawl.com/?token=xxx&url= before my actual URL. What I have noticed is that when the program clicks on an "a" tag, the URL changes to the example below:
Before:
Before Click
After:
After Click
Any 11 clicks through the program or manually result in the site changing to the proxycrawl site, where I get the 404 error. Any ideas?
#Browser open
print(".....Opening Browser.....")
Browser = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')
Browser.maximize_window()
#browser page
url = urllib.parse.quote_plus('https://www.zillow.com/homes/for_sale/Bakersfield-CA-93312/house,mobile,land,townhouse_type/97227_rid/35.4606,-119.037467,35.317856,-119.200888_rect/12_zm/0_mmm/')
Browser.get('https://api.proxycrawl.com/?token=xxx&url=' + url)
print("Opening Zillow")
time.sleep(10)
last_page = int(Browser.find_element_by_xpath("""//ol[#class="zsg-pagination"]//li[last()-1]""").text)
#print last_page
page = 0
count = 0
csv_file = open('listings.csv','w')
fieldnames = ['address', 'price', 'zestimate', 'beds', 'baths', 'feet', 'desc', 'Type', 'year_built', 'heating', 'cooling', 'parking', 'lot',
'days_on_market', 'pricepsqr', 'saves', 'interior', 'spaces_amenities', 'construction', 'exterior', 'parking1', 'mls', 'other']
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()
for i in range(last_page):
page = page + 1
n = 0
listings = Browser.find_elements_by_xpath("""//*[#id="search-results"]/ul/li""")
for i in range(len(listings)):
n = i + 1
listing_dict = {}
print("Scraping the listing number {0} on page {1}, the count is {2}".format(n, page, count))
if (count) % 11 == 0:
listings = Browser.find_elements_by_xpath('//*[#id="search-results"]/ul/li')
time.sleep(2)
try:
# Finds Listings
listings = Browser.find_elements_by_xpath("""//*[#id="search-results"]/ul/li""")
print("Looking Up listings")
# Opens Listing
listings[i].find_elements_by_tag_name('a')[0].click()
print("Opening Listing")
time.sleep(2)
# Opens "See More Tab"
Browser.find_element_by_partial_link_text('See More').click()
# Prepare for Scrape
time.sleep(2)
I did speak with proxycrawl, and they stated that the URL had to be encoded, which I did do with no luck. After encoding, I replied and got the following statement:
"You are sending your requests double encoded and your get a response of pc_status: 602. Those requests are failing and you should fix them. Please only encode the URLs once, encoding the URLs more than once will result in a failing request."

It look like the page is trying to redirect you relatively.
In this specific use case, you could hack your way around the encoding issue by doing something similar to the following
# https://api.proxycrawl.com/homes/for_sale/Test/one,two
x = driver.current_url
#/homes/for_sale/Test/one,two
r = x[26:]
# base url = https://api.proxycrawl.com/?token=xxx&url=
u = base_url + r
driver.get(u)

Related

selinium slow preformance on realtime Django project

I'm working on a project which I'm getting posts from couple of websites and show it in the main page of my website but with filter and letting users to search for keywords and see the posts with that keywords.
this is how the code works :
In here with get the customized url of the site with our keyword and city filter
def link_gen(city='', Kword='' ):
# for example.com
urls =[]
if Kword != '':
if city =='':
url = f'https://www.example.com/search/with{Kword}'
url = url.strip()
url = url.replace(" ", "-")
urls.append(url)
else:
url = f'https://www.example.com/search/with{Kword}in{city}'
url = url.strip()
url = url.replace(" ", "-")
urls.append(url)
else:
if city != '':
url = f'https://www.example.com/search/in{city}'
url = url.strip()
url = url.replace(" ", "-")
urls.append(url)
else: urls.append('none')
return urls
this part is where we crawl for the posts of the target website
# function for getting the title, link, icon, desc of all posts
def get_cards(urls):
data = []
# for example.com
if urls[0] != 'none':
# we use webdriver to get site with dynamic component and design
url = urls[0]
options = Options()
options.headless = True
browser = webdriver.Firefox(options=options)
browser.get(url)
print ("Headless Firefox Initialized")
soup = BeautifulSoup(browser.page_source, 'html.parser')
jobs = soup.find_all( 'div', class_="job-list-item", limit=3)
# looping through all the cards
for job in jobs :
# get the title, link, icon, desc
title = job.find('a', class_= "title vertical-top display-inline" ).text
icon = job.find(tage_name_img)['src']
link = job.find('a', class_= "title vertical-top display-inline" )['href']
date = job.find('div', class_= "date" ).text
data.append(dict(
title = title,
icon = f'https://www.example.com/{icon}',
link = f'https://www.example.com/{link}',
date = date,
site = 'example'
))
browser.close()
return data
but the problem is for getting the post and dynamic tags on the websites I needed to use selenium I can't use session.get(url) because it won't return all the tags
and with selenium it takes for ever to return the posts even though I only crawl 3 posts
but I think webdriver uses a lot of resources.
I ran out of ram when I when I tried to run it locally
any suggestion would be so much appreciated

Problem. python scrape with requests + selenium

CODE IS HERE
Hi guys
I have some problem with scraping this dynamic site (https://kvartiry-bolgarii.ru/)
I need to get all the links to the home sale ads
I used selenium to load the page and get links to ads after that I move the page down to load new ads. After the new ads are loaded, I start to parse all the links on the page and write them to the list again.
But the data in the list is not updated and the script continues to work with the links that were on the page before scrolling down.
By the way, I set a check so that the script is executed until the last announcement on the site appears in the list, the link to which I found out in advance
How can this problem be corrected?
def get_link_info():
try:
url = "https://kvartiry-bolgarii.ru/"
driver = webdriver.Chrome(
executable_path=r'C:\Users\kk\Desktop\scrape_house\drivers\chromedriver.exe',
options=options
)
driver.get(url)
req = requests.get(url)
req.encoding = 'utf8'
soup = BeautifulSoup(req.text, "lxml")
articles = soup.find_all("div", class_="content")
links_urls = []
for article in articles:
house_url = article.find("a").get("href")
links_urls.append(house_url)
#print(links_urls)
first_link_number = links_urls[-2].split("-")[-1]
first_link_number = first_link_number[1:]
#print(first_link_number)
last_link_number = links_urls[-1].split("-")[-1]
last_link_number = last_link_number[1:]
#print(last_link_number)
html = driver.find_element_by_tag_name('html')
html.send_keys(Keys.END)
check = "https://kvartiry-bolgarii.ru/kvartira-v-elitnom-komplekse-s-unikalynym-sadom-o21751"
for a in links_urls:
if a != check:
for article in articles:
house_url = article.find("a").get("href")
links_urls.append(house_url)
html = driver.find_element_by_tag_name('html')
html.send_keys(Keys.END)
print(links_urls[-1])
else:
print(links_urls[0], links_urls[-1])
print("all links are ready")
Some pointers. You don't need to mix selenium,requests and BeautifulSoup. Just selenium is enough. When you are scrolling infinitely, you need to remove duplicate elements before adding them to your list.
You can try this. This should work.
from selenium import webdriver
import time
def get_link_info():
all_links = []
try:
driver = webdriver.Chrome(executable_path='C:/chromedriver.exe')
driver.get('https://kvartiry-bolgarii.ru/')
time.sleep(3)
old_links = set() # Empty Set
while True:
# Scroll to get more ads
driver.execute_script("window.scrollBy(0,3825)", "")
# Wait for new ads to load
time.sleep(8)
links_divs = driver.find_elements_by_xpath('//div[#class="content"]//a') # Find Elements
ans = set(links_divs) - set(old_links) # Remove old elements
for link in ans:
# Scroll to the link.
driver.execute_script("arguments[0].scrollIntoView();", link)
fir = link.get_attribute('href')
all_links.append(fir)
# Remove Duplicates
old_links = links_divs
except Exception as e:
raise e
get_link_info()

Google news crawler flip pages

continuing on previous work to crawl all news result about query and to return title and url, I am refining the crawler to get all results from all pages in Google News. Current code seems can only return the 1st page Googel news search result. Would be grateful to know how to get all pages results. Many thanks!
my codes below:
import requests
from bs4 import BeautifulSoup
import time
import datetime
from random import randint
import numpy as np
import pandas as pd
query2Google = input("What do you want from Google News?\n")
def QGN(query2Google):
s = '"'+query2Google+'"' #Keywords for query
s = s.replace(" ","+")
date = str(datetime.datetime.now().date()) #timestamp
filename =query2Google+"_"+date+"_"+'SearchNews.csv' #csv filename
f = open(filename,"wb")
url = "http://www.google.com.sg/search?q="+s+"&tbm=nws&tbs=qdr:y" # URL for query of news results within one year and sort by date
#htmlpage = urllib2.urlopen(url).read()
time.sleep(randint(0, 2))#waiting
htmlpage = requests.get(url)
print("Status code: "+ str(htmlpage.status_code))
soup = BeautifulSoup(htmlpage.text,'lxml')
df = []
for result_table in soup.findAll("div", {"class": "g"}):
a_click = result_table.find("a")
#print ("-----Title----\n" + str(a_click.renderContents()))#Title
#print ("----URL----\n" + str(a_click.get("href"))) #URL
#print ("----Brief----\n" + str(result_table.find("div", {"class": "st"}).renderContents()))#Brief
#print ("Done")
df=np.append(df,[str(a_click.renderContents()).strip("b'"),str(a_click.get("href")).strip('/url?q='),str(result_table.find("div", {"class": "st"}).renderContents()).strip("b'")])
df = np.reshape(df,(-1,3))
df1 = pd.DataFrame(df,columns=['Title','URL','Brief'])
print("Search Crawl Done!")
df1.to_csv(filename, index=False,encoding='utf-8')
f.close()
return
QGN(query2Google)
There used to be an ajax api, but it's no longer avaliable .
Still , you can modify your script with a for loop if you want to get a number of pages , or a while loop if you want to get all pages .
Example :
url = "http://www.google.com.sg/search?q="+s+"&tbm=nws&tbs=qdr:y&start="
pages = 10 # the number of pages you want to crawl #
for next in range(0, pages*10, 10) :
page = url + str(next)
time.sleep(randint(1, 5)) # you may need longer than that #
htmlpage = requests.get(page) # you should add User-Agent and Referer #
print("Status code: " + str(htmlpage.status_code))
if htmlpage.status_code != 200 :
break # something went wrong #
soup = BeautifulSoup(htmlpage.text, 'lxml')
... process response here ...
next_page = soup.find('td', { 'class':'b', 'style':'text-align:left' })
if next_page is None or next_page.a is None :
break # there are no more pages #
Keep in mind that google doesn't like bots , you might get a ban .
You could add 'User-Agent' and 'Referer' in headers to simulate a web browser , and use time.sleep(random.uniform(2, 6)) to simulate a human ... or use selenium.
You can also add &num=25 to the end of your query and you'll get back a webpage with that number of results. In this example youll get back 25 google results back.

why it is skipping the whole for loop?

I have created a website scraper which will scrape all info from yellow pages (for educational purposes)
def actual_yellow_pages_scrape(link,no,dir,gui,sel,ypfind,terminal,user,password,port,type):
print(link,no,dir,gui,sel,ypfind,terminal,user,password,port,type)
r = requests.get(link,headers=REQUEST_HEADERS)
soup = BeautifulSoup(r.content,"html.parser")
workbook = xlwt.Workbook()
sheet = workbook.add_sheet(str(ypfind))
count = 0
for i in soup.find_all(class_="business-name"):
sheet.write(count,0,str(i.text))
sheet.write(count,1,str("http://www.yellowpages.com"+i.get("href")))
r1 = requests.get("http://www.yellowpages.com"+i.get("href"))
soup1 = BeautifulSoup(r1.content,"html.parser")
website = soup1.find("a",class_="custom-link")
try:
print("Acquiring Website")
sheet.write(count,2,str(website.get("href")))
except:
sheet.write(count,2,str("None"))
email = soup1.find("a",class_="email-business")
try:
print(email.get("href"))
EMAIL = re.sub("mailto:","",str(email.get("href")))
sheet.write(count,3,str(EMAIL))
except:
sheet.write(count,3,str("None"))
phonetemp = soup1.find("div",class_="contact")
try:
phone = phonetemp.find("p")
print(phone.text)
sheet.write(count,4,str(phone.text))
except:
sheet.write(count,4,str("None"))
reviews = soup1.find(class_="count")
try:
print(reviews.text)
sheet.write(count,5,str(reviews.text))
except:
sheet.write(count,5,str("None"))
count+=1
save = dir+"\\"+ypfind+str(no)+".xls"
workbook.save(save)
no+=1
for i in soup.find_all("a",class_="next ajax-page"):
print(i.get("href"))
actual_yellow_pages_scrape("http://www.yellowpages.com"+str(i.get("href")),no,dir,gui,sel,ypfind,terminal,user,password,port,type)
The code is my above portion of the scraper. I have created the break points at soup and in the for loop not even a single line of for loop gets executed. No errors thrown. I tried the same with printing numbers from 1-10 it works but this is not working why?
Thank you
Answer has been found,
I used a text visulaizer to find what is in "r.content" I soupified it and got a clean HTML and gone through the HTML file and finally found that the browser is unsupported so I removed the requests header and ran the code finally got what I wanted

How can I loop scraping data for multiple pages in a website using python and beautifulsoup4

I am trying to scrape data from the PGA.com website to get a table of all of the golf courses in the United States. In my CSV table I want to include the Name of the golf course ,Address ,Ownership ,Website , Phone number. With this data I would like to geocode it and place into a map and have a local copy on my computer
I utilized Python and Beautiful Soup4 to extract my data. I have reached as far to extract the data and import it into a CSV but I am now having a problem of scraping data from multiple pages on the PGA website. I want to extract ALL THE GOLF COURSES but my script is limited only to one page I want to loop it in away that it will capture all data for golf courses from all pages found in the PGA site. There are about 18000 gold courses and 900 pages to capture data
Attached below is my script. I need help on creating code that will capture ALL data from the PGA website and not just one site but multiple. In this manner it will provide me with all the data of gold courses in the United States.
Here is my script below:
import csv
import requests
from bs4 import BeautifulSoup
url = "http://www.pga.com/golf-courses/search?searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0"
r = requests.get(url)
soup = BeautifulSoup(r.content)
g_data1=soup.find_all("div",{"class":"views-field-nothing-1"})
g_data2=soup.find_all("div",{"class":"views-field-nothing"})
courses_list=[]
for item in g_data2:
try:
name=item.contents[1].find_all("div",{"class":"views-field-title"})[0].text
except:
name=''
try:
address1=item.contents[1].find_all("div",{"class":"views-field-address"})[0].text
except:
address1=''
try:
address2=item.contents[1].find_all("div",{"class":"views-field-city-state-zip"})[0].text
except:
address2=''
try:
website=item.contents[1].find_all("div",{"class":"views-field-website"})[0].text
except:
website=''
try:
Phonenumber=item.contents[1].find_all("div",{"class":"views-field-work-phone"})[0].text
except:
Phonenumber=''
course=[name,address1,address2,website,Phonenumber]
courses_list.append(course)
with open ('filename5.csv','wb') as file:
writer=csv.writer(file)
for row in courses_list:
writer.writerow(row)
#for item in g_data1:
#try:
#print item.contents[1].find_all("div",{"class":"views-field-counter"})[0].text
#except:
#pass
#try:
#print item.contents[1].find_all("div",{"class":"views-field-course-type"})[0].text
#except:
#pass
#for item in g_data2:
#try:
#print item.contents[1].find_all("div",{"class":"views-field-title"})[0].text
#except:
#pass
#try:
#print item.contents[1].find_all("div",{"class":"views-field-address"})[0].text
#except:
#pass
#try:
#print item.contents[1].find_all("div",{"class":"views-field-city-state-zip"})[0].text
#except:
#pass
This script only captures 20 at a time and I want to capture all in one script which account for 18000 golf courses and 900 pages to scrape form.
The PGA website's search have multiple pages, the url follows the pattern:
http://www.pga.com/golf-courses/search?page=1 # Additional info after page parameter here
this means you can read the content of the page, then change the value of page by 1, and read the the next page.... and so on.
import csv
import requests
from bs4 import BeautifulSoup
for i in range(907): # Number of pages plus one
url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
r = requests.get(url)
soup = BeautifulSoup(r.content)
# Your code for each individual page here
if you still read this post , you can try this code too....
from urllib.request import urlopen
from bs4 import BeautifulSoup
file = "Details.csv"
f = open(file, "w")
Headers = "Name,Address,City,Phone,Website\n"
f.write(Headers)
for page in range(1,5):
url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(page)
html = urlopen(url)
soup = BeautifulSoup(html,"html.parser")
Title = soup.find_all("div", {"class":"views-field-nothing"})
for i in Title:
try:
name = i.find("div", {"class":"views-field-title"}).get_text()
address = i.find("div", {"class":"views-field-address"}).get_text()
city = i.find("div", {"class":"views-field-city-state-zip"}).get_text()
phone = i.find("div", {"class":"views-field-work-phone"}).get_text()
website = i.find("div", {"class":"views-field-website"}).get_text()
print(name, address, city, phone, website)
f.write("{}".format(name).replace(",","|")+ ",{}".format(address)+ ",{}".format(city).replace(",", " ")+ ",{}".format(phone) + ",{}".format(website) + "\n")
except: AttributeError
f.close()
where it is written range(1,5) just change that with 0,to the last page , and you will get all details in CSV, i tried very hard to get your data in proper format but it's hard:).
You're putting a link to a single page, it's not going to iterate through each one on its own.
Page 1:
url = "http://www.pga.com/golf-courses/search?searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0"
Page 2:
http://www.pga.com/golf-courses/search?page=1&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0
Page 907:
http://www.pga.com/golf-courses/search?page=906&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0
Since you're running for page 1 you'll only get 20. You'll need to create a loop that'll run through each page.
You can start off by creating a function that does one page then iterate that function.
Right after the search? in the url, starting at page 2, page=1 begins increasing until page 907 where it's page=906.
I noticed that the first solution had a repetition of the first instance, that is because the 0 page and 1 page is the same page. This is resolved by specifying the start page in the range function. Example below...
for i in range(1, 907): #Number of pages plus one
url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
r = requests.get(url)
soup = BeautifulSoup(r.content, "html5lib") #Can use whichever parser you prefer
# Your code for each individual page here
Had this same exact problem and the solutions above did not work. I solved mine by accounting for cookies. A requests session helps. Create a session and it'll pull all the pages you need by inserting a cookie to all the numbered pages.
import csv
import requests
from bs4 import BeautifulSoup
url = "http://www.pga.com/golf-courses/search?searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0"
s = requests.Session()
r = s.get(url)
The PGA website has changed this question has been asked.
It seems they organize all courses by: State > City > Course
In light of this change and the popularity of this question, here's how I'd solve this problem today.
Step 1 - Import everything we'll need:
import time
import random
from gazpacho import Soup # https://github.com/maxhumber/gazpacho
from tqdm import tqdm # to keep track of progress
Step 2 - Scrape all the state URL endpoints:
URL = "https://www.pga.com"
def get_state_urls():
soup = Soup.get(URL + "/play")
a_tags = soup.find("ul", {"data-cy": "states"}, mode="first").find("a")
state_urls = [URL + a.attrs['href'] for a in a_tags]
return state_urls
state_urls = get_state_urls()
Step 3 - Write a function to scrape all the city links:
def get_state_cities(state_url):
soup = Soup.get(state_url)
a_tags = soup.find("ul", {"data-cy": "city-list"}).find("a")
state_cities = [URL + a.attrs['href'] for a in a_tags]
return state_cities
state_url = state_urls[0]
city_links = get_state_cities(state_url)
Step 4 - Write a function to scrape all of the courses:
def get_courses(city_link):
soup = Soup.get(city_link)
courses = soup.find("div", {"class": "MuiGrid-root MuiGrid-item MuiGrid-grid-xs-12 MuiGrid-grid-md-6"}, mode="all")
return courses
city_link = city_links[0]
courses = get_courses(city_link)
Step 5 - Write a function to parse all the useful info about a course:
def parse_course(course):
return {
"name": course.find("h5", mode="first").text,
"address": course.find("div", {'class': "jss332"}, mode="first").strip(),
"url": course.find("a", mode="first").attrs["href"]
}
course = courses[0]
parse_course(course)
Step 6 - Loop through everything and save:
all_courses = []
for state_url in tqdm(state_urls):
city_links = get_state_cities(state_url)
time.sleep(random.uniform(1, 10) / 10)
for city_link in city_links:
courses = get_courses(city_link)
time.sleep(random.uniform(1, 10) / 10)
for course in courses:
info = parse_course(course)
all_courses.append(info)

Categories