How to navigate to the next page in Selenium? - Python

I'm new to this field. I'm trying to navigate through a web page to scrape its data. When I execute the code, it scrapes the first page but never navigates to the next pages. I have tried many approaches but couldn't find a solution. Please check my code below, where I have written the pagination logic. Any help is appreciated. Thanks in advance.
import xlwt
from selenium import webdriver
import re
import time
from datetime import date

class kotsovolosmobiles:
    def __init__(self):
        self.url='https://www.kotsovolos.gr/mobile-phones-gps/mobile-phones/smartphones?pageSize=60'
        self.country='GR'
        self.currency='euro'
        self.VAT= 'Included'
        self.shipping = 'Available for shipment'
        self.Pre_PromotionPrice ='N/A'

    def kotsovolos(self):
        wb = xlwt.Workbook()
        ws = wb.add_sheet('Sheet1',cell_overwrite_ok=True)
        ws.write(0,0,"Product_Url")
        ws.write(0,0,"Product_Manufacturer")
        ws.write(0,1,"Product_Url")
        ws.write(0,2,"Product_Price")
        ws.write(0,3,"Product_Model")
        ws.write(0,4,"Memory")
        ws.write(0,5,"Currency")
        ws.write(0,6,"Color")
        ws.write(0,7,"VAT")
        ws.write(0,8,"Shipping Cost")
        ws.write(0,9,"Pre-PromotionPrice")
        ws.write(0,10,"Country")
        ws.write(0,11,"Date")
        ws.write(0,12,"Raw_Model")
        wb.save(r"C:\Users\Karthick R\Desktop\VS code\kotsovolos.xls")
        driver=webdriver.Chrome()
        driver.get(self.url)
        today = date.today()
        time.sleep(5)
        cookies = driver.find_element_by_css_selector('a[id="CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"]')
        cookies.click()
        print("cookies accepted")
        driver.maximize_window()
        time.sleep(5)
        titles = []
        models = []
        memorys = []
        prod_prices = []
        p_links =[]
        p_colors = []
        while True:
            storage_box = []
            storage_box = driver.find_elements_by_css_selector('div[class="product"]')
            for storage_boxes in storage_box:
                product_url = storage_boxes.find_element_by_css_selector('div[class="title"] a').get_attribute('href')
                print(product_url)
                p_links.append(product_url)
                p_model = storage_boxes.find_element_by_css_selector('div[class="title"] a').text
                print(p_model)
                models.append(p_model)
                manufacturer1 = p_model.split(" ")
                print(manufacturer1[0])
                titles.append(manufacturer1[0])
                memory = []
                memory = re.findall('\d+ ?[gG][bB]',p_model)
                print(memory)
                memory1 = str(memory).replace("['",'').replace("']",'').replace("[]",'').strip()
                if "," in memory1:
                    arr=memory1.split(",")
                    for str1 in arr:
                        str2=str1.replace("GB", "").replace("gb", "").replace("'", "").strip()
                        if len(str2)!=1:
                            memory_str=str1
                            break
                elif (memory1 == ""):
                    memory_str ='N/A'
                else:
                    memory_str=memory1
                memory_str = memory_str.replace("'", "").strip()
                print(memory_str)
                memorys.append(memory_str)
                colors= []
                prod_color = p_model.split(" ")
                length = len(prod_color)
                indexcolor = length-3
                colors.append(prod_color[indexcolor])
                color1 = str(colors).replace("['",'').replace("']",'').strip()
                print(color1)
                p_colors.append(color1)
                p_price = storage_boxes.find_element_by_css_selector('.priceWithVat > .price').text
                print(p_price)
                prod_prices.append(p_price)
            next = driver.find_element_by_css_selector('.pagination_next a')
            time.sleep(3)
            next.click()
            print("next page")
            time.sleep(3)

kotsovolos_gr = kotsovolosmobiles()
kotsovolos_gr.kotsovolos()

Maybe the page isn't loaded yet. Try replacing the last block in the loop with:
next = driver.find_element_by_css_selector('.pagination_next a')
url = next.get_attribute('href')
driver.get(url)
time.sleep(3)  # Maybe it's not necessary
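If you want to avoid the fixed sleep altogether, here is a minimal sketch using an explicit wait instead (an assumption on my part, reusing the div.product cards from the question as the "page is loaded" signal):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

next_link = driver.find_element_by_css_selector('.pagination_next a')
driver.get(next_link.get_attribute('href'))
# wait until the product cards of the next page are present instead of sleeping
WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.product'))
)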

Try it like below. I was able to visit all of the other 5 pages.
driver.implicitly_wait(10)
driver.get("https://www.kotsovolos.gr/mobile-phones-gps/mobile-phones/smartphones?pageSize=60")
time.sleep(5) # Would be better to apply Explicit wait to click on `Απόρριψη`
driver.find_element_by_xpath("//a[contains(text(),'Απόρριψη')]").click()
nextbuttons = driver.find_element_by_xpath("//ul[@class='pagination']/li[5]/a")
length = int(nextbuttons.get_attribute("innerText"))
for i in range(2,length+1):
    nextopt = driver.find_element_by_xpath("//ul[@class='pagination']/li/a[contains(text(),'{}')]".format(i))
    nextopt.click()
    time.sleep(5)
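As the comment above suggests, the time.sleep(5) before dismissing the cookie banner can be replaced by an explicit wait. A minimal sketch (untested, assuming the same Απόρριψη link):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# wait until the cookie banner's reject link is clickable, then dismiss it
WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//a[contains(text(),'Απόρριψη')]"))
).click()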

Related

Variable from a for loop not showing in another loop

I've been practicing web scraping with the nba.com player list, but I've run into a problem where a link that I scraped in one for loop does not appear when I call on it in another for loop.
I have already tried creating more variables in both the original for loop and the for loop I want the variable to show up in, but it still does not appear. I am trying to use the link that I scraped (it is not the full link, but rather the end part of the link, which I'm trying to combine with the base URL).
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options

class Player():
    """docstring for ClassName"""
    def __init__(self):
        self.name = ""
        self.link = ""
        self.PPG = ""
        self.RPG = ""

def get_player_list():
    opt = webdriver.ChromeOptions()
    opt.add_argument('headless')
    browser = webdriver.Chrome(options=opt)
    browser.get('https://stats.nba.com/players/list/')
    soup = BeautifulSoup(browser.page_source, 'lxml')
    browser.quit()
    names = soup.find('div',class_='stats-player-list players-list')
    player_list = []
    for name in names.find_all('a'):
        new_play = Player()
        new_play.name = name.text
        new_play.link = name["href"]
        player_list.append(new_play)
    for one_player in player_list:
        print (one_player.name)
        print (one_player.link)
    return player_list

def get_player_stats(player_list):
    opt = webdriver.ChromeOptions()
    opt.add_argument('headless')
    browser = webdriver.Chrome(options=opt)
    for p in player_list[0:2]:
        browser.get('https://stats.nba.com'+p.link)
        soup = BeautifulSoup(browser.page_source, 'lxml')
        browser.quit()
        PPG = ""
        points1 = soup.find('a',href = '/players/traditional/?sort=PTS&dir=-1')
        points = points1.div
        for point in points.findNextSiblings():
            PPG = "PPG" + point.text
        RPG = ""
        rebounds1 = soup.find('a',href = '/players/traditional/?sort=REB&dir=-1')
        rebounds = rebounds1.div
        for rebound in rebounds.findNextSiblings():
            RPG = "RPG" + rebound.text
        p.PPG = PPG
        p.RPG = RPG
    browser.quit()
    return player_list

player_list = get_player_stats(get_player_list())
In the loop starting with names.find_all('a'):, everything works properly and the link gets stored and prints out following the template (e.g. Abrines, Alex /player/203518/), but when it gets to for p in player_list[0:2]:, p.link doesn't carry over, and when I tried to print p.link, nothing got printed. Any help would be appreciated, as I've been testing out so many things for a while now!
The player_list variable is not a global variable. You define it in a single function; to keep track of it across all functions, you should initialize it globally.
This is because the first href is null. In that case you need to put a condition in place before adding it to the list. I have added that step; check now.
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options

class Player():
    """docstring for ClassName"""
    def __init__(self):
        self.name = ""
        self.link = ""
        self.PPG = ""
        self.RPG = ""

def get_player_list():
    opt = webdriver.ChromeOptions()
    opt.add_argument('headless')
    browser = webdriver.Chrome(options=opt)
    browser.get('https://stats.nba.com/players/list/')
    soup = BeautifulSoup(browser.page_source, 'lxml')
    browser.quit()
    names = soup.find('div',class_='stats-player-list players-list')
    player_list = []
    for name in names.find_all('a'):
        if name["href"]:
            new_play = Player()
            new_play.name = name.text
            new_play.link = name["href"]
            player_list.append(new_play)
    # for one_player in player_list:
    #     print (one_player.name)
    #     print (one_player.link)
    return player_list

def get_player_stats(player_list):
    opt = webdriver.ChromeOptions()
    opt.add_argument('headless')
    browser = webdriver.Chrome(options=opt)
    for p in player_list[0:2]:
        print('https://stats.nba.com'+p.link)
        browser.get('https://stats.nba.com'+p.link)
        soup = BeautifulSoup(browser.page_source, 'lxml')
        browser.quit()
        PPG = ""
        points1 = soup.find('a',href = '/players/traditional/?sort=PTS&dir=-1')
        points = points1.div
        for point in points.findNextSiblings():
            PPG = "PPG" + point.text
        RPG = ""
        rebounds1 = soup.find('a',href = '/players/traditional/?sort=REB&dir=-1')
        rebounds = rebounds1.div
        for rebound in rebounds.findNextSiblings():
            RPG = "RPG" + rebound.text
        p.PPG = PPG
        p.RPG = RPG
    browser.quit()
    return player_list

player_list = get_player_stats(get_player_list())

Storing a list of objects as csv in python

I am writing a program that scrapes a website and extracts the names and links from a table. I store each name and its respective link in an object and append it to a list of objects in a for loop. I want to save this list as a CSV file, but I haven't been able to do so using the file writer, as it converts the whole object to a str, which is not what I want. I would be really grateful if someone could guide me with this. Here is my code:
import time
from selenium import webdriver
from bs4 import BeautifulSoup

class Player():
    def __init__(self):
        self.name = ""
        self.link = ""

player_list = []
driver = webdriver.PhantomJS(executable_path = r'C:\Users\sarim\Desktop\Scraper\phantomjs.exe')
driver.get('https://dak.gg/ranks/na/solo-fpp/rating')
pause = 0
lastHeight = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(pause)
    newHeight = driver.execute_script("return document.body.scrollHeight")
    if newHeight == lastHeight:
        break
    lastHeight = newHeight

html_doc = driver.page_source
soup = BeautifulSoup(html_doc, 'lxml')
players = soup.find('table', class_ = 'list rating')
player_names = players.find_all('td', class_ = 'nick')
add_link = "https://dak.gg"
for i in player_names[0:3]:
    newPlay = Player()
    n = i.find('a')
    l = add_link + n['href']
    newPlay.link = l
    newPlay.name = n.text.strip()
    #print newPlay.name
    #print newPlay.link
    player_list.append(newPlay)
#print players_names[1]
driver.quit()
Try something like this
import csv

with open('data.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Name', 'Link'])
    for player in player_list:
        writer.writerow([player.name, player.link])
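If you prefer named columns, the same idea works with csv.DictWriter. A small sketch under the same assumption (the player_list of Player objects from the question); newline='' avoids blank rows on Windows:

import csv

with open('data.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['Name', 'Link'])
    writer.writeheader()
    for player in player_list:
        writer.writerow({'Name': player.name, 'Link': player.link})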

I'm trying to make the data more concise, but it doesn't work

I've tried many methods to clean up this data, but none of them work. The strip() and replace() methods produce output like in the first picture; it doesn't work. Please help me.
import requests
from lxml import html,etree
from selenium import webdriver
import time

file_name = 'dubanxinlixue.json'
driver = webdriver.Chrome()
url_string = []
name_data, price_data = [], []
jd_goods_data = {}
page = 0
while True:
    url = 'https://book.douban.com/tag/%E5%BF%83%E7%90%86%E5%AD%A6?start={page}&type=S'.format(page=page)
    url_string.append(url)
    page += 20
    if page > 980:
        break

for i in url_string:
    driver.get(i)
    base_html = driver.page_source
    selctor = etree.HTML(base_html)
    j = 1
    for j in range(20):
        j += 1
        name = '//*[@id="subject_list"]/ul/li[%d]/div[2]/h2/a[1]/@title'%(j)
        get_name = selctor.xpath(name)[0]
        describe = '//*[@id="subject_list"]/ul/li[%d]/div[2]/div[1]/text()'%(j)
        get_describe = selctor.xpath(describe)[0]
        get_describe.string.strip()
        print(get_describe)
get_describe looks like this: [the result of get_describe][1]
new_get_describe = get_describe.strip()
print(new_get_describe)
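If the string still looks messy after strip(), the leftover noise is usually internal newlines and runs of spaces, which strip() does not touch. A small sketch that collapses them (assuming get_describe is the plain string returned by the xpath text() call):

# split() breaks on any whitespace run, join() rebuilds with single spaces
new_get_describe = " ".join(get_describe.split())
print(new_get_describe)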

PhantomJS not extracting links Selenium

I'm scraping a website using Selenium, Scrapy and PhantomJS. The problem is that, although the code scrolls the page perfectly, it extracts links only up to a certain limit. Beyond that, it completely ignores the result of scrolling. When I use the Firefox WebDriver, it works perfectly. Since I'm running the code on a server, I used PhantomJS and thus encountered the problem. Below is the code:
# -*- coding: utf-8 -*-
from scrapy.spider import BaseSpider
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import csv
import re
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait

class DukeSpider(BaseSpider):
    name = "dspider"
    allowed_domains = ["dukemedicine.org"]
    start_urls = ["http://www.dukemedicine.org/find-doctors-physicians"] #hlor

    def __init__(self):
        self.driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
        self.driver.maximize_window()
        print 'here'

    def parse(self, response):
        print 'nowhere'
        print response
        print response.url
        b = open('doc_data_duke.csv', 'a')
        a = csv.writer(b, lineterminator='\n')
        print 'a'
        self.driver.get(response.url)
        time.sleep(10)
        wait = WebDriverWait(self.driver, 10)
        print 'helo'
        click = self.driver.find_element_by_xpath("//span[@id='specialty']")
        click.click()
        click_again = self.driver.find_element_by_xpath("//ul[@class='doctor-type']/li[@class='ng-binding ng-scope'][2]")
        click_again.click()
        time.sleep(25)
        act = ActionChains(self.driver)
        act.move_to_element(self.driver.find_element_by_id('doctor-matrix-section')).click()
        print 'now here'
        for i in range(0, 75):
            #self.driver.find_element_by_xpath("//div[@id='doctor-matrix-section']").send_keys(Keys.PAGE_DOWN)
            #self.driver.execute_script("window.scrollBy(0, document.body.scrollHeight);")
            #self.driver.find_element_by_tag_name("body").click()
            #self.driver.find_element_by_tag_name("body").send_keys(Keys.PAGE_DOWN)#findElement(By.tagName("body")).sendKeys(Keys.UP);
            #self.driver.find_element_by_tag_name("body").send_keys(Keys.END)
            #bg = self.driver.find_element_by_css_selector('body')
            #bg.send_keys(Keys.SPACE)
            act.send_keys(Keys.PAGE_DOWN).perform()
            time.sleep(2)
            print i
            i += 1
        links = self.driver.find_elements_by_xpath("//div[@class = 'result-information']/div[@class='name']/a")
        for l in links:
            print l
            doc_list = l.get_attribute('href')
            if re.match(r'https:\/\/www\.dukemedicine\.org\/find-doctors-physicians\/#!\/(.*)', doc_list):
                print doc_list
                dr = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
                dr.maximize_window()
                dr.get(doc_list)
                try:
                    name_title = dr.find_element_by_xpath('//div[@class="header1 ng-binding"]').text
                    name_titles = name_title.split(",", 1)
                    name = name_titles[0].encode('utf-8')
                    title = name_titles[1]
                    print name.encode('utf-8')
                    title = title[1:].encode('utf-8')
                    print title.encode('utf-8')
                except:
                    name = ''
                    title = ''
                try:
                    speciality = dr.find_element_by_xpath('//p[@class="specialties ng-scope"]').text
                except:
                    speciality = ''
                try:
                    language = dr.find_element_by_xpath(
                        '//div[@class="lang ng-scope"]/div[@class="plainText inline ng-binding"]').text
                except:
                    language = ''
                if dr.find_elements_by_xpath('//div[@class="location-info"]'):
                    locations = dr.find_elements_by_xpath('//div[@class="location-info"]')
                    if len(locations) >= 3:
                        locationA = locations[0].text.encode('utf-8')
                        locationA = locationA.replace('Directions', '')
                        locationA = locationA.replace('\n', '')
                        locationB = locations[1].text.encode('utf-8')
                        locationB = locationB.replace('Directions', '')
                        locationB = locationB.replace('\n', '')
                        locationC = locations[2].text.encode('utf-8')
                        locationC = locationC.replace('\n', '')
                        locationC = locationC.replace('Directions', '')
                    elif len(locations) == 2:
                        locationA = locations[0].text.encode('utf-8')
                        locationA = locationA.replace('Directions', '')
                        locationA = locationA.replace('\n', '')
                        locationB = locations[1].text.encode('utf-8')
                        locationB = locationA.replace('Directions', '')
                        locationB = locationB.replace('\n', '')
                        locationC = ''
                    elif len(locations) == 1:
                        locationA = locations[0].text.encode('utf-8')
                        locationA = locationA.replace('Directions', '')
                        locationA = locationA.replace('\n', '')
                        locationB = ''
                        locationC = ''
                    else:
                        locationA = ''
                        locationB = ''
                        locationC = ''
                dr.close()
                data = [title, name, speciality, language, locationA, locationB, locationC]
                print 'aaaa'
                print data
                a.writerow(data)
No matter what higher value I set in the range, it ignores results beyond a certain point.
Let's use the fact that there is an element having the total result count:
The idea is to iteratively scroll into view of the last found doctor until we've got all doctors loaded.
Implementation (with clarifying comments, leaving only relevant "selenium" specific part):
# -*- coding: utf-8 -*-
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true', '--load-images=false'])
# driver = webdriver.Chrome()
driver.maximize_window()
driver.get("http://www.dukemedicine.org/find-doctors-physicians")

# close optional survey popup if exists
try:
    driver.find_element_by_css_selector("area[alt=close]").click()
except NoSuchElementException:
    pass

# open up filter dropdown
click = driver.find_element_by_id("specialty")
click.click()

# choose specialist
specialist = driver.find_element_by_xpath("//ul[@class = 'doctor-type']/li[contains(., 'specialist')]")
specialist.click()

# artificial delay: TODO: fix?
time.sleep(15)

# read total results count
total_count = int(driver.find_element_by_id("doctor-number").text)

# get the initial results count
results = driver.find_elements_by_css_selector("div.doctor-result")
current_count = len(results)

# iterate while all of the results would not be loaded
while current_count < total_count:
    driver.execute_script("arguments[0].scrollIntoView();", results[-1])
    results = driver.find_elements_by_css_selector("div.doctor-result")
    current_count = len(results)
    print "Current results count: %d" % current_count

# report total results
print "----"
print "Total results loaded: %d" % current_count

driver.quit()
Works for me perfectly in both PhantomJS and Chrome. Here is what I get on the console:
Current results count: 36
Current results count: 54
Current results count: 72
Current results count: 90
...
Current results count: 1656
Current results count: 1674
Current results count: 1692
Current results count: 1708
----
Total results loaded: 1708
Additionally, note that I've added the --load-images=false command-line argument, which actually speeds things up dramatically.
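For the time.sleep(15) marked with a TODO in the code above, one possible replacement (untested, assuming the doctor-number counter element behaves the same) is to wait until the counter actually holds a number:

from selenium.webdriver.support.ui import WebDriverWait

# wait until the result counter is populated instead of sleeping for 15 seconds
WebDriverWait(driver, 30).until(
    lambda d: d.find_element_by_id("doctor-number").text.strip().isdigit()
)
total_count = int(driver.find_element_by_id("doctor-number").text)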

Append scraped data to different columns

while True:
    for rate in soup.find_all('div',{"class":"rating"}):
        if rate.img is not None:
            print (rate.img['alt'])
    try:
        driver.find_element_by_link_text('Next').click()
    except:
        break
driver.quit()

while True:
    for rate in soup.findAll('div',{"class":"listing_title"}):
        print (rate.a.text)
    try:
        driver.find_element_by_link_text('Next').click()
    except:
        break
driver.quit()
This should do what you're looking for. You should grab the parent class of both (I chose .listing), get each attribute from there, insert them into a dict, and then write the dicts to CSV with the Python csv library. Just as a fair warning, I didn't run it until it broke; I just broke out after the second loop to save some computing.
WARNING HAVE NOT TESTED ON FULL SITE
import csv
import time

from bs4 import BeautifulSoup
import requests
from selenium import webdriver

url = 'http://www.tripadvisor.in/Hotels-g186338-London_England-Hotels.html'
driver = webdriver.Firefox()
driver.get(url)
hotels = []

while True:
    html = driver.page_source
    soup = BeautifulSoup(html)
    listings = soup.select('div.listing')
    for l in listings:
        hotel = {}
        hotel['name'] = l.select('a.property_title')[0].text
        hotel['rating'] = float(l.select('img.sprite-ratings')[0]['alt'].split('of')[0])
        hotels.append(hotel)
    next = driver.find_element_by_link_text('Next')
    if not next:
        break
    else:
        next.click()
    time.sleep(0.5)

if len(hotels) > 0:
    with open('ratings.csv', 'w') as f:
        fieldnames = [ k for k in hotels[0].keys() ]
        writer = csv.DictWriter(f,fieldnames=fieldnames)
        writer.writeheader()
        for h in hotels:
            writer.writerow(h)

driver.quit()
You should look at using a list.
I would try something like this:
for rate in soup.findAll('div',{"class":["rating","listing_title"]}):
(could be wrong, this machine doesn't have bs4 for me to check, sorry)
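For completeness, here is a rough sketch of how that combined selector could feed two columns (my own untested assumption: each listing contributes one title and one rating in document order):

ratings, titles = [], []
for div in soup.findAll('div',{"class":["rating","listing_title"]}):
    # sort each div into the right column based on which class it carries
    if "rating" in div.get("class", []):
        if div.img is not None:
            ratings.append(div.img['alt'])
    else:
        titles.append(div.a.text)
# pair them up row by row
rows = list(zip(titles, ratings))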
