I have a problem with a scrape in Python. I need to collect patent data for multiple firms, but when a firm's patent is not available my code does not move on to the next item in the list. For example, the first firm in my list does not have a registered patent, so I want the code to continue with the next firm id. At the end of the function I inserted
except Exception:
    print(f'CNPJ {pj} with problem. Check the list.')
    pass
but it was not enough. I would really appreciate it if someone could help me. Below is my code.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 15 03:52:42 2019
Project: Patents
#author: caique
"""
# Create List
cnpj = ['00.000.100/0000-00', '76.487.032/0001-25', '46.068.425/0001-33', '00.348.003/0001-10', '17.217.985/0001-04']
# Create Function
def patente_pj(cnpj):
    import os
    import pandas as pd
    import selenium
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from bs4 import BeautifulSoup
    from contextlib import suppress

    os.chdir("/home/caique/Desktop/Patentes INPI")

    # Chrome Headless
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1920x1080")

    # path to the chromedriver executable
    patentes = []
    try:
        for pj in cnpj:
            pj = str(pj).replace('.', '').replace('/', '').replace(' ', '').replace('-', '')
            driver = webdriver.Chrome(executable_path=r"/home/caique/Desktop/Patentes INPI/chromedriver")
            driver.get("https://gru.inpi.gov.br/pePI/jsp/patentes/PatenteSearchBasico.jsp")
            driver.find_element_by_link_text("Continuar....").click()
            driver.get("https://gru.inpi.gov.br/pePI/jsp/patentes/PatenteSearchBasico.jsp")
            driver.find_element_by_link_text("Pesquisa Avançada").click()
            destination_page_link = driver.find_element_by_xpath("//*[@id='principal']/div[7]/button")
            destination_page_link.click()
            driver.find_element_by_xpath(
                "//*[@id='principal']/div[7]/div/table/tbody/tr[2]/td[2]/font/input").send_keys(pj)
            driver.find_element_by_xpath(
                "//*[@id='principal']/table[3]/tbody/tr[1]/td/font[2]/select/option[1]").click()
            driver.find_element_by_xpath("//*[@id='principal']/table[3]/tbody/tr[2]/td/font/input[1]").click()

            html_source = driver.page_source
            soup = BeautifulSoup(html_source, 'lxml')
            tabela = soup.find_all("tr")[8]
            soup1 = soup.find_all("tbody")[1]
            soup2 = soup.find_all("tbody")[2]
            hits = len(soup2.find_all("a")) - 1

            linha1 = []
            for col in tabela.find_all("td"):
                linha1.append(col.text)

            table_dados = []
            for i in range(0, len(soup1.find_all("b"))):
                dict = {}
                linha_teste = soup1.find_all("tr")[i]
                dict[linha1[0]] = linha_teste.find_all("a")[0].text.replace('\n\t', '').replace('\n', '').replace(' ', '')
                dict[linha1[1]] = linha_teste.find_all("td")[1].text[19:29]
                dict[linha1[2]] = linha_teste.find_all("b")[0].text.replace('\n\t', '').replace('\n', '').replace(' ', '')
                dict[linha1[3]] = linha_teste.find_all("font")[3].text.replace('\n\t', '').replace('\n', '').replace(' ', '')
                table_dados.append(dict)

            desired_width = 700  # width
            pd.set_option('display.width', desired_width)
            pd.set_option('display.max_columns', 10)
            tabela_de_teste = pd.DataFrame(table_dados)

            driver.find_element_by_link_text("Próxima»").click()
            cont = 2
            for cont in range(1, int(driver.find_element_by_xpath("//*[@id='tituloEResumoContextGlobal']/font/b[3]").text)):
                html_source = driver.page_source
                soup = BeautifulSoup(html_source, 'lxml')
                soup1 = soup.find_all("tbody")[1]
                for i in range(0, len(soup1.find_all("b"))):
                    dict = {}
                    linha_teste = soup1.find_all("tr")[i]
                    dict[linha1[0]] = linha_teste.find_all("a")[0].text.replace('\n\t', '').replace('\n', '').replace(' ', '')
                    dict[linha1[1]] = linha_teste.find_all("td")[1].text[19:29]
                    dict[linha1[2]] = linha_teste.find_all("b")[0].text.replace('\n\t', '').replace('\n', '').replace(' ', '')
                    dict[linha1[3]] = linha_teste.find_all("font")[3].text.replace('\n\t', '').replace('\n', '').replace(' ', '')
                    table_dados.append(dict)
                if cont < -1 + int(driver.find_element_by_xpath("//*[@id='tituloEResumoContextGlobal']/font/b[3]").text):
                    driver.find_element_by_link_text("Próxima»").click()

            print(pd.DataFrame(table_dados))
            driver.quit()
            tabela_final = pd.DataFrame(table_dados)
            patentes.append(tabela_final.to_csv('/home/caique/Desktop/Patentes INPI/CSV/patentes_' + pj + '.csv'))
    except Exception:
        print(f'CNPJ {pj} with problem. Check the list.')
        pass
    return patentes
# Run Function
patente_pj(cnpj)
Put the try ... except statement inside the loop, wrapped around the body that handles a single CNPJ, so that a failure for one firm does not abort the whole run.
You may also want to print the error message when the exception occurs, to make debugging easier.
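A minimal sketch of that restructure, assuming a hypothetical helper scrape_one_cnpj() that wraps the per-firm navigation and parsing code from the question and returns the per-firm DataFrame:

from selenium import webdriver

def patente_pj(cnpj):
    patentes = []
    for pj in cnpj:
        pj = str(pj).replace('.', '').replace('/', '').replace(' ', '').replace('-', '')
        driver = webdriver.Chrome(executable_path=r"/home/caique/Desktop/Patentes INPI/chromedriver")
        try:
            # scrape_one_cnpj is a hypothetical helper holding the per-firm
            # navigation and parsing code from the question
            tabela_final = scrape_one_cnpj(driver, pj)
            tabela_final.to_csv('/home/caique/Desktop/Patentes INPI/CSV/patentes_' + pj + '.csv')
            patentes.append(tabela_final)
        except Exception as e:
            # a CNPJ with no registered patent no longer aborts the whole loop
            print(f'CNPJ {pj} with problem ({e}). Check the list.')
        finally:
            driver.quit()  # close the browser even when this CNPJ fails
    return patentes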
So I wanted to extract all the Bollywood movies, and the project requires the movie titles, cast, crew, IMDb id, etc. I am not able to get all the IMDb IDs; I get a NoneType error. When I used the code on one page only it worked quite well, but when I use it on multiple pages it throws an error. Please help. Here is my code:
#importing the libraries needed
import pandas as pd
import numpy as np
import requests
import re
from bs4 import BeautifulSoup
from time import sleep
from random import randint
#declaring the list of empty variables, So that we can append the data overall
movie_name = []
year = []
time=[]
rating=[]
votes = []
description = []
director_s = []
starList= []
imdb_id = []
#the whole core of the script
url = "https://www.imdb.com/search/title/?title_type=feature&primary_language=hi&sort=num_votes,desc&start=1&ref_=adv_nxt"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
movie_data = soup.findAll('div', attrs = {'class': 'lister-item mode-advanced'})
for store in movie_data:
    name = store.h3.a.text
    movie_name.append(name)

    year_of_release = store.h3.find('span', class_ = "lister-item-year text-muted unbold").text
    year.append(year_of_release)

    runtime = store.p.find("span", class_ = 'runtime').text if store.p.find("span", class_ = 'runtime') else " "
    time.append(runtime)

    rate = store.find('div', class_ = "inline-block ratings-imdb-rating").text.replace('\n', '') if store.find('div', class_ = "inline-block ratings-imdb-rating") else " "
    rating.append(rate)

    value = store.find_all('span', attrs = {'name': "nv"})
    vote = value[0].text if store.find_all('span', attrs = {'name': "nv"}) else " "
    votes.append(vote)

    # Description of the Movies
    describe = store.find_all('p', class_ = 'text-muted')
    description_ = describe[1].text.replace('\n', '') if len(describe) > 1 else ' '
    description.append(description_)

    ## Director
    ps = store.find_all('p')
    for p in ps:
        if 'Director' in p.text:
            director = p.find('a').text
            director_s.append(director)

    ## ID
    imdbID = store.find('span', 'rating-cancel').a['href'].split('/')[2]
    imdb_id.append(imdbID)

    ## actors
    star = store.find("p", attrs={"class": ""}).text.replace("Stars:", "").replace("\n", "").replace("Director:", "").strip()
    starList.append(star)
Error:
AttributeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_17576/2711511120.py in <module>
63
64 ## IDs
---> 65 imdbID = store.find('span','rating-cancel').a['href'].split('/')[2] if store.find('span','rating-cancel').a['href'].split('/')[2] else ' '
66 imdb_id.append(imdbID)
67
AttributeError: 'NoneType' object has no attribute 'a'
Change your condition to the following, because first you have to check whether the <span> exists:
imdbID = store.find('span','rating-cancel').a.get('href').split('/')[2] if store.find('span','rating-cancel') else ' '
Example
Check the URL; some of the <span> elements are missing there:
import requests
from bs4 import BeautifulSoup
#the whole core of the script
url = "https://www.imdb.com/search/title/?title_type=feature&primary_language=hi&sort=my_ratings,desc"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
movie_data = soup.find_all('div', attrs = {'class': 'lister-item mode-advanced'})
for store in movie_data:
    imdbID = store.find('span','rating-cancel').a.get('href').split('/')[2] if store.find('span','rating-cancel') else ' '
    print(imdbID)
Output
tt9900050
tt9896506
tt9861220
tt9810436
tt9766310
tt9766294
tt9725058
tt9700334
tt9680166
tt9602804
Even better, scrape the id via the image tag, because it is always there even if there is only the placeholder:
imdbID = store.img.get('data-tconst')
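For example, applied inside the loop from the question (a minimal sketch):

for store in movie_data:
    # the thumbnail image carries the title id in its data-tconst attribute
    imdbID = store.img.get('data-tconst')
    imdb_id.append(imdbID)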
I am creating a project that scrapes Indeed's website. It was working fine, but when I ran it today, all of a sudden, without having made any changes, instead of returning the entire page of results it now only displays the first result, duplicated. Can someone help me correct this?
from tkinter import *
import random
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd
import requests
html_text = requests.get('https://www.ign.com/').text
soup = BeautifulSoup(html_text, 'lxml')
jobs = soup.find('section',class_='right')
#print(html_text)
driver = webdriver.Chrome(executable_path='/Users/Miscellaneous/PycharmProjects/RecursivePractice/chromedriver')
url= "https://www.indeed.com/jobs?q=developer&l=Westbury%2C%20NY&vjk=0b0cbe29e5f86422"
driver.maximize_window()
driver.get(url)
time.sleep(5)
content = driver.page_source.encode('utf-8').strip()
soup = BeautifulSoup(content,"html.parser")
officials = soup.findAll("a", {"class": "tapItem"})
for official in officials:
    jobTitle = soup.find('h2',{'class': 'jobTitle'}).text
    companyName = soup.find('div',{'class': 'comapny_location'})
    location = soup.find('div',{'class': 'companyLocation'}).text
    salary = soup.find('div',{'class': 'salary-snippet'})
    actualSalary = salary.find('span').text
    summary = soup.find('div',{'class': 'job-snippet'}).text
    print('Title: ' + str(jobTitle) + '\nCompany Name: ' + str(companyName) + '\nLocation: ' + str(location)
          + '\nSalary: ' + str(actualSalary) + "\nSummary: " + str(summary))
    #print(str(official))
    print(' ')
driver.quit()
Try this. The problem is that inside the loop you always call soup.find(...), which returns only the first match on the whole page, so every iteration prints the same job. Index into the full result lists instead:
from tkinter import *
import random
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd
import requests
html_text = requests.get('https://www.ign.com/').text
soup = BeautifulSoup(html_text, 'lxml')
jobs = soup.find('section',class_='right')
driver = webdriver.Chrome(executable_path='/Users/Miscellaneous/PycharmProjects/RecursivePractice/chromedriver')
url= "https://www.indeed.com/jobs?q=developer&l=Westbury%2C%20NY&vjk=0b0cbe29e5f86422"
driver.maximize_window()
driver.get(url)
time.sleep(5)
content = driver.page_source.encode('utf-8').strip()
soup = BeautifulSoup(content,"html.parser")
officials = soup.findAll("a",{"class":"tapItem"})
for i in range(len(officials)):
    jobTitle = soup.findAll('h2',{'class': 'jobTitle'})[i].text
    companyName = soup.findAll('div',{'class': 'comapny_location'})[i].text if len(soup.findAll('div',{'class': 'comapny_location'})) > i else "NULL"
    location = soup.findAll('div',{'class': 'companyLocation'})[i].text if len(soup.findAll('div',{'class': 'companyLocation'})) > i else "NULL"
    salary = soup.findAll('div',{'class': 'salary-snippet'})[i].text if len(soup.findAll('div',{'class': 'salary-snippet'})) > i else "NULL"
    actualSalary = salary  # salary is already plain text (or "NULL"); no further lookup needed
    summary = soup.findAll('div',{'class': 'job-snippet'})[i].text if len(soup.findAll('div',{'class': 'job-snippet'})) > i else "NULL"
    print('Title: ' + str(jobTitle) + '\nCompany Name: ' + str(companyName) + '\nLocation: ' + str(location)
          + '\nSalary: ' + str(actualSalary) + "\nSummary: " + str(summary))
    print(' ')
driver.quit()
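As a design note, an alternative sketch is to search within each official card instead of re-querying the full soup on every iteration; the selectors below mirror the ones from the question:

for official in officials:
    jobTitle = official.find('h2', {'class': 'jobTitle'})
    location = official.find('div', {'class': 'companyLocation'})
    salary = official.find('div', {'class': 'salary-snippet'})
    summary = official.find('div', {'class': 'job-snippet'})
    print('Title: ' + (jobTitle.text if jobTitle else 'NULL')
          + '\nLocation: ' + (location.text if location else 'NULL')
          + '\nSalary: ' + (salary.text if salary else 'NULL')
          + '\nSummary: ' + (summary.text if summary else 'NULL'))
    print(' ')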
Below is the code. When I run the script, it starts over from the first page while running. The aim is to get the post title, date, and body from each page, then click Next at the bottom of the page and start the process again.
Here are the imports:
import requests
import csv
import urllib.parse as urlparse
from urllib.parse import parse_qs
from bs4 import BeautifulSoup
from selenium import webdriver
import time
browser = webdriver.Chrome('/Users/Xander/desktop/scraper/chromedriver')
URL = "https://www.jancox.com/jans-daily-news"
browser.get(URL)
URL_PAG = None
PAG = None
# Function Definition
def scrapeP(r):
    count = 0
    soup = BeautifulSoup(r.content, 'html5lib')  # If this line causes an error, run 'pip install html5lib' or install html5lib
    quotes = []
    table = soup.find('div', attrs = {'class':'main-content'})
    for post in table.findAll('div', attrs = {'class':'post'}):
        quote = {}
        quote['title'] = post.h1.text
        print(quote['date_published'])
        doc = browser.find_elements_by_xpath('/html/body/div/div/div[2]/div/div[1]/div/div[2]/nav/div/ul/li[2]/a')[0]
        time.sleep(2)
        doc.click()
        URL_PAG = browser.current_url
        count += 1
        PAG = True
        time.sleep(10)
        print(count, ' - ', URL_PAG)
        if(count % 10 == 0):
            filename = 'post.csv'
            with open(filename, 'a+', newline='') as f:
                w = csv.DictWriter(f,['title', 'post', 'date_published'])
                w.writeheader()
                for quote in quotes:
                    w.writerow(quote)
                    quote.clear()

while True:
    if(PAG == True):
        browser.get(URL_PAG)
        r = requests.get(URL_PAG)
        print(URL_PAG)
        scrapeP(r)
    else:
        browser.get(URL)
        r = requests.get(URL)
        scrapeP(r)
The pagination never starts: the get request that you see is the one fired at line 3, browser.get(URL).
PAG is None, and you only change its value inside your scrapeP function, so its value never changes outside the function's scope.
Define
global PAG
inside your scrapeP function to change its value from within scrapeP:
>>> PAG = None
>>> def scrapeP():
... global PAG
... PAG = True
...
>>> scrapeP()
>>> PAG
True
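Applied to the function in the question, a minimal sketch (note that URL_PAG, which is also assigned inside scrapeP and read back in the while loop, has the same scoping problem):

PAG = None
URL_PAG = None

def scrapeP(r):
    global PAG, URL_PAG  # rebind the module-level names instead of creating locals
    # ... the scraping logic from the question ...
    URL_PAG = browser.current_url
    PAG = True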
The issue was that I had the click-Next and print logic inside the for loop. I needed to let the for loop read the entire page before clicking Next.
for post in table.findAll('div', attrs = {'class':'post'}):
    quote = {}
    quote['title'] = post.h1.text
    quote['date_published'] = post.time.text
    quote['post'] = post.div.text.strip()
    print(quote['date_published'])
    quotes.append(quote)

time.sleep(2)
doc = browser.find_elements_by_xpath('/html/body/div/div/div[2]/div/div[1]/div/div[2]/nav/div/ul/li[2]/a')[0]
doc.click()
time.sleep(2)
URL_PAG = browser.current_url
count += 1
PAG = True
time.sleep(2)

filename = 'post.csv'
with open(filename, 'a+', newline='') as f:
    w = csv.DictWriter(f,['title', 'post', 'date_published'])
    w.writeheader()
    for quote in quotes:
        w.writerow(quote)
        quote.clear()
    f.close()
I am trying to scrape some news. I have a larger list of 3k articles from this site, selected by criteria, and (considering I am new to Python) I came up with this script to scrape them:
import pandas as pd
import bs4
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv
# get the URL list
list1 = []
a = 'https://www.dnes.bg/sofia/2019/03/13/borisov-se-pohvali-prihodite-ot-gorivata-sa-sys-7-poveche.404467'
b = 'https://www.dnes.bg/obshtestvo/2019/03/13/pazim-ezika-si-pravopis-pod-patronaja-na-radeva.404462'
c = 'https://www.dnes.bg/politika/2019/01/03/politikata-nekanen-gost-na-praznichnata-novogodishna-trapeza.398091'
list1.append(a)
list1.append(b)
list1.append(c)
# define the variables
#url = "https://www.dnes.bg/politika/2019/01/03/politikata-nekanen-gost-na-praznichnata-novogodishna-trapeza.398091"
list2 = list1 #[0:10]
#type(list2)
href = []
title = []
subtitle = []
time = []
article = []
art1 = []
#
#dd = soup.find("div", "art_author").text
#dd
filename = "scraped.csv"
f = open(filename, "w")
#headers = "href;title;subtitle;time;article\n"
headers = "title;subtitle;time;article\n"
f.write(headers)
for url in list2:
    html = urlopen(url)
    soup = BeautifulSoup(html, 'lxml').decode('windows-1251')
    href = url
    title = soup.find("h1", "title").string
    #title = soup.find("h1", "title").string
    #title.extend(soup.find("h1", "title").string) # the title string
    subtitle = soup.find("div", "descr").string
    #subtitle.extend(soup.find("div", "descr").string) # the subtitle string
    time = soup.find("div", "art_author").text
    #time.extend(soup.find("div", "art_author").text)
    #par = soup.find("div", id="art_start").find_all("p")
    art1.extend(soup.find("div", id="art_start").find_all("p"))
    for a in art1:
        #article.extend(art1.find_all("p"))
        article = ([a.text.strip()])
        break
    #href = "".join(href)
    title = "".join(title)
    subtitle = "".join(subtitle)
    time = "".join(time)
    article = "".join(article)
    #f.write(href + ";" + title + ";" + subtitle + ";" + time + ";" + article + "\n")
    f.write(title + ";" + subtitle + ";" + time + ";" + article + "\n")
f.close()
The main problem for now is that I get an error:
File "<ipython-input-12-9a796b182a82>", line 24, in <module>
title = soup.find("h1", "title").string
TypeError: slice indices must be integers or None or have an __index__ method
I can't really find a solution to this.
And the second problem is that whenever I succeed in scraping a site, some empty cells occur, which means I have to find a way to deal with the Ajax-loaded content.
I use Anaconda version 2018.12.
Something I stumbled upon (here: https://www.youtube.com/watch?v=FSH77vnOGqU):
import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
class Page(QWebEnginePage):
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def _on_load_finished(self):
        self.html = self.toHtml(self.Callable)
        print('Load finished')

    def Callable(self, html_str):
        self.html = html_str
        self.app.quit()

def main():
    page = Page('https://pythonprogramming.net/parsememcparseface/')
    soup = bs.BeautifulSoup(page.html, 'html.parser')
    js_test = soup.find('p', class_='jstest')
    print(js_test.text)

if __name__ == '__main__': main()
Ok. I fixed the issue with your soup object being stored as a string: the .decode('windows-1251') call turns the BeautifulSoup object into a plain str, so soup.find was the string method (hence the slice-indices TypeError) instead of bs4's find. With that removed you can use bs4 to parse the html. I also opted to use pandas .to_csv(), as I'm just more familiar with it, but it gets you the desired output:
import pandas as pd
from bs4 import BeautifulSoup
import requests
# get the URL list
list1 = []
a = 'https://www.dnes.bg/sofia/2019/03/13/borisov-se-pohvali-prihodite-ot-gorivata-sa-sys-7-poveche.404467'
b = 'https://www.dnes.bg/obshtestvo/2019/03/13/pazim-ezika-si-pravopis-pod-patronaja-na-radeva.404462'
c = 'https://www.dnes.bg/politika/2019/01/03/politikata-nekanen-gost-na-praznichnata-novogodishna-trapeza.398091'
list1.append(a)
list1.append(b)
list1.append(c)
# define the variables
#url = "https://www.dnes.bg/politika/2019/01/03/politikata-nekanen-gost-na-praznichnata-novogodishna-trapeza.398091"
list2 = list1 #[0:10]
#type(list2)
results = pd.DataFrame()
for url in list2:
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    href = url
    title = soup.find("h1", "title").text
    #title = soup.find("h1", "title").string
    #title.extend(soup.find("h1", "title").string) # the title string
    subtitle = soup.find("div", "descr").text
    #subtitle.extend(soup.find("div", "descr").string) # the subtitle string
    time = soup.find("div", "art_author").text
    #time.extend(soup.find("div", "art_author").text)
    #par = soup.find("div", id="art_start").find_all("p")
    art1 = soup.find("div", id="art_start").find_all("p")
    article = []
    for a in art1:
        if 'googletag.cmd.push' not in a.text:
            article.append(a.text.strip())
    article = ' '.join(article)

    temp_df = pd.DataFrame([[title, subtitle, time, article]], columns = ['title','subtitle','time','article'])
    results = results.append(temp_df).reset_index(drop=True)

results.to_csv("scraped.csv", index=False, encoding='utf-8-sig')
Output:
print (results.to_string())
title subtitle time article
0 Борисов се похвали: Приходите от горивата са с... Мерките за изсветляване на сектора действат, к... Обновена: 13 мар 2019 13:24 | 13 мар 2019 11:3... Приходите от горивата са със 7% повече. Това с...
1 "Пазим езика си": Правопис под патронажа на Ра... Грамотността зависи не само от училището, смят... Обновена: 13 мар 2019 11:34 | 13 мар 2019 11:2... За втора поредна година Сдружение "Живата вода...
2 Политиката – "неканен гост" на празничната нов... Основателни ли бяха критиките на президента Ру... 3 яну 2019 10:45, Цветелин Димитров Оказа ли се политиката "неканен гост" на празн...
I'm scraping a website using Selenium, Scrapy and PhantomJS. The problem is that, although the code scrolls the page perfectly, it extracts links only up to a certain limit; beyond that, it completely ignores the result of scrolling. When I use the Firefox webdriver it works perfectly, but since I'm running the code on a server I used PhantomJS, and that is where I hit the problem. Below is the code:
# -*- coding: utf-8 -*-
from scrapy.spider import BaseSpider
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import csv
import re
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
class DukeSpider(BaseSpider):
    name = "dspider"
    allowed_domains = ["dukemedicine.org"]
    start_urls = ["http://www.dukemedicine.org/find-doctors-physicians"]  # hlor

    def __init__(self):
        self.driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
        self.driver.maximize_window()
        print 'here'

    def parse(self, response):
        print 'nowhere'
        print response
        print response.url
        b = open('doc_data_duke.csv', 'a')
        a = csv.writer(b, lineterminator='\n')
        print 'a'
        self.driver.get(response.url)
        time.sleep(10)
        wait = WebDriverWait(self.driver, 10)
        print 'helo'
        click = self.driver.find_element_by_xpath("//span[@id='specialty']")
        click.click()
        click_again = self.driver.find_element_by_xpath("//ul[@class='doctor-type']/li[@class='ng-binding ng-scope'][2]")
        click_again.click()
        time.sleep(25)
        act = ActionChains(self.driver)
        act.move_to_element(self.driver.find_element_by_id('doctor-matrix-section')).click()
        print 'now here'
        for i in range(0, 75):
            #self.driver.find_element_by_xpath("//div[@id='doctor-matrix-section']").send_keys(Keys.PAGE_DOWN)
            #self.driver.execute_script("window.scrollBy(0, document.body.scrollHeight);")
            #self.driver.find_element_by_tag_name("body").click()
            #self.driver.find_element_by_tag_name("body").send_keys(Keys.PAGE_DOWN)  # findElement(By.tagName("body")).sendKeys(Keys.UP);
            #self.driver.find_element_by_tag_name("body").send_keys(Keys.END)
            #bg = self.driver.find_element_by_css_selector('body')
            #bg.send_keys(Keys.SPACE)
            act.send_keys(Keys.PAGE_DOWN).perform()
            time.sleep(2)
            print i
            i += 1
        links = self.driver.find_elements_by_xpath("//div[@class = 'result-information']/div[@class='name']/a")
        for l in links:
            print l
            doc_list = l.get_attribute('href')
            if re.match(r'https:\/\/www\.dukemedicine\.org\/find-doctors-physicians\/#!\/(.*)', doc_list):
                print doc_list
                dr = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
                dr.maximize_window()
                dr.get(doc_list)
                try:
                    name_title = dr.find_element_by_xpath('//div[@class="header1 ng-binding"]').text
                    name_titles = name_title.split(",", 1)
                    name = name_titles[0].encode('utf-8')
                    title = name_titles[1]
                    print name.encode('utf-8')
                    title = title[1:].encode('utf-8')
                    print title.encode('utf-8')
                except:
                    name = ''
                    title = ''
                try:
                    speciality = dr.find_element_by_xpath('//p[@class="specialties ng-scope"]').text
                except:
                    speciality = ''
                try:
                    language = dr.find_element_by_xpath(
                        '//div[@class="lang ng-scope"]/div[@class="plainText inline ng-binding"]').text
                except:
                    language = ''
                if dr.find_elements_by_xpath('//div[@class="location-info"]'):
                    locations = dr.find_elements_by_xpath('//div[@class="location-info"]')
                    if len(locations) >= 3:
                        locationA = locations[0].text.encode('utf-8')
                        locationA = locationA.replace('Directions', '')
                        locationA = locationA.replace('\n', '')
                        locationB = locations[1].text.encode('utf-8')
                        locationB = locationB.replace('Directions', '')
                        locationB = locationB.replace('\n', '')
                        locationC = locations[2].text.encode('utf-8')
                        locationC = locationC.replace('\n', '')
                        locationC = locationC.replace('Directions', '')
                    elif len(locations) == 2:
                        locationA = locations[0].text.encode('utf-8')
                        locationA = locationA.replace('Directions', '')
                        locationA = locationA.replace('\n', '')
                        locationB = locations[1].text.encode('utf-8')
                        locationB = locationA.replace('Directions', '')
                        locationB = locationB.replace('\n', '')
                        locationC = ''
                    elif len(locations) == 1:
                        locationA = locations[0].text.encode('utf-8')
                        locationA = locationA.replace('Directions', '')
                        locationA = locationA.replace('\n', '')
                        locationB = ''
                        locationC = ''
                else:
                    locationA = ''
                    locationB = ''
                    locationC = ''
                dr.close()
                data = [title, name, speciality, language, locationA, locationB, locationC]
                print 'aaaa'
                print data
                a.writerow(data)
No matter what higher value I set in the range, it ignores results beyond a certain point.
Let's use the fact that there is an element having the total result count:
The idea is to iteratively scroll into view of the last found doctor until we've got all doctors loaded.
Implementation (with clarifying comments, leaving only the relevant Selenium-specific part):
# -*- coding: utf-8 -*-
import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true', '--load-images=false'])
# driver = webdriver.Chrome()
driver.maximize_window()
driver.get("http://www.dukemedicine.org/find-doctors-physicians")
# close optional survey popup if exists
try:
    driver.find_element_by_css_selector("area[alt=close]").click()
except NoSuchElementException:
    pass

# open up filter dropdown
click = driver.find_element_by_id("specialty")
click.click()

# choose specialist
specialist = driver.find_element_by_xpath("//ul[@class = 'doctor-type']/li[contains(., 'specialist')]")
specialist.click()

# artificial delay: TODO: fix?
time.sleep(15)

# read total results count
total_count = int(driver.find_element_by_id("doctor-number").text)

# get the initial results count
results = driver.find_elements_by_css_selector("div.doctor-result")
current_count = len(results)

# iterate while all of the results would not be loaded
while current_count < total_count:
    driver.execute_script("arguments[0].scrollIntoView();", results[-1])
    results = driver.find_elements_by_css_selector("div.doctor-result")
    current_count = len(results)
    print "Current results count: %d" % current_count

# report total results
print "----"
print "Total results loaded: %d" % current_count
driver.quit()
Works for me perfectly in both PhantomJS and Chrome. Here is what I get on the console:
Current results count: 36
Current results count: 54
Current results count: 72
Current results count: 90
...
Current results count: 1656
Current results count: 1674
Current results count: 1692
Current results count: 1708
----
Total results loaded: 1708
Additionally, note that I've added the --load-images=false command-line argument, which speeds things up dramatically.