I'm scraping a website using Selenium, Scrapy and PhantomJS. The problem is that although the code scrolls the page perfectly, it only extracts links up to a certain limit; beyond that it completely ignores the result of the scrolling. When I use the Firefox webdriver, it works perfectly. Since I'm running the code on a server, I used PhantomJS and thus encountered the problem. Below is the code:
# -*- coding: utf-8 -*-
from scrapy.spider import BaseSpider
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import csv
import re
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
class DukeSpider(BaseSpider):
name = "dspider"
allowed_domains = ["dukemedicine.org"]
start_urls = ["http://www.dukemedicine.org/find-doctors-physicians"] #hlor
def __init__(self):
self.driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
self.driver.maximize_window()
print 'here'
def parse(self, response):
print 'nowhere'
print response
print response.url
b = open('doc_data_duke.csv', 'a')
a = csv.writer(b, lineterminator='\n')
print 'a'
self.driver.get(response.url)
time.sleep(10)
wait = WebDriverWait(self.driver, 10)
print 'helo'
click = self.driver.find_element_by_xpath("//span[@id='specialty']")
click.click()
click_again = self.driver.find_element_by_xpath("//ul[@class='doctor-type']/li[@class='ng-binding ng-scope'][2]")
click_again.click()
time.sleep(25)
act = ActionChains(self.driver)
act.move_to_element(self.driver.find_element_by_id('doctor-matrix-section')).click()
print 'now here'
for i in range(0, 75):
#self.driver.find_element_by_xpath("//div[@id='doctor-matrix-section']").send_keys(Keys.PAGE_DOWN)
#self.driver.execute_script("window.scrollBy(0, document.body.scrollHeight);")
#self.driver.find_element_by_tag_name("body").click()
#self.driver.find_element_by_tag_name("body").send_keys(Keys.PAGE_DOWN)#findElement(By.tagName("body")).sendKeys(Keys.UP);
#self.driver.find_element_by_tag_name("body").send_keys(Keys.END)
#bg = self.driver.find_element_by_css_selector('body')
#bg.send_keys(Keys.SPACE)
act.send_keys(Keys.PAGE_DOWN).perform()
time.sleep(2)
print i
i += 1
links = self.driver.find_elements_by_xpath("//div[@class = 'result-information']/div[@class='name']/a")
for l in links:
print l
doc_list = l.get_attribute('href')
if re.match(r'https:\/\/www\.dukemedicine\.org\/find-doctors-physicians\/#!\/(.*)', doc_list):
print doc_list
dr = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
dr.maximize_window()
dr.get(doc_list)
try:
name_title = dr.find_element_by_xpath('//div[@class="header1 ng-binding"]').text
name_titles = name_title.split(",", 1)
name = name_titles[0].encode('utf-8')
title = name_titles[1]
print name.encode('utf-8')
title = title[1:].encode('utf-8')
print title.encode('utf-8')
except:
name = ''
title = ''
try:
speciality = dr.find_element_by_xpath('//p[@class="specialties ng-scope"]').text
except:
speciality = ''
try:
language = dr.find_element_by_xpath(
'//div[@class="lang ng-scope"]/div[@class="plainText inline ng-binding"]').text
except:
language = ''
if dr.find_elements_by_xpath('//div[@class="location-info"]'):
locations = dr.find_elements_by_xpath('//div[@class="location-info"]')
if len(locations) >= 3:
locationA = locations[0].text.encode('utf-8')
locationA = locationA.replace('Directions', '')
locationA = locationA.replace('\n', '')
locationB = locations[1].text.encode('utf-8')
locationB = locationB.replace('Directions', '')
locationB = locationB.replace('\n', '')
locationC = locations[2].text.encode('utf-8')
locationC = locationC.replace('\n', '')
locationC = locationC.replace('Directions', '')
elif len(locations) == 2:
locationA = locations[0].text.encode('utf-8')
locationA = locationA.replace('Directions', '')
locationA = locationA.replace('\n', '')
locationB = locations[1].text.encode('utf-8')
locationB = locationB.replace('Directions', '')
locationB = locationB.replace('\n', '')
locationC = ''
elif len(locations) == 1:
locationA = locations[0].text.encode('utf-8')
locationA = locationA.replace('Directions', '')
locationA = locationA.replace('\n', '')
locationB = ''
locationC = ''
else:
locationA = ''
locationB = ''
locationC = ''
dr.close()
data = [title, name, speciality, language, locationA, locationB, locationC]
print 'aaaa'
print data
a.writerow(data)
No matter what higher value I set in the range, it ignores results beyond a certain point.
Let's use the fact that there is an element on the page holding the total result count.
The idea is to iteratively scroll the last found doctor into view until all doctors are loaded.
Implementation (with clarifying comments, keeping only the relevant Selenium-specific part):
# -*- coding: utf-8 -*-
import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true', '--load-images=false'])
# driver = webdriver.Chrome()
driver.maximize_window()
driver.get("http://www.dukemedicine.org/find-doctors-physicians")
# close optional survey popup if exists
try:
driver.find_element_by_css_selector("area[alt=close]").click()
except NoSuchElementException:
pass
# open up filter dropdown
click = driver.find_element_by_id("specialty")
click.click()
# choose specialist
specialist = driver.find_element_by_xpath("//ul[@class = 'doctor-type']/li[contains(., 'specialist')]")
specialist.click()
# artificial delay: TODO: fix?
time.sleep(15)
# read total results count
total_count = int(driver.find_element_by_id("doctor-number").text)
# get the initial results count
results = driver.find_elements_by_css_selector("div.doctor-result")
current_count = len(results)
# iterate until all of the results are loaded
while current_count < total_count:
driver.execute_script("arguments[0].scrollIntoView();", results[-1])
results = driver.find_elements_by_css_selector("div.doctor-result")
current_count = len(results)
print "Current results count: %d" % current_count
# report total results
print "----"
print "Total results loaded: %d" % current_count
driver.quit()
Works for me perfectly in both PhantomJS and Chrome. Here is what I get on the console:
Current results count: 36
Current results count: 54
Current results count: 72
Current results count: 90
...
Current results count: 1656
Current results count: 1674
Current results count: 1692
Current results count: 1708
----
Total results loaded: 1708
Additionally, note that I've added the --load-images=false command-line argument, which speeds things up dramatically.
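As a further refinement, the artificial time.sleep(15) marked with a TODO above could probably be replaced with an explicit wait. A minimal sketch, assuming the total-count element is empty or non-numeric until the filtered results have loaded (an assumption, not verified against the page):
from selenium.webdriver.support.ui import WebDriverWait
# poll until the results counter is populated instead of sleeping a fixed 15 seconds
WebDriverWait(driver, 30).until(lambda d: d.find_element_by_id("doctor-number").text.strip().isdigit())
total_count = int(driver.find_element_by_id("doctor-number").text)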
Related
I know you've probably seen 100 Indeed scraping posts on here, and I'm hoping mine is a bit different. Essentially, I'm trying to build an Indeed job scraper that pulls company name and job title, based on a search where "job title" and "location" are variables. Additionally, when Selenium opens Chrome, Indeed auto-populates my location, which doesn't get overwritten by the location I've input in the code.
I'm fairly new to Python, and I'm relying on a foundation built by someone else on GitHub, so I am having trouble diagnosing the problem.
Would love any help or insight!
Here is my code:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import bs4
import re
import requests
import json
from time import sleep
list_of_description = ["warehouse","associate"]
URL = "https://www.indeed.com/"
MAIN_WINDOW_HANDLER = 0
JOB_TITLE = " "
JOB_LOCATION = " "
JSON_DICT_ARRAY = []
def main():
pageCounter = 0
bool_next = True
newUrl = ""
# theUrl = "https://ca.indeed.com/jobs?q=developer&l=Winnipeg%2C+MB"
browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
browser.get( URL )
# Change text in where
whatElement = browser.find_element(By.ID,"text-input-what")
whatElement.send_keys( JOB_TITLE )
# Change text in where
whereElement = browser.find_element(By.ID,"text-input-where")
whereElement.send_keys(Keys.CONTROL + "a")
whereElement.send_keys(Keys.BACK_SPACE)
whereElement.send_keys( JOB_LOCATION )
whereElement.submit()
MAIN_WINDOW_HANDLER = browser.window_handles[0]
fileName = "{} Jobs in {}.json".format(JOB_TITLE, JOB_LOCATION)
newPage = True
nextNumber = 2
searchPhrase = '//span[contains(text(), "{0}") and @class="pn"]'.format(nextNumber)
currentHTML = browser.page_source
linkElements = getElementFromHTML('div .title', currentHTML) # searching for div tags with title class
reqResultText = currentHTML #(download_file(URL)).text
browser.get( browser.current_url )
browser.get( browser.current_url )
scrapeJobListing(linkElements, reqResultText, browser, MAIN_WINDOW_HANDLER)
if( check_exists_by_xpath(browser, '//button[@id="onetrust-accept-btn-handler"]') ):
try:
theElement = browser.find_element(By.XPATH, '//button[@id="onetrust-accept-btn-handler"]' )
print(type(theElement))
theElement.click()
print("I clicked")
# scrapeJobListing(linkElements, reqResultText, browser, MAIN_WINDOW_HANDLER)
while ( newPage and check_exists_by_xpath(browser, searchPhrase) ):
theElement = browser.find_elements(By.XPATH, searchPhrase )
try:
theElement[0].click()
except:
newPage = False
if(newPage):
browser.get(browser.current_url)
print(browser.current_url)
nextNumber += 1
searchPhrase = '//span[contains(text(), "{0}") and @class="pn"]'.format(nextNumber)
currentHTML = browser.page_source
linkElements = getElementFromHTML('div .title', currentHTML) # searching for div tags with title class
reqResultText = currentHTML #(download_file(URL)).text
scrapeJobListing(linkElements, reqResultText, browser, MAIN_WINDOW_HANDLER)
else:
print ("Search Concluded")
except:
# scrapeJobListing(linkElements, reqResultText, browser, MAIN_WINDOW_HANDLER)
while ( newPage and check_exists_by_xpath(browser, searchPhrase) ):
theElement = browser.find_elements(By.XPATH, searchPhrase )
try:
theElement[0].click()
except:
newPage = False
if(newPage):
browser.get(browser.current_url)
print(browser.current_url)
nextNumber += 1
searchPhrase = '//span[contains(text(), "{0}") and @class="pn"]'.format(nextNumber)
currentHTML = browser.page_source
linkElements = getElementFromHTML('div .title', currentHTML) # searching for div tags with title class
reqResultText = currentHTML #(download_file(URL)).text
scrapeJobListing(linkElements, reqResultText, browser, MAIN_WINDOW_HANDLER)
else:
print ("Search Concluded")
with open(fileName, "w") as data:
for it in JSON_DICT_ARRAY:
data.write(json.dumps(it))
data.write(",\n")
data.close()
def scrapeJobListing(linkElements, reqResultText, browser, mainHandler):
jobDes = ""
for i in range( len(linkElements) ):
print("\n ",i)
jsonDataDict = {}
list = re.findall(r'["](.*?)["]',str(linkElements[i]))
currJobMap = "jobmap[{}]= ".format(i)
openBracketIndex = reqResultText.find(currJobMap) + len(currJobMap)
findNewString = reqResultText[openBracketIndex:openBracketIndex+600]
print (findNewString)
closeBracketIndex = findNewString.find("}") + 1
cmpOpen = findNewString.find("cmp:'") + len("cmp:'")
cmpClose = findNewString.find("',cmpesc:")
titleOpen = findNewString.find("title:'") + len("title:'")
titleClose = findNewString.find("',locid:")
parsedString = str( findNewString[0:closeBracketIndex] )
print (parsedString)
print("\n")
cmpName = parsedString[cmpOpen:cmpClose]# Company Name
jobTitle = parsedString[titleOpen:titleClose]# Job Title
jsonDataDict['(2) Company Name'] = cmpName
jsonDataDict['(1) Job Title'] = jobTitle
try:
title = browser.find_element(By.ID,list[4]) # 4th quotation is the Job Description
print('Found <%s> element with that class name!' % (title.tag_name))
title.click()
window_after = browser.window_handles[1]
browser.switch_to.window(window_after)
theCurrURL = browser.current_url
browser.get(theCurrURL)
currPageSource = browser.page_source
jsonDataDict['(4) Job Link'] = theCurrURL
print (theCurrURL)
jobDes = getElementFromHTML('div #jobDescriptionText', currPageSource)
soup = bs4.BeautifulSoup(str(jobDes), "html.parser")
jobDescText = soup.get_text('\n')
jsonDataDict['(3) Job Description'] = jobDescText
JSON_DICT_ARRAY.append(jsonDataDict)
browser.close()
print(jobDes)
except:
print('Was not able to find an element with that name.')
# sleep(2)
print (mainHandler)
browser.switch_to.window(mainHandler) #Not necessary right?
def getElementBySearch(searchTag, theURL):
reqResult = download_file(theURL)
soup = bs4.BeautifulSoup(reqResult.text, "html.parser")
element = soup.select(searchTag)
return element
def getElementFromHTML(searchTag, htmlText):
soup = bs4.BeautifulSoup(htmlText, "html.parser")
element = soup.select(searchTag)
return element
def check_exists_by_xpath(webdriver, xpath):
try:
webdriver.find_elements(By.XPATH,xpath)
except NoSuchElementException:
return False
return True
def download_file(searchPhrase):
result = requests.get(searchPhrase)
# type(result)
# Check for error
try:
result.raise_for_status()
except Exception as exc:
print('There was a problem: %s' % (exc))
return result
if __name__== "__main__":
main()
Right now, the script essentially opens Indeed, pages through each results page, and prints the links. But I'm not sure why it's not providing the job title and company information.
The output looks like this -
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=10&vjk=cbe41d08db5e3eaa
CDwindow-07A1095B627FCB670750FBBDC552B60D
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=20&vjk=cbe41d08db5e3eaa
CDwindow-07A1095B627FCB670750FBBDC552B60D
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=30&vjk=cbe41d08db5e3eaa
CDwindow-07A1095B627FCB670750FBBDC552B60D
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=40&vjk=2fd38d5eb42b6ca4
CDwindow-07A1095B627FCB670750FBBDC552B60D
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=50&vjk=acbe5c6e5afce1d6
CDwindow-07A1095B627FCB670750FBBDC552B60D
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=60&vjk=acbe5c6e5afce1d6
CDwindow-07A1095B627FCB670750FBBDC552B60D
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=70&vjk=acbe5c6e5afce1d6
CDwindow-07A1095B627FCB670750FBBDC552B60D
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=80&vjk=acbe5c6e5afce1d6
CDwindow-07A1095B627FCB670750FBBDC552B60D
Search Concluded
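One thing worth checking is the 'div .title' selector: Indeed has reworked its results markup several times, so the BeautifulSoup lookup may simply be matching nothing, which would leave the company/title parsing with empty input. Below is a minimal sketch of pulling title and company straight from the result cards with Selenium instead; the div.job_seen_beacon, h2.jobTitle and span.companyName selectors are assumptions about Indeed's current markup and may need adjusting:
from selenium.webdriver.common.by import By
# iterate over the visible result cards on the current page
for card in browser.find_elements(By.CSS_SELECTOR, "div.job_seen_beacon"):
    try:
        title = card.find_element(By.CSS_SELECTOR, "h2.jobTitle").text
        company = card.find_element(By.CSS_SELECTOR, "span.companyName").text
        print(title, "-", company)
    except Exception:
        # card without the expected structure (e.g. a sponsored block) -- skip it
        continue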
I'm a fresher in this field. I'm trying to navigate through a web page to scrape data; when I execute the code, it scrapes the first page but never navigates to the next pages. I have tried many ways but couldn't figure it out. Please check my code below; I have written code for the pagination. Can anyone please help me with this? Thanks in advance.
import xlwt
from selenium import webdriver
import re
import time
from datetime import date
class kotsovolosmobiles:
def __init__(self):
self.url='https://www.kotsovolos.gr/mobile-phones-gps/mobile-phones/smartphones?pageSize=60'
self.country='GR'
self.currency='euro'
self.VAT= 'Included'
self.shipping = 'Available for shipment'
self.Pre_PromotionPrice ='N/A'
def kotsovolos(self):
wb = xlwt.Workbook()
ws = wb.add_sheet('Sheet1',cell_overwrite_ok=True)
ws.write(0,0,"Product_Url")
ws.write(0,0,"Product_Manufacturer")
ws.write(0,1,"Product_Url")
ws.write(0,2,"Product_Price")
ws.write(0,3,"Product_Model")
ws.write(0,4,"Memory")
ws.write(0,5,"Currency")
ws.write(0,6,"Color")
ws.write(0,7,"VAT")
ws.write(0,8,"Shipping Cost")
ws.write(0,9,"Pre-PromotionPrice")
ws.write(0,10,"Country")
ws.write(0,11,"Date")
ws.write(0,12,"Raw_Model")
wb.save(r"C:\Users\Karthick R\Desktop\VS code\kotsovolos.xls")
driver=webdriver.Chrome()
driver.get(self.url)
today = date.today()
time.sleep(5)
cookies = driver.find_element_by_css_selector('a[id="CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"]')
cookies.click()
print("cookies accepted")
driver.maximize_window()
time.sleep(5)
titles = []
models = []
memorys = []
prod_prices = []
p_links =[]
p_colors = []
while True:
storage_box = []
storage_box = driver.find_elements_by_css_selector('div[class="product"]')
for storage_boxes in storage_box:
product_url = storage_boxes.find_element_by_css_selector('div[class="title"] a').get_attribute('href')
print(product_url)
p_links.append(product_url)
p_model = storage_boxes.find_element_by_css_selector('div[class="title"] a').text
print(p_model)
models.append(p_model)
manufacturer1 = p_model.split(" ")
print(manufacturer1[0])
titles.append(manufacturer1[0])
memory = []
memory = re.findall('\d+ ?[gG][bB]',p_model)
print(memory)
memory1 = str(memory).replace("['",'').replace("']",'').replace("[]",'').strip()
if "," in memory1:
arr=memory1.split(",")
for str1 in arr:
str2=str1.replace("GB", "").replace("gb", "").replace("'", "").strip()
if len(str2)!=1:
memory_str=str1
break
elif (memory1 == ""):
memory_str ='N/A'
else:
memory_str=memory1
memory_str = memory_str.replace("'", "").strip()
print(memory_str)
memorys.append(memory_str)
colors= []
prod_color = p_model.split(" ")
length = len(prod_color)
indexcolor = length-3
colors.append(prod_color[indexcolor])
color1 = str(colors).replace("['",'').replace("']",'').strip()
print(color1)
p_colors.append(color1)
p_price = storage_boxes.find_element_by_css_selector('.priceWithVat > .price').text
print(p_price)
prod_prices.append(p_price)
next = driver.find_element_by_css_selector('.pagination_next a')
time.sleep(3)
next.click()
print("next page")
time.sleep(3)
kotsovolos_gr = kotsovolosmobiles()
kotsovolos_gr.kotsovolos()
Maybe the page isn't loaded yet; try replacing the last block in the loop with:
next = driver.find_element_by_css_selector('.pagination_next a')
url = next.get_attribute('href')
driver.get(url)
time.sleep(3) # maybe not necessary
Try it like below. I was able to visit all of the other 5 pages.
driver.implicitly_wait(10)
driver.get("https://www.kotsovolos.gr/mobile-phones-gps/mobile-phones/smartphones?pageSize=60")
time.sleep(5) # Would be better to apply Explicit wait to click on `Απόρριψη`
driver.find_element_by_xpath("//a[contains(text(),'Απόρριψη')]").click()
nextbuttons = driver.find_element_by_xpath("//ul[@class='pagination']/li[5]/a")
length = int(nextbuttons.get_attribute("innerText"))
for i in range(2,length+1):
nextopt = driver.find_element_by_xpath("//ul[@class='pagination']/li/a[contains(text(),'{}')]".format(i))
nextopt.click()
time.sleep(5)
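As the comment in the snippet suggests, the fixed time.sleep(5) before clicking Απόρριψη could be swapped for an explicit wait. A minimal sketch of that change (same locator, just wrapped in a wait):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# wait until the cookie banner's reject link is clickable instead of sleeping a fixed 5 seconds
WebDriverWait(driver, 15).until(
    EC.element_to_be_clickable((By.XPATH, "//a[contains(text(),'Απόρριψη')]"))
).click()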
I have a problem with a scrape in Python. I need to collect patent data for multiple firms, but when a firm's patent is not available my code does not move on to the next item in the list. For example, the first firm in my list does not have a registered patent, so I want to run the code with the next firm id. At the end of the function I inserted
except Exception:
print (f'CNPJ {pj} with problem. Check the list.')
pass
but it was not enough. I would really appreciate it if someone could help me. Below is my code.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 15 03:52:42 2019
Project: Patents
@author: caique
"""
# Create List
cnpj = ['00.000.100/0000-00', '76.487.032/0001-25', '46.068.425/0001-33', '00.348.003/0001-10', '17.217.985/0001-04']
# Create Function
def patente_pj(cnpj):
import os
import pandas as pd
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from contextlib import suppress
os.chdir("/home/caique/Desktop/Patentes INPI")
# Chrome Headless
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--window-size=1920x1080")
# PATH TO THE EXECUTABLE (chromedriver)
patentes = []
try:
for pj in cnpj:
pj = str(pj).replace('.', '').replace('/', '').replace(' ', '').replace('-', '')
driver = webdriver.Chrome(executable_path = r"/home/caique/Desktop/Patentes INPI/chromedriver")
driver.get("https://gru.inpi.gov.br/pePI/jsp/patentes/PatenteSearchBasico.jsp")
driver.find_element_by_link_text("Continuar....").click()
driver.get("https://gru.inpi.gov.br/pePI/jsp/patentes/PatenteSearchBasico.jsp")
driver.find_element_by_link_text("Pesquisa Avançada").click()
destination_page_link = driver.find_element_by_xpath("//*[@id='principal']/div[7]/button")
destination_page_link.click()
driver.find_element_by_xpath(
"//*[@id='principal']/div[7]/div/table/tbody/tr[2]/td[2]/font/input").send_keys(pj)
driver.find_element_by_xpath(
"//*[@id='principal']/table[3]/tbody/tr[1]/td/font[2]/select/option[1]").click()
driver.find_element_by_xpath("//*[@id='principal']/table[3]/tbody/tr[2]/td/font/input[1]").click()
html_source = driver.page_source
html_source
soup = BeautifulSoup(html_source, 'lxml')
tabela = soup.find_all("tr")[8]
soup1 = soup.find_all("tbody")[1]
soup2 = soup.find_all("tbody")[2]
hits = len(soup2.find_all("a")) - 1
linha1 = []
for col in tabela.find_all("td"):
linha1.append(col.text)
table_dados = []
for i in range(0, len(soup1.find_all("b"))):
dict = {}
linha_teste = soup1.find_all("tr")[i]
dict[linha1[0]] = linha_teste.find_all("a")[0].text.replace('\n\t', '').replace('\n', '').replace(' ',
'')
dict[linha1[1]] = linha_teste.find_all("td")[1].text[19:29]
dict[linha1[2]] = linha_teste.find_all("b")[0].text.replace('\n\t', '').replace('\n', '').replace(' ',
'')
dict[linha1[3]] = linha_teste.find_all("font")[3].text.replace('\n\t', '').replace('\n', '').replace(
' ', '')
table_dados.append(dict)
desired_width = 700 # WIDTH
pd.set_option('display.width', desired_width)
pd.set_option('display.max_columns', 10)
tabela_de_teste = pd.DataFrame(table_dados)
driver.find_element_by_link_text("Próxima»").click()
cont = 2
for cont in range(1, int(driver.find_element_by_xpath("//*[@id='tituloEResumoContextGlobal']/font/b[3]").text)):
html_source = driver.page_source
html_source
soup = BeautifulSoup(html_source, 'lxml')
soup1 = soup.find_all("tbody")[1]
for i in range(0, len(soup1.find_all("b"))):
dict = {}
linha_teste = soup1.find_all("tr")[i]
dict[linha1[0]] = linha_teste.find_all("a")[0].text.replace('\n\t', '').replace('\n', '').replace(
' ', '')
dict[linha1[1]] = linha_teste.find_all("td")[1].text[19:29]
dict[linha1[2]] = linha_teste.find_all("b")[0].text.replace('\n\t', '').replace('\n', '').replace(
' ', '')
dict[linha1[3]] = linha_teste.find_all("font")[3].text.replace('\n\t', '').replace('\n',
'').replace(' ',
'')
table_dados.append(dict)
if cont < -1 + int(driver.find_element_by_xpath("//*[@id='tituloEResumoContextGlobal']/font/b[3]").text):
driver.find_element_by_link_text("Próxima»").click()
print(pd.DataFrame(table_dados))
driver.quit()
tabela_final = pd.DataFrame(table_dados)
patentes.append(tabela_final.to_csv('/home/caique/Desktop/Patentes INPI/CSV/patentes_'+pj+'.csv'))
except Exception:
print(f'CNPJ {pj} with problem. Check the list.')
pass
return patentes
# Run Function
patente_pj(cnpj)
Put the try ... except statements at the start and end of each loop body, so that an exception for one firm does not abort the remaining iterations.
You may also want to print the error message when the exception occurs, to make debugging easier.
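For example, here is a minimal sketch of the restructured loop; scrape_one_firm is a hypothetical stand-in for the Selenium/BeautifulSoup code from the question that builds table_dados for a single firm:
import pandas as pd

def patente_pj(cnpj):
    patentes = []
    for pj in cnpj:
        pj = str(pj).replace('.', '').replace('/', '').replace(' ', '').replace('-', '')
        try:
            # hypothetical helper wrapping the per-firm scraping logic from the question
            table_dados = scrape_one_firm(pj)
            tabela_final = pd.DataFrame(table_dados)
            patentes.append(tabela_final.to_csv(
                '/home/caique/Desktop/Patentes INPI/CSV/patentes_' + pj + '.csv'))
        except Exception as exc:
            # report the failing firm and the reason, then move on to the next one
            print(f'CNPJ {pj} with problem: {exc}. Check the list.')
            continue
    return patentes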
I have this assignment of extracting some items from each row of a table in HTML. I have figured out how to grab the whole table from the web using Selenium with Python. Following is the code for that:
from selenium import webdriver
import time
import pandas as pd
mydriver = webdriver.Chrome('C:/Program Files/chromedriver.exe')
mydriver.get("https://www.bseindia.com/corporates/ann.aspx?expandable=0")
time.sleep(5) # wait 5 seconds until DOM will load completly
table = mydriver.find_element_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_lblann"]/table/tbody')
for row in table.find_elements_by_xpath('./tr'):
print(row.text)
I can't figure out how to grab specific items from the table itself. Following are the items that I require:
Company Name
PDF Link(if it does not exist, write "No PDF Link")
Received Time
Disseminated Time
Time Taken
Description
Any help in logic would be helpful.
Thanks in Advance.
for tr in mydriver.find_elements_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_lblann"]/table//tr'):
tds = tr.find_elements_by_tag_name('td')
print ([td.text for td in tds])
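Building on that, here is a hedged sketch of handling the optional PDF link per row: find_elements returns an empty list instead of raising, so it doubles as an existence check (the assumption that the link sits in an a tag inside the row is based on the question's description):
for tr in mydriver.find_elements_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_lblann"]/table//tr'):
    tds = tr.find_elements_by_tag_name('td')
    anchors = tr.find_elements_by_tag_name('a')
    # empty list -> no PDF attached to this row
    pdf_link = anchors[0].get_attribute('href') if anchors else 'No PDF Link'
    print([td.text for td in tds], pdf_link)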
I went through a rough time getting this working. I think it works just fine now, though it's pretty inefficient. Following is the code:
from selenium import webdriver
import time
import pandas as pd
from selenium.common.exceptions import NoSuchElementException
mydriver = webdriver.Chrome('C:/Program Files/chromedriver.exe')
mydriver.get("https://www.bseindia.com/corporates/ann.aspx?expandable=0")
time.sleep(5) # wait 5 seconds until DOM will load completly
trs = mydriver.find_elements_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_lblann"]/table/tbody/tr')
del trs[0]
names = []
r_time = []
d_time = []
t_taken = []
desc = []
pdfs = []
codes = []
i = 0
while i < len(trs):
names.append(trs[i].text)
l = trs[i].text.split()
for item in l:
try:
code = int(item)
if code > 100000:
codes.append(code)
except:
pass
link = trs[i].find_elements_by_tag_name('td')
pdf_count = 2
while pdf_count < len(link):
try:
pdf = link[pdf_count].find_element_by_tag_name('a')
pdfs.append(pdf.get_attribute('href'))
except NoSuchElementException:
pdfs.append("No PDF")
pdf_count = pdf_count + 4
time = trs[i + 1].text.split()
if len(time) == 5:
r_time.append("No Time Given")
d_time.append(time[3] + " " + time[4])
t_taken.append("No Time Given")
else:
r_time.append(time[3] + " " + time[4])
d_time.append(time[8] + " " + time[9])
t_taken.append(time[12])
desc.append(trs[i+2].text)
i = i + 4
df = pd.DataFrame.from_dict({'Name':names,'Description':desc, 'PDF Link' : pdfs,'Company Code' : codes, 'Received Time' : r_time, 'Disseminated Time' : d_time, 'Time Taken' : t_taken})
df.to_excel('corporate.xlsx', header=True, index=False) # write the data to the excel sheet.
Also, I have added another aspect that was asked: I got the company code in another column as well. That's the result I get.
With the following code I try to get the home city and the places where the backers are located from Kickstarter. However, I keep running into the following error:
File "D:/location", line 60, in <module>
page1 = urllib.request.urlopen(projects[counter])
IndexError: list index out of range
Does someone have a more elegant solution for feeding the pages to urllib.request.urlopen? (see the lines marked with ** **)
code:
# coding: utf-8
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import time
from datetime import datetime
from collections import OrderedDict
import re
browser = webdriver.Firefox()
browser.get('https://www.kickstarter.com/discover?ref=nav')
categories = browser.find_elements_by_class_name('category-container')
category_links = []
for category_link in categories:
#Each item in the list is a tuple of the category's name and its link.
category_links.append((str(category_link.find_element_by_class_name('f3').text),
category_link.find_element_by_class_name('bg-white').get_attribute('href')))
scraped_data = []
now = datetime.now()
counter = 1
for category in category_links:
browser.get(category[1])
browser.find_element_by_class_name('sentence-open').click()
time.sleep(2)
browser.find_element_by_id('category_filter').click()
time.sleep(2)
for i in range(27):
try:
time.sleep(2)
browser.find_element_by_id('category_'+str(i)).click()
time.sleep(2)
except:
pass
#while True:
# try:
# browser.find_element_by_class_name('load_more').click()
# except:
# break
projects = []
for project_link in browser.find_elements_by_class_name('clamp-3'):
projects.append(project_link.find_element_by_tag_name('a').get_attribute('href'))
for project in projects:
**page1 = urllib.request.urlopen(projects[counter])**
soup1 = BeautifulSoup(page1, "lxml")
**page2 = urllib.request.urlopen(projects[counter].split('?')**[0]+'/community')
soup2 = BeautifulSoup(page2, "lxml")
time.sleep(2)
print(str(counter)+': '+project+'\nStatus: Started.')
project_dict = OrderedDict()
project_dict['Category'] = category[0]
browser.get(project)
project_dict['Name'] = soup1.find(class_='type-24 type-28-sm type-38-md navy-700 medium mb3').text
project_dict['Home State'] = str(soup1.find(class_='nowrap navy-700 flex items-center medium type-12').text)
try:
project_dict['Backer State'] = str(soup2.find(class_='location-list-wrapper js-location-list-wrapper').text)
except:
pass
print('Status: Done.')
counter+=1
scraped_data.append(project_dict)
later = datetime.now()
diff = later - now
print('The scraping took '+str(round(diff.seconds/60.0,2))+' minutes, and scraped '+str(len(scraped_data))+' projects.')
df = pd.DataFrame(scraped_data)
df.to_csv('kickstarter-data.csv')
If you only use counter to print the project status message, you can use range or enumerate instead. Here is an example with enumerate:
for counter, project in enumerate(projects):
... code ...
enumerate produces a tuple (index, item), so the rest of your code should work fine as it is.
A few more things:
List indices start at 0, so when you use counter to access items you get an IndexError, because you initialize counter with 1.
In the for loop you don't need projects[counter]; just use project (see the sketch below).
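A minimal sketch putting both points together; enumerate with start=1 keeps the 1-based counter for the status message, while the loop body works with project directly:
for counter, project in enumerate(projects, start=1):
    page1 = urllib.request.urlopen(project)
    soup1 = BeautifulSoup(page1, "lxml")
    page2 = urllib.request.urlopen(project.split('?')[0] + '/community')
    soup2 = BeautifulSoup(page2, "lxml")
    print(str(counter) + ': ' + project + '\nStatus: Started.')
    # ... rest of the loop body unchanged ...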