I need images similar to my own pictures, so I'm trying to automatically download images from Google Lens results (tutorial example link).
The images in the Google Images section were downloaded, but I could not download the images from Google Lens. I would love any help. Which parts of the code should I change?
import bs4
import requests
from selenium import webdriver
import os
import time
from selenium.webdriver.common.by import By
# creating a directory to save images
folder_name = 'images/soup'
if not os.path.isdir(folder_name):
    os.makedirs(folder_name)

def download_image(url, folder_name, num):
    # write image to file
    response = requests.get(url)
    if response.status_code == 200:
        with open(os.path.join(folder_name, str(num) + ".jpg"), 'wb') as file:
            file.write(response.content)
from selenium.webdriver.chrome.service import Service
chromePath='C:/Users/A/Downloads/chromedriver_win322/chromedriver.exe'
driver=webdriver.Chrome(chromePath)
# s = Service('C:/Users/ASUS/Downloads/chromedriver_win322/chromedriver.exe')
# driver = webdriver.Chrome(service=s)
search_URL = "https://www.google.com/search?q=sulu+k%C3%B6fte&tbm=isch&ved=2ahUKEwiNiqr85YD9AhXcwwIHHT59D74Q2-cCegQIABAA&oq=sulu+k&gs_lcp=CgNpbWcQARgAMggIABCABBCxAzIICAAQgAQQsQMyBQgAEIAEMgsIABCABBCxAxCDATIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEMgUIABCABDoECAAQQzoICAAQsQMQgwE6BwgAELEDEEM6BQgAELEDUI8IWL4bYL8kaAFwAHgAgAG_AYgBpAiSAQMwLjiYAQCgAQGqAQtnd3Mtd2l6LWltZ8ABAQ&sclient=img&ei=FeXgY82rG9yHi-gPvvq98As&bih=722&biw=1519&hl=tr"
driver.get(search_URL)
# //*[@id="islrg"]/div[1]/div[1]
# //*[@id="islrg"]/div[1]/div[50]
# //*[@id="islrg"]/div[1]/div[25]
# //*[@id="islrg"]/div[1]/div[75]
# //*[@id="islrg"]/div[1]/div[350]
a = input("Waiting...")
# Scrolling all the way up
driver.execute_script("window.scrollTo(0, 0);")
page_html = driver.page_source
pageSoup = bs4.BeautifulSoup(page_html, 'html.parser')
containers = pageSoup.findAll('div', {'class': "isv-r PNCib MSM1fd BUooTd"})
print(len(containers))
len_containers = len(containers)
for i in range(1, len_containers + 1):
    if i % 25 == 0:
        continue

    xPath = """//*[@id="islrg"]/div[1]/div[%s]""" % (i)
    previewImageXPath = """//*[@id="islrg"]/div[1]/div[%s]/a[1]/div[1]/img""" % (i)
    previewImageElement = driver.find_element_by_xpath(previewImageXPath)
    previewImageURL = previewImageElement.get_attribute("src")
    # print("preview URL", previewImageURL)
    # print(xPath)
    driver.find_element_by_xpath(xPath).click()
    # time.sleep(3)
    # //*[@id="islrg"]/div[1]/div[16]/a[1]/div[1]/img
    # input('another wait')
    # page = driver.page_source
    # soup = bs4.BeautifulSoup(page, 'html.parser')
    # ImgTags = soup.findAll('img', {'class': 'n3VNCb', 'jsname': 'HiaYvf', 'data-noaft': '1'})
    # print("number of the ROI tags", len(ImgTags))
    # link = ImgTags[1].get('src')
    # print(len(ImgTags))
    # print(link)
    #
    # n = 0
    # for tag in ImgTags:
    #     print(n, tag)
    #     n += 1
    # print(len(ImgTags))
    # /html/body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div[2]/div[1]/a/img

    # It's all about the wait
    timeStarted = time.time()
    while True:
        # driver.find_element(By.XPATH, "//*[@id='search']").click()
        # imageElement = driver.find_element(By.XPATH, '//*[@id="yDmH0d"]/div[6]/div/div[2]/div[2]/div['
        # imageElement = driver.find_element_by_xpath(
        #     """//*[@id="Sva75c"]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div[2]/div[1]/a/img""")
        # imageElement = driver.find_element(By.XPATH, '//*[@id="Sva75c"]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div[2]/div[1]/a/img')
        imageElement = driver.find_element_by_xpath(
            """//*[@id="Sva75c"]/div[2]/div/div[2]/div[2]/div[2]/c-wiz/div[2]/div[1]/div[1]/div[2]/div/a/img""")
        # titles = driver.find_elements(By.XPATH, '//div[@id="srp-river-results"]//img[@class="s-item__image-img"]')
        # titles = driver.find_elements(By.XPATH, '//img[@class="s-item__image-img"]')
        imageURL = imageElement.get_attribute('src')
        if imageURL != previewImageURL:
            # print("actual URL", imageURL)
            break
        else:
            # time out if the full-resolution image can't be loaded
            currentTime = time.time()
            if currentTime - timeStarted > 10:
                print("Timeout! Will download a lower resolution image and move on to the next one")
                break

    # Downloading image
    try:
        download_image(imageURL, folder_name, i)
        print("Downloaded element %s out of %s total. URL: %s" % (i, len_containers, imageURL))
    except:
        print("Couldn't download image %s, continuing with the next one" % (i))
Related
I know you've probably seen 100 Indeed scraping posts on here, and I'm hoping mine is a bit different. Essentially, I'm trying to build an Indeed job scraper that pulls company name and job title, based on a search with "job title" and "location" as variables. Additionally, when Selenium opens Chrome, Indeed auto-populates my location, which doesn't get overwritten by the location I've input in the code.
I'm fairly new to Python, and I'm relying on the foundation built by someone else from GitHub, so I am having trouble diagnosing the problem.
Would love any help or insight!
Here is my code:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import bs4
import requests
import re
import json
from time import sleep
list_of_description = ["warehouse","associate"]
URL = "https://www.indeed.com/"
MAIN_WINDOW_HANDLER = 0
JOB_TITLE = " "
JOB_LOCATION = " "
JSON_DICT_ARRAY = []
def main():
    pageCounter = 0
    bool_next = True
    newUrl = ""
    # theUrl = "https://ca.indeed.com/jobs?q=developer&l=Winnipeg%2C+MB"

    browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    browser.get( URL )

    # Change text in what
    whatElement = browser.find_element(By.ID, "text-input-what")
    whatElement.send_keys( JOB_TITLE )

    # Change text in where
    whereElement = browser.find_element(By.ID, "text-input-where")
    whereElement.send_keys(Keys.CONTROL + "a")
    whereElement.send_keys(Keys.BACK_SPACE)
    whereElement.send_keys( JOB_LOCATION )
    whereElement.submit()

    MAIN_WINDOW_HANDLER = browser.window_handles[0]
    fileName = "{} Jobs in {}.json".format(JOB_TITLE, JOB_LOCATION)

    newPage = True
    nextNumber = 2
    searchPhrase = '//span[contains(text(), "{0}") and @class="pn"]'.format(nextNumber)

    currentHTML = browser.page_source
    linkElements = getElementFromHTML('div .title', currentHTML)  # searching for div tags with title class
    reqResultText = currentHTML  # (download_file(URL)).text

    browser.get( browser.current_url )
    browser.get( browser.current_url )

    scrapeJobListing(linkElements, reqResultText, browser, MAIN_WINDOW_HANDLER)

    if( check_exists_by_xpath(browser, '//button[@id="onetrust-accept-btn-handler"]') ):
        try:
            theElement = browser.find_element(By.XPATH, '//button[@id="onetrust-accept-btn-handler"]' )
            print(type(theElement))
            theElement.click()
            print("I clicked")

            # scrapeJobListing(linkElements, reqResultText, browser, MAIN_WINDOW_HANDLER)
            while ( newPage and check_exists_by_xpath(browser, searchPhrase) ):
                theElement = browser.find_elements(By.XPATH, searchPhrase )
                try:
                    theElement[0].click()
                except:
                    newPage = False

                if(newPage):
                    browser.get(browser.current_url)
                    print(browser.current_url)
                    nextNumber += 1
                    searchPhrase = '//span[contains(text(), "{0}") and @class="pn"]'.format(nextNumber)
                    currentHTML = browser.page_source
                    linkElements = getElementFromHTML('div .title', currentHTML)  # searching for div tags with title class
                    reqResultText = currentHTML  # (download_file(URL)).text
                    scrapeJobListing(linkElements, reqResultText, browser, MAIN_WINDOW_HANDLER)
                else:
                    print("Search Concluded")
        except:
            # scrapeJobListing(linkElements, reqResultText, browser, MAIN_WINDOW_HANDLER)
            while ( newPage and check_exists_by_xpath(browser, searchPhrase) ):
                theElement = browser.find_elements(By.XPATH, searchPhrase )
                try:
                    theElement[0].click()
                except:
                    newPage = False

                if(newPage):
                    browser.get(browser.current_url)
                    print(browser.current_url)
                    nextNumber += 1
                    searchPhrase = '//span[contains(text(), "{0}") and @class="pn"]'.format(nextNumber)
                    currentHTML = browser.page_source
                    linkElements = getElementFromHTML('div .title', currentHTML)  # searching for div tags with title class
                    reqResultText = currentHTML  # (download_file(URL)).text
                    scrapeJobListing(linkElements, reqResultText, browser, MAIN_WINDOW_HANDLER)
                else:
                    print("Search Concluded")

    with open(fileName, "w") as data:
        for it in JSON_DICT_ARRAY:
            data.write(json.dumps(it))
            data.write(",\n")
        data.close()
def scrapeJobListing(linkElements, reqResultText, browser, mainHandler):
    jobDes = ""
    for i in range( len(linkElements) ):
        print("\n ", i)
        jsonDataDict = {}
        list = re.findall(r'["](.*?)["]', str(linkElements[i]))
        currJobMap = "jobmap[{}]= ".format(i)
        openBracketIndex = reqResultText.find(currJobMap) + len(currJobMap)
        findNewString = reqResultText[openBracketIndex:openBracketIndex + 600]
        print(findNewString)
        closeBracketIndex = findNewString.find("}") + 1
        cmpOpen = findNewString.find("cmp:'") + len("cmp:'")
        cmpClose = findNewString.find("',cmpesc:")
        titleOpen = findNewString.find("title:'") + len("title:'")
        titleClose = findNewString.find("',locid:")
        parsedString = str( findNewString[0:closeBracketIndex] )
        print(parsedString)
        print("\n")
        cmpName = parsedString[cmpOpen:cmpClose]  # Company Name
        jobTitle = parsedString[titleOpen:titleClose]  # Job Title
        jsonDataDict['(2) Company Name'] = cmpName
        jsonDataDict['(1) Job Title'] = jobTitle
        try:
            title = browser.find_element(By.ID, list[4])  # 4th quotation is the Job Description
            print('Found <%s> element with that class name!' % (title.tag_name))
            title.click()
            window_after = browser.window_handles[1]
            browser.switch_to.window(window_after)
            theCurrURL = browser.current_url
            browser.get(theCurrURL)
            currPageSource = browser.page_source
            jsonDataDict['(4) Job Link'] = theCurrURL
            print(theCurrURL)
            jobDes = getElementFromHTML('div #jobDescriptionText', currPageSource)
            soup = bs4.BeautifulSoup(str(jobDes), "html.parser")
            jobDescText = soup.get_text('\n')
            jsonDataDict['(3) Job Description'] = jobDescText
            JSON_DICT_ARRAY.append(jsonDataDict)
            browser.close()
            print(jobDes)
        except:
            print('Was not able to find an element with that name.')
        # sleep(2)
        print(mainHandler)
        browser.switch_to.window(mainHandler)  # Not necessary right?
def getElementBySearch(searchTag, theURL):
    reqResult = download_file(theURL)
    soup = bs4.BeautifulSoup(reqResult.text, "html.parser")
    element = soup.select(searchTag)
    return element

def getElementFromHTML(searchTag, htmlText):
    soup = bs4.BeautifulSoup(htmlText, "html.parser")
    element = soup.select(searchTag)
    return element
def check_exists_by_xpath(webdriver, xpath):
    # Note: find_elements() does not raise NoSuchElementException; it returns an
    # empty list, so as written this helper always returns True. Checking the
    # length of the returned list would make the pagination check meaningful.
    try:
        webdriver.find_elements(By.XPATH, xpath)
    except NoSuchElementException:
        return False
    return True
def download_file(searchPhrase):
    result = requests.get(searchPhrase)
    # type(result)
    # Check for error
    try:
        result.raise_for_status()
    except Exception as exc:
        print('There was a problem: %s' % (exc))
    return result
if __name__ == "__main__":
    main()
Right now, the script essentially opens Indeed, steps through each page, and prints the links. But I'm not sure why it's not providing the job title and company information.
The output looks like this -
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=10&vjk=cbe41d08db5e3eaa
CDwindow-07A1095B627FCB670750FBBDC552B60D
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=20&vjk=cbe41d08db5e3eaa
CDwindow-07A1095B627FCB670750FBBDC552B60D
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=30&vjk=cbe41d08db5e3eaa
CDwindow-07A1095B627FCB670750FBBDC552B60D
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=40&vjk=2fd38d5eb42b6ca4
CDwindow-07A1095B627FCB670750FBBDC552B60D
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=50&vjk=acbe5c6e5afce1d6
CDwindow-07A1095B627FCB670750FBBDC552B60D
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=60&vjk=acbe5c6e5afce1d6
CDwindow-07A1095B627FCB670750FBBDC552B60D
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=70&vjk=acbe5c6e5afce1d6
CDwindow-07A1095B627FCB670750FBBDC552B60D
https://www.indeed.com/jobs?q=Warehouse&l=Eugene,%20Oregon&radius=50&start=80&vjk=acbe5c6e5afce1d6
CDwindow-07A1095B627FCB670750FBBDC552B60D
Search Concluded
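One thing worth checking, based on the code above: scrapeJobListing() extracts the company and title by slicing jobmap[i]= blocks out of the page source, so if Indeed's markup no longer embeds those blocks, find() returns -1 and the slices come out empty or wrong, which would explain the missing fields. Separately, the manual json.dumps-plus-comma writing in main() does not produce a valid JSON file. Here is a minimal sketch of writing the collected dictionaries as one JSON array instead (same JSON_DICT_ARRAY and fileName names as in the code above):
import json

def write_results(json_dict_array, file_name):
    # Dump the whole list as a single valid JSON array.
    with open(file_name, "w", encoding="utf-8") as fp:
        json.dump(json_dict_array, fp, indent=2, ensure_ascii=False)

# e.g. write_results(JSON_DICT_ARRAY, fileName) at the end of main()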
I'm a fresher in this field, and I'm trying to navigate through a website's pages to scrape data. When I execute the code, it scrapes the first page but never navigates to the next pages. I have tried many approaches but couldn't find the problem. Please check my code below; I have written code for the pagination. Please, anyone, help me with this. Thanks in advance.
import xlwt
from selenium import webdriver
import re
import time
from datetime import date

class kotsovolosmobiles:
    def __init__(self):
        self.url = 'https://www.kotsovolos.gr/mobile-phones-gps/mobile-phones/smartphones?pageSize=60'
        self.country = 'GR'
        self.currency = 'euro'
        self.VAT = 'Included'
        self.shipping = 'Available for shipment'
        self.Pre_PromotionPrice = 'N/A'

    def kotsovolos(self):
        wb = xlwt.Workbook()
        ws = wb.add_sheet('Sheet1', cell_overwrite_ok=True)
        ws.write(0, 0, "Product_Url")
        ws.write(0, 0, "Product_Manufacturer")
        ws.write(0, 1, "Product_Url")
        ws.write(0, 2, "Product_Price")
        ws.write(0, 3, "Product_Model")
        ws.write(0, 4, "Memory")
        ws.write(0, 5, "Currency")
        ws.write(0, 6, "Color")
        ws.write(0, 7, "VAT")
        ws.write(0, 8, "Shipping Cost")
        ws.write(0, 9, "Pre-PromotionPrice")
        ws.write(0, 10, "Country")
        ws.write(0, 11, "Date")
        ws.write(0, 12, "Raw_Model")
        wb.save(r"C:\Users\Karthick R\Desktop\VS code\kotsovolos.xls")
        driver = webdriver.Chrome()
        driver.get(self.url)
        today = date.today()
        time.sleep(5)
        cookies = driver.find_element_by_css_selector('a[id="CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"]')
        cookies.click()
        print("cookies accepted")
        driver.maximize_window()
        time.sleep(5)
        titles = []
        models = []
        memorys = []
        prod_prices = []
        p_links = []
        p_colors = []
        while True:
            storage_box = []
            storage_box = driver.find_elements_by_css_selector('div[class="product"]')
            for storage_boxes in storage_box:
                product_url = storage_boxes.find_element_by_css_selector('div[class="title"] a').get_attribute('href')
                print(product_url)
                p_links.append(product_url)
                p_model = storage_boxes.find_element_by_css_selector('div[class="title"] a').text
                print(p_model)
                models.append(p_model)
                manufacturer1 = p_model.split(" ")
                print(manufacturer1[0])
                titles.append(manufacturer1[0])
                memory = []
                memory = re.findall('\d+ ?[gG][bB]', p_model)
                print(memory)
                memory1 = str(memory).replace("['", '').replace("']", '').replace("[]", '').strip()
                if "," in memory1:
                    arr = memory1.split(",")
                    for str1 in arr:
                        str2 = str1.replace("GB", "").replace("gb", "").replace("'", "").strip()
                        if len(str2) != 1:
                            memory_str = str1
                            break
                elif (memory1 == ""):
                    memory_str = 'N/A'
                else:
                    memory_str = memory1
                memory_str = memory_str.replace("'", "").strip()
                print(memory_str)
                memorys.append(memory_str)
                colors = []
                prod_color = p_model.split(" ")
                length = len(prod_color)
                indexcolor = length - 3
                colors.append(prod_color[indexcolor])
                color1 = str(colors).replace("['", '').replace("']", '').strip()
                print(color1)
                p_colors.append(color1)
                p_price = storage_boxes.find_element_by_css_selector('.priceWithVat > .price').text
                print(p_price)
                prod_prices.append(p_price)
            next = driver.find_element_by_css_selector('.pagination_next a')
            time.sleep(3)
            next.click()
            print("next page")
            time.sleep(3)

kotsovolos_gr = kotsovolosmobiles()
kotsovolos_gr.kotsovolos()
Maybe the page isn't loaded yet; try replacing the last block in the loop with:
next = driver.find_element_by_css_selector('.pagination_next a')
url = next.get_attribute('href')
driver.get(url)
time.sleep(3)  # maybe it's not necessary
Try like below. I was able to visit all the other 5 pages.
driver.implicitly_wait(10)
driver.get("https://www.kotsovolos.gr/mobile-phones-gps/mobile-phones/smartphones?pageSize=60")
time.sleep(5)  # Would be better to apply an explicit wait before clicking `Απόρριψη`
driver.find_element_by_xpath("//a[contains(text(),'Απόρριψη')]").click()
nextbuttons = driver.find_element_by_xpath("//ul[@class='pagination']/li[5]/a")
length = int(nextbuttons.get_attribute("innerText"))
for i in range(2, length + 1):
    nextopt = driver.find_element_by_xpath("//ul[@class='pagination']/li/a[contains(text(),'{}')]".format(i))
    nextopt.click()
    time.sleep(5)
I wrote Python code for web scraping so that I can import the data from Flipkart.
I need to load multiple pages so that I can import many products, but right now only one product page is coming.
from urllib.request import urlopen as uReq
from requests import get
from bs4 import BeautifulSoup as soup
import tablib
my_url = 'https://www.xxxxxx.com/food-processors/pr?sid=j9e%2Cm38%2Crj3&page=1'
uClient2 = uReq(my_url)
page_html = uClient2.read()
uClient2.close()
page_soup = soup(page_html, "html.parser")
containers11 = page_soup.findAll("div",{"class":"_3O0U0u"})
filename = "FoodProcessor.csv"
f = open(filename, "w", encoding='utf-8-sig')
headers = "Product, Price, Description \n"
f.write(headers)
for container in containers11:
    title_container = container.findAll("div", {"class": "_3wU53n"})
    product_name = title_container[0].text
    price_con = container.findAll("div", {"class": "_1vC4OE _2rQ-NK"})
    price = price_con[0].text
    description_container = container.findAll("ul", {"class": "vFw0gD"})
    product_description = description_container[0].text
    print("Product: " + product_name)
    print("Price: " + price)
    print("Description" + product_description)
    f.write(product_name + "," + price.replace(",", "") + "," + product_description + "\n")
f.close()
You have to check whether the next-page button exists or not. If it does, return True, go to that next page and start scraping; if not, return False and move on. Check the class name of that button first (a usage sketch follows the snippet below).
# to check if a pagination (next page) button exists on the page:
from selenium.common.exceptions import NoSuchElementException

def go_next_page():
    try:
        button = driver.find_element_by_xpath('//a[@class="<class name>"]')
        return True, button
    except NoSuchElementException:
        return False, None
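A possible way to wire this helper into the scraping flow (a sketch; scrape_current_page() is a hypothetical stand-in for the per-page parsing code already shown in the question):
import time

while True:
    scrape_current_page()              # hypothetical: your existing per-page parsing
    has_next, button = go_next_page()
    if not has_next:
        break
    button.click()
    time.sleep(3)  # crude wait; an explicit wait on the product grid would be more robust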
You can first get the number of pages available and then iterate over each page, parsing the data for each one, by changing the URL with respect to the page (see the sketch after the example URLs below):
'https://www.flipkart.com/food-processors/pr?sid=j9e%2Cm38%2Crj3&page=1' points to page 1
'https://www.flipkart.com/food-processors/pr?sid=j9e%2Cm38%2Crj3&page=2' points to page 2
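A minimal sketch of that approach with requests and BeautifulSoup (the last_page value of 10 is an assumption to verify on the site; the container class name is the one from the question and may change):
import requests
from bs4 import BeautifulSoup

base_url = "https://www.flipkart.com/food-processors/pr?sid=j9e%2Cm38%2Crj3&page={}"
last_page = 10  # assumption: read the real page count from the pagination element

for page in range(1, last_page + 1):
    html = requests.get(base_url.format(page)).text
    page_soup = BeautifulSoup(html, "html.parser")
    containers = page_soup.find_all("div", {"class": "_3O0U0u"})  # class name from the question
    print("page", page, "->", len(containers), "containers")
    # ...reuse the existing per-container parsing here...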
try:
    next_btn = driver.find_element_by_xpath("//a//span[text()='Next']")
    next_btn.click()
except ElementClickInterceptedException as ec:
    # an overlay intercepts the click; hide it and retry
    classes = "_3ighFh"
    overlay = driver.find_element_by_xpath("(//div[@class='{}'])[last()]".format(classes))
    driver.execute_script("arguments[0].style.visibility = 'hidden'", overlay)
    next_btn = driver.find_element_by_xpath("//a//span[text()='Next']")
    next_btn.click()
except TimeoutException:
    print("Page Timed Out")
    driver.quit()
except Exception as e:
    print(str(e))
    break
For me, the easiest way is to add an extra loop with a "page" variable:
# just check the number of the last page on the website
page = 1
while page != 10:
    print(f'Scraping page: {page}')
    my_url = f'https://www.xxxxxx.com/food-processors/pr?sid=j9e%2Cm38%2Crj3&page={page}'
    # here add the for loop you already have
    page += 1
This method should work.
Hi, I'm trying to scrape the front-page images on digg.com with the following code. The issue is that 0.jpg to 6.jpg are normal, but starting at 7.jpg through 47.jpg they are corrupt. Not sure why.
Here is the code. Github here: https://github.com/kenpeter/py_mm
# os
import os
# http request
import requests
#
import pprint
import time
# import html from lxml
from lxml import html

# global
global_page_num = 0
pp = pprint.PrettyPrinter(indent=4)

# write to file
def download_image(img_urls):
    # total img urls
    amount = len(img_urls)
    # loop
    for index, value in enumerate(img_urls, start=0):
        # file name
        filename = 'img/%s.jpg' % (index)
        # dir
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        print('--- start ---')
        print('filename: %s' % filename)
        print('Downloading: %s out of %s' % (index, amount))
        # open file
        with open(filename, 'wb') as f:
            # f write
            # time.sleep(1)
            f.write(requests.get(value).content)

def get_page_number(num):
    url = 'http://digg.com'
    response = requests.get(url).content
    selector = html.fromstring(response)
    img_urls = []
    img_urls = selector.xpath("//div[@class='digg-story__image--thumb']/a/img/@src")
    news_texts = []
    news_texts = selector.xpath("//div[@itemprop='description']/text()")
    # test
    # print('--- something ---')
    # pp.pprint(img_urls)
    # pp.pprint(news_texts)
    download_image(img_urls)
    return img_urls

if __name__ == '__main__':
    # input, page_number, everything into the var
    # page_number = input('Please enter the page number that you want to scrape:')
    # global_page_num
    # global_page_num = page_number;
    # print('hell world!');
    page_number = 4  # hardcode
    get_page_number(page_number)
The reason why the images are "corrupt" is that the scheme changes within the page and the images start to "hide" in the data-src attribute instead of src, which is the attribute your code grabs. Here is an example from the source of the fetched page showing both attributes:
<img
class="digg-story__image-img js--digg-story__image-img lazy-image-img need-offset"
data-src="http://static.digg.com/images/f0b92c2d8a2c4b7f829abbc0e58a408c_2oijd0Z_1_www_large_thumb.jpeg"
src="http://static.digg.com/static/fe/944294/images/x_455x248.png"
width="312"
height="170"
alt=""
/>
In other words, you have to check both attributes, src and data-src, giving data-src priority over src while building the list of image URLs.
THIS code does the "trick" and downloads the proper images:
# os
import os
# http request
import requests
#
import pprint
import time
# import html from lxml
from lxml import html

# global
global_page_num = 0
pp = pprint.PrettyPrinter(indent=4)

# write to file
def download_image(img_urls):
    # total img urls
    amount = len(img_urls)
    # loop
    for index, value in enumerate(img_urls, start=0):
        # file name
        filename = 'img/%s.jpg' % (index)
        # dir
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        print('--- start ---')
        print('filename: %s' % filename)
        print('Downloading: %s out of %s' % (index, amount))
        # open file
        with open(filename, 'wb') as f:
            # f write
            # time.sleep(1)
            f.write(requests.get(value).content)

def get_page_number(num):
    url = 'http://digg.com'
    response = requests.get(url).content
    selector = html.fromstring(response)
    img_urls = []
    img_urls_1a = selector.xpath("//div[@class='digg-story__image--thumb']/a/img/@src")
    img_urls_1b = [item for item in img_urls_1a if 'x_455x248.png' not in item]
    img_urls_2 = selector.xpath("//div[@class='digg-story__image--thumb']/a/img/@data-src")
    img_urls = img_urls_1b + img_urls_2
    # print(img_urls)
    news_texts = []
    news_texts = selector.xpath("//div[@itemprop='description']/text()")
    # test
    # print('--- something ---')
    # pp.pprint(img_urls)
    # pp.pprint(news_texts)
    download_image(img_urls)
    return img_urls

if __name__ == '__main__':
    # input, page_number, everything into the var
    # page_number = input('Please enter the page number that you want to scrape:')
    # global_page_num
    # global_page_num = page_number;
    # print('hell world!');
    page_number = 4  # hardcode
    get_page_number(page_number)
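An alternative way to apply the same data-src-over-src priority is to do it per image element instead of concatenating two separate lists; here is a sketch reusing the same lxml selector and placeholder filter as above:
import requests
from lxml import html

tree = html.fromstring(requests.get('http://digg.com').content)
img_urls = []
for img in tree.xpath("//div[@class='digg-story__image--thumb']/a/img"):
    # prefer the lazy-loaded data-src, fall back to src, skip the placeholder image
    url = img.get('data-src') or img.get('src')
    if url and 'x_455x248.png' not in url:
        img_urls.append(url)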
I'm scraping a website using Selenium, Scrapy and PhantomJS. The problem is that although the code scrolls the page perfectly, it only extracts links up to a certain limit; beyond that, it completely ignores the result of the scrolling. When I use the Firefox webdriver, it works perfectly. Since I'm running the code on a server, I used PhantomJS, and that's where I encountered the problem. Below is the code:
# -*- coding: utf-8 -*-
from scrapy.spider import BaseSpider
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import csv
import re
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait

class DukeSpider(BaseSpider):
    name = "dspider"
    allowed_domains = ["dukemedicine.org"]
    start_urls = ["http://www.dukemedicine.org/find-doctors-physicians"]

    def __init__(self):
        self.driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
        self.driver.maximize_window()
        print 'here'

    def parse(self, response):
        print 'nowhere'
        print response
        print response.url
        b = open('doc_data_duke.csv', 'a')
        a = csv.writer(b, lineterminator='\n')
        print 'a'
        self.driver.get(response.url)
        time.sleep(10)
        wait = WebDriverWait(self.driver, 10)
        print 'helo'
        click = self.driver.find_element_by_xpath("//span[@id='specialty']")
        click.click()
        click_again = self.driver.find_element_by_xpath("//ul[@class='doctor-type']/li[@class='ng-binding ng-scope'][2]")
        click_again.click()
        time.sleep(25)
        act = ActionChains(self.driver)
        act.move_to_element(self.driver.find_element_by_id('doctor-matrix-section')).click()
        print 'now here'
        for i in range(0, 75):
            # self.driver.find_element_by_xpath("//div[@id='doctor-matrix-section']").send_keys(Keys.PAGE_DOWN)
            # self.driver.execute_script("window.scrollBy(0, document.body.scrollHeight);")
            # self.driver.find_element_by_tag_name("body").click()
            # self.driver.find_element_by_tag_name("body").send_keys(Keys.PAGE_DOWN)
            # self.driver.find_element_by_tag_name("body").send_keys(Keys.END)
            # bg = self.driver.find_element_by_css_selector('body')
            # bg.send_keys(Keys.SPACE)
            act.send_keys(Keys.PAGE_DOWN).perform()
            time.sleep(2)
            print i
            i += 1
        links = self.driver.find_elements_by_xpath("//div[@class = 'result-information']/div[@class='name']/a")
        for l in links:
            print l
            doc_list = l.get_attribute('href')
            if re.match(r'https:\/\/www\.dukemedicine\.org\/find-doctors-physicians\/#!\/(.*)', doc_list):
                print doc_list
                dr = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
                dr.maximize_window()
                dr.get(doc_list)
                try:
                    name_title = dr.find_element_by_xpath('//div[@class="header1 ng-binding"]').text
                    name_titles = name_title.split(",", 1)
                    name = name_titles[0].encode('utf-8')
                    title = name_titles[1]
                    print name
                    title = title[1:].encode('utf-8')
                    print title
                except:
                    name = ''
                    title = ''
                try:
                    speciality = dr.find_element_by_xpath('//p[@class="specialties ng-scope"]').text
                except:
                    speciality = ''
                try:
                    language = dr.find_element_by_xpath(
                        '//div[@class="lang ng-scope"]/div[@class="plainText inline ng-binding"]').text
                except:
                    language = ''
                if dr.find_elements_by_xpath('//div[@class="location-info"]'):
                    locations = dr.find_elements_by_xpath('//div[@class="location-info"]')
                    if len(locations) >= 3:
                        locationA = locations[0].text.encode('utf-8')
                        locationA = locationA.replace('Directions', '')
                        locationA = locationA.replace('\n', '')
                        locationB = locations[1].text.encode('utf-8')
                        locationB = locationB.replace('Directions', '')
                        locationB = locationB.replace('\n', '')
                        locationC = locations[2].text.encode('utf-8')
                        locationC = locationC.replace('\n', '')
                        locationC = locationC.replace('Directions', '')
                    elif len(locations) == 2:
                        locationA = locations[0].text.encode('utf-8')
                        locationA = locationA.replace('Directions', '')
                        locationA = locationA.replace('\n', '')
                        locationB = locations[1].text.encode('utf-8')
                        locationB = locationB.replace('Directions', '')
                        locationB = locationB.replace('\n', '')
                        locationC = ''
                    elif len(locations) == 1:
                        locationA = locations[0].text.encode('utf-8')
                        locationA = locationA.replace('Directions', '')
                        locationA = locationA.replace('\n', '')
                        locationB = ''
                        locationC = ''
                    else:
                        locationA = ''
                        locationB = ''
                        locationC = ''
                dr.close()
                data = [title, name, speciality, language, locationA, locationB, locationC]
                print 'aaaa'
                print data
                a.writerow(data)
No matter what higher value I set in the range, it ignores results beyond a certain point.
Let's use the fact that there is an element having the total result count:
The idea is to iteratively scroll the last found doctor into view until all of the doctors are loaded.
Implementation (with clarifying comments, leaving only the relevant Selenium-specific part):
# -*- coding: utf-8 -*-
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true', '--load-images=false'])
# driver = webdriver.Chrome()
driver.maximize_window()
driver.get("http://www.dukemedicine.org/find-doctors-physicians")

# close optional survey popup if it exists
try:
    driver.find_element_by_css_selector("area[alt=close]").click()
except NoSuchElementException:
    pass

# open up the filter dropdown
click = driver.find_element_by_id("specialty")
click.click()

# choose specialist
specialist = driver.find_element_by_xpath("//ul[@class = 'doctor-type']/li[contains(., 'specialist')]")
specialist.click()

# artificial delay: TODO: fix?
time.sleep(15)

# read the total results count
total_count = int(driver.find_element_by_id("doctor-number").text)

# get the initial results count
results = driver.find_elements_by_css_selector("div.doctor-result")
current_count = len(results)

# iterate until all of the results are loaded
while current_count < total_count:
    driver.execute_script("arguments[0].scrollIntoView();", results[-1])
    results = driver.find_elements_by_css_selector("div.doctor-result")
    current_count = len(results)
    print "Current results count: %d" % current_count

# report total results
print "----"
print "Total results loaded: %d" % current_count

driver.quit()
Works for me perfectly in both PhantomJS and Chrome. Here is what I get on the console:
Current results count: 36
Current results count: 54
Current results count: 72
Current results count: 90
...
Current results count: 1656
Current results count: 1674
Current results count: 1692
Current results count: 1708
----
Total results loaded: 1708
Additionally, note that I've added the --load-images=false command-line argument, which actually speeds things up dramatically.