I'm a beginner in Scrapy. I want to collect the links of items on the index page and then get the information from the item pages. Because I need to deal with the javascript on the index page, I use the selenium webdriver together with scrapy. Here's my code in progress.py.
from scrapy.spider import Spider
from scrapy.http import Request
from selenium import selenium
from selenium import webdriver
from mustdo.items import MustdoItem
import time

class ProgressSpider(Spider):
    name = 'progress'  # spider's name
    allowed_domains = ['example.com']  # crawling domain
    start_urls = ['http://www.example.com']

    def __init__(self):
        Spider.__init__(self)
        self.log('----------in __init__----------')
        self.driver = webdriver.Firefox()

    def parse(self, response):
        self.log('----------in parse----------')
        self.driver.get(response.url)
        # Here're some operations of self.driver with javascript.
        elements = []
        elements = self.driver.find_elements_by_xpath('//table/tbody/tr/td/a[1]')
        # get the number of the items
        self.log('----------Link number is----------' + str(len(elements)))
        for element in elements:
            # get the url of the item
            href = element.get_attribute('href')
            print href
            self.log('----------next href is ----------' + href)
            yield Request(href, callback=self.parse_item)
        self.driver.close()

    def parse_item(self, response):
        self.log('----------in parse_item----------')
        self.driver.get(response.url)
        # build the item
        item = MustdoItem()
        item['title'] = self.driver.find_element_by_xpath('//h2').text
        self.log('----------item created----------' + self.driver.find_element_by_xpath('//h2').text)
        time.sleep(10)
        return item
Also, I have items.py defining the MustdoItem used here. Here's the code.
from scrapy.item import Item, Field

class MustdoItem(Item):
    title = Field()
When I run the spider, I can get several items (probably 6 to 7 out of 20). But after a while, I get error messages as below.
Traceback (most recent call last):
  File "F:\Python27\lib\site-packages\twisted\internet\base.py", line 824, in runUntilCurrent
    call.func(*call.args, **call.kw)
  File "F:\Python27\lib\site-packages\twisted\internet\task.py", line 638, in _tick
    taskObj._oneWorkUnit()
  File "F:\Python27\lib\site-packages\twisted\internet\task.py", line 484, in _oneWorkUnit
    result = next(self._iterator)
  File "F:\Python27\lib\site-packages\scrapy-0.22.2-py2.7.egg\scrapy\utils\defer.py", line 57, in <genexpr>
    work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
  File "F:\Python27\lib\site-packages\scrapy-0.22.2-py2.7.egg\scrapy\utils\defer.py", line 96, in iter_errback
    yield next(it)
  File "F:\Python27\lib\site-packages\scrapy-0.22.2-py2.7.egg\scrapy\contrib\spidermiddleware\offsite.py", line 23, in process_spider_output
    for x in result:
  File "F:\Python27\lib\site-packages\scrapy-0.22.2-py2.7.egg\scrapy\contrib\spidermiddleware\referer.py", line 22, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "F:\Python27\lib\site-packages\scrapy-0.22.2-py2.7.egg\scrapy\contrib\spidermiddleware\urllength.py", line 33, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "F:\Python27\lib\site-packages\scrapy-0.22.2-py2.7.egg\scrapy\contrib\spidermiddleware\depth.py", line 50, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "mustdo\spiders\progress.py", line 32, in parse
    print element.tag_name
  File "F:\Python27\lib\site-packages\selenium\webdriver\remote\webelement.py", line 50, in tag_name
    return self._execute(Command.GET_ELEMENT_TAG_NAME)['value']
  File "F:\Python27\lib\site-packages\selenium\webdriver\remote\webelement.py", line 369, in _execute
    return self._parent.execute(command, params)
  File "F:\Python27\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 164, in execute
    self.error_handler.check_response(response)
  File "F:\Python27\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 164, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.StaleElementReferenceException: Message: u'Element not found in the cache - perhaps the page has changed since it was looked up' ; Stacktrace:
    at fxdriver.cache.getElementAt (resource://fxdriver/modules/web_element_cache.js:7610)
    at Utils.getElementAt (file:///c:/users/marian/appdata/local/temp/tmpmgnqid/extensions/fxdriver#googlecode.com/components/command_processor.js:7210)
    at WebElement.getElementTagName (file:///c:/users/marian/appdata/local/temp/tmpmgnqid/extensions/fxdriver#googlecode.com/components/command_processor.js:10353)
    at DelayedCommand.prototype.executeInternal_/h (file:///c:/users/marian/appdata/local/temp/tmpmgnqid/extensions/fxdriver#googlecode.com/components/command_processor.js:10878)
    at DelayedCommand.prototype.executeInternal_ (file:///c:/users/marian/appdata/local/temp/tmpmgnqid/extensions/fxdriver#googlecode.com/components/command_processor.js:10883)
    at DelayedCommand.prototype.execute/< (file:///c:/users/marian/appdata/local/temp/tmpmgnqid/extensions/fxdriver#googlecode.com/components/command_processor.js:10825)
I've tested my code and found that if I remove the "yield Request(href, callback=self.parse_item)" line in the parse function, I can get all of the item links. I also observed that while progress.py is running, the error messages appear right after the first "----------in parse_item----------" log line. My inference is that the yield sequence causes the error, but I don't know how to deal with this problem.
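For reference, here is a minimal sketch of how parse could copy the href strings out of the WebElements before yielding any Request, so that no element from the index page is touched after the driver navigates away (this is only an assumption about a possible workaround, not something verified against the site, and a single shared driver may still misbehave if several item pages are fetched at once):

def parse(self, response):
    self.log('----------in parse----------')
    self.driver.get(response.url)
    # ... javascript operations on self.driver ...
    elements = self.driver.find_elements_by_xpath('//table/tbody/tr/td/a[1]')
    # Copy the href attributes into plain strings while the index page is
    # still loaded; after this list comprehension the WebElements are never
    # used again, so they cannot go stale.
    hrefs = [element.get_attribute('href') for element in elements]
    for href in hrefs:
        self.log('----------next href is ----------' + href)
        yield Request(href, callback=self.parse_item)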
Any insight is appreciated!
Best regards! :)
I am trying to scrape a website to learn a little more about how scrapy works. I have a little experience with the requests and bs4 (BeautifulSoup) packages. I am working in a miniconda3 environment on my Ubuntu 20.04.1 LTS machine, and I use Python 3.7.
I have created an item named 'PostscrapeItem' which has only one attribute: full_text = scrapy.Field(). I have not touched the structure of the project that has been automatically created by scrapy.
I have made a spider which is only supposed to find occurrences of an html tag ('em') on this webpage: https://blog.scrapinghub.com/page/1/
Here is the code of my spider:
import scrapy
from bs4 import BeautifulSoup
from postscrape.items import PostscrapeItem

class PostSpider(scrapy.Spider):
    name = "posts"
    start_urls = [
        'https://blog.scrapinghub.com/page/1/'
    ]

    def parse(self, response):
        so = BeautifulSoup(response.text, 'html.parser')
        item = PostscrapeItem()
        if so.find('em'):
            concatenated = ""
            text_samples = so.find_all('em')
            for t_s in text_samples:
                concatenated += t_s.text
            item['full_text'] = concatenated
        return PostscrapeItem
The problem is that I get an error when I run this code with 'scrapy crawl posts' in my terminal. It says: 'TypeError: 'ItemMeta' object is not iterable'. With the little I think I know, the only ItemMeta present in my program is the object PostscrapeItem, and it seems to me that I am not iterating over this object anywhere in my code. That's why I am asking you.
Here is the complete error message:
Traceback (most recent call last):
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/utils/defer.py",
line 117, in iter_errback
yield next(it)
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/utils/python.py", line 345, in __next__
return next(self.data)
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/utils/python.py", line 345, in __next__
return next(self.data)
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/spidermiddlewares/referer.py", line 338, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
TypeError: 'ItemMeta' object is not iterable
Thank you in advance and let me know how to improve the clarity and the quality of my questions.
Luc
You're not returning an item; you're returning the item class itself.
Scrapy tries iterating it when it's returned from the spider, so you get your TypeError.
Simply correcting the last line to return item should fix your code.
As a side note, scrapy has its own parsing utilities, so there's no need to import and use BS.
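For instance, the whole callback could be written with Scrapy's built-in selectors instead of BeautifulSoup. A minimal sketch (the em::text selector mirrors the original find_all('em') loop):

import scrapy

from postscrape.items import PostscrapeItem


class PostSpider(scrapy.Spider):
    name = "posts"
    start_urls = ['https://blog.scrapinghub.com/page/1/']

    def parse(self, response):
        # response.css('em::text') yields the text of every <em> tag,
        # playing the same role as BeautifulSoup's find_all('em').
        texts = response.css('em::text').getall()
        if texts:
            item = PostscrapeItem()
            item['full_text'] = ''.join(texts)
            yield item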
As per @stranac's answer, I have corrected the full code and it works.
import scrapy
from bs4 import BeautifulSoup

class PostscrapeItem(scrapy.Item):
    full_text = scrapy.Field()

class PostSpider(scrapy.Spider):
    name = "posts"
    start_urls = [
        'https://blog.scrapinghub.com/page/1/'
    ]

    def parse(self, response):
        so = BeautifulSoup(response.text, 'html.parser')
        item = PostscrapeItem()
        if so.find('em'):
            concatenated = ""
            text_samples = so.find_all('em')
            for t_s in text_samples:
                concatenated += t_s.text
            item['full_text'] = concatenated
        return item
I have this weird error with Selenium when I try to scrape sportsbook odds from oddsportal.com. It looks like the Selenium result does not work like a normal list, and I cannot just loop over every URL. A test URL that should work: http://www.oddsportal.com/soccer/england/premier-league/ (the script is not written for home-draw-away odds).
So what am I doing wrong here?
My script:
from selenium import webdriver
from selenium.common.exceptions import NoSuchAttributeException, NoSuchElementException
from selenium.webdriver.common.keys import Keys

class Odds():
    def odds(self, driver, url):
        kertoimet = ['', '']
        driver.get(url)
        odds = driver.find_elements_by_xpath("""//*[@id="odds-data table"]/div/table/tbody/tr""")
        for item in odds:
            data = item.text.replace(' ', '').split('\n')
            if data[0] == 'Pinnacle':
                kertoimet = [data[1], data[2]]
        return kertoimet

    def odds_finder(self, data, driver):
        for item in data:
            if item.get_attribute('href') != '':
                print(Odds().odds(driver, str(item.get_attribute('href'))))

    def url_finder2(self, URL):
        driver = webdriver.Chrome("/usr/local/bin/chromedriver 2")
        driver.get(URL)  # http://www.oddsportal.com/soccer/england/premier-league/
        data = driver.find_elements_by_xpath("""//*[@id="tournamentTable"]/tbody/tr/td/a""")
        Odds().odds_finder(list(data), driver)

Odds().url_finder2(URL)
Error:
Traceback (most recent call last):
  File "odds.py", line 79, in <module>
    Odds().url_finder2(open('oddsportal_odds.csv'))
  File "odds.py", line 61, in url_finder2
    Odds().odds_finder(list(data),driver)
  File "odds.py", line 49, in odds_finder
    if item.get_attribute('href') != '':
  File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/webelement.py", line 141, in get_attribute
    resp = self._execute(Command.GET_ELEMENT_ATTRIBUTE, {'name': name})
  File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/webelement.py", line 494, in _execute
    return self._parent.execute(command, params)
  File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/webdriver.py", line 236, in execute
    self.error_handler.check_response(response)
  File "/Library/Python/2.7/site-packages/selenium/webdriver/remote/errorhandler.py", line 192, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
  (Session info: chrome=58.0.3029.110)
  (Driver info: chromedriver=2.29.461585 (0be2cd95f834e9ee7c46bcc7cf405b483f5ae83b),platform=Mac OS X 10.12.3 x86_64)
You just need to get the data again, because the page state has changed after navigating away. Try modifying these two functions:
import time  # time.sleep is used below

def odds_finder(self, driver):
    for item in driver.find_elements_by_xpath('//*[@id="tournamentTable"]/tbody/tr/td/a'):
        time.sleep(5)
        if item.get_attribute('href') != '':
            print(Odds().odds(driver, str(item.get_attribute('href'))))

def url_finder2(self, URL):
    driver = webdriver.Chrome("/usr/local/bin/chromedriver 2")
    driver.get(URL)  # http://www.oddsportal.com/soccer/england/premier-league/
    Odds().odds_finder(driver)
I'm trying to scrape a retail clothing shopping site. For some reason, whenever I run the following code, I end up getting a couple of items from three categories (as defined in parse() as nth-children) and a slew of items from li:nth-child(5).
Sometimes the following error appears:
2017-01-09 20:33:30 [scrapy] ERROR: Spider error processing <GET http://www.example.com/jackets> (referer: http://www.example.com/)
Traceback (most recent call last):
File "/usr/local/lib/python2.7/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "/usr/local/lib/python2.7/site-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "/usr/local/lib/python2.7/site-packages/scrapy/spidermiddlewares/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/local/lib/python2.7/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/local/lib/python2.7/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Users/BeardedMac/projects/thecurvyline-scraper/spiders/example.py", line 47, in parse_items
price = node.find_element_by_css_selector('div.flex-wrapper--prod-details > div.pricing > div.price > div.standardprice').text
File "/usr/local/lib/python2.7/site-packages/selenium/webdriver/remote/webelement.py", line 307, in find_element_by_css_selector
return self.find_element(by=By.CSS_SELECTOR, value=css_selector)
File "/usr/local/lib/python2.7/site-packages/selenium/webdriver/remote/webelement.py", line 511, in find_element
{"using": by, "value": value})['value']
File "/usr/local/lib/python2.7/site-packages/selenium/webdriver/remote/webelement.py", line 494, in _execute
return self._parent.execute(command, params)
File "/usr/local/lib/python2.7/site-packages/selenium/webdriver/remote/webdriver.py", line 236, in execute
self.error_handler.check_response(response)
File "/usr/local/lib/python2.7/site-packages/selenium/webdriver/remote/errorhandler.py", line 192, in check_response
raise exception_class(message, screen, stacktrace)
StaleElementReferenceException: Message: The element reference is stale. Either the element is no longer attached to the DOM or the page has been refreshed
However, if I change the nth-child selector to say, li:nth-child(3), I get a slew of items from that category, but I can't seem to get them all at once.
I'm pretty new to Python and Scrapy, so I might just be missing something elemental.
def __init__(self):
    self.driver = webdriver.Chrome('/MyPath/chromedriver')
    self.driver.set_page_load_timeout(10)

def parse(self, response):
    for href in response.css('#main-menu > div > li:nth-child(n+3):nth-child(-n+6) > a::attr(href)').extract():
        yield scrapy.Request(response.urljoin(href), callback=self.parse_items)

def get_item(self, response):
    sizes = response.css('#pdpMain > div.productdetailcolumn.productinfo > div > div.variationattributes > div.swatches.size > ul > li > a::text').extract()
    product_id = response.css('#riiratingsfavorites > div.riiratings > a::attr(rel)').extract_first()
    response.meta['product']['sizes'] = sizes
    response.meta['product']['product_id'] = product_id
    yield response.meta['product']

def parse_items(self, response):
    category = response.css('#shelf > div.category-header > h2::text').extract_first()
    self.driver.get(response.url)
    nodes = self.driver.find_elements_by_css_selector('#search > div.productresultarea > div.product.producttile')
    for node in nodes:
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)
        price = node.find_element_by_css_selector('div.flex-wrapper--prod-details > div.pricing > div.price > div.standardprice').text
        images = node.find_element_by_css_selector('div.image > div.thumbnail > p > a > img:nth-child(1)').get_attribute('src')
        name = node.find_element_by_css_selector('div.flex-wrapper--prod-details > div.name > a').text
        product_url = node.find_element_by_css_selector('div.flex-wrapper--prod-details > div.name > a').get_attribute('href')

        product = Product()
        product['title'] = name
        product['price'] = price
        product['product_url'] = product_url
        product['retailer'] = 'store7'
        product['categories'] = category
        product['images'] = images
        product['sizes'] = []
        product['product_id'] = []
        product['base_url'] = ''

        product_page = response.urljoin(product_url)
        yield scrapy.Request(product_page, callback=self.get_item, meta={'product': product})
To put it shortly: because Scrapy is concurrent and your Selenium usage isn't, your Selenium driver gets confused. During the crawl, Scrapy keeps asking the driver to load new URLs while it is still working with the old ones.
To avoid this you can disable concurrency in your spider by setting CONCURRENT_REQUESTS setting to 1. E.g. add this to your settings.py file:
CONCURRENT_REQUESTS = 1
or add a custom_settings entry in your spider if you wish to restrict this setting to one spider:
class MySpider(scrapy.Spider):
    custom_settings = {'CONCURRENT_REQUESTS': 1}
If you want to keep concurrency (which is a really nice thing to have), you can try replacing Selenium with a more Scrapy-friendly technology such as Splash.
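For illustration, a request through Splash might look roughly like this. This assumes the separate scrapy-splash package is installed and its downloader middlewares are enabled in settings.py; the selectors are loosely based on the ones in the question and may need adjusting:

import scrapy
from scrapy_splash import SplashRequest


class StoreSplashSpider(scrapy.Spider):
    name = 'store7_splash'
    start_urls = ['http://www.example.com/jackets']

    def start_requests(self):
        for url in self.start_urls:
            # 'wait' gives the page's javascript time to render before
            # Splash hands the resulting HTML back to Scrapy.
            yield SplashRequest(url, self.parse_items, args={'wait': 2})

    def parse_items(self, response):
        # The rendered HTML is parsed with ordinary Scrapy selectors,
        # so requests can stay fully concurrent.
        for node in response.css('#search > div.productresultarea > div.product.producttile'):
            yield {
                'title': node.css('div.name > a::text').extract_first(),
                'price': node.css('div.standardprice::text').extract_first(),
            }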
I am trying to use the Python Scrapy tool to extract information from the bitcointalk.org website about the users and the public keys that they post in the forum for donations.
I found this piece of code online and made changes to it so that it runs on my desired website, but I am running into an error: AttributeError: 'Response' object has no attribute 'text'.
Below is the code for reference:
class BitcointalkSpider(CrawlSpider):
    name = "bitcointalk"
    allowed_domains = ["bitcointalk.org"]
    start_urls = ["https://bitcointalk.org/index.php"]

    rules = (
        Rule(SgmlLinkExtractor(deny=[
                'https://bitcointalk\.org/index\.php\?action=ignore',
                'https://bitcointalk\.org/index\.php\?action=profile',
            ],
            allow_domains='bitcointalk.org'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        sel = Selector(response)
        sites = sel.xpath('//tr[contains(@class, "td_headerandpost")]')
        items = []
        for site in sites:
            item = BitcoinItem()
            item["membername"] = site.xpath('.//td[@class="poster_info"]/b/a/text()').extract()
            addresses = site.xpath('.//div[contains(@class, "signature")]/text()').re(r'(1[1-9A-HJ-NP-Za-km-z]{26,33})')
            if item["membername"] and addresses:
                addr_list = set()
                for addr in addresses:
                    if bcv.check_bc(addr):
                        addr_list.add(addr)
                item["address"] = addr_list
                if len(addr_list) > 0:
                    items.append(item)
        return items
and the error that I am receiving is:
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "/usr/local/lib/python2.7/dist-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "/usr/local/lib/python2.7/dist-packages/scrapy/spidermiddlewares/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/local/lib/python2.7/dist-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/local/lib/python2.7/dist-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/local/lib/python2.7/dist-packages/scrapy/spiders/crawl.py", line 72, in _parse_response
cb_res = callback(response, **cb_kwargs) or ()
File "/home/sunil/Desktop/Nikhil/Thesis/mit_bitcoin/bitcoin/spiders/bitcointalk_spider.py", line 24, in parse_item
sel = Selector(response)
File "/usr/local/lib/python2.7/dist-packages/scrapy/selector/unified.py", line 63, in __init__
text = response.text
AttributeError: 'Response' object has no attribute 'text'
Something is likely wrong with one of your requests, since it seems like the response from at least one URL you're crawling is not properly formatted. Either the request itself failed, or you're not making requests appropriately.
See here for the source of your error.
And see here for a clue as to why your request may be poorly formatted. It looks like Selector expects an HtmlResponse object, or a similar type.
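One defensive option (a sketch, not the only possible fix) is to skip responses that Scrapy did not parse as text, since Selector(response) relies on response.text, which only TextResponse and its subclasses such as HtmlResponse provide:

from scrapy.http import TextResponse
from scrapy.selector import Selector


def parse_item(self, response):
    # Binary or otherwise non-HTML responses arrive as plain Response
    # objects without a .text attribute, which is what triggers the
    # AttributeError inside Selector(response).
    if not isinstance(response, TextResponse):
        self.logger.warning('Skipping non-text response: %s', response.url)
        return []
    sel = Selector(response)
    # ... rest of the original parsing logic ...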
I am trying to run a scraper using Scrapy. I was able to do this in the past using the code below, but now I get a strange error.
_rules = (Rule(LinkExtractor(restrict_xpaths=(xpath_str)), follow=True,
               callback='parse_url'),)

def parse_url(self, response):
    print response.url
    ...
Basically what I get back when I run it is:
Traceback (most recent call last):
File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "/usr/lib/pymodules/python2.7/scrapy/spidermiddlewares/offsite.py", line 28, in process_spider_output
for x in result:
File "/usr/lib/pymodules/python2.7/scrapy/spidermiddlewares/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/lib/pymodules/python2.7/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/spidermiddlewares/depth.py", line 54, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/spiders/crawl.py", line 67, in _parse_response
cb_res = callback(response, **cb_kwargs) or ()
TypeError: 'str' object is not callable
Any ideas why this happens? I have really similar code in another scraper, and that one works.
Here is the full code:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..model import Properties

class TestScraper(CrawlSpider):
    name = "test"
    start_urls = [Properties.start_url]

    _rules = (Rule(LinkExtractor(restrict_xpaths=(Properties.xpath)), follow=True, callback='parse_url'),)

    def parse_url(self, response):
        print response.url
Change callback='parse_url' to callback=self.parse_url.
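Since self is not available at class-definition time, one way to apply that change (my reading of the answer, not code from the original post) is to build the rule inside __init__ so that self.parse_url is already a bound method:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..model import Properties


class TestScraper(CrawlSpider):
    name = "test"
    start_urls = [Properties.start_url]

    def __init__(self, *args, **kwargs):
        super(TestScraper, self).__init__(*args, **kwargs)
        # Defined here, after the base __init__, so the callback is the
        # bound method self.parse_url rather than a plain string.
        self._rules = (
            Rule(LinkExtractor(restrict_xpaths=(Properties.xpath,)),
                 follow=True, callback=self.parse_url),
        )

    def parse_url(self, response):
        self.logger.info(response.url)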