I am trying to run a scraper using Scrapy. I was able to do so in the past using this code, but now I get a strange error.
_rules = (
    Rule(
        LinkExtractor(restrict_xpaths=(xpath_str)),
        follow=True,
        callback='parse_url',
    ),
)

def parse_url(self, response):
    """Print the URL of the response handed to this callback."""
    print(response.url)
    # ... (rest of the callback elided in the original post)
Basically what I get back when I run it is:
Traceback (most recent call last):
File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "/usr/lib/pymodules/python2.7/scrapy/spidermiddlewares/offsite.py", line 28, in process_spider_output
for x in result:
File "/usr/lib/pymodules/python2.7/scrapy/spidermiddlewares/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/lib/pymodules/python2.7/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/spidermiddlewares/depth.py", line 54, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/spiders/crawl.py", line 67, in _parse_response
cb_res = callback(response, **cb_kwargs) or ()
TypeError: 'str' object is not callable
Any ideas why this happens? I have very similar code in another scraper, and it works.
Here is the full code:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..model import Properties
class TestScraper(CrawlSpider):
    """CrawlSpider configured with a single link-extraction rule."""

    name = "test"
    start_urls = [Properties.start_url]
    # NOTE(review): Scrapy's CrawlSpider documents `rules` (no leading
    # underscore) as the attribute to define. It is kept as `_rules` here
    # because that is what was posted -- confirm whether this is related to
    # the TypeError in the traceback above.
    _rules = (
        Rule(
            LinkExtractor(restrict_xpaths=(Properties.xpath)),
            follow=True,
            callback='parse_url',
        ),
    )

    def parse_url(self, response):
        """Print the URL of the response handed to this callback."""
        print(response.url)
Change callback='parse_url' to callback=self.parse_url.
Related
I'm getting this error when I run my scraper:
2022-09-19 23:17:00 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.justforsport.com.ar/top-mujer-reebok-ts-ubf-seamless-rojo/p> (referer: https://www.justforsport.com.ar/mujer?page=7)
Traceback (most recent call last):
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\defer.py", line 120, in iter_errback
yield next(it)
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
return next(self.data)
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
return next(self.data)
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 342, in <genexpr>
return (_set_referer(r) for r in result or ())
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 40, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "c:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\just_for_sport\just_for_sport\spiders\jfs_mujer.py", line 41, in parse_article_detail
precio0=response.css('span.vtex-product-price-1-x-currencyContainer.vtex-product-price-1-x-currencyContainer--product')[0]
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\parsel\selector.py", line 70, in __getitem__
o = super(SelectorList, self).__getitem__(pos)
IndexError: list index out of range
I am trying to understand what it means, but I can't find the problem. The link works fine, but the data is not collected.
My script looks like this:
import scrapy
from scrapy_splash import SplashRequest
from concurrent.futures import process
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os
# Remove the CSV left over from a previous run (presumably so the feed
# configured further down starts from an empty file). Attempting the removal
# directly avoids the gap between an existence check and the delete.
try:
    os.remove('jfs_mujer.csv')
except FileNotFoundError:
    print("The file does not exist!")
else:
    print("The file has been deleted successfully")
class JfsSpider_mujer(scrapy.Spider):
    """Spider for the "mujer" listing of justforsport.com.ar.

    Listing and product pages are requested through ``SplashRequest``.
    """

    name = 'jfs_mujer'
    start_urls = ["https://www.justforsport.com.ar/mujer?page=1"]

    def parse(self, response):
        """Request listing pages 1 to 39 and hand each one to ``parse_links``."""
        # total_products=int(int(response.css('div.vtex-search-result-3-x-totalProducts--layout.pv5.ph9.bn-ns.bt-s.b--muted-5.tc-s.tl.t-action--small span::text').get())/32) + 2
        for count in range(1, 40):
            yield SplashRequest(
                url=f'https://www.justforsport.com.ar/mujer?page={count}',
                callback=self.parse_links,
                meta={'splash': {'endpoint': 'execute', 'args': {'wait': 0.5}}},
            )

    # Extracts the product links from each listing page of the section.
    def parse_links(self, response):
        """Request every product link found on a listing page."""
        links = response.css('a.vtex-product-summary-2-x-clearLink.vtex-product-summary-2-x-clearLink--shelf-product.h-100.flex.flex-column::attr(href)').getall()
        for link in links:
            yield SplashRequest(
                response.urljoin('https://www.justforsport.com.ar' + link),
                self.parse_article_detail,
                meta={'splash': {'endpoint': 'execute', 'args': {'wait': 0.5}}},
            )

    def parse_article_detail(self, response):
        """Yield one item (store, SKU, name, price, link, date) for a product page."""
        # NOTE: indexing with [0] raises IndexError when the selector matches
        # nothing; this is the line reported in the traceback above.
        precio0 = response.css('span.vtex-product-price-1-x-currencyContainer.vtex-product-price-1-x-currencyContainer--product')[0]
        yield {
            'Casa': 'Just_For_Sports',
            'Sku': response.css('span.vtex-product-identifier-0-x-product-identifier__value::text').get(),
            'Name': response.css('span.vtex-store-components-3-x-productBrand::text').get(),
            # The integer part of the price is split over several spans; join them.
            'precio': ''.join(precio0.css('span.vtex-product-price-1-x-currencyInteger.vtex-product-price-1-x-currencyInteger--product::text').getall()),
            'Link': response.url,
            'Date': datetime.today().strftime('%Y-%m-%d'),
        }
# Run the spider from this script, exporting the scraped items to a CSV feed.
process = CrawlerProcess(
    settings={
        # Feed export: destination file, format and encoding.
        'FEED_URI': 'jfs_mujer.csv',
        'FEED_FORMAT': 'csv',
        'FEED_EXPORT_ENCODING': 'utf-8',
        # Request concurrency and throttling.
        'CONCURRENT_REQUESTS': 3,
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 3,
        'DOWNLOAD_DELAY': 24,
        #'AUTOTHROTTLE_MAX_DELAY' : 12,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
    }
)
process.crawl(JfsSpider_mujer)
process.start()
What's wrong with the script? Or is it something about the settings? I think it has something to do with the way I join the prices, but out of 770 products it works fine for almost 660... I don't understand. Thanks for your help!
Your error message means that your CSS selector doesn't find anything.
You can try the XPath below to get the price:
price = response.xpath('//meta[@property="product:price:amount"]/@content').get()
I have a .py scraper. When it runs it works fine, but it is not getting 100% of the data. I'm getting a lot of errors like this:
2022-05-05 20:53:39 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.justforsport.com.ar/buzo-hombre-361-degrees-y2201my002a-urban-1-gris/p> (referer: https://www.justforsport.com.ar/hombre?page=3)
Traceback (most recent call last):
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\defer.py", line 120, in iter_errback
yield next(it)
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
return next(self.data)
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
return next(self.data)
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 342, in <genexpr>
return (_set_referer(r) for r in result or ())
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 40, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "c:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\just_for_sport\just_for_sport\spiders\jfs_hombre.py", line 41, in parse_article_detail
precio0=response.css('span.vtex-product-price-1-x-currencyContainer.vtex-product-price-1-x-currencyContainer--product')[0]
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\parsel\selector.py", line 70, in __getitem__
o = super(SelectorList, self).__getitem__(pos)
IndexError: list index out of range
this is my script:
import scrapy
from scrapy_splash import SplashRequest
from concurrent.futures import process
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os
# Remove the CSV left over from a previous run (presumably so the feed
# configured further down starts from an empty file). Attempting the removal
# directly avoids the gap between an existence check and the delete.
try:
    os.remove('jfs_hombre.csv')
except FileNotFoundError:
    print("The file does not exist!")
else:
    print("The file has been deleted successfully")
class JfsSpider_hombre(scrapy.Spider):
    """Spider for the "hombre" listing of justforsport.com.ar.

    Listing and product pages are requested through ``SplashRequest``.
    """

    name = 'jfs_hombre'
    start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]

    def parse(self, response):
        """Request the listing pages and hand each one to ``parse_links``.

        The upper bound of the page range is derived from the product count
        shown on the first page, divided by 27.
        """
        total_products = int(int(response.css('div.vtex-search-result-3-x-totalProducts--layout.pv5.ph9.bn-ns.bt-s.b--muted-5.tc-s.tl.t-action--small span::text').get()) / 27) + 1
        for count in range(1, total_products):
            yield SplashRequest(
                url=f'https://www.justforsport.com.ar/hombre?page={count}',
                callback=self.parse_links,
            )

    def parse_links(self, response):
        """Request every product link found on a listing page."""
        links = response.css('a.vtex-product-summary-2-x-clearLink.vtex-product-summary-2-x-clearLink--shelf-product.h-100.flex.flex-column::attr(href)').getall()
        for link in links:
            yield SplashRequest(
                response.urljoin('https://www.justforsport.com.ar' + link),
                self.parse_article_detail,
            )

    def parse_article_detail(self, response):
        """Yield one item (store, SKU, name, price, link, date) for a product page."""
        # NOTE: indexing with [0] raises IndexError when the selector matches
        # nothing; this is the line reported in the traceback above.
        precio0 = response.css('span.vtex-product-price-1-x-currencyContainer.vtex-product-price-1-x-currencyContainer--product')[0]
        yield {
            'Casa': 'Just_For_Sports',
            'Sku': response.css('span.vtex-product-identifier-0-x-product-identifier__value::text').get(),
            'Name': response.css('span.vtex-store-components-3-x-productBrand::text').get(),
            # The integer part of the price is split over several spans; join them.
            'precio': ''.join(precio0.css('span.vtex-product-price-1-x-currencyInteger.vtex-product-price-1-x-currencyInteger--product::text').getall()),
            'Link': response.url,
            'Date': datetime.today().strftime('%Y-%m-%d'),
        }
# Run the spider from this script, exporting the scraped items to a CSV feed.
process = CrawlerProcess(
    settings={
        # Feed export: destination file, format and encoding.
        'FEED_URI': 'jfs_hombre.csv',
        'FEED_FORMAT': 'csv',
        'FEED_EXPORT_ENCODING': 'utf-8',
        # Request concurrency and throttling.
        'CONCURRENT_REQUESTS': 16,
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_MAX_DELAY': 2,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
    }
)
process.crawl(JfsSpider_hombre)
process.start()
I don't understand what the error is about. Why do I sometimes get 100% of the info and sometimes get these messages? Is it something related to the script, the user agent, or the moment when the process runs?
Thanks in advance!
The data is also generated from API calls that return a JSON response via the GET method, so you can grab whatever data points you want in the easiest and fastest way. Below is an example of a working solution.
import scrapy
from scrapy.crawler import CrawlerProcess
class JfsSpider_hombre(scrapy.Spider):
    """Spider that reads product names from the store's JSON search endpoint."""

    name = 'jfs_hombre'
    #start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]

    def start_requests(self):
        """Issue the single GET request against the product-search endpoint."""
        yield scrapy.Request(
            url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D',
            callback=self.parse,
            method="GET"
        )

    def parse(self, response):
        """Yield the ``productName`` of each product in the JSON response."""
        resp = response.json()
        #print(resp)
        # NOTE(review): this loop issues no further requests; it only rewrites
        # resp['recordsFiltered'] and re-yields the products of the single
        # response on each of its 18 iterations -- confirm that this is the
        # intended way to page through the results.
        for item in range(0, 576, 32):
            resp['recordsFiltered'] = item
            for result in resp['data']['productSearch']['products']:
                yield {
                    'productName': result['productName']
                }
if __name__ == "__main__":
    process = CrawlerProcess()
    # crawl() must be told which spider class to run.
    process.crawl(JfsSpider_hombre)
    process.start()
Output:
'downloader/response_status_count/200': 1,
'item_scraped_count': 576,
I am trying to scrape a website to learn a little more about how Scrapy works. I have a little experience with the packages requests and bs4 (BeautifulSoup). I am working in a miniconda3 environment on my Ubuntu 20.04.1 LTS machine. I use Python 3.7.
I have created an item named 'PostscrapeItem' which has only one attribute: full_text = scrapy.Field(). I have not touched the structure of the project that has been automatically created by scrapy.
I have made a spider which is only supposed to find occurrences of an html tag ('em') on this webpage: https://blog.scrapinghub.com/page/1/
Here is the code of my spider:
import scrapy
from bs4 import BeautifulSoup
from postscrape.items import PostscrapeItem
class PostSpider(scrapy.Spider):
    """Spider that collects the text of every ``<em>`` tag on the start page."""

    name = "posts"
    start_urls = [
        'https://blog.scrapinghub.com/page/1/'
    ]

    def parse(self, response):
        """Concatenate the text of all ``<em>`` tags into ``item['full_text']``."""
        so = BeautifulSoup(response.text, 'html.parser')
        item = PostscrapeItem()
        if so.find('em'):
            concatenated = ""
            text_samples = so.find_all('em')
            for t_s in text_samples:
                concatenated += t_s.text
            item['full_text'] = concatenated
        # NOTE: this returns the PostscrapeItem class rather than the `item`
        # instance built above; it is the line the question below is about.
        return PostscrapeItem
The problem is that I get an error when I run this code with 'scrapy crawl posts' in my terminal: "TypeError: 'ItemMeta' object is not iterable". As far as I can tell, the only ItemMeta present in my program is the object PostscrapeItem. It seems to me that I am not iterating over this object in my code, which is why I am asking.
Here is the complete error message:
Traceback (most recent call last):
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/utils/defer.py",
line 117, in iter_errback
yield next(it)
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/utils/python.py", line 345, in __next__
return next(self.data)
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/utils/python.py", line 345, in __next__
return next(self.data)
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/spidermiddlewares/referer.py", line 338, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/luc/.local/lib/python3.7/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
TypeError: 'ItemMeta' object is not iterable
Thank you in advance and let me know how to improve the clarity and the quality of my questions.
Luc
You're not returning an item, you're returning the item class object.
Scrapy tries iterating it when it's returned from the spider, so you get your TypeError.
Simply correcting the last line to return item should fix your code.
As a side note, scrapy has its own parsing utilities, so there's no need to import and use BS.
As per @stranac's answer, I have corrected the full code and it works.
import scrapy
from bs4 import BeautifulSoup
class PostscrapeItem(scrapy.Item):
    """Item with a single field holding the concatenated text."""

    full_text = scrapy.Field()
class PostSpider(scrapy.Spider):
    """Spider that collects the text of every ``<em>`` tag on the start page."""

    name = "posts"
    start_urls = [
        'https://blog.scrapinghub.com/page/1/'
    ]

    def parse(self, response):
        """Return an item whose ``full_text`` joins the text of all ``<em>`` tags.

        ``full_text`` is left unset when the page contains no ``<em>`` tag.
        """
        so = BeautifulSoup(response.text, 'html.parser')
        item = PostscrapeItem()
        text_samples = so.find_all('em')
        if text_samples:
            item['full_text'] = "".join(t_s.text for t_s in text_samples)
        return item
I am trying to use python scrapy tool for extracting the information from the bitcointalk.org website about the users and the public keys that they post in the forum for donation.
I found this piece of code online and changed it so that it runs on my desired website, but I am running into the error "AttributeError: 'Response' object has no attribute 'text'".
Below is the code for reference
class BitcointalkSpider(CrawlSpider):
    """Crawl bitcointalk.org and collect the addresses found in post signatures."""

    name = "bitcointalk"
    allowed_domains = ["bitcointalk.org"]
    start_urls = ["https://bitcointalk.org/index.php"]
    rules = (
        Rule(
            SgmlLinkExtractor(
                # Raw strings keep the regex escapes (\. and \?) intact.
                deny=[
                    r'https://bitcointalk\.org/index\.php\?action=ignore',
                    r'https://bitcointalk\.org/index\.php\?action=profile',
                ],
                allow_domains='bitcointalk.org',
            ),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Return one item per post that has a member name and a valid address.

        An address is kept only when ``bcv.check_bc`` accepts it.
        """
        sel = Selector(response)
        sites = sel.xpath('//tr[contains(@class, "td_headerandpost")]')
        items = []
        for site in sites:
            item = BitcoinItem()
            item["membername"] = site.xpath('.//td[@class="poster_info"]/b/a/text()').extract()
            addresses = site.xpath('.//div[contains(@class, "signature")]/text()').re(r'(1[1-9A-HJ-NP-Za-km-z]{26,33})')
            if item["membername"] and addresses:
                addr_list = {addr for addr in addresses if bcv.check_bc(addr)}
                item["address"] = addr_list
                if addr_list:
                    items.append(item)
        return items
and the error that I am receiving is :
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "/usr/local/lib/python2.7/dist-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "/usr/local/lib/python2.7/dist-packages/scrapy/spidermiddlewares/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/local/lib/python2.7/dist-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/local/lib/python2.7/dist-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/local/lib/python2.7/dist-packages/scrapy/spiders/crawl.py", line 72, in _parse_response
cb_res = callback(response, **cb_kwargs) or ()
File "/home/sunil/Desktop/Nikhil/Thesis/mit_bitcoin/bitcoin/spiders/bitcointalk_spider.py", line 24, in parse_item
sel = Selector(response)
File "/usr/local/lib/python2.7/dist-packages/scrapy/selector/unified.py", line 63, in __init__
text = response.text
AttributeError: 'Response' object has no attribute 'text'
Something is likely wrong with one of your requests, since it seems that the response from at least one URL you're crawling is not properly formatted. Either the request itself failed, or you're not making requests appropriately.
See here for the source of your error.
And see here for a clue as to why your request may be poorly formatted. It looks like Selector expects an HtmlResponse object, or a similar type.
I have this code in python:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from site_auto_1.items import AutoItem
class AutoSpider(CrawlSpider):
    """CrawlSpider for an autowereld.nl search-result page."""

    name = "auto"
    # NOTE(review): Scrapy documents `allowed_domains` as the attribute for
    # restricting the crawl; `allowed_host` is kept as posted -- confirm.
    allowed_host = ["autowereld.nl"]
    url = "http://www.autowereld.nl/"
    start_urls = [
        "http://www.autowereld.nl/zoeken.html?mrk=187&mdl%5B%5D=463&prvan=500&prtot=3000&brstf%5B%5D=2&bjvan=2000&bjtot=2004&geoloc=&strl=&trns%5B%5D=&kmvan=&kmtot=&klr%5B%5D=&q=",
    ]
    path = '//*[@id="content-inhoud"]/div/div/table/tbody/tr/td/h3/a/@href'
    rules = (
        Rule(
            # NOTE: this XPath ends in the @href attribute; it is the part of
            # the spider that the question below is about.
            LinkExtractor(restrict_xpaths='//*[@id="content-inhoud"]/div/div/table/tbody/tr/td/h3/a/@href'),
            callback='parse_item',
        ),
    )

    def parse_item(self, response):
        """Print the URL of each page reached through the rule."""
        print("found item : %s" % response.url)
and it gives me this error:
Traceback (most recent call last):
File "/usr/lib/python2.7/dist-packages/twisted/internet/base.py", line 824, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 638, in _tick
taskObj._oneWorkUnit()
File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 484, in _oneWorkUnit
result = next(self._iterator)
File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 57, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 96, in iter_errback
yield next(it)
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/offsite.py", line 26, in process_spider_output
for x in result:
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spiders/crawl.py", line 73, in _parse_response
for request_or_item in self._requests_to_follow(response):
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spiders/crawl.py", line 52, in _requests_to_follow
links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/lxmlhtml.py", line 107, in extract_links
links = self._extract_links(doc, response.url, response.encoding, base_url)
File "/usr/lib/pymodules/python2.7/scrapy/linkextractor.py", line 94, in _extract_links
return self.link_extractor._extract_links(*args, **kwargs)
File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/lxmlhtml.py", line 50, in _extract_links
for el, attr, attr_val in self._iter_links(selector._root):
File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/lxmlhtml.py", line 38, in _iter_links
for el in document.iter(etree.Element):
exceptions.AttributeError: 'str' object has no attribute 'iter'
I don't know what I'm doing wrong, so I started commenting out code to see which part throws the error, and I figured out that it is this part:
rules = (
    Rule(
        LinkExtractor(restrict_xpaths='//*[@id="content-inhoud"]/div/div/table/tbody/tr/td/h3/a/@href'),
        callback='parse_item',
    ),
)
But I don't know what I'm doing wrong. I tried making restrict_xpaths a list, a tuple... I'm new to Scrapy and I can't figure it out.
The XPath configured inside restrict_xpaths should point to an element, not an attribute.
Replace:
//*[@id="content-inhoud"]/div/div/table/tbody/tr/td/h3/a/@href
with:
//*[@id="content-inhoud"]/div/div/table/tbody/tr/td/h3/a