I have this code in Python:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from site_auto_1.items import AutoItem


class AutoSpider(CrawlSpider):
    name = "auto"
    allowed_host = ["autowereld.nl"]
    url = "http://www.autowereld.nl/"
    start_urls = [
        "http://www.autowereld.nl/zoeken.html?mrk=187&mdl%5B%5D=463&prvan=500&prtot=3000&brstf%5B%5D=2&bjvan=2000&bjtot=2004&geoloc=&strl=&trns%5B%5D=&kmvan=&kmtot=&klr%5B%5D=&q=",
    ]
    path = '//*[@id="content-inhoud"]/div/div/table/tbody/tr/td/h3/a/@href'

    rules = (
        Rule(
            LinkExtractor(restrict_xpaths='//*[@id="content-inhoud"]/div/div/table/tbody/tr/td/h3/a/@href'),
            callback='parse_item',
        ),
    )

    def parse_item(self, response):
        print "found item:", response.url
and it gives me this error:
Traceback (most recent call last):
File "/usr/lib/python2.7/dist-packages/twisted/internet/base.py", line 824, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 638, in _tick
taskObj._oneWorkUnit()
File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 484, in _oneWorkUnit
result = next(self._iterator)
File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 57, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 96, in iter_errback
yield next(it)
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/offsite.py", line 26, in process_spider_output
for x in result:
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spiders/crawl.py", line 73, in _parse_response
for request_or_item in self._requests_to_follow(response):
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spiders/crawl.py", line 52, in _requests_to_follow
links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/lxmlhtml.py", line 107, in extract_links
links = self._extract_links(doc, response.url, response.encoding, base_url)
File "/usr/lib/pymodules/python2.7/scrapy/linkextractor.py", line 94, in _extract_links
return self.link_extractor._extract_links(*args, **kwargs)
File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/lxmlhtml.py", line 50, in _extract_links
for el, attr, attr_val in self._iter_links(selector._root):
File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/lxmlhtml.py", line 38, in _iter_links
for el in document.iter(etree.Element):
exceptions.AttributeError: 'str' object has no attribute 'iter'
I don't know what I'm doing wrong, so I started commenting out code to see which part throws the error, and I figured out that it is this part:
rules = (
    Rule(
        LinkExtractor(restrict_xpaths='//*[@id="content-inhoud"]/div/div/table/tbody/tr/td/h3/a/@href'),
        callback='parse_item',
    ),
)
But I don't know what I'm doing wrong. I tried making restrict_xpaths a list, a tuple... I'm new to Scrapy and I can't figure it out.
The XPath configured inside restrict_xpaths should point to an element, not an attribute.
Replace:
//*[@id="content-inhoud"]/div/div/table/tbody/tr/td/h3/a/@href
with:
//*[@id="content-inhoud"]/div/div/table/tbody/tr/td/h3/a
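For reference, a minimal sketch of the adjusted rule, keeping the question's spider; the LinkExtractor reads the href attributes of the matched <a> elements on its own:

    rules = (
        Rule(
            # restrict_xpaths must select elements; the extractor pulls href itself
            LinkExtractor(restrict_xpaths='//*[@id="content-inhoud"]/div/div/table/tbody/tr/td/h3/a'),
            callback='parse_item',
        ),
    )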
I'm getting this error when I run my scraper:
2022-09-19 23:17:00 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.justforsport.com.ar/top-mujer-reebok-ts-ubf-seamless-rojo/p> (referer: https://www.justforsport.com.ar/mujer?page=7)
Traceback (most recent call last):
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\defer.py", line 120, in iter_errback
yield next(it)
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
return next(self.data)
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
return next(self.data)
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 342, in <genexpr>
return (_set_referer(r) for r in result or ())
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 40, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "c:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\just_for_sport\just_for_sport\spiders\jfs_mujer.py", line 41, in parse_article_detail
precio0=response.css('span.vtex-product-price-1-x-currencyContainer.vtex-product-price-1-x-currencyContainer--product')[0]
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\parsel\selector.py", line 70, in __getitem__
o = super(SelectorList, self).__getitem__(pos)
IndexError: list index out of range
I'm trying to understand what it means, but I can't find the problem. The link works fine, but the data is not collected.
My script looks like this:
import scrapy
from scrapy_splash import SplashRequest
from concurrent.futures import process
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os

if os.path.exists('jfs_mujer.csv'):
    os.remove('jfs_mujer.csv')
    print("The file has been deleted successfully")
else:
    print("The file does not exist!")


class JfsSpider_mujer(scrapy.Spider):
    name = 'jfs_mujer'
    start_urls = ["https://www.justforsport.com.ar/mujer?page=1"]

    def parse(self, response):
        # total_products=int(int(response.css('div.vtex-search-result-3-x-totalProducts--layout.pv5.ph9.bn-ns.bt-s.b--muted-5.tc-s.tl.t-action--small span::text').get())/32) + 2
        for count in range(1, 40):
            yield SplashRequest(url=f'https://www.justforsport.com.ar/mujer?page={count}',
                                callback=self.parse_links, meta={'splash': {'endpoint': 'execute', 'args': {'wait': 0.5}}})

    # Extract the links from each page of the section
    def parse_links(self, response):
        links = response.css('a.vtex-product-summary-2-x-clearLink.vtex-product-summary-2-x-clearLink--shelf-product.h-100.flex.flex-column::attr(href)').getall()
        for link in links:
            yield SplashRequest(response.urljoin('https://www.justforsport.com.ar' + link), self.parse_article_detail, meta={'splash': {'endpoint': 'execute', 'args': {'wait': 0.5}}})

    def parse_article_detail(self, response):
        precio0 = response.css('span.vtex-product-price-1-x-currencyContainer.vtex-product-price-1-x-currencyContainer--product')[0]
        yield {
            'Casa': 'Just_For_Sports',
            'Sku': response.css('span.vtex-product-identifier-0-x-product-identifier__value::text').get(),
            'Name': response.css('span.vtex-store-components-3-x-productBrand::text').get(),
            'precio': ''.join(precio0.css('span.vtex-product-price-1-x-currencyInteger.vtex-product-price-1-x-currencyInteger--product::text').getall()),
            'Link': response.url,
            'Date': datetime.today().strftime('%Y-%m-%d')
        }


process = CrawlerProcess(
    settings={
        'FEED_URI': 'jfs_mujer.csv',
        'FEED_FORMAT': 'csv',
        'FEED_EXPORT_ENCODING': 'utf-8',
        'CONCURRENT_REQUESTS': 3,
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 3,
        'DOWNLOAD_DELAY': 24,
        # 'AUTOTHROTTLE_MAX_DELAY': 12,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)'
    })

process.crawl(JfsSpider_mujer)
process.start()
What's wrong with the script? Or is it something about the settings? I think it has something to do with the way I join the prices, but out of 770 products it works fine for almost 660... I don't understand. Thanks for your help!
Your error message means that your CSS selector doesn't find anything.
You can try the following XPath to get the price instead:
price = response.xpath('//meta[@property="product:price:amount"]/@content').get()
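A minimal sketch of parse_article_detail using that XPath, keeping the question's field names; .get() returns None instead of raising IndexError when a page has no price node (it assumes the meta tag is present on product pages, as the answer suggests):

    def parse_article_detail(self, response):
        # None (not an exception) when the price is missing from the page
        price = response.xpath('//meta[@property="product:price:amount"]/@content').get()
        yield {
            'Casa': 'Just_For_Sports',
            'Sku': response.css('span.vtex-product-identifier-0-x-product-identifier__value::text').get(),
            'Name': response.css('span.vtex-store-components-3-x-productBrand::text').get(),
            'precio': price,
            'Link': response.url,
            'Date': datetime.today().strftime('%Y-%m-%d')
        }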
I have a .py scraper, and when it runs it works fine, but it does not get 100% of the data. I'm getting a lot of errors like this:
2022-05-05 20:53:39 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.justforsport.com.ar/buzo-hombre-361-degrees-y2201my002a-urban-1-gris/p> (referer: https://www.justforsport.com.ar/hombre?page=3)
Traceback (most recent call last):
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\defer.py", line 120, in iter_errback
yield next(it)
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
return next(self.data)
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
return next(self.data)
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 342, in <genexpr>
return (_set_referer(r) for r in result or ())
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 40, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "c:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\just_for_sport\just_for_sport\spiders\jfs_hombre.py", line 41, in parse_article_detail
precio0=response.css('span.vtex-product-price-1-x-currencyContainer.vtex-product-price-1-x-currencyContainer--product')[0]
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\parsel\selector.py", line 70, in __getitem__
o = super(SelectorList, self).__getitem__(pos)
IndexError: list index out of range
This is my script:
import scrapy
from scrapy_splash import SplashRequest
from concurrent.futures import process
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os

if os.path.exists('jfs_hombre.csv'):
    os.remove('jfs_hombre.csv')
    print("The file has been deleted successfully")
else:
    print("The file does not exist!")


class JfsSpider_hombre(scrapy.Spider):
    name = 'jfs_hombre'
    start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]

    def parse(self, response):
        total_products = int(int(response.css('div.vtex-search-result-3-x-totalProducts--layout.pv5.ph9.bn-ns.bt-s.b--muted-5.tc-s.tl.t-action--small span::text').get()) / 27) + 1
        for count in range(1, total_products):
            yield SplashRequest(url=f'https://www.justforsport.com.ar/hombre?page={count}',
                                callback=self.parse_links)

    def parse_links(self, response):
        links = response.css('a.vtex-product-summary-2-x-clearLink.vtex-product-summary-2-x-clearLink--shelf-product.h-100.flex.flex-column::attr(href)').getall()
        for link in links:
            yield SplashRequest(response.urljoin('https://www.justforsport.com.ar' + link), self.parse_article_detail)

    def parse_article_detail(self, response):
        precio0 = response.css('span.vtex-product-price-1-x-currencyContainer.vtex-product-price-1-x-currencyContainer--product')[0]
        yield {
            'Casa': 'Just_For_Sports',
            'Sku': response.css('span.vtex-product-identifier-0-x-product-identifier__value::text').get(),
            'Name': response.css('span.vtex-store-components-3-x-productBrand::text').get(),
            'precio': ''.join(precio0.css('span.vtex-product-price-1-x-currencyInteger.vtex-product-price-1-x-currencyInteger--product::text').getall()),
            'Link': response.url,
            'Date': datetime.today().strftime('%Y-%m-%d')
        }


process = CrawlerProcess(
    settings={
        'FEED_URI': 'jfs_hombre.csv',
        'FEED_FORMAT': 'csv',
        'FEED_EXPORT_ENCODING': 'utf-8',
        'CONCURRENT_REQUESTS': 16,
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_MAX_DELAY': 2,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)'
    })

process.crawl(JfsSpider_hombre)
process.start()
I don't understand what the error is about. Why do I sometimes get 100% of the info and sometimes get these messages? Is it something related to the script, the user agent, or the moment when the process runs?
Thanks in advance!
The data is also available from an API call that returns a JSON response to a GET request, so you can grab every data point you want in an easier and much faster way. Below is an example of a working solution.
import scrapy
from scrapy.crawler import CrawlerProcess


class JfsSpider_hombre(scrapy.Spider):
    name = 'jfs_hombre'
    # start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]

    def start_requests(self):
        yield scrapy.Request(
            url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D',
            callback=self.parse,
            method="GET"
        )

    def parse(self, response):
        resp = response.json()
        # print(resp)
        for item in range(0, 576, 32):
            resp['recordsFiltered'] = item
            for result in resp['data']['productSearch']['products']:
                yield {
                    'productName': result['productName']
                }


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(JfsSpider_hombre)
    process.start()
Output:
'downloader/response_status_count/200': 1,
'item_scraped_count': 576,
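Note that the URL above covers only one window of results (the base64 variables blob encodes "from":64,"to":95). If you need other windows, here is a hedged sketch for rebuilding that blob with a different offset; it assumes the parameter is plain base64-encoded JSON with from/to keys, as it appears to be in the URL above:

    import base64
    import json

    def shift_window(encoded_variables, offset, page_size=32):
        # Decode the base64 "variables" parameter, move the from/to window,
        # and re-encode it so it can be substituted back into the request URL.
        data = json.loads(base64.b64decode(encoded_variables))
        data["from"] = offset
        data["to"] = offset + page_size - 1
        return base64.b64encode(json.dumps(data).encode()).decode()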
This is my spider code:
spider.py
import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'moneycontrol'
    # allowed_domains = ['moneycontrol.com']
    start_urls = ['https://www.moneycontrol.com/india/stockpricequote/']

    def parse(self, response):
        stoke_link_list = response.css("table a::attr(href)").getall()
        if response.css("span.span_price_wrap::text").getall():  # value of this variable only present in first run
            stock_name = response.css("h1.pcstname::text").get()
            bse_price, nse_price = response.css("span.span_price_wrap::text").getall()
            print(stock_name + ' ' + bse_price + ' ' + nse_price)
        else:
            print('stock_name bse_price nse_price')
        for link in stoke_link_list:
            if link is not None:
                next_page = response.urljoin(link)
                # yield scrapy.Request(next_page, callback=self.parse)
                yield response.follow(next_page, callback=self.parse)
While running this I am getting a strange error. It gives an error while scraping one page, and when I run it again it gives the error on a different page (I mean, it may then succeed on the previous one).
Error:
2020-08-20 19:52:49 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.moneycontrol.com/mutual-funds/nav/motilal-oswal-midcap-30-fund-regular-plan/MMO025> (referer: https://www.moneycontrol.com/india/stockpricequote/pharmaceuticals/abbottindia/AI51)
Traceback (most recent call last):
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/utils/defer.py", line 120, in iter_errback
yield next(it)
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/utils/python.py", line 346, in __next__
return next(self.data)
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/utils/python.py", line 346, in __next__
return next(self.data)
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/spidermiddlewares/referer.py", line 340, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/moneycontrol/moneycontrol/spiders/my_spider.py", line 24, in parse
yield scrapy.Request(next_page, callback=self.parse)
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/http/request/__init__.py", line 25, in __init__
self._set_url(url)
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/http/request/__init__.py", line 69, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url: javascript:void(0);
Run 2:
2020-08-20 19:55:15 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.moneycontrol.com/mutual-funds/nav/dsp-equity-opportunities-fund-regular-plan/MDS011> (referer: https://www.moneycontrol.com/india/stockpricequote/pharmaceuticals/alkemlaboratories/AL05)
Traceback (most recent call last):
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/utils/defer.py", line 120, in iter_errback
yield next(it)
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/utils/python.py", line 346, in __next__
return next(self.data)
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/utils/python.py", line 346, in __next__
return next(self.data)
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/spidermiddlewares/referer.py", line 340, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/moneycontrol/moneycontrol/spiders/my_spider.py", line 24, in parse
yield scrapy.Request(next_page, callback=self.parse)
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/http/request/__init__.py", line 25, in __init__
self._set_url(url)
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/http/request/__init__.py", line 69, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url: javascript:void(0);
I looked at other Stack Overflow answers, like making start_urls a list, using follow, etc., but none of them solves my issue:
Missing scheme in request URL
The error 'Missing scheme in request url' means that the URL does not have an http:// or https:// prefix.
The problem happens because the page being scraped contains links with relative URLs.
For example, the link named 'Zee entertain' on the moneycontrol.com website has the
href value of "/india/stockpricequote/mediaentertainment/zeeentertainmententerprises/ZEE".
So when the spider tries to open this link directly, the 'Missing scheme' error is thrown.
How to fix the problem?
The 'missing scheme' problem can be fixed by prepending https://hostname to all relative URLs
(i.e., the links that do not begin with http:// or https://).
Code snippet to prepend https://hostname to relative URLs:
for link in stoke_link_list:
    if link is not None:
        if not link.startswith("https://moneycontrol.com/"):
            page_url = "https://moneycontrol.com/" + link
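Note that the tracebacks also show links like javascript:void(0);, which prepending a hostname will not fix. A minimal sketch (my own addition, reusing the question's loop) that skips those and lets Scrapy resolve the relative paths:

    for link in stoke_link_list:
        # Skip empty hrefs and javascript: pseudo-links that can never be fetched
        if not link or link.startswith("javascript:"):
            continue
        next_page = response.urljoin(link)  # resolves relative URLs against the current response
        yield response.follow(next_page, callback=self.parse)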
from scrapy import Spider
from scrapy.http import Request


class CourseSpider(Spider):
    name = 'course'
    allowed_domains = ['coursera.org']
    start_urls = ['https://coursera.org/about/partners']

    def parse(self, response):
        listings = response.xpath('//div[@class="rc-PartnerBox vertical-box"]')
        for listing in listings:
            title = listing.xpath('.//div[@class="partner-box-wrapper card-one-clicker flex-1"]/p').extract_first()
            relative_url = listing.xpath('.//a/@href').extract_first()
            absolute_url = response.urljoin(relative_url)
            yield Request(response.urljoin(relative_url), callback=self.parse_listing, meta={'title': title, 'absolute_url': absolute_url})

    def parse_listing(self, response):
        titles = response.meta.get('title')
        absolute_url = response.meta.get('absolute_url')
        titles_course = response.xpath('//div[@class="name headline-1-text"]/text()').extract()
        url_link = response.xpath('//div[@class="rc-Course"]/a/@href').extract()
        abs_url = response.urljoin(url_link)
        yield {'title': title,
               'titles': title,
               'absolute_url': absolute_url,
               'titles_course': titles_course,
               'abs_url': abs_url}
However, upon running the script through the cmd, I am getting errors. These errors suggest that I cannot mix str and non-str arguments, and I am confused about how to deal with this problem. Any help would be appreciated.
Traceback (most recent call last):
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\defer.py", line 117, in iter_errback
yield next(it)
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
return next(self.data)
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
return next(self.data)
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 338, in <genexpr>
return (_set_referer(r) for r in result or ())
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "C:\Users\Naman Jogani\Desktop\Udemy\udemy\spiders\course.py", line 28, in parse_listing
yield {'title':title,
NameError: name 'title' is not defined
2020-08-05 00:08:48 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.coursera.org/checkpoint> (referer: https://www.coursera.org/about/partners)
Traceback (most recent call last):
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\defer.py", line 117, in iter_errback
yield next(it)
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
return next(self.data)
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
return next(self.data)
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 338, in <genexpr>
return (_set_referer(r) for r in result or ())
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "C:\Users\Naman Jogani\Desktop\Udemy\udemy\spiders\course.py", line 26, in parse_listing
abs_url = response.urljoin(url_link)
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\http\response\text.py", line 80, in urljoin
return urljoin(get_base_url(self), url)
File "c:\users\naman jogani\anaconda3\lib\urllib\parse.py", line 504, in urljoin
base, url, _coerce_result = _coerce_args(base, url)
File "c:\users\naman jogani\anaconda3\lib\urllib\parse.py", line 120, in _coerce_args
raise TypeError("Cannot mix str and non-str arguments")
TypeError: Cannot mix str and non-str arguments
2020-08-05 00:08:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.coursera.org/casewesternreserve> (referer: https://www.coursera.org/about/partners)
2020-08-05 00:08:48 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.coursera.org/casewesternreserve> (referer: https://www.coursera.org/about/partners)
Traceback (most recent call last):
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\defer.py", line 117, in iter_errback
yield next(it)
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
return next(self.data)
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
return next(self.data)
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 338, in <genexpr>
return (_set_referer(r) for r in result or ())
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "C:\Users\Naman Jogani\Desktop\Udemy\udemy\spiders\course.py", line 26, in parse_listing
abs_url = response.urljoin(url_link)
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\http\response\text.py", line 80, in urljoin
return urljoin(get_base_url(self), url)
File "c:\users\naman jogani\anaconda3\lib\urllib\parse.py", line 504, in urljoin
base, url, _coerce_result = _coerce_args(base, url)
File "c:\users\naman jogani\anaconda3\lib\urllib\parse.py", line 120, in _coerce_args
raise TypeError("Cannot mix str and non-str arguments")
TypeError: Cannot mix str and non-str arguments
2020-08-05 00:08:48 [scrapy.core.engine] INFO: Closing spider (finished)
I tried adding the extract() function on the listings container to get rid of that error, since it was mentioned in a previous Stack Overflow question, but then my XPath does not produce the desired output.
You are looking for .extract_first() or its newer name .get(), because .extract() produces a list, which cannot be passed to .urljoin.
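A minimal sketch of that fix inside parse_listing, keeping the question's selector (if a page lists several courses, you would instead loop over .extract() and urljoin each string):

    url_link = response.xpath('//div[@class="rc-Course"]/a/@href').get()  # a single string, or None
    abs_url = response.urljoin(url_link) if url_link else None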
I am trying to run a scraper using Scrapy. I was able to do this in the past using this code, but now I get a strange error.
_rules = (Rule(LinkExtractor(restrict_xpaths=(xpath_str)), follow=True,
               callback='parse_url'),)

def parse_url(self, response):
    print response.url
    ...
Basically what I get back when I run it is:
Traceback (most recent call last):
File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "/usr/lib/pymodules/python2.7/scrapy/spidermiddlewares/offsite.py", line 28, in process_spider_output
for x in result:
File "/usr/lib/pymodules/python2.7/scrapy/spidermiddlewares/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/lib/pymodules/python2.7/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/spidermiddlewares/depth.py", line 54, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/spiders/crawl.py", line 67, in _parse_response
cb_res = callback(response, **cb_kwargs) or ()
TypeError: 'str' object is not callable
Any ideas why this happens? I have really similar code in another scraper, and that one works!
Here is the full code:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..model import Properties


class TestScraper(CrawlSpider):
    name = "test"
    start_urls = [Properties.start_url]
    _rules = (Rule(LinkExtractor(restrict_xpaths=(Properties.xpath)), follow=True, callback='parse_url'), )

    def parse_url(self, response):
        print response.url
Change callback='parse_url' to callback=self.parse_url.
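Alternatively (my own variant, not the answer above): define rules instead of _rules, so CrawlSpider resolves the string callback itself when it compiles the rules. A minimal sketch, keeping the question's Properties helper:

    rules = (
        Rule(LinkExtractor(restrict_xpaths=(Properties.xpath)),
             follow=True,
             callback='parse_url'),  # the string is resolved to the bound method by CrawlSpider
    )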