Scraper not getting total data - python

I have a Python scraper, and when it runs it works fine, but it is not getting 100% of the data. I'm getting a lot of errors like this:
2022-05-05 20:53:39 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.justforsport.com.ar/buzo-hombre-361-degrees-y2201my002a-urban-1-gris/p> (referer: https://www.justforsport.com.ar/hombre?page=3)
Traceback (most recent call last):
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\defer.py", line 120, in iter_errback
yield next(it)
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
return next(self.data)
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
return next(self.data)
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 342, in <genexpr>
return (_set_referer(r) for r in result or ())
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 40, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "c:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\just_for_sport\just_for_sport\spiders\jfs_hombre.py", line 41, in parse_article_detail
precio0=response.css('span.vtex-product-price-1-x-currencyContainer.vtex-product-price-1-x-currencyContainer--product')[0]
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\parsel\selector.py", line 70, in __getitem__
o = super(SelectorList, self).__getitem__(pos)
IndexError: list index out of range
This is my script:
import scrapy
from scrapy_splash import SplashRequest
from concurrent.futures import process
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os

if os.path.exists('jfs_hombre.csv'):
    os.remove('jfs_hombre.csv')
    print("The file has been deleted successfully")
else:
    print("The file does not exist!")

class JfsSpider_hombre(scrapy.Spider):
    name = 'jfs_hombre'
    start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]

    def parse(self, response):
        total_products = int(int(response.css('div.vtex-search-result-3-x-totalProducts--layout.pv5.ph9.bn-ns.bt-s.b--muted-5.tc-s.tl.t-action--small span::text').get()) / 27) + 1
        for count in range(1, total_products):
            yield SplashRequest(url=f'https://www.justforsport.com.ar/hombre?page={count}',
                                callback=self.parse_links)

    def parse_links(self, response):
        links = response.css('a.vtex-product-summary-2-x-clearLink.vtex-product-summary-2-x-clearLink--shelf-product.h-100.flex.flex-column::attr(href)').getall()
        for link in links:
            yield SplashRequest(response.urljoin('https://www.justforsport.com.ar' + link), self.parse_article_detail)

    def parse_article_detail(self, response):
        precio0 = response.css('span.vtex-product-price-1-x-currencyContainer.vtex-product-price-1-x-currencyContainer--product')[0]
        yield {
            'Casa': 'Just_For_Sports',
            'Sku': response.css('span.vtex-product-identifier-0-x-product-identifier__value::text').get(),
            'Name': response.css('span.vtex-store-components-3-x-productBrand::text').get(),
            'precio': ''.join(precio0.css('span.vtex-product-price-1-x-currencyInteger.vtex-product-price-1-x-currencyInteger--product::text').getall()),
            'Link': response.url,
            'Date': datetime.today().strftime('%Y-%m-%d')
        }

process = CrawlerProcess(
    settings={
        'FEED_URI': 'jfs_hombre.csv',
        'FEED_FORMAT': 'csv',
        'FEED_EXPORT_ENCODING': 'utf-8',
        'CONCURRENT_REQUESTS': 16,
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_MAX_DELAY': 2,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)'
    })
process.crawl(JfsSpider_hombre)
process.start()
I don't understand what the error is about. Why do I sometimes get 100% of the info and sometimes get these messages? Is it something related to the script, the user agent, or the moment when the process runs?
Thanks in advance!

The data is also available from an API call that returns a JSON response to a GET request, so you can grab every data point you want in the easiest and fastest way. Below is an example of a working solution.
import scrapy
from scrapy.crawler import CrawlerProcess

class JfsSpider_hombre(scrapy.Spider):
    name = 'jfs_hombre'
    #start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]

    def start_requests(self):
        yield scrapy.Request(
            url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D',
            callback=self.parse,
            method="GET"
        )

    def parse(self, response):
        resp = response.json()
        #print(resp)
        for item in range(0, 576, 32):
            resp['recordsFiltered'] = item
            for result in resp['data']['productSearch']['products']:
                yield {
                    'productName': result['productName']
                }

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(JfsSpider_hombre)
    process.start()
Output:
'downloader/response_status_count/200': 1,
'item_scraped_count': 576,
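For what it's worth, the long variables value inside extensions in that URL is just base64-encoded JSON, and it contains the "from"/"to" window of products being requested (64 to 95 in the request above). So the API can also be paginated directly by re-encoding that blob with a different window per request. A rough sketch (the search_url helper, the spider name and the 576 total are assumptions; everything else is copied from the URL above):

import base64
import json
from urllib.parse import urlencode

import scrapy
from scrapy.crawler import CrawlerProcess

GRAPHQL_URL = 'https://www.justforsport.com.ar/_v/segment/graphql/v1'
# Persisted-query hash copied from the request URL above.
QUERY_HASH = '6869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5'

def search_url(start, end):
    # Hypothetical helper: rebuilds the request URL with a new from/to window.
    variables = {
        "hideUnavailableItems": False,
        "skusFilter": "FIRST_AVAILABLE",
        "simulationBehavior": "default",
        "installmentCriteria": "MAX_WITHOUT_INTEREST",
        "productOriginVtex": False,
        "map": "c",
        "query": "hombre",
        "orderBy": "OrderByReleaseDateDESC",
        "from": start,
        "to": end,
        "selectedFacets": [{"key": "c", "value": "hombre"}],
        "operator": "and",
        "fuzzy": "0",
        "searchState": None,
        "facetsBehavior": "Static",
        "categoryTreeBehavior": "default",
        "withFacets": False,
    }
    extensions = {
        "persistedQuery": {
            "version": 1,
            "sha256Hash": QUERY_HASH,
            "sender": "vtex.store-resources@0.x",
            "provider": "vtex.search-graphql@0.x",
        },
        "variables": base64.b64encode(
            json.dumps(variables, separators=(',', ':')).encode()).decode(),
    }
    params = {
        'workspace': 'master',
        'maxAge': 'short',
        'appsEtag': 'remove',
        'domain': 'store',
        'locale': 'es-AR',
        'operationName': 'productSearchV3',
        'variables': '{}',
        'extensions': json.dumps(extensions, separators=(',', ':')),
    }
    return f'{GRAPHQL_URL}?{urlencode(params)}'

class JfsApiSpider(scrapy.Spider):
    name = 'jfs_hombre_api'

    def start_requests(self):
        # One request per window of 32 products; 576 is the total assumed above,
        # adjust it to the product count the API actually reports.
        for start in range(0, 576, 32):
            yield scrapy.Request(search_url(start, start + 31), callback=self.parse)

    def parse(self, response):
        for result in response.json()['data']['productSearch']['products']:
            yield {'productName': result['productName']}

if __name__ == '__main__':
    process = CrawlerProcess()
    process.crawl(JfsApiSpider)
    process.start()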

Related

scrapy splash gets part of data

I'm getting this error when I run my scraper:
2022-09-19 23:17:00 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.justforsport.com.ar/top-mujer-reebok-ts-ubf-seamless-rojo/p> (referer: https://www.justforsport.com.ar/mujer?page=7)
Traceback (most recent call last):
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\defer.py", line 120, in iter_errback
yield next(it)
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
return next(self.data)
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
return next(self.data)
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 342, in <genexpr>
return (_set_referer(r) for r in result or ())
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 40, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "c:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\just_for_sport\just_for_sport\spiders\jfs_mujer.py", line 41, in parse_article_detail
precio0=response.css('span.vtex-product-price-1-x-currencyContainer.vtex-product-price-1-x-currencyContainer--product')[0]
File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\parsel\selector.py", line 70, in __getitem__
o = super(SelectorList, self).__getitem__(pos)
IndexError: list index out of range
I try to understand what it means, but I can't find the problem. The link works fine, but the data is not collected.
My script looks like this:
import scrapy
from scrapy_splash import SplashRequest
from concurrent.futures import process
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os

if os.path.exists('jfs_mujer.csv'):
    os.remove('jfs_mujer.csv')
    print("The file has been deleted successfully")
else:
    print("The file does not exist!")

class JfsSpider_mujer(scrapy.Spider):
    name = 'jfs_mujer'
    start_urls = ["https://www.justforsport.com.ar/mujer?page=1"]

    def parse(self, response):
        # total_products = int(int(response.css('div.vtex-search-result-3-x-totalProducts--layout.pv5.ph9.bn-ns.bt-s.b--muted-5.tc-s.tl.t-action--small span::text').get()) / 32) + 2
        for count in range(1, 40):
            yield SplashRequest(url=f'https://www.justforsport.com.ar/mujer?page={count}',
                                callback=self.parse_links,
                                meta={'splash': {'endpoint': 'execute', 'args': {'wait': 0.5}}})

    # Extracts the links from each page of the section
    def parse_links(self, response):
        links = response.css('a.vtex-product-summary-2-x-clearLink.vtex-product-summary-2-x-clearLink--shelf-product.h-100.flex.flex-column::attr(href)').getall()
        for link in links:
            yield SplashRequest(response.urljoin('https://www.justforsport.com.ar' + link),
                                self.parse_article_detail,
                                meta={'splash': {'endpoint': 'execute', 'args': {'wait': 0.5}}})

    def parse_article_detail(self, response):
        precio0 = response.css('span.vtex-product-price-1-x-currencyContainer.vtex-product-price-1-x-currencyContainer--product')[0]
        yield {
            'Casa': 'Just_For_Sports',
            'Sku': response.css('span.vtex-product-identifier-0-x-product-identifier__value::text').get(),
            'Name': response.css('span.vtex-store-components-3-x-productBrand::text').get(),
            'precio': ''.join(precio0.css('span.vtex-product-price-1-x-currencyInteger.vtex-product-price-1-x-currencyInteger--product::text').getall()),
            'Link': response.url,
            'Date': datetime.today().strftime('%Y-%m-%d')
        }

process = CrawlerProcess(
    settings={
        'FEED_URI': 'jfs_mujer.csv',
        'FEED_FORMAT': 'csv',
        'FEED_EXPORT_ENCODING': 'utf-8',
        'CONCURRENT_REQUESTS': 3,
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 3,
        'DOWNLOAD_DELAY': 24,
        #'AUTOTHROTTLE_MAX_DELAY': 12,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)'
    })
process.crawl(JfsSpider_mujer)
process.start()
What's wrong with the script? Or is it something about the settings? I think it has something to do with the way I join the prices, but out of 770 products it works fine for almost 660... I don't understand. Thanks for your help!
Your error message means that your CSS selector doesn't find anything.
You can try the XPath below to get the price:
price = response.xpath('//meta[@property="product:price:amount"]/@content').get()
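For example, parse_article_detail from the spider above could check the selector first and fall back to that meta tag instead of indexing into an empty SelectorList. A sketch, trimmed to the price field:

def parse_article_detail(self, response):
    # The IndexError happens when this selector matches nothing, so test it first.
    precio_nodes = response.css('span.vtex-product-price-1-x-currencyContainer.vtex-product-price-1-x-currencyContainer--product')
    if precio_nodes:
        precio = ''.join(precio_nodes[0].css(
            'span.vtex-product-price-1-x-currencyInteger.vtex-product-price-1-x-currencyInteger--product::text').getall())
    else:
        # Fall back to the product:price:amount meta tag suggested above.
        precio = response.xpath('//meta[@property="product:price:amount"]/@content').get()
    yield {
        'precio': precio,
        'Link': response.url,
    }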

How to resolve this error: Cannot mix str and non-str arguments

I'm writing some scraping code, and when I run the script from the cmd I get errors saying that I cannot mix str and non-str arguments. My code is below. I am confused about how to deal with this problem; any help would be appreciated.
import scrapy
from scrapy.http import Request
from selenium import webdriver

class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//div[@class='list-group']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            if url.endswith('.ro') or url.endswith('.ro/'):
                continue
            yield Request(url, callback=self.parse_book)

    def __init__(self):
        self.driver = webdriver.Chrome('C:\Program Files (x86)\chromedriver.exe')

    def parse_book(self, response):
        title = response.xpath("//span[@id='HeadingContent_lblTitle']//text()").get()
        d1 = response.xpath("//div[@class='col-md-10']//p[1]//text()").get()
        d1 = d1.strip()
        d2 = response.xpath("//div[@class='col-md-10']//p[2]//text()").get()
        d2 = d2.strip()
        d3 = response.xpath("//div[@class='col-md-10']//p[3]//span//text()").get()
        d3 = d3.strip()
        d4 = response.xpath("//div[@class='col-md-10']//p[4]//text()").get()
        d4 = d4.strip()
        yield {
            "title1": title,
            "title2": d1,
            "title3": d2,
            "title4": d3,
            "title5": d4,
        }
        self.driver.get(response.url)
        while True:
            next = self.driver.find_element_by_xpath("//a[@id='MainContent_PagerTop_NavNext']")
            try:
                next.click()
                # get the data and write it to scrapy items
            except:
                break
        yield response.follow(next, callback=self.parse)
Error
Traceback (most recent call last):
File "e:\python39\lib\site-packages\scrapy\utils\defer.py", line 120, in iter_errback
yield next(it)
File "e:\python39\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
return next(self.data)
File "e:\python39\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
return next(self.data)
File "e:\python39\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "e:\python39\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "e:\python39\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "e:\python39\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 342, in <genexpr>
return (_set_referer(r) for r in result or ())
File "e:\python39\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "e:\python39\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 40, in <genexpr>
return (r for r in result or () if _filter(r))
File "e:\python39\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "e:\python39\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "e:\python39\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
for r in iterable:
File "C:\Users\Dell\sample\sample\spiders\test.py", line 64, in parse_book
yield response.follow(next, callback = self.parse)
File "e:\python39\lib\site-packages\scrapy\http\response\text.py", line 169, in follow
return super().follow(
File "e:\python39\lib\site-packages\scrapy\http\response\__init__.py", line 155, in follow
url = self.urljoin(url)
File "e:\python39\lib\site-packages\scrapy\http\response\text.py", line 102, in urljoin
return urljoin(get_base_url(self), url)
File "e:\python39\lib\urllib\parse.py", line 532, in urljoin
base, url, _coerce_result = _coerce_args(base, url)
File "e:\python39\lib\urllib\parse.py", line 125, in _coerce_args
raise TypeError("Cannot mix str and non-str arguments")
TypeError: Cannot mix str and non-str arguments
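For context: the failing line in that traceback is yield response.follow(next, callback=self.parse), and next there is the Selenium WebElement returned by find_element_by_xpath, not a URL string, so urljoin() refuses to mix str and non-str arguments. A sketch of the end of parse_book that hands Scrapy a plain string instead (assuming the pager link exposes a normal href rather than a javascript: postback):

def parse_book(self, response):
    # ... field extraction as in the question ...
    self.driver.get(response.url)
    next_el = self.driver.find_element_by_xpath("//a[@id='MainContent_PagerTop_NavNext']")
    # response.follow() accepts a str, Link or Selector, not a WebElement,
    # so pass the link's href attribute as a plain string.
    next_url = next_el.get_attribute("href")
    if next_url and not next_url.startswith("javascript:"):
        yield response.follow(next_url, callback=self.parse)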

Raise ValueError('Missing scheme in request url: %s' % self._url) ValueError: Missing scheme in request url: javascript:void(0);

This is my spider code
spider.py
import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'moneycontrol'
    # allowed_domains = ['moneycontrol.com']
    start_urls = ['https://www.moneycontrol.com/india/stockpricequote/']

    def parse(self, response):
        stoke_link_list = response.css("table a::attr(href)").getall()
        if response.css("span.span_price_wrap::text").getall():  # value of this variable only present in first run
            stock_name = response.css("h1.pcstname::text").get()
            bse_price, nse_price = response.css("span.span_price_wrap::text").getall()
            print(stock_name + ' ' + bse_price + ' ' + nse_price)
        else:
            print('stock_name bse_price nse_price')
        for link in stoke_link_list:
            if link is not None:
                next_page = response.urljoin(link)
                # yield scrapy.Request(next_page, callback=self.parse)
                yield response.follow(next_page, callback=self.parse)
While running this I am getting a strange error. It fails while scraping one website, and when I run it again it fails on a different website (and may succeed on the one that failed before).
Error:
2020-08-20 19:52:49 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.moneycontrol.com/mutual-funds/nav/motilal-oswal-midcap-30-fund-regular-plan/MMO025> (referer: https://www.moneycontrol.com/india/stockpricequote/pharmaceuticals/abbottindia/AI51)
Traceback (most recent call last):
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/utils/defer.py", line 120, in iter_errback
yield next(it)
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/utils/python.py", line 346, in __next__
return next(self.data)
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/utils/python.py", line 346, in __next__
return next(self.data)
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/spidermiddlewares/referer.py", line 340, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/moneycontrol/moneycontrol/spiders/my_spider.py", line 24, in parse
yield scrapy.Request(next_page, callback=self.parse)
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/http/request/__init__.py", line 25, in __init__
self._set_url(url)
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/http/request/__init__.py", line 69, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url: javascript:void(0);
Run2
2020-08-20 19:55:15 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.moneycontrol.com/mutual-funds/nav/dsp-equity-opportunities-fund-regular-plan/MDS011> (referer: https://www.moneycontrol.com/india/stockpricequote/pharmaceuticals/alkemlaboratories/AL05)
Traceback (most recent call last):
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/utils/defer.py", line 120, in iter_errback
yield next(it)
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/utils/python.py", line 346, in __next__
return next(self.data)
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/utils/python.py", line 346, in __next__
return next(self.data)
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/spidermiddlewares/referer.py", line 340, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/home/vishvajeet/Desktop/Programming/python/scrapy/moneycontrol/moneycontrol/spiders/my_spider.py", line 24, in parse
yield scrapy.Request(next_page, callback=self.parse)
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/http/request/__init__.py", line 25, in __init__
self._set_url(url)
File "/home/vishvajeet/Desktop/Programming/python/scrapy/env/lib/python3.6/site-packages/scrapy/http/request/__init__.py", line 69, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url: javascript:void(0);
I looked at other Stack Overflow answers, but none of them solves my issue (e.g. making start_urls a list, using follow, etc.).
Missing scheme in request URL
The error 'Missing scheme in request url' means that the URL does not have an http:// or https:// prefix.
The problem happens because the page being scraped contains links with relative URLs.
For example, the link named 'Zee entertain' on the moneycontrol.com website has the href value "/india/stockpricequote/mediaentertainment/zeeentertainmententerprises/ZEE".
So, when the program tries to open this link, the 'Missing scheme' error is thrown.
How to fix the problem?
The 'missing scheme' problem can be fixed by prepending https://hostname to every relative URL (i.e., the links that do not begin with http:// or https://).
Code snippet to prepend https://hostname to relative URLs:
for link in stoke_link_list:
    if link is not None:
        if not link.startswith("https://moneycontrol.com/"):
            page_url = "https://moneycontrol.com/" + link

Scrapy callback str issue

I am trying to run a scraper using Scrapy. I was able to do this in the past with this code, but now I get a strange error.
_rules = (Rule(LinkExtractor(restrict_xpaths=(xpath_str)), follow=True,
               callback='parse_url'),)

def parse_url(self, response):
    print response.url
    ...
Basically what I get back when I run it is:
Traceback (most recent call last):
File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "/usr/lib/pymodules/python2.7/scrapy/spidermiddlewares/offsite.py", line 28, in process_spider_output
for x in result:
File "/usr/lib/pymodules/python2.7/scrapy/spidermiddlewares/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/lib/pymodules/python2.7/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/spidermiddlewares/depth.py", line 54, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/spiders/crawl.py", line 67, in _parse_response
cb_res = callback(response, **cb_kwargs) or ()
TypeError: 'str' object is not callable
Any ideas why this happens? I have really similar code in another scraper, and it works.
Here is the full code:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..model import Properties

class TestScraper(CrawlSpider):
    name = "test"
    start_urls = [Properties.start_url]
    _rules = (Rule(LinkExtractor(restrict_xpaths=(Properties.xpath)), follow=True, callback='parse_url'),)

    def parse_url(self, response):
        print response.url
Change callback='parse_url' to callback=self.parse_url.

Error using scrapy

I have this code in Python:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from site_auto_1.items import AutoItem

class AutoSpider(CrawlSpider):
    name = "auto"
    allowed_host = ["autowereld.nl"]
    url = "http://www.autowereld.nl/"
    start_urls = [
        "http://www.autowereld.nl/zoeken.html?mrk=187&mdl%5B%5D=463&prvan=500&prtot=3000&brstf%5B%5D=2&bjvan=2000&bjtot=2004&geoloc=&strl=&trns%5B%5D=&kmvan=&kmtot=&klr%5B%5D=&q=",
    ]
    path = '//*[@id="content-inhoud"]/div/div/table/tbody/tr/td/h3/a/@href'
    rules = (
        Rule(
            LinkExtractor(restrict_xpaths='//*[@id="content-inhoud"]/div/div/table/tbody/tr/td/h3/a/@href'),
            callback='parse_item',
        ),
    )

    def parse_item(self, response):
        print "found item:", response.url
and it gives me this error:
Traceback (most recent call last):
File "/usr/lib/python2.7/dist-packages/twisted/internet/base.py", line 824, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 638, in _tick
taskObj._oneWorkUnit()
File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 484, in _oneWorkUnit
result = next(self._iterator)
File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 57, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 96, in iter_errback
yield next(it)
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/offsite.py", line 26, in process_spider_output
for x in result:
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spiders/crawl.py", line 73, in _parse_response
for request_or_item in self._requests_to_follow(response):
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spiders/crawl.py", line 52, in _requests_to_follow
links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/lxmlhtml.py", line 107, in extract_links
links = self._extract_links(doc, response.url, response.encoding, base_url)
File "/usr/lib/pymodules/python2.7/scrapy/linkextractor.py", line 94, in _extract_links
return self.link_extractor._extract_links(*args, **kwargs)
File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/lxmlhtml.py", line 50, in _extract_links
for el, attr, attr_val in self._iter_links(selector._root):
File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/lxmlhtml.py", line 38, in _iter_links
for el in document.iter(etree.Element):
exceptions.AttributeError: 'str' object has no attribute 'iter'
I don't know what I'm doing wrong, so I started commenting out code to see which part throws the error, and I figured out that it is this part:
rules = (
    Rule(
        LinkExtractor(restrict_xpaths='//*[@id="content-inhoud"]/div/div/table/tbody/tr/td/h3/a/@href'),
        callback='parse_item',
    ),
)
But I don't know what I'm doing wrong. I tried making restrict_xpaths a list, a tuple... I'm new to Scrapy and I can't figure it out.
The XPath configured inside restrict_xpaths should point to an element, not an attribute.
Replace:
//*[@id="content-inhoud"]/div/div/table/tbody/tr/td/h3/a/@href
with:
//*[@id="content-inhoud"]/div/div/table/tbody/tr/td/h3/a
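Applied to the spider in the question, the rule then becomes (only the XPath changes):

rules = (
    Rule(
        # restrict_xpaths must select the <a> elements themselves;
        # LinkExtractor reads their href attributes on its own.
        LinkExtractor(restrict_xpaths='//*[@id="content-inhoud"]/div/div/table/tbody/tr/td/h3/a'),
        callback='parse_item',
    ),
)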
