I am trying to use scrapy on this page: http://it.rs-online.com/web/p/sensori-di-prossimita-induttivi/7858468/
But I can't get the product image; the spider doesn't find anything. What might I be missing?
I tried selecting by attribute, by ID, and by class, and got nothing.
import scrapy
from scrapy import Request
import random

class BrickSetSpider(scrapy.Spider):
    name = 'spider'
    USER_AGENT_LIST = [
        'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0',
    ]
    start_urls = [
        'https://it.rs-online.com/web/p/sensori-di-prossimita-induttivi/7858468/',
    ]
    download_delay = 5
    FEED_EXPORT_ENCODING = 'utf-8'

    def start_requests(self):
        for url in self.start_urls:
            headers = {'User-Agent': random.choice(self.USER_AGENT_LIST)}
            yield Request(url, headers=headers)

    def parse(self, response):
        SET_SELECTOR = '.content-left'
        for brickset in response.css(SET_SELECTOR):
            SEARCH_SELECTOR = response.url
            NAME_SELECTOR = 'span.keyValue span ::text'
            IMAGE_SELECTOR = 'img[itemprop="image"] ::attr(src)'
            yield {
                'search': SEARCH_SELECTOR,
                'name': brickset.css(NAME_SELECTOR).re('[^\t\n]+'),
                'link': brickset.css(IMAGE_SELECTOR).extract(),
            }
If you are using Chrome, you can test this in the DevTools console with $$(".images [data-test='zoom-wrap'] img") to get the image.
So you can use this CSS selector in your Scrapy code; you will have to extract the src attribute.
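A minimal sketch of that suggestion, assuming the element is present in the raw HTML that Scrapy receives (the spider name here is just for illustration):

import scrapy

class ProductImageSpider(scrapy.Spider):
    # Hypothetical spider illustrating the selector suggested above.
    name = 'product_image_sketch'
    start_urls = ['https://it.rs-online.com/web/p/sensori-di-prossimita-induttivi/7858468/']

    def parse(self, response):
        # ::attr(src) extracts the src attribute of the matched <img> elements.
        yield {
            'image': response.css(".images [data-test='zoom-wrap'] img::attr(src)").extract(),
        }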
I hope it helps!
The image is generated dynamically by JS. Try the following code.
from simplified_scrapy.spider import Spider, SimplifiedDoc
import re

class MySpider(Spider):
    name = 'rs-online.com'
    # allowed_domains = ['example.com']
    start_urls = [
        'https://it.rs-online.com/web/p/sensori-di-prossimita-induttivi/7858468/'
    ]
    # refresh_urls = True  # For debugging. If refresh_urls = True, start_urls will be crawled again.

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        # print (doc.html)
        div = doc.getElementByClass('content-left')
        imgs = re.compile(u'largeImageURL: ".*"').findall(div.script.html)
        imgs = ['https:' + img[len('largeImageURL: "'):-1] for img in imgs]
        lis = doc.getElementByClass('keyDetailsLL').lis
        names = {}
        for li in lis:
            spans = li.spans
            names[spans[0].text] = spans[1].text
        data = [{'imgs': imgs, 'names': names}]
        print(data)
        return {"Urls": [], "Data": data}  # Return data to framework

from simplified_scrapy.simplified_main import SimplifiedMain
SimplifiedMain.startThread(MySpider())  # Start crawling
Result:
[{'imgs': ['https://media.rs-online.com/t_large/F7858468-01.jpg', 'https://media.rs-online.com/t_large/F7858468-02.jpg'], 'names': {'Codice RS': '785-8468', 'Codice costruttore': 'E2E-S05S12-WC-B1 2M', 'Costruttore': 'Omron'}}]
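If you prefer to stay in plain Scrapy, the same idea can be sketched with a selector plus .re(), assuming the largeImageURL values sit in an inline script under .content-left as shown above (the spider name is just for illustration):

import scrapy

class RsImageSpider(scrapy.Spider):
    # Hypothetical sketch: pull the image URLs out of the inline script with a regex.
    name = 'rs_image_sketch'
    start_urls = ['https://it.rs-online.com/web/p/sensori-di-prossimita-induttivi/7858468/']

    def parse(self, response):
        # .re() runs the regex over the matched script text and returns the captured groups.
        imgs = response.css('.content-left script::text').re(r'largeImageURL:\s*"(.*?)"')
        yield {'imgs': ['https:' + img for img in imgs]}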
Related
I am trying to scrape an e-commerce website for its products, and I am currently facing an issue: not all of the pages generated by pagination are visited. The links themselves are valid and reachable; they do exist.
My spider code:
import scrapy
import json
from pbl.items import ShopCard

class SpidermaximaSpider(scrapy.Spider):
    name = 'spiderMaxima'
    allowed_domains = ['www.trobos.lt']
    start_urls = ['https://trobos.lt/prekes?vendor=MAXIMA']
    item = []
    list = [{
        'sid': 10,
        'name': 'Maxima',
        'domain': 'hhttps://www.maxima.lt/',
        'imageurl': 'https://upload.wikimedia.org/wikipedia/commons/c/c1/Maxima_logo.svg',
        'product': item
    }]

    def __init__(self):
        self.declare_xpath()

    def declare_xpath(self):
        self.getAllItemsXpath = '//*[@id="category"]/div/div[1]/div/div[3]/div[4]/div/div/div/div/div/a/@href'
        self.TitleXpath = '//*[@id="product"]/section[1]/div[3]/section/div[2]/h1/text()'
        self.PriceXpath = '//*[@id="product"]/section[1]/div[3]/section/div[2]/div[1]/div/div[1]/div/div[1]/span/text()'

    def parse(self, response):
        for href in response.xpath(self.getAllItemsXpath):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url=url, callback=self.parse_main_item, dont_filter=True)

        next_page = [response.url + '&page=' + str(x) for x in range(1, 193)]
        for page in next_page:
            print('-' * 100)
            print(page)
            print('-' * 100)
            url = page
            yield scrapy.Request(url, callback=self.parse)

    def parse_main_item(self, response):
        shop = ShopCard()
        Title = response.xpath(self.TitleXpath).extract_first()
        Link = response.url
        Image = 'https://upload.wikimedia.org/wikipedia/commons/c/c1/Maxima_logo.svg'
        Price = response.xpath(self.PriceXpath).extract_first()
        Price = Price.replace(',', '.')
        Price = float(Price.split(' ')[0])
        shop['item'] = {
            'title': Title,
            'link': Link,
            'image': Image,
            'price': Price
        }
        self.item.append(shop['item'])

    def closed(self, reason):
        with open("spiderMaxima.json", "w") as final:
            json.dump(self.list, final, indent=2, ensure_ascii=False)
I am using a list built with the range() function because, in the response (from scrapy shell's view(response)), the pagination buttons are wired to a script.
I have also tried scrapy shell on several of the links; the XPath outputs work, but the pages are still not getting scraped. What may be the issue? Are there other ways to deal with the pagination?
There are many things wrong with your code, and other things that can be improved. Please read the documentation carefully.
- There's really no need to create xpath attributes.
- You can write the xpath much shorter.
- You can create the start_urls list from the beginning.
- You can let the item exporter handle the JSON.
Here's an example; change it to your needs.
import scrapy

class ShopCard(scrapy.Item):
    item = scrapy.Field()

class SpidermaximaSpider(scrapy.Spider):
    name = 'spiderMaxima'
    allowed_domains = ['trobos.lt']
    start_urls = [f'https://trobos.lt/prekes?vendor=MAXIMA&page={i}' for i in range(1, 190)]
    items = []
    custom_settings = {
        'DOWNLOAD_DELAY': 0.4,
        'FEEDS': {
            'spiderMaxima.json': {
                'format': 'json',
                'indent': 2,
            }
        }
    }

    def parse(self, response):
        for url in response.xpath('//div[@class="card small"]//a[contains(@class, "shrink")]/@href').getall():
            yield response.follow(url=url, callback=self.parse_main_item)

    def parse_main_item(self, response):
        shop = ShopCard()
        Title = response.xpath('//h1/text()').get()
        Link = response.url
        Image = 'https://upload.wikimedia.org/wikipedia/commons/c/c1/Maxima_logo.svg'
        Price = response.xpath('//div[@class="price"]//span/text()').get()
        Price = Price.replace(',', '.')
        Price = float(Price.split(' ')[0])
        shop['item'] = {
            'title': Title,
            'link': Link,
            'image': Image,
            'price': Price
        }
        yield shop
I wanted to scrape the feed of sitepoint.com; this is my code:
import scrapy
from urllib.parse import urljoin

class SitepointSpider(scrapy.Spider):
    # TODO: Add url tags (like /javascript) to the spider based on class parameters
    name = "sitepoint"
    allowed_domains = ["sitepoint.com"]
    start_urls = ["http://sitepoint.com/javascript/"]

    def parse(self, response):
        data = []
        for article in response.css("article"):
            title = article.css("a.t12xxw3g::text").get()
            href = article.css("a.t12xxw3g::attr(href)").get()
            img = article.css("img.f13hvvvv::attr(src)").get()
            time = article.css("time::text").get()
            url = urljoin("https://sitepoint.com", href)
            text = scrapy.Request(url, callback=self.parse_article)
            data.append(
                {"title": title, "href": href, "img": img, "time": time, "text": text}
            )
        yield data

    def parse_article(self, response):
        text = response.xpath(
            '//*[@id="main-content"]/article/div/div/div[1]/section/text()'
        ).extract()
        yield text
And this is the response I get:
[{'title': 'How to Build an MVP with React and Firebase',
'href': '/react-firebase-build-mvp/',
'img': 'https://uploads.sitepoint.com/wp-content/uploads/2021/09/1632802723react-firebase-mvp-app.jpg',
'time': 'September 28, 2021',
'text': <GET https://sitepoint.com/react-firebase-build-mvp/>}]
It just does not scrape the URLs. I followed everything said in this question, but still could not make it work.
You have to visit the detail page from the listing to scrape the article.
In that case you have to yield the URL first, then yield the data in the final callback.
Also, //*[@id="main-content"]/article/div/div/div[1]/section/text() won't return any text, since there are lots of HTML elements under the section tag.
One solution is to scrape all the HTML inside the section tag and clean it later to get your article text.
Here is the full working code:
import re
import scrapy
from urllib.parse import urljoin

class SitepointSpider(scrapy.Spider):
    # TODO: Add url tags (like /javascript) to the spider based on class parameters
    name = "sitepoint"
    allowed_domains = ["sitepoint.com"]
    start_urls = ["http://sitepoint.com/javascript/"]

    def clean_text(self, raw_html):
        """
        :param raw_html: this will take raw html code
        :return: text without html tags
        """
        cleaner = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
        return re.sub(cleaner, '', raw_html)

    def parse(self, response):
        for article in response.css("article"):
            title = article.css("a.t12xxw3g::text").get()
            href = article.css("a.t12xxw3g::attr(href)").get()
            img = article.css("img.f13hvvvv::attr(src)").get()
            time = article.css("time::text").get()
            url = urljoin("https://sitepoint.com", href)
            yield scrapy.Request(url, callback=self.parse_article, meta={"title": title,
                                                                         "href": href,
                                                                         "img": img,
                                                                         "time": time})

    def parse_article(self, response):
        title = response.request.meta["title"]
        href = response.request.meta["href"]
        img = response.request.meta["img"]
        time = response.request.meta["time"]
        all_data = {}
        article_html = response.xpath('//*[@id="main-content"]/article/div/div/div[1]/section').get()
        all_data["title"] = title
        all_data["href"] = href
        all_data["img"] = img
        all_data["time"] = time
        all_data["text"] = self.clean_text(article_html)
        yield all_data
I am using Scrapy with Splash. Here is what I have in my spider:
import scrapy
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# SplashJsonResponse and SplashTextResponse are needed by the _requests_to_follow override below
from scrapy_splash import SplashRequest, SplashJsonResponse, SplashTextResponse
import logging

class MainSpider(CrawlSpider):
    name = 'main'
    allowed_domains = ['www.somesite.com']

    script = '''
    function main(splash, args)
        splash.private_mode_enabled = false
        my_user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
        headers = {
            ['User-Agent'] = my_user_agent,
            ['Accept-Language'] = 'en-GB,en-US;q=0.9,en;q=0.8',
            ['Referer'] = 'https://www.google.com'
        }
        splash:set_custom_headers(headers)
        url = args.url
        assert(splash:go(url))
        assert(splash:wait(2))
        -- username input
        username_input = assert(splash:select('#username'))
        username_input:focus()
        username_input:send_text('myusername')
        assert(splash:wait(0.3))
        -- password input
        password_input = assert(splash:select('#password'))
        password_input:focus()
        password_input:send_text('mysecurepass')
        assert(splash:wait(0.3))
        -- the login button
        login_btn = assert(splash:select('#login_btn'))
        login_btn:mouse_click()
        assert(splash:wait(4))
        return splash:html()
    end
    '''

    rules = (
        Rule(LinkExtractor(restrict_xpaths="(//div[@id='sidebar']/ul/li)[7]/a"), callback='parse_item', follow=True, process_request='use_splash'),
    )

    def start_requests(self):
        yield SplashRequest(url='https://www.somesite.com/login', callback=self.post_login, endpoint='execute', args={
            'lua_source': self.script
        })

    def use_splash(self, request):
        request.meta.update(splash={
            'args': {
                'wait': 1,
            },
            'endpoint': 'render.html',
        })
        return request

    def _requests_to_follow(self, response):
        if not isinstance(response, (HtmlResponse, SplashJsonResponse, SplashTextResponse)):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response) if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = self._build_request(n, link)
                yield rule.process_request(r)

    def post_login(self, response):
        logging.info('hey from login!')
        with open('post_login_response.txt', 'w') as f:
            f.write(response.text)
            f.close()

    def parse_item(self, response):
        logging.info('hey from parse_item!')
        with open('post_search_response.txt', 'w') as f:
            f.write(response.text)
            f.close()
I came across this and I've tried to implement things the same way, but still, parse_item is never run. In the logs, I never get hey from parse_item!
I'm not sure what I'm missing. The full log output can be found here.
I ditched the CrawlSpider and converted it to a regular Spider, and things are working fine now.
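For reference, a minimal sketch of that conversion, reusing the Lua login script and the sidebar XPath from the question; the flow of following the sidebar link manually is an assumption:

import scrapy
from scrapy_splash import SplashRequest

class MainSpider(scrapy.Spider):
    name = 'main'
    allowed_domains = ['www.somesite.com']
    script = '...'  # the same Lua login script as in the question

    def start_requests(self):
        # Log in through Splash first, exactly as in the original spider.
        yield SplashRequest(url='https://www.somesite.com/login', callback=self.post_login,
                            endpoint='execute', args={'lua_source': self.script})

    def post_login(self, response):
        # Follow the sidebar link that the Rule used to pick up, again through Splash.
        for href in response.xpath("(//div[@id='sidebar']/ul/li)[7]/a/@href").getall():
            yield SplashRequest(response.urljoin(href), callback=self.parse_item, args={'wait': 1})

    def parse_item(self, response):
        self.logger.info('hey from parse_item!')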
I am having a problem with my Scrapy program. I want to crawl information from the following website:
https://parts.cat.com/AjaxCATPartLookupResultsView?catalogId=10051&langId=-1&requestType=1&storeId=21801&serialNumber=KSN00190&keyword=&link=
I want to get the "Part No." information inside the span id="resPartNum" tag. I have already tried:
- NAME_SELECTOR = './/*[@id="resPartNum"]/text()'
- NAME_SELECTOR = './/span[@class="resPartNum"]/text()'
- NAME_SELECTOR = './/tr/td/span[@class="resPartNum"]/a/text()'
Here is my full code:
import scrapy

class PartSpider(scrapy.Spider):
    name = 'part_spider'
    start_urls = ['https://parts.cat.com/AjaxCATPartLookupResultsView?catalogId=10051&langId=-1&requestType=1&storeId=21801&serialNumber=KSN00190&keyword=&link=']

    def parse(self, response):
        SET_SELECTOR = '.set'
        for part in response.css(SET_SELECTOR):
            NAME_SELECTOR = './/*[@id="resPartNum"]/text()'
            yield {
                'name': part.css(NAME_SELECTOR).extract_first(),
            }
I am not very advanced in Scrapy and would appreciate any help!
Use the CSS selector table.partlookup_table to collect the table items, then loop over them to extract partNum and partName. Note that extract() returns a list.
import scrapy
from scrapy.crawler import CrawlerProcess

class PartSpider(scrapy.Spider):
    name = 'part_spider'
    start_urls = ['https://parts.cat.com/AjaxCATPartLookupResultsView?catalogId=10051&langId=-1&requestType=1&storeId=21801&serialNumber=KSN00190&keyword=&link=']

    def parse(self, response):
        SET_SELECTOR = 'table.partlookup_table'
        for part in response.css(SET_SELECTOR):
            # NAME_SELECTOR = './/*[@id="resPartNum"]/text()'
            yield {
                'name': part.css('span.resPartName a::text').extract(),
                'partnumber': part.css('span.resPartNum a::text').extract()
            }

process = CrawlerProcess()
process.crawl(PartSpider)
process.start()
After writing my first "recursive" spider, I am facing some problems that I haven't been able to fix all day.
I did some research into which mistakes can cause that 301 error, but every solution I tried has not helped me out yet.
My console output
My modified settings.py
USER_AGENT = 'kartonage (Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0)'
DOWNLOAD_DELAY = 0.5
HTTPERROR_ALLOW_ALL = True
This USER_AGENT and HTTPERROR_ALLOW_ALL were solutions that helped other people with a 301 redirect error.
My modified items.py
import scrapy

class KartonageItem(scrapy.Item):
    SKU = scrapy.Field()
    Title = scrapy.Field()
    Link = scrapy.Field()
    Price = scrapy.Field()
    Delivery_Status = scrapy.Field()
    Weight = scrapy.Field()
    QTY = scrapy.Field()
    Volume = scrapy.Field()
The code I used:
import scrapy
from ..items import KartonageItem

class KartonSpider(scrapy.Spider):
    name = "kartons12"
    allow_domains = ['karton.eu']
    start_urls = [
        'https://www.karton.eu/Faltkartons'
    ]
    custom_settings = {'FEED_EXPORT_FIELDS': ['SKU', 'Title', 'Link', 'Price', 'Delivery_Status', 'Weight', 'QTY', 'Volume']}

    def parse(self, response):
        url = response.xpath('//div[@class="cat-thumbnails"]')
        for a in url:
            link = a.xpath('a/@href')
            yield response.follow(url=link.get(), callback=self.parse_category_cartons)

    def parse_category_cartons(self, response):
        url2 = response.xpath('//div[@class="cat-thumbnails"]')
        for a in url2:
            link = a.xpath('a/@href')
            yield response.follow(url=link.get(), callback=self.parse_target_page)

    def parse_target_page(self, response):
        card = response.xpath('//div[@class="text-center articelbox"]')
        for a in card:
            items = KartonageItem()
            link = a.xpath('a/@href')
            items['SKU'] = a.xpath('.//div[@class="delivery-status"]/small/text()').get()
            items['Title'] = a.xpath('.//h5[@class="title"]/a/text()').get()
            items['Link'] = a.xpath('.//h5[@class="text-center artikelbox"]/a/@href').extract()
            items['Price'] = a.xpath('.//strong[@class="price-ger price text-nowrap"]/span/text()').get()
            items['Delivery_Status'] = a.xpath('.//div[@class="signal_image status-2"]/small/text()').get()
            yield response.follow(url=link.get(), callback=self.parse_item, meta={'items': items})

    def parse_item(self, response):
        table = response.xpath('//span[@class="product-info-inner"]')
        items = KartonageItem()
        items = response.meta['items']
        items['Weight'] = a.xpath('.//span[@class="staffelpreise-small"]/text()').get()
        items['Volume'] = a.xpath('.//td[@class="icon_contenct"][7]/text()').get()
        yield items
HTTP 301 isn't an error; it is a Moved Permanently response. Scrapy automatically redirects you to the new address for that page. You can see in your execution logs that you got redirected.
That by itself shouldn't be a problem. Could something else be causing this? Is there any behavior from the spider that is unexpected?
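If you want to confirm what happened, Scrapy's redirect middleware records the chain in request.meta, so you can log it in your callback. A minimal sketch (the spider name is just for illustration):

import scrapy

class RedirectCheckSpider(scrapy.Spider):
    # Hypothetical spider that only logs whether a request was redirected.
    name = 'redirect_check'
    start_urls = ['https://www.karton.eu/Faltkartons']

    def parse(self, response):
        # RedirectMiddleware stores the original URL(s) under 'redirect_urls' when a 301/302 was followed.
        redirected_from = response.request.meta.get('redirect_urls', [])
        if redirected_from:
            self.logger.info('Redirected from %s to %s', redirected_from, response.url)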