I am trying to grasp the concept of Scrapy callbacks. I have not been able to find any answers that suit my issue, as I need to yield items twice, in both parse methods, and yet still be able to use a callback.
This is my spider:
import scrapy
import csv
from scrapycrawler.items import DmozItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
class DmozSpider(CrawlSpider):
    name = "dmoz"
    allowed_domains = ["snipplr.com"]

    def start_requests(self):
        #for i in xrange(1000):
        for i in range(1, 1000):
            yield self.make_requests_from_url("http://www.snipplr.com/all/page/%d" % i)

    def parse(self, response):
        for sel in response.xpath('//ol[@class="snippets marg"]/li[1]/h3'):
            item = DmozItem()
            item['title'] = sel.xpath('a/text()').extract()
            item['link'] = sel.xpath('a/@href').extract()
            return Request(item['link'], meta={'item': item}, callback=self.parse2)
            yield item

    def parse2(self, response):
        for sel in response.xpath('//div[@class="description"]'):
            item = response.meta["item"]
            item['desc'] = sel.xpath('p/text()').extract()
            yield item
This is my pipeline:
import csv
from scrapy.exceptions import DropItem
from scrapy import log
import sys
import mysql.connector
class CsvWriterPipeline(object):

    def __init__(self):
        self.connection = mysql.connector.connect(host='localhost', user='sq', passwd='rt', db='sq')
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        self.cursor.execute("SELECT title,url FROM items WHERE title= %s", item['title'])
        result = self.cursor.fetchone()
        if result:
            log.msg("Item already in database: %s" % item, level=log.DEBUG)
        else:
            self.cursor.execute(
                "INSERT INTO items (title, url) VALUES (%s, %s, %s)",
                (item['title'][0], item['link'], item['desc'][0]))
            self.connection.commit()
            log.msg("Item stored: %s" % item, level=log.DEBUG)
        return item

    def handle_error(self, e):
        log.err(e)
I am basically trying to get the data both from the first page and from the page that follows once it has been crawled. I am using the Scrapy web crawler and MySQL.
You just need to yield a Request, not return:
def parse(self, response):
    for sel in response.xpath('//ol[@class="snippets marg"]/li[1]/h3'):
        item = DmozItem()
        item['title'] = sel.xpath('a/text()').extract()
        item['link'] = sel.xpath('a/@href').extract()
        yield Request(item['link'], meta={'item': item}, callback=self.parse2)
        yield item
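For what it is worth, on Scrapy 1.7+ the same hand-off can be done with cb_kwargs instead of meta. A minimal sketch of one way to pass the half-filled item to the second callback (note that extract() returns a list, so the first URL is taken with [0]):

def parse(self, response):
    for sel in response.xpath('//ol[@class="snippets marg"]/li[1]/h3'):
        item = DmozItem()
        item['title'] = sel.xpath('a/text()').extract()
        item['link'] = sel.xpath('a/@href').extract()
        # extract() returns a list, so take the first URL for the Request
        yield Request(item['link'][0], cb_kwargs={'item': item}, callback=self.parse2)

def parse2(self, response, item):
    # the item passed via cb_kwargs arrives as a keyword argument
    item['desc'] = response.xpath('//div[@class="description"]/p/text()').extract()
    yield item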
Related
I am currently learning Scrapy and I want to crawl the price and properties of Rolex watches. So far my crawler is running and displaying all the data correctly. However, now I want to save the data from my crawler to a MySQL database, and I am having problems: I get the data with the crawler "Watchbot", but the pipeline does not receive the items. I have already checked settings.py and enabled the pipeline. Where exactly is my error, and how can I transfer the data to the MySQL DB?
This is my crawler, called Watchbot:
import scrapy
from scrapy.crawler import CrawlerProcess
from watches.watches.items import WatchesItem
class WatchbotSpider(scrapy.Spider):
    name = "watchbot"
    start_urls = ["https://www.watch.de/english/rolex.html"]

    def parse(self, response, **kwargs):
        for link in response.css("div.product-item-link a::attr(href)"):
            url = link.get()
            yield scrapy.Request(url, callback=self.parse_categories)

    def parse_categories(self, response):
        item = WatchesItem()
        item["itemnr"] = response.xpath('//span[@itemprop="sku"]/text()').extract()[0]
        item["reference"] = response.xpath('//span[@itemprop="mpn"]/text()').extract()[0]
        item["year"] = response.xpath(
            '//div[@class="product-option baujahr"]/div[@class="product-option-value"]/text()'
        ).extract()[0]
        yield item
That is the Pipeline.py
import mysql
from watches.watches.spiders import watchbot
class WatchesPipeline(object):
    def __init__(self):
        self.conn = mysql.connector.connect(host="", user="", passwd="", database="")
        self.curr = self.conn.cursor()

    def process_item(self, item, spider):
        self.store_db(item)
        return item

    def store_db(self, item):
        self.curr.execute(
            """insert into watches values (%s), (%s), (%s)""",
            (item["YEAR"][0], item["REFERENCE"][0], item["ITEMNR"][0]),
        )
        self.conn.commit()
and that is my items.py
import scrapy
class WatchesItem(scrapy.Item):
    year = scrapy.Field()
    itemnr = scrapy.Field()
    reference = scrapy.Field()
    print(itemnr)
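Two things stand out when the pipeline is compared with items.py: the item defines lowercase fields (year, itemnr, reference) while the pipeline reads item["YEAR"], item["REFERENCE"] and item["ITEMNR"], and the VALUES clause wraps each placeholder in its own parentheses. Also, import mysql alone may not expose mysql.connector; import mysql.connector is the usual form. A minimal sketch of store_db with matching keys (the column names are assumptions, adjust them to your actual table):

    def store_db(self, item):
        # column names are assumed here; match them to your real schema
        self.curr.execute(
            "INSERT INTO watches (year, itemnr, reference) VALUES (%s, %s, %s)",
            (item["year"], item["itemnr"], item["reference"]),
        )
        self.conn.commit()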
My Scrapy crawler collects data from a set of URLs, but when I run it again to add new content, the old content is saved to my MongoDB database again. Is there a way to check whether an item is already in my MongoDB database (duplicate items have the same title field) and, if so, drop it from the pipeline? Also, is it better to delete duplicates from the database after they are saved, and if so, how would I implement that in my project?
This is my pipeline:
import logging
import pymongo
from scrapy.exceptions import DropItem
class MongoPipeline(object):

    collection_name = 'articles'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        ## pull in information from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        ## initializing spider
        ## opening db connection
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        ## clean up when spider is closed
        self.client.close()

    def process_item(self, item, spider):
        ## how to handle each post
        bbcDict = {}
        if item['art_content'] != []:
            bbcDict['art_content'] = item['art_content']
            bbcDict['date'] = item['date']
            bbcDict['date_str'] = item['date_str']
            bbcDict['title'] = item['title']
            bbcDict['url'] = item['url']
            self.db[self.collection_name].insert_one(dict(bbcDict))
        return item
        # self.db[self.collection_name].insert(dict(item))
        # logging.debug("Post added to MongoDB")
        # return item
This is my crawler
from datetime import datetime as dt
import scrapy
from ArtScraper.items import ArtscraperItem
class PostSpider(scrapy.Spider):
    article = ""
    name = 'crawly'
    allowed_domains = []
    start_urls = ['http://feeds.bbci.co.uk/arabic/rss.xml']

    def parse(self, response):
        # get the subreddit from the URL
        #sub = response.url.split('/')[4]
        #Get the title
        # parse thru each of the posts
        #for post in response.css('div.thing'):
        articles = response.xpath('//channel/item')
        for article in articles:
            item = ArtscraperItem()
            print ('hello')
            item['date'] = dt.today()
            item['date_str'] = article.xpath('pubDate/text()').extract_first()
            item['url'] = article.xpath('link/text()').extract_first()
            item['title'] = article.xpath('title/text()').extract_first()
            url = item['url']
            yield scrapy.Request(
                url,
                callback=self.parse_article,
                meta={'item': item},  # carry over our item
            )
            #request = scrapy.Request(url, callback=self.parse_article)
            #request.meta['item'] = item
            #yield request

    def parse_article(self, response):
        item = response.meta['item']
        pars = response.xpath("//div[@class='story-body']/div[@class='story-body__inner']/p/text()").extract()
        item['art_content'] = '-'.join(pars)
        print ("HHHH")
        yield item
Thanks in advance.
You can filter out duplicates by keeping a record of the titles on your MongoPipeline class as the items are processed, and using DropItem to discard duplicates inside process_item. The official docs provide a great example. You can then save to MongoDB when the item is returned.
In your case, the implementation of a duplicates filter in your pipeline would look like this:
import logging
import pymongo
from scrapy.exceptions import DropItem
class MongoPipeline(object):

    collection_name = 'articles'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.titles_seen = set()

    @classmethod
    def from_crawler(cls, crawler):
        ## pull in information from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        ## initializing spider
        ## opening db connection
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        ## clean up when spider is closed
        self.client.close()

    def process_item(self, item, spider):
        if item['title'] in self.titles_seen:
            raise DropItem("Duplicate item title found: %s" % item)
        else:
            self.titles_seen.add(item['title'])
            return item
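If you also want the write to MongoDB to happen in the same pipeline, as in your original process_item, here is a sketch that combines the duplicate check with the insert (the field names are the ones from your item):

    def process_item(self, item, spider):
        if item['title'] in self.titles_seen:
            raise DropItem("Duplicate item title found: %s" % item)
        self.titles_seen.add(item['title'])
        # store the article only once we know the title is new for this run
        self.db[self.collection_name].insert_one({
            'art_content': item['art_content'],
            'date': item['date'],
            'date_str': item['date_str'],
            'title': item['title'],
            'url': item['url'],
        })
        return item

Note that titles_seen only covers the current run; to also skip titles stored by earlier runs you would need to query the collection first or enforce a unique index on the title field.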
In my case it was necessary to import ItemAdapter to convert the Item to a dict:
from itemadapter import ItemAdapter
def process_item(self, item, spider):
    adapter = ItemAdapter(item)
    if self.db[self.collection_name].find_one({'id': adapter['id']}) is not None:
        # update the stored document with the freshly scraped fields
        dado = self.db[self.collection_name].find_one_and_update(
            {'id': adapter['id']}, {'$set': adapter.asdict()})
        ## ----> raise DropItem(f"Duplicate item found: {item!r}") <------
        print(f"Duplicate item found: {dado!r}")
    else:
        self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
    return item
I preferred to update rather than raise DropItem.
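An alternative worth mentioning, assuming the id field is meant to be unique anyway: put a unique index on it and let MongoDB itself reject duplicates, so the pipeline only has to catch the error. A minimal sketch (the class name, URI and database name are placeholders):

import pymongo
from pymongo.errors import DuplicateKeyError
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

class MongoDedupPipeline(object):
    collection_name = 'articles'

    def open_spider(self, spider):
        self.client = pymongo.MongoClient('mongodb://localhost:27017')  # placeholder URI
        self.db = self.client['articles_db']  # placeholder database name
        # the unique index makes MongoDB reject any second document with the same id
        self.db[self.collection_name].create_index('id', unique=True)

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        try:
            self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
        except DuplicateKeyError:
            raise DropItem(f"Duplicate item found: {item!r}")
        return item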
I want to check the title of an item against the CSV file, and then add it to the CSV file only if it does not exist. I have searched almost all the answers related to duplicate values; mostly they are about DuplicatesPipeline, and the others did not work for me.
This is my custom pipeline, in pipelines.py:
import csv

from scrapy.exceptions import DropItem


class CheckCsvPipeline(object):
    def __init__(self):
        csv_path = r"C:\Users\HP\PycharmProjects\ToScrape\book\items.csv"
        self.csvfile = open(csv_path, 'r')
        self.readCsv = csv.reader(self.csvfile, delimiter=',')

    def process_item(self, item, spider):
        for row in self.readCsv:
            if item['title'] in row:
                raise DropItem("This title exists: %s" % item)
            else:
                return item
Here is my spider:
import scrapy
class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        books = response.xpath('//h3/a/@href').extract()
        for book in books:
            absolute_url = response.urljoin(book)
            yield scrapy.Request(absolute_url, callback=self.parse_book)

        # process next page
        next_page_url = response.xpath('//a[text()="next"]/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url)

    def parse_book(self, response):
        title = response.css('h1::text').extract_first()
        price = response.xpath('//*[@class="price_color"]/text()').extract_first()
        yield {'title': title, 'price': price}
I run the spider with the following command, but it still adds the existing values.
scrapy crawl books -o items.csv
I suggest you maintain a list of titles in your spider, and then in the pipeline check whether the title already exists in that list; if it does, drop the item.
class CheckCsvPipeline(object):
    def __init__(self):
        pass

    def process_item(self, item, spider):
        if item['title'] in spider.allTitles:
            raise DropItem("This title exists: %s" % item)
        else:
            return item
In your spider, do this:
class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']
    allTitles = []

    def parse(self, response):
        books = response.xpath('//h3/a/@href').extract()
        for book in books:
            absolute_url = response.urljoin(book)
            yield scrapy.Request(absolute_url, callback=self.parse_book)

        # process next page
        next_page_url = response.xpath('//a[text()="next"]/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url)

    def parse_book(self, response):
        title = response.css('h1::text').extract_first()
        self.allTitles.extend([title])
        price = response.xpath('//*[@class="price_color"]/text()').extract_first()
        yield {'title': title, 'price': price}
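A small refinement of the same idea: making allTitles a set keeps the pipeline's membership check constant-time and avoids collecting the same title twice within a run. Only the changed parts are sketched here:

class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']
    allTitles = set()

    def parse_book(self, response):
        title = response.css('h1::text').extract_first()
        # set membership is O(1), so the pipeline check stays cheap
        self.allTitles.add(title)
        price = response.xpath('//*[@class="price_color"]/text()').extract_first()
        yield {'title': title, 'price': price}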
I have tried to extract all the links from a website. My spider is a subclass of a superclass called GeneralSpider. The problem is that when I change the name of the method parse_url to parse (overriding a method of the superclass), the link extractor gets all the links of the main page but does not follow them. If I don't change the method name, the spider does not work at all. Am I doing something wrong?
# -*- coding: utf-8 -*-
from core.generalSpider import GeneralSpider
from scrapy.linkextractors import LinkExtractor
from scrapy import log
from scrapy.contrib.spiders import Rule
from scrapy.item import Item, Field
from spiders.settings import GET_ITEMS
class MyItem(Item):
    url = Field()
    text = Field()
    item = Field()


class GetItemsSpider(GeneralSpider):

    name = GET_ITEMS
    start_urls = 'http://www.example.com'
    allowed_domains = ['example.com']
    rules = (Rule(LinkExtractor(allow=()), callback='parse_url', follow=True), )

    def __init__(self, port, **kwargs):
        super(GetItemsSpider, self).__init__(port, **kwargs)
        # User agent
        self.user_agent = Utils.get_random_item_from_list(core_settings.USER_AGENT_LIST)
        # Scrapy logs
        self.log('GetItemsSpider init start_urls= %s parameters= %s ' %
                 (self.start_urls, str(self.parameters)), level=log.DEBUG)
        self.log('%s init start_urls= %s parameters= %s ' %
                 (self.name, self.start_urls, str(self.parameters)), level=log.INFO)
        self.log('USER AGENT = %s' % self.user_agent, level=log.INFO)
        self.log('PORT = %s' % self._proxy_port, level=log.INFO)

    def parse_url(self, response):
        items = []
        self.log('GetItemsSpider parse start %s' % response.url, level=log.DEBUG)
        for link in LinkExtractor().extract_links(response):
            item = MyItem()
            item['text'] = link.text
            item['url'] = link.url
            items.append(item)
        return items
There is no better explanation than the one in the documentation; check the warning here.
Just don't override parse.
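For reference, the documented pattern looks roughly like the sketch below (spider name and URL are placeholders): CrawlSpider uses parse internally to drive its rules, so your own callback has to keep another name, such as the parse_url you already have.

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class ExampleSpider(CrawlSpider):
    name = 'example'
    start_urls = ['http://www.example.com']
    # CrawlSpider's own parse() drives the rules, so the callback must not be named parse
    rules = (Rule(LinkExtractor(allow=()), callback='parse_url', follow=True), )

    def parse_url(self, response):
        for link in LinkExtractor().extract_links(response):
            yield {'url': link.url, 'text': link.text}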
In the end I could not find out why my code was not working, but I found an alternative solution:
def parse_url(self, response):
    self.log('GetItemsSpider parse start %s' % response.url, level=log.DEBUG)
    for link in LinkExtractor().extract_links(response):
        item = MyItem()
        item['text'] = link.text
        item['url'] = link.url
        if condition:
            yield Request(urlparse.urljoin(response.url, link.url), callback=self.parse)
        yield item
This solution is based on Philip Adzanoukpe's example. I hope this can be useful.
I am actually very new to Scrapy and I am not sure why I am not getting the information I want. I am using Scrapy on the website www.kayak.com and I want to extract the check-in and check-out times for all the hotels in New York. I have successfully scraped other data from the same page that the check-in and check-out times are on, but I could not scrape these two fields.
The code I have is shown below:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from hotel_crawl.items import HotelCrawlItem
from bs4 import BeautifulSoup
import time
import urlparse
class MySpider(CrawlSpider):
    name = "kayaksite"
    allowed_domains = ["www.kayak.com"]
    start_urls = ["http://www.kayak.com/New-York-Hotels.15830.hotel.ksp"]
    rules = (
        Rule(LinkExtractor(
            restrict_xpaths=("//a[@class='actionlink pagenumber'][contains(text(),'Next')]", )),
            callback="parse_item", follow=True),
    )

    def parse_start_url(self, response):
        print "test"
        self.logger.info('Hi, this is an item page! %s', response.url)
        item = HotelCrawlItem()
        name = response.xpath("//a[@class='hotelname hotelresultsname']//text()").extract()
        price = [BeautifulSoup(i).get_text() for i in response.xpath("//div[@class='pricerange']").extract()]
        review = response.xpath("//a[@class='reviewsoverview']/strong/text()").extract()
        url = response.xpath("//a[@class='hotelname hotelresultsname']//@href").extract()
        alldata = zip(name, price, review, url)
        for i in alldata:
            item['name'] = i[0]
            item['price'] = i[1]
            item['review'] = i[2]
            request = scrapy.Request(urlparse.urljoin(response.url, i[3]), callback=self.parse_item2)
            request.meta['item'] = item
            yield request

    def parse_item(self, response):
        self.logger.info('Hi, this is an item page! %s', response.url)
        item = HotelCrawlItem()
        name = response.xpath("//a[@class='hotelname hotelresultsname']//text()").extract()
        price = [BeautifulSoup(i).get_text() for i in response.xpath("//div[@class='pricerange']").extract()]
        review = response.xpath("//a[@class='reviewsoverview']/strong/text()").extract()
        url = response.xpath("//a[@class='hotelname hotelresultsname']//@href").extract()
        alldata = zip(name, price, review, url)
        for i in alldata:
            item['name'] = i[0]
            item['price'] = i[1]
            item['review'] = i[2]
            request = scrapy.Request(urlparse.urljoin(response.url, i[3]), callback=self.parse_item2)
            request.meta['item'] = item
            yield request

    def parse_item2(self, response):
        print "test--------------"
        self.logger.info('Hi, this is an item page! %s', response.url)
        item = response.meta['item']
        item['location'] = response.xpath("//*[@id='detailsOverviewContactInfo']/div/span/span[1]/text()").extract()
        item['postcode'] = response.xpath("//*[@id='detailsOverviewContactInfo']/div/span/span[3]/text()").extract()
        item['check_in'] = response.xpath("//*[@id='goodToKnow']/div/div[2]/div[2]/text()").extract()
        item['check_out'] = response.xpath("//*[@id='goodToKnow']/div/div[2]/div[2]/text()").extract()
        yield item
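One detail that stands out in parse_item2: check_in and check_out use exactly the same XPath, so they can only ever return the same text. A sketch of how the two fields might be separated; the positional indices here are assumptions, so verify them against the actual page markup:

    def parse_item2(self, response):
        self.logger.info('Hi, this is an item page! %s', response.url)
        item = response.meta['item']
        item['location'] = response.xpath("//*[@id='detailsOverviewContactInfo']/div/span/span[1]/text()").extract()
        item['postcode'] = response.xpath("//*[@id='detailsOverviewContactInfo']/div/span/span[3]/text()").extract()
        # the div indices below are assumptions; inspect the page to see which
        # block holds check-in and which holds check-out
        item['check_in'] = response.xpath("//*[@id='goodToKnow']/div/div[2]/div[1]/text()").extract()
        item['check_out'] = response.xpath("//*[@id='goodToKnow']/div/div[2]/div[2]/text()").extract()
        yield item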