My Scrapy crawler collects data from a set of URLs, but when I run it again to add new content, the old content is saved to my MongoDB database a second time. Is there a way to check whether an item is already in my MongoDB database (duplicate items have the same title field) and, if so, drop it from the pipeline? Alternatively, is it better to delete the duplicates from the database after they are saved, and if so, how would I implement that in my project?
This is my pipeline:
import logging
import pymongo
from scrapy.exceptions import DropItem
class MongoPipeline(object):
    """Item pipeline that writes scraped articles to a MongoDB collection.

    The connection URI and database name are read from the ``MONGO_URI``
    and ``MONGO_DATABASE`` settings by :meth:`from_crawler`.
    """

    # Name of the MongoDB collection the articles are inserted into.
    collection_name = 'articles'

    def __init__(self, mongo_uri, mongo_db):
        """Store the connection URI and database name for ``open_spider``."""
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    # This must be a classmethod: it is called on the class and builds the
    # instance itself.
    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline from the values configured in settings.py."""
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE'),
        )

    def open_spider(self, spider):
        """Open the database connection."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        """Close the database connection."""
        self.client.close()

    def process_item(self, item, spider):
        """Insert the article into MongoDB when it has content.

        The item is always returned so later pipeline stages still see it.
        No duplicate check is made: every item with content is inserted.
        """
        # A truthiness test skips both an empty list and an empty string;
        # comparing against ``[]`` alone never skips an empty string.
        if item['art_content']:
            article = {
                'art_content': item['art_content'],
                'date': item['date'],
                'date_str': item['date_str'],
                'title': item['title'],
                'url': item['url'],
            }
            self.db[self.collection_name].insert_one(article)
        return item
This is my crawler
from datetime import datetime as dt
import scrapy
from ArtScraper.items import ArtscraperItem
class PostSpider(scrapy.Spider):
    """Spider that reads an RSS feed and scrapes each linked article."""

    article = ""
    name = 'crawly'
    allowed_domains = []
    start_urls = ['http://feeds.bbci.co.uk/arabic/rss.xml']

    def parse(self, response):
        """Yield one article request for every ``<item>`` of the feed.

        The partially filled item travels with the request in ``meta`` so
        that ``parse_article`` can complete it.
        """
        for article in response.xpath('//channel/item'):
            item = ArtscraperItem()
            item['date'] = dt.today()
            item['date_str'] = article.xpath('pubDate/text()').extract_first()
            item['url'] = article.xpath('link/text()').extract_first()
            item['title'] = article.xpath('title/text()').extract_first()

            url = item['url']
            if not url:
                # extract_first() returns None when the entry has no link;
                # there is nothing to request in that case.
                self.logger.debug("Feed entry without link skipped: %r", item['title'])
                continue

            yield scrapy.Request(
                url,
                callback=self.parse_article,
                meta={'item': item},  # carry over our item
            )

    def parse_article(self, response):
        """Add the article text to the item carried in ``meta`` and yield it."""
        item = response.meta['item']
        pars = response.xpath(
            "//div[@class='story-body']/div[@class='story-body__inner']/p/text()"
        ).extract()
        # Paragraph texts are joined with '-' into a single string.
        item['art_content'] = '-'.join(pars)
        yield item
Thanks in advance.
You can filter out duplicates by keeping a set of the titles seen so far on your MongoPipeline class as the items are processed, and raising DropItem in process_item to discard the repeats. The official docs provide a great example. You can then save to MongoDB when the item is returned.
In your case here, this would be the implementation of a duplicates filter in your pipeline:
import logging
import pymongo
from scrapy.exceptions import DropItem
class MongoPipeline(object):
    """Pipeline that drops items whose title was already seen in this run.

    ``titles_seen`` is an in-memory set created in ``__init__``, so it only
    covers titles processed by this pipeline instance.
    """

    # Name of the MongoDB collection used by this pipeline.
    collection_name = 'articles'

    def __init__(self, mongo_uri, mongo_db):
        """Store the connection parameters and start with no titles seen."""
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.titles_seen = set()

    # This must be a classmethod: it is called on the class and builds the
    # instance itself.
    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline from the values configured in settings.py."""
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE'),
        )

    def open_spider(self, spider):
        """Open the database connection."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        """Close the database connection."""
        self.client.close()

    def process_item(self, item, spider):
        """Return the item, or raise ``DropItem`` if its title was seen before."""
        title = item['title']
        if title in self.titles_seen:
            raise DropItem("Duplicate item title found: %s" % item)
        self.titles_seen.add(title)
        return item
For me it was necessary to import ItemAdapter to convert the Item to a dict:
from itemadapter import ItemAdapter
def process_item(self, item, spider):
    """Insert the item, or update the stored document that has the same ``id``.

    Returns the item in both cases; duplicates are updated, not dropped.
    Raises ``KeyError`` if the item has no ``id`` field.
    """
    data = ItemAdapter(item).asdict()
    # find_one_and_update() needs an update document as its second argument.
    # With upsert=True one call covers both cases: an existing document is
    # updated and, by default, returned as it was before the update; when
    # nothing matches, the item is inserted and None is returned.
    previous = self.db[self.collection_name].find_one_and_update(
        {'id': data['id']},
        {'$set': data},
        upsert=True,
    )
    if previous is not None:
        ## ----> raise DropItem(f"Duplicate item found: {item!r}") <------
        print(f"Duplicate item found: {previous!r}")
    return item
I preferred to update rather than trigger dropitem.
Related
I am currently learning Scrapy and I want to crawl the prices and properties of Rolex watches. So far my crawler runs and displays all the data correctly. Now I want to save the data from my crawler to a MySQL database, but I am having problems: the crawler "Watchbot" collects the data, yet the pipeline does not receive the items. I have already checked settings.py and enabled the pipeline. Where exactly is my error, and how can I transfer the data to the MySQL database?
This is my crawler, called Watchbot:
import scrapy
from scrapy.crawler import CrawlerProcess
from watches.watches.items import WatchesItem
class WatchbotSpider(scrapy.Spider):
    """Spider that follows the product links of the listing page and scrapes each product."""

    name = "watchbot"
    start_urls = ["https://www.watch.de/english/rolex.html"]

    def parse(self, response, **kwargs):
        """Yield one request per product link found on the listing page."""
        for url in response.css("div.product-item-link a::attr(href)").getall():
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            yield scrapy.Request(response.urljoin(url), callback=self.parse_categories)

    def parse_categories(self, response):
        """Build a ``WatchesItem`` from a product page.

        ``.get()`` yields None for a missing node; indexing ``extract()``
        with ``[0]`` would raise IndexError instead.
        """
        item = WatchesItem()
        item["itemnr"] = response.xpath('//span[@itemprop="sku"]/text()').get()
        item["reference"] = response.xpath('//span[@itemprop="mpn"]/text()').get()
        item["year"] = response.xpath(
            '//div[@class="product-option baujahr"]/div[@class="product-option-value"]/text()'
        ).get()
        yield item
That is the Pipeline.py
import mysql
from watches.watches.spiders import watchbot
class WatchesPipeline(object):
    """Item pipeline that stores each scraped watch as one row of the ``watches`` table."""

    def __init__(self):
        """Open the MySQL connection and a cursor used for all inserts."""
        # ``import mysql`` alone does not load the ``connector`` submodule,
        # so import it explicitly before use.
        import mysql.connector

        self.conn = mysql.connector.connect(host="", user="", passwd="", database="")
        self.curr = self.conn.cursor()

    def process_item(self, item, spider):
        """Store the item in the database and pass it on unchanged."""
        self.store_db(item)
        return item

    def store_db(self, item):
        """Insert one row (year, reference, item number) and commit.

        Raises ``KeyError`` if one of the three fields is missing.
        """
        # One parenthesised group means one row with three columns;
        # "(%s), (%s), (%s)" would be three single-column rows.
        # The lower-case keys are the ones the spider sets on the item
        # (itemnr, reference, year), and the values are stored whole:
        # indexing a string value with [0] keeps only its first character.
        self.curr.execute(
            "insert into watches values (%s, %s, %s)",
            (item["year"], item["reference"], item["itemnr"]),
        )
        self.conn.commit()

    def close_spider(self, spider):
        """Release the cursor and the connection opened in ``__init__``."""
        self.curr.close()
        self.conn.close()
and that is my items.py
import scrapy
class WatchesItem(scrapy.Item):
    """Item with the three fields scraped for each watch."""

    # The stray debugging ``print(itemnr)`` is removed: it ran once, when
    # the class body was executed, and was not part of the item definition.
    year = scrapy.Field()
    itemnr = scrapy.Field()
    reference = scrapy.Field()
I notice in scrapy log that some urls returned 200 status but contained no items. It seems to be the stability of the site as re-crawling these urls 1-2 times again yield items. I would like to save these urls in a separate file for re-crawling.
I tried to create a dictionary in the spider class to store these urls but there is no easy way to save the dictionary into a file.
Another way I tried is to create a 2nd item class for urls and use item pipeline. It still outputs empty file though. I am not too advanced to write my own pipeline. Here is my code.
import scrapy
class MyItem(scrapy.Item):
    """Product item; only two of its fields are shown, the rest is elided."""
    productCode = scrapy.Field()
    productName = scrapy.Field()
    ...
class UrlItem(scrapy.Item):
    """Item with a single field, ``eurl``, holding a URL."""
    eurl = scrapy.Field()
parse
class MySpider(scrapy.Spider):
    """Spider sketch: yields a ``UrlItem`` for pages without products, a ``MyItem`` otherwise."""

    # Per-spider settings: one CSV feed and the exporter class used for 'csv'.
    custom_settings = {
        'FEEDS':{
            '%(filename)s.csv':{'format':'csv', 'encoding':'utf-8',},
        },
        'FEED_EXPORTERS': {'csv': 'scrapy.exporters.CsvItemExporter',},}

    def parsePage(self, response):
        """Yield the page URL when no products are found, else a product item."""
        products = response.xpath(...)
        if len(products) == 0:
            # No products on this page: record its URL instead of an item.
            url = UrlItem()
            url['eurl'] = response.url
            yield url
        else:
            item = MyItem()
            item['...'] = ...
            ...
            yield item
pipeline
from .items import MyItem, UrlItem
import csv
class UrlPipeline:
    """Item pipeline that appends the values of every ``UrlItem`` to a CSV file."""

    def open_spider(self, spider):
        """Open the output file and create the CSV writer once."""
        # newline='' is what the csv module expects for files it writes to.
        self.file = open('%s.csv' % "noProductsUrls", 'w', newline='')
        self.writer = csv.writer(self.file)

    def close_spider(self, spider):
        """Close the output file."""
        self.file.close()

    def process_item(self, url, spider):
        """Write ``UrlItem`` values as one CSV row and pass every item on."""
        if isinstance(url, UrlItem):
            # Iterating the adapter itself yields the field names; the row
            # must contain the field values.
            self.writer.writerow(list(ItemAdapter(url).values()))
        # Always return the item, otherwise the next stage receives None.
        return url
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
class MyPipeline:
    """Pipeline that drops ``MyItem`` objects whose ``productCode`` was already seen."""

    def __init__(self):
        """Start with an empty set of seen product codes."""
        self.ids_seen = set()

    def process_item(self, item, spider):
        """Return the item, or raise ``DropItem`` for a repeated product code."""
        # Items of any other type are passed through untouched.
        if not isinstance(item, MyItem):
            return item

        code = ItemAdapter(item)['productCode']
        if code in self.ids_seen:
            raise DropItem(f"Duplicate item found: {item!r}")

        self.ids_seen.add(code)
        return item
settings file
'project.pipelines.MyPipeline': 300,
'project.pipelines.UrlPipeline': 300,
The thing is also I have the exporter setting in spider class already that saves a csv. In the pipeline I just want to add one more csv. Do the two conflict? Or it is better to structure both csv files in the pipeline?
Update: I opted for @marcos's solution below, which is superior.
There is a way to save csv in the spider class which is based on this post.
def __init__(self, *args, **kwargs):
    """Initialise the spider and open ``urls.csv`` for writing.

    Arguments are forwarded to the base class so its own initialisation
    still runs; calling this without arguments works as before.
    """
    super().__init__(*args, **kwargs)
    # newline="" is what the csv module expects for files it writes to.
    self.outfile = open("urls.csv", "w", newline="")
    self.writer = csv.writer(self.outfile)
def closed(self,reason):
    """Close ``self.outfile``; ``reason`` is not used."""
    self.outfile.close()
add the following in def parse
if len(products) == 0:
self.writer.writerow([response.url])
I would suggest you just yield a retry request whenever no product is found on the page, unless you have a very specific reason to store those URLs.
The code would look like:
class MySpider(scrapy.Spider):
    """Spider sketch that re-requests a page when no products are found on it."""

    # Per-spider settings: one CSV feed and the exporter class used for 'csv'.
    custom_settings = {
        'FEEDS': {
            '%(filename)s.csv': {'format': 'csv', 'encoding': 'utf-8'},
        },
        'FEED_EXPORTERS': {'csv': 'scrapy.exporters.CsvItemExporter'},
    }

    def parsePage(self, response):
        """Yield a product item, or the result of ``_retry_request`` for an empty page."""
        products = response.xpath(...)
        if len(products):
            item = MyItem()
            item['...'] = ...
            ...
            yield item
        else:
            yield self._retry_request(response)

    def _retry_request(self, response, max_retries=5):
        """Return a copy of the request with its retry counter increased.

        Once ``max_retries`` is reached a warning is logged and None is
        returned instead.
        """
        attempts = response.meta.get('retry_time', 0)
        if attempts >= max_retries:
            self.logger.warning(f'Max retries reached for {response.url}')
            return None

        updated_meta = dict(response.meta)
        updated_meta['retry_time'] = attempts + 1
        # dont_filter=True is passed so the repeated URL is requested again.
        return response.request.replace(meta=updated_meta, dont_filter=True)
I've scraped the URLs I want from a page. Now I want to filter them for keywords using a pipeline:
class GumtreeCouchesPipeline(object):
    """Pipeline meant to keep only items whose URL contains one of ``keywords``."""

    # Keywords looked for in the item URL.
    keywords = ['leather', 'couches']

    def process_item(self, item, spider):
        """Return the item when its URL contains a keyword."""
        # NOTE: ``keywords`` is referenced as a bare name here, not as
        # ``self.keywords``; the class attribute is not visible under that
        # name inside the method.
        if any(key in item['url'] for key in keywords):
            return item
The problem is that it's returning nothing now.
The spider:
import scrapy
from gumtree_couches.items import adItem
from urllib.parse import urljoin
class GumtreeSpider(scrapy.Spider):
    """Spider that yields one item (title and absolute URL) per ad link."""

    name = 'GumtreeCouches'
    # NOTE(review): both values look like placeholders; allowed_domains
    # normally holds bare domain names rather than URLs -- confirm.
    allowed_domains = ['https://someurl']
    start_urls = ['https://someurl']

    def parse(self, response):
        """Yield an ``adItem`` for every link in the first ``view`` div."""
        for ad_link in response.xpath('//div[@class="view"][1]//a'):
            # Create a fresh item per link: one shared instance would be
            # overwritten on every iteration after it had been yielded.
            item = adItem()
            relative_url = ad_link.xpath('@href').extract_first()
            item['title'] = ad_link.xpath('text()').extract_first()
            item['url'] = response.urljoin(relative_url)
            yield item
How can I filter all the scraped urls for keywords using the pipeline?
Thanks!
This should fix your problem:
class GumtreeCouchesPipeline(object):
    """Pipeline that returns an item only if its URL contains a keyword."""

    # Keywords looked for in the item URL.
    keywords = ['leather', 'couches']

    def process_item(self, item, spider):
        """Return the item on the first keyword match, otherwise None."""
        target = item['url']
        for keyword in self.keywords:
            if keyword in target:
                return item
        return None
Notice that I'm using self.keywords to refer to the keywords class attribute.
If you look at your spider logs, you should find some errors saying something like: NameError: name 'keywords' is not defined.
Anyway, I'd recommend you to implement this pipeline like this:
from scrapy.exceptions import DropItem
class GumtreeCouchesPipeline(object):
    """Pipeline that drops every item whose URL contains none of the keywords."""

    # Keywords looked for in the item URL.
    keywords = ['leather', 'couches']

    def process_item(self, item, spider):
        """Return the item on a keyword match, raise ``DropItem`` otherwise."""
        address = item['url']
        if any(word in address for word in self.keywords):
            return item
        raise DropItem('missing keyword in URL')
This way, you'll have the information about the dropped items in the job stats once it's finished.
From reading the documentation I think you have to cater for all paths e.g.
from scrapy.exceptions import DropItem
def process_item(self, item, spider):
    """Keep items whose URL contains a keyword; drop the others.

    Items with an empty URL are returned unchanged. Raises ``DropItem``
    when the URL is non-empty and contains none of the keywords.
    """
    keywords = ['leather', 'couches']
    url = item['url']
    # Every path either returns the item or raises. The original ``else``
    # lines were missing their colons, which is a syntax error.
    if not url:
        return item
    if any(key in url for key in keywords):
        return item
    raise DropItem("Missing specified keywords.")
I'm building a simple spider to crawl a structured site and download *.txt files. I've managed to get everything working except for a custom FilesPipeline class.
My goal is to download *.txt files into directories according to their url location. I can achieve my goal if I edit the Scrapy class directly (shown below)
files.py -> FilesPipeline::file_path()
...
# return 'full/%s%s' % (media_guid, media_ext)
return url.split('example.com/')[1]
I want to overload the class properly but haven't been successful. I'm not sure what I should be doing differently. The spider will run with no warnings or errors but wont download files.
settings.py
ITEM_PIPELINES = {
'myspider.pipelines.MySpiderFilesPipeline': 1,
'myspider.pipelines.MySpiderPipeline': 300,
}
spider.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re
from myspider.items import MySpiderItem
class SpideySpider(CrawlSpider):
    """Crawl spider that yields one item per table link and marks ``.txt`` links for download."""

    name = 'spidey'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']
    rules = (
        Rule(LinkExtractor(allow='', restrict_xpaths='//tr/td/a', deny_extensions='html'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Yield a ``MySpiderItem`` for every ``//tr/td/a`` link of the page.

        ``file_urls`` is only set on items whose link text matches the
        ``.txt`` pattern.
        """
        referring_url = response.request.headers.get('Referer', None)
        # Header values are bytes; decode so the item holds text.
        if isinstance(referring_url, bytes):
            referring_url = referring_url.decode('utf-8', 'replace')

        for link in response.xpath('//tr/td/a'):
            i = MySpiderItem()
            i['title'] = response.xpath('//title/text()').extract()
            i['href'] = link.xpath('@href').extract()
            i['text'] = link.xpath('text()').extract()
            i["current_url"] = response.url
            i['referring_url'] = referring_url
            i['depth'] = response.meta['depth']

            # A link without text gives an empty list; guard before indexing.
            text = i['text'][0] if i['text'] else ''
            # NOTE(review): the leading '#' in the pattern is kept as found;
            # confirm it is intended.
            if text and i['href'] and re.match(r'^#.*\.txt$', text) is not None:
                i['file_urls'] = [response.urljoin(i['href'][0])]
            yield i
pipelines.py
import scrapy
from scrapy.exceptions import DropItem
from scrapy.http import Request
from scrapy.contrib.pipeline.files import FilesPipeline, FSFilesStore
import json
import re
class MySpiderPipeline(object):
    """Pipeline that drops HTML links and writes every other item as a JSON line."""

    def __init__(self):
        """Open ``items.json`` for writing."""
        # Text mode: json.dumps() returns str, which a file opened in
        # binary mode rejects on Python 3.
        self.file = open('items.json', 'w')

    def close_spider(self, spider):
        """Close the output file so buffered lines are flushed to disk."""
        self.file.close()

    def process_item(self, item, spider):
        """Write the item as one JSON line and return it.

        Raises ``DropItem`` when the first ``text`` entry matches the
        ``.html`` pattern.
        """
        # NOTE(review): for a dict, iterating the item yields field names,
        # so this tests the names rather than the stored values -- confirm
        # the intent before changing it.
        for data in item:
            if not data:
                raise DropItem("Missing {0}!".format(data))

        # An item with an empty ``text`` list has nothing to match against.
        texts = item['text']
        first_text = texts[0] if texts else ''
        if re.match(r'^#.*\.html$', first_text) is not None:
            raise DropItem("HTML File")

        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item
class MySpiderFilesPipeline(FilesPipeline):
    """Files pipeline that stores each file under the path taken from its URL."""

    # Everything after this marker in the request URL becomes the file path.
    _url_breakstring = "example.com/"

    def get_media_requests(self, item, info):
        """Return one download request per URL listed on the item."""
        return [Request(x) for x in item.get(self.files_urls_field, [])]

    def file_path(self, request, response=None, info=None):
        """Return the storage path for ``request``.

        The path is the part of ``request.url`` after ``_url_breakstring``.
        When the marker is absent, the base class path is used instead.
        """
        # Take the URL from the request and the marker from ``self``;
        # neither exists as a bare name inside this method.
        _, marker, tail = request.url.partition(self._url_breakstring)
        if marker and tail:
            return tail
        return super(MySpiderFilesPipeline, self).file_path(
            request, response=response, info=info
        )
For the pipeline class, add an __init__ method, for example:
class GCSFilePipeline(ImagesPipeline):
    """``ImagesPipeline`` subclass whose constructor forwards its arguments to the base class."""

    def __init__(self, store_uri, download_func=None, settings=None):
        """Pass ``store_uri`` positionally and the other two arguments by keyword."""
        super().__init__(store_uri, settings=settings, download_func=download_func)
I made the improvement according to the suggestion from alexce below. What I need is like the picture below. However each row/line should be one review: with date, rating, review text and link.
I need to let item processor process each review of every page.
Currently TakeFirst() only takes the first review of the page. So 10 pages, I only have 10 lines/rows as in the picture below.
Spider code is below:
import scrapy
from amazon.items import AmazonItem
class AmazonSpider(scrapy.Spider):
    """Spider that yields one item per review cell of the product-review table."""

    name = "amazon"
    allowed_domains = ['amazon.co.uk']
    # NOTE(review): the URL has no ``{}`` placeholder, so ``format(page)``
    # leaves it unchanged and every entry is the same address -- confirm
    # the intended page parameter.
    start_urls = [
        'http://www.amazon.co.uk/product-reviews/B0042EU3A2/'.format(page) for page in range(1, 114)
    ]

    def parse(self, response):
        """Yield an ``AmazonItem`` for each first-column cell of the reviews table.

        Each field holds the list returned by ``extract()``.
        """
        for sel in response.xpath('//*[@id="productReviews"]//tr/td[1]'):
            item = AmazonItem()
            item['rating'] = sel.xpath('div/div[2]/span[1]/span/@title').extract()
            item['date'] = sel.xpath('div/div[2]/span[2]/nobr/text()').extract()
            item['review'] = sel.xpath('div/div[6]/text()').extract()
            item['link'] = sel.xpath('div/div[7]/div[2]/div/div[1]/span[3]/a/@href').extract()
            yield item
I started from scratch and the following spider should be run with
scrapy crawl amazon -t csv -o Amazon.csv --loglevel=INFO
so that opening the CSV-File with a spreadsheet shows for me
Hope this helps :-)
import scrapy
class AmazonItem(scrapy.Item):
    """Item with the four fields collected for one review."""
    rating = scrapy.Field()
    date = scrapy.Field()
    review = scrapy.Field()
    link = scrapy.Field()
class AmazonSpider(scrapy.Spider):
    """Spider that scrapes the reviews of a page and follows the "Next" link."""

    name = "amazon"
    allowed_domains = ['amazon.co.uk']
    start_urls = ['http://www.amazon.co.uk/product-reviews/B0042EU3A2/' ]

    def parse(self, response):
        """Yield one ``AmazonItem`` per review, then a request for the next page."""
        for sel in response.xpath('//table[@id="productReviews"]//tr/td/div'):
            item = AmazonItem()
            item['rating'] = sel.xpath('./div/span/span/span/text()').extract()
            item['date'] = sel.xpath('./div/span/nobr/text()').extract()
            item['review'] = sel.xpath('./div[@class="reviewText"]/text()').extract()
            item['link'] = sel.xpath('.//a[contains(.,"Permalink")]/@href').extract()
            yield item

        xpath_Next_Page = './/table[@id="productReviews"]/following::*//span[@class="paging"]/a[contains(.,"Next")]/@href'
        # Evaluate the expression once; extract_first() gives None when
        # there is no "Next" link.
        url_Next_Page = response.xpath(xpath_Next_Page).extract_first()
        if url_Next_Page:
            # urljoin resolves a relative href against the current page URL.
            yield scrapy.Request(response.urljoin(url_Next_Page), callback=self.parse)
If using -t csv (as proposed by Frank in comments) does not work for you for some reason, you can always use built-in CsvItemExporter directly in the custom pipeline, e.g.:
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter
class AmazonPipeline(object):
    """Item pipeline that exports every item to ``output.csv`` with ``CsvItemExporter``."""

    # This must be a classmethod: it is called on the class and builds the
    # instance itself.
    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and connect it to the spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file and start the exporter."""
        # The file is opened in binary mode and handed to the exporter.
        self.file = open('output.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the output file."""
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        """Export the item and pass it on unchanged."""
        self.exporter.export_item(item)
        return item
which you need to add to ITEM_PIPELINES:
ITEM_PIPELINES = {
'amazon.pipelines.AmazonPipeline': 300
}
Also, I would use an Item Loader with input and output processors to join the review text and replace new lines with spaces. Create an ItemLoader class:
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import TakeFirst, Join, MapCompose
class AmazonItemLoader(ItemLoader):
    """Item loader with a ``TakeFirst`` default output and special handling for ``review``."""

    # Output processor used for fields that do not declare their own.
    default_output_processor = TakeFirst()
    # Input processor for ``review``: replace newlines in each value with a space.
    review_in = MapCompose(lambda x: x.replace("\n", " "))
    # Output processor for ``review``: join the collected values.
    review_out = Join()
Then, use it to construct an Item:
def parse(self, response):
    """Yield one loaded item per first-column cell of the reviews table.

    Each cell gets its own ``AmazonItemLoader``, so every review becomes
    a separate item.
    """
    for sel in response.xpath('//*[@id="productReviews"]//tr/td[1]'):
        loader = AmazonItemLoader(item=AmazonItem(), selector=sel)
        loader.add_xpath('rating', './/div/div[2]/span[1]/span/@title')
        loader.add_xpath('date', './/div/div[2]/span[2]/nobr/text()')
        loader.add_xpath('review', './/div/div[6]/text()')
        loader.add_xpath('link', './/div/div[7]/div[2]/div/div[1]/span[3]/a/@href')
        yield loader.load_item()