I am currently learning Scrapy and I want to crawl the price and properties of Rolex watches. So far my crawler runs and displays all the data correctly. Now I want to save the data from the crawler to a MySQL database, but I am having problems: the crawler "Watchbot" collects the data, yet the pipeline never receives the items. I have already checked settings.py and enabled the pipeline. Where exactly is my error, and how can I transfer the data to the MySQL DB?
This is my crawler, called Watchbot:
import scrapy
from scrapy.crawler import CrawlerProcess
from watches.watches.items import WatchesItem


class WatchbotSpider(scrapy.Spider):
    name = "watchbot"
    start_urls = ["https://www.watch.de/english/rolex.html"]

    def parse(self, response, **kwargs):
        for link in response.css("div.product-item-link a::attr(href)"):
            url = link.get()
            yield scrapy.Request(url, callback=self.parse_categories)

    def parse_categories(self, response):
        item = WatchesItem()
        item["itemnr"] = response.xpath('//span[@itemprop="sku"]/text()').extract()[0]
        item["reference"] = response.xpath('//span[@itemprop="mpn"]/text()').extract()[0]
        item["year"] = response.xpath(
            '//div[@class="product-option baujahr"]/div[@class="product-option-value"]/text()'
        ).extract()[0]
        yield item
This is the pipelines.py:
import mysql
from watches.watches.spiders import watchbot


class WatchesPipeline(object):
    def __init__(self):
        self.conn = mysql.connector.connect(host="", user="", passwd="", database="")
        self.curr = self.conn.cursor()

    def process_item(self, item, spider):
        self.store_db(item)
        return item

    def store_db(self, item):
        self.curr.execute(
            """insert into watches values (%s), (%s), (%s)""",
            (item["YEAR"][0], item["REFERENCE"][0], item["ITEMNR"][0]),
        )
        self.conn.commit()
And this is my items.py:
import scrapy


class WatchesItem(scrapy.Item):
    year = scrapy.Field()
    itemnr = scrapy.Field()
    reference = scrapy.Field()
    print(itemnr)
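For reference, here is a minimal sketch of a pipeline that would match the spider and items above; it assumes a watches table with itemnr, reference and year columns, and the connection details are placeholders. The main differences from the code above: mysql.connector is imported explicitly, the item keys use the lowercase field names declared in items.py, the values are used as-is (they are already single strings, so indexing with [0] would only take the first character), and the INSERT uses one parenthesized value list with named columns:
import mysql.connector


class WatchesPipeline(object):
    def __init__(self):
        # placeholder connection details
        self.conn = mysql.connector.connect(host="localhost", user="", password="", database="")
        self.curr = self.conn.cursor()

    def process_item(self, item, spider):
        self.store_db(item)
        return item

    def store_db(self, item):
        # column names are assumed; adjust to your schema
        self.curr.execute(
            "INSERT INTO watches (itemnr, reference, year) VALUES (%s, %s, %s)",
            (item["itemnr"], item["reference"], item["year"]),
        )
        self.conn.commit()
Note that accessing keys that are not declared on the item (e.g. "YEAR" instead of "year") raises a KeyError in process_item, which in the logs can easily look as if the pipeline never received the items.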
Related
My Scrapy crawler collects data from a set of URLs, but when I run it again to add new content, the old content is saved to my MongoDB database again. Is there a way to check whether an item is already in my MongoDB database (duplicate items have the same title field) and, if so, drop it from the pipeline? Also, would it be better to delete the duplicates from the database after they are saved, and if so, how would I implement that in my project?
This is my pipeline:
import logging
import pymongo
from scrapy.exceptions import DropItem
class MongoPipeline(object):

    collection_name = 'articles'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        ## pull in information from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        ## initializing spider
        ## opening db connection
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        ## clean up when spider is closed
        self.client.close()

    def process_item(self, item, spider):
        ## how to handle each post
        bbcDict = {}
        if item['art_content'] != []:
            bbcDict['art_content'] = item['art_content']
            bbcDict['date'] = item['date']
            bbcDict['date_str'] = item['date_str']
            bbcDict['title'] = item['title']
            bbcDict['url'] = item['url']
            self.db[self.collection_name].insert_one(dict(bbcDict))
        return item
        # self.db[self.collection_name].insert(dict(item))
        # logging.debug("Post added to MongoDB")
        # return item
This is my crawler
from datetime import datetime as dt
import scrapy
from ArtScraper.items import ArtscraperItem


class PostSpider(scrapy.Spider):
    article = ""
    name = 'crawly'
    allowed_domains = []
    start_urls = ['http://feeds.bbci.co.uk/arabic/rss.xml']

    def parse(self, response):
        # get the subreddit from the URL
        # sub = response.url.split('/')[4]
        # Get the title
        # parse thru each of the posts
        # for post in response.css('div.thing'):
        articles = response.xpath('//channel/item')
        for article in articles:
            item = ArtscraperItem()
            print('hello')
            item['date'] = dt.today()
            item['date_str'] = article.xpath('pubDate/text()').extract_first()
            item['url'] = article.xpath('link/text()').extract_first()
            item['title'] = article.xpath('title/text()').extract_first()
            url = item['url']
            yield scrapy.Request(
                url,
                callback=self.parse_article,
                meta={'item': item},  # carry over our item
            )
            # request = scrapy.Request(url, callback=self.parse_article)
            # request.meta['item'] = item
            # yield request

    def parse_article(self, response):
        item = response.meta['item']
        pars = response.xpath("//div[@class='story-body']/div[@class='story-body__inner']/p/text()").extract()
        item['art_content'] = '-'.join(pars)
        print("HHHH")
        yield item
Thanks in advance.
You can filter out duplicates by keeping a set of seen titles on your MongoPipeline class as the items are processed, and raising DropItem to discard duplicates in process_item. The official docs provide a great example. You can then save to MongoDB when the item is returned.
In your case, the duplicates filter in your pipeline would look like this:
import logging
import pymongo
from scrapy.exceptions import DropItem
class MongoPipeline(object):

    collection_name = 'articles'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.titles_seen = set()

    @classmethod
    def from_crawler(cls, crawler):
        ## pull in information from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        ## initializing spider
        ## opening db connection
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        ## clean up when spider is closed
        self.client.close()

    def process_item(self, item, spider):
        if item['title'] in self.titles_seen:
            raise DropItem("Duplicate item title found: %s" % item)
        else:
            self.titles_seen.add(item['title'])
            return item
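If the check should also survive restarts (the set above lives only in memory, so duplicates from a previous run would not be caught), you could query the collection itself and insert only when no match is found. A rough sketch, assuming the title field identifies a duplicate, which would replace process_item above:
    def process_item(self, item, spider):
        # look the title up in MongoDB instead of an in-memory set
        if self.db[self.collection_name].find_one({'title': item['title']}) is not None:
            raise DropItem("Duplicate item title found: %s" % item)
        self.db[self.collection_name].insert_one(dict(item))
        return item
For larger collections, an index on title keeps that lookup fast.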
In my case it was necessary to import ItemAdapter to convert the Item to a dict:
from itemadapter import ItemAdapter

def process_item(self, item, spider):
    adapter = ItemAdapter(item)
    if self.db[self.collection_name].find_one({'id': adapter['id']}) is not None:
        # update the stored document with the freshly scraped fields
        dado = self.db[self.collection_name].find_one_and_update(
            {'id': adapter['id']}, {'$set': ItemAdapter(item).asdict()})
        ## ----> raise DropItem(f"Duplicate item found: {item!r}") <------
        print(f"Duplicate item found: {dado!r}")
    else:
        self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
    return item
I preferred to update the existing document rather than raise DropItem.
I am trying to get data using Scrapy. It does not show any error, but I still do not get any results. Can someone please help?
Below is the code I am using.
Spider
import scrapy


class SecSpider(scrapy.Spider):
    name = 'Sec'
    allowed_domains = ['www.sec.gov']
    start_urls = ['https://www.sec.gov/litigation/litreleases.shtml']

    def parse(self, response):
        rows = response.xpath('//*[@id="mainlist"]//tr')[2:]
        for row in rows:
            link = row.xpath('.//@href').extract_first()
            number = row.xpath('.//a/text()').extract_first()
            date = row.xpath('.//td[2]/text()').extract_first()
            title = row.xpath('.//td[3]/text()').extract()
            yield {
                "Link": link,
                "Number": number,
                "Date": date,
                "Title": title
            }
pipeline
import pymssql


class ScrapingPipeline(object):
    def __init__(self):
        self.conn = pymssql.connect(host='localhost', user='sa', password='data1234', database='Sec')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        self.cursor.execute("INSERT INTO updates(link, number, date, title) VALUES (%s, %s, %s, %s)",
                            (item['Link'], item['Number'], item['Date'], item['Title']))
        self.conn.commit()
        return item
settings
ITEM_PIPELINES = {'Scraping.pipelines.ScrapingPipeline': 300}
items
import scrapy


class ScrapingItem(scrapy.Item):
    link = scrapy.Field()
    number = scrapy.Field()
    date = scrapy.Field()
    title = scrapy.Field()
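Two things are worth checking here, since nothing above is obviously wrong syntactically. First, title is extracted with extract(), so item['Title'] is a list, and pymssql will most likely reject a list as a query parameter; the other three fields use extract_first() and are plain strings (or None). A hedged sketch of a process_item that normalizes the values before inserting, reusing the updates table and columns from the pipeline above:
    def process_item(self, item, spider):
        # join the title list into a single string and skip rows without a link
        title = " ".join(item['Title']).strip() if item['Title'] else None
        if not item['Link']:
            return item
        self.cursor.execute(
            "INSERT INTO updates(link, number, date, title) VALUES (%s, %s, %s, %s)",
            (item['Link'], item['Number'], item['Date'], title))
        self.conn.commit()
        return item
Second, check the crawl log: if the XPath in parse matches nothing, no items are yielded, the pipeline is never called, and no error is shown; the item_scraped_count value in the final stats tells you whether any items were produced at all.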
I have a spider that reads the start_urls from a MySQL database and scrapes an unknown number of links from each page. I want to use pipelines.py to update the database with the scraped links, but I don't know how to get the start_url back into the pipeline for the SQL UPDATE statement.
Here is the spider code which works.
import scrapy
import MySQLdb
import MySQLdb.cursors
from scrapy.http.request import Request
from youtubephase2.items import Youtubephase2Item


class youtubephase2(scrapy.Spider):
    name = 'youtubephase2'

    def start_requests(self):
        conn = MySQLdb.connect(user='uname', passwd='password', db='YouTubeScrape', host='localhost', charset="utf8", use_unicode=True)
        cursor = conn.cursor()
        cursor.execute('SELECT resultURL FROM SearchResults;')
        rows = cursor.fetchall()
        for row in rows:
            if row:
                yield Request(row[0], self.parse)
        cursor.close()

    def parse(self, response):
        for sel in response.xpath('//a[contains(@class, "yt-uix-servicelink")]'):
            item = Youtubephase2Item()
            item['pageurl'] = sel.xpath('@href').extract()
            yield item
And here is the pipelines.py, where I want to update the database with the scraped links, using the start_url as the WHERE criterion of the SQL UPDATE statement. The start_url in the SQL statement is a placeholder for what I would like to accomplish.
import MySQLdb
import MySQLdb.cursors
import hashlib
import re
from scrapy import log
from scrapy.exceptions import DropItem
from twisted.enterprise import adbapi
from youtubephase2.items import Youtubephase2Item


class MySQLStorePipeline(object):
    def __init__(self):
        self.conn = MySQLdb.connect(user='uname', passwd='password', db='YouTubeScrape', host='localhost', charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            # start_url below is the placeholder for the value I don't know how to get into the pipeline
            self.cursor.execute("""UPDATE SearchResults SET PageURL = %s WHERE ResultURL = start_url""",
                                (item['pageurl']))
            self.conn.commit()
        except MySQLdb.Error as e:
            log.msg("Error %d: %s" % (e.args[0], e.args[1]))
        return item
Hopefully my question is clear enough. I have used pipelines.py successfully in the past to insert items into a database.
You can use the meta Request parameter to pass relevant information between related requests and items:
def start_requests(self):
    conn = MySQLdb.connect(user='uname', passwd='password', db='YouTubeScrape', host='localhost', charset="utf8", use_unicode=True)
    cursor = conn.cursor()
    cursor.execute('SELECT resultURL FROM SearchResults;')
    rows = cursor.fetchall()
    for row in rows:
        if row:
            yield Request(row[0], self.parse, meta=dict(start_url=row[0]))
    cursor.close()

def parse(self, response):
    for sel in response.xpath('//a[contains(@class, "yt-uix-servicelink")]'):
        item = Youtubephase2Item()
        item['pageurl'] = sel.xpath('@href').extract()
        item['start_url'] = response.meta['start_url']
        yield item
Now, you could also use response.url, but this can change because of redirects or other things, so it could later differ from what you have in your database.
Last, you have to update your pipeline so that item['start_url'] is passed as the start_url parameter to your cursor.execute call, as sketched below.
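A rough sketch of what that process_item could then look like, assuming start_url has been added as a scrapy.Field on Youtubephase2Item and keeping in mind that pageurl is a list because it is taken with extract():
    def process_item(self, item, spider):
        try:
            # use the start_url carried on the item as the WHERE criterion
            self.cursor.execute(
                "UPDATE SearchResults SET PageURL = %s WHERE ResultURL = %s",
                (item['pageurl'][0], item['start_url']))
            self.conn.commit()
        except MySQLdb.Error as e:
            log.msg("Error %d: %s" % (e.args[0], e.args[1]))
        return item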
I made the improvement according to the suggestion from alexce below. What I need is like the picture below, however each row/line should be one review: with date, rating, review text and link.
I need the item processor to process every review on every page.
Currently TakeFirst() only takes the first review of each page, so for 10 pages I only get 10 lines/rows, as in the picture below.
Spider code is below:
import scrapy
from amazon.items import AmazonItem


class AmazonSpider(scrapy.Spider):
    name = "amazon"
    allowed_domains = ['amazon.co.uk']
    start_urls = [
        'http://www.amazon.co.uk/product-reviews/B0042EU3A2/'.format(page) for page in xrange(1, 114)
    ]

    def parse(self, response):
        for sel in response.xpath('//*[@id="productReviews"]//tr/td[1]'):
            item = AmazonItem()
            item['rating'] = sel.xpath('div/div[2]/span[1]/span/@title').extract()
            item['date'] = sel.xpath('div/div[2]/span[2]/nobr/text()').extract()
            item['review'] = sel.xpath('div/div[6]/text()').extract()
            item['link'] = sel.xpath('div/div[7]/div[2]/div/div[1]/span[3]/a/@href').extract()
            yield item
I started from scratch, and the following spider should be run with
scrapy crawl amazon -t csv -o Amazon.csv --loglevel=INFO
so that opening the CSV file with a spreadsheet shows one review per row, with date, rating, review text and link.
Hope this helps :-)
import scrapy


class AmazonItem(scrapy.Item):
    rating = scrapy.Field()
    date = scrapy.Field()
    review = scrapy.Field()
    link = scrapy.Field()


class AmazonSpider(scrapy.Spider):

    name = "amazon"
    allowed_domains = ['amazon.co.uk']
    start_urls = ['http://www.amazon.co.uk/product-reviews/B0042EU3A2/']

    def parse(self, response):
        for sel in response.xpath('//table[@id="productReviews"]//tr/td/div'):
            item = AmazonItem()
            item['rating'] = sel.xpath('./div/span/span/span/text()').extract()
            item['date'] = sel.xpath('./div/span/nobr/text()').extract()
            item['review'] = sel.xpath('./div[@class="reviewText"]/text()').extract()
            item['link'] = sel.xpath('.//a[contains(.,"Permalink")]/@href').extract()
            yield item

        xpath_Next_Page = './/table[@id="productReviews"]/following::*//span[@class="paging"]/a[contains(.,"Next")]/@href'
        if response.xpath(xpath_Next_Page):
            url_Next_Page = response.xpath(xpath_Next_Page).extract()[0]
            request = scrapy.Request(url_Next_Page, callback=self.parse)
            yield request
If using -t csv (as proposed by Frank in the comments) does not work for you for some reason, you can always use the built-in CsvItemExporter directly in a custom pipeline, e.g.:
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter


class AmazonPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('output.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
which you need to add to ITEM_PIPELINES:
ITEM_PIPELINES = {
    'amazon.pipelines.AmazonPipeline': 300
}
Also, I would use an Item Loader with input and output processors to join the review text and replace newlines with spaces. Create an ItemLoader class:
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import TakeFirst, Join, MapCompose


class AmazonItemLoader(ItemLoader):
    default_output_processor = TakeFirst()

    review_in = MapCompose(lambda x: x.replace("\n", " "))
    review_out = Join()
Then, use it to construct an Item:
def parse(self, response):
    for sel in response.xpath('//*[@id="productReviews"]//tr/td[1]'):
        loader = AmazonItemLoader(item=AmazonItem(), selector=sel)

        loader.add_xpath('rating', './/div/div[2]/span[1]/span/@title')
        loader.add_xpath('date', './/div/div[2]/span[2]/nobr/text()')
        loader.add_xpath('review', './/div/div[6]/text()')
        loader.add_xpath('link', './/div/div[7]/div[2]/div/div[1]/span[3]/a/@href')

        yield loader.load_item()
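One caveat: the scrapy.contrib imports above were removed in later Scrapy releases. On a recent version the same loader would, to the best of my knowledge, be written against scrapy.loader and the itemloaders package instead; an equivalent sketch:
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, Join, MapCompose


class AmazonItemLoader(ItemLoader):
    default_output_processor = TakeFirst()

    # join the review paragraphs into one string, replacing newlines with spaces
    review_in = MapCompose(lambda x: x.replace("\n", " "))
    review_out = Join()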
I am trying to grasp the concept of Scrapy callbacks. I have not been able to find any answers that suited my issue, as I need to yield items in both parse methods and yet be able to use a callback.
This is my spider:
import scrapy
import csv
from scrapycrawler.items import DmozItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request


class DmozSpider(CrawlSpider):
    name = "dmoz"
    allowed_domains = ["snipplr.com"]

    def start_requests(self):
        # for i in xrange(1000):
        for i in range(1, 1000):
            yield self.make_requests_from_url("http://www.snipplr.com/all/page/%d" % i)

    def parse(self, response):
        for sel in response.xpath('//ol[@class="snippets marg"]/li[1]/h3'):
            item = DmozItem()
            item['title'] = sel.xpath('a/text()').extract()
            item['link'] = sel.xpath('a/@href').extract()
            return Request(item['link'], request.meta={'item': item}, callback=self.parse2)
            yield item

    def parse2(self, response):
        for sel in response.xpath('//div[@class="description"]'):
            item = response.meta["item"]
            item['desc'] = sel.xpath('p/text()').extract()
            yield item
This is my pipeline:
import csv
from scrapy.exceptions import DropItem
from scrapy import log
import sys
import mysql.connector


class CsvWriterPipeline(object):

    def __init__(self):
        self.connection = mysql.connector.connect(host='localhost', user='sq', passwd='rt', db='sq')
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        self.cursor.execute("SELECT title,url FROM items WHERE title= %s", item['title'])
        result = self.cursor.fetchone()
        if result:
            log.msg("Item already in database: %s" % item, level=log.DEBUG)
        else:
            self.cursor.execute(
                "INSERT INTO items (title, url) VALUES (%s, %s, %s)",
                (item['title'][0], item['link'], item['desc'][0]))
            self.connection.commit()
            log.msg("Item stored : " % item, level=log.DEBUG)
        return item

    def handle_error(self, e):
        log.err(e)
I am basically trying to get the data both from the first page and from the page it links to, after that page has been crawled. I am using the Scrapy web crawler and MySQL.
You just need to yield a Request, not return:
def parse(self, response):
    for sel in response.xpath('//ol[@class="snippets marg"]/li[1]/h3'):
        item = DmozItem()
        item['title'] = sel.xpath('a/text()').extract()
        # extract_first() returns a single URL string, which Request expects
        item['link'] = sel.xpath('a/@href').extract_first()
        # pass the partially-filled item along via meta instead of yielding it here
        yield Request(item['link'], meta={'item': item}, callback=self.parse2)
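This way the partially filled item travels to parse2 through response.meta and is yielded there once desc has been added, so the pipeline stores each snippet exactly once instead of also receiving an incomplete copy from parse.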