I have a spider that reads its start_urls from a MySQL database and scrapes an unknown number of links from each page. I want to use pipelines.py to update the database with the scraped links, but I don't know how to get the start_url back into the pipeline for the SQL UPDATE statement.
Here is the spider code which works.
import scrapy
import MySQLdb
import MySQLdb.cursors
from scrapy.http.request import Request

from youtubephase2.items import Youtubephase2Item

class youtubephase2(scrapy.Spider):
    name = 'youtubephase2'

    def start_requests(self):
        conn = MySQLdb.connect(user='uname', passwd='password', db='YouTubeScrape', host='localhost', charset="utf8", use_unicode=True)
        cursor = conn.cursor()
        cursor.execute('SELECT resultURL FROM SearchResults;')
        rows = cursor.fetchall()
        for row in rows:
            if row:
                yield Request(row[0], self.parse)
        cursor.close()

    def parse(self, response):
        for sel in response.xpath('//a[contains(@class, "yt-uix-servicelink")]'):
            item = Youtubephase2Item()
            item['pageurl'] = sel.xpath('@href').extract()
            yield item
And here is the pipeline.py where I want to update the database with the scraped links, using the start_url as the WHERE criterion for the SQL UPDATE statement. So start_url in the SQL statement is a placeholder for what I would like to accomplish.
import MySQLdb
import MySQLdb.cursors
import hashlib
import re
from scrapy import log
from scrapy.exceptions import DropItem
from twisted.enterprise import adbapi

from youtubephase2.items import Youtubephase2Item

class MySQLStorePipeline(object):

    def __init__(self):
        self.conn = MySQLdb.connect(user='uname', passwd='password', db='YouTubeScrape', host='localhost', charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            self.cursor.execute("""UPDATE SearchResults SET PageURL = %s WHERE ResultURL = start_url""",
                                (item['pageurl'],))
            self.conn.commit()
        except MySQLdb.Error as e:
            log.msg("Error %d: %s" % (e.args[0], e.args[1]))
        return item
Hopefully my question is clear enough. I have used pipeline.py successfully in the past to insert items into a database.
You can use the meta Request parameter to pass relevant information between related requests and items:
def start_requests(self):
    conn = MySQLdb.connect(user='uname', passwd='password', db='YouTubeScrape', host='localhost', charset="utf8", use_unicode=True)
    cursor = conn.cursor()
    cursor.execute('SELECT resultURL FROM SearchResults;')
    rows = cursor.fetchall()
    for row in rows:
        if row:
            yield Request(row[0], self.parse, meta=dict(start_url=row[0]))
    cursor.close()

def parse(self, response):
    for sel in response.xpath('//a[contains(@class, "yt-uix-servicelink")]'):
        item = Youtubephase2Item()
        item['pageurl'] = sel.xpath('@href').extract()
        item['start_url'] = response.meta['start_url']
        yield item
Now, you could also use response.url, but it can change because of redirects or other things, so it could later differ from what you have in your database.
Last, you have to update your pipeline to also pass item['start_url'] as the start_url parameter in your cursor.execute call.
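For example, a minimal sketch of what that could look like, assuming you also add a start_url field to Youtubephase2Item (the table and column names come from the question; everything else is illustrative):

# items.py -- add a field to carry the originating URL
import scrapy

class Youtubephase2Item(scrapy.Item):
    pageurl = scrapy.Field()
    start_url = scrapy.Field()

# pipelines.py -- inside MySQLStorePipeline
def process_item(self, item, spider):
    try:
        self.cursor.execute("""UPDATE SearchResults SET PageURL = %s WHERE ResultURL = %s""",
                            (item['pageurl'][0], item['start_url']))  # extract() returns a list, so take the first match
        self.conn.commit()
    except MySQLdb.Error as e:
        log.msg("Error %d: %s" % (e.args[0], e.args[1]))
    return item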
Related
I am currently learning Scrapy and I want to crawl the price and properties of Rolex watches. So far my crawler runs and displays all the data correctly. However, now I want to save the data from my crawler to a MySQL database, and I am having problems. I get the data with the crawler "Watchbot", but the pipeline does not receive the items. I have already checked settings.py and enabled the pipeline. Where exactly is my error, and how can I transfer the data to the MySQL DB?
This is my crawler, called Watchbot:
import scrapy
from scrapy.crawler import CrawlerProcess

from watches.watches.items import WatchesItem

class WatchbotSpider(scrapy.Spider):
    name = "watchbot"
    start_urls = ["https://www.watch.de/english/rolex.html"]

    def parse(self, response, **kwargs):
        for link in response.css("div.product-item-link a::attr(href)"):
            url = link.get()
            yield scrapy.Request(url, callback=self.parse_categories)

    def parse_categories(self, response):
        item = WatchesItem()
        item["itemnr"] = response.xpath('//span[@itemprop="sku"]/text()').extract()[0]
        item["reference"] = response.xpath('//span[@itemprop="mpn"]/text()').extract()[0]
        item["year"] = response.xpath(
            '//div[@class="product-option baujahr"]/div[@class="product-option-value"]/text()'
        ).extract()[0]
        yield item
This is the Pipeline.py:
import mysql
from watches.watches.spiders import watchbot

class WatchesPipeline(object):

    def __init__(self):
        self.conn = mysql.connector.connect(host="", user="", passwd="", database="")
        self.curr = self.conn.cursor()

    def process_item(self, item, spider):
        self.store_db(item)
        return item

    def store_db(self, item):
        self.curr.execute(
            """insert into watches values (%s), (%s), (%s)""",
            (item["YEAR"][0], item["REFERENCE"][0], item["ITEMNR"][0]),
        )
        self.conn.commit()
And this is my items.py:
import scrapy

class WatchesItem(scrapy.Item):
    year = scrapy.Field()
    itemnr = scrapy.Field()
    reference = scrapy.Field()
    print(itemnr)
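For reference, a minimal sketch of how the pipeline could be brought in line with this items.py, assuming the likely culprits are the bare import mysql, the uppercase item keys (item keys are case-sensitive, so item["YEAR"] does not match the year field), and the VALUES (%s), (%s), (%s) syntax; the column names in the INSERT are an assumption about the watches table:

import mysql.connector  # import the connector submodule explicitly

class WatchesPipeline(object):

    def __init__(self):
        self.conn = mysql.connector.connect(host="", user="", passwd="", database="")
        self.curr = self.conn.cursor()

    def process_item(self, item, spider):
        # keys must match the lowercase field names declared in items.py
        self.curr.execute(
            "INSERT INTO watches (year, reference, itemnr) VALUES (%s, %s, %s)",  # assumed column names
            (item["year"], item["reference"], item["itemnr"]),
        )
        self.conn.commit()
        return item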
I'm trying to build a for-each loop for products: I want to scrape them one by one from an array, and I would also like to know where to place the loop.
The array I want to use is called EAN.
import scrapy
import re
import MySQLdb

class ProductSpider(scrapy.Spider):
    db = MySQLdb.connect(host="localhost",    # Host name
                         user="root",         # User name
                         passwd="",           # Password
                         db="ProductSpider")  # Database name
    cur = db.cursor()
    cur.execute("SELECT EAN FROM product")

    name = 'product'
    EAN = []
    rows = cur.fetchall()
    for row in rows:
        EAN = (row[0])
        # print(row)  # activate to see EAN codes
    start_urls = ['https://www.google.nl/search?client=opera&biw=1880&bih=1008&output=search&tbm=shop&q='+EAN+'&oq='+EAN+'&gs_l=products-cc.12...0.0.0.2112.0.0.0.0.0.0.0.0..0.0....0...1ac..64.products-cc..0.0.0....0.Mgj-aNT06E4']
    custom_settings = {
        'FEED_URI': 'tmp/' + EAN + '.csv'
    }
Here is what I've made.
for EAN in range(len(EAN)):  # finish the for loop
    EAN.append('EAN')
    print(EAN)
def parse(self, response):
    urls = response.css('.MCpGKc > a::attr("href")').extract()
    for url in urls:
        url = response.urljoin(url)
        yield scrapy.Request(url, callback=self.parse)

    response.selector.remove_namespaces()
    all_sellers = response.css(".os-seller-name-primary > a::text").extract()
    all_prices = response.css("td.os-total-col::text").re("\d+\,\d{1,2}")
    all_urls = response.css(".os-seller-name-primary > a::attr('href')").extract()

    for item in zip(all_prices, all_sellers, all_urls):
        scrapped_info = {
            'price': item[0],
            'seller': item[1],
            'url': item[2]
        }
        yield scrapped_info

    next_page_url = response.css('.pag-prev-next-links > a:last-child::attr(href)').extract_first()
    if next_page_url:
        next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(url=next_page_url, callback=self.parse)
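A minimal sketch of where such a loop could go, assuming the goal is one search request per EAN code: fetch the codes once, then yield a request per code from start_requests and carry the code along in meta. The query and database settings are taken from the question; the shortened search URL and the meta key are illustrative:

import scrapy
import MySQLdb

class ProductSpider(scrapy.Spider):
    name = 'product'

    def start_requests(self):
        db = MySQLdb.connect(host="localhost", user="root", passwd="", db="ProductSpider")
        cur = db.cursor()
        cur.execute("SELECT EAN FROM product")
        for (ean,) in cur.fetchall():
            ean = str(ean)
            # one shopping search per EAN; URL shortened here for readability
            url = 'https://www.google.nl/search?tbm=shop&q=' + ean
            yield scrapy.Request(url, callback=self.parse, meta={'ean': ean})
        cur.close()
        db.close()

    def parse(self, response):
        ean = response.meta['ean']  # which EAN this result page belongs to
        # ... extract sellers and prices as in the question, tagging each item with ean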
I am trying to get data using Scrapy. It is not showing any error, but I am still not getting any results. Can someone please help?
Below is the code I am using.
Spider
import scrapy

class SecSpider(scrapy.Spider):
    name = 'Sec'
    allowed_domains = ['www.sec.gov']
    start_urls = ['https://www.sec.gov/litigation/litreleases.shtml']

    def parse(self, response):
        rows = response.xpath('//*[@id="mainlist"]//tr')[2:]
        for row in rows:
            link = row.xpath('.//@href').extract_first()
            number = row.xpath('.//a/text()').extract_first()
            date = row.xpath('.//td[2]/text()').extract_first()
            title = row.xpath('.//td[3]/text()').extract()
            yield {
                "Link": link,
                "Number": number,
                "Date": date,
                "Title": title
            }
pipeline
import pymssql

class ScrapingPipeline(object):

    def __init__(self):
        self.conn = pymssql.connect(host='localhost', user='sa', password='data1234', database='Sec')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        self.cursor.execute("INSERT INTO updates(link, number, date, title) VALUES (%s, %s, %s, %s)",
                            (item['Link'], item['Number'], item['Date'], item['Title']))
        self.conn.commit()
        return item
settings
ITEM_PIPELINES = {'Scraping.pipelines.ScrapingPipeline': 300}
items
import scrapy

class ScrapingItem(scrapy.Item):
    link = scrapy.Field()
    number = scrapy.Field()
    date = scrapy.Field()
    title = scrapy.Field()
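Since nothing here logs a failure, a quick way to narrow this down is to check in scrapy shell whether the selectors match anything on the live page at all; if they return nothing, no items are yielded and the pipeline is never called. The XPath below is the one from the spider; whether it still matches the current page structure is an assumption:

scrapy shell "https://www.sec.gov/litigation/litreleases.shtml"
>>> rows = response.xpath('//*[@id="mainlist"]//tr')[2:]
>>> len(rows)                                  # 0 means the spider has nothing to yield
>>> rows[0].xpath('.//@href').extract_first()  # spot-check one row if any rows matched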
I am trying to grasp the concept of Scrapy callbacks. I have not been able to find any answers that suited my issue, as I need to yield items in both parse methods and yet be able to use a callback.
This is my spider:
import scrapy
import csv
from scrapycrawler.items import DmozItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request

class DmozSpider(CrawlSpider):
    name = "dmoz"
    allowed_domains = ["snipplr.com"]

    def start_requests(self):
        #for i in xrange(1000):
        for i in range(1, 1000):
            yield self.make_requests_from_url("http://www.snipplr.com/all/page/%d" % i)

    def parse(self, response):
        for sel in response.xpath('//ol[@class="snippets marg"]/li[1]/h3'):
            item = DmozItem()
            item['title'] = sel.xpath('a/text()').extract()
            item['link'] = sel.xpath('a/@href').extract()
            return Request(item['link'], meta={'item': item}, callback=self.parse2)
            yield item

    def parse2(self, response):
        for sel in response.xpath('//div[@class="description"]'):
            item = response.meta["item"]
            item['desc'] = sel.xpath('p/text()').extract()
            yield item
This is my pipeline:
import csv
from scrapy.exceptions import DropItem
from scrapy import log
import sys
import mysql.connector

class CsvWriterPipeline(object):

    def __init__(self):
        self.connection = mysql.connector.connect(host='localhost', user='sq', passwd='rt', db='sq')
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        self.cursor.execute("SELECT title,url FROM items WHERE title= %s", item['title'])
        result = self.cursor.fetchone()
        if result:
            log.msg("Item already in database: %s" % item, level=log.DEBUG)
        else:
            self.cursor.execute(
                "INSERT INTO items (title, url) VALUES (%s, %s, %s)",
                (item['title'][0], item['link'], item['desc'][0]))
            self.connection.commit()
            log.msg("Item stored: %s" % item, level=log.DEBUG)
        return item

    def handle_error(self, e):
        log.err(e)
I am basically trying to get data both from the first page and from the page behind it after that page has been crawled. I am using the Scrapy web crawler and MySQL.
You just need to yield a Request, not return:
def parse(self, response):
    for sel in response.xpath('//ol[@class="snippets marg"]/li[1]/h3'):
        item = DmozItem()
        item['title'] = sel.xpath('a/text()').extract()
        item['link'] = sel.xpath('a/@href').extract()
        yield Request(item['link'], meta={'item': item}, callback=self.parse2)
        yield item
For three days I have been trying to save the respective start_url in a meta attribute so I can pass it along as an item to subsequent requests in Scrapy, and then use the start_url to look up a dictionary and populate my output with additional data. Actually it should be straightforward, because it is explained in the documentation ...
There is a discussion in the Google Scrapy group, and there was a question here as well, but I can't get it to run :(
I am new to Scrapy and I think it is a great framework, but for my project I have to know the start_urls of all requests, and that looks quite complicated.
I would really appreciate some help!
At the moment my code looks like this:
class example(CrawlSpider):
    name = 'example'
    start_urls = ['http://www.example.com']

    rules = (
        Rule(SgmlLinkExtractor(allow=('/blablabla/', )), callback='parse_item'),
    )

    def parse(self, response):
        for request_or_item in super(example, self).parse(response):
            if isinstance(request_or_item, Request):
                request_or_item = request_or_item.replace(meta={'start_url': response.meta['start_url']})
            yield request_or_item

    def make_requests_from_url(self, url):
        return Request(url, dont_filter=True, meta={'start_url': url})

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        item = testItem()
        print response.request.meta, response.url
I wanted to delete this answer, as it doesn't solve the OP's problem, but I thought I would leave it as a Scrapy example.
Warning:
When writing crawl spider rules, avoid using parse as callback, since
the CrawlSpider uses the parse method itself to implement its logic.
So if you override the parse method, the crawl spider will no longer
work.
Use BaseSpider instead:
class Spider(BaseSpider):
    name = "domain_spider"

    def start_requests(self):
        last_domain_id = 0
        chunk_size = 10
        cursor = settings.db.cursor()

        while True:
            cursor.execute("""
                SELECT domain_id, domain_url
                FROM domains
                WHERE domain_id > %s AND scraping_started IS NULL
                LIMIT %s
                """, (last_domain_id, chunk_size))
            self.log('Requesting %s domains after %s' % (chunk_size, last_domain_id))
            rows = cursor.fetchall()
            if not rows:
                self.log('No more domains to scrape.')
                break

            for domain_id, domain_url in rows:
                last_domain_id = domain_id
                request = self.make_requests_from_url(domain_url)
                item = items.Item()
                item['start_url'] = domain_url
                item['domain_id'] = domain_id
                item['domain'] = urlparse.urlparse(domain_url).hostname
                request.meta['item'] = item

                cursor.execute("""
                    UPDATE domains
                    SET scraping_started = %s
                    WHERE domain_id = %s
                    """, (datetime.now(), domain_id))

                yield request
...
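To round the pattern out, the callback handling those requests can then pull the pre-filled item out of response.meta and complete it before yielding. A minimal sketch under that assumption (the desc field and the selector are hypothetical):

def parse(self, response):
    # the item prepared in start_requests rides along in request.meta
    item = response.meta['item']
    item['desc'] = response.xpath('//title/text()').extract()  # hypothetical field and selector
    yield item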