Simple Scrapy crawler won't use item pipeline - python

Background
I am trying to learn Scrapy by example.
At this point, I have made a CrawlSpider that is able to navigate to a page, follow all links, extract data using CSS selectors, and populate items using an item loader.
I am now trying to add an arbitrary pipeline, just so I can get it working.
My issue is that item pipelines for CrawlSpiders appear to need additional definitions beyond those used with a scrapy.Spider - I cannot find working examples of CrawlSpider pipelines.
What my code actually does
Starts at the Wikipedia page for Lexus and follows all other Wikipedia pages linked from it.
It then extracts the title of each page and the headings from the first table. These are stored in items, which are then printed to a .txt document.
lexuswikicrawl.py
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader

from wikicars.items import WikiItem

# Global variables
pages = {}
failed_pages = 1
filename = 'wiki.txt'


class GenSpiderCrawl(CrawlSpider):
    name = 'lexuswikicrawl'

    # Start at the Lexus Wikipedia page, and only follow the Wikipedia links
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Lexus']

    # There are no specific rules specified below, therefore all links will be followed
    rules = (Rule(LinkExtractor(), callback='parse_page'),)

    # Define what selectors to crawl
    def parse_page(self, response):
        global pages
        global failed_pages

        # Try and capture the page title using a CSS selector;
        # if not, keep count of the number of failed selectors
        try:
            pagename = (response.css('div#content.mw-body > h1#firstHeading::text').extract())[0]
        except:
            pagename = ('Failed pagename: ' + str(failed_pages))
            failed_pages += 1

        # Capture table categories that fall under the CSS selector for regular text items
        tabcat = response.css('#mw-content-text > div > table.infobox.vcard > tbody > tr > th::text').extract()

        # Capture table categories that fall under the CSS selector for text with hyperlinks
        for i in range(20):
            tabcatlink1 = response.css('#mw-content-text > div > table.infobox.vcard > tbody > tr:nth-child(' + str(i) + ') > th > a::text').extract()
            tabcatlink2 = response.css('#mw-content-text > div > table.infobox.vcard > tbody > tr:nth-child(' + str(i) + ') > th > div > a::text').extract()
            if len(tabcatlink1) > 0:
                tabcat.append(tabcatlink1)
            else:
                pass
            if len(tabcatlink2) > 0:
                tabcat.append(tabcatlink2)
            else:
                continue

        # Load 'pagename' and 'categories' into a new item
        page_loader = ItemLoader(item=WikiItem(), selector=tabcat)
        page_loader.add_value('title', pagename)
        page_loader.add_value('categories', tabcat)

        # Store the items in an overarching dictionary structure
        pages[pagename] = page_loader.load_item()

        # Try and print the results to a text document
        try:
            with open(filename, 'a+') as f:
                f.write('Page Name:' + str(pages[pagename]['title']) + '\n')
        except:
            with open(filename, 'a+') as f:
                f.write('Page name error' + '\n')

        try:
            with open(filename, 'a+') as f:
                f.write('Categories:' + str(pages[pagename]['categories']) + '\n')
        except:
            with open(filename, 'a+') as f:
                f.write('Table Category data not available' + '\n')
items.py
import scrapy
from scrapy.loader.processors import TakeFirst, MapCompose


def convert_categories(categories):
    categories = (str(categories).upper().strip('[]'))
    return categories


def convert_title(title):
    title = title.upper()
    return title


class WikiItem(scrapy.Item):
    categories = scrapy.Field(
        input_processor=MapCompose(convert_categories)
    )
    title = scrapy.Field(
        input_processor=MapCompose(convert_title)
    )
pipelines.py
This is where I suspect the trouble lies. My current thinking is that I need something more than just process_item() to get my pipeline to run. I have tried everything I can to rearrange the examples from: https://docs.scrapy.org/en/latest/topics/item-pipeline.html.
from scrapy.exceptions import DropItem


class PipelineCheck(object):
    def process_item(self, item, spider):
        print('I am a pipeline this is an item:' + str(item) + '\n')
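For reference, this is the minimal shape I believe a pipeline is supposed to have, based on the docs example linked above (the class name is just a placeholder of mine); whether a CrawlSpider needs anything beyond this is exactly what I am unsure about:

class MinimalPipelineSketch(object):
    # process_item() is called for every item the spider yields;
    # returning the item passes it on to any later pipelines.
    def process_item(self, item, spider):
        spider.logger.info('Pipeline saw item: %r', item)
        return item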
settings.py
I have declared my pipeline and its priority. I have also declared a generic user agent. Is there an additional variable I need to set?
BOT_NAME = 'wikicars'
SPIDER_MODULES = ['wikicars.spiders']
NEWSPIDER_MODULE = 'wikicars.spiders'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = {
    'wikicars.pipelines.PipelineCheck': 100,
}

Related

Scrapy DropItem for URL that contains a substring

I'm very new to Python, and I'm using scrapy. Right now, I have two spiders, one for Google, and one for the pages themselves. I plan to combine them, but haven't yet because I want to troubleshoot the pages separately. Both spiders work fine, but I want to be able to drop internal links from my list of scraped links (so those that contain a '#' symbol). I've tried this a million different ways, including using find & regex, changing variable names, not using variables, adding "self" to the expression, but nothing seems to affect it. The pipeline is enabled -- it just doesn't seem to do anything. Any help is appreciated.
pipelines.py
from scrapy.exceptions import DropItem


class SpiderValidationPipeline:
    def drop_links(self, item, spider):
        url = str(item.get('links'))
        marker = '#'
        if item.get('links'):
            if marker in url:
                raise DropItem("Internal Link")
            else:
                return item
items.py
import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags


def remove_nt(text):
    return text.replace('\n', '').replace('\t', '').replace('[edit]', '').replace('/sæs/', '').replace('\"', '')\
        .replace('\u2014', '—')


class GoogleCrawlItem(scrapy.Item):
    title = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
    link = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
    desc = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())


class PageCrawlItem(scrapy.Item):
    title = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
    meta = scrapy.Field()
    h1 = scrapy.Field(input_processor=MapCompose(remove_tags))
    h2 = scrapy.Field(input_processor=MapCompose(remove_tags, remove_nt))
    h3 = scrapy.Field(input_processor=MapCompose(remove_tags, remove_nt))
    h4 = scrapy.Field(input_processor=MapCompose(remove_tags, remove_nt))
    paragraph = scrapy.Field(input_processor=MapCompose(remove_tags, remove_nt))
    links = scrapy.Field(input_processor=MapCompose(remove_tags))
pagespider.py
import scrapy
from scrapy.loader import ItemLoader

from google_crawl.items import PageCrawlItem


class PageSpider(scrapy.Spider):
    name = 'page'
    start_urls = ['https://en.wikipedia.org/wiki/Software_as_a_service']

    def parse(self, response):
        for meta_element in response.css('head'):
            page_item = ItemLoader(item=PageCrawlItem(), selector=meta_element)
            page_item.add_css('title', 'title')
            page_item.add_css('meta', 'meta')
            yield page_item.load_item()
        for par_item in response.css('body'):
            par_item = ItemLoader(item=PageCrawlItem(), selector=par_item)
            par_item.add_css('paragraph', 'p')
            par_item.add_css('h1', 'h1')
            yield par_item.load_item()
        for h2s in response.css('body'):
            h2_item = ItemLoader(item=PageCrawlItem(), selector=h2s)
            h2_item.add_css('h2', 'h2')
            yield h2_item.load_item()
        for h3s in response.css('body'):
            h3_item = ItemLoader(item=PageCrawlItem(), selector=h3s)
            h3_item.add_css('h3', 'h3')
            yield h3_item.load_item()
        for h4s in response.css('body'):
            h4_item = ItemLoader(item=PageCrawlItem(), selector=h4s)
            h4_item.add_css('h4', 'h4')
            yield h4_item.load_item()
        for links in response.css('body'):
            link_item = ItemLoader(item=PageCrawlItem(), selector=links)
            link_item.add_css('links', 'a::attr(href)')
            yield link_item.load_item()
settings.py
BOT_NAME = 'google_crawl'
SPIDER_MODULES = ['google_crawl.spiders']
NEWSPIDER_MODULE = 'google_crawl.spiders'
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 7
ITEM_PIPELINES = {
    'google_crawl.pipelines.SpiderValidationPipeline': 100,
}
The way your spider is set up right now, you yield all of your "links" in one list in one item. The method in your pipeline would only work if the links field in the item was a string.
Another problem is that the method name in your pipeline needs to be changed to process_item for it to work with the Scrapy API. Additionally, since not all of your items contain the "links" key, you need to test that the field is present in the item before attempting to filter out unwanted URLs.
For example, just make these alterations:
pipelines.py
class SpiderValidationPipeline:
    def process_item(self, item, spider):
        if "links" in item:
            item["links"] = [i for i in item.get("links") if "#" not in i]
        return item
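If you later change the spider to yield one link per item, so that the "links" field holds a single URL string, then your original DropItem approach would work once it is renamed to process_item. A rough sketch under that assumption:

from scrapy.exceptions import DropItem


class SpiderValidationPipeline:
    def process_item(self, item, spider):
        # Assumes 'links' holds a single URL string rather than a list
        url = item.get('links')
        if url and '#' in url:
            raise DropItem('Internal link: ' + url)
        return item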

Crawling through multiple links on Scrapy

I'm trying to first crawl through the main page of this website for the links to a table for each year. Then I'd like to scrape each of those pages, while keeping a record of each year.
So far I have my spider constructed as:
div = response.xpath('//*[@id="sidebar"]/div[1]/nav/ul/li[5]/div')
hrefs = div.xpath('*//a').extract()

splits = {}

for href in hrefs:
    split = href.split('"')
    link = split[1]
    date = split[2]
    clean_date = "".join(re.findall("[^><a/]", date))
    clean_link = "http://www.ylioppilastutkinto.fi" + str(link)
    splits[clean_date] = clean_link
I would then like to go through each link in this file and crawl through them, using the following logic:
table = resp.xpath('//*[@id="content"]/table/tbody')
rows = table.xpath('//tr')

data_dict = {"Category":
             [w3lib.html.remove_tags(num.get()) for num in rows[0].xpath('td')[1:]]
             }

for row in rows[1:]:
    data = row.xpath('td')
    title = w3lib.html.remove_tags(data[0].get())
    nums = [w3lib.html.remove_tags(num.get()) for num in data[1:]]
    data_dict[title] = nums
My problem is that I couldn't find a way to do this effectively. Calling scrapy.Request on the URL returns a response with just the content <html></html>. If there were a way for the response object to resemble the one given by the fetch command in the Scrapy shell, that would be ideal, since I've based the selection logic on testing with that command.
Edit:
Here's the entire spider so far
The idea is to run the first for loop to get the links and then the second for loop to extract the tables from said links.
import scrapy
import regex as re
from scrapy.http import HtmlResponse
import w3lib.html


class MainSpider(scrapy.Spider):
    name = 'links'
    allowed_domains = ['www.ylioppilastutkinto.fi/ylioppilastutkinto/pisterajat']
    start_urls = ['https://www.ylioppilastutkinto.fi/ylioppilastutkinto/pisterajat/']

    def parse(self, response):
        div = response.xpath('//*[@id="sidebar"]/div[1]/nav/ul/li[5]/div')
        hrefs = div.xpath('*//a').extract()

        splits = {}

        for href in hrefs:
            split = href.split('"')
            link = split[1]
            date = split[2]
            clean_date = "".join(re.findall("[^><a/]", date))
            clean_link = "http://www.ylioppilastutkinto.fi" + str(link)
            splits[clean_date] = clean_link

        for date, url in splits.items():
            resp = HtmlResponse(url)

            table = resp.xpath('//*[@id="content"]/table/tbody')
            rows = table.xpath('//tr')

            data_dict = {"Category": [w3lib.html.remove_tags(num.get()) for num in rows[0].xpath('td')[1:]]}

            for row in rows[1:]:
                data = row.xpath('td')
                title = w3lib.html.remove_tags(data[0].get())
                nums = [w3lib.html.remove_tags(num.get()) for num in data[1:]]
                data_dict[title] = nums

            yield {
                'Date': date,
                'Scores': data_dict}
Initializing an HtmlResponse(url) doesn't accomplish anything, since the class doesn't make the request itself.
To add a request to Scrapy's scheduler, you need to yield one, e.g.: yield scrapy.Request(url, callback=self.parse).
That being said, there are many improvements you can make to your spider:
Use Scrapy's built-in LinkExtractor instead of string splitting.
Use CSS selectors instead of the hardcoded XPaths.
Use selector.root.text instead of w3lib.remove_tags (to remove the dependency entirely).
Here is a working example:
import scrapy
from scrapy.linkextractors import LinkExtractor


class MainSpider(scrapy.Spider):
    name = 'links'
    allowed_domains = ['www.ylioppilastutkinto.fi']
    start_urls = ['https://www.ylioppilastutkinto.fi/ylioppilastutkinto/pisterajat/']

    def parse(self, response):
        le = LinkExtractor(
            allow_domains=self.allowed_domains,
            restrict_xpaths='//*[@id="sidebar"]/div[1]/nav/ul/li[5]/div',
        )
        for link in le.extract_links(response):
            yield scrapy.Request(
                url=link.url,
                callback=self.parse_table,
                cb_kwargs={'date': link.text},
            )

    def parse_table(self, response, date):
        rows = response.css('#content table tbody tr')
        if not rows:
            print(f'No table found for url: {response.url}')
            return

        category = [char.root.text for char in rows[0].css('td strong')[1:]]
        if not category:
            category = [char.root.text for char in rows[0].css('td')[1:]]

        for row in rows[1:]:
            cols = row.css('td')
            title = cols[0].root.text
            nums = [col.root.text for col in cols[1:]]

            yield {
                'Date': date,
                'Category': category,
                title: nums
            }
Note that your category parsing doesn't appear to work. I'm not exactly sure what you are trying to extract, so I'll leave that one for you.

Unable to go to the next page

Trying to scrape the Internet Archive website (Wayback Machine): https://web.archive.org/web/20150906222155mp_/https://www.zalando.co.uk/womens-clothing/.
I am successful in scraping the content of the first page, but can't move to the next page. I have tried multiple XPath expressions to move to the next pages:
# 1
next_page_url = response.xpath("//li[a[contains(.,'>')]]//@href").extract_first()  # does not work
# 2
next_page_url = response.xpath(//a[@class='catalogPagination_page' and text() ='>'])[1]//@href).get()  # does not work
I have tried converting to an absolute URL (and without), but again with no luck.
Can anyone help with a new XPath or CSS selector so that I can finally scrape the next pages?
Below you can see my full code:
# -*- coding: utf-8 -*-
import scrapy


class ZalandoWomenSpider(scrapy.Spider):
    name = 'zalando_women_historic_2015'
    allowed_domains = ['www.web.archive.org']
    start_urls = ['https://web.archive.org/web/20150906222155mp_/https://www.zalando.co.uk/womens-clothing/']

    def parse(self, response):
        products = response.xpath("//a[@class='catalogArticlesList_productBox']")
        for product in products:
            link = product.xpath(".//@href").get()
            absolute_url = f"https://web.archive.org{link}"
            yield scrapy.Request(url=absolute_url, callback=self.parse_product, dont_filter=True, meta={'link': link})

        # process next page
        next_page_url = response.xpath("//li[a[contains(.,'>')]]//@href").extract_first()  # (//a[@class='catalogPagination_page' and text() ='>'])[1]//@href
        absolute_next_page_url = f"https://web.archive.org{next_page_url}"
        # absolute_next_page_url = next_page_url
        # absolute_next_page_url = response.urljoin(next_page_url)
        if next_page_url:
            yield scrapy.Request(url=absolute_next_page_url, callback=self.parse)

    def parse_product(self, response):
        link = response.request.meta['link']
        brand = response.xpath("//span[@itemprop='brand']/text()").get()
        price = response.xpath("//span[@class='price oldPrice nowrap']/text()").get()
        price1 = response.xpath("//span[@itemprop='price']/text()").get()
        price2 = response.xpath("//div[@class='boxPrice']//span[contains(@class,'price')]/text()").get()
        disc_price = response.xpath("//span[@class='price specialPrice nowrap']/text()").get()
        product_type = response.xpath("//span[@itemprop='name']/text()").get()
        material = response.xpath("//div[@class='content']//li[contains(.,'material')]/text()").get()

        yield {
            'brand_name': brand,
            'product_price': price,
            'product_price1': price1,
            'product_price2': price2,
            'product_price_b4_disc': disc_price,
            'link': link,
            'product_type': product_type,
            'material': material}
next_page_url = response.xpath(".//a[@class='catalogPagination_page' and text()='>']/@href").get()
This will get: '/web/20150906222155/https://www.zalando.co.uk/womens-clothing/?p=2'
You can then use split("/") to remove the "/web/201509..." bit.
Note 1: I used the " " quotes inside the parentheses.
Note 2: in Scrapy you can also use "response.follow" to save having to join a relative URL to a base URL.
Check this post as well:
Scrapy response.follow query
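As a rough illustration of Note 2 (a hypothetical snippet that would live inside the spider's parse method, assuming the selector above is correct), response.follow joins the relative href against the page URL for you:

# inside ZalandoWomenSpider.parse(self, response)
next_page_url = response.xpath(".//a[@class='catalogPagination_page' and text()='>']/@href").get()
if next_page_url:
    yield response.follow(next_page_url, callback=self.parse)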

Web scraping: moving to next pages using Scrapy to get all the data

I would need to scrape all the reviews from a product on Amazon:
https://www.amazon.com/Cascade-ActionPacs-Dishwasher-Detergent-Packaging/dp/B01NGTV4J5/ref=pd_rhf_cr_s_trq_bnd_0_6/130-6831149-4603948?_encoding=UTF8&pd_rd_i=B01NGTV4J5&pd_rd_r=b6f87690-19d7-4dba-85c0-b8f54076705a&pd_rd_w=AgonG&pd_rd_wg=GG9yY&pf_rd_p=4e0a494a-50c5-45f5-846a-abfb3d21ab34&pf_rd_r=QAD0984X543RFMNNPNF2&psc=1&refRID=QAD0984X543RFMNNPNF2
I am using Scrapy to do this. However, it seems that the following code is not scraping all the reviews, as they are split across different pages. A human would click on "all reviews" first, then click on "next page". I am wondering how I could do this using Scrapy or a different tool in Python.
There are 5893 reviews for this product and I cannot get this information manually.
Currently my code is the following:
import scrapy
from scrapy.crawler import CrawlerProcess


class My_Spider(scrapy.Spider):
    name = 'spid'
    start_urls = ['https://www.amazon.com/Cascade-ActionPacs-Dishwasher-Detergent-Packaging/dp/B01NGTV4J5/ref=pd_rhf_cr_s_trq_bnd_0_6/130-6831149-4603948?_encoding=UTF8&pd_rd_i=B01NGTV4J5&pd_rd_r=b6f87690-19d7-4dba-85c0-b8f54076705a&pd_rd_w=AgonG&pd_rd_wg=GG9yY&pf_rd_p=4e0a494a-50c5-45f5-846a-abfb3d21ab34&pf_rd_r=QAD0984X543RFMNNPNF2&psc=1&refRID=QAD0984X543RFMNNPNF2']

    def parse(self, response):
        for row in response.css('div.review'):
            item = {}
            item['author'] = row.css('span.a-profile-name::text').extract_first()
            rating = row.css('i.review-rating > span::text').extract_first().strip().split(' ')[0]
            item['rating'] = int(float(rating.strip().replace(',', '.')))
            item['title'] = row.css('span.review-title > span::text').extract_first()
            yield item
And to execute the crawler:
process = CrawlerProcess({
})
process.crawl(My_Spider)
process.start()
Can you tell me if it is possible to move to the next pages and scrape all the reviews?
This should be the page where the reviews are stored.
With the url https://www.amazon.com/Cascade-ActionPacs-Dishwasher-Detergent-Packaging/product-reviews/B01NGTV4J5/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber=<PUT PAGE NUMBER HERE> you could do something like this:
import scrapy
from scrapy.crawler import CrawlerProcess


class My_Spider(scrapy.Spider):
    name = 'spid'
    start_urls = ['https://www.amazon.com/Cascade-ActionPacs-Dishwasher-Detergent-Packaging/product-reviews/B01NGTV4J5/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber=1']

    def parse(self, response):
        for row in response.css('div.review'):
            item = {}
            item['author'] = row.css('span.a-profile-name::text').extract_first()
            rating = row.css('i.review-rating > span::text').extract_first().strip().split(' ')[0]
            item['rating'] = int(float(rating.strip().replace(',', '.')))
            item['title'] = row.css('span.review-title > span::text').extract_first()
            yield item

        # follow the pagination link, if there is one
        next_page = response.css('ul.a-pagination > li.a-last > a::attr(href)').get()
        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page))

How do I iterate through a selector list while using ItemLoaders in Scrapy? Details in description

I am trying to scrape a list of the countries that are members of the UN, along with their details. Here is my approach without using Item Loaders.
Here, I am getting a parent tag that contains the details of all the UN members, like name, date of joining, website, phone number and UN headquarters. Not all countries have a website, phone number and the other details.
I am running a loop through the parent tags, extracting the details one by one, storing them in variables and then assigning those variables to the item.
import scrapy
from learn_scrapy.items import UNMemberItem


class UNMemberDetails(scrapy.Spider):
    name = 'UN_details'
    start_urls = ['http://www.un.org/en/member-states/index.html']

    def parse(self, response):
        """
        Get the details of the UN members
        """
        members_tag = response.css('div.member-state.col-md-12')
        #item_list = []

        for member in members_tag:
            member_name = member.css('span.member-state-name::text').extract()
            member_join_date = member.css('span.date-display-single::text').extract()
            member_website = member.css('div.site > a::text').extract()
            member_phone = member.css('div.phone > ul > li::text').extract()
            member_address = member.css('div.mail > a::text').extract()
            member_national_holiday = member.css('div.national-holiday::text').extract()

            UN_member = UNMemberItem()
            UN_member['country_name'] = member_name
            UN_member['join_date'] = member_join_date

            if len(member_website) == 0:
                member_website = 'NA'
            UN_member['website'] = member_website

            if len(member_phone) == 0:
                member_phone = 'NA'
            UN_member['phone'] = member_phone

            if len(member_address) == 0:
                member_address = 'NA'
            UN_member['mail_address'] = member_address

            UN_member['national_holiday'] = member_national_holiday

            print(UN_member)
            UN_member = str(UN_member)
            #item_list.append(UN_members)

            with open('un_members_list.txt', 'a') as f:
                f.write(UN_member + "\n")
And this is my progress. I get a whole list of countries in one item, but I want a single country per item. What should be my approach in this case?
import scrapy
from learn_scrapy.items import UNMemberItem
from scrapy.loader import ItemLoader


class UNMemberDetails(scrapy.Spider):
    name = 'UN_details_loader'
    start_urls = ['http://www.un.org/en/member-states/index.html']

    def parse(self, response):
        item_loader_object = ItemLoader(UNMemberItem(), response=response)
        nested_loader = item_loader_object.nested_css('div.member-state.col-md-12')
        nested_loader.add_css('country_name', 'span.member-state-name::text')
        nested_loader.add_css('join_date', 'span.date-display-single::text')
        nested_loader.add_css('website', 'div.site > a::text')
        nested_loader.add_css('phone', 'div.phone > ul > li::text')
        nested_loader.add_css('mail_address', 'div.mail > a::text')
        nested_loader.add_css('national_holiday', 'div.national-holiday::text')
After some research, I found the solution. Instead of this:

def parse(self, response):
    item_loader_object = ItemLoader(UNMemberItem(), response=response)

you will have to specify the selector parameter. That means your ItemLoader will extract the items from the specified selector instead of from the whole response (the whole web page).
It is like selecting a part of the page out of the whole response, then selecting your items from that part while iterating through it.

def parse(self, response):
    item_loader_object = ItemLoader(UNMemberItem(), selector=member_tag)
And the new code would look something like this:

members_tag = response.css('div.member-state.col-md-12')

for member in members_tag:
    item_loader = ItemLoader(UNMemberItem(), selector=member)
    item_loader.add_css('country_name', 'span.member-state-name::text')
    item_loader.add_css('join_date', 'span.date-display-single::text')
    item_loader.add_css('website', 'div.site > a::text')
    item_loader.add_css('phone', 'div.phone > ul > li::text')
    item_loader.add_css('mail_address', 'div.mail > a::text')
    item_loader.add_css('national_holiday', 'div.national-holiday::text')
    yield item_loader.load_item()

The code is much cleaner than the very first code snippet in the question and gets the job done.
