I'm very new to Python, and I'm using scrapy. Right now, I have two spiders, one for Google, and one for the pages themselves. I plan to combine them, but haven't yet because I want to troubleshoot the pages separately. Both spiders work fine, but I want to be able to drop internal links from my list of scraped links (so those that contain a '#' symbol). I've tried this a million different ways, including using find & regex, changing variable names, not using variables, adding "self" to the expression, but nothing seems to affect it. The pipeline is enabled -- it just doesn't seem to do anything. Any help is appreciated.
pipelines.py
from scrapy.exceptions import DropItem
class SpiderValidationPipeline:
    """Pipeline intended to drop items whose 'links' value contains a '#'."""

    # NOTE(review): Scrapy calls a pipeline method named process_item();
    # a method named drop_links is presumably never invoked by the
    # framework -- confirm against the item pipeline documentation.
    def drop_links(self, item, spider):
        """Raise DropItem when the text form of item['links'] contains '#'."""
        # str() is applied to the whole 'links' value; if that value is a
        # list, one '#' in any entry matches for the entire item.
        url = str(item.get('links'))
        marker = '#'
        # NOTE(review): indentation was lost in the paste; the else below is
        # assumed to pair with the inner if -- confirm with the author.
        if item.get('links'):
            if marker in url:
                raise DropItem("Internal Link")
            else:
                return item
items.py
import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags
def remove_nt(text):
    """Return *text* with layout characters and known page noise removed.

    The substitutions are applied one after another, in the order listed:
    newlines, tabs, the literal '[edit]', the literal '/sæs/' and double
    quotes are deleted; the final pair maps the em dash onto itself.
    """
    substitutions = (
        ('\n', ''),
        ('\t', ''),
        ('[edit]', ''),
        ('/sæs/', ''),
        ('\"', ''),
        ('\u2014', '—'),
    )
    cleaned = text
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned
class GoogleCrawlItem(scrapy.Item):
    """Item with a title, link and description field."""
    # Every field strips HTML tags on input (remove_tags) and uses the
    # TakeFirst output processor.
    title = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
    link = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
    desc = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
class PageCrawlItem(scrapy.Item):
    """Item holding the title, meta tags, headings, paragraphs and links of a page."""
    # Only 'title' has an output processor (TakeFirst); the other fields
    # declare none.
    title = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
    # 'meta' is stored without any processor.
    meta = scrapy.Field()
    h1 = scrapy.Field(input_processor=MapCompose(remove_tags))
    # h2-h4 and paragraphs additionally run remove_nt after the tags are stripped.
    h2 = scrapy.Field(input_processor=MapCompose(remove_tags, remove_nt))
    h3 = scrapy.Field(input_processor=MapCompose(remove_tags, remove_nt))
    h4 = scrapy.Field(input_processor=MapCompose(remove_tags, remove_nt))
    paragraph = scrapy.Field(input_processor=MapCompose(remove_tags, remove_nt))
    links = scrapy.Field(input_processor=MapCompose(remove_tags))
pagespider.py
import scrapy
from scrapy.loader import ItemLoader
from google_crawl.items import PageCrawlItem
class PageSpider(scrapy.Spider):
    """Spider that loads parts of the start page into PageCrawlItem objects."""
    name = 'page'
    start_urls = ['https://en.wikipedia.org/wiki/Software_as_a_service']

    def parse(self, response):
        """Yield separate items for head data, paragraphs/h1, h2, h3, h4 and links."""
        # <head>: page title and meta tags.
        for meta_element in response.css('head'):
            page_item = ItemLoader(item=PageCrawlItem(), selector=meta_element)
            page_item.add_css('title', 'title')
            page_item.add_css('meta', 'meta')
            yield page_item.load_item()
        # <body>: paragraphs and h1 headings. The loop variable is rebound to
        # the loader on the first line of the loop body.
        for par_item in response.css('body'):
            par_item = ItemLoader(item=PageCrawlItem(), selector=par_item)
            par_item.add_css('paragraph', 'p')
            par_item.add_css('h1', 'h1')
            yield par_item.load_item()
        # One item each for the h2, h3 and h4 headings of <body>.
        for h2s in response.css('body'):
            h2_item = ItemLoader(item=PageCrawlItem(), selector=h2s)
            h2_item.add_css('h2', 'h2')
            yield h2_item.load_item()
        for h3s in response.css('body'):
            h3_item = ItemLoader(item=PageCrawlItem(), selector=h3s)
            h3_item.add_css('h3', 'h3')
            yield h3_item.load_item()
        for h4s in response.css('body'):
            h4_item = ItemLoader(item=PageCrawlItem(), selector=h4s)
            h4_item.add_css('h4', 'h4')
            yield h4_item.load_item()
        # All href values found under <body> are added to the single 'links'
        # field of one item, i.e. the links are yielded together, not one
        # item per link.
        for links in response.css('body'):
            link_item = ItemLoader(item=PageCrawlItem(), selector=links)
            link_item.add_css('links', 'a::attr(href)')
            yield link_item.load_item()
settings.py
# Scrapy project settings for the google_crawl project.
BOT_NAME = 'google_crawl'
# Package(s) searched for spiders, and the package new spiders are created in.
SPIDER_MODULES = ['google_crawl.spiders']
NEWSPIDER_MODULE = 'google_crawl.spiders'
# Browser-like User-Agent string sent with requests.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
# robots.txt rules are not honoured.
ROBOTSTXT_OBEY = False
# Delay between requests (presumably seconds -- see the Scrapy settings reference).
DOWNLOAD_DELAY = 7
# Enabled item pipelines, keyed by import path; the integer is the pipeline's order value.
ITEM_PIPELINES = {
    'google_crawl.pipelines.SpiderValidationPipeline': 100,
}
The way your spider is set up right now, you yield all of your "links" in one list in one item. The method in your pipeline would only work if the links field in the item was a string.
Another problem is that the method name in your pipeline needs to be changed to process_item for it to work with the scrapy API. Additionally, since not all of your items contain the "links" key, you need to test that the field is present in the item before attempting to filter out unwanted URLs.
For example just make these alterations:
pipelines.py
class SpiderValidationPipeline:
    """Item pipeline that removes in-page anchor links from an item's ``links``."""

    def process_item(self, item, spider):
        """Drop every link containing ``#`` from ``item["links"]``.

        Items without a ``links`` key are passed through untouched. A
        ``links`` value of ``None`` is treated as "no links", and a single
        string is treated as a one-element list so it is filtered as a whole
        URL rather than character by character.

        Args:
            item: The scraped item (any mapping-like object).
            spider: The spider that produced the item (unused).

        Returns:
            The same item, with ``links`` replaced by the filtered list when
            the key is present.
        """
        if "links" not in item:
            return item
        links = item["links"]
        if links is None:
            links = []
        elif isinstance(links, str):
            links = [links]
        item["links"] = [link for link in links if "#" not in link]
        return item
Related
Background
I am trying to learn Scrapy by example.
At this point, I have made a CrawlSpider that is able to navigate to a page, follow all links, extract data using CSS selectors and populate items using an item loader.
I am now trying to add an arbitrary pipeline, just so I can get it working.
My issue is that item pipelines for CrawlSpiders appear to need more definitions than those for a plain scrapy.Spider - I cannot find working examples of CrawlSpider pipelines.
What my code actually does
Starts at the wikipedia page for lexus and follows all other wikipedia pages that link from it.
It then extracts the title of each page, and headings from the first table. These are stored in items, which are then printed to a .txt document.
lexuswikicrawl.py
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from wikicars.items import WikiItem
#Global variables
pages = {}
failed_pages = 1
filename = 'wiki.txt'
class GenSpiderCrawl(CrawlSpider):
    """CrawlSpider that records each page's title and infobox headings in wiki.txt."""
    name = 'lexuswikicrawl'
    # Start at the Lexus Wikipedia page, and only follow Wikipedia links
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Lexus']
    # The LinkExtractor has no restrictions, therefore all links will be followed
    rules = (Rule(LinkExtractor(), callback = 'parse_page'),)

    # Define what selectors to crawl
    def parse_page(self, response):
        """Extract title and table headings, store them in ``pages`` and append them to the text file.

        NOTE(review): this method neither returns nor yields the loaded
        item, so nothing is handed to the item pipeline -- presumably the
        reason the pipeline never prints anything.
        """
        global pages
        global failed_pages
        # Try and capture the page title using a CSS selector
        # If not, keep count of the amount of failed selectors
        try:
            pagename = (response.css('div#content.mw-body > h1#firstHeading::text').extract())[0]
        except:
            pagename = ('Failed pagename: '+ str(failed_pages))
            failed_pages += 1
        # Capture table categories that fall under the CSS selector for regular text items
        tabcat = response.css('#mw-content-text > div > table.infobox.vcard > tbody > tr > th::text').extract()
        # Capture table categories that fall under the CSS selector for text with hyperlinks
        # NOTE(review): range(20) starts at 0, so the first selector is
        # tr:nth-child(0) -- confirm that this is intended.
        for i in range(20):
            tabcatlink1 = response.css('#mw-content-text > div > table.infobox.vcard > tbody > tr:nth-child('+str(i)+') > th > a::text').extract()
            tabcatlink2 = response.css('#mw-content-text > div > table.infobox.vcard > tbody > tr:nth-child('+str(i)+') > th > div > a::text').extract()
            # append() adds the whole extracted result as a single element of tabcat.
            if len(tabcatlink1) > 0:
                tabcat.append(tabcatlink1)
            else:
                pass
            if len(tabcatlink2) > 0:
                tabcat.append(tabcatlink2)
            else:
                continue
        # Load 'pagename' and 'categories' into a new item
        # NOTE(review): tabcat (the extracted values) is passed as the
        # loader's selector -- confirm this is what ItemLoader expects.
        page_loader = ItemLoader(item=WikiItem(), selector = tabcat)
        page_loader.add_value('title', pagename)
        page_loader.add_value('categories', tabcat)
        # Store the items in an overarching dictionary structure
        pages[pagename] = page_loader.load_item()
        # Try and print the results to a text document; on any error a
        # placeholder line is written instead.
        try:
            with open(filename, 'a+') as f:
                f.write('Page Name:' + str(pages[pagename]['title'])+ '\n')
        except:
            with open(filename, 'a+') as f:
                f.write('Page name error'+ '\n')
        try:
            with open(filename, 'a+') as f:
                f.write('Categories:' + str(pages[pagename]['categories'])+ '\n')
        except:
            with open(filename, 'a+') as f:
                f.write('Table Category data not available' + '\n')
items.py
import scrapy
from scrapy.loader.processors import TakeFirst, MapCompose
def convert_categories(categories):
    """Return the upper-cased text form of *categories* without outer square brackets."""
    as_text = str(categories)
    return as_text.upper().strip('[]')
def convert_title(title):
    """Return *title* converted to upper case."""
    return title.upper()
class WikiItem(scrapy.Item):
    """Item with the upper-cased title and table categories of a wiki page."""
    # Each incoming category value is upper-cased and stripped of outer brackets.
    categories = scrapy.Field(
        input_processor=MapCompose(convert_categories)
    )
    # Each incoming title value is upper-cased.
    # (The closing parenthesis of this field was missing in the original.)
    title = scrapy.Field(
        input_processor=MapCompose(convert_title)
    )
pipelines.py
This is where I suspect trouble is caused. My current thinking is that I need something more than just process_item() to get my Pipeline to run. I have tried all I can to rearrange the examples from: https://docs.scrapy.org/en/latest/topics/item-pipeline.html.
from scrapy.exceptions import DropItem
class PipelineCheck(object):
    """Minimal pipeline that prints every item it receives."""

    def process_item(self, item, spider):
        """Print the item.

        NOTE(review): nothing is returned here; per the Scrapy item pipeline
        documentation process_item() is expected to return the item (or
        raise DropItem) -- confirm and add ``return item``.
        """
        print('I am a pipeline this is an item:' + str(item) + '\n')
settings.py
I have declared my pipeline and its priority. I have also declared a generic user agent. Is there an additional variable i need to set?
# Scrapy project settings for the wikicars project.
BOT_NAME = 'wikicars'
# Package(s) searched for spiders, and the package new spiders are created in.
SPIDER_MODULES = ['wikicars.spiders']
NEWSPIDER_MODULE = 'wikicars.spiders'
# Browser-like User-Agent string sent with requests.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
# robots.txt rules are honoured.
ROBOTSTXT_OBEY = True
# Enabled item pipelines, keyed by import path; the integer is the pipeline's order value.
ITEM_PIPELINES = {
    'wikicars.pipelines.PipelineCheck': 100,
}
Please help me optimize my Scrapy spider. In particular, the next-page pagination is not working. There are a lot of pages, and each page has 50 items.
I scrape the 50 items (links) of the first page in parse_items, and the items of the following pages should also be scraped in parse_items.
import scrapy
from scrapy import Field
from fake_useragent import UserAgent
class DiscoItem(scrapy.Item):
    """Item holding one release text and one image URL."""
    release = Field()
    images = Field()
class discoSpider(scrapy.Spider):
    """Spider that searches discogs.com for releases and scrapes each result page."""
    name = 'myspider'
    allowed_domains = ['discogs.com']
    # input() is evaluated while the class body runs, i.e. when the class is defined.
    query = input('ENTER SEARCH MUSIC TYPE : ')
    start_urls =['http://www.discogs.com/search?q=%s&type=release'%query]
    # NOTE(review): the attribute is spelled 'custome_settings'; Scrapy's
    # per-spider settings attribute is 'custom_settings', so this dict is
    # presumably ignored -- confirm. The keys also do not look like Scrapy
    # setting names.
    custome_settings = {
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
        'handle_httpstatus_list' : [301,302,],
        'download_delay' :10}

    def start_requests(self):
        """Request the first start URL and handle it with parse()."""
        yield scrapy.Request(url=self.start_urls[0], callback=self.parse)

    def parse(self, response):
        """Request the next result page and every release linked from this page."""
        print('START parse \n')
        print("*****",response.url)
        #next page pagination
        next_page =response.css('a.pagination_next::attr(href)').extract_first()
        next_page = response.urljoin(next_page)
        # The next page is routed through parse_items2, which requests the
        # same URL again.
        yield scrapy.Request(url=next_page, callback=self.parse_items2)
        headers={}
        for link in response.css('a.search_result_title ::attr(href)').extract():
            ua = UserAgent()# random user agent
            headers['User-Agent'] = ua.random
            yield scrapy.Request(response.urljoin(link),headers=headers,callback=self.parse_items)

    def parse_items2(self, response):
        """Re-request the URL of the response just received, with parse() as callback."""
        print('parse_items2 *******', response.url)
        yield scrapy.Request(url=response.url, callback=self.parse)

    def parse_items(self,response):
        """Yield a DiscoItem with the thumbnail URL and one release text of a release page."""
        print("parse_items**********",response.url)
        # A single item object is created here and reused for every yield below.
        items = DiscoItem()
        for imge in response.css('div#page_content'):
            img = imge.css("span.thumbnail_center img::attr(src)").extract()[0]
            items['images'] = img
            release=imge.css('div.content a ::text').extract()
            # The fifth extracted text node is taken as the release.
            items['release']=release[4]
            yield items
When I try running your code (after fixing the many indentation, spelling and letter case errors), this line is shown in scrapy's log:
2018-03-05 00:47:28 [scrapy.dupefilters] DEBUG: Filtered duplicate request: <GET https://www.discogs.com/search/?q=rock&type=release&page=2> - no more duplicates will be shown (see DUPEFILTER_DEBUG to show all duplicates)
Scrapy will filter duplicate requests by default, and your parse_items2() method does nothing but create duplicate requests. I fail to see any reason for that method existing.
What you should do instead is specify the `parse()` method as callback for your requests, and avoid having an extra method that does nothing:
yield scrapy.Request(url=next_page, callback=self.parse)
Try this for pagination:
try:
nextpage = response.urljoin( response.xpath("//*[contains(#rel,'next') and contains(#id,'next')]/#url")[0].extract() )
yield scrapy.Request( nextpage, callback=self.parse )
except:
pass
My spider looks like this
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request
from ProjectName.items import ProjectName
class SpidernameSpider(CrawlSpider):
    """Anonymised CrawlSpider with one link-following rule and one rule that calls parse_item."""
    name = 'spidername'
    allowed_domains = ['webaddress']
    start_urls = ['webaddress/query1']
    rules = (
        # First rule has no callback; second rule sends its responses to parse_item.
        Rule(LinkExtractor(restrict_css='horizontal css')),
        Rule(LinkExtractor(restrict_css='vertical css'),
             callback='parse_item')
    )

    def parse_item(self, response):
        """Extract the text of two CSS selections into fields '1' and '2'."""
        item = ProjectName()
        # NOTE(review): 1_css / 2_css look like placeholders from anonymising
        # the post; a Python identifier cannot start with a digit.
        1_css = 'css1::text'
        item['1'] = response.css(1_css).extract()
        # NOTE(review): a fresh item is created here, so the '1' value
        # assigned above is not on the item that gets returned.
        item = ProjectName()
        2_css = 'css2::text'
        item['2'] = response.css(2_css).extract()
        return item
and my pipeline like this:
from scrapy.exceptions import DropItem
class RemoveIncompletePipeline(object):
    """Pipeline intended to drop items whose field '1' is empty."""

    # NOTE(review): Scrapy calls a pipeline method named process_item(); a
    # method named reminc_item is presumably never invoked by the framework
    # -- confirm against the item pipeline documentation.
    def reminc_item(self, item, spider):
        """Return the item when item['1'] is truthy, otherwise raise DropItem."""
        if item['1']:
            return item
        else:
            raise DropItem("Missing content in %s" % item)
Everything works fine: when the value for field 1 is missing, the corresponding item is taken out of the output.
But, when I change start_urls, in order to do the job for multiple queries, like this:
f = open("queries.txt")
start_urls = [url.strip() for url in f.readlines()]
f.close()
or like this:
start_urls = [i.strip() for i in open('queries.txt').readlines()]
Then the output contains the items with missing value for field 1.
What's going on? And how I can avoid that?
For the record queries.txt looks like that:
webaddress/query1
webaddress/query2
According to the docs you should override start_requests method.
This method must return an iterable with the first Requests to crawl
for this spider.
This is the method called by Scrapy when the spider is opened for
scraping when no particular URLs are specified. If particular URLs are
specified, the make_requests_from_url() is used instead to create the
Requests. This method is also called only once from Scrapy, so it’s
safe to implement it as a generator.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request
from ProjectName.items import ProjectName
class SpidernameSpider(CrawlSpider):
    """CrawlSpider whose start URLs are read from ``queries.txt``."""
    name = 'spidername'
    allowed_domains = ['webaddress']
    start_urls = ['webaddress/query1']
    rules = (
        Rule(LinkExtractor(restrict_css='horizontal css')),
        Rule(LinkExtractor(restrict_css='vertical css'),
             callback='parse_item')
    )

    def start_requests(self):
        """Return one Request per non-empty line of ``queries.txt``.

        The file is opened in a ``with`` block so it is always closed, and
        blank lines are skipped because an empty URL cannot be requested.
        ``Request`` is the name imported from ``scrapy.http``.

        NOTE(review): each start page is sent straight to ``parse_item``,
        as in the original answer; confirm that bypassing the rules for the
        start pages is intended.
        """
        with open('queries.txt') as queries:
            urls = [line.strip() for line in queries]
        return [Request(url, callback=self.parse_item) for url in urls if url]

    def parse_item(self, response):
        """Extract the text of two CSS selections into fields '1' and '2'.

        A single item is filled with both fields; the original created a
        second item before setting '2', which discarded field '1'.
        """
        item = ProjectName()
        css_1 = 'css1::text'
        item['1'] = response.css(css_1).extract()
        css_2 = 'css2::text'
        item['2'] = response.css(css_2).extract()
        return item
UPD:
Just put this code into your spider class
def start_requests(self):
    """Return one Request per non-empty line of ``queries.txt``.

    Uses ``Request`` (the name the spider imports from ``scrapy.http``),
    closes the file via ``with``, and skips blank lines because an empty
    URL cannot be requested.
    """
    with open('queries.txt') as queries:
        urls = [line.strip() for line in queries]
    return [Request(url, callback=self.parse_item) for url in urls if url]
UPD:
You have wrong logic in your parse_item method. You need to fix it.
def parse_item(self, response):
    """Yield one item per ``div.card-top`` element found on the page."""
    for job in response.css('div.card-top'):
        item = ProjectName()
        # just quick example.
        # The XPath is relative (.//) so each item reads the location of its
        # own card rather than the first matching span of the whole page.
        city = job.xpath('string(.//span[@class="serp-location"])').extract_first(default='')
        item['city'] = city.replace(' ', '').replace('\n', '')
        # TODO: you should fill other item fields
        # ...
        yield item
I have a Scrapy spider and I am using XPath selectors to extract the contents of the page. Kindly check where I am going wrong.
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.selector import HtmlXPathSelector
from medicalproject.items import MedicalprojectItem
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy import Request
class MySpider(CrawlSpider):
    """CrawlSpider meant to collect patient names and stories from yananow.org."""
    name = "medical"
    allowed_domains = ["yananow.org"]
    start_urls = ["http://yananow.org/query_stories.php"]
    rules = (
        # NOTE(review): the rule names 'parse_page' as callback, but the
        # only method defined below is parse_items -- confirm which name is
        # intended.
        Rule(SgmlLinkExtractor(allow=[r'display_story.php\?\id\=\d+']),callback='parse_page',follow=True),
    )

    def parse_items(self, response):
        """Build one MedicalprojectItem per selected table cell and return them as a list."""
        hxs = HtmlXPathSelector(response)
        titles = hxs.xpath('/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td')
        items = []
        for title in titles:
            item = MedicalprojectItem()
            # Both expressions start at /html, i.e. they are absolute paths
            # and not relative to the current 'title' node.
            item["patient_name"] = title.xpath("/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td/img[1]/text()").extract()
            item["stories"] = title.xpath("/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td/div/font/p/text()").extract()
            items.append(item)
        return(items)
There are a lot of issues with your code so here is a different approach.
I opted against a CrawlSpider to have more control over the scraping process. Especially with grabbing the name from the query page and the story from a detail page.
I tried to simplify the XPath statements by not diving into the (nested) table structures but looking for patterns of content. So if you want to extract a story ... there must be a link to a story.
Here comes the tested code (with comments):
# -*- coding: utf-8 -*-
import scrapy
class MyItem(scrapy.Item):
    """Item with a name (taken from the story link) and the story text."""
    name = scrapy.Field()
    story = scrapy.Field()
class MySpider(scrapy.Spider):
    """Collect the name from each story link and the story text from its detail page."""
    name = 'medical'
    allowed_domains = ['yananow.org']
    start_urls = ['http://yananow.org/query_stories.php']

    def parse(self, response):
        """Yield one detail-page request per story link, carrying a partly filled item."""
        # Every link whose href mentions "display_story" leads to a story.
        rows = response.xpath('//a[contains(@href,"display_story")]')
        # loop over all links to stories
        for row in rows:
            myItem = MyItem()  # Create a new item
            myItem['name'] = row.xpath('./text()').extract()  # assign name from link
            story_url = response.urljoin(row.xpath('./@href').extract_first())  # extract url from link
            # create request for detail page with story
            request = scrapy.Request(url=story_url, callback=self.parse_detail)
            request.meta['myItem'] = myItem  # pass the item with the request
            yield request

    def parse_detail(self, response):
        """Add the story text to the item passed via the request meta and yield it."""
        myItem = response.meta['myItem']  # extract the item (with the name) from the response
        text_raw = response.xpath('//font[@size=3]//text()').extract()  # extract the story (text)
        # Strip each fragment with the str method so this runs on Python 3
        # (the original used the Python-2-only unicode.strip).
        myItem['story'] = ' '.join(fragment.strip() for fragment in text_raw)
        yield myItem  # return the item
I am using scrapy to collect some data. My scrapy program collects 100 elements at one session. I need to limit it to 50 or any random number. How can i do that? Any solution is welcomed. Thanks in advance
# -*- coding: utf-8 -*-
import re
import scrapy
class DmozItem(scrapy.Item):
    """Item for one posting: its link, title, tag and contact attribute."""
    # define the fields for your item here like:
    link = scrapy.Field()
    attr = scrapy.Field()
    title = scrapy.Field()
    tag = scrapy.Field()
class DmozSpider(scrapy.Spider):
    """Scrape postings from a craigslist search page, including their reply page."""
    name = "dmoz"
    allowed_domains = ["raleigh.craigslist.org"]
    start_urls = [
        "http://raleigh.craigslist.org/search/bab"
    ]
    BASE_URL = 'http://raleigh.craigslist.org/'

    def parse(self, response):
        """Yield a request for every posting linked from the search page."""
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = self.BASE_URL + link
            yield scrapy.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        """Fill link, title and tag, then request the posting's reply page.

        Returns None when the response URL does not end in ``<id>.html``.
        """
        match = re.search(r"(\w+)\.html", response.url)
        if not match:
            return None
        item_id = match.group(1)
        url = self.BASE_URL + "reply/ral/bab/" + item_id
        item = DmozItem()
        item["link"] = response.url
        item["title"] = "".join(response.xpath("//span[@class='postingtitletext']//text()").extract())
        # First attribute value, or "" when the posting has none (the
        # original indexed [0] and raised IndexError in that case).
        item["tag"] = response.xpath("//p[@class='attrgroup']/span/b/text()").extract_first(default="")
        # The item travels with the request so parse_contact can complete it.
        return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)

    def parse_contact(self, response):
        """Add the contact text from the reply page and return the finished item."""
        item = response.meta['item']
        item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
        return item
This is what CloseSpider extension and CLOSESPIDER_ITEMCOUNT setting were made for:
An integer which specifies a number of items. If the spider scrapes
more than that amount of items and those items are passed by the item
pipeline, the spider will be closed with the reason
closespider_itemcount. If zero (or non set), spiders won’t be closed
by number of passed items.
I tried alecxe answer but I had to combine all 3 limits to make it work, so leaving it here just in case someone else is having the same issue:
class GenericWebsiteSpider(scrapy.Spider):
    """This generic website spider extracts text from websites"""
    name = "generic_website"
    # Per-spider settings: the three limits are all set to 15 and are used
    # together. CLOSESPIDER_ITEMCOUNT closes the spider once that many items
    # have passed the item pipeline; see the Scrapy documentation for the
    # page-count and concurrency settings.
    custom_settings = {
        'CLOSESPIDER_PAGECOUNT': 15,
        'CONCURRENT_REQUESTS': 15,
        'CLOSESPIDER_ITEMCOUNT': 15
    }
    ...