Please help me to optimize my scrapy spider. Specially next page pagination is not working. There are lot of page per page has 50 items.
I catch first page 50 items(link) in parse_items and next page items also scrap in parse_items.
import scrapy
from scrapy import Field
from fake_useragent import UserAgent
class DiscoItem(scrapy.Item):
release = Field()
images = Field()
class discoSpider(scrapy.Spider):
name = 'myspider'
allowed_domains = ['discogs.com']
query = input('ENTER SEARCH MUSIC TYPE : ')
start_urls =['http://www.discogs.com/search?q=%s&type=release'%query]
custome_settings = {
'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
'handle_httpstatus_list' : [301,302,],
'download_delay' :10}
def start_requests(self):
yield scrapy.Request(url=self.start_urls[0], callback=self.parse)
def parse(self, response):
print('START parse \n')
print("*****",response.url)
#next page pagination
next_page =response.css('a.pagination_next::attr(href)').extract_first()
next_page = response.urljoin(next_page)
yield scrapy.Request(url=next_page, callback=self.parse_items2)
headers={}
for link in response.css('a.search_result_title ::attr(href)').extract():
ua = UserAgent()# random user agent
headers['User-Agent'] = ua.random
yield scrapy.Request(response.urljoin(link),headers=headers,callback=self.parse_items)
def parse_items2(self, response):
print('parse_items2 *******', response.url)
yield scrapy.Request(url=response.url, callback=self.parse)
def parse_items(self,response):
print("parse_items**********",response.url)
items = DiscoItem()
for imge in response.css('div#page_content'):
img = imge.css("span.thumbnail_center img::attr(src)").extract()[0]
items['images'] = img
release=imge.css('div.content a ::text').extract()
items['release']=release[4]
yield items
When I try running your code (after fixing the many indentation, spelling and letter case errors), this line is shown in scrapy's log:
2018-03-05 00:47:28 [scrapy.dupefilters] DEBUG: Filtered duplicate request: <GET https://www.discogs.com/search/?q=rock&type=release&page=2> - no more duplicates will be shown (see DUPEFILTER_DEBUG to show all duplicates)
Scrapy will filter duplicate requests by default, and your parse_items2() method does nothing but create duplicate requests. I fail to see any reason for that method existing.
What you should do instead is specify the ˙parse()` method as callback for your requests, and avoid having an extra method that does nothing:
yield scrapy.Request(url=next_page, callback=self.parse)
Try this for pagination:
try:
nextpage = response.urljoin( response.xpath("//*[contains(#rel,'next') and contains(#id,'next')]/#url")[0].extract() )
yield scrapy.Request( nextpage, callback=self.parse )
except:
pass
Related
I'm very new to Python, and I'm using scrapy. Right now, I have two spiders, one for Google, and one for the pages themselves. I plan to combine them, but haven't yet because I want to troubleshoot the pages separately. Both spiders work fine, but I want to be able to drop internal links from my list of scraped links (so those that contain a '#' symbol). I've tried this a million different ways, including using find & regex, changing variable names, not using variables, adding "self" to the expression, but nothing seems to affect it. The pipeline is enabled -- it just doesn't seem to do anything. Any help is appreciated.
pipelines.py
from scrapy.exceptions import DropItem
class SpiderValidationPipeline:
def drop_links(self, item, spider):
url = str(item.get('links'))
marker = '#'
if item.get('links'):
if marker in url:
raise DropItem("Internal Link")
else:
return item
items.py
import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags
def remove_nt(text):
return text.replace('\n', '').replace('\t', '').replace('[edit]', '').replace('/sæs/', '').replace('\"', '')\
.replace('\u2014', '—')
class GoogleCrawlItem(scrapy.Item):
title = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
link = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
desc = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
class PageCrawlItem(scrapy.Item):
title = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
meta = scrapy.Field()
h1 = scrapy.Field(input_processor=MapCompose(remove_tags))
h2 = scrapy.Field(input_processor=MapCompose(remove_tags, remove_nt))
h3 = scrapy.Field(input_processor=MapCompose(remove_tags, remove_nt))
h4 = scrapy.Field(input_processor=MapCompose(remove_tags, remove_nt))
paragraph = scrapy.Field(input_processor=MapCompose(remove_tags, remove_nt))
links = scrapy.Field(input_processor=MapCompose(remove_tags))
pagespider.py
import scrapy
from scrapy.loader import ItemLoader
from google_crawl.items import PageCrawlItem
class PageSpider(scrapy.Spider):
name = 'page'
start_urls = ['https://en.wikipedia.org/wiki/Software_as_a_service']
def parse(self, response):
for meta_element in response.css('head'):
page_item = ItemLoader(item=PageCrawlItem(), selector=meta_element)
page_item.add_css('title', 'title')
page_item.add_css('meta', 'meta')
yield page_item.load_item()
for par_item in response.css('body'):
par_item = ItemLoader(item=PageCrawlItem(), selector=par_item)
par_item.add_css('paragraph', 'p')
par_item.add_css('h1', 'h1')
yield par_item.load_item()
for h2s in response.css('body'):
h2_item = ItemLoader(item=PageCrawlItem(), selector=h2s)
h2_item.add_css('h2', 'h2')
yield h2_item.load_item()
for h3s in response.css('body'):
h3_item = ItemLoader(item=PageCrawlItem(), selector=h3s)
h3_item.add_css('h3', 'h3')
yield h3_item.load_item()
for h4s in response.css('body'):
h4_item = ItemLoader(item=PageCrawlItem(), selector=h4s)
h4_item.add_css('h4', 'h4')
yield h4_item.load_item()
for links in response.css('body'):
link_item = ItemLoader(item=PageCrawlItem(), selector=links)
link_item.add_css('links', 'a::attr(href)')
yield link_item.load_item()
settings.py
BOT_NAME = 'google_crawl'
SPIDER_MODULES = ['google_crawl.spiders']
NEWSPIDER_MODULE = 'google_crawl.spiders'
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 7
ITEM_PIPELINES = {
'google_crawl.pipelines.SpiderValidationPipeline': 100,
}
The way your spider is setup right now, you yield all of your "links" in one list in in one item. The method in your pipeline would only work if the links field in the item was a string.
Another problem is the method name in your pipeline needs to be changed to process_item for it to work with the scrapy api. Additionally since your items don't output the "links" key, you need to test to make sure that field is present in the item before attempting to filter out unwanted URLs.
For example just make these alterations:
pipeline.py
class SpiderValidationPipeline:
def process_item(self, item, spider):
if "links" in item:
item["links"] = [i for i in item.get("links") if "#" not in i]
return item
import scrapy
from scrapy.http import Request
from scrapy.crawler import CrawlerProcess
class TestSpider(scrapy.Spider):
name = 'test'
start_urls = ['https://rejestradwokatow.pl/adwokat/list/strona/1/sta/2,3,9']
custom_settings = {
'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
'DOWNLOAD_DELAY': 1,
'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}
def parse(self, response):
books = response.xpath("//td[#class='icon_link']//a//#href").extract()
for book in books:
url = response.urljoin(book)
yield Request(url, callback=self.parse_book)
def parse_book(self, response):
wev={}
d1=response.xpath("//*[#class='line_list_K']//div//span")
for i in range(len(d1)):
if 'Status:' in d1[i].get():
d2=response.xpath("//div["+str(i+1)+"]//text()").get()
print(d2)
I will get the status value but they will give me empty output this is page link https://rejestradwokatow.pl/adwokat/abramska-danuta-51494
Why not selecting your element more specific by its text and getting the text from its next sibling:
//span[text()[contains(.,'Status')]]/following-sibling::div/text()
Example: http://xpather.com/ZUWI58a4
To get the email:
//span[text()[contains(.,'Email')]]/following-sibling::div/(concat(#data-ea,'#',#data-eb))
Your d2 xpath isn't targeting the correct div.
This should work:
def parse_book(self, response):
wev = {} # <- this is never used
for child in response.xpath('//div[#class="line_list_K"]/*'):
if 'Status:' child.xpath(".//span/text()").get():
d2 = child.xpath(".//div/text()").get()
print(d2)
I am using Scrapy with Splash. Here is what I have in my spider:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_splash import SplashRequest
import logging
class MainSpider(CrawlSpider):
name = 'main'
allowed_domains = ['www.somesite.com']
script = '''
function main(splash, args)
splash.private_mode_enabled = false
my_user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
headers = {
['User-Agent'] = my_user_agent,
['Accept-Language'] = 'en-GB,en-US;q=0.9,en;q=0.8',
['Referer'] = 'https://www.google.com'
}
splash:set_custom_headers(headers)
url = args.url
assert(splash:go(url))
assert(splash:wait(2))
-- username input
username_input = assert(splash:select('#username'))
username_input:focus()
username_input:send_text('myusername')
assert(splash:wait(0.3))
-- password input
password_input = assert(splash:select('#password'))
password_input:focus()
password_input:send_text('mysecurepass')
assert(splash:wait(0.3))
-- the login button
login_btn = assert(splash:select('#login_btn'))
login_btn:mouse_click()
assert(splash:wait(4))
return splash:html()
end
'''
rules = (
Rule(LinkExtractor(restrict_xpaths="(//div[#id='sidebar']/ul/li)[7]/a"), callback='parse_item', follow=True, process_request='use_splash'),
)
def start_requests(self):
yield SplashRequest(url = 'https://www.somesite.com/login', callback = self.post_login, endpoint = 'execute', args = {
'lua_source': self.script
})
def use_splash(self, request):
request.meta.update(splash={
'args': {
'wait': 1,
},
'endpoint': 'render.html',
})
return request
def _requests_to_follow(self, response):
if not isinstance(response, (HtmlResponse, SplashJsonResponse, SplashTextResponse)):
return
seen = set()
for n, rule in enumerate(self._rules):
links = [lnk for lnk in rule.link_extractor.extract_links(response) if lnk not in seen]
if links and rule.process_links:
links = rule.process_links(links)
for link in links:
seen.add(link)
r = self._build_request(n, link)
yield rule.process_request(r)
def post_login(self, response):
logging.info('hey from login!')
with open('post_login_response.txt', 'w') as f:
f.write(response.text)
f.close()
def parse_item(self, response):
logging.info('hey from parse_item!')
with open('post_search_response.txt', 'w') as f:
f.write(response.text)
f.close()
I came across this and I've tried to implement things the same way, but still, prase_item is never run. In the logs, I never get hey from parse_item!
I'm not sure what I'm missing. The full log output can be found here
I ditched the Crawl Spider and converted to a regular spider, and things are working fine now.
I'm trying to parse all the categories and their nested categories recursivelly from this webpage which ultimately leads to such page and finally this innermost page from where I would like to fetch all the product titles.
The script can follow the above steps. However, when it comes to fetch all the titles from result pages traversing all next pages, the script gets fewer content than how many there are.
This is what I've written:
class mySpider(scrapy.Spider):
name = "myspider"
start_urls = ['https://www.phoenixcontact.com/online/portal/gb?1dmy&urile=wcm%3apath%3a/gben/web/main/products/subcategory_pages/Cables_P-10/e3a9792d-bafa-4e89-8e3f-8b1a45bd2682']
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}
def parse(self,response):
cookie = response.headers.getlist('Set-Cookie')[1].decode().split(";")[0]
for item in response.xpath("//div[./h3[contains(.,'Category')]]/ul/li/a/#href").getall():
item_link = response.urljoin(item.strip())
if "/products/list_pages/" in item_link:
yield scrapy.Request(item_link,headers=self.headers,meta={'cookiejar': cookie},callback=self.parse_all_links)
else:
yield scrapy.Request(item_link,headers=self.headers,meta={'cookiejar': cookie},callback=self.parse)
def parse_all_links(self,response):
for item in response.css("[class='pxc-sales-data-wrp'][data-product-key] h3 > a[href][onclick]::attr(href)").getall():
target_link = response.urljoin(item.strip())
yield scrapy.Request(target_link,headers=self.headers,meta={'cookiejar': response.meta['cookiejar']},callback=self.parse_main_content)
next_page = response.css("a.pxc-pager-next::attr(href)").get()
if next_page:
base_url = response.css("base::attr(href)").get()
next_page_link = urljoin(base_url,next_page)
yield scrapy.Request(next_page_link,headers=self.headers,meta={'cookiejar': response.meta['cookiejar']},callback=self.parse_all_links)
def parse_main_content(self,response):
item = response.css("h1::text").get()
print(item)
How can I get all the titles available in that category?
The script gets different number of results every time I run it.
Your main issue is that you need to use separate cookiejar for each "/products/list_pages/" to get next page correctly. I used a class variable cookie for this (see my code) and got same result (4293 items) several times.
Here is my code (I don't download product page (just read product title from a list of products):
class mySpider(scrapy.Spider):
name = "phoenixcontact"
start_urls = ['https://www.phoenixcontact.com/online/portal/gb?1dmy&urile=wcm%3apath%3a/gben/web/main/products/subcategory_pages/Cables_P-10/e3a9792d-bafa-4e89-8e3f-8b1a45bd2682']
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}
cookie = 1
def parse(self,response):
# cookie = response.headers.getlist('Set-Cookie')[1].decode().split(";")[0]
for item in response.xpath("//div[./h3[contains(.,'Category')]]/ul/li/a/#href").getall():
item_link = response.urljoin(item.strip())
if "/products/list_pages/" in item_link:
cookie = self.cookie
self.cookie += 1
yield scrapy.Request(item_link,headers=self.headers,meta={'cookiejar': cookie},callback=self.parse_all_links, cb_kwargs={'page_number': 1})
else:
yield scrapy.Request(item_link,headers=self.headers,callback=self.parse)
def parse_all_links(self,response, page_number):
# if page_number > 1:
# with open("Samples/Page.htm", "wb") as f:
# f.write(response.body)
# for item in response.css("[class='pxc-sales-data-wrp'][data-product-key] h3 > a[href][onclick]::attr(href)").getall():
for item in response.xpath('//div[#data-product-key]//h3//a'):
target_link = response.urljoin(item.xpath('./#href').get())
item_title = item.xpath('./text()').get()
yield {'title': item_title}
# yield scrapy.Request(target_link,headers=self.headers,meta={'cookiejar': response.meta['cookiejar']},callback=self.parse_main_content)
next_page = response.css("a.pxc-pager-next::attr(href)").get()
if next_page:
base_url = response.css("base::attr(href)").get()
next_page_link = response.urljoin(next_page)
yield scrapy.Request(next_page_link,headers=self.headers,meta={'cookiejar': response.meta['cookiejar']},callback=self.parse_all_links, cb_kwargs={'page_number': page_number + 1})
I am trying to create this Reddit scraper using Python's Scrapy framework.
I have used the CrawSpider to crawl through Reddit and its subreddits. But, when I come across pages that have adult content, the site asks for a cookie over18=1.
So, I have been trying to send a cookie with every request that the spider makes, but, its not working out.
Here, is my spider code. As you can see I tried to add a cookie with every spider request using the start_requests() method.
Could anyone here tell me how to do this? Or what I have been doing wrong?
from scrapy import Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from reddit.items import RedditItem
from scrapy.http import Request, FormRequest
class MySpider(CrawlSpider):
name = 'redditscraper'
allowed_domains = ['reddit.com', 'imgur.com']
start_urls = ['https://www.reddit.com/r/nsfw']
rules = (
Rule(LinkExtractor(
allow=['/r/nsfw/\?count=\d*&after=\w*']),
callback='parse_item',
follow=True),
)
def start_requests(self):
for i,url in enumerate(self.start_urls):
print(url)
yield Request(url,cookies={'over18':'1'},callback=self.parse_item)
def parse_item(self, response):
titleList = response.css('a.title')
for title in titleList:
item = RedditItem()
item['url'] = title.xpath('#href').extract()
item['title'] = title.xpath('text()').extract()
yield item
Okay. Try doing something like this.
def start_requests(self):
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'}
for i,url in enumerate(self.start_urls):
yield Request(url,cookies={'over18':'1'}, callback=self.parse_item, headers=headers)
It's the User-Agent which blocks you.
Edit:
Don't know what's wrong with CrawlSpider but Spider could work anyway.
#!/usr/bin/env python
# encoding: utf-8
import scrapy
class MySpider(scrapy.Spider):
name = 'redditscraper'
allowed_domains = ['reddit.com', 'imgur.com']
start_urls = ['https://www.reddit.com/r/nsfw']
def request(self, url, callback):
"""
wrapper for scrapy.request
"""
request = scrapy.Request(url=url, callback=callback)
request.cookies['over18'] = 1
request.headers['User-Agent'] = (
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, '
'like Gecko) Chrome/45.0.2454.85 Safari/537.36')
return request
def start_requests(self):
for i, url in enumerate(self.start_urls):
yield self.request(url, self.parse_item)
def parse_item(self, response):
titleList = response.css('a.title')
for title in titleList:
item = {}
item['url'] = title.xpath('#href').extract()
item['title'] = title.xpath('text()').extract()
yield item
url = response.xpath('//a[#rel="nofollow next"]/#href').extract_first()
if url:
yield self.request(url, self.parse_item)
# you may consider scrapy.pipelines.images.ImagesPipeline :D
The Scrapy Docs
1.Using a dict:
request_with_cookies = Request(url="http://www.example.com",
cookies={'currency': 'USD', 'country': 'UY'})
2.Using a list of dicts:
request_with_cookies = Request(url="http://www.example.com",
cookies=[{'name': 'currency',
'value': 'USD',
'domain': 'example.com',
'path': '/currency'}])
You can also send it via header.
scrapy.Request(url=url, callback=callback, headers={'Cookie':my_cookie})
You could use the process_request parameter in the rule, something like:
rules = (
Rule(LinkExtractor(
allow=['/r/nsfw/\?count=\d*&after=\w*']),
callback='parse_item',
process_request='ammend_req_header',
follow=True)
def ammend_req_header(self, request):
request.cookies['over18']=1
return request
I found solution for CrawlSpider:
def start_requests(self):
yield Request(url=self.start_urls[0], callback=self._parse, cookies={'beget': 'begetok'})