How to send a cookie with Scrapy CrawlSpider requests? - python

I am trying to create a Reddit scraper using Python's Scrapy framework.
I have used CrawlSpider to crawl through Reddit and its subreddits. But when I come across pages that have adult content, the site asks for a cookie over18=1.
So I have been trying to send a cookie with every request that the spider makes, but it's not working out.
Here is my spider code. As you can see, I tried to add a cookie to every spider request using the start_requests() method.
Could anyone here tell me how to do this? Or what I have been doing wrong?
from scrapy import Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from reddit.items import RedditItem
from scrapy.http import Request, FormRequest


class MySpider(CrawlSpider):
    name = 'redditscraper'
    allowed_domains = ['reddit.com', 'imgur.com']
    start_urls = ['https://www.reddit.com/r/nsfw']

    rules = (
        Rule(LinkExtractor(
            allow=['/r/nsfw/\?count=\d*&after=\w*']),
            callback='parse_item',
            follow=True),
    )

    def start_requests(self):
        for i, url in enumerate(self.start_urls):
            print(url)
            yield Request(url, cookies={'over18': '1'}, callback=self.parse_item)

    def parse_item(self, response):
        titleList = response.css('a.title')
        for title in titleList:
            item = RedditItem()
            item['url'] = title.xpath('@href').extract()
            item['title'] = title.xpath('text()').extract()
            yield item

Okay. Try doing something like this.
def start_requests(self):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'}
    for i, url in enumerate(self.start_urls):
        yield Request(url, cookies={'over18': '1'}, callback=self.parse_item, headers=headers)
It's the User-Agent that gets you blocked.
Edit:
I don't know what's wrong with CrawlSpider, but a plain Spider works anyway.
#!/usr/bin/env python
# encoding: utf-8
import scrapy


class MySpider(scrapy.Spider):
    name = 'redditscraper'
    allowed_domains = ['reddit.com', 'imgur.com']
    start_urls = ['https://www.reddit.com/r/nsfw']

    def request(self, url, callback):
        """
        wrapper for scrapy.Request
        """
        request = scrapy.Request(url=url, callback=callback)
        request.cookies['over18'] = 1
        request.headers['User-Agent'] = (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, '
            'like Gecko) Chrome/45.0.2454.85 Safari/537.36')
        return request

    def start_requests(self):
        for i, url in enumerate(self.start_urls):
            yield self.request(url, self.parse_item)

    def parse_item(self, response):
        titleList = response.css('a.title')
        for title in titleList:
            item = {}
            item['url'] = title.xpath('@href').extract()
            item['title'] = title.xpath('text()').extract()
            yield item
        url = response.xpath('//a[@rel="nofollow next"]/@href').extract_first()
        if url:
            yield self.request(url, self.parse_item)
        # you may consider scrapy.pipelines.images.ImagesPipeline :D
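If you do go the ImagesPipeline route, enabling it looks roughly like this (a minimal sketch; image_urls/images are the pipeline's default field names, IMAGES_STORE is whatever directory you choose, and Pillow must be installed):

# settings.py -- a sketch of enabling the built-in images pipeline
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = 'downloaded_images'  # any local directory

# In the spider, yield items that carry an 'image_urls' list; the pipeline
# downloads those URLs and fills in an 'images' field on the item, e.g.:
# yield {'image_urls': [img_src], 'title': some_title}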

The Scrapy Docs
1. Using a dict:
request_with_cookies = Request(url="http://www.example.com",
                               cookies={'currency': 'USD', 'country': 'UY'})
2. Using a list of dicts:
request_with_cookies = Request(url="http://www.example.com",
                               cookies=[{'name': 'currency',
                                         'value': 'USD',
                                         'domain': 'example.com',
                                         'path': '/currency'}])

You can also send it via a header.
scrapy.Request(url=url, callback=callback, headers={'Cookie': my_cookie})
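In the original question's case, my_cookie would just be the raw Cookie header string, e.g. (a sketch):

# a sketch: the raw Cookie header value for the over-18 gate
my_cookie = 'over18=1'
yield scrapy.Request(url=url, callback=self.parse_item, headers={'Cookie': my_cookie})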

You could use the process_request parameter of the rule, something like:
rules = (
    Rule(LinkExtractor(
        allow=['/r/nsfw/\?count=\d*&after=\w*']),
        callback='parse_item',
        process_request='ammend_req_header',
        follow=True),
)

def ammend_req_header(self, request):
    request.cookies['over18'] = 1
    return request
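Since a rule's process_request is called for every request that rule extracts, the cookie ends up on every followed link, not just the start URL.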

I found a solution for CrawlSpider:
def start_requests(self):
    yield Request(url=self.start_urls[0], callback=self._parse, cookies={'beget': 'begetok'})
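Applying the same idea to the question's Reddit spider would presumably look like this (a sketch; _parse appears to be CrawlSpider's internal default callback, so the rules still get applied, but verify this against your Scrapy version):

# a sketch, untested: keep the CrawlSpider rules and just attach the cookie
def start_requests(self):
    for url in self.start_urls:
        yield Request(url, cookies={'over18': '1'}, callback=self._parse)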

Related

Web scraper doesn't work correctly - field does not show any data

I tried to build a web scraper for Stack Overflow questions, but the third column doesn't download any data. Can you help me, please?
from scrapy.item import Field
from scrapy.item import Item
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.loader import ItemLoader


class Question(Item):
    a_id = Field()
    b_question = Field()
    c_desc = Field()


class StackOverflowSpider(Spider):
    name = "MyFirstSpider"
    custom_settings = {
        'USER-AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
    }
    start_urls = ['https://stackoverflow.com/questions']

    def parse(self, response):
        sel = Selector(response)
        questions = sel.xpath('//div[@id="questions"]//div[@class="s-post-summary--content"]')
        i = 1
        for quest in questions:
            item = ItemLoader(Question(), quest)
            item.add_xpath('b_question', './/h3/a/text()')
            item.add_xpath('c_desc', './/div[@class="s-post-summary--content-excerpt"]/text()')
            item.add_value('a_id', i)
            i = i + 1
            yield item.load_item()
[picture: CSV file output]
[picture: the website and its HTML code]
Try it like this; I added some inline notes to explain the changes.
from scrapy.spiders import Spider


class StackOverflowSpider(Spider):
    name = "MyFirstSpider"
    custom_settings = {
        'USER-AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
    }
    start_urls = ['https://stackoverflow.com/questions']

    def parse(self, response):
        # iterate through each question as an xpath object.
        for i, question in enumerate(response.xpath("//div[@class='s-post-summary--content']")):
            # use the get method to grab the text
            title = question.xpath('.//h3/a/text()').get()
            content = question.xpath('.//div[@class="s-post-summary--content-excerpt"]/text()').get()
            # yielding a regular dictionary is, in your case, the same thing
            yield {
                "b_question": title.strip(),
                "c_desc": content.strip(),
                "a_id": i
            }

Create Xpath using scrapy

import scrapy
from scrapy.http import Request
from scrapy.crawler import CrawlerProcess


class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://rejestradwokatow.pl/adwokat/list/strona/1/sta/2,3,9']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//td[@class='icon_link']//a//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        wev = {}
        d1 = response.xpath("//*[@class='line_list_K']//div//span")
        for i in range(len(d1)):
            if 'Status:' in d1[i].get():
                d2 = response.xpath("//div[" + str(i + 1) + "]//text()").get()
                print(d2)
I want to get the status value, but it gives me empty output. This is the page link: https://rejestradwokatow.pl/adwokat/abramska-danuta-51494
Why not select your element more specifically by its text and get the text from its following sibling:
//span[text()[contains(.,'Status')]]/following-sibling::div/text()
Example: http://xpather.com/ZUWI58a4
To get the email:
//span[text()[contains(.,'Email')]]/following-sibling::div/(concat(@data-ea,'@',@data-eb))
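Used inside the spider, that selection would presumably look something like this (a sketch, untested against the live page):

def parse_book(self, response):
    # a sketch: grab the value div that follows the 'Status' label span
    status = response.xpath(
        "//span[text()[contains(.,'Status')]]/following-sibling::div/text()").get()
    yield {'status': status.strip() if status else None}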
Your d2 xpath isn't targeting the correct div.
This should work:
def parse_book(self, response):
    wev = {}  # <- this is never used
    for child in response.xpath('//div[@class="line_list_K"]/*'):
        if 'Status:' in (child.xpath(".//span/text()").get() or ''):  # guard against None
            d2 = child.xpath(".//div/text()").get()
            print(d2)

Trying to scrape a table produces empty output

I am trying to scrape the table, but it gives me empty output. This is the page link: https://www.sidmartinbio.org/why-is-the-jugular-vein-so-important/
from scrapy.http import Request
import scrapy


class PushpaSpider(scrapy.Spider):
    name = 'pushpa'
    page_number = 1
    start_urls = ['https://www.sidmartinbio.org/why-is-the-jugular-vein-so-important/']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        details = {}
        key = response.xpath("//table//tbody/tr/td[1]/text()").get()
        value = response.xpath("//table//tbody/tr/td[2]/text()").get()
        details[key] = value
        yield details
It was a bit tricky to get the XPath selection right. Now it's working.
from scrapy.http import Request
import scrapy


class PushpaSpider(scrapy.Spider):
    name = 'pushpa'
    page_number = 1
    start_urls = [
        'https://www.sidmartinbio.org/why-is-the-jugular-vein-so-important']

    def parse(self, response):
        details = {}
        key = response.xpath("//td[contains(.,'Source')]/text()").get()
        value = response.xpath("//td[contains(.,'Source')]/following-sibling::td/text()").get()
        details[key] = value
        yield details
Output:
{'Source': 'Sigmoid sinus and Inferior petrosal sinus'}
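If you want the whole table rather than just the 'Source' row, iterating over the rows might look roughly like this (a sketch, untested against the live page):

def parse(self, response):
    details = {}
    # a sketch: walk every row and pair the first cell with the second
    for row in response.xpath("//table//tr"):
        key = row.xpath("./td[1]//text()").get()
        value = row.xpath("./td[2]//text()").get()
        if key and value:
            details[key.strip()] = value.strip()
    yield details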

CrawlSpider with Splash, only first link is crawled & processed

I am using Scrapy with Splash. Here is what I have in my spider:
import scrapy
import logging
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_splash import SplashRequest, SplashJsonResponse, SplashTextResponse  # response classes used in _requests_to_follow below
class MainSpider(CrawlSpider):
    name = 'main'
    allowed_domains = ['www.somesite.com']

    script = '''
        function main(splash, args)
            splash.private_mode_enabled = false
            my_user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
            headers = {
                ['User-Agent'] = my_user_agent,
                ['Accept-Language'] = 'en-GB,en-US;q=0.9,en;q=0.8',
                ['Referer'] = 'https://www.google.com'
            }
            splash:set_custom_headers(headers)
            url = args.url
            assert(splash:go(url))
            assert(splash:wait(2))

            -- username input
            username_input = assert(splash:select('#username'))
            username_input:focus()
            username_input:send_text('myusername')
            assert(splash:wait(0.3))

            -- password input
            password_input = assert(splash:select('#password'))
            password_input:focus()
            password_input:send_text('mysecurepass')
            assert(splash:wait(0.3))

            -- the login button
            login_btn = assert(splash:select('#login_btn'))
            login_btn:mouse_click()
            assert(splash:wait(4))

            return splash:html()
        end
    '''

    rules = (
        Rule(LinkExtractor(restrict_xpaths="(//div[@id='sidebar']/ul/li)[7]/a"), callback='parse_item', follow=True, process_request='use_splash'),
    )

    def start_requests(self):
        yield SplashRequest(url='https://www.somesite.com/login', callback=self.post_login, endpoint='execute', args={
            'lua_source': self.script
        })

    def use_splash(self, request):
        request.meta.update(splash={
            'args': {
                'wait': 1,
            },
            'endpoint': 'render.html',
        })
        return request

    def _requests_to_follow(self, response):
        if not isinstance(response, (HtmlResponse, SplashJsonResponse, SplashTextResponse)):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response) if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = self._build_request(n, link)
                yield rule.process_request(r)

    def post_login(self, response):
        logging.info('hey from login!')
        with open('post_login_response.txt', 'w') as f:
            f.write(response.text)
            f.close()

    def parse_item(self, response):
        logging.info('hey from parse_item!')
        with open('post_search_response.txt', 'w') as f:
            f.write(response.text)
            f.close()
I came across this and I've tried to implement things the same way, but still, parse_item never runs. In the logs, I never get hey from parse_item!
I'm not sure what I'm missing. The full log output can be found here
I ditched the CrawlSpider and converted it to a regular Spider, and things are working fine now.
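The answer doesn't show the converted spider, but a regular-Spider version would presumably look roughly like this: run the login script, then extract the sidebar link yourself and request it through Splash (a sketch; the XPath and URLs are the question's own placeholders, and imports are as in the original spider above):

# a sketch of the regular-Spider conversion, untested
class MainSpider(scrapy.Spider):
    name = 'main'
    allowed_domains = ['www.somesite.com']
    script = '...'  # the same Lua login script as above

    def start_requests(self):
        yield SplashRequest(url='https://www.somesite.com/login', callback=self.post_login,
                            endpoint='execute', args={'lua_source': self.script})

    def post_login(self, response):
        # pick the sidebar link manually instead of relying on CrawlSpider rules
        link = response.xpath("(//div[@id='sidebar']/ul/li)[7]/a/@href").get()
        if link:
            yield SplashRequest(url=response.urljoin(link), callback=self.parse_item,
                                endpoint='render.html', args={'wait': 1})

    def parse_item(self, response):
        logging.info('hey from parse_item!')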

Scrapy pagination is not working and the spider needs optimizing

Please help me optimize my Scrapy spider. In particular, the next-page pagination is not working. There are a lot of pages, and each page has 50 items.
I scrape the first page's 50 items (links) in parse_items, and the next pages' items should also be scraped in parse_items.
import scrapy
from scrapy import Field
from fake_useragent import UserAgent


class DiscoItem(scrapy.Item):
    release = Field()
    images = Field()


class discoSpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ['discogs.com']
    query = input('ENTER SEARCH MUSIC TYPE : ')
    start_urls = ['http://www.discogs.com/search?q=%s&type=release' % query]
    custome_settings = {
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
        'handle_httpstatus_list': [301, 302, ],
        'download_delay': 10}

    def start_requests(self):
        yield scrapy.Request(url=self.start_urls[0], callback=self.parse)

    def parse(self, response):
        print('START parse \n')
        print("*****", response.url)

        # next page pagination
        next_page = response.css('a.pagination_next::attr(href)').extract_first()
        next_page = response.urljoin(next_page)
        yield scrapy.Request(url=next_page, callback=self.parse_items2)

        headers = {}
        for link in response.css('a.search_result_title ::attr(href)').extract():
            ua = UserAgent()  # random user agent
            headers['User-Agent'] = ua.random
            yield scrapy.Request(response.urljoin(link), headers=headers, callback=self.parse_items)

    def parse_items2(self, response):
        print('parse_items2 *******', response.url)
        yield scrapy.Request(url=response.url, callback=self.parse)

    def parse_items(self, response):
        print("parse_items**********", response.url)
        items = DiscoItem()
        for imge in response.css('div#page_content'):
            img = imge.css("span.thumbnail_center img::attr(src)").extract()[0]
            items['images'] = img
            release = imge.css('div.content a ::text').extract()
            items['release'] = release[4]
            yield items
When I try running your code (after fixing the many indentation, spelling and letter case errors), this line is shown in scrapy's log:
2018-03-05 00:47:28 [scrapy.dupefilters] DEBUG: Filtered duplicate request: <GET https://www.discogs.com/search/?q=rock&type=release&page=2> - no more duplicates will be shown (see DUPEFILTER_DEBUG to show all duplicates)
Scrapy filters duplicate requests by default, and your parse_items2() method does nothing but create duplicate requests. I fail to see any reason for that method to exist.
What you should do instead is specify the parse() method as the callback for your requests, and avoid having an extra method that does nothing:
yield scrapy.Request(url=next_page, callback=self.parse)
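Put together, the simplified parse() might look roughly like this (a sketch based on the question's own selectors, omitting the fake_useragent headers for brevity):

def parse(self, response):
    # follow each result on the current page
    for link in response.css('a.search_result_title ::attr(href)').extract():
        yield scrapy.Request(response.urljoin(link), callback=self.parse_items)

    # then follow the next results page with the same callback
    next_page = response.css('a.pagination_next::attr(href)').extract_first()
    if next_page:
        yield scrapy.Request(response.urljoin(next_page), callback=self.parse)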
Try this for pagination:
try:
    nextpage = response.urljoin(response.xpath("//*[contains(@rel,'next') and contains(@id,'next')]/@url")[0].extract())
    yield scrapy.Request(nextpage, callback=self.parse)
except:
    pass
