I have a Scrapy CrawlSpider that parses links and returns HTML content just fine. For JavaScript pages, however, I enlisted Selenium to access the 'hidden' content. The problem is that while Selenium works outside the Scrapy parsing, it does not work inside the parse_item function:
from scrapy.spiders import CrawlSpider, Rule, Spider
from scrapy.selector import HtmlXPathSelector
from scrapy.linkextractors import LinkExtractor
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from craigslist_sample.items import CraigslistReviewItem
import scrapy
from selenium import selenium
from selenium import webdriver


class MySpider(CrawlSpider):
    name = "spidername"
    allowed_domains = ["XXXXX"]
    start_urls = ['XXXXX']

    rules = (
        Rule(LinkExtractor(allow=(r'reviews\?page')), callback='parse_item'),
        Rule(LinkExtractor(allow=('.',), deny=(r'reviews\?page',)), follow=True))

    def __init__(self):
        # this page loads
        CrawlSpider.__init__(self)
        self.selenium = webdriver.Firefox()
        self.selenium.get('XXXXX')
        self.selenium.implicitly_wait(30)

    def parse_item(self, response):
        # this page doesnt
        print response.url
        self.driver.get(response.url)
        self.driver.implicitly_wait(30)
        # ...do things
You have some variable issues. In the __init__ method you assign the browser instance to self.selenium, but in parse_item you use self.driver as the browser instance. I have updated your script; try it now:
from scrapy.spiders import CrawlSpider, Rule, Spider
from scrapy.selector import HtmlXPathSelector
from scrapy.linkextractors import LinkExtractor
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from craigslist_sample.items import CraigslistReviewItem
import scrapy
from selenium import selenium
from selenium import webdriver


class MySpider(CrawlSpider):
    name = "spidername"
    allowed_domains = ["XXXXX"]
    start_urls = ['XXXXX']

    rules = (
        Rule(LinkExtractor(allow=(r'reviews\?page')), callback='parse_item'),
        Rule(LinkExtractor(allow=('.',), deny=(r'reviews\?page',)), follow=True))

    def __init__(self):
        # this page loads
        CrawlSpider.__init__(self)
        self.driver = webdriver.Firefox()
        self.driver.get('XXXXX')
        self.driver.implicitly_wait(30)

    def parse_item(self, response):
        # this page doesnt
        print response.url
        self.driver.get(response.url)
        self.driver.implicitly_wait(30)
        # ...do things
Great! A combination of Hassan's answer and better knowledge of the URLs I was scraping led to the answer (it turns out the website had planted 'fake' URLs that never loaded).
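For anyone hitting the same problem, here is a minimal sketch of how such dead URLs can be kept from stalling the driver. The deny pattern 'fake-path' and the 10-second timeout are assumptions for illustration, not values from the original spider.

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from selenium import webdriver
from selenium.common.exceptions import TimeoutException


class ReviewSpider(CrawlSpider):
    name = "spidername"
    allowed_domains = ["XXXXX"]
    start_urls = ['XXXXX']

    rules = (
        # 'fake-path' is a hypothetical pattern for the planted URLs that never load
        Rule(LinkExtractor(allow=(r'reviews\?page',), deny=(r'fake-path',)),
             callback='parse_item'),
    )

    def __init__(self, *args, **kwargs):
        super(ReviewSpider, self).__init__(*args, **kwargs)
        self.driver = webdriver.Firefox()
        self.driver.set_page_load_timeout(10)  # assumed limit: give up on pages that never finish loading

    def parse_item(self, response):
        try:
            self.driver.get(response.url)
        except TimeoutException:
            self.logger.warning("Skipping URL that never loaded: %s", response.url)
            return
        # ...do things with self.driver.page_source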
Related
I am trying to extract all URLs belonging to my test domain. The page is rendered with JavaScript, so it requires Selenium to crawl through all the URLs corresponding to this domain. But the crawler stops after crawling one page, and I need to collect all URLs associated with my domain.
I used the scrapy_selenium module for this, and the code I used is below:
import scrapy
from scrapy_selenium import SeleniumRequest
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class MySpider(CrawlSpider):
    name = 'example'
    start_urls = ['https://www.example.com/']

    rules = (
        Rule(LinkExtractor(allow_domains=['example.com']), follow=True),
    )

    def start_requests(self):
        for url in self.start_urls:
            print("+++++++++++++++++++++++++++++++++++++++++++++++++++++", url)
            yield SeleniumRequest(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        print(response.url)
        item = {'url': response.url, 'html': response.body}
        yield item

    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_selenium.SeleniumMiddleware': 800
        },
        'SELENIUM_DRIVER_NAME': 'chrome',
        'SELENIUM_DRIVER_EXECUTABLE_PATH': '/home/ubuntu/selenium_drivers/chromedriver',  # path to the chromedriver executable
        'SELENIUM_DRIVER_ARGUMENTS': ['-headless']  # '-headless' runs Chrome in headless mode
    }
I don't understand why the crawler stops after one page instead of crawling through all the pages.
I'm not sure if this is the correct place for this question.
Here's my question:
If I run Scrapy, it can't see the email addresses in the page source. The page has email addresses that are visible only when you hover over a user with an email address.
When I run my spider, I get no emails. What am I doing wrong?
Thank You.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re


class MailsSpider(CrawlSpider):
    name = 'mails'
    allowed_domains = ['biorxiv.org']
    start_urls = ['https://www.biorxiv.org/content/10.1101/2022.02.28.482253v3']

    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        emails = re.findall(r'[\w\.]+@[\w\.]+', response.text)
        print(response.url)
        print(emails)
Assuming you're allowed to scrape email contacts from a public website:
As said, Scrapy does not load JS scripts, so you need a full rendering browser like Playwright to get the addresses.
I've written a quick and dirty example of how it could work; you can start from here if you wish (after you've installed Playwright, of course):
import scrapy
from scrapy.http import Request, FormRequest
from playwright.sync_api import sync_playwright
from scrapy.http import HtmlResponse


class PhaseASpider(scrapy.Spider):
    name = "test"

    def start_requests(self):
        yield Request('https://www.biorxiv.org/content/10.1101/2022.02.28.482253v3', callback=self.parse_page)

    def parse_page(self, response):
        with sync_playwright() as p:
            browser = p.firefox.launch(headless=False)
            self.page = browser.new_page()
            url = 'https://www.biorxiv.org/content/10.1101/2022.02.28.482253v3'
            self.page.goto(url)
            self.page.wait_for_load_state("load")
            html_page = self.page.content()
            response_sel = HtmlResponse(url="my HTML string", body=html_page, encoding='utf-8')
            mails = response_sel.xpath('//a[contains(@href, "mailto")]/@href').extract()
            for mail in mails:
                print(mail.split('mailto:')[1])
I found this example code in a textbook about web scraping. After running the spider it showed an error, and I found out that scrapy.contrib has been removed in newer releases of Scrapy. How should I change this so it works? I am new to web scraping, by the way.
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule


class ArticleSpider(CrawlSpider):
    name = 'articles'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/'
                  'Benevolent_dictator_for_life']
    rules = [Rule(LinkExtractor(allow='.*'), callback='parse_items',
                  follow=True)]

    def parse_items(self, response):
        url = response.url
        title = response.css('h1::text').extract_first()
        text = response.xpath('//div[@id="mw-content-text"]//text()').extract()
        lastUpdated = response.css('li#footer-info-lastmod::text').extract_first()
        lastUpdated = lastUpdated.replace(
            'This page was last edited on ', '')
        print('URL is: {}'.format(url))
        print('title is: {}'.format(title))
        print('text is: {}'.format(text))
        print('Last updated: {}'.format(lastUpdated))
In newer versions of Scrapy you can simply import the modules as below:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
# add the rest of the code
Read more from the docs.
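Applied to the spider from the question, that should look roughly like this; the body is unchanged from the textbook code above, only the imports are swapped to their current locations.

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ArticleSpider(CrawlSpider):
    name = 'articles'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    rules = [Rule(LinkExtractor(allow='.*'), callback='parse_items',
                  follow=True)]

    def parse_items(self, response):
        # Same extraction logic as in the question, just running on modern Scrapy
        url = response.url
        title = response.css('h1::text').extract_first()
        text = response.xpath('//div[@id="mw-content-text"]//text()').extract()
        lastUpdated = response.css('li#footer-info-lastmod::text').extract_first()
        lastUpdated = lastUpdated.replace('This page was last edited on ', '')
        print('URL is: {}'.format(url))
        print('title is: {}'.format(title))
        print('text is: {}'.format(text))
        print('Last updated: {}'.format(lastUpdated))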
I'm trying to extract comments from a news page. The crawler starts at the homepage and follows all the internal links found on the site. The comments exist only on the article pages, and they are embedded from an external website, so the comment section sits in a JavaScript iframe. Here's an example article site.
My first step was to build a crawler and a Selenium middleware. The crawler follows all the links, and those are loaded through Selenium:
from scrapy import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CrawlerSpider(CrawlSpider):
    name = 'crawler'
    allowed_domains = ['www.merkur.de', 'disqus.com/embed/comments/']
    start_urls = ['https://www.merkur.de/welt/novavax-corona-totimpfstoff-omikron-zulassung-impfstoff-weihnachten-wirkung-covid-lauterbach-zr-91197497.html']
    rules = [Rule(LinkExtractor(allow=r'.*'), callback='parse',
                  follow=True)]

    def parse(self, response):
        title = response.xpath('//html/head/title/text()').extract_first()
        iframe_url = response.xpath('//iframe[@title="Disqus"]//@src').get()
        yield Request(iframe_url, callback=self.next_parse, meta={'title': title})

    def next_parse(self, response):
        title = response.meta.get('title')
        comments = response.xpath("//div[@class='post-message ']/div/p").getall()
        yield {
            'title': title,
            'comments': comments
        }
To get access to the iframe elements the Scrapy Request goes through the middleware:
import time

from scrapy import signals, spiders
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class SeleniumMiddleware(object):

    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        self.driver = webdriver.Chrome(options=chrome_options)

    # Here you get the request made to the URLs the LinkExtractor found,
    # fetch them with Selenium and return the rendered page as the response.
    def process_request(self, request, spider):
        self.driver.get(request.url)
        element = self.driver.find_element_by_xpath('//div[@id="disqus_thread"]')
        self.driver.execute_script("arguments[0].scrollIntoView();", element)
        time.sleep(1)
        body = self.driver.page_source
        return HtmlResponse(self.driver.current_url, body=body, encoding='utf-8', request=request)
I am getting the right link from the iframe src here, but my CrawlerSpider is not yielding the iframe_url Request, so I cannot follow the link from the iframe. What am I doing wrong here? I really appreciate your help!
I am working with HtmlResponse and Selector. HtmlResponse returns the site <200 "site">, but when I check Selector(response) it says <Selector xpath=None data=u'<html></html>'> even though the HtmlResponse returns this:
<200 http://www.tripadvisor.in/Hotel_Review-g3581633-d2290190-Reviews-Corbett_Treetop_Riverview-Marchula_Jim_Corbett_National_Park_Uttarakhand.htmlhttp://www.tripadvisor.in/Hotel_Review-g297600-d8029162-Reviews-Daman_Casa_Tesoro-Daman_Daman_and_Diu.html>
Code:
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapingtest.items import ScrapingTestingItem
from collections import OrderedDict
import json
from scrapy.selector.lxmlsel import HtmlXPathSelector
import csv
import scrapy
from scrapy.http import HtmlResponse


class scrapingtestspider(Spider):
    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]
    # base_uri = ["tripadvisor.in"]

    def start_requests(self):
        site_array = ["http://www.tripadvisor.in/Hotel_Review-g3581633-d2290190-Reviews-Corbett_Treetop_Riverview-Marchula_Jim_Corbett_National_Park_Uttarakhand.html"
                      "http://www.tripadvisor.in/Hotel_Review-g297600-d8029162-Reviews-Daman_Casa_Tesoro-Daman_Daman_and_Diu.html",
                      "http://www.tripadvisor.in/Hotel_Review-g304557-d2519662-Reviews-Darjeeling_Khushalaya_Sterling_Holidays_Resort-Darjeeling_West_Bengal.html",
                      "http://www.tripadvisor.in/Hotel_Review-g319724-d3795261-Reviews-Dharamshala_The_Sanctuary_A_Sterling_Holidays_Resort-Dharamsala_Himachal_Pradesh.html",
                      "http://www.tripadvisor.in/Hotel_Review-g1544623-d8029274-Reviews-Dindi_By_The_Godavari-Nalgonda_Andhra_Pradesh.html"]
        for i in range(len(site_array)):
            response = HtmlResponse(site_array[i])
            sels = Selector(response)
            sites = sels.xpath('//a[contains(text(), "Next")]/@href').extract()
            print "________________________________________________________________"
            print sels
            print "________________________________________________________________"
            if(sites and len(sites) > 0):
                for site in sites:
                    yield Request(site_array[i], self.parse)
As mentioned here, you do not set the body of the Response object.
Why don't you yield a new Request with the URLs of your site_array and let Scrapy scrape them? What you are currently doing won't work out.
Naturally, in this case you need to adjust your parser method or write a new one and add it as a callback to the Request (I would do the second version).
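A minimal sketch of that second version might look like the following. The callback name parse_hotel and the handling of the "Next" links are assumptions for illustration, not code from the answer; the URL list is shortened from the question.

import scrapy
from scrapy.http import Request


class TripAdvisorSpider(scrapy.Spider):
    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]

    def start_requests(self):
        site_array = [
            "http://www.tripadvisor.in/Hotel_Review-g3581633-d2290190-Reviews-Corbett_Treetop_Riverview-Marchula_Jim_Corbett_National_Park_Uttarakhand.html",
            "http://www.tripadvisor.in/Hotel_Review-g297600-d8029162-Reviews-Daman_Casa_Tesoro-Daman_Daman_and_Diu.html",
        ]
        for url in site_array:
            # Let Scrapy download the page; the body is then available in the callback.
            yield Request(url, callback=self.parse_hotel)

    def parse_hotel(self, response):
        # Here response.body is populated, so the selector sees the real HTML.
        next_pages = response.xpath('//a[contains(text(), "Next")]/@href').extract()
        for href in next_pages:
            # response.follow resolves relative URLs (available in recent Scrapy versions)
            yield response.follow(href, callback=self.parse_hotel)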