My problem is the following, my spider has just successfully clicked on a button within the function parse_search_page(). In the function parse_identity I am on the next page where I can start scraping some information. But the variable "response" is of type SplashJsonResponse which is not supported by xpath() and response.body is of type bytes which is as well not supported
The solutions of my problem that I think can work are:
Convert SplashJsonResponse to SplashTextResponse (which is an html response)
Use xpath on bytes
Convert scrapy_splash.response.SplashJsonResponse to scrapy.http.response.html.HtmlResponse
Code:
import scrapy
from scrapy.utils.response import open_in_browser
from scrapy_splash import SplashRequest
class QuotesSpider(scrapy.Spider):
name = "quotes"
start_urls = ['https://app.nominations.hospimedia.fr']
def parse(self, response):
# the function "callback" is called after you have logged in
return scrapy.FormRequest.from_response(
response,
formdata={'user[email]': 'XXX', 'user[password]': 'XXX'},
callback=self.parse_landing_page
)
def parse_landing_page(self, response):
# open webpage after logging in
#open_in_browser(response)
start_urls = 'https://app.nominations.hospimedia.fr'
# we extract the title
# title = response.xpath('//title/text()').extract()
print("hello1")
# regarder a quoi sert le extract() si on le mets pas
next_page_partial_url = response.xpath('//div[#class="l-action l-action--small"]/a/#href').extract()
#print(next_page_partial_url)
next_page_url = start_urls + next_page_partial_url[0]
yield scrapy.Request(next_page_url, callback=self.parse_search_page)
def parse_search_page(self, response):
# if you click on the page below you know if your scrapy-splash is working
# http://localhost:8050/
script = '''
function main(splash, args)
splash:go(splash.args.url)
splash:runjs('document.getElementsByClassName("button tertiary")[0].click()')
return {
html = splash:html(),
}
end
'''
open_in_browser(response)
print("----------")
# scrapy.http.response.html.HtmlResponse
print(type(response))
print("------------")
#yield SplashRequest(response.request.url, callback = self.parse_identity, endpoint='execute', args={'lua_source': script})
yield SplashRequest(callback = self.parse_identity,
endpoint='execute',
args={'url':response.request.url,
'lua_source': script}
)
def parse_identity(self, response):
print("----------------------------------------")
# scrapy_splash.response.SplashJsonResponse
print(type(response))
# <class 'bytes'>
print(type(response.body))
print(response.body)
print(("----------------------------------------"))
next_page_partial_url = response.xpath('//div[#class="medium-6 small-12 columns"]/text()').extract()
#next_page_partial_url = response.xpath('//a[#rel="noopener noreferrer"]/text()').extract()
print(next_page_partial_url)
print(("----------------------------------------"))
#inspect_response(response, self)
#open_in_browser(response)
Actually the solution is to use HtmlResponse
from scrapy.http import HtmlResponse
html_response = HtmlResponse(url=response.url, body=response.text, encoding='utf-8')
Related
I am currently trying to crawl the the Company Overview from alibaba.com.
For instance: https://www.alibaba.com/product-detail/T14-series-original-air-pro-TWS_1600273931389.html?spm=a2700.galleryofferlist.normal_offer.d_title.4aa778f2ahtuBx&s=p
For getting the information like company name I did:
response.xpath("//a[#class='company-name company-name-lite-vb']/text()").extract()
Which works fine.
When entering "Company Overview">"Company Profile" and than trying to crawl information from the table with:
response.xpath("//div/div[#class='content-value']").extract()
I get an empty array.
resources/search_results_searchpage.yml:
products:
css: 'div[data-content="productItem"]'
multiple: true
type: Text
children:
link:
css: a.elements-title-normal
type: Link
crawler.py:
import scrapy
import csv
#from scrapy_selenium import SeleniumRequest # only needed when using selenium
import os
from selectorlib import Extractor
class Spider(scrapy.Spider):
name = 'alibaba_crawler'
allowed_domains = ['alibaba.com']
start_urls = ['http://alibaba.com/']
link_extractor = Extractor.from_yaml_file(os.path.join(os.path.dirname(__file__), "../resources/search_results_searchpage.yml"))
def start_requests(self):
search_text="Headphones"
url="https://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&SearchText={0}&viewtype=G".format(search_text)
yield scrapy.Request(url, callback = self.parse, meta = {"search_text": search_text})
def parse(self, response):
data = self.link_extractor.extract(response.text, base_url=response.url)
for product in data['products']:
parsed_url=product["link"]
yield scrapy.Request(parsed_url, callback=self.crawl_mainpage)
#yield SeleniumRequest(url=parsed_url, callback=self.crawl_mainpage)
def crawl_mainpage(self, response):
yield {
'name': response.xpath("//h1[#class='module-pdp-title']/text()").extract(),
'Year of Establishment': response.xpath("//td[contains(text(), 'Year Established')]/following-sibling::td/div/div/div/text()").extract()
}
Anybody having an idea what I could do to populate Year of Est.?
I tried to use scrapy_selenium and configured it correctly, because I suspect that the object is generated dynamically but still no luck or I am possibly using it wrong
tun with:
scrapy crawl alibaba_crawler -o out.csv -t csv
Your xpath selector is not correct. Try this
'Year of Est.': response.xpath("//td[contains(text(), 'Year Established')]/following-sibling::td/div/div/div/text()").extract()
I also note some errors in your code such as the line below which will raise an error. You may want to recheck how you extract links from the search page.
data = self.link_extractor.extract(response.text, base_url=response.url)
Edit:
The year of establishment is loaded once the company tab is clicked. You have to simulate the click using selenium or scrapy-playwright. My simple implementation using scrapy-playwright is as below.
import scrapy
from scrapy.crawler import CrawlerProcess
import os
from selectorlib import Extractor
from scrapy_playwright.page import PageCoroutine
class Spider(scrapy.Spider):
name = 'alibaba_crawler'
allowed_domains = ['alibaba.com']
start_urls = ['http://alibaba.com/']
link_extractor = Extractor.from_yaml_file(os.path.join(os.path.dirname(__file__), "../resources/search_results_searchpage.yml"))
def start_requests(self):
search_text = "Headphones"
url = "https://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&SearchText={0}&viewtype=G".format(
search_text)
yield scrapy.Request(url, callback=self.parse, meta={"search_text": search_text})
def parse(self, response):
data = self.link_extractor.extract(
response.text, base_url=response.url)
for product in data['products']:
parsed_url = product["link"]
yield scrapy.Request(parsed_url, callback=self.crawl_mainpage, meta={"playwright": True, 'playwright_page_coroutines': {
"click": PageCoroutine("click", selector="//span[#title='Company Profile']"),
},})
def crawl_mainpage(self, response):
yield {
'name': response.xpath("//h1[#class='module-pdp-title']/text()").extract(),
'Year of Establishment': response.xpath("//td[contains(text(), 'Year Established')]/following-sibling::td/div/div/div/text()").extract()
}
if __name__ == "__main__":
process = CrawlerProcess(settings={
'DOWNLOAD_HANDLERS': {
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
},
'TWISTED_REACTOR' :"twisted.internet.asyncioreactor.AsyncioSelectorReactor"
})
process.crawl(Spider)
process.start()
Below is a sample log of running the scraper using python crawler.py. The year 2010 is shown in the output
So, that was the question. I have a Scrapy bot that follows internal links of a given site, writes its links, status code and anchor text into database. But I'm struggling to grab the link's follow status. Is there any way to grab that rel=nofollow/dofollow information? That's my code if anybody wonders;
class MySpider(CrawlSpider):
name = 'spydiiiii'
start_urls = [urlToScrape]
rules = (
Rule (
LxmlLinkExtractor(
allow=(urlToScrape),
deny=(
"google.com",
"facebook.com",
"pinterest.com",
"facebook.com",
"digg.com",
"twitter.com",
"stumbleupon.com",
"linkedin.com"
),
unique=True
),
callback="parse_items",
follow= True,
),
)
def parse_items(self, response):
sel = Selector(response)
items = []
item = InternallinkItem()
referring_url = response.request.headers.get('Referer').decode('utf-8')
item["referring_url"] = referring_url
anchor = response.meta.get('link_text')
item["anchor_text"] = " ".join(anchor.split())
item["current_url"] = response.url
item['status'] = response.status
items.append(item)
return items
Thanks in advance
I use LxmlLinkExtractor manually to get Link objects which have nofollow information.
In parse() I get links from first page and create item with 'nofollow' (and other) informations, and use Requests with this url (and with item in meta) to get status and referer.
New Requests uses parse_item() to get item from meta and add status.
parse_item() also uses extractor to get new links on this page and create new item and execute Requests with parse_item() again.
import scrapy
from scrapy.http import Request
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
class MySpider(scrapy.Spider):
name = 'myspider'
#allowed_domains = ['http://quotes.toscrape.com']
start_urls = ['http://quotes.toscrape.com']
#start_urls = ['http://127.0.0.1:5000/'] # for Flask example
extractor = LxmlLinkExtractor(
allow=('http://quotes.toscrape.com'),
#allow=('http://127.0.0.1:5000'), # for Flask example
deny=(
'google.com',
'facebook.com',
'pinterest.com',
'facebook.com',
'digg.com',
'twitter.com',
'stumbleupon.com',
'linkedin.com'
),
unique=True,
)
def parse(self, response):
print('parse url:', response.url)
# use LxmlLinkExtractor manually
for link in self.extractor.extract_links(response):
#print('link:', link)
item = {}
item['nofollow'] = link.nofollow
item['anchor_text'] = link.text
item['current_url'] = link.url
#item['referring_url'] = response.url
yield Request(link.url, meta={'item': item}, callback=self.parse_item)
def parse_item(self, response):
print('parse_item url:', response.url)
item = response.meta['item']
item['referring_url'] = response.request.headers.get('Referer')
#item['referring_url'] = response.request.url
item['status'] = response.status
yield item
# use LxmlLinkExtractor manually with new links
for link in self.extractor.extract_links(response):
#print('link:', link)
item = {}
item['nofollow'] = link.nofollow
item['anchor_text'] = link.text
item['current_url'] = link.url
#item['referring_url'] = response.url
yield Request(link.url, meta={'item': item}, callback=self.parse_item)
# --- run spider without project ---
from scrapy.crawler import CrawlerProcess
c = CrawlerProcess({
'USER_AGENT': 'Mozilla/5.0',
'FEED_FORMAT': 'csv',
'FEED_URI': 'output.csv',
})
c.crawl(MySpider)
c.start()
EDIT:
Because I don't know any page with rel="nofollow" so I created simple code in Flask to test code.
from flask import Flask
app = Flask(__name__)
#app.route('/')
def index():
return 'Test 1 | Test 2'
#app.route('/test1')
def test1():
return 'Main Page'
#app.route('/test2')
def test2():
return 'Main Page'
if __name__ == '__main__':
app.run(debug=True)
I'm trying to use scrapy to crawl some page with a lot of links inside, but my existing code so far only show the contents of the first link.
What mistake have I made?
from scrapy.spiders import BaseSpider
from scrapy.spiders import Spider
from scrapy.http.request import Request
from scrapy.selector import Selector
from Proje.items import ProjeItem
class ProjeSpider(BaseSpider):
name = "someweb"
allowed_domains = ["someweb.com"]
start_urls = [
"http://someweb.com/indeks/"
]
def parse(self, response):
for sel in response.xpath('//ul[#id="indeks-container"]'):
for tete in sel.xpath('//linkkk').re('//linkkk.*?(?=")'):
links = 'http:'+str(tete)
req = Request(links,callback=self.kontene)
return req
def kontene(self, response):
for mbuh in response.xpath('//head'):
Item = ProjeItem()
Item['title'] = mbuh.xpath('//title/text()').extract()
yield Item
according to the scrapy docs, parse needs to return an interable of Request, i.e. a list or a generator. Just change return to yield and it should work as expected:
def parse(self, response):
for sel in response.xpath('//ul[#id="indeks-container"]'):
for tete in sel.xpath('//linkkk').re('//linkkk.*?(?=")'):
links = 'http:'+str(tete)
req = Request(links,callback=self.kontene)
yield req
The issue is that you have a return statement within your for loop. In Python, a return will return out of the function, giving you only the first links worth of content. Instead, consider adding req to a list of returned objects.
def parse(self, response):
req_list = []
for sel in response.xpath('//ul[#id="indeks-container"]'):
for tete in sel.xpath('//linkkk').re('//linkkk.*?(?=")'):
links = 'http:'+str(tete)
req = Request(links,callback=self.kontene)
req_list += req
return req_list
i am using Splash 2.0.2 + Scrapy 1.0.5 + Scrapyjs 0.1.1 and im still not able to render javascript with a click. Here is an example url https://olx.pt/anuncio/loja-nova-com-250m2-garagem-em-box-fechada-para-arrumos-IDyTzAT.html#c49d3d94cf
I am still getting the page without the phone number rendered:
class OlxSpider(scrapy.Spider):
name = "olx"
rotate_user_agent = True
allowed_domains = ["olx.pt"]
start_urls = [
"https://olx.pt/imoveis/"
]
def parse(self, response):
script = """
function main(splash)
splash:go(splash.args.url)
splash:runjs('document.getElementById("contact_methods").getElementsByTagName("span")[1].click();')
splash:wait(0.5)
return splash:html()
end
"""
for href in response.css('.link.linkWithHash.detailsLink::attr(href)'):
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse_house_contents, meta={
'splash': {
'args': {'lua_source': script},
'endpoint': 'execute',
}
})
for next_page in response.css('.pager .br3.brc8::attr(href)'):
url = response.urljoin(next_page.extract())
yield scrapy.Request(url, self.parse)
def parse_house_contents(self, response):
import ipdb;ipdb.set_trace()
how can i get this to work?
Add
splash:autoload("https://code.jquery.com/jquery-2.1.3.min.js")
to Lua script and it will work.
function main(splash)
splash:go(splash.args.url)
splash:autoload("https://code.jquery.com/jquery-2.1.3.min.js")
splash:runjs('document.getElementById("contact_methods").getElementsByTagName("span")[1].click();')
splash:wait(0.5)
return splash:html()
end
.click() is JQuery function https://api.jquery.com/click/
You can avoid having to use Splash in the first place and make the appropriate GET request to get the phone number yourself. Working spider:
import json
import re
import scrapy
class OlxSpider(scrapy.Spider):
name = "olx"
rotate_user_agent = True
allowed_domains = ["olx.pt"]
start_urls = [
"https://olx.pt/imoveis/"
]
def parse(self, response):
for href in response.css('.link.linkWithHash.detailsLink::attr(href)'):
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse_house_contents)
for next_page in response.css('.pager .br3.brc8::attr(href)'):
url = response.urljoin(next_page.extract())
yield scrapy.Request(url, self.parse)
def parse_house_contents(self, response):
property_id = re.search(r"ID(\w+)\.", response.url).group(1)
phone_url = "https://olx.pt/ajax/misc/contact/phone/%s/" % property_id
yield scrapy.Request(phone_url, callback=self.parse_phone)
def parse_phone(self, response):
phone_number = json.loads(response.body)["value"]
print(phone_number)
If there are more things to extract from this "dynamic" website, see if Splash is really enough and, if not, look into browser automation and selenium.
I have succesfully merge django and scrapy, and want to persist my items object into database.
Saving works fine, but whithout all the element.
I'm pretty new on python, scrapy and django and I figure I miss something, but can't solve it.
Here is my spider code :
from scrapy.http import FormRequest, Request
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy import log
from scrapy.contrib.loader import XPathItemLoader
from datacrowdscrapy.items import DatacrowdItem
class DatacrowdSpider(BaseSpider):
name = 'datacrowd'
start_urls = ['https://www.exemple.com/login']
def parse(self, response):
parsed = [FormRequest.from_response(
response,
formdata={
'login': 'email#gmail.com',
'password': 'password'
},
callback=self.after_login)]
return parsed
def after_login(self, response):
# check login succeed before going on
if "authentication failed" in response.body:
self.log("Login failed", level=log.ERROR)
return
selector = HtmlXPathSelector(response)
investmentsLinks = selector.select('//a[contains(#class, "myClass")]/#href').extract()
for link in investmentsLinks:
curDatacrowdItem = XPathItemLoader(item=DatacrowdItem(), response=response)
curDatacrowdItem.add_value('url', link)
curRequest = Request(url=link, callback=self.parse_investments, meta={'item': curDatacrowdItem})
yield curRequest
def parse_investments(self, response):
selector = HtmlXPathSelector(response)
curDatacrowdItem = response.meta['item']
# Details
details = selector.select('//td/div[contains(#class, "myClass")]/text()').extract()
curDatacrowdItem.add_value('someVal', details[0].strip())
/* ... */
# Get nbInvestors
investorLink = selector.select('//ul[contains(#id, "myId")]/li/#onclick').re(r'window.location.href=\'(http.+/data.+)\'')
curRequest = Request(url=investorLink[0], callback=self.parse_investors, meta={'item': curDatacrowdItem})
yield curRequest
# Get last company details
detailsLink = selector.select('//ul[contains(#id, "myData")]/li/#onclick').re(r'window.location.href=\'(http.+/company-details.+)\'')
curRequest = Request(url=detailsLink[0], callback=self.parse_details, meta={'item': curDatacrowdItem})
yield curRequest
def parse_investors(self, response):
selector = HtmlXPathSelector(response)
curDatacrowdItem = response.meta['item']
nbInvestors = len(selector.select('//ul/li[contains(#class, "myClass")]'))
curDatacrowdItem.add_value('nbInvestors', nbInvestors)
return curDatacrowdItem
def parse_details(self, response):
selector = HtmlXPathSelector(response)
curDatacrowdItem = response.meta['item']
# Company name
name = selector.select('//div[contains(#class, "myClass")]/h2/text()').extract()
curDatacrowdItem.add_value('name', name[0].strip())
item = curDatacrowdItem.load_item()
item.save() # Here I'm persisiting datas
return item
I get an error log like that :
[datacrowd] ERROR: Spider must return Request, BaseItem or None, got 'XPathItemLoader' in <GET http://www.exemple.com/url/slug>
Any idea about what I'm doing wrong ?
Cheers,
Snite
Simply because you are yielding an XPathItemLoader and not an Item.
In your method "after_login", you're adding an XPathItemLoader objects in the meta, which you try to yield later.
Use the load_item method to return the item.
meta={'item': curDatacrowdItem.load_item()}
You should rename your variables to avoid these mistakes :)