I'm having trouble structuring scrapy data the way I want. My spider gets some data from one page, then follows a list of links on that page to collect a link from each of those next pages.
def parse_page(self, response):
    links = response.css(LINK_SELECTOR).extract()
    data = {
        'name': response.css(NAME_SELECTOR).extract_first(),
        'date': response.css(DATE_SELECTOR).extract(),
    }
    for link in links:
        next_link = response.urljoin(link)
        yield scrapy.Request(next_link, callback=self.parse_url, meta={'data': data})

def parse_url(self, response):
    data = response.meta['data']
    data['url'] = response.css('a::attr(href)').get()
    yield data
What I would like is to get the data with the following structure:
{'name': name, 'date': date, 'url': [url1, url2, url3, url4]}
Instead of
{'name': name, 'date': date, 'url': url1}
{'name': name, 'date': date, 'url': url2}
{'name': name, 'date': date, 'url': url3}
{'name': name, 'date': date, 'url': url4}
I've tried to use Items, but I don't see how to pass the data from parse_url back to the parse_page function. How would I do that?
Thanks in advance.
You can use scrapy's coroutine support to do this pretty easily.
The code would look something like this:
async def parse_page(self, response):
    ...
    for link in links:
        request = response.follow(link)
        response = await self.crawler.engine.download(request, self)
        urls.append(response.css('a::attr(href)').get())
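Put together as a complete callback, it might look roughly like this. This is only a sketch: it assumes Scrapy 2.x with the asyncio reactor enabled, the NAME_SELECTOR/DATE_SELECTOR/LINK_SELECTOR names are the placeholders from the question, and newer Scrapy releases may expect engine.download() to be called without the spider argument.
import scrapy

class GroupedUrlsSpider(scrapy.Spider):
    # Hypothetical name and start URL, only to make the sketch self-contained.
    name = 'grouped_urls'
    start_urls = ['https://example.com/']

    async def parse(self, response):
        data = {
            'name': response.css(NAME_SELECTOR).get(),
            'date': response.css(DATE_SELECTOR).getall(),
            'url': [],
        }
        for link in response.css(LINK_SELECTOR).getall():
            request = response.follow(link)
            # Download the linked page inline instead of scheduling a separate
            # callback, so every URL can be appended to the same item.
            sub_response = await self.crawler.engine.download(request, self)
            data['url'].append(sub_response.css('a::attr(href)').get())
        yield data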
The following is one way you can achieve that. The inline_requests library will help you get the expected output.
import scrapy
from scrapy.crawler import CrawlerProcess
from inline_requests import inline_requests

class YellowpagesSpider(scrapy.Spider):
    name = "yellowpages"
    start_urls = ["https://www.yellowpages.com/san-francisco-ca/mip/honey-honey-cafe-crepery-4752771"]

    @inline_requests
    def parse(self, response):
        data = {
            'name': response.css(".sales-info > h1::text").get(),
            'phone': response.css(".contact > p.phone::text").get(),
            'target_link': []
        }
        for item_link in response.css(".review-info > a.author[href]::attr(href)").getall():
            resp = yield scrapy.Request(response.urljoin(item_link), meta={'handle_httpstatus_all': True})
            target_link = resp.css("a.review-business-name::attr(href)").get()
            data['target_link'].append(target_link)
        print(data)

if __name__ == "__main__":
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        'LOG_LEVEL': 'ERROR',
    })
    c.crawl(YellowpagesSpider)
    c.start()
Output it produces:
{'name': 'Honey Honey Cafe & Crepery', 'phone': '(415) 351-2423', 'target_link': ['/san-francisco-ca/mip/honey-honey-cafe-crepery-4752771', '/walnut-ca/mip/akasaka-japanese-cuisine-455476824', '/san-francisco-ca/mip/honey-honey-cafe-crepery-4752771']}
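Applied to the spider from the question, the same pattern might look like this (again a sketch: the spider name, start URL and the NAME_SELECTOR/DATE_SELECTOR/LINK_SELECTOR placeholders are assumptions carried over from the question):
import scrapy
from inline_requests import inline_requests

class GroupedUrlsSpider(scrapy.Spider):
    name = 'grouped_urls'
    start_urls = ['https://example.com/']

    @inline_requests
    def parse(self, response):
        data = {
            'name': response.css(NAME_SELECTOR).get(),
            'date': response.css(DATE_SELECTOR).getall(),
            'url': [],
        }
        for link in response.css(LINK_SELECTOR).getall():
            # Yielding a Request inside an @inline_requests callback suspends the
            # generator until the response arrives, so no second callback is needed.
            resp = yield scrapy.Request(response.urljoin(link))
            data['url'].append(resp.css('a::attr(href)').get())
        yield data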
Related
I am trying to scrape an infinite-scroll AJAX page, but I am not able to get to the next page and yield the items. I am able to get response.text. I tried debugging but could not find a solution. Can anyone help me out with this?
import json
import scrapy

class InfiniteScrollingSpider(scrapy.Spider):
    name = 'wegotthiscovered_review'
    scrolling_url = 'https://wegotthiscovered.com/wp-admin/admin-ajax.php'

    def start_requests(self):
        yield scrapy.FormRequest(
            self.scrolling_url,
            formdata={
                'action': "face3_infinite_scroll",
                'page': '1',
                'attrs': "{\"id\":\"1\",\"order\":\"\",\"orderby\":\"\",\"catnames\":\"movies+reviews\",\"postnotin\":\"905069,904520,904521,903475,901576,900303,893944,895136,891795,886876,884402,881283\",\"timestampbefore\":1591800990}"
            },
            callback=self.parse_page,
            meta={'page': 1},
        )

    def parse_page(self, response):
        next_page = response.meta.get('page') + 1
        print('next_page:', next_page)
        print(response.text)
        json_data = json.loads(response.text)
        print(json_data.keys())
        print('success:', json_data.get('success'))
        print('data:', json_data.get('data'))
        if not json_data.get('success') or not json_data.get('data') or not json_data['data'].get('content'):
            return
        articles = scrapy.Selector(text=json_data['data']['content']).css('article')
        for article in articles:
            yield {
                'page_title': article.css('h4 ::text').extract_first().strip(),
                'review_link': article.css('h4 ::attr(href)').extract_first().strip(),
            }
        print('next page >>>')
        yield scrapy.FormRequest(
            self.scrolling_url,
            formdata={
                'action': "face3_infinite_scroll",
                'page': str(next_page),
                "query_args": "{\"archive_type\":\"masonry\",\"show_first\":false,\"columns\":2,\"meta_cat\":false,\"meta\":true,\"summary\":true,\"standard_summary\":\"excerpt\",\"more_button\":false,\"reduce_margin\":false,\"orientation\":\"landscape\",\"list_width\":\"6\",\"widgets\":false,\"widgets_sidebar\":\"sidebar-archive\",\"widgets_after\":3,\"widgets_repeat\":false,\"highlight\":\"featured\",\"pagination_type\":\"ajax\",\"infinite_load\":true}"
            },
            callback=self.parse_page,
            meta={'page': next_page},
        )
I'm using scrapy to scrape data from the website. Here's my code
import scrapy

class ShopSpider(scrapy.Spider):
    name = 'shop'
    allowed_domains = ['https://www.shopclues.com/mobiles-smartphones.html?sort_by=bestsellers']
    start_urls = ['http://https://www.shopclues.com/mobiles-smartphones.html?sort_by=bestsellers/']
    custom_settings = {
        'FEED_URI': 'tmp/shop.csv'
    }

    def parse(self, response):
        titles = response.css('img::attr(title)').extract()
        images = response.css('img::attr(data-img)').extract()
        prices = response.css('.p_price::text').extract()
        discounts = response.css('.prd_discount::text').extract()
        for item in zip(titles, prices, images, discounts):
            scraped_info = {
                'title': item[0],
                'price': item[1],
                'image_urls': [item[2]],  # Sets the URL for scrapy to download images
                'discount': item[3]
            }
        yield scraped_info
Please check where I'm going wrong.
Also, I want to scrape all of the data that loads while scrolling, so the spider should keep collecting items as the page keeps scrolling. How do I go about that?
You have problems with:
an incorrect allowed_domains (only the domain is needed);
a broken start_urls (http appears twice and there is a slash at the end);
wrong indentation for yielding the item in the parse function.
Check fixed code here:
import scrapy

class ShopSpider(scrapy.Spider):
    name = 'shop'
    allowed_domains = ['shopclues.com']
    start_urls = ['https://www.shopclues.com/mobiles-smartphones.html?sort_by=bestsellers']

    def parse(self, response):
        titles = response.css('img::attr(title)').extract()
        images = response.css('img::attr(data-img)').extract()
        prices = response.css('.p_price::text').extract()
        discounts = response.css('.prd_discount::text').extract()
        for item in zip(titles, prices, images, discounts):
            scraped_info = {
                'title': item[0],
                'price': item[1],
                'image_urls': [item[2]],  # Sets the URL for scrapy to download images
                'discount': item[3]
            }
            yield scraped_info
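The fix above does not touch the second part of the question (the products that load as you scroll). The usual approach is to open the browser's network tab, find the request the site fires when more products load, and replay it with an increasing page number. A minimal sketch follows; the &page= parameter is only a guess and has to be replaced with whatever the real request uses:
import scrapy

class ShopScrollSpider(scrapy.Spider):
    name = 'shop_scroll'
    allowed_domains = ['shopclues.com']
    # Hypothetical paginated URL: verify the real parameter name in the network tab.
    base_url = 'https://www.shopclues.com/mobiles-smartphones.html?sort_by=bestsellers&page={}'
    start_urls = [base_url.format(1)]

    def parse(self, response, page=1):
        titles = response.css('img::attr(title)').extract()
        if not titles:
            return  # nothing left to load, stop paginating
        for title in titles:
            yield {'title': title}
        # Request the next "scroll" page.
        yield scrapy.Request(self.base_url.format(page + 1),
                             callback=self.parse,
                             cb_kwargs={'page': page + 1})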
import scrapy

class rottenTomatoesSpider(scrapy.Spider):
    name = "movieList"
    start_urls = [
        'https://www.rottentomatoes.com/'
    ]

    def parse(self, response):
        for movieList in response.xpath('//div[@id="homepage-opening-this-week"]'):
            yield {
                'score': response.css('td.left_col').extract_first(),
                'title': response.css('td.middle_col').extract_first(),
                'openingDate': response.css('td.right_col right').extract_first()
            }
So the spider is instead scraping <div id='homepage-tv-top'>.
I'm assuming it is the homepage layout that is confusing the script. Does anyone know a workaround?
You need to iterate over each tr, and inside the for loop use movieList instead of response:
for movieList in response.xpath('//div[@id="homepage-opening-this-week"]//tr'):
    yield {
        'score': "".join(a for a in movieList.css('td.left_col *::text').extract()),
        'title': "".join(a for a in movieList.css('td.middle_col *::text').extract()),
        'openingDate': "".join(a for a in movieList.css('td.right_col *::text').extract())
    }
I am having issues with pagination in the following code.
The spider starts but does not find any links on the first page. This is because the page actually returns a partial result... I know it sounds odd, but it's true: when I visit the page I see jobs listed, but when the bot visits, there are no jobs listed.
From what I understand, scrapy will load the entire page regardless of JS or AJAX, but I am starting to wonder...
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.http.request import Request
from northrop.items import NorthropItem
from scrapy.http import HtmlResponse
from scrapy.exceptions import CloseSpider
import re

class NorthropSpider(CrawlSpider):
    name = "northropJobStart"
    start_urls = ['https://ngc.taleo.net/careersection/ngc_pro/jobsearch.ftl?lang=en#']
    allowed_domains = ["ngc.taleo.net"]

    rules = (
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="next"]/a',)), callback="parse_listings", follow=True),
    )

    def parse_start_url(self, response):
        return self.parse_listings(response)

    def parse_listings(self, response):
        sel = Selector(response)
        # There are no jobs listed.. I am lost.....
        jobs = sel.xpath('//th/div/div/span/a/@href').extract()
        for job_url in jobs:
            job_url = self.__normalise(job_url)
            job_url = self.__to_absolute_url(response.url, job_url)
            yield Request(job_url, callback=self.parse_details)

    def parse_details(self, response):
        sel = Selector(response)
        job = sel.xpath('//*[@id="mainbody-jobs"]')
        item = NorthropItem()
        # Populate job fields
        item['title'] = job.xpath('//*[@id="mainbody-jobs"]/h1/text()').extract()
        item['location'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[3]/div[2]/text()').extract()
        item['applink'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[1]/a/@href').extract()
        item['description'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[2]/div[1]/div[2]').extract()
        item['travel'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[5]/div[2]/text()').extract()
        item['job_category'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[2]/div[2]/text()').extract()
        item['clearance_have'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[8]/div[2]/text()').extract()
        item['clearance_get'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[8]/div[2]/text()').extract()
        item['job_number'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[1]/div[2]/text()').extract()
        item['page_url'] = response.url
        item = self.__normalise_item(item, response.url)
        return item

    def __normalise_item(self, item, base_url):
        '''
        Standardise and format item fields
        '''
        # Loop item fields to sanitise data and standardise data types
        for key, value in vars(item).values()[0].iteritems():
            item[key] = self.__normalise(item[key])
        # Convert job URL from relative to absolute URL
        #item['job_url'] = self.__to_absolute_url(base_url, item['job_url'])
        return item

    def __normalise(self, value):
        # Convert list to string
        value = value if type(value) is not list else ' '.join(value)
        # Trim leading and trailing special characters (whitespaces, newlines, spaces, tabs, carriage returns)
        value = value.strip()
        return value

    def __to_absolute_url(self, base_url, link):
        '''
        Convert relative URL to absolute URL
        '''
        import urlparse
        link = urlparse.urljoin(base_url, link)
        return link

    def __to_int(self, value):
        '''
        Convert value to integer type
        '''
        try:
            value = int(value)
        except ValueError:
            value = 0
        return value

    def __to_float(self, value):
        '''
        Convert value to float type
        '''
        try:
            value = float(value)
        except ValueError:
            value = 0.0
        return value
Unfortunately the search request is hidden quite deep, but you can see it in the network tab of your browser's inspector.
It turns out the page sends a full JSON body of default search parameters, so you pretty much need to copy and paste it, only incrementing the pageNo. I couldn't help but solve it, and before I knew it I had written a whole spider, so here it is; let me know if some parts are unclear:
import json
import scrapy

class TaleoSpider(scrapy.Spider):
    name = 'taleo'
    start_urls = ['https://ngc.taleo.net/careersection/ngc_pro/jobsearch.ftl?lang=en#']
    # base form with base search values
    base_form = {
        'advancedSearchFiltersSelectionParam': {
            'searchFilterSelections': [
                {'id': 'ORGANIZATION', 'selectedValues': []},
                {'id': 'LOCATION', 'selectedValues': []},
                {'id': 'JOB_FIELD', 'selectedValues': []},
                {'id': 'URGENT_JOB', 'selectedValues': []},
                {'id': 'EMPLOYEE_STATUS', 'selectedValues': []},
                {'id': 'STUDY_LEVEL', 'selectedValues': []},
                {'id': 'WILL_TRAVEL', 'selectedValues': []},
                {'id': 'JOB_SHIFT', 'selectedValues': []},
                {'id': 'JOB_NUMBER', 'selectedValues': []},
            ]
        },
        'fieldData': {'fields': {'JOB_TITLE': '', 'KEYWORD': '', 'LOCATION': ''},
                      'valid': True},
        'filterSelectionParam': {
            'searchFilterSelections': [
                {'id': 'POSTING_DATE', 'selectedValues': []},
                {'id': 'LOCATION', 'selectedValues': []},
                {'id': 'JOB_FIELD', 'selectedValues': []},
                {'id': 'JOB_TYPE', 'selectedValues': []},
                {'id': 'JOB_SCHEDULE', 'selectedValues': []},
                {'id': 'JOB_LEVEL', 'selectedValues': []},
            ]
        },
        'multilineEnabled': False,
        'pageNo': 1,  # <--- change this for pagination
        'sortingSelection': {'ascendingSortingOrder': 'false',
                             'sortBySelectionParam': '3'},
    }

    def parse(self, response):
        # we got cookies from the first start url, now let's request into the search api
        # copy base form for the first request
        form = self.base_form.copy()
        yield scrapy.Request(
            'https://ngc.taleo.net/careersection/rest/jobboard/searchjobs?lang=en&portal=2160420105',
            body=json.dumps(self.base_form),
            # add headers to indicate we are sending a json package
            headers={'Content-Type': 'application/json',
                     'X-Requested-With': 'XMLHttpRequest'},
            # scrapy.Request defaults to 'GET', but we want 'POST' here
            method='POST',
            # load our form into meta so we can reuse it later
            meta={'form': form},
            callback=self.parse_items)

    def parse_items(self, response):
        data = json.loads(response.body)
        # scrape data
        for item in data['requisitionList']:
            yield item
        # next page
        # get our form back and update the page number in it
        form = response.meta['form']
        form['pageNo'] += 1
        # check if paging is over: is our next page higher than the maximum page?
        max_page = data['pagingData']['totalCount'] / data['pagingData']['pageSize']
        if form['pageNo'] > max_page:
            return
        yield scrapy.Request(
            'https://ngc.taleo.net/careersection/rest/jobboard/searchjobs?lang=en&portal=2160420105',
            body=json.dumps(form),
            headers={'Content-Type': 'application/json',
                     'X-Requested-With': 'XMLHttpRequest'},
            method='POST',
            meta={'form': form},
            callback=self.parse_items)
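If you want to run it standalone, the same CrawlerProcess wrapper used in the yellowpages example above works here too (the settings shown are just suggestions):
from scrapy.crawler import CrawlerProcess

if __name__ == "__main__":
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        'LOG_LEVEL': 'INFO',
    })
    c.crawl(TaleoSpider)
    c.start()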
There is an example in the Scrapy documentation (Release 1.0.3). In the 7th row of the example, the urljoin method is used because the links are relative. When the links are absolute, what should I do?
example code:
import scrapy

class StackOverflowSpider(scrapy.Spider):
    name = 'stackoverflow'
    start_urls = ['http://stackoverflow.com/questions?sort=votes']

    def parse(self, response):
        for href in response.css('.question-summary h3 a::attr(href)'):
            full_url = response.urljoin(href.extract())
            yield scrapy.Request(full_url, callback=self.parse_question)

    def parse_question(self, response):
        yield {
            'title': response.css('h1 a::text').extract()[0],
            'votes': response.css('.question .vote-count-post::text').extract()[0],
            'body': response.css('.question .post-text').extract()[0],
            'tags': response.css('.question .post-tag::text').extract(),
            'link': response.url,
        }
You don't need to worry about it; urljoin() handles both cases properly:
In [1]: response.urljoin("http://stackoverflow.com/questions/426258/checking-a-checkbox-with-jquery")
Out[1]: 'http://stackoverflow.com/questions/426258/checking-a-checkbox-with-jquery'
In [2]: response.urljoin("/questions/426258/checking-a-checkbox-with-jquery")
Out[2]: 'http://stackoverflow.com/questions/426258/checking-a-checkbox-with-jquery'
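Under the hood, Response.urljoin essentially delegates to the standard library's urljoin, so you can check the same behaviour without Scrapy at all:
from urllib.parse import urljoin  # urlparse.urljoin in Python 2

# An absolute link replaces the base URL entirely...
print(urljoin('http://stackoverflow.com/questions?sort=votes',
              'http://stackoverflow.com/questions/426258/checking-a-checkbox-with-jquery'))
# ...while a relative link is resolved against the base URL.
print(urljoin('http://stackoverflow.com/questions?sort=votes',
              '/questions/426258/checking-a-checkbox-with-jquery'))
# Both print: http://stackoverflow.com/questions/426258/checking-a-checkbox-with-jquery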