remove the unicode from the output of JSON using scrapy - python

import scrapy
from ex.items import ExItem
class reddit(scrapy.Spider):
    """Spider that collects post fields from the reddit front page."""

    name = "dmoz"
    allowed_domains = ["reddit.com"]
    start_urls = ["http://www.reddit.com/"]

    def parse(self, response):
        """Build one ``ExItem`` holding the fields found on the page.

        Every XPath is evaluated against the full response, so each field
        is a list of all matching values found anywhere in the document.

        Args:
            response: the downloaded front page.

        Returns:
            The populated ``ExItem``.
        """
        item = ExItem()
        item["title"] = response.xpath('//p[contains(@class,"title")]/a/text()').extract()
        item["rank"] = response.xpath('//span[contains(@class,"rank")]/text()').extract()
        item["votes_dislike"] = response.xpath('//div[contains(@class,"score dislikes")]/text()').extract()
        item["votes_unvoted"] = response.xpath('//div[contains(@class,"score unvoted")]/text()').extract()
        item["votes_likes"] = response.xpath('//div[contains(@class,"score likes")]/text()').extract()
        item["video_reference"] = response.xpath('//a[contains(@class,"thumbnail may-blank")]/@href').extract()
        item["image"] = response.xpath('//a[contains(@class,"thumbnail may-blank")]/img/@src').extract()
        # Hand the populated item back to Scrapy; without this nothing is exported.
        return item
I am able to convert this into JSON, but the output contains a bullet character. How can I remove it and still keep the JSON format?

There are hidden elements that you don't see in the browser. Scrapy sees them.
You just need to search for the data inside the relevant part of the page (div with id="siteTable"):
def parse(self, response):
    """Extract the post fields from the ``siteTable`` part of the page only.

    Args:
        response: the downloaded front page.

    Returns:
        An ``ExItem`` whose fields are lists of the values matched inside
        the ``div`` with ``id="siteTable"``.
    """
    # make a selector and search the fields inside it
    sel = response.xpath('//div[@id="siteTable"]')
    item = ExItem()
    # The leading "." keeps every query relative to ``sel`` instead of the
    # whole document.
    item["title"] = sel.xpath('.//p[contains(@class,"title")]/a/text()').extract()
    item["rank"] = sel.xpath('.//span[contains(@class,"rank")]/text()').extract()
    item["votes_dislike"] = sel.xpath('.//div[contains(@class,"score dislikes")]/text()').extract()
    item["votes_unvoted"] = sel.xpath('.//div[contains(@class,"score unvoted")]/text()').extract()
    item["votes_likes"] = sel.xpath('.//div[contains(@class,"score likes")]/text()').extract()
    item["video_reference"] = sel.xpath('.//a[contains(@class,"thumbnail may-blank")]/@href').extract()
    item["image"] = sel.xpath('.//a[contains(@class,"thumbnail may-blank")]/img/@src').extract()
    return item
Tested; here is what I get for votes_likes, for example:
'votes_likes': [u'5340',
u'4041',
u'4080',
u'5055',
u'4385',
u'4784',
u'3842',
u'3734',
u'4081',
u'3731',
u'4580',
u'5279',
u'2540',
u'4345',
u'2068',
u'3715',
u'3249',
u'4232',
u'4025',
u'522',
u'2993',
u'2789',
u'3529',
u'3450',
u'3533'],

Related

Generate JSON dictionary from recursive scrapy functions

I am running the scrapy spider on airbnb for academic purposes below. I scrape all listings first
(such as: https://www.airbnb.com/s/Berlin--Germany/homes?tab_id=all_tab&query=Berlin%2C%20Germany&place_id=ChIJAVkDPzdOqEcRcDteW0YgIQQ&checkin=2020-05-01&adults=1&refinement_paths%5B%5D=%2Fhomes&source=structured_search_input_header&search_type=search_query&checkout=2020-05-02)
to get their ids and then go to the listing's page
(such as: https://www.airbnb.de/rooms/20839690?location=Berlin&check_in=2020-05-01&check_out=2020-05-02&adults=1)
and get the geo-data from the details JSON. Ideally, I would like to have a final JSON nested like:
[{'ID': ID1, 'Title': Title1, 'Latitude': Lat1}, {'ID': ID2, 'Title': Title2, 'Latitude': Lat2}]
Because of the recursive structure, I have the full list of title, price etc. already in the first go, while lng and lat are only one element per loop run.
{{Price1, Price2, Price3..., id1, id2...lng1, lat1}, {Price1, Price2, Price3..., id1, id2..., lng2, lat2}}
Any idea how I can restructure the code to get the above structure?
Cheers
marcello
Spider:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import Spider
from scrapy_splash import SplashRequest
from airbnb.items import AirbnbItem
import json
import pprint
all_ids = []
detail = {}
class AirbnbSpider(scrapy.Spider):
    """Spider that reads a search page, then calls the detail API per listing id."""

    name = 'airbnb_spider'
    allowed_domains = ['airbnb.com', 'airbnb.de']
    start_urls = ['https://www.airbnb.de/s/Berlin/homes?checkin=2020-05-01&checkout=2020-05-02&adults=1']

    def parse(self, response):
        """Fill the module-level ``detail`` dict and request the API for each id.

        Args:
            response: the downloaded search-results page.

        Yields:
            One ``scrapy.Request`` per listing id, handled by ``parse_detail``.
        """
        item = AirbnbItem()
        for listing in response.xpath('//div[@class = "_fhph4u"]'):
            # NOTE: these XPaths start with "//", so they search the whole
            # document rather than just ``listing``; each .extract() result
            # is a list covering every match on the page.
            detail["title"] = listing.xpath('//a[@class = "_i24ijs"]/@aria-label').extract()
            detail["price"] = listing.xpath('//span[@class = "_1p7iugi"]/text()').extract()
            detail["rating"] = listing.xpath('//span[@class = "_3zgr580"]/text()').get()
            detail["id"] = listing.xpath('//a[@class = "_i24ijs"]/@target').extract()
            #item["link"] = listing.xpath('//a[@class = "_i24ijs"]/@href').extract()
            # Keep only the part after the first underscore of each target value.
            x_id = [i.split('_')[1] for i in detail['id']]
            detail['id'] = x_id
            for i in x_id:
                link = 'https://www.airbnb.de/api/v2/pdp_listing_details/'+i+'?_format=for_rooms_show&_p3_impression_id=p3_1587291065_1e%2FBlC2IefkrfTQe&adults=1&check_in=2020-05-01&check_out=2020-05-02&key=d306zoyjsyarp7ifhu67rjxn52tv0t20&'
                yield scrapy.Request(url = link, callback =self.parse_detail)

    def parse_detail(self, response):
        """Add the coordinates from the detail JSON to the shared ``detail`` dict.

        Args:
            response: the JSON response of the listing-details API.

        Returns:
            The module-level ``detail`` dict, which still holds the full
            title/price/id lists written by ``parse``.
        """
        jsonresponse = json.loads(response.body_as_unicode())
        detail["lat"] = jsonresponse["pdp_listing_detail"]["lat"]
        detail["lng"] = jsonresponse["pdp_listing_detail"]["lng"]
        return detail
Items
import scrapy
class AirbnbItem(scrapy.Item):
    """Item declaring the fields collected for one Airbnb listing."""

    # Listing attributes read from the search page.
    title = scrapy.Field()
    price = scrapy.Field()
    id = scrapy.Field()
    rating = scrapy.Field()
    # Coordinates read from the listing-details JSON.
    lat = scrapy.Field()
    lng = scrapy.Field()
You can pass information to the parse_detail method and yield from there:
def parse(self, response):
    """Request the detail API once per listing, carrying that listing's data.

    Args:
        response: the downloaded search-results page.

    Yields:
        One ``scrapy.Request`` per listing; the fields already scraped are
        attached as ``meta['item']`` so ``parse_detail`` can complete them.
    """
    for listing in response.xpath('//div[@class = "_fhph4u"]'):
        # ".//" keeps each query inside the current listing; a bare "//"
        # would search the whole document and return the same first match
        # on every iteration.
        target = listing.xpath('.//a[@class = "_i24ijs"]/@target').get()
        id_parts = target.split('_') if target else []
        if len(id_parts) < 2:
            # No usable id: the detail URL cannot be built for this listing.
            continue
        # A fresh dict per listing. Requests are processed later, so one
        # shared dict would be overwritten before any callback reads it.
        detail = {
            "title": listing.xpath('.//a[@class = "_i24ijs"]/@aria-label').get(),
            "price": listing.xpath('.//span[@class = "_1p7iugi"]/text()').get(),
            "rating": listing.xpath('.//span[@class = "_3zgr580"]/text()').get(),
            "id": id_parts[1],
        }
        link = 'https://www.airbnb.de/api/v2/pdp_listing_details/'+detail['id']+'?_format=for_rooms_show&_p3_impression_id=p3_1587291065_1e%2FBlC2IefkrfTQe&adults=1&check_in=2020-05-01&check_out=2020-05-02&key=d306zoyjsyarp7ifhu67rjxn52tv0t20&'
        yield scrapy.Request(
            url=link,
            meta={'item': detail},  # pass information to the next method
            callback=self.parse_detail,
        )
def parse_detail(self, response):
    """Complete a listing's data with the coordinates from the detail JSON.

    Args:
        response: the JSON response of the listing-details API; its
            ``meta['item']`` holds the dict started in ``parse``.

    Yields:
        The listing dict with ``lat`` and ``lng`` added.
    """
    # ``response.text`` is the decoded body; it replaces the deprecated
    # ``body_as_unicode()`` call.
    jsonresponse = json.loads(response.text)
    detail = response.meta['item']
    listing_detail = jsonresponse["pdp_listing_detail"]
    detail["lat"] = listing_detail["lat"]
    detail["lng"] = listing_detail["lng"]
    yield detail
BTW, Item class is useless, do not use it.

How do I select a specific element nested inside other elements with Scrapy?

import scrapy
class rlgSpider(scrapy.Spider):
    """Spider that walks the trade offers listed on rocket-league.com."""

    name = 'bot'
    start_urls = [
        'https://rocket-league.com/trading?filterItem=0&filterCertification=0&filterPaint=0&filterPlatform=1&filterSearchType=1&filterItemType=0&p=1']

    def parse(self, response):
        """Yield a ``{'name': ...}`` dict for every item link of every offer.

        Args:
            response: the downloaded trading page.

        Yields:
            The ``data`` dict after its ``name`` entry has been set.
        """
        data = {}
        offers = response.xpath('//div[@class = "col-3-3"]')
        for offer in offers:
            # NOTE: both XPaths below start with "//", so they are evaluated
            # against the whole document rather than ``offer`` / ``item``.
            for item in offer.xpath('//div[@class = "rlg-trade-display-container is--user"]/div[@class = "rlg-trade-display-items"]/div[@class = "col-1-2 rlg-trade-display-items-container"]/a'):
                data['name'] = item.xpath('//div/div[@position ="relative"]/h2').extract()
                yield data
Here is what I have done so far, but it doesn't work well: it scrapes the URL and not the h2 tag. How do I select the h2 when it is nested inside so many divs?
To search within an element in Scrapy, start your XPath with "."; otherwise the expression is evaluated against the whole response. This is the correct way of doing it:
def parse(self, response):
    """Yield the ``h2`` text of every item link inside every offer.

    Args:
        response: the downloaded trading page.

    Yields:
        A new ``{'name': text}`` dict per item link; ``text`` is ``None``
        when the link contains no ``h2`` text node.
    """
    offers = response.xpath('//div[@class = "col-3-3"]')
    for offer in offers:
        # The leading "." makes each query relative to the current selector.
        for item in offer.xpath('.//div[@class = "rlg-trade-display-container is--user"]/div[@class = "rlg-trade-display-items"]/div[@class = "col-1-2 rlg-trade-display-items-container"]/a'):
            data = {}
            # ``get()`` returns the first match (or None).
            data['name'] = item.xpath('.//h2/text()').get()
            yield data

Scrapy yield only last data and merge scrapy data into one

I am scraping a news website with the Scrapy framework, but it seems to store only the last item scraped, repeated for every iteration of the loop.
I want to store the title, date, and link, which I scrape from the first page,
and also store the whole news article. So I want to merge the article, which is stored as a list, into a single string.
Item code
import scrapy
class ScrapedItem(scrapy.Item):
    """Item declaring the fields stored for one news article."""

    title = scrapy.Field()
    source = scrapy.Field()
    date = scrapy.Field()
    paragraph = scrapy.Field()
Spider code
import scrapy
from ..items import ScrapedItem
class CBNCSpider(scrapy.Spider):
    """Spider that reads a kontan.co.id news listing and follows each article."""

    name = 'kontan'
    start_urls = [
        'https://investasi.kontan.co.id/rubrik/28/Emiten'
    ]

    def parse(self, response):
        """Read title, link and date of each entry and request its article page.

        Args:
            response: the downloaded listing page.

        Yields:
            One ``scrapy.Request`` per entry, with the item in ``meta['item']``.
        """
        box_text = response.xpath("//ul/li/div[@class='ket']")
        # NOTE: a single item object is created here and reused by every
        # iteration and every request below.
        items = ScrapedItem()
        for crawl in box_text:
            title = crawl.css("h1 a::text").extract()
            source ="https://investasi.kontan.co.id"+(crawl.css("h1 a::attr(href)").extract()[0])
            date = crawl.css("span.font-gray::text").extract()[0].replace("|","")
            items['title'] = title
            items['source'] =source
            items['date'] = date
            yield scrapy.Request(url = source,
                                 callback=self.parseparagraph,
                                 meta={'item':items})

    def parseparagraph(self, response):
        """Attach the article's paragraph texts to the item passed via ``meta``.

        Args:
            response: the downloaded article page.

        Yields:
            The item with its ``paragraph`` field set to a list of strings.
        """
        items_old = response.meta['item'] #only last item stored
        paragraph = response.xpath("//p/text()").extract()
        items_old['paragraph'] = paragraph #merge into single string
        yield items_old
I expect the date, title, and source in the output to be updated on each pass through the loop,
and the article to be merged into a single string so that it can be stored in MySQL.
I defined an empty dictionary and put those variables within it. Moreover, I've brought about some minor changes in your xpaths and css selectors to make them less error prone. The script is working as desired now:
import scrapy
class CBNCSpider(scrapy.Spider):
    """Spider that reads a kontan.co.id news listing and follows each article."""

    name = 'kontan'
    start_urls = [
        'https://investasi.kontan.co.id/rubrik/28/Emiten'
    ]

    def parse(self, response):
        """Request every article of the listing, carrying its title/link/date.

        Args:
            response: the downloaded listing page.

        Yields:
            One ``scrapy.Request`` per entry that has a link; the scraped
            fields travel in ``meta['item']``.
        """
        for crawl in response.xpath("//*[@id='list-news']//*[@class='ket']"):
            href = crawl.css("h1 > a::attr(href)").get()
            if href is None:
                # Without a link there is no article page to request.
                continue
            date = crawl.css("span.font-gray::text").get() or ""
            # A new dict per entry, so requests do not share state.
            d = {
                'title': crawl.css("h1 > a::text").get(),
                'source': response.urljoin(href),
                'date': date.replace("|", "").strip(),
            }
            yield scrapy.Request(
                url=d['source'],
                callback=self.parseparagraph,
                meta={'item': d},
            )

    def parseparagraph(self, response):
        """Add the article body, merged into a single string, to the item.

        Args:
            response: the downloaded article page.

        Yields:
            The dict from ``meta['item']`` with ``paragraph`` set to the
            non-empty ``<p>`` text nodes joined by single spaces.
        """
        items_old = response.meta['item']
        pieces = (text.strip() for text in response.xpath("//p/text()").getall())
        items_old['paragraph'] = " ".join(piece for piece in pieces if piece)
        yield items_old

How to run both items in scrapy function?

When I put both the captions link and the transcription link in the start_urls variable, I first get the captions price in both the Caption_price and Transcription_price fields, and then the transcription price in both fields. Why does this happen, and how can I solve it?
import scrapy
from .. items import FetchingItem
class SiteFetching(scrapy.Spider):
    """Spider that reads the price text from two rev.com freelancer pages."""

    name = 'Site'
    start_urls = ['https://www.rev.com/freelancers/captions',
                  'https://www.rev.com/freelancers/transcription']

    def parse(self, response):
        """Yield one item per start URL with both price fields filled.

        Args:
            response: the downloaded page of one start URL.

        Yields:
            A ``FetchingItem``; both fields come from the same selector on
            the same response, so they always hold identical values.
        """
        items = FetchingItem()
        Transcription_price = response.css('#middle-benefit .mt1::text').extract()
        Caption_price = response.css('#middle-benefit .mt1::text').extract()
        items['Transcription_price'] = Transcription_price
        items['Caption_price'] = Caption_price
        yield items
I suspect that you need a different, sequential class structure:
import scrapy
from .. items import FetchingItem
class SiteFetching(scrapy.Spider):
    """Spider that reads the captions price, then the transcription price."""

    name = 'Site'
    start_urls = ['https://www.rev.com/freelancers/captions']

    def parse(self, response):
        """Store the captions price and request the transcription page.

        Args:
            response: the downloaded captions page.

        Yields:
            A request for the transcription page carrying the partly
            filled item in ``meta['items']``.
        """
        items = FetchingItem()
        items['Caption_price'] = response.css('#middle-benefit .mt1::text').extract()
        # ``scrapy.Request`` is used because only ``scrapy`` itself is imported.
        yield scrapy.Request(
            'https://www.rev.com/freelancers/transcription',
            self.parse_transcription,
            meta={'items': items},
        )

    def parse_transcription(self, response):
        """Add the transcription price to the item and emit it.

        Args:
            response: the downloaded transcription page.

        Yields:
            The item with both price fields set.
        """
        items = response.meta['items']
        items['Transcription_price'] = response.css('#middle-benefit .mt1::text').extract()
        yield items

Python: How to append a string to a scrapy list item?

I'm scraping a collection of urls, but they all lack the base of the url, so I want to append the "start_url" as a base to each scraped url.
Spider class:
class MySpider(BaseSpider):
    """Spider that lists the blog entries on teslamotors.com."""

    name = "teslanews"
    allowed_domains = ["teslamotors.com"]
    start_urls = ["http://www.teslamotors.com/blog"]

    def parse(self, response):
        """Collect date, title and (relative) url of every blog entry.

        Args:
            response: the downloaded blog index page.

        Returns:
            A list with one ``TeslanewsItem`` per entry; each field is the
            list returned by ``extract()``.
        """
        hxs = HtmlXPathSelector(response)
        updates = hxs.xpath('//div[@class="blog-wrapper no-image"]')
        items = []
        for article in updates:
            item = TeslanewsItem()
            item["date"] = article.xpath('./div/span/span/text()').extract()
            item["title"] = article.xpath('./h2/a/text()').extract()
            item["url"] = article.xpath('./h2/a/@href').extract()
            items.append(item)
        return items
I can't simply write item["url"] = article.xpath('./h2/a/@href').extract() + base, with base = "http://www.teslamotors.com",
because this adds the base to the end, and it does so letter by letter (each letter separated by a comma), since extract() returns a list and the code runs inside a for-loop.
I'm relatively new to Scrapy, so I don't know exactly which way to go with this.
from scrapy.spider import BaseSpider
from urlparse import urljoin
class MySpider(BaseSpider):
    """Spider that lists the blog entries on teslamotors.com with absolute URLs."""

    name = "teslanews"
    allowed_domains = ["teslamotors.com"]
    base = "http://www.teslamotors.com/blog"
    start_urls = ["http://www.teslamotors.com/blog"]

    def parse(self, response):
        """Collect date, title and absolute url of every blog entry.

        Args:
            response: the downloaded blog index page.

        Returns:
            A list with one ``TeslanewsItem`` per entry.
        """
        updates = response.xpath('//div[@class="blog-wrapper no-image"]')
        items = []
        for article in updates:
            item = TeslanewsItem()
            item["date"] = article.xpath('./div/span/span/text()').extract()
            item["title"] = article.xpath('./h2/a/text()').extract()
            # extract() returns a list; join it into one string before
            # resolving it against the base URL.
            item['url'] = urljoin(self.base, ''.join(article.xpath('./h2/a/@href').extract()))
            # Without this append the method would always return an empty list.
            items.append(item)
        return items

Categories