How do I select a specific element inside another element with Scrapy (Python)?

import scrapy


class rlgSpider(scrapy.Spider):
    name = 'bot'
    start_urls = [
        'https://rocket-league.com/trading?filterItem=0&filterCertification=0&filterPaint=0&filterPlatform=1&filterSearchType=1&filterItemType=0&p=1']

    def parse(self, response):
        data = {}
        offers = response.xpath('//div[@class="col-3-3"]')
        for offer in offers:
            for item in offer.xpath('//div[@class="rlg-trade-display-container is--user"]/div[@class="rlg-trade-display-items"]/div[@class="col-1-2 rlg-trade-display-items-container"]/a'):
                data['name'] = item.xpath('//div/div[@position="relative"]/h2').extract()
                yield data
Here is what I have so far, and it doesn't work well: it scrapes the URL instead of the h2 tag. How do I get at the h2 when it's nested inside so many divs?

In order to parse through an element in Scrapy you need to start your XPath with ".", otherwise you will be querying the whole response instead of the element. This is the correct way of doing it:
def parse(self, response):
    offers = response.xpath('//div[@class="col-3-3"]')
    for offer in offers:
        for item in offer.xpath('.//div[@class="rlg-trade-display-container is--user"]/div[@class="rlg-trade-display-items"]/div[@class="col-1-2 rlg-trade-display-items-container"]/a'):
            data = {}
            data['name'] = item.xpath('.//h2/text()').extract_first()
            yield data
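To make the difference concrete, here is a minimal sketch (the div class and markup below are illustrative, not taken from the site above): an XPath starting with // is always evaluated against the whole document, even when called on a sub-selector, while .// is evaluated relative to that selector.

for product in response.xpath('//div[@class="product"]'):
    # absolute path: matches the first <h2> in the entire page on every iteration
    first_h2_on_page = product.xpath('//h2/text()').get()
    # relative path: matches only the <h2> nested inside this particular product block
    h2_in_this_block = product.xpath('.//h2/text()').get()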


Crawling through multiple links on Scrapy

I'm trying to first crawl through the main page of this website for the links to a table for each year. Then I'd like to scrape each site, while maintaining record of each year.
So far I have my spider constructed as:
div = response.xpath('//*[@id="sidebar"]/div[1]/nav/ul/li[5]/div')
hrefs = div.xpath('*//a').extract()

splits = {}
for href in hrefs:
    split = href.split('"')
    link = split[1]
    date = split[2]
    clean_date = "".join(re.findall("[^><a/]", date))
    clean_link = "http://www.ylioppilastutkinto.fi" + str(link)
    splits[clean_date] = clean_link
I would then like to go through each link in this dictionary and crawl it, using the following logic:
table = resp.xpath('//*[@id="content"]/table/tbody')
rows = table.xpath('//tr')

data_dict = {"Category":
             [w3lib.html.remove_tags(num.get()) for num in rows[0].xpath('td')[1:]]
             }

for row in rows[1:]:
    data = row.xpath('td')
    title = w3lib.html.remove_tags(data[0].get())
    nums = [w3lib.html.remove_tags(num.get()) for num in data[1:]]
    data_dict[title] = nums
My problem is that I couldn't find a way to do this effectively. Calling scrapy.Request on the url returns a response containing just <html></html>. If there were a way for the response object to resemble the one given by the fetch command in the Scrapy shell, that would be ideal, since I've based the selection logic on testing with that command.
Edit:
Here's the entire spider so far
The idea is to run the first for loop to get the links and then the second for loop to extract the tables from those links.
import scrapy
import regex as re
from scrapy.http import HtmlResponse
import w3lib.html


class MainSpider(scrapy.Spider):
    name = 'links'
    allowed_domains = ['www.ylioppilastutkinto.fi/ylioppilastutkinto/pisterajat']
    start_urls = ['https://www.ylioppilastutkinto.fi/ylioppilastutkinto/pisterajat/']

    def parse(self, response):
        div = response.xpath('//*[@id="sidebar"]/div[1]/nav/ul/li[5]/div')
        hrefs = div.xpath('*//a').extract()

        splits = {}
        for href in hrefs:
            split = href.split('"')
            link = split[1]
            date = split[2]
            clean_date = "".join(re.findall("[^><a/]", date))
            clean_link = "http://www.ylioppilastutkinto.fi" + str(link)
            splits[clean_date] = clean_link

        for date, url in splits.items():
            resp = HtmlResponse(url)
            table = resp.xpath('//*[@id="content"]/table/tbody')
            rows = table.xpath('//tr')

            data_dict = {"Category": [w3lib.html.remove_tags(num.get()) for num in rows[0].xpath('td')[1:]]}

            for row in rows[1:]:
                data = row.xpath('td')
                title = w3lib.html.remove_tags(data[0].get())
                nums = [w3lib.html.remove_tags(num.get()) for num in data[1:]]
                data_dict[title] = nums

            yield {
                'Date': date,
                'Scores': data_dict}
Initializing an HtmlResponse(url) doesn't accomplish anything, since the class doesn't make the request itself.
To add a request to Scrapy's scheduler, you need to yield one, e.g. yield scrapy.Request(url, callback=self.parse).
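Applied to the spider above, the minimal fix is to yield a request for each collected URL and move the table parsing into a callback, roughly like this (a sketch; parse_table is a hypothetical method that would hold the existing table-extraction code):

        # Inside parse(), replace the HtmlResponse(url) loop with yielded requests.
        for date, url in splits.items():
            yield scrapy.Request(url, callback=self.parse_table, cb_kwargs={'date': date})

    def parse_table(self, response, date):
        # The table/rows/data_dict logic from the question goes here,
        # now operating on a response that was actually downloaded.
        ...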
That being said, there are many improvements you can make to your spider:
- Use Scrapy's built-in LinkExtractor instead of string splitting.
- Use CSS selectors instead of the hardcoded XPaths.
- Use selector.root.text instead of w3lib.html.remove_tags (to remove the dependency entirely).
Here is a working example:
import scrapy
from scrapy.linkextractors import LinkExtractor


class MainSpider(scrapy.Spider):
    name = 'links'
    allowed_domains = ['www.ylioppilastutkinto.fi']
    start_urls = ['https://www.ylioppilastutkinto.fi/ylioppilastutkinto/pisterajat/']

    def parse(self, response):
        le = LinkExtractor(
            allow_domains=self.allowed_domains,
            restrict_xpaths='//*[@id="sidebar"]/div[1]/nav/ul/li[5]/div',
        )
        for link in le.extract_links(response):
            yield scrapy.Request(
                url=link.url,
                callback=self.parse_table,
                cb_kwargs={'date': link.text},
            )

    def parse_table(self, response, date):
        rows = response.css('#content table tbody tr')
        if not rows:
            print(f'No table found for url: {response.url}')
            return

        category = [char.root.text for char in rows[0].css('td strong')[1:]]
        if not category:
            category = [char.root.text for char in rows[0].css('td')[1:]]

        for row in rows[1:]:
            cols = row.css('td')
            title = cols[0].root.text
            nums = [col.root.text for col in cols[1:]]
            yield {
                'Date': date,
                'Category': category,
                title: nums
            }
Note that your category parsing doesn't appear to work. I'm not exactly sure what you are trying to extract, so I'll leave that one for you.

How to crawl all hrefs on Zomato?

I am trying to crawl Zomato to get info on the restaurants in Istanbul, so I am trying to get all the hrefs on the search result pages. However, I am only getting the first search result of every page.
import scrapy
from ..items import ZomatodataItem


class ZomatoSpider(scrapy.Spider):
    name = 'zomato'
    allowed_domains = ["zomato.com"]
    start_urls = [
        'https://www.zomato.com/istanbul/restaurants?page=1'
    ]

    def parse(self, response):
        all_css = response.css('.search_left_featured')
        all_product = all_css.css('a::attr(href)').get()
        yield scrapy.Request(all_product, callback=self.parse_dir_contents)

        max_page_number = 6
        for i in range(1, max_page_number):
            url_next = 'https://www.zomato.com/istanbul/restaurants?page=' + str(i)
            yield scrapy.Request(url_next, callback=self.parse)

    def parse_dir_contents(self, response):
        items = ZomatodataItem()
        items['name'] = response.css('.diBDma::text').extract()
        items['genre'] = response.css('.gQXqL::text').extract_first()
        items['tags'] = response.css('.cunMUz::text').extract()
        items['address'] = response.css('.clKRrC::text').extract()
        items['phone_number'] = response.css('.kKemRh::text').extract()
        yield items
It makes sense that you only get one result: all_product will only ever contain one item, because .get() returns just the first match. If you want the full list, you'll have to update it to this:
all_products = all_css.css('a::attr(href)').getall()
Now you can loop through the links and get the detailed information like this:
for product in all_products:
    yield scrapy.Request(product, callback=self.parse_dir_contents)
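Putting both changes together, the revised parse method might look like this (a sketch that keeps the question's selectors and pagination logic unchanged):

    def parse(self, response):
        # Collect every restaurant link in the featured search results, not just the first one.
        all_products = response.css('.search_left_featured a::attr(href)').getall()
        for product in all_products:
            yield scrapy.Request(product, callback=self.parse_dir_contents)

        # Follow the remaining search result pages exactly as before.
        max_page_number = 6
        for i in range(1, max_page_number):
            url_next = 'https://www.zomato.com/istanbul/restaurants?page=' + str(i)
            yield scrapy.Request(url_next, callback=self.parse)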

Scrapy yield only last data and merge scrapy data into one

I am scraping a news website with the Scrapy framework, but it seems to store only the last item scraped, repeated for every iteration of the loop.
I want to store the Title, Date, and Link, which I scrape from the first page, and also store the whole news article. So I want to merge the article, which is stored as a list, into a single string.
Item code
import scrapy


class ScrapedItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    source = scrapy.Field()
    date = scrapy.Field()
    paragraph = scrapy.Field()
Spider code
import scrapy
from ..items import ScrapedItem


class CBNCSpider(scrapy.Spider):
    name = 'kontan'
    start_urls = [
        'https://investasi.kontan.co.id/rubrik/28/Emiten'
    ]

    def parse(self, response):
        box_text = response.xpath("//ul/li/div[@class='ket']")
        items = ScrapedItem()
        for crawl in box_text:
            title = crawl.css("h1 a::text").extract()
            source = "https://investasi.kontan.co.id" + (crawl.css("h1 a::attr(href)").extract()[0])
            date = crawl.css("span.font-gray::text").extract()[0].replace("|", "")

            items['title'] = title
            items['source'] = source
            items['date'] = date

            yield scrapy.Request(url=source,
                                 callback=self.parseparagraph,
                                 meta={'item': items})

    def parseparagraph(self, response):
        items_old = response.meta['item']  # only the last item is stored
        paragraph = response.xpath("//p/text()").extract()
        items_old['paragraph'] = paragraph  # want to merge into a single string
        yield items_old
I expect the Date, Title, and Source in the output to be updated on each iteration of the loop, and the article to be merged into a single string so it can be stored in MySQL.
I defined an empty dictionary inside the loop and put those variables within it. Moreover, I've made some minor changes to your XPaths and CSS selectors to make them less error-prone. The script is working as desired now:
import scrapy


class CBNCSpider(scrapy.Spider):
    name = 'kontan'
    start_urls = [
        'https://investasi.kontan.co.id/rubrik/28/Emiten'
    ]

    def parse(self, response):
        for crawl in response.xpath("//*[@id='list-news']//*[@class='ket']"):
            d = {}
            d['title'] = crawl.css("h1 > a::text").get()
            d['source'] = response.urljoin(crawl.css("h1 > a::attr(href)").get())
            d['date'] = crawl.css("span.font-gray::text").get().strip("|")
            yield scrapy.Request(
                url=d['source'],
                callback=self.parseparagraph,
                meta={'item': d}
            )

    def parseparagraph(self, response):
        items_old = response.meta['item']
        items_old['paragraph'] = response.xpath("//p/text()").getall()
        yield items_old
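The question also asked about merging the article into a single string for MySQL. That part is unchanged above, but a straightforward way to do it (keeping the same paragraph XPath) is to join the extracted text nodes in parseparagraph:

    def parseparagraph(self, response):
        items_old = response.meta['item']
        # Join all paragraph text nodes into one string instead of yielding a list.
        items_old['paragraph'] = " ".join(response.xpath("//p/text()").getall()).strip()
        yield items_old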

Remove the unicode from the JSON output using Scrapy

import scrapy
from ex.items import ExItem


class reddit(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["reddit.com"]
    start_urls = [
        "http://www.reddit.com/"]
    """docstring for reddit"""

    def parse(self, response):
        item = ExItem()
        item["title"] = response.xpath('//p[contains(@class,"title")]/a/text()').extract()
        item["rank"] = response.xpath('//span[contains(@class,"rank")]/text()').extract()
        item["votes_dislike"] = response.xpath('//div[contains(@class,"score dislikes")]/text()').extract()
        item["votes_unvoted"] = response.xpath('//div[contains(@class,"score unvoted")]/text()').extract()
        item["votes_likes"] = response.xpath('//div[contains(@class,"score likes")]/text()').extract()
        item["video_reference"] = response.xpath('//a[contains(@class,"thumbnail may-blank")]/@href').extract()
        item["image"] = response.xpath('//a[contains(@class,"thumbnail may-blank")]/img/@src').extract()
I am able to convert this into JSON, but in the output I am getting a bullet in the JSON. How do I remove that and still keep the JSON format?
There are hidden elements that you don't see in the browser. Scrapy sees them.
You just need to search for the data inside the relevant part of the page (div with id="siteTable"):
def parse(self, response):
    # make a selector and search the fields inside it
    sel = response.xpath('//div[@id="siteTable"]')

    item = ExItem()
    item["title"] = sel.xpath('.//p[contains(@class,"title")]/a/text()').extract()
    item["rank"] = sel.xpath('.//span[contains(@class,"rank")]/text()').extract()
    item["votes_dislike"] = sel.xpath('.//div[contains(@class,"score dislikes")]/text()').extract()
    item["votes_unvoted"] = sel.xpath('.//div[contains(@class,"score unvoted")]/text()').extract()
    item["votes_likes"] = sel.xpath('.//div[contains(@class,"score likes")]/text()').extract()
    item["video_reference"] = sel.xpath('.//a[contains(@class,"thumbnail may-blank")]/@href').extract()
    item["image"] = sel.xpath('.//a[contains(@class,"thumbnail may-blank")]/img/@src').extract()
    return item
Tested, here is what I get for, for example, votes_likes:
'votes_likes': [u'5340',
u'4041',
u'4080',
u'5055',
u'4385',
u'4784',
u'3842',
u'3734',
u'4081',
u'3731',
u'4580',
u'5279',
u'2540',
u'4345',
u'2068',
u'3715',
u'3249',
u'4232',
u'4025',
u'522',
u'2993',
u'2789',
u'3529',
u'3450',
u'3533'],
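As for the unicode itself: the u'' prefixes above are just Python 2's repr of unicode strings and disappear once the items are serialized to JSON. If the exported JSON still contains escaped non-ASCII sequences, newer Scrapy versions (1.2+) let you write the feed as real UTF-8 by adding one setting:

    # settings.py
    FEED_EXPORT_ENCODING = 'utf-8'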

Python Scrapy not always downloading data from website

Scrapy is used to parse an HTML page. My question is why Scrapy sometimes returns the response I want, but sometimes does not return a response. Is it my fault? Here's my parsing function:
class AmazonSpider(BaseSpider):
    name = "amazon"
    allowed_domains = ["amazon.org"]
    start_urls = [
        "http://www.amazon.com/s?rh=n%3A283155%2Cp_n_feature_browse-bin%3A2656020011"
    ]

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//div[contains(@class, "result")]')
        items = []
        titles = {'titles': sites[0].xpath('//a[@class="title"]/text()').extract()}
        for title in titles['titles']:
            item = AmazonScrapyItem()
            item['title'] = title
            items.append(item)
        return items
I believe you are just not using the most adequate XPath expression.
Amazon's HTML is kinda messy, not very uniform and therefore not very easy to parse. But after some experimenting I could extract all the 12 titles of a couple of search results with the following parse function:
def parse(self, response):
    sel = Selector(response)
    p = sel.xpath('//div[@class="data"]/h3/a')
    titles = p.xpath('span/text()').extract() + p.xpath('text()').extract()

    items = []
    for title in titles:
        item = AmazonScrapyItem()
        item['title'] = title
        items.append(item)
    return items
If you care about the actual order of the results the above code might not be appropriate but I believe that is not the case.
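If the order does matter, one possible workaround (a sketch, assuming the same //div[@class="data"]/h3/a markup as above) is to walk each result link in document order and take whichever text node it actually has:

def parse(self, response):
    sel = Selector(response)
    items = []
    # Iterate over the result links in page order; a title lives either in a nested
    # <span> or directly in the <a> text, so take whichever is present.
    for link in sel.xpath('//div[@class="data"]/h3/a'):
        title = link.xpath('span/text()').extract_first() or link.xpath('text()').extract_first()
        if title:
            item = AmazonScrapyItem()
            item['title'] = title
            items.append(item)
    return items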
