I'm using the Scrapy library to crawl data from a website.
I get the result from crawling a website and I want to save it to a database. I use a Scrapy item and pipeline for that.
I get a list, so I need to use a for loop to save the items. But the problem is that only the last item in the list gets saved.
My code is as follows:
def parse(self, response):
    vehicles = []
    total_results = response.css('.cl-filters-summary-counter::text').extract_first().replace('.', '')
    reference_urls = []
    for url in response.css('.cldt-summary-titles'):
        reference_url = url.css("a::attr(href)").extract_first().strip(' \t\n\r')
        reference_urls.append(reference_url)
    ids = []
    for item in response.css('.cldt-summary-full-item'):
        car_id = item.css("::attr(id)").extract_first().strip(' \t\n\rli-')
        ids.append(car_id)
    prices = []
    for item in response.css('.cldt-price'):
        dirty_price = item.css("::text").extract_first().strip(' \t\n\r')
        comma = dirty_price.index(",-")
        price = dirty_price[2:comma].replace('.', '')
        prices.append(price)
    for item in zip(ids, reference_urls, prices):
        car = CarItem()
        car['reference'] = item[0]
        car['reference_url'] = item[1]
        car['data'] = ""
        car['price'] = item[2]
    return car
The result that I get from crawling is good. If, inside the for loop, I do something like this:
vehicles = []
for item in zip(ids, reference_urls, prices):
    scraped_info = {
        "reference": item[0],
        "reference_url": item[1],
        "price": item[2]
    }
    vehicles.append(scraped_info)
And if I print vehicles I get the right result:
[
    {
        "price": "4250",
        "reference": "6784086e-1afb-216d-e053-e250040a033f",
        "reference_url": "some-link-1"
    },
    {
        "price": "4250",
        "reference": "c05595ac-e49e-4b71-a436-868c192ef82c",
        "reference_url": "some-link-2"
    },
    {
        "price": "4900",
        "reference": "444553f2-e8fd-41c9-9244-182668544e2a",
        "reference_url": "some-link-3"
    }
]
UPDATE
CarItem is just a Scrapy item defined in items.py:
class CarItem(scrapy.Item):
    # define the fields for your item here like:
    reference = scrapy.Field()
    reference_url = scrapy.Field()
    data = scrapy.Field()
    price = scrapy.Field()
Any idea what I'm doing wrong?
According to the Scrapy documentation, the parse method, as well as any other Request callback, must return an iterable of Request and/or item objects.
Also, according to the code example at that link:
import scrapy
from myproject.items import MyItem

class MySpider(scrapy.Spider):
    name = 'example.com'
    allowed_domains = ['example.com']

    def start_requests(self):
        yield scrapy.Request('http://www.example.com/1.html', self.parse)
        yield scrapy.Request('http://www.example.com/2.html', self.parse)
        yield scrapy.Request('http://www.example.com/3.html', self.parse)

    def parse(self, response):
        for h3 in response.xpath('//h3').extract():
            yield MyItem(title=h3)

        for url in response.xpath('//a/@href').extract():
            yield scrapy.Request(url, callback=self.parse)
We can see that we have to use yield to get proper results from the parse function.
tl;dr: replace the return in your last line with yield.
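Applied to the end of your parse method, that looks roughly like this (a sketch based on the question's code; ids, reference_urls and prices are built exactly as before, and the yield has to sit inside the for loop so that every item is emitted):

def parse(self, response):
    # ... build ids, reference_urls and prices as in the question ...
    for item in zip(ids, reference_urls, prices):
        car = CarItem()
        car['reference'] = item[0]
        car['reference_url'] = item[1]
        car['data'] = ""
        car['price'] = item[2]
        yield car  # yield each car so all of them reach the pipeline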
Related
I am trying to scrape an e-commerce website for its products, and I am currently facing an issue where not all of the pages produced by pagination get visited. The links themselves are valid and reachable, not non-existent.
My spider code:
import scrapy
import json
from pbl.items import ShopCard

class SpidermaximaSpider(scrapy.Spider):
    name = 'spiderMaxima'
    allowed_domains = ['www.trobos.lt']
    start_urls = ['https://trobos.lt/prekes?vendor=MAXIMA']
    item = []
    list = [{
        'sid': 10,
        'name': 'Maxima',
        'domain': 'https://www.maxima.lt/',
        'imageurl': 'https://upload.wikimedia.org/wikipedia/commons/c/c1/Maxima_logo.svg',
        'product': item
    }]

    def __init__(self):
        self.declare_xpath()

    def declare_xpath(self):
        self.getAllItemsXpath = '//*[@id="category"]/div/div[1]/div/div[3]/div[4]/div/div/div/div/div/a/@href'
        self.TitleXpath = '//*[@id="product"]/section[1]/div[3]/section/div[2]/h1/text()'
        self.PriceXpath = '//*[@id="product"]/section[1]/div[3]/section/div[2]/div[1]/div/div[1]/div/div[1]/span/text()'

    def parse(self, response):
        for href in response.xpath(self.getAllItemsXpath):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url=url, callback=self.parse_main_item, dont_filter=True)

        next_page = [response.url + '&page=' + str(x) for x in range(1, 193)]
        for page in next_page:
            print('-' * 100)
            print(page)
            print('-' * 100)
            url = page
            yield scrapy.Request(url, callback=self.parse)

    def parse_main_item(self, response):
        shop = ShopCard()
        Title = response.xpath(self.TitleXpath).extract_first()
        Link = response.url
        Image = 'https://upload.wikimedia.org/wikipedia/commons/c/c1/Maxima_logo.svg'
        Price = response.xpath(self.PriceXpath).extract_first()
        Price = Price.replace(',', '.')
        Price = float(Price.split(' ')[0])
        shop['item'] = {
            'title': Title,
            'link': Link,
            'image': Image,
            'price': Price
        }
        self.item.append(shop['item'])

    def closed(self, reason):
        with open("spiderMaxima.json", "w") as final:
            json.dump(self.list, final, indent=2, ensure_ascii=False)
I am using a list built with range() because, in the response (viewed with view(response) from scrapy shell), the pagination buttons are wired to a script.
I have also tried several of the links in scrapy shell; the XPath outputs work, but the pages still do not get scraped. What might be the issue? Are there other ways to deal with pagination?
There are many things wrong with your code, and other things that can be improved. Please read the documentation carefully.
There's really no need to create xpath attributes.
You can write the XPath much more concisely.
You can build the full start_urls list from the beginning.
You can let the feed exporter handle the JSON.
Here's an example; adapt it to your needs.
import scrapy

class ShopCard(scrapy.Item):
    item = scrapy.Field()

class SpidermaximaSpider(scrapy.Spider):
    name = 'spiderMaxima'
    allowed_domains = ['trobos.lt']
    start_urls = [f'https://trobos.lt/prekes?vendor=MAXIMA&page={i}' for i in range(1, 190)]
    items = []
    custom_settings = {
        'DOWNLOAD_DELAY': 0.4,
        'FEEDS': {
            'spiderMaxima.json': {
                'format': 'json',
                'indent': 2,
            }
        }
    }

    def parse(self, response):
        for url in response.xpath('//div[@class="card small"]//a[contains(@class, "shrink")]/@href').getall():
            yield response.follow(url=url, callback=self.parse_main_item)

    def parse_main_item(self, response):
        shop = ShopCard()
        Title = response.xpath('//h1/text()').get()
        Link = response.url
        Image = 'https://upload.wikimedia.org/wikipedia/commons/c/c1/Maxima_logo.svg'
        Price = response.xpath('//div[@class="price"]//span/text()').get()
        Price = Price.replace(',', '.')
        Price = float(Price.split(' ')[0])
        shop['item'] = {
            'title': Title,
            'link': Link,
            'image': Image,
            'price': Price
        }
        yield shop
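With this layout you can simply run scrapy crawl spiderMaxima; the FEEDS setting writes spiderMaxima.json for you, so there is no need to collect items in a class attribute or to dump JSON in closed().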
I have created a Scrapy web scraper, but I have no idea how to create test cases and data point validators for it.
I have three interconnected parsers:
def parse(self, response):
    urls = response.xpath('').extract()
    for url in urls:
        yield scrapy.Request(url, callback=self.parse_company_index)

def parse_company_index(self, response):
    print("processing:" + response.url)
    name = response.xpath('').extract()
    urls = response.xpath('').extract()
    data = zip(name, urls)
    for item in data:
        dict = {
            'record_type': 'company_index',
            'company_name': item[0],
            'source_url': item[1],
        }
        yield dict
    for url in urls:
        yield scrapy.Request(url, callback=self.parse_company_profiles)
    next_page = response.xpath('').get()
    if next_page is not None:
        yield scrapy.Request(next_page, callback=self.parse_company_index)

def parse_company_profiles(self, response):
    Company_name = response.xpath('').extract()
    Company_location = response.xpath('').extract()
    Company_website = response.xpath('').extract()
    Company_webdomain = response.xpath('').extract()
    Company_industry = response.xpath('').extract()
    Company_employee_size = response.xpath('').extract()
    Company_revenue = response.xpath('').extract()
    Contact_name = response.xpath('').extract()
    Contact_jobtitle = response.xpath('').extract()
    Contact_email_domain = response.xpath('').extract()
    Contact_detail = []
    if Contact_name:
        for i in range(len(Contact_name)):
            Contact_detail.append({'Contact_name': Contact_name[i],
                                   'Contact_jobtitle': Contact_jobtitle[i],
                                   'Contact_email_domain': Contact_email_domain[i]})
        Contact_details = [Contact_detail]
    else:
        Contact_details = ["None"]
    data = zip(Company_name, Company_location, Company_website, Company_webdomain, Company_industry,
               Company_employee_size, Company_revenue, Contact_details)
    for item in data:
        dict = {
            'record_type': 'company_profiles',
            'company_name': item[0],
            'company_location': item[1],
            'company_website': item[2],
            'company_webdomain': item[3],
            'company_industry': item[4],
            'company_employee_size': item[5],
            'company_revenue': item[6],
            'contact_details': item[7]
        }
        yield dict
I first thought this meant writing unit tests, but now that I think about it, that does not seem to be it. Also, if it is unit testing, what approach should I take to tackle this problem?
Any help will be appreciated.
You can check out pydantic for writing data point validators for web scrapers:
https://pydantic-docs.helpmanual.io/usage/validators/
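As a rough illustration, here is a minimal sketch in pydantic v1 style (matching the linked docs); the field names mirror the company_profiles dict from the question, and everything else is an assumption:

from pydantic import BaseModel, ValidationError, validator

class CompanyProfile(BaseModel):
    record_type: str
    company_name: str
    company_website: str
    company_employee_size: str

    @validator('company_website')
    def website_must_have_scheme(cls, v):
        # Reject values that are clearly not full URLs.
        if not v.startswith(('http://', 'https://')):
            raise ValueError('company_website must start with http:// or https://')
        return v

# Usage: validate each dict the spider yields, e.g. from an item pipeline.
try:
    CompanyProfile(record_type='company_profiles',
                   company_name='Acme',
                   company_website='https://acme.example',
                   company_employee_size='50-100')
except ValidationError as err:
    print(err)

An item pipeline (or the parse callbacks themselves) can run this check and drop or log records that fail validation.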
I'm trying to scrape a game-info website using Scrapy. The scraping process goes like this: scrape the categories -> scrape the list of games (multiple pages per category) -> scrape the game info.
The scraped info is supposed to go into a JSON file. I'm getting the following result:
[
    {"category": "cat1", "games": [...]},
    {"category": "cat2", "games": [...]},
    ...
]
but I want to get this result:
{
    "categories": [
        {"category": "cat1", "games": [...]},
        {"category": "cat2", "games": [...]},
        ...
    ]
}
I tried to use the steps from this post and this post, with no success, and couldn't find more related questions.
I would appreciate any help.
My spider:
import scrapy
from ..items import Category, Game

class GamesSpider(scrapy.Spider):
    name = 'games'
    start_urls = ['https://www.example.com/categories']
    base_url = 'https://www.example.com'

    def parse(self, response):
        categories = response.xpath("...")
        for category in categories:
            cat_name = category.xpath(".//text()").get()
            url = self.base_url + category.xpath(".//@href").get()
            cat = Category()
            cat['category'] = cat_name
            yield response.follow(url=url,
                                  callback=self.parse_category,
                                  meta={'category': cat})

    def parse_category(self, response):
        games_url_list = response.xpath('//.../a/@href').getall()
        cat = response.meta['category']
        url = self.base_url + games_url_list.pop()
        next_page = response.xpath('//a[...]/@href').get()
        if next_page:
            next_page = self.base_url + response.xpath('//a[...]/@href').get()
        yield response.follow(url=url,
                              callback=self.parse_game,
                              meta={'category': cat,
                                    'games_url_list': games_url_list,
                                    'next_page': next_page})

    def parse_game(self, response):
        cat = response.meta['category']
        game = Game()
        try:
            cat['games_list']
        except:
            cat['games_list'] = []
        game['title_en'] = response.xpath('...')
        game['os'] = response.xpath('...')
        game['users_rating'] = response.xpath('...')
        cat['games_list'].append(game)
        games_url_list = response.meta['games_url_list']
        next_page = response.meta['next_page']
        if games_url_list:
            url = self.base_url + games_url_list.pop()
            yield response.follow(url=url,
                                  callback=self.parse_game,
                                  meta={'category': cat,
                                        'games_url_list': games_url_list,
                                        'next_page': next_page})
        else:
            if next_page:
                yield response.follow(url=next_page,
                                      callback=self.parse_category,
                                      meta={'category': cat})
            else:
                yield cat
My items.py file:
import scrapy

class Category(scrapy.Item):
    category = scrapy.Field()
    games_list = scrapy.Field()

class Game(scrapy.Item):
    title_en = scrapy.Field()
    os = scrapy.Field()
    users_rating = scrapy.Field()
You need to write a custom item exporter, or handle post-processing of the file generated by Scrapy separately, e.g. with a standalone Python script that converts from the output format to the desired format.
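For the post-processing route, a minimal sketch could look like this (assuming the spider was run with an output file such as output.json containing the flat list shown above; the file names are placeholders):

import json

# Read the flat list of category items produced by the spider.
with open('output.json', encoding='utf-8') as f:
    categories = json.load(f)

# Nest the list under the desired top-level key.
wrapped = {'categories': categories}

with open('output_wrapped.json', 'w', encoding='utf-8') as f:
    json.dump(wrapped, f, indent=2, ensure_ascii=False)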
I need to parse data from a site and, after parsing, save the data to disk. I am using Scrapy. While parsing, I need to get data from another page. How can I do that?
class MySpider(scrapy.Spider):
    name = "my_spyder"

    start_urls = [
        'https://www.example.com/title/1',
        'https://www.example.com/title/2',
        'https://www.example.com/title/3',
    ]

    def parse(self, response):
        item = MyItem()
        main_page_selector = Selector(response)
        ...
        tagline_url = os.path.join(response.url, 'taglines')
        request = Request(url=tagline_url, callback=self.get_tags)
        item['tags'] = yield request
        ...
        yield item

    def get_tags(self, response):
        tagline_selector = Selector(response)
        taglines = []
        for tag in tagline_selector.xpath('//div[@class="soda even"]/text()').getall():
            taglines.append(tag.strip())
        return taglines
How do I write the taglines obtained in get_tags into the item's 'tags' field?
These requests are executed asynchronously, so you cannot assign the result of the request directly; pass the item along with the request via meta instead:
request = Request(url=tagline_url, callback=self.get_tags)
request.meta["item"] = item
yield request
The code above goes in the parse method.
item = response.meta["item"]
#...
item["tags"] = taglines
yield item
The second snippet goes in the get_tags method.
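Put together, the pattern looks roughly like this (a sketch based on the spider from the question; the XPath, the MyItem import path, and the start URL are assumptions carried over from the question):

import os
import scrapy

from myproject.items import MyItem  # assumption: wherever MyItem is defined

class MySpider(scrapy.Spider):
    name = "my_spyder"
    start_urls = ["https://www.example.com/title/1"]

    def parse(self, response):
        item = MyItem()
        # ... fill the other fields of the item here ...
        tagline_url = os.path.join(response.url, "taglines")
        request = scrapy.Request(url=tagline_url, callback=self.get_tags)
        request.meta["item"] = item  # carry the partially filled item along
        yield request

    def get_tags(self, response):
        item = response.meta["item"]  # pick the item back up
        taglines = [tag.strip() for tag in
                    response.xpath('//div[@class="soda even"]/text()').getall()]
        item["tags"] = taglines
        yield item  # the finished item is exported from here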
My spider looks like this:
class ScrapeMovies(scrapy.Spider):
    start_urls = [
        'https://www.trekearth.com/members/page1.htm?sort_by=md'
    ]

    def parse(self, response):
        for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
            item = loopitem()
            website = row.xpath('./td[2]//a/@href/text()').extract_first()
            item['name'] = row.xpath('./td[2]//a/text()').extract_first()
            yield item

        # This part is responsible for scraping all of the pages on a start url, commented out for convenience
        # next_page = response.xpath('//div[@class="page-nav-btm"]/ul/li[last()]/a/@href').extract_first()
        # if next_page is not None:
        #     next_page = response.urljoin(next_page)
        #     yield scrapy.Request(next_page, callback=self.parse)
What it does as of now: it scrapes the table (see the starting URL). I want it to then follow the link in the member's name column, extract some information from that page (e.g. https://www.trekearth.com/members/monareng/), and then return this as an item.
How should I approach this?
If anything is unclear please do not hesitate to ask for clarification.
EDIT:
Now my code looks as follows (however, it still does not work):
class ScrapeMovies(scrapy.Spider):
    name = 'final'
    start_urls = [
        'https://www.trekearth.com/members/page1.htm?sort_by=md'
    ]

    def parse(self, response):
        for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
            item = FinalItem()
            website = row.xpath('./td[2]//a/@href/text()').extract_first()
            item['name'] = row.xpath('./td[2]//a/text()').extract_first()
            request = scrapy.Request(website,
                                     callback=self.parse_page2)
            request.meta['item'] = item
            return request

    def parse_page2(self, response):
        item = response.meta['item']
        item['other_url'] = response.url
        item['groups'] = response.xpath('//div[@class="groups-btm"]/ul/li/text()').extract_first()
        return item
Use the meta field to pass the item forward to the next callback:
def parse_page1(self, response):
    item = MyItem(main_url=response.url)
    request = scrapy.Request("http://www.example.com/some_page.html",
                             callback=self.parse_page2)
    request.meta['item'] = item
    return request

def parse_page2(self, response):
    item = response.meta['item']
    item['other_url'] = response.url
    return item
UPD: to process all rows, use yield in your loop:
for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
    item = FinalItem()
    website = row.xpath('./td[2]//a/@href/text()').extract_first()
    item['name'] = row.xpath('./td[2]//a/text()').extract_first()
    request = scrapy.Request(website,
                             callback=self.parse_page2)
    request.meta['item'] = item
    yield request