I'm trying to scrape the names of the hotels shown on the first page of a booking website, using the Scrapy library in Python, but I get an empty CSV file that contains only the column names. Any suggestions? Thank you.
This is the Python code:
import scrapy
import logging
from scrapy.crawler import CrawlerProcess
from scrapy.exporters import CsvItemExporter

class CsvPipeline(object):
    def __init__(self):
        self.file = open('duproprio.tmp', 'wb')
        self.exporter = CsvItemExporter(self.file, str)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_items(self, item, spider):
        self.exporter.export_item(item)
        return item
class DuProprioSpider(scrapy.Spider):
    name = "booking"
    start_urls = [
        "https://www.booking.com/searchresults.fr.html?label=gen173nr-1DCAEoggI46AdIM1gEaIwBiAEBmAENuAEXyAEP2AED6AEBiAIBqAIDuALsycKNBsACAdICJGE1YmJmNDE1LWU2ZTEtNGEzMy05MTcyLThkYmQ2OGI5NWE5OdgCBOACAQ&sid=2e5b4623e13363b5ec7de2d7957c8c22&sb=1&sb_lp=1&src=theme_landing_index&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Fhotel%2Findex.fr.html%3Flabel%3Dgen173nr-1DCAEoggI46AdIM1gEaIwBiAEBmAENuAEXyAEP2AED6AEBiAIBqAIDuALsycKNBsACAdICJGE1YmJmNDE1LWU2ZTEtNGEzMy05MTcyLThkYmQ2OGI5NWE5OdgCBOACAQ%3Bsid%3D2e5b4623e13363b5ec7de2d7957c8c22%3B&ss=Maroc&is_ski_area=&checkin_year=&checkin_month=&checkout_year=&checkout_month=&group_adults=2&group_children=0&no_rooms=1&b_h4u_keep_filters=&from_sf=1&ss_raw=ma&ac_position=1&ac_langcode=fr&ac_click_type=b&dest_id=143&dest_type=country&place_id_lat=32.4281&place_id_lon=-6.92197&search_pageview_id=7ca057bb44b9012d&search_selected=true&search_pageview_id=7ca057bb44b9012d&ac_suggestion_list_length=5&ac_suggestion_theme_list_length=0"]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.CsvPipeline': 1},
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'bookingresult.csv'
    }
    # count = 0
    # total = 25

    def parse(self, response):
        # self.count += 25
        nexturl = "https://www.booking.com/searchresults.fr.html?label=gog235jc-1DCAIojAFCAm1hSA1YA2iMAYgBAZgBDbgBF8gBD9gBA-gBAfgBAogCAagCA7gCj9q5jQbAAgHSAiQ1MDlhN2M0Ny0yMmYwLTRiNDUtYjNhMC0xY2Y1MTg3NWM5ODfYAgTgAgE&sid=2e5b4623e13363b5ec7de2d7957c8c22&aid=356980&dest_id=-38833&dest_type=city&srpvid=00bd4bf5ca01008f&track_hp_back_button=1&nflt=ht_id%3D204&offset=0"
        for i in response.css('div._814193827>div>div>div>div>a'):
            yield scrapy.Request(url=i.xpath('@href').extract_first(), callback=self.parse_detail)
        # if self.count < self.total + 25:
        yield scrapy.Request(nexturl, self.parse)

    def parse_detail(self, response):
        nom_hotel = response.css('h2#hp_hotel_name.hp__hotel-name::text').get()
        yield {
            'nom_hotel': nom_hotel.strip()
        }

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
process.crawl(DuProprioSpider)
process.start()
1. The first result is '\n': the h2 element starts with a whitespace text node, so get() returns just that and strip() leaves an empty string. An example solution with getall():
def parse_detail(self, response):
    nom_hotel = response.css('h2#hp_hotel_name.hp__hotel-name::text').getall()
    nom_hotel = ''.join(nom_hotel)
    yield {
        'nom_hotel': nom_hotel.strip()
    }
Output:
nom_hotel
Camp Sahara Holidays
Lovely House at La perle de Cabo Negro
Riad Dar Salam
Hôtel Auberge du Littoral
Kasbah Sirocco
...
...
...
2. Your pipeline is wrong (Scrapy calls process_item, not your process_items, so your exporter never receives the items), and you'll get the results at the end of the file after many blank lines. Or instead just use the default exporter:
custom_settings = {
    'LOG_LEVEL': logging.WARNING,
    'FEED_EXPORTERS': {'csv': 'scrapy.exporters.CsvItemExporter'},
    'FEED_FORMAT': 'csv',
    'FEED_URI': 'bookingresult.csv'
}
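Note that on Scrapy 2.1 and later, FEED_FORMAT and FEED_URI are deprecated in favour of the FEEDS setting; an equivalent configuration would be:

custom_settings = {
    'LOG_LEVEL': logging.WARNING,
    'FEEDS': {
        'bookingresult.csv': {'format': 'csv'},
    },
}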
3. You don't have to enter each hotel page just to get the name; you can scrape it directly from the search results page. Example:
def parse(self, response):
    nexturl = "https://www.booking.com/searchresults.fr.html?label=gog235jc-1DCAIojAFCAm1hSA1YA2iMAYgBAZgBDbgBF8gBD9gBA-gBAfgBAogCAagCA7gCj9q5jQbAAgHSAiQ1MDlhN2M0Ny0yMmYwLTRiNDUtYjNhMC0xY2Y1MTg3NWM5ODfYAgTgAgE&sid=2e5b4623e13363b5ec7de2d7957c8c22&aid=356980&dest_id=-38833&dest_type=city&srpvid=00bd4bf5ca01008f&track_hp_back_button=1&nflt=ht_id%3D204&offset=0"
    all_names = response.xpath('//div[@data-testid="title"]/text()').getall()
    for name in all_names:
        yield {'nom_hotel': name}
It's faster: instead of 26 requests (the first page plus 25 detail pages) you create just 1 request.
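If you also want the later result pages, a minimal sketch (assuming Booking pages its results through the offset query parameter, 25 per page) could use the w3lib helpers that ship with Scrapy:

import scrapy
from w3lib.url import url_query_parameter, add_or_replace_parameter

def parse(self, response):
    names = response.xpath('//div[@data-testid="title"]/text()').getall()
    for name in names:
        yield {'nom_hotel': name}
    if names:  # stop once a page comes back empty
        # read the current offset (0 if absent) and request the next page
        offset = int(url_query_parameter(response.url, 'offset', '0')) + 25
        yield scrapy.Request(add_or_replace_parameter(response.url, 'offset', str(offset)))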
Related
I'm using generic spiders with a list of multiple URLs in the start_urls field.
Is it possible to export one JSON file for each URL?
As far as I know, it's only possible to set one path to one specific output file.
Any ideas on how to solve this are welcome!
EDIT: This is my spider class:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class MySpider(CrawlSpider):
    name = 'my_spider'
    start_urls = ['www.domain1.com', 'www.domain2.com', 'www.domain3.com']
    custom_settings = {
        'FEED_EXPORT_ENCODING': 'utf-8',
        'DEPTH_LIMIT': '1',
        'FEED_URI': 'file:///C:/path/to/result.json',
    }
    rules = (
        Rule(LinkExtractor(allow=r"abc"), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        all_text = response.xpath("//p/text()").getall()
        yield {
            "text": " ".join(all_text),
            "url": response.url,
        }
First option
You can save the items in the spider itself, as in the Scrapy tutorial, for example:
import scrapy
import json

DICT = {
    'https://quotes.toscrape.com/page/1/': 'domain1.json',
    'https://quotes.toscrape.com/page/2/': 'domain2.json',
}

class MydomainSpider(scrapy.Spider):
    name = "mydomain"
    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://quotes.toscrape.com/page/2/',
    ]

    def parse(self, response):
        filename = DICT[response.url]
        with open(filename, 'w') as fp:
            json.dump({"content": response.body.decode("utf-8")}, fp)
The DICT variable is just for specifying the JSON filename, but you could use the domain as the filename instead.
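For instance, a sketch that derives the filename from the domain with the standard library's urlparse (note that the two example URLs above share a domain, so you would want to include the path too to avoid overwriting):

import json
from urllib.parse import urlparse

def parse(self, response):
    # e.g. 'quotes.toscrape.com' -> 'quotes.toscrape.com.json'
    filename = urlparse(response.url).netloc + '.json'
    with open(filename, 'w') as fp:
        json.dump({"content": response.body.decode("utf-8")}, fp)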
Second option
You can try using process_item in pipelines.py as follows:
from scrapy.exporters import JsonItemExporter

class SaveJsonPipeline:
    def process_item(self, item, spider):
        filename = item['filename']
        del item['filename']
        JsonItemExporter(open(filename, "wb")).export_item(item)
        return item
item['filename'] stores the filename for each start_url. You need to set up items.py too, for example:
import scrapy

class MydomainItem(scrapy.Item):
    filename = scrapy.Field()
    content = scrapy.Field()
Your spider:
import scrapy
from ..items import MydomainItem

DICT = {
    'https://quotes.toscrape.com/page/1/': 'domain1.json',
    'https://quotes.toscrape.com/page/2/': 'domain2.json',
}

class MydomainSpider(scrapy.Spider):
    name = 'mydomain'
    allowed_domains = ['mydomain.com']
    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://quotes.toscrape.com/page/2/',
    ]

    def parse(self, response):
        item = MydomainItem()
        item["filename"] = DICT[response.url]
        item["content"] = response.body.decode("utf-8")
        yield item
Before running, you need to add the pipeline to your settings:
ITEM_PIPELINES = {
    'myproject.pipelines.SaveJsonPipeline': 300,
}
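One caveat with the pipeline above: it opens a new file per item without closing it, and it skips the exporter's start/finish calls, so the output isn't a valid JSON array. A variant that handles both (a sketch with the same illustrative fields) could be:

from scrapy.exporters import JsonItemExporter

class SaveJsonPipeline:
    def process_item(self, item, spider):
        filename = item['filename']
        del item['filename']
        # open, export and close the file explicitly so each item lands in valid JSON
        with open(filename, 'wb') as fp:
            exporter = JsonItemExporter(fp)
            exporter.start_exporting()
            exporter.export_item(item)
            exporter.finish_exporting()
        return item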
I am currently trying to crawl the Company Overview from alibaba.com.
For instance: https://www.alibaba.com/product-detail/T14-series-original-air-pro-TWS_1600273931389.html?spm=a2700.galleryofferlist.normal_offer.d_title.4aa778f2ahtuBx&s=p
For getting information like the company name, I did:
response.xpath("//a[@class='company-name company-name-lite-vb']/text()").extract()
Which works fine.
When entering "Company Overview" > "Company Profile" and then trying to crawl information from the table with:
response.xpath("//div/div[@class='content-value']").extract()
I get an empty array.
resources/search_results_searchpage.yml:
products:
    css: 'div[data-content="productItem"]'
    multiple: true
    type: Text
    children:
        link:
            css: a.elements-title-normal
            type: Link
crawler.py:
import scrapy
import csv
# from scrapy_selenium import SeleniumRequest  # only needed when using selenium
import os
from selectorlib import Extractor

class Spider(scrapy.Spider):
    name = 'alibaba_crawler'
    allowed_domains = ['alibaba.com']
    start_urls = ['http://alibaba.com/']
    link_extractor = Extractor.from_yaml_file(os.path.join(os.path.dirname(__file__), "../resources/search_results_searchpage.yml"))

    def start_requests(self):
        search_text = "Headphones"
        url = "https://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&SearchText={0}&viewtype=G".format(search_text)
        yield scrapy.Request(url, callback=self.parse, meta={"search_text": search_text})

    def parse(self, response):
        data = self.link_extractor.extract(response.text, base_url=response.url)
        for product in data['products']:
            parsed_url = product["link"]
            yield scrapy.Request(parsed_url, callback=self.crawl_mainpage)
            # yield SeleniumRequest(url=parsed_url, callback=self.crawl_mainpage)

    def crawl_mainpage(self, response):
        yield {
            'name': response.xpath("//h1[@class='module-pdp-title']/text()").extract(),
            'Year of Establishment': response.xpath("//td[contains(text(), 'Year Established')]/following-sibling::td/div/div/div/text()").extract()
        }
Does anybody have an idea what I could do to populate Year of Establishment?
I tried to use scrapy_selenium and configured it correctly, because I suspect that the object is generated dynamically, but still no luck, or I am possibly using it wrong.
Run with:
scrapy crawl alibaba_crawler -o out.csv -t csv
Your XPath selector is not correct. Try this:
'Year of Est.': response.xpath("//td[contains(text(), 'Year Established')]/following-sibling::td/div/div/div/text()").extract()
I also note some errors in your code, such as the line below, which will raise an error. You may want to recheck how you extract links from the search page.
data = self.link_extractor.extract(response.text, base_url=response.url)
Edit:
The year of establishment is loaded once the company tab is clicked. You have to simulate the click using Selenium or scrapy-playwright. My simple implementation using scrapy-playwright is below.
import scrapy
from scrapy.crawler import CrawlerProcess
import os
from selectorlib import Extractor
from scrapy_playwright.page import PageCoroutine

class Spider(scrapy.Spider):
    name = 'alibaba_crawler'
    allowed_domains = ['alibaba.com']
    start_urls = ['http://alibaba.com/']
    link_extractor = Extractor.from_yaml_file(os.path.join(os.path.dirname(__file__), "../resources/search_results_searchpage.yml"))

    def start_requests(self):
        search_text = "Headphones"
        url = "https://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&SearchText={0}&viewtype=G".format(
            search_text)
        yield scrapy.Request(url, callback=self.parse, meta={"search_text": search_text})

    def parse(self, response):
        data = self.link_extractor.extract(
            response.text, base_url=response.url)
        for product in data['products']:
            parsed_url = product["link"]
            yield scrapy.Request(parsed_url, callback=self.crawl_mainpage, meta={
                "playwright": True,
                "playwright_page_coroutines": {
                    "click": PageCoroutine("click", selector="//span[@title='Company Profile']"),
                },
            })

    def crawl_mainpage(self, response):
        yield {
            'name': response.xpath("//h1[@class='module-pdp-title']/text()").extract(),
            'Year of Establishment': response.xpath("//td[contains(text(), 'Year Established')]/following-sibling::td/div/div/div/text()").extract()
        }

if __name__ == "__main__":
    process = CrawlerProcess(settings={
        'DOWNLOAD_HANDLERS': {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    })
    process.crawl(Spider)
    process.start()
Below is a sample log from running the scraper with python crawler.py; the year 2010 is shown in the output.
I am using Scrapy to extract data line by line using a spider, but I'm having an issue: when yielding the result, it saves the list brackets in the cell.
Here is my spider:
from scrapy.spiders import Spider

class TestCCodeSpider(Spider):
    name = 'test_c_code'
    start_urls = ['http://github.com/gouravthakur39/beginners-C-program-examples/blob/master/AllTempScalesConv.c/']
    custom_settings = {'FEED_URI': "test_ c3.csv",
                       'FEED_FORMAT': 'csv'}

    def parse(self, response):
        ids = response.xpath("//table[@class='highlight tab-size js-file-line-container']/tr/td/@data-line-number").extract()
        for i in ids:
            yield {
                'extract': response.xpath("string(//td[@id='LC%s'])" % i).extract()
            }
and the result is a CSV where each cell contains a bracketed list.
I also tried this:
from scrapy.spiders import Spider

class TestCCodeSpider(Spider):
    name = 'test_c_code'
    start_urls = ['http://github.com/gouravthakur39/beginners-C-program-examples/blob/master/AllTempScalesConv.c/']
    custom_settings = {'FEED_URI': "test_ c4.csv",
                       'FEED_FORMAT': 'csv'}

    def parse(self, response):
        ids = response.xpath("//table[@class='highlight tab-size js-file-line-container']/tr/td/@data-line-number").extract()
        code = []
        for i in ids:
            result = response.xpath("string(//td[@id='LC%s'])" % i)
            code.append(result.extract())
        yield {'extract': code}
But it gives an error. The required result is each line's text as a plain string, without the list brackets.
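For reference, the brackets appear because extract() always returns a list of strings, and the list's repr ends up in the CSV cell. A minimal fix (a sketch of the first spider above) is to take a single string with get(), or join the list before yielding:

def parse(self, response):
    ids = response.xpath("//table[@class='highlight tab-size js-file-line-container']/tr/td/@data-line-number").extract()
    for i in ids:
        # get() returns one string (or None) instead of a list, so no brackets reach the CSV
        yield {'extract': response.xpath("string(//td[@id='LC%s'])" % i).get()}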
scrapy crawl raamatuvahetus -o raamatuvahetus.csv outputs an empty CSV file, and I have no idea why. All other Scrapy files generated by scrapy startproject are untouched, and all settings are left at their defaults.
import scrapy
from scrapy.exceptions import CloseSpider

class RaamatuvahetusSpider(scrapy.Spider):
    name = 'raamatuvahetus'
    start_urls = ['https://www.raamatuvahetus.ee/et/bookwished.wishall?limit=200']

    def parse(self, response):
        for href in response.xpath("//a[@class='b-info']/@href"):
            yield response.follow(href, callback=self.parse_book)

    def parse_book(self, response):
        wishings = response.xpath("//img[@class='uimg']")
        wishings_count = 0
        if wishings:
            wishings_count = len(wishings)
        if wishings_count < 15:
            raise CloseSpider('Wishings fewer than 15.')
        title = response.xpath("//article[@class='text']/h1/text()").extract_first()
        author = response.xpath("//div[@class='author']/a/text()").extract_first()
        year = response.xpath("//div[@class='year']/text()").extract_first()
        yield
        {
            "Pealkiri": title,
            "Autor": author,
            "Aasta": year,
            "Soovid": wishings_count
        }
Edit:
Solved! Heed, all travelers who encounter a similar complication: fret not! I have the answer you seek.
Instead of

yield
{
}

write

yield {
}
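The reason: with the brace on the next line, Python parses the bare yield as yielding None and the dict literal below it as a separate expression statement that is evaluated and thrown away, so no usable item ever reaches the exporter. A quick standalone check:

def gen():
    yield
    {
        "Pealkiri": "title"
    }

print(list(gen()))  # prints [None]: the dict is a separate, discarded expression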
This is Scrapy code, and I want to scrape data from mouthshut.com; the target markup includes a strong tag in between. I am able to run it and the title field comes through, but the values are blank. Why isn't it extracting any data?
import scrapy
from scrapy.selector import Selector
from shut.items import ShutItem

class criticspider(scrapy.Spider):
    name = "shut"
    allowed_domains = ["mouthshut.com"]
    start_urls = ["http://www.mouthshut.com/mobile-operators/vodafone-mobile-operator-reviews-925020930"]

    def parse(self, response):
        hxs = Selector(response)
        sites = hxs.select('//li[@class="profile"]')
        items = []
        for site in sites:
            item = ShutItem()
            item['title'] = site.select('//strong[@style=" font-size: 15px;font-weight: 700;"]//a/text()').extract()
            # item['date'] = site.select('div[@class="review_stats"]//div[@class="date"]/text()').extract()
            # item['desc'] = site.select('div[@class="review_body"]//span[@class="blurb blurb_expanded"]/text()').extract()
            items.append(item)
        return items
You should use a pipeline to extract data from your spider! Here is a sample that exports data to JSON files:
pipelines.py
# -*- coding: utf-8 -*-
# python import
from scrapy import signals, log
from scrapy.contrib.exporter import JsonItemExporter
from datetime import datetime
import os

# project import
from items import tgju
from pymongo import MongoClient

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

def get_items(module):
    md = module.__dict__
    return (str(md[c].__name__) for c in md if (isinstance(md[c], type) and md[c].__module__ == module.__name__))

class JsonPipeline(object):
    def __init__(self):
        self.files = dict()
        self.exporter = dict()

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        for key in get_items(tgju):
            path = os.path.join('temp', key)
            if not os.path.exists(path):
                os.makedirs(path)
            self.files[key] = open(os.path.join(path,
                                                '%s_%s_%s.json' % (spider.name,
                                                                   key.lower(),
                                                                   datetime.now().strftime('%Y%m%dT%H%M%S'))),
                                   'w+b')
            self.exporter[key] = JsonItemExporter(self.files[key])
            self.exporter[key].start_exporting()

    def spider_closed(self, spider):
        for key in get_items(tgju):
            self.exporter[key].finish_exporting()
            self.files.pop(key).close()

    def process_item(self, item, spider):
        try:
            log.msg('-----------------%s------------------' % item.__class__.__name__)
            self.exporter[item.__class__.__name__].export_item(item)
        except KeyError:
            pass
        return item
Add these lines to your settings file:
ITEM_PIPELINES = {
    'pipelines.JsonPipeline': 800,
}
And try yielding each item instead of returning a list.
Update:
Also change your spider to this one...
import scrapy
from scrapy.selector import Selector
from shut.items import ShutItem

class criticspider(scrapy.Spider):
    name = "shut"
    allowed_domains = ["mouthshut.com"]
    start_urls = ["http://www.mouthshut.com/mobile-operators/vodafone-mobile-operator-reviews-925020930"]

    def parse(self, response):
        hxs = Selector(response)
        sites = hxs.select('//li[@class="profile"]')
        for site in sites:
            item = ShutItem()
            item['title'] = site.select('//strong[@style=" font-size: 15px;font-weight: 700;"]//a/text()').extract()
            # item['date'] = site.select('div[@class="review_stats"]//div[@class="date"]/text()').extract()
            # item['desc'] = site.select('div[@class="review_body"]//span[@class="blurb blurb_expanded"]/text()').extract()
            yield item
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    sites = hxs.select('//div[@class="reviewtitle fl"]')
    for site in sites:
        item = ShutItem()
        item['title'] = site.select('//strong[@style=" font-size: 15px;font-weight: 700;"]/a/text()').extract()
        # item['date'] = site.select('div[@class="review_stats"]//div[@class="date"]/text()').extract()
        # item['desc'] = site.select('div[@class="review_body"]//span[@class="blurb blurb_expanded"]/text()').extract()
        yield item
This works well:
2015-01-21 19:06:33+0800 [shut] DEBUG: Scraped from <200 http://www.mouthshut.com/mobile-operators/vodafone-mobile-operator-reviews-925020930>
{'title': [u'Vodafone 3G - Useless in Bangalore',
u'Worst Mobile Operator Ever',
u'Worst 3g connectivity of vodafone in bangalore',
u'Pathetic Network 3G',
u'HOW DO THEY STILL DO BUSINESS WITH SUCH SERVICES!!',
u'Bad customer service',
u'Vodafone Kolkata \u2013 My worst ever experience.',
u'Network connectivity - permanent nemesis',
u'VODAFONE MOBILE OPERATOR',
u'Beware of Vodafone billing plans',
u'Vodafone changed my billing plan without my notice',
u'Pathetic service. They deduct balance unnecessari',
u'Worst service from Vodafone',
u'Forget Vodafone',
u'Vodafone Data Services sucks',
u'Outgoing calls has been barred',
u'Vodafone Sucks',
u'Worst Customer satisfaction I have ever Faced',
u'Untrained Customer Care... Seems like headline de',
u'3rd Party downloads - shameless way to make money!']}
Here you should know:
1. yield is much better than building up a list in Scrapy.
2. The li node is not the parent of the strong element.
3. The value of the strong element's style attribute contains some extra whitespace.
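Given points 2 and 3, a more forgiving selector (a sketch, assuming the relevant strong keeps 'font-weight: 700' somewhere in its style attribute) sidesteps the whitespace-sensitive exact match:

# relative to the div[@class="reviewtitle fl"] node, tolerant of whitespace in the style value
item['title'] = site.select('.//strong[contains(@style, "font-weight: 700")]/a/text()').extract()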