Scraping listed HTML values using scrapy - python

I can't seem to figure out how to construct this XPath selector. I have even tried using following-sibling::text(), but to no avail. I have also browsed Stack Overflow questions about scraping listed values but could not implement them correctly. I keep getting blank results. Any and all help would be appreciated. Thank you.
The website is https://www.unegui.mn/adv/5737502_10-r-khoroolold-1-oroo/.
Expected Results:
Wood
2015
Current Results:
blank
Current XPath Scrapy code:
list_li = response.xpath(".//ul[contains(#class, 'chars-column')]/li/text()").extract()
list_li = response.xpath("./ul[contains(#class,'value-chars')]//text()").extract()
floor_type = list_li[0].strip()
commission_year = list_li[1].strip()
HTML Snippet:
<div class="announcement-characteristics clearfix">
<ul class="chars-column">
<li class="">
<span class="key-chars">Flooring:</span>
<span class="value-chars">Wood</span></li>
<li class="">
<span class="key-chars">Commission year:</span>
2015
</li>
</ul>
</div>
FURTHER CLARIFICATION:
I previously used two selectors (one for the span list, one for the a list), but the problem was that some pages on the website don't follow the same span/a order (i.e. on one page a table value would be in the span list, but on another page it would be in the a list). That is why I have been trying to use only one selector to get all the values.
This produces mismatched values: instead of the number of windows (an integer) being scraped, the address gets scraped, because on some pages the table value is under the a list rather than the span list.
Previous 2 selectors:
list_span = response.xpath(".//span[contains(@class,'value-chars')]//text()").extract()
list_a = response.xpath(".//a[contains(@class,'value-chars')]//text()").extract()
Whole code (in case someone needs it for testing):
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from datetime import datetime
from scrapy.crawler import CrawlerProcess
from selenium import webdriver

dt_today = datetime.now().strftime('%Y%m%d')
filename = dt_today + ' UB HPI Buying Data'

# create Spider class
class UneguiApartmentsSpider(scrapy.Spider):
    name = "unegui_apts"
    allowed_domains = ["www.unegui.mn"]
    custom_settings = {
        "FEEDS": {
            f'{filename}.csv': {
                'format': 'csv',
                'overwrite': True}}
    }

    # function used for start url
    def start_requests(self):
        urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/']
        for url in urls:
            yield Request(url, self.parse)

    def parse(self, response, **kwargs):
        cards = response.xpath("//li[contains(@class,'announcement-container')]")
        # parse details
        for card in cards:
            name = card.xpath(".//a[@itemprop='name']/@content").extract_first().strip()
            price = card.xpath(".//*[@itemprop='price']/@content").extract_first().strip()
            rooms = card.xpath("normalize-space(.//div[contains(@class,'announcement-block__breadcrumbs')]/span[2]/text())").extract_first().strip()
            link = card.xpath(".//a[@itemprop='url']/@href").extract_first().strip()
            date_block = card.xpath("normalize-space(.//div[contains(@class,'announcement-block__date')]/text())").extract_first().split(',')
            date = date_block[0].strip()
            city = date_block[1].strip()
            item = {'name': name,
                    'date': date,
                    'rooms': rooms,
                    'price': price,
                    'city': city,
                    }
            # follow absolute link to scrape deeper level
            yield response.follow(link, callback=self.parse_item, meta={'item': item})

        # handling pagination
        next_page = response.xpath("//a[contains(@class,'number-list-next js-page-filter number-list-line')]/@href").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
            print(f'Scraped {next_page}')

    def parse_item(self, response):
        # retrieve previously scraped item between callbacks
        item = response.meta['item']
        # parse additional details
        list_li = response.xpath(".//*[contains(@class, 'value-chars')]/text()").extract()
        # get additional details from list of <span> tags, element by element
        floor_type = list_li[0].strip()
        num_balcony = list_li[1].strip()
        commission_year = list_li[2].strip()
        garage = list_li[3].strip()
        window_type = list_li[4].strip()
        num_floors = list_li[5].strip()
        door_type = list_li[6].strip()
        area_sqm = list_li[7].strip()
        floor = list_li[8].strip()
        leasing = list_li[9].strip()
        district = list_li[10].strip()
        num_window = list_li[11].strip()
        address = list_li[12].strip()

        #list_span = response.xpath(".//span[contains(@class,'value-chars')]//text()").extract()
        #list_a = response.xpath(".//a[contains(@class,'value-chars')]//text()").extract()
        # get additional details from list of <span> tags, element by element
        #floor_type = list_span[0].strip()
        #num_balcony = list_span[1].strip()
        #garage = list_span[2].strip()
        #window_type = list_span[3].strip()
        #door_type = list_span[4].strip()
        #num_window = list_span[5].strip()
        # get additional details from list of <a> tags, element by element
        #commission_year = list_a[0].strip()
        #num_floors = list_a[1].strip()
        #area_sqm = list_a[2].strip()
        #floor = list_a[3].strip()
        #leasing = list_a[4].strip()
        #district = list_a[5].strip()
        #address = list_a[6].strip()

        # update item with newly parsed data
        item.update({
            'district': district,
            'address': address,
            'area_sqm': area_sqm,
            'floor': floor,
            'commission_year': commission_year,
            'num_floors': num_floors,
            'num_windows': num_window,
            'num_balcony': num_balcony,
            'floor_type': floor_type,
            'window_type': window_type,
            'door_type': door_type,
            'garage': garage,
            'leasing': leasing
        })
        yield item

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse_item2(self, response):
        self.driver.get(response.url)
        while True:
            next = self.driver.find_element_by_xpath(".//span[contains(@class,'phone-author__title')]//text()")
            try:
                next.click()
                # get the data and write it to scrapy items
            except:
                break
        self.driver.close()

# main driver
if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(UneguiApartmentsSpider)
    process.start()

You need two selectors: one will parse keys and the other will parse values. This will result in two lists that can be zipped together to give you the results you are looking for.
The CSS selectors could be:
Keys selector --> .chars-column li .key-chars
Values selector --> .chars-column li .value-chars
Once you extract both lists, you can zip them and consume them as key/value pairs.
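For example, a rough sketch of that idea inside parse_item (untested; it assumes every value sits in an element carrying the value-chars class, and the label strings are taken from the HTML snippet in the question):

def parse_item(self, response):
    item = response.meta['item']
    # Two parallel lists: labels and values, paired up positionally.
    keys = response.css('.chars-column li .key-chars::text').getall()
    values = response.css('.chars-column li .value-chars::text').getall()
    chars = {k.strip().rstrip(':'): v.strip() for k, v in zip(keys, values)}
    # Look values up by label instead of by position, so the order of the
    # <span>/<a> elements on a given ad no longer matters.
    item['floor_type'] = chars.get('Flooring')              # e.g. 'Wood'
    item['commission_year'] = chars.get('Commission year')  # e.g. '2015'
    yield item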

I suppose that because of invalid HTML (some span elements are not closed), normal XPaths are not possible.
This did give me results:
".//*[contains(@class,'value-chars')]"
The * means any element, so it will select both
<span class="value-chars">Wood</span>
and the element that contains
2015

Use this XPath to get Wood:
//*[@class="chars-column"]//span[2]//text()
Use this XPath to get 2015:
//*[@class="chars-column"]//a[text()="2015"]

Related

How to paginate and parse multiple pages concurrently on Scrapy

I am trying to web-scrape multiple pages from a real estate website. I have been successful in scraping the first page of my URL, but I am unable to handle pagination. I have attempted to find a class tag containing 'red' and identify its next sibling; I believe this will get the next page response, and so on, over and over. I have also read some examples where people wrote their code so it can parse multiple pages at the same time.
Is it possible to do parallel/concurrent parsing? I want to be able to parse 90 pages as fast as possible, but I don't know how to implement it. Any and all help is greatly appreciated. Thank you.
PROGRESS UPDATE 1:
I figured out why my UTF-8 CSV displays Cyrillic characters correctly in my PyCharm IDE but shows ?? placeholders when I open it in Excel. I have been able to bypass this issue by importing the CSV file through Excel's Data > From Text/CSV.
PROGRESS UPDATE 2: I understand I could implement a for loop in my start_requests function and loop over pages (1, 90) or even (1, 120), but that is not what I want, and it would make my code parse page by page rather than concurrently.
HTML Snippet:
<ul class="number-list">
<li>
1
</li>
<li>
2
</li>
<li>
3
</li>
<li><span class="page-number">...</span></li>
<li>
89
</li>
<li>
90
</li>
<div class="clear"></div>
</ul>
Pagination Snippet:
# handling pagination
next_page = response.xpath("//a[contains(#class,'red')]/parent::li/following-sibling::li/a/#href").extract_first()
if next_page:
yield response.follow(next_page, callback=self.parse)
Full Code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
import unicodecsv as csv
from datetime import datetime
from scrapy.crawler import CrawlerProcess

dt_today = datetime.now().strftime('%Y%m%d')
file_name = dt_today + ' HPI Data'

# Create Spider class
class UneguiApartments(scrapy.Spider):
    name = "unegui_apts"
    allowed_domains = ["www.unegui.mn"]
    custom_settings = {"FEEDS": {f'{file_name}.csv': {'format': 'csv'}}
                       }

    def start_requests(self):
        urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/']
        for url in urls:
            yield Request(url, self.parse)

    def parse(self, response, **kwargs):
        cards = response.xpath("//li[contains(@class,'announcement-container')]")
        # parse details
        for card in cards:
            name = card.xpath(".//a[@itemprop='name']/@content").extract_first()
            price = card.xpath(".//*[@itemprop='price']/@content").extract_first()
            rooms = card.xpath(".//div[contains(@class,'announcement-block__breadcrumbs')]/text()").extract_first().split('»')[0].strip()
            link = card.xpath(".//a[@itemprop='url']/@href").extract_first()
            date_block = card.xpath("normalize-space(.//div[contains(@class,'announcement-block__date')]/text())").extract_first().split(',')
            date = date_block[0].strip()
            city = date_block[1].strip()
            item = {'name': name,
                    'date': date,
                    'rooms': rooms,
                    'price': price,
                    'city': city,
                    }
            # follow absolute link to scrape deeper level
            yield response.follow(link, callback=self.parse_item, meta={'item': item})

    def parse_item(self, response):
        # retrieve previously scraped item between callbacks
        item = response.meta['item']
        # parse additional details
        list_span = response.xpath(".//span[contains(@class,'value-chars')]//text()").extract()
        list_a = response.xpath(".//a[contains(@class, 'value-chars')]//text()").extract()
        # get additional details from list of <span> tags, element by element
        floor_type = list_span[0].strip()
        num_balcony = list_span[1].strip()
        garage = list_span[2].strip()
        window_type = list_span[3].strip()
        door_type = list_span[4].strip()
        num_window = list_span[5].strip()
        # get additional details from list of <a> tags, element by element
        commission_year = list_a[0].strip()
        num_floors = list_a[1].strip()
        area_sqm = list_a[2].strip()
        floor = list_a[3].strip()
        leasing = list_a[4].strip()
        district = list_a[5].strip()
        address = list_a[6].strip()
        # update item with newly parsed data
        item.update({
            'district': district,
            'address': address,
            'area_sqm': area_sqm,
            'floor': floor,
            'commission_year': commission_year,
            'num_floors': num_floors,
            'num_windows': num_window,
            'num_balcony': num_balcony,
            'floor_type': floor_type,
            'window_type': window_type,
            'door_type': door_type,
            'garage': garage,
            'leasing': leasing
        })
        yield item
        # handling pagination
        next_page = response.xpath("//a[contains(@class,'red')]/parent::li/following-sibling::li/a/@href").extract_first()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

# main driver
if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(UneguiApartments)
    process.start()
If I understand you correctly, you need to move the 'next page' logic into the parse function. I also just take the 'next page' button value and follow it.
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
import unicodecsv as csv
from datetime import datetime
from scrapy.crawler import CrawlerProcess

dt_today = datetime.now().strftime('%Y%m%d')
file_name = dt_today + ' HPI Data'

# Create Spider class
class UneguiApartments(scrapy.Spider):
    name = "unegui_apts"
    allowed_domains = ["www.unegui.mn"]
    custom_settings = {"FEEDS": {f'{file_name}.csv': {'format': 'csv'}}
                       }

    def start_requests(self):
        urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/']
        for url in urls:
            yield Request(url, self.parse)

    def parse(self, response, **kwargs):
        cards = response.xpath("//li[contains(@class,'announcement-container')]")
        # parse details
        for card in cards:
            name = card.xpath(".//a[@itemprop='name']/@content").extract_first()
            price = card.xpath(".//*[@itemprop='price']/@content").extract_first()
            rooms = card.xpath(".//div[contains(@class,'announcement-block__breadcrumbs')]/text()").extract_first().split('»')[0].strip()
            link = card.xpath(".//a[@itemprop='url']/@href").extract_first()
            date_block = card.xpath("normalize-space(.//div[contains(@class,'announcement-block__date')]/text())").extract_first().split(',')
            date = date_block[0].strip()
            city = date_block[1].strip()
            item = {'name': name,
                    'date': date,
                    'rooms': rooms,
                    'price': price,
                    'city': city,
                    }
            # follow absolute link to scrape deeper level
            yield response.follow(link, callback=self.parse_item, meta={'item': item})

        # handling pagination
        next_page = response.xpath('//a[contains(@class, "number-list-next js-page-filter number-list-line")]/@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_item(self, response):
        # retrieve previously scraped item between callbacks
        item = response.meta['item']
        # parse additional details
        list_span = response.xpath(".//span[contains(@class,'value-chars')]//text()").extract()
        list_a = response.xpath(".//a[contains(@class, 'value-chars')]//text()").extract()
        # get additional details from list of <span> tags, element by element
        floor_type = list_span[0].strip()
        num_balcony = list_span[1].strip()
        garage = list_span[2].strip()
        window_type = list_span[3].strip()
        door_type = list_span[4].strip()
        num_window = list_span[5].strip()
        # get additional details from list of <a> tags, element by element
        commission_year = list_a[0].strip()
        num_floors = list_a[1].strip()
        area_sqm = list_a[2].strip()
        floor = list_a[3].strip()
        leasing = list_a[4].strip()
        district = list_a[5].strip()
        address = list_a[6].strip()
        # update item with newly parsed data
        item.update({
            'district': district,
            'address': address,
            'area_sqm': area_sqm,
            'floor': floor,
            'commission_year': commission_year,
            'num_floors': num_floors,
            'num_windows': num_window,
            'num_balcony': num_balcony,
            'floor_type': floor_type,
            'window_type': window_type,
            'door_type': door_type,
            'garage': garage,
            'leasing': leasing
        })
        yield item

# main driver
if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(UneguiApartments)
    process.start()
This should work.
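On the concurrency part of the question: Scrapy already downloads the scheduled requests in parallel, so following the next page from parse does not make the crawl strictly page-by-page. If you want more parallelism you can raise the concurrency settings; a small sketch with illustrative values (not from the original answer):

custom_settings = {
    "FEEDS": {f'{file_name}.csv': {'format': 'csv'}},
    # Scrapy's defaults are 16 total / 8 per domain; raising them lets more
    # listing and detail pages be fetched at the same time. Be polite to the site.
    "CONCURRENT_REQUESTS": 32,
    "CONCURRENT_REQUESTS_PER_DOMAIN": 16,
}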

Unable to go next page

Trying to scrape the internet archive website (Wayback Machine): https://web.archive.org/web/20150906222155mp_/https://www.zalando.co.uk/womens-clothing/.
I am successful in scraping the first page's content, but can't move to the next page. I have tried multiple XPaths to move to the next pages:
# 1
next_page_url = response.xpath("//li[a[contains(.,'>')]]//#href").extract_first() # does not work
# 2
next_page_url = response.xpath(//a[@class='catalogPagination_page' and text() ='>'])[1]//@href).get()  # does not work
I have tried converting to an absolute URL (and without), but again with no luck.
Can anyone help with a new XPath or CSS selector so that I can finally scrape the next pages?
Below you can see my full code:
# -*- coding: utf-8 -*-
import scrapy

class ZalandoWomenSpider(scrapy.Spider):
    name = 'zalando_women_historic_2015'
    allowed_domains = ['www.web.archive.org']
    start_urls = ['https://web.archive.org/web/20150906222155mp_/https://www.zalando.co.uk/womens-clothing/']

    def parse(self, response):
        products = response.xpath("//a[@class='catalogArticlesList_productBox']")
        for product in products:
            link = product.xpath(".//@href").get()
            absolute_url = f"https://web.archive.org{link}"
            yield scrapy.Request(url=absolute_url, callback=self.parse_product, dont_filter=True, meta={'link': link})

        # process next page
        next_page_url = response.xpath("//li[a[contains(.,'>')]]//@href").extract_first()  # (//a[@class='catalogPagination_page' and text() ='>'])[1]//@href
        absolute_next_page_url = f"https://web.archive.org{next_page_url}"
        # absolute_next_page_url = next_page_url
        # absolute_next_page_url = response.urljoin(next_page_url)
        if next_page_url:
            yield scrapy.Request(url=absolute_next_page_url, callback=self.parse)

    def parse_product(self, response):
        link = response.request.meta['link']
        brand = response.xpath("//span[@itemprop='brand']/text()").get()
        price = response.xpath("//span[@class='price oldPrice nowrap']/text()").get()
        price1 = response.xpath("//span[@itemprop='price']/text()").get()
        price2 = response.xpath("//div[@class='boxPrice']//span[contains(@class,'price')]/text()").get()
        disc_price = response.xpath("//span[@class='price specialPrice nowrap']/text()").get()
        product_type = response.xpath("//span[@itemprop='name']/text()").get()
        material = response.xpath("//div[@class='content']//li[contains(.,'material')]/text()").get()
        yield {
            'brand_name': brand,
            'product_price': price,
            'product_price1': price1,
            'product_price2': price2,
            'product_price_b4_disc': disc_price,
            'link': link,
            'product_type': product_type,
            'material': material}
next_page_url = response.xpath(".//a[@class='catalogPagination_page' and text() ='>']/@href").get()
This will get: '/web/20150906222155/https://www.zalando.co.uk/womens-clothing/?p=2'
You can then use split("/") to remove the "/web/201509..." bit.
Note 1: I used the " " quotes inside the parentheses.
Note 2: in Scrapy you can also use "response.follow" to save having to join a relative URL to a base URL.
Check this post as well:
Scrapy response.follow query
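Following up on Note 2, a minimal sketch of the response.follow variant (same selector as above; response.follow resolves the relative href against response.url, so no manual joining with https://web.archive.org is needed):

next_page_url = response.xpath(".//a[@class='catalogPagination_page' and text()='>']/@href").get()
if next_page_url:
    # follow() accepts a relative URL and keeps crawling with the same callback
    yield response.follow(next_page_url, callback=self.parse)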

Scrapy and ajax requests to get hidden elements

I'm getting started with Scrapy, and there is a website I'm trying to get data from, specifically the phone number element, which is inside a div element that has an id. I noticed that if I send a request to this page I can get it:
https://www.otomoto.pl/ajax/misc/contact/multi_phone/6CLxXv/0
So basically the base URL would be https://www.otomoto.pl/ajax/misc/contact/multi_phone/ID/0/
and 6CLxXv is the ID for this example.
How do I scrape all the div elements, concatenate them with the base URL, and then retrieve the phone number element?
Here is the code used :
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose, Compose
from otomoto.items import OtomotoItem
from inline_requests import inline_requests  # needed for the @inline_requests decorator below

def filter_out_array(x):
    x = x.strip()
    return None if x == '' else x

def remove_spaces(x):
    return x.replace(' ', '')

def convert_to_integer(x):
    return int(x)

class OtomotoCarLoader(ItemLoader):
    default_output_processor = TakeFirst()
    features_out = MapCompose(filter_out_array)
    price_out = Compose(TakeFirst(), remove_spaces, convert_to_integer)

class OtomotoSpider(scrapy.Spider):
    name = 'otomoto'
    start_urls = ['https://www.otomoto.pl/osobowe/']

    def parse(self, response):
        for car_page in response.css('.offer-title__link::attr(href)'):
            yield response.follow(car_page, self.parse_car_page)
        for next_page in response.css('.next.abs a::attr(href)'):
            yield response.follow(next_page, self.parse)

    @inline_requests
    def parse_car_page(self, response):
        property_list_map = {
            'Marka pojazdu': 'brand',
            'Model pojazdu': 'model',
            'Rok produkcji': 'year',
        }
        contact_response = yield scrapy.Request(url_number)  # how do I get the specific phone number url?
        number = ...  # parse the response here? then load it into the loader
        loader = OtomotoCarLoader(OtomotoItem(), response=response)
        for params in response.css('.offer-params__item'):
            property_name = params.css('.offer-params__label::text').extract_first().strip()
            if property_name in property_list_map:
                css = params.css('.offer-params__value::text').extract_first().strip()
                if css == '':
                    css = params.css('a::text').extract_first().strip()
                loader.add_value(property_list_map[property_name], css)
        loader.add_css('price', '.offer-price__number::text')
        loader.add_css('price_currency', '.offer-price__currency::text')
        loader.add_css('features', '.offer-features__item::text')
        loader.add_value('url', response.url)
        loader.add('phone number', number)  # here I want to add the phone number to the rest of the elements
        yield loader.load_item()
Note: I was able to find the link "https://www.otomoto.pl/ajax/misc/contact/multi_phone/6CLxXv/0" by checking the page's XHR requests.
Take a look into XPath: https://docs.scrapy.org/en/0.9/topics/selectors.html. There you should find feasible ways to select the distinct elements you need, e.g. selecting all the div children of a parent div whose id attribute is 'a': "//div[@id='a']/div/".
This way you can put your results into a list. The rest, extracting the IDs from the list and building the request string, is simple string concatenation.
The same goes for scraping the IDs: find unique indicators so you can make sure those are the elements you need (e.g. the following content). Is the ID you need different from other IDs on the page which you don't need?
for idx in collected_list:
    url = 'https.com/a/b/' + idx + '/0'
EDIT:
I see. Your code is quite advanced. I could get more into it if I had the full code, but from what I can see you use this HTML element:
<a href="" class="spoiler seller-phones__button" data-path="multi_phone" data-id="6D5zmw" data-id_raw="6074401671" title="Kontakt Rafał" data-test="view-seller-phone-1-button" data-index="0" data-type="bottom">
<span class="icon-phone2 seller-phones__icon"></span>
<span data-test="seller-phone-2" class="phone-number seller-phones__number">694 *** ***</span>
<span class="separator">-</span>
<span class="spoilerAction">Wyświetl numer</span>
</a>
The data-id is what you need to extract, because it is the ID you are looking for, and you can simply apply it to:
new_request_url = "https://www.otomoto.pl/ajax/misc/contact/multi_phone/" + id + "/0/"

Scrapy not scraping if one item missing

I built my first Scrapy spider over several hours during the last two days, but I am stuck right now. The main purpose I want to achieve is to extract all the data so I can later filter it in the CSV. Now, the data that is really crucial for me (companies without(!) webpages) is dropped, because Scrapy can't find the XPath I provided when an item has no homepage. I tried an if statement here, but it's not working.
Example website: https://www.achern.de/de/Wirtschaft/Unternehmen-A-Z/Unternehmen?view=publish&item=company&id=1345
I use the XPath selector: response.xpath("//div[@class='cCore_contactInformationBlockWithIcon cCore_wwwIcon']/a/@href").extract()
Example non-website: https://www.achern.de/de/Wirtschaft/Unternehmen-A-Z/Unternehmen?view=publish&item=company&id=1512
Spider Code:
# -*- coding: utf-8 -*-
import scrapy

class AchernSpider(scrapy.Spider):
    name = 'achern'
    allowed_domains = ['www.achern.de']
    start_urls = ['https://www.achern.de/de/Wirtschaft/Unternehmen-A-Z/']

    def parse(self, response):
        for href in response.xpath("//ul[@class='cCore_list cCore_customList']/li[*][*]/a/@href"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.scrape)

    def scrape(self, response):
        # Extracting the content using css selectors
        print("Processing:" + response.url)
        firma = response.css('div>#cMpu_publish_company>h2.cCore_headline::text').extract()
        anschrift = response.xpath("//div[contains(@class,'cCore_addressBlock_address')]/text()").extract()
        tel = response.xpath("//div[@class='cCore_contactInformationBlockWithIcon cCore_phoneIcon']/text()").extract()
        mail = response.xpath(".//div[@class='cCore_contactInformationBlock']//*[contains(text(), '@')]/text()").extract()
        web1 = response.xpath("//div[@class='cCore_contactInformationBlockWithIcon cCore_wwwIcon']/a/@href").extract()
        if "http:" not in web1:
            web = "na"
        else:
            web = web1
        row_data = zip(firma, anschrift, tel, mail, web1)  # web1 must be changed to web, but then it only gives out "n" for every link
        # Give the extracted content row wise
        for item in row_data:
            # create a dictionary to store the scraped info
            scraped_info = {
                'Firma': item[0],
                'Anschrift': item[1] + ' 77855 Achern',
                'Telefon': item[2],
                'Mail': item[3],
                'Web': item[4],
            }
            # yield or give the scraped info to scrapy
            yield scraped_info
So overall it should export the DROPPED items even if "web" is not there.
Hope someone can help. Greetings, S
Using
response.css(".cCore_wwwIcon > a::attr(href)").get()
gives you None or the website address; you can then use or to provide a default:
website = response.css(".cCore_wwwIcon > a::attr(href)").get() or 'na'
Also, I refactored your scraper to use css selectors. Note that I've used .get() instead of .extract() to get a single item, not a list, which cleans up the code quite a bit.
import scrapy
from scrapy.crawler import CrawlerProcess

class AchernSpider(scrapy.Spider):
    name = 'achern'
    allowed_domains = ['www.achern.de']
    start_urls = ['https://www.achern.de/de/Wirtschaft/Unternehmen-A-Z/']

    def parse(self, response):
        for url in response.css("[class*=cCore_listRow] > a::attr(href)").extract():
            yield scrapy.Request(url, callback=self.scrape)

    def scrape(self, response):
        # Extracting the content using css selectors
        firma = response.css('.cCore_headline::text').get()
        anschrift = response.css('.cCore_addressBlock_address::text').get()
        tel = response.css(".cCore_phoneIcon::text").get()
        mail = response.css("[href^=mailto]::attr(href)").get().replace('mailto:', '')
        website = response.css(".cCore_wwwIcon > a::attr(href)").get() or 'na'
        scraped_info = {
            'Firma': firma,
            'Anschrift': anschrift + ' 77855 Achern',
            'Telefon': tel,
            'Mail': mail,
            'Web': website,
        }
        yield scraped_info

if __name__ == "__main__":
    p = CrawlerProcess()
    p.crawl(AchernSpider)
    p.start()
output:
with website:
{'Firma': 'Wölfinger Fahrschule GmbH', 'Anschrift': 'Güterhallenstraße 8 77855 Achern', 'Telefon': '07841 6738132', 'Mail': 'info@woelfinger-fahrschule.de', 'Web': 'http://www.woelfinger-fahrschule.de'}
without website:
{'Firma': 'Zappenduster-RC Steffen Liepe', 'Anschrift': 'Am Kirchweg 16 77855 Achern', 'Telefon': '07841 6844700', 'Mail': 'Zappenduster-Rc@hotmail.de', 'Web': 'na'}

Save Scrapy 'start_urls' and store properly in a Data Frame

I am using Scrapy to scrape some website data, but I can't manage to get my data into the proper shape.
This is the output of my code (see code below):
In the command Line:
scrapy crawl myspider -o items.csv
Output:
asin_product product_name
ProductA,,,ProductB,,,ProductC,,, BrandA,,,BrandB,,,BrandC,,,
ProductA,,,ProductD,,,ProductE,,, BrandA,,,BrandB,,,BrandA,,,
# Note that the rows represent the start_urls and that the ',,,'
# (three commas) separate the data fields.
Desired output:
scrapy crawl myspider -o items.csv
Start_URL asin_product product_name
URL1 ProductA BrandA
URL1 ProductB BrandB
URL1 ProductC BrandC
URL2 ProductA BrandA
URL2 ProductD BrandB
URL2 ProductE BrandA
My current Scrapy code:
import scrapy
from amazon.items import AmazonItem

class AmazonProductSpider(scrapy.Spider):
    name = "AmazonDeals"
    allowed_domains = ["amazon.com"]
    # Use working product URLs below
    start_urls = [
        "https://www.amazon.com/s?k=shoes&ref=nb_sb_noss_2",     # this should be URL 1
        "https://www.amazon.com/s?k=computer&ref=nb_sb_noss_2"   # this should be URL 2
    ]

    def parse(self, response):
        items = AmazonItem()
        title = response.xpath('//*[@class="a-size-base-plus a-color-base a-text-normal"]/text()').extract()
        asin = response.xpath('//*[@class="a-link-normal"]/@href').extract()
        # Note that I divided the products with ',,,' to make it easy to separate
        # them. I am aware that this is not the best approach.
        items['product_name'] = ',,,'.join(title).strip()
        items['asin_product'] = ',,,'.join(asin).strip()
        yield items
First of all, it's recommended to use CSS when querying by class.
Now to your code:
The product name is within the a tag (the product URL), so you can iterate through the links and store both the URL and the title.
<a class="a-link-normal a-text-normal" href="/adidas-Mens-Lite-Racer-Running/dp/B071P19D3X/ref=sr_1_3?keywords=shoes&qid=1554132536&s=gateway&sr=8-3">
<span class="a-size-base-plus a-color-base a-text-normal">Adidas masculina Lite Racer byd tênis de corrida</span>
</a>
You need to create one AmazonItem object per line in your CSV file.
def parse(self, response):
    # You need to improve this css selector because there are links which
    # are not a product, this is why I am checking if title is None and continuing.
    for product in response.css('a.a-link-normal.a-text-normal'):
        # product is a selector
        title = product.css('span.a-size-base-plus.a-color-base.a-text-normal::text').get()
        if not title:
            continue
        # The selector is already the a tag, so we only need to extract its href attribute value.
        asin = product.xpath('./@href').get()
        item = AmazonItem()
        item['product_name'] = title.strip()
        item['asin_product'] = asin.strip()
        yield item
Make the start_url available in the parse method:
Instead of using start_urls, you can yield your initial requests from a method named start_requests (see https://docs.scrapy.org/en/latest/intro/tutorial.html?highlight=start_requests#our-first-spider).
With each request you can pass the start URL as meta data. This meta data is then available within your parse method (see https://docs.scrapy.org/en/latest/topics/request-response.html?highlight=meta#scrapy.http.Request.meta).
def start_requests(self):
    urls = [...]  # this is equal to your start_urls
    for start_url in urls:
        yield Request(url=start_url, meta={"start_url": start_url})

def parse(self, response):
    start_url = response.meta["start_url"]
Yield multiple items, one for each product:
Instead of joining titles and ASINs, you can yield several items from parse. For the example below I assume the lists title and asin have the same length.
for title, asin in zip(title, asin):
    item = AmazonItem()
    item['product_name'] = title
    item['asin_product'] = asin
    yield item
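Putting both pieces together, a sketch of a parse method that emits one row per product along with the URL it came from (it assumes a start_url field has been added to AmazonItem, which is not in the original code):

def parse(self, response):
    start_url = response.meta["start_url"]
    for product in response.css('a.a-link-normal.a-text-normal'):
        title = product.css('span.a-size-base-plus.a-color-base.a-text-normal::text').get()
        if not title:
            continue  # skip links that are not products
        item = AmazonItem()
        item['start_url'] = start_url          # hypothetical extra field on AmazonItem
        item['product_name'] = title.strip()
        item['asin_product'] = product.attrib['href'].strip()
        yield item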
PS: you should check Amazon's robots.txt. They might not allow you to scrape their site and may ban your IP (https://www.amazon.de/robots.txt).
