Scrapy: scrape item fields from different pages - python

I'm trying to get item field info from different pages using Scrapy.
What I am trying to do:
main_url > scrape all links from this page > go to each link
from each link > scrape info, put info in an items list and go to another link
from another link > scrape info and put info in the same items list
Go to the next link... repeat steps 2-4
When all links are done, go to the next page and repeat steps 1-3
I found some information in the links below, but I still can't get the results I want:
How can i use multiple requests and pass items in between them in scrapy python
http://doc.scrapy.org/en/latest/topics/request-response.html#topics-request-response-ref-request-callback-arguments
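As I understand those references, the pattern is to pass a partially built item to the next callback through Request.meta. A minimal sketch of that pattern (the URLs, XPaths, and field names here are placeholders, not from my spider):
import scrapy

class ChainSketchSpider(scrapy.Spider):
    name = 'chain_sketch'
    start_urls = ['http://example.com/list']  # placeholder URL

    def parse(self, response):
        item = {'info1': response.xpath('//h1/text()').extract_first()}  # placeholder XPath
        # hand the partially filled item to the next callback via meta
        yield scrapy.Request('http://example.com/detail',  # placeholder URL
                             callback=self.parse_detail,
                             meta={'item': item})

    def parse_detail(self, response):
        item = response.meta['item']  # the same object continues here
        item['info2'] = response.xpath('//p/text()').extract_first()  # placeholder XPath
        yield item  # yield once, at the end of the chain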
Goal: to get the below layout results
What I've done is below
My item class
from scrapy.item import Item, Field

class myItems(Item):
    info1 = Field()
    info2 = Field()
    info3 = Field()
    info4 = Field()
My spider class
import scrapy
from scrapy.http import Request
from myProject.items import myItems

class mySpider(scrapy.Spider):
    name = 'spider1'
    start_urls = ['main_link']

    def parse(self, response):
        items = []
        list1 = response.xpath().extract()  # extract all info from here
        list2 = response.xpath().extract()  # extract all info from here
        for i, j in zip(list1, list2):
            link1 = 'http...' + i
            request = Request(link1, self.parseInfo1, dont_filter=True)
            request.meta['item'] = items
            yield request
            link2 = 'https...' + j
            request = Request(link2, self.parseInfo2, meta={'item': items}, dont_filter=True)
        # Code for crawling to next page

    def parseInfo1(self, response):
        item = myItems()
        items = response.meta['item']
        item['info1'] = response.xpath().extract()
        item['info2'] = response.xpath().extract()
        items.append(item)
        return items

    def parseInfo2(self, response):
        item = myItems()
        items = response.meta['item']
        item['info3'] = response.xpath().extract()
        item['info4'] = response.xpath().extract()
        items.append(item)
        return items
I executed the spider by typing this on the terminal:
> scrapy crawl spider1 -o filename.csv -t csv
I got the results for all the fields, but they are not in the right order. My csv file looks like this:
Does anyone know how to get the results like in my "Goal" above?
I appreciate the help.
Thanks

Never mind, I found my mistake. I instantiated the myItems class twice (once in each callback), which created two separate objects and produced the split results I was seeing.
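For reference, a sketch of the corrected flow: one myItems instance per pair of links, filled across both callbacks and yielded only once at the end (the XPaths are placeholders):
    def parse(self, response):
        list1 = response.xpath('//a/@href').extract()  # placeholder XPath
        list2 = response.xpath('//a/@href').extract()  # placeholder XPath
        for i, j in zip(list1, list2):
            item = myItems()  # one instance per row
            request = Request('http...' + i, self.parseInfo1, dont_filter=True)
            request.meta['item'] = item
            request.meta['link2'] = 'https...' + j
            yield request

    def parseInfo1(self, response):
        item = response.meta['item']
        item['info1'] = response.xpath('//td[1]/text()').extract()  # placeholder XPath
        item['info2'] = response.xpath('//td[2]/text()').extract()  # placeholder XPath
        # chain to the second page instead of returning here
        yield Request(response.meta['link2'], self.parseInfo2,
                      meta={'item': item}, dont_filter=True)

    def parseInfo2(self, response):
        item = response.meta['item']
        item['info3'] = response.xpath('//td[3]/text()').extract()  # placeholder XPath
        item['info4'] = response.xpath('//td[4]/text()').extract()  # placeholder XPath
        yield item  # complete row with all four fields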

Related

1: My spider is giving me all the results in one-liners in the CSV file

First, if I use extract_first, Scrapy gives me only the first element of each page; if I run it as shown below, it returns all the content I want, but in one-liners.
Second, I can't make Scrapy go to the links I just scraped and get information from inside those links; that attempt returns an empty CSV file.
from scrapy import Spider
from companies.items import CompaniesItem
import re

class companiesSpider(Spider):
    name = "companies"
    allowed_domains = ['http://startup.miami',]

    # Defining the list of pages to scrape
    start_urls = ["http://startup.miami/category/startups/page/" + str(1*i) + "/" for i in range(0, 10)]

    def parse(self, response):
        rows = response.xpath('//*[@id="datafetch"]')
        for row in rows:
            link = row.xpath('.//h2/a/@href').extract()
            name = row.xpath('.//header/h2/a/text()').extract()
            item = CompaniesItem()
            item['link'] = link
            item['name'] = name
            yield item
Your parse method is not yielding any requests or items. In the version below we go through the pages and get the URLs and names; in parse_detail you can add additional data to the item.
Instead of hardcoding 10 pages, we check whether there is a next page and run parse again if there is.
from scrapy import Spider
from ..items import CompaniesItem
import scrapy

class CompaniesSpider(Spider):
    name = "companies"
    allowed_domains = ['startup.miami']

    # Defining the list of pages to scrape
    start_urls = ["http://startup.miami/category/startups/"]

    def parse(self, response):
        # get link & name and send the item to parse_detail in meta
        rows = response.xpath('//*[@id="datafetch"]/article')
        for row in rows:
            link = row.xpath('.//@href').extract_first()
            name = row.xpath(
                './/*[@class="textoCoworking"]/text()').extract_first()
            item = CompaniesItem()
            item['link'] = link
            item['name'] = name.strip()
            yield scrapy.Request(link,
                                 callback=self.parse_detail,
                                 meta={'item': item})
        # get the next page
        next_page = response.xpath(
            '//*[@class="next page-numbers"]/@href').extract_first()
        if next_page:
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_detail(self, response):
        item = response.meta['item']
        # add other details to the item here
        yield item
To put the results in a csv file you can launch the scraper like this: scrapy crawl companies -o test_companies.csv
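If you later want more than the link and name, parse_detail is where to fill in the extra fields. For example (the selector and the description field are placeholders, and the field would also need to be declared on CompaniesItem):
    def parse_detail(self, response):
        item = response.meta['item']
        # hypothetical extra field; adjust the selector to the real page markup
        item['description'] = response.xpath('//div[@class="entry-content"]//p/text()').extract_first()
        yield item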

Scrapy yield only last data and merge scrapy data into one

I am scraping a news website with the Scrapy framework, but it seems to store only the last item scraped, repeated for every iteration of the loop.
I want to store the title, date, and link, which I scrape from the first page,
and also store the whole news article. So I want to merge the article, which is stored as a list, into a single string.
Item code
import scrapy

class ScrapedItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    source = scrapy.Field()
    date = scrapy.Field()
    paragraph = scrapy.Field()
Spider code
import scrapy
from ..items import ScrapedItem

class CBNCSpider(scrapy.Spider):
    name = 'kontan'
    start_urls = [
        'https://investasi.kontan.co.id/rubrik/28/Emiten'
    ]

    def parse(self, response):
        box_text = response.xpath("//ul/li/div[@class='ket']")
        items = ScrapedItem()
        for crawl in box_text:
            title = crawl.css("h1 a::text").extract()
            source = "https://investasi.kontan.co.id" + (crawl.css("h1 a::attr(href)").extract()[0])
            date = crawl.css("span.font-gray::text").extract()[0].replace("|", "")
            items['title'] = title
            items['source'] = source
            items['date'] = date
            yield scrapy.Request(url=source,
                                 callback=self.parseparagraph,
                                 meta={'item': items})

    def parseparagraph(self, response):
        items_old = response.meta['item']  # only last item stored
        paragraph = response.xpath("//p/text()").extract()
        items_old['paragraph'] = paragraph  # merge into single string
        yield items_old
I expect the date, title, and source to be updated on each iteration of the loop,
and the article to be merged into a single string so it can be stored in MySQL.
I defined an empty dictionary inside the loop and put those variables in it. I've also made some minor changes to your XPaths and CSS selectors to make them less error-prone. The script works as desired now:
import scrapy

class CBNCSpider(scrapy.Spider):
    name = 'kontan'
    start_urls = [
        'https://investasi.kontan.co.id/rubrik/28/Emiten'
    ]

    def parse(self, response):
        for crawl in response.xpath("//*[@id='list-news']//*[@class='ket']"):
            d = {}
            d['title'] = crawl.css("h1 > a::text").get()
            d['source'] = response.urljoin(crawl.css("h1 > a::attr(href)").get())
            d['date'] = crawl.css("span.font-gray::text").get().strip("|")
            yield scrapy.Request(
                url=d['source'],
                callback=self.parseparagraph,
                meta={'item': d}
            )

    def parseparagraph(self, response):
        items_old = response.meta['item']
        items_old['paragraph'] = response.xpath("//p/text()").getall()
        yield items_old
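If you do want the article as one string before storing it in MySQL, as mentioned in the question, you can join the list of paragraphs inside parseparagraph; a small, untested tweak:
    def parseparagraph(self, response):
        items_old = response.meta['item']
        # join the individual <p> texts into a single cleaned-up string
        paragraphs = response.xpath("//p/text()").getall()
        items_old['paragraph'] = " ".join(p.strip() for p in paragraphs if p.strip())
        yield items_old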

Scrapy: Extracting data from source and its links

Edited question to link to original:
Scrapy getting data from links within table
From the link https://www.tdcj.state.tx.us/death_row/dr_info/trottiewillielast.html
I am trying to get info from the main table as well as the data within the two links inside each row of the table. I managed to pull data from one link, but my question is about going to the other link as well and appending its data to the same row.
from urlparse import urljoin
import scrapy
from scrapy.item import Item, Field
from texasdeath.items import DeathItem

class DeathItem(Item):
    firstName = Field()
    lastName = Field()
    Age = Field()
    Date = Field()
    Race = Field()
    County = Field()
    Message = Field()
    Passage = Field()

class DeathSpider(scrapy.Spider):
    name = "death"
    allowed_domains = ["tdcj.state.tx.us"]
    start_urls = [
        "http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html"
    ]

    def parse(self, response):
        sites = response.xpath('//table/tbody/tr')
        for site in sites:
            item = DeathItem()
            item['firstName'] = site.xpath('td[5]/text()').extract()
            item['lastName'] = site.xpath('td[4]/text()').extract()
            item['Age'] = site.xpath('td[7]/text()').extract()
            item['Date'] = site.xpath('td[8]/text()').extract()
            item['Race'] = site.xpath('td[9]/text()').extract()
            item['County'] = site.xpath('td[10]/text()').extract()
            url = urljoin(response.url, site.xpath("td[3]/a/@href").extract_first())
            url2 = urljoin(response.url, site.xpath("td[2]/a/@href").extract_first())
            if url.endswith("html"):
                request = scrapy.Request(url, meta={"item": item, "url2": url2}, callback=self.parse_details)
                yield request
            else:
                yield item

    def parse_details(self, response):
        item = response.meta["item"]
        url2 = response.meta["url2"]
        item['Message'] = response.xpath("//p[contains(text(), 'Last Statement')]/following-sibling::p/text()").extract()
        request = scrapy.Request(url2, meta={"item": item}, callback=self.parse_details2)
        return request

    def parse_details2(self, response):
        item = response.meta["item"]
        item['Passage'] = response.xpath("//p/text()").extract_first()
        return item
I understand how to pass arguments to a request via meta, but the flow is still unclear to me; at this point I am not even sure whether this is possible. I have viewed several examples, including the ones below:
using scrapy extracting data inside links
How can i use multiple requests and pass items in between them in scrapy python
The output should reflect the main table, with the data found inside both links appended to each row.
I appreciate any help or direction.
The problem in this case is in this piece of code:
if url.endswith("html"):
    yield scrapy.Request(url, meta={"item": item}, callback=self.parse_details)
else:
    yield item
if url2.endswith("html"):
    yield scrapy.Request(url2, meta={"item": item}, callback=self.parse_details2)
else:
    yield item
By requesting a link you are creating a new "thread" that takes its own course, so the function parse_details won't be able to see what is being done in parse_details2. The way I would do it is to chain them, calling one from within the other, like this:
url = urljoin(response.url, site.xpath("td[2]/a/@href").extract_first())
url2 = urljoin(response.url, site.xpath("td[3]/a/@href").extract_first())
if url.endswith("html"):
    request = scrapy.Request(url, callback=self.parse_details)
    request.meta['item'] = item
    request.meta['url2'] = url2
    yield request
elif url2.endswith("html"):
    request = scrapy.Request(url2, callback=self.parse_details2)
    request.meta['item'] = item
    yield request
else:
    yield item

def parse_details(self, response):
    item = response.meta["item"]
    url2 = response.meta["url2"]
    item['About Me'] = response.xpath("//p[contains(text(), 'About Me')]/following-sibling::p/text()").extract()
    if url2:
        request = scrapy.Request(url2, callback=self.parse_details2)
        request.meta['item'] = item
        yield request
    else:
        yield item
This code hasn't been tested thoroughly, so comment as you test.

Scrapy parse list of urls, open one by one and parse additional data

I am trying to parse a site, an e-store. I parse a page of products, which are loaded with AJAX, get the URLs of these products, and then parse additional info for each product by following the parsed URLs.
My script gets the list of the first 4 items on the page and their URLs, makes the requests, and parses the additional info, but then it does not return to the loop, and the spider closes.
Could somebody help me solve this? I'm pretty new to this kind of thing and only ask here when I'm totally stuck.
Here is my code:
from scrapy import Spider
from scrapy.selector import Selector
from scrapy.http.request import Request
from scrapy_sokos.items import SokosItem

class SokosSpider(Spider):
    name = "sokos"
    allowed_domains = ["sokos.fi"]
    base_url = "http://www.sokos.fi/fi/SearchDisplay?searchTermScope=&searchType=&filterTerm=&orderBy=8&maxPrice=&showResultsPage=true&beginIndex=%s&langId=-11&sType=SimpleSearch&metaData=&pageSize=4&manufacturer=&resultCatEntryType=&catalogId=10051&pageView=image&searchTerm=&minPrice=&urlLangId=-11&categoryId=295401&storeId=10151"
    start_urls = [
        "http://www.sokos.fi/fi/SearchDisplay?searchTermScope=&searchType=&filterTerm=&orderBy=8&maxPrice=&showResultsPage=true&beginIndex=0&langId=-11&sType=SimpleSearch&metaData=&pageSize=4&manufacturer=&resultCatEntryType=&catalogId=10051&pageView=image&searchTerm=&minPrice=&urlLangId=-11&categoryId=295401&storeId=10151",
    ]
    for i in range(0, 8, 4):
        start_urls.append((base_url) % str(i))

    def parse(self, response):
        products = Selector(response).xpath('//div[@class="product-listing product-grid"]/article[@class="product product-thumbnail"]')
        for product in products:
            item = SokosItem()
            item['url'] = product.xpath('//div[@class="content"]/a[@class="image"]/@href').extract()[0]
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_additional_info)

    def parse_additional_info(self, response):
        item = response.meta['item']
        item['name'] = Selector(response).xpath('//h1[@class="productTitle"]/text()').extract()[0].strip()
        item['description'] = Selector(response).xpath('//div[@id="kuvaus"]/p/text()').extract()[0]
        euro = Selector(response).xpath('//strong[@class="special-price"]/span[@class="euros"]/text()').extract()[0]
        cent = Selector(response).xpath('//strong[@class="special-price"]/span[@class="cents"]/text()').extract()[0]
        item['price'] = '.'.join(euro + cent)
        item['number'] = Selector(response).xpath('//@data-productid').extract()[0]
        yield item
The AJAX requests you are simulating are being caught by Scrapy's duplicate URL filter.
Set dont_filter to True when yielding a Request:
yield Request(url=item['url'],
              meta={'item': item},
              callback=self.parse_additional_info,
              dont_filter=True)

Scrapy Python Craigslist Scraper

I am trying to scrape Craigslist classifieds using Scrapy to extract items that are for sale.
I am able to extract the date, post title, and post URL, but am having trouble extracting the price.
For some reason the current code extracts all of the prices on the page for every item, but when I remove the // before the price span lookup, the price field comes back empty.
Can someone please review the code below and help me out?
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem

class MySpider(BaseSpider):
    name = "craig"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://longisland.craigslist.org/search/sss?sort=date&query=raptor%20660&srchType=T"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//p")
        items = []
        for titles in titles:
            item = CraigslistSampleItem()
            item['date'] = titles.select('span[@class="itemdate"]/text()').extract()
            item["title"] = titles.select("a/text()").extract()
            item["link"] = titles.select("a/@href").extract()
            item['price'] = titles.select('//span[@class="itempp"]/text()').extract()
            items.append(item)
        return items
itempp appears to be inside another element, itempnr. Perhaps it would work if you were to change //span[@class="itempp"]/text() to span[@class="itempnr"]/span[@class="itempp"]/text().
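Applied inside the loop above, that change would look roughly like this (untested, and Craigslist's markup may have changed since this was written):
            item['price'] = titles.select(
                'span[@class="itempnr"]/span[@class="itempp"]/text()').extract()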
