I'm trying to scrape a specific web page, and although I get all the results on the console, I don't in the output CSV. In this case I want both the title and the author of a specific search, but I only get the title. If I reverse the order of the two I get the author, so it only takes the first one. Why?
import scrapy
QUERY = "q=brilliant+friend&qt=results_page#x0%253Abook-%2C%2528x0%253Abook%2Bx4%253Aprintbook%2529%2C%2528x0%253Abook%2Bx4%253Adigital%2529%2C%2528x0%253Abook%2Bx4%253Alargeprint%2529%2C%2528x0%253Abook%2Bx4%253Amss%2529%2C%2528x0%253Abook%2Bx4%253Athsis%2529%2C%2528x0%253Abook%2Bx4%253Abraille%2529%2C%2528x0%253Abook%2Bx4%253Amic%2529%2Cx0%253Aartchap-%2C%2528x0%253Aartchap%2Bx4%253Achptr%2529%2C%2528x0%253Aartchap%2Bx4%253Adigital%2529format"
class Spider(scrapy.Spider):
    name = 'worldcatspider'
    start_urls = ['https://www.worldcat.org/search?start=%s&%s' % (number, QUERY) for number in range(0, 4400, 10)]

    def parse(self, response):
        for title in response.css('.name a > strong ::text').extract():
            yield {"title:": title}
        for author in response.css('.author ::text').extract():
            yield {"author:": author}
My suggestion would be to put the for statement on the parent class or div of each result.
I haven't checked but this should work:
def parse(self, response):
    for page in response.css('.menuElem'):
        title = page.css('.name a > strong ::text').extract()
        author = page.css('.author ::text').extract()
        yield {"title": title,
               "author": author}
Related
I am scraping a news website with the Scrapy framework, but it seems to store only the last item scraped, repeated through the loop.
I want to store the Title, Date, and Link, which I scrape from the first page,
and also store the whole news article. So I want to merge the article, which is stored as a list, into a single string.
Item code
import scrapy
class ScrapedItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    source = scrapy.Field()
    date = scrapy.Field()
    paragraph = scrapy.Field()
Spider code
import scrapy
from ..items import ScrapedItem
class CBNCSpider(scrapy.Spider):
    name = 'kontan'
    start_urls = [
        'https://investasi.kontan.co.id/rubrik/28/Emiten'
    ]

    def parse(self, response):
        box_text = response.xpath("//ul/li/div[@class='ket']")
        items = ScrapedItem()
        for crawl in box_text:
            title = crawl.css("h1 a::text").extract()
            source = "https://investasi.kontan.co.id" + (crawl.css("h1 a::attr(href)").extract()[0])
            date = crawl.css("span.font-gray::text").extract()[0].replace("|", "")
            items['title'] = title
            items['source'] = source
            items['date'] = date
            yield scrapy.Request(url=source,
                                 callback=self.parseparagraph,
                                 meta={'item': items})

    def parseparagraph(self, response):
        items_old = response.meta['item']  # only last item stored
        paragraph = response.xpath("//p/text()").extract()
        items_old['paragraph'] = paragraph  # merge into single string
        yield items_old
I expect the Date, Title, and Source to be updated through the loop,
and the article to be merged into a single string to be stored in MySQL.
I defined a fresh dictionary inside the loop and put those variables in it, so each request carries its own item instead of every request sharing (and overwriting) the single ScrapedItem instance created before the loop. Moreover, I've made some minor changes to your XPath and CSS selectors to make them less error-prone. The script is working as desired now:
import scrapy
class CBNCSpider(scrapy.Spider):
    name = 'kontan'
    start_urls = [
        'https://investasi.kontan.co.id/rubrik/28/Emiten'
    ]

    def parse(self, response):
        for crawl in response.xpath("//*[@id='list-news']//*[@class='ket']"):
            d = {}
            d['title'] = crawl.css("h1 > a::text").get()
            d['source'] = response.urljoin(crawl.css("h1 > a::attr(href)").get())
            d['date'] = crawl.css("span.font-gray::text").get().strip("|")
            yield scrapy.Request(
                url=d['source'],
                callback=self.parseparagraph,
                meta={'item': d}
            )

    def parseparagraph(self, response):
        items_old = response.meta['item']
        items_old['paragraph'] = response.xpath("//p/text()").getall()
        yield items_old
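Note that getall() still returns a list of text nodes. Since the goal was a single string for MySQL, one option (a sketch; the space separator is an assumption) is to join the pieces before yielding:

def parseparagraph(self, response):
    items_old = response.meta['item']
    # join the individual <p> text nodes into one string before storing
    items_old['paragraph'] = " ".join(
        p.strip() for p in response.xpath("//p/text()").getall() if p.strip()
    )
    yield items_old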
I am trying to write some code to scrape the website of a UK housebuilder to record a list of houses for sale.
I am starting on the page http://www.persimmonhomes.com/sitemap and I have written one part of the code to list all the URLs of the housebuilder's developments, and a second part to scrape prices etc. from each of those URLs.
I know the second part works and I know that the first part lists all the URLs. But for some reason the URLs listed by the first part don't seem to trigger the second part of the code to scrape from them.
The code of this first part is:
def parse(self, response):
    for href in response.xpath('//*[@class="contacts-item"]/ul/li/a/@href'):
        url = urlparse.urljoin('http://www.persimmonhomes.com/', href.extract())
        yield scrapy.Request(url, callback=self.parse_dir_contents)
Now, I know this lists the URLs I want (if I put in the line "print url" they all get listed), and I could manually add them to the code to run the second part if I wanted to. However, even though the URLs are created, they do not seem to trigger the second part of the code to scrape from them.
and the entire code is below:
import scrapy
import urlparse
from Persimmon.items import PersimmonItem
class persimmonSpider(scrapy.Spider):
    name = "persimmon"
    allowed_domains = ["http://www.persimmonhomes.com/"]
    start_urls = [
        "http://www.persimmonhomes.com/sitemap",
    ]

    def parse(self, response):
        for href in response.xpath('//*[@class="contacts-item"]/ul/li/a/@href'):
            url = urlparse.urljoin('http://www.persimmonhomes.com/', href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        for sel in response.xpath('//*[@id="aspnetForm"]/div[4]'):
            item = PersimmonItem()
            item['name'] = sel.xpath('//*[@id="aspnetForm"]/div[4]/div[1]/div[1]/div/div[2]/span/text()').extract()
            item['address'] = sel.xpath('//*[@id="XplodePage_ctl12_dsDetailsSnippet_pDetailsContainer"]/div/*[@itemprop="postalCode"]/text()').extract()
            plotnames = sel.xpath('//div[@class="housetype js-filter-housetype"]/div[@class="housetype__col-2"]/div[@class="housetype__plots"]/div[not(contains(@data-status,"Sold"))]/div[@class="plot__name"]/a/text()').extract()
            plotnames = [plotname.strip() for plotname in plotnames]
            plotids = sel.xpath('//div[@class="housetype js-filter-housetype"]/div[@class="housetype__col-2"]/div[@class="housetype__plots"]/div[not(contains(@data-status,"Sold"))]/div[@class="plot__name"]/a/@href').extract()
            plotids = [plotid.strip() for plotid in plotids]
            plotprices = sel.xpath('//div[@class="housetype js-filter-housetype"]/div[@class="housetype__col-2"]/div[@class="housetype__plots"]/div[not(contains(@data-status,"Sold"))]/div[@class="plot__price"]/text()').extract()
            plotprices = [plotprice.strip() for plotprice in plotprices]
            result = zip(plotnames, plotids, plotprices)
            for plotname, plotid, plotprice in result:
                item['plotname'] = plotname
                item['plotid'] = plotid
                item['plotprice'] = plotprice
                yield item
Any views as to why the first part of the code creates the URLs but the second part does not loop through them?
You just need to fix your allowed_domains property:
allowed_domains = ["persimmonhomes.com"]
(tested - worked for me).
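For context (not part of the original answer): allowed_domains should contain bare domain names. Scrapy's OffsiteMiddleware checks each request's hostname against those entries, and an entry that includes a scheme and trailing slash such as "http://www.persimmonhomes.com/" never matches, so every request yielded from parse is filtered out as off-site. A minimal sketch of the corrected spider header:

import scrapy

class persimmonSpider(scrapy.Spider):
    name = "persimmon"
    # bare domain only -- no scheme, no trailing slash
    allowed_domains = ["persimmonhomes.com"]
    start_urls = ["http://www.persimmonhomes.com/sitemap"]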
I am working on a class project and trying to get all IMDB movie data (titles, budgets, etc.) up until 2016. I adopted the code from https://github.com/alexwhb/IMDB-spider/blob/master/tutorial/spiders/spider.py.
My thought is: for i in range(1874, 2016) (since 1874 is the earliest year shown on http://www.imdb.com/year/), direct the program to the corresponding year's website, and grab the data from that URL.
But the problem is, each page for each year only shows 50 movies, so after crawling those 50 movies, how can I move on to the next page? And after crawling each year, how can I move on to the next year? This is my code for the URL-parsing part so far, but it is only able to crawl 50 movies for a particular year.
class tutorialSpider(scrapy.Spider):
    name = "tutorial"
    allowed_domains = ["imdb.com"]
    start_urls = ["http://www.imdb.com/search/title?year=2014,2014&title_type=feature&sort=moviemeter,asc"]

    def parse(self, response):
        for sel in response.xpath("//*[@class='results']/tr/td[3]"):
            item = MovieItem()
            item['Title'] = sel.xpath('a/text()').extract()[0]
            item['MianPageUrl'] = "http://imdb.com" + sel.xpath('a/@href').extract()[0]
            request = scrapy.Request(item['MianPageUrl'], callback=self.parseMovieDetails)
            request.meta['item'] = item
            yield request
You can use CrawlSpider to simplify your task. As you'll see below, start_requests dynamically generates the list of URLs, while parse_page only extracts the movies to crawl. Finding and following the 'Next' link is done by the rules attribute.
I agree with @Padraic Cunningham that hard-coding values is not a great idea. I've added spider arguments so that you can call:
scrapy crawl imdb -a start=1950 -a end=1980 (the scraper will default to 1874-2016 if it doesn't get any arguments).
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from imdbyear.items import MovieItem
class IMDBSpider(CrawlSpider):
    name = 'imdb'
    rules = (
        # extract links at the bottom of the page. note that there are 'Prev' and 'Next'
        # links, so a bit of additional filtering is needed
        Rule(LinkExtractor(restrict_xpaths=('//*[@id="right"]/span/a')),
             process_links=lambda links: filter(lambda l: 'Next' in l.text, links),
             callback='parse_page',
             follow=True),
    )

    def __init__(self, start=None, end=None, *args, **kwargs):
        super(IMDBSpider, self).__init__(*args, **kwargs)
        self.start_year = int(start) if start else 1874
        self.end_year = int(end) if end else 2016

    # generate start_urls dynamically
    def start_requests(self):
        for year in range(self.start_year, self.end_year + 1):
            yield scrapy.Request('http://www.imdb.com/search/title?year=%d,%d&title_type=feature&sort=moviemeter,asc' % (year, year))

    def parse_page(self, response):
        for sel in response.xpath("//*[@class='results']/tr/td[3]"):
            item = MovieItem()
            item['Title'] = sel.xpath('a/text()').extract()[0]
            # note -- you had 'MianPageUrl' as your scrapy field name. I would recommend fixing this typo
            # (you will need to change it in items.py as well)
            item['MainPageUrl'] = "http://imdb.com" + sel.xpath('a/@href').extract()[0]
            request = scrapy.Request(item['MainPageUrl'], callback=self.parseMovieDetails)
            request.meta['item'] = item
            yield request

    # make sure that the dynamically generated start_urls are parsed as well
    parse_start_url = parse_page

    # do your magic
    def parseMovieDetails(self, response):
        pass
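A usage note, not from the original answer: once parseMovieDetails is filled in to yield the item carried in response.meta, the year arguments combine naturally with a feed export, e.g. scrapy crawl imdb -a start=2000 -a end=2001 -o movies.csv (this assumes a project layout matching the imdbyear.items import above).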
You can use the piece of code below to follow the next page:
# 'a.lister-page-next.next-page::attr(href)' is the selector to get the next page link
next_page = response.css('a.lister-page-next.next-page::attr(href)').extract_first()
if next_page is not None:
    next_page = response.urljoin(next_page)  # joins the current and next page url
    yield scrapy.Request(next_page, callback=self.parse)  # calls the parse function again when crawling the next page
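A sketch of where this snippet fits: at the end of parse, after the per-movie item loop, so each listing page schedules the following one (this assumes the pagination selector above matches the page's markup):

def parse(self, response):
    # ... extract the per-movie items here, as in the original parse method ...
    next_page = response.css('a.lister-page-next.next-page::attr(href)').extract_first()
    if next_page is not None:
        yield scrapy.Request(response.urljoin(next_page), callback=self.parse)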
I figured out a very dumb way to solve this: I put all the links in start_urls. A better solution would be very much appreciated!
class tutorialSpider(scrapy.Spider):
    name = "tutorial"
    allowed_domains = ["imdb.com"]
    start_urls = []
    for i in xrange(1874, 2017):
        for j in xrange(1, 11501, 50):
            # since the largest number of movies for a year to have is 11,400 (2016)
            start_url = "http://www.imdb.com/search/title?sort=moviemeter,asc&start=" + str(j) + "&title_type=feature&year=" + str(i) + "," + str(i)
            start_urls.append(start_url)

    def parse(self, response):
        for sel in response.xpath("//*[@class='results']/tr/td[3]"):
            item = MovieItem()
            item['Title'] = sel.xpath('a/text()').extract()[0]
            item['MianPageUrl'] = "http://imdb.com" + sel.xpath('a/@href').extract()[0]
            request = scrapy.Request(item['MianPageUrl'], callback=self.parseMovieDetails)
            request.meta['item'] = item
            yield request
The code that @Greg Sadetsky has provided needs only one minor change, in the first line of the parse_page method.
Just change the XPath in the for loop from:
response.xpath("//*[@class='results']/tr/td[3]"):
to
response.xpath("//*[contains(@class,'lister-item-content')]/h3"):
This worked like a charm for me!
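Putting the two answers together, the start of parse_page would then look like this (a sketch; whether the a/text() and a/@href sub-paths still match depends on IMDB's current markup):

def parse_page(self, response):
    for sel in response.xpath("//*[contains(@class,'lister-item-content')]/h3"):
        item = MovieItem()
        item['Title'] = sel.xpath('a/text()').extract()[0]
        item['MainPageUrl'] = "http://imdb.com" + sel.xpath('a/@href').extract()[0]
        request = scrapy.Request(item['MainPageUrl'], callback=self.parseMovieDetails)
        request.meta['item'] = item
        yield request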
I am using Scrapy to parse an HTML page. My question is: why does Scrapy sometimes return the response I want, but sometimes not return a response? Is it my fault? Here's my parsing function:
class AmazonSpider(BaseSpider):
    name = "amazon"
    allowed_domains = ["amazon.org"]
    start_urls = [
        "http://www.amazon.com/s?rh=n%3A283155%2Cp_n_feature_browse-bin%3A2656020011"
    ]

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//div[contains(@class, "result")]')
        items = []
        titles = {'titles': sites[0].xpath('//a[@class="title"]/text()').extract()}
        for title in titles['titles']:
            item = AmazonScrapyItem()
            item['title'] = title
            items.append(item)
        return items
I believe you are just not using the most adequate XPath expression.
Amazon's HTML is kinda messy, not very uniform and therefore not very easy to parse. But after some experimenting I could extract all 12 titles of a couple of search result pages with the following parse function:
def parse(self, response):
    sel = Selector(response)
    p = sel.xpath('//div[@class="data"]/h3/a')
    titles = p.xpath('span/text()').extract() + p.xpath('text()').extract()
    items = []
    for title in titles:
        item = AmazonScrapyItem()
        item['title'] = title
        items.append(item)
    return items
If you care about the actual order of the results the above code might not be appropriate but I believe that is not the case.
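If the order, or keeping each title tied to its own result block, does matter, one option (a sketch, not part of the original answer, assuming the same //div[@class="data"]/h3/a structure and a Scrapy version that provides extract_first()) is to iterate per result link and use relative expressions:

def parse(self, response):
    sel = Selector(response)
    items = []
    # iterate per result link so each title stays tied to its own result
    for link in sel.xpath('//div[@class="data"]/h3/a'):
        item = AmazonScrapyItem()
        # prefer the <span> text when present, fall back to the bare link text
        item['title'] = (link.xpath('span/text()').extract_first()
                         or link.xpath('text()').extract_first())
        items.append(item)
    return items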
I have a Python script using Scrapy which scrapes data from a website, allocates it to 3 fields and then generates a .csv. It works OK, but with one major problem: all fields contain all of the data, rather than the data being separated out by table row. I'm sure this is because my loop isn't working; when it finds the XPath it just grabs all the data for every row before moving on to get data for the other 2 fields, instead of creating separate rows.
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    divs = hxs.select('//tr[@class="someclass"]')
    for div in divs:
        item = TestBotItem()
        item['var1'] = div.select('//table/tbody/tr[*]/td[2]/p/span[2]/text()').extract()
        item['var2'] = div.select('//table/tbody/tr[*]/td[3]/p/span[2]/text()').extract()
        item['var3'] = div.select('//table/tbody/tr[*]/td[4]/p/text()').extract()
    return item
The tr index marked with the * increases with each entry on the website I need to crawl, and the other two paths slot in below it. How do I edit this so it grabs the first set of data for, say, //table/tbody/tr[3] only, stores it for all three fields, and then moves on to //table/tbody/tr[4], etc.?
Update
This works correctly, however I'm now trying to add some validation to the pipelines.py file to drop any records where var1 is more than 100%. I'm certain my code below is wrong. Also, does using "yield" instead of "return" stop the pipeline from being used?
from scrapy.exceptions import DropItem
class TestbotPipeline(object):
    def process_item(self, item, spider):
        if item('var1') > 100%:
            return item
        else:
            raise Dropitem(item)
I think this is what you are looking for:
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    divs = hxs.select('//tr[@class="someclass"]')
    for div in divs:
        item = TestBotItem()
        item['var1'] = div.select('./td[2]/p/span[2]/text()').extract()
        item['var2'] = div.select('./td[3]/p/span[2]/text()').extract()
        item['var3'] = div.select('./td[4]/p/text()').extract()
        yield item
You loop on the trs and then use relative XPath expressions (./td...), and in each iteration you use the yield instruction.
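To make the difference concrete (an illustration, not part of the original answer): an expression starting with // is evaluated against the whole document even when called on a row selector, which is exactly why every field ended up containing all the data, while ./ keeps the expression anchored to the current row:

# evaluated from the document root: returns the matching cells of *every* row
div.select('//table/tbody/tr[*]/td[2]/p/span[2]/text()').extract()
# evaluated relative to the current <tr>: returns only this row's cell
div.select('./td[2]/p/span[2]/text()').extract()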
You can also append each item to a list and return that list outside of the loop, like this (it's equivalent to the code above):
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    divs = hxs.select('//tr[@class="someclass"]')
    items = []
    for div in divs:
        item = TestBotItem()
        item['var1'] = div.select('./td[2]/p/span[2]/text()').extract()
        item['var2'] = div.select('./td[3]/p/span[2]/text()').extract()
        item['var3'] = div.select('./td[4]/p/text()').extract()
        items.append(item)
    return items
You don't need HtmlXPathSelector. Scrapy already has a built-in XPath selector. Try this:
def parse(self, response):
    divs = response.xpath('//tr[@class="someclass"]')
    for div in divs:
        item = TestBotItem()
        item['var1'] = div.xpath('table/tbody/tr[*]/td[2]/p/span[2]/text()').extract()[0]
        item['var2'] = div.xpath('table/tbody/tr[*]/td[3]/p/span[2]/text()').extract()[0]
        item['var3'] = div.xpath('table/tbody/tr[*]/td[4]/p/text()').extract()[0]
    return item
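The pipeline follow-up was not addressed above, so two notes. First, item pipelines run on every item the spider produces, whether it is returned or yielded, so switching to yield does not bypass the pipeline. Second, here is a minimal sketch of the validation, assuming var1 ends up holding a percentage string such as '95%' (note that the draft above also had the branches inverted: the record over 100% is the one to drop):

from scrapy.exceptions import DropItem

class TestbotPipeline(object):
    def process_item(self, item, spider):
        # assumption: item['var1'] holds a percentage string such as '95%'
        value = float(str(item['var1']).strip().rstrip('%'))
        if value > 100:
            # discard records over 100%
            raise DropItem("var1 over 100%%: %s" % item['var1'])
        return item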