Scrapy yields only the last item, and how to merge scraped data into one string - python

I am scraping a news website with the Scrapy framework. It seems to store only the last item scraped, and that item is repeated for every iteration of the loop.
I want to store the title, date, and link, which I scrape from the first page,
and also store the whole news article. The article is stored as a list of paragraphs, so I want to merge that list into a single string.
Item code
import scrapy
class ScrapedItem(scrapy.Item):
    """Item holding one scraped news article; the spider fills in each field."""

    title = scrapy.Field()      # headline text
    source = scrapy.Field()     # link to the article
    date = scrapy.Field()       # publication date string
    paragraph = scrapy.Field()  # article body text
Spider code
import scrapy
from ..items import ScrapedItem
class CBNCSpider(scrapy.Spider):
    """Spider from the question: crawl the Kontan 'Emiten' listing, then each article."""

    name = 'kontan'
    start_urls = [
        'https://investasi.kontan.co.id/rubrik/28/Emiten'
    ]

    def parse(self, response):
        """Read title, link and date of every listed article and request its page."""
        box_text = response.xpath("//ul/li/div[@class='ket']")
        # NOTE: one item object is created here, outside the loop, and the same
        # object is attached to every request below. Each iteration overwrites
        # its fields, which is the "only the last item is stored" symptom the
        # question describes.
        items = ScrapedItem()
        for crawl in box_text:
            title = crawl.css("h1 a::text").extract()
            source = "https://investasi.kontan.co.id" + (crawl.css("h1 a::attr(href)").extract()[0])
            date = crawl.css("span.font-gray::text").extract()[0].replace("|", "")
            items['title'] = title
            items['source'] = source
            items['date'] = date
            yield scrapy.Request(url=source,
                                 callback=self.parseparagraph,
                                 meta={'item': items})

    def parseparagraph(self, response):
        """Attach the article's paragraph texts to the item passed in via meta."""
        items_old = response.meta['item']  # only last item stored
        paragraph = response.xpath("//p/text()").extract()
        items_old['paragraph'] = paragraph  # merge into single string
        yield items_old
I expect the date, title, and source to be updated on each iteration of the loop,
and the article paragraphs to be merged into a single string so that it can be stored in MySQL.

I create a new, empty dictionary inside the loop for every article and put those values in it, so each request carries its own data instead of sharing one item. I've also made some minor changes to your XPaths and CSS selectors to make them less error-prone. The script now works as desired:
import scrapy
class CBNCSpider(scrapy.Spider):
    """Crawl the Kontan 'Emiten' listing and yield one dict per article."""

    name = 'kontan'
    start_urls = [
        'https://investasi.kontan.co.id/rubrik/28/Emiten'
    ]

    def parse(self, response):
        """Yield one article-page request per listed news entry.

        A fresh dict is built on every iteration, so each request carries its
        own title/source/date instead of sharing one mutable item.
        """
        for crawl in response.xpath("//*[@id='list-news']//*[@class='ket']"):
            href = crawl.css("h1 > a::attr(href)").get()
            if not href:
                # Without a link there is no article page to follow.
                continue
            d = {}
            d['title'] = crawl.css("h1 > a::text").get()
            d['source'] = response.urljoin(href)
            # The date node may be missing; default to '' so the cleanup
            # below never runs on None.
            d['date'] = crawl.css("span.font-gray::text").get(default="").strip("|").strip()
            yield scrapy.Request(
                url=d['source'],
                callback=self.parseparagraph,
                meta={'item': d}
            )

    def parseparagraph(self, response):
        """Merge the article's paragraph texts into one string and yield the item."""
        items_old = response.meta['item']
        paragraphs = response.xpath("//p/text()").getall()
        # Join the non-empty paragraphs so the article is a single string,
        # ready to be stored in one database column.
        items_old['paragraph'] = " ".join(
            text.strip() for text in paragraphs if text.strip()
        )
        yield items_old

Related

1: my spider is giving me all the results in one liners on csv file

First, if I use extract_first, Scrapy gives me only the first element of each page; if I run it as shown below, it returns all the content I want, but everything ends up on a single line.
Second, I can't make Scrapy follow the links I just scraped and get information from the pages behind them; that attempt returns an empty CSV file.
from scrapy import Spider
from companies.items import CompaniesItem
import re
class companiesSpider(Spider):
    """Spider from the question: list startups from the startup.miami category pages."""

    name = "companies"
    # NOTE(review): this entry is a URL rather than a bare domain name.
    allowed_domains = ['http://startup.miami',]
    # Defining the list of pages to scrape
    start_urls = ["http://startup.miami/category/startups/page/" + str(1*i) + "/" for i in range(0, 10)]

    def parse(self, response):
        """Yield one item per matched container, holding all links and names in it."""
        rows = response.xpath('//*[@id="datafetch"]')
        for row in rows:
            # extract() returns every match below the row as a list, so a
            # single container yields one item whose fields hold all values
            # (the "one-liner" output described in the question).
            link = row.xpath('.//h2/a/@href').extract()
            name = row.xpath('.//header/h2/a/text()').extract()
            item = CompaniesItem()
            item['link'] = link
            item['name'] = name
            yield item
Your parse-method is not yielding any requests or items. In the part below we go through the pages and get the urls & names. In the parse_detail you can add additional data to the item.
Instead of hardcoding to 10 pages we check if there is a next page, and go through the parse again if it's the case.
from scrapy import Spider
from ..items import CompaniesItem
import scrapy
class CompaniesSpider(Spider):
    """Walk the startup.miami category pages and visit every listed company."""

    name = "companies"
    allowed_domains = ['startup.miami']
    # Only the first page is listed; later pages are reached via the "next" link.
    start_urls = ["http://startup.miami/category/startups/"]

    def parse(self, response):
        """Yield a detail request per company, then a request for the next page."""
        # get link & name and send item to parse_detail in meta
        rows = response.xpath('//*[@id="datafetch"]/article')
        for row in rows:
            link = row.xpath('.//@href').extract_first()
            if not link:
                # An article without a link cannot be followed.
                continue
            link = response.urljoin(link)
            # Default to '' so strip() is safe when the name node is missing.
            name = row.xpath(
                './/*[@class="textoCoworking"]/text()').extract_first(default='')
            item = CompaniesItem()
            item['link'] = link
            item['name'] = name.strip()
            yield scrapy.Request(link,
                                 callback=self.parse_detail,
                                 meta={'item': item})
        # get the next page
        next_page = response.xpath(
            '//*[@class="next page-numbers"]/@href').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parse_detail(self, response):
        """Receive the item started in parse() and yield it."""
        item = response.meta['item']
        # add other details to the item here
        yield item
To put the results in a csv file you can launch the scraper like this: scrapy crawl companies -o test_companies.csv

Scrapy Spider following urls, but wont export the data

I am trying to grab details from a real estate listing page. I can grab all the data, I just can't seem to export it..
Perhaps the problem is the way I use the yield keyword. The code works for the most part:
Visits page 1, example.com/kittens
Goes to page 2, example.com/puppers. Here are 10 apartments listed in blocks. I can get data from each block, but I need additional info from inside the hyperlink.
Visits the hyperlink, say, example.com/puppers/apartment1. It grabs some info from here as well, but I can't seem to return this data to include it in my HousingItem() class.
import scrapy
from urllib.parse import urljoin
class HousingItem(scrapy.Item):
    """One apartment listing: fields from the result block and its detail page."""

    # Fields declared for the search-result block.
    street = scrapy.Field()
    postal = scrapy.Field()
    city = scrapy.Field()
    url = scrapy.Field()

    # Fields declared for the listing's own page.
    buildY = scrapy.Field()
    on_m = scrapy.Field()
    off_m = scrapy.Field()
class FAppSpider(scrapy.Spider):
    """Spider from the question: start page -> result list -> one page per listing."""

    name = 'f_app'
    allowed_domains = ['example.com']
    start_urls = ['https://www.example.com/kittens']

    def parse(self, response):
        """Jump from the start page to the listing overview."""
        yield scrapy.Request(
            url="https://www.example.com/puppers",
            callback=self.parse_puppers,
        )

    def parse_inside_pupper(self, response):
        """Read the detail fields of one listing.

        NOTE(review): a new item is created here and never yielded, so these
        values are not exported -- this is the problem the question describes.
        """
        item = HousingItem()
        item['buildY'] = response.xpath('').extract_first().strip()
        item['on_m'] = response.xpath('').extract_first().strip()
        item['off_m'] = response.xpath('').extract_first().strip()

    def parse_puppers(self, response):
        """Build an item per result block and follow the link to its detail page."""
        base_url = 'https://www.example.com/'
        for result in response.css('div.search-result-main'):
            item = HousingItem()
            # NOTE(review): the trailing commas on the next two lines store
            # 1-tuples rather than the selector results.
            item['street'] = result.css(''),
            item['postcode'] = result.css(''),
            item['city'] = result.css('')
            href = result.css('div.search-result-header > a::attr(href)')[0].extract()
            item['url'] = urljoin(base_url, href)
            # Problem area from here..
            yield response.follow(url=item['url'], callback=self.parse_inside_pupper)
            # yield scrapy.request(url=item['url'],callback=self.parse_inside_pupper)?
            yield item
FEED_EXPORT_FIELDS is adjusted in my SETTINGS.py. The 4 items from parse_puppers() get exported correctly, parse_inside_puppers() data is correct in the console, but wont export.
I use scrapy crawl f_app -o raw_data.csv to run my spider. Thanks in advance, I appreciate all the help.
p.s. im fairly new to python and practising, i bet you noticed.
You need to send your current item to parse_inside_pupper using the meta parameter:
def parse_puppers(self, response):
    """Build an item per result block and pass it on to the detail callback.

    The partially filled item travels in the request's ``meta`` so that
    ``parse_inside_pupper`` can complete and yield it.
    """
    base_url = 'https://www.example.com/'
    for block in response.css('div.search-result-main'):
        href = block.css('div.search-result-header > a::attr(href)').extract_first()
        if not href:
            # No detail link in this block: nothing to follow.
            continue
        item = HousingItem()
        # No trailing commas here: a trailing comma would store a 1-tuple.
        item['street'] = block.css('')
        # 'postal' is the field name declared on HousingItem.
        item['postal'] = block.css('')
        item['city'] = block.css('')
        item['url'] = urljoin(base_url, href)
        yield response.follow(url=item['url'], callback=self.parse_inside_pupper, meta={"item": item})
After that you can use it inside parse_inside_pupper (and yield it from here):
def parse_inside_pupper(self, response):
    """Complete the item received via ``meta`` with detail-page data and yield it."""
    item = response.meta["item"]
    # default='' keeps strip() safe when a selector matches nothing.
    item['buildY'] = response.xpath('').extract_first(default='').strip()
    item['on_m'] = response.xpath('').extract_first(default='').strip()
    item['off_m'] = response.xpath('').extract_first(default='').strip()
    yield item

Scrape information from Scraped URL

I am new to Scrapy and am currently learning how to scrape information from a list of scraped URLs. I have been able to scrape information from a single URL by going through the tutorial on the Scrapy website. However, I am having trouble scraping information from a list of URLs that were themselves scraped from a page, even after searching online for a solution.
The scraper I have written below is able to scrape the first URL. However, it fails to scrape the list of scraped URLs. The problem starts at def parse_following_urls(self, response):, where I am unable to scrape the pages behind those URLs.
Can anyone help me solve this? Thanks in advance.
import scrapy
from scrapy.http import Request
class SET(scrapy.Item):
    """One warrant row: quote columns from the list page plus detail-page fields."""

    # Set from the listing table in parse().
    title = scrapy.Field()
    open = scrapy.Field()
    hi = scrapy.Field()
    lo = scrapy.Field()
    last = scrapy.Field()
    bid = scrapy.Field()
    ask = scrapy.Field()
    vol = scrapy.Field()

    # Set from the detail page in parse_following_urls().
    exp = scrapy.Field()
    exrat = scrapy.Field()
    exdat = scrapy.Field()
class ThaiSpider(scrapy.Spider):
    """Spider from the question (first attempt): warrant list plus detail pages."""

    name = "warrant"
    allowed_domains = ["marketdata.set.or.th"]
    start_urls = ["http://marketdata.set.or.th/mkt/stocklistbytype.do?market=SET&language=en&country=US&type=W"]

    def parse(self, response):
        """Yield one item per table row, then request every linked detail page."""
        for sel in response.xpath('//table[@class]/tbody/tr'):
            item = SET()
            item['title'] = sel.xpath('td[1]/a[contains(@href,"ssoPageId")]/text()').extract()
            item['open'] = sel.xpath('td[3]/text()').extract()
            item['hi'] = sel.xpath('td[4]/text()').extract()
            item['lo'] = sel.xpath('td[5]/text()').extract()
            item['last'] = sel.xpath('td[6]/text()').extract()
            item['bid'] = sel.xpath('td[9]/text()').extract()
            item['ask'] = sel.xpath('td[10]/text()').extract()
            item['vol'] = sel.xpath('td[11]/text()').extract()
            yield item
        urll = response.xpath('//table[@class]/tbody/tr/td[1]/a[contains(@href,"ssoPageId")]/@href').extract()
        urls = ["http://marketdata.set.or.th/mkt/" + i for i in urll]
        for url in urls:
            request = scrapy.Request(url, callback=self.parse_following_urls, dont_filter=True)
            yield request
            # NOTE(review): meta is assigned only after the request has been
            # yielded, and `item` is whatever the row loop above left behind.
            request.meta['item'] = item

    def parse_following_urls(self, response):
        """Add the three detail-table values to the item carried in meta."""
        for sel in response.xpath('//table[3]/tbody'):
            item = response.meta['item']
            item['exp'] = sel.xpath('tr[1]/td[2]/text()').extract()
            item['exrat'] = sel.xpath('tr[2]/td[2]/text()').extract()
            item['exdat'] = sel.xpath('tr[3]/td[2]/text()').extract()
            yield item
I have rewritten the code after trying the suggestions given and looking at the output. Below is the edited code. However, I now get another error: "Request url must be str or unicode, got list". How do I convert the URL from a list to a string?
I thought the URL would be a string because it is built inside a for loop. I have marked the failing line with a comment in the code below. Is there any way to solve this?
import scrapy
from scrapy.http import Request
class SET(scrapy.Item):
    """One warrant row: quote columns from the list page plus detail-page fields."""

    # Set from the listing table in parse().
    title = scrapy.Field()
    open = scrapy.Field()
    hi = scrapy.Field()
    lo = scrapy.Field()
    last = scrapy.Field()
    bid = scrapy.Field()
    ask = scrapy.Field()
    vol = scrapy.Field()

    # Set from the detail page in parse_following_urls().
    exp = scrapy.Field()
    exrat = scrapy.Field()
    exdat = scrapy.Field()
class ThaiSpider(scrapy.Spider):
    """Spider from the question (second attempt): one detail request per row."""

    name = "warrant"
    allowed_domains = ["marketdata.set.or.th"]
    start_urls = ["http://marketdata.set.or.th/mkt/stocklistbytype.do?market=SET&language=en&country=US&type=W"]

    def parse(self, response):
        """Fill an item per table row and request that row's detail page."""
        for sel in response.xpath('//table[@class]/tbody/tr'):
            item = SET()
            item['title'] = sel.xpath('td[1]/a[contains(@href,"ssoPageId")]/text()').extract()
            item['open'] = sel.xpath('td[3]/text()').extract()
            item['hi'] = sel.xpath('td[4]/text()').extract()
            item['lo'] = sel.xpath('td[5]/text()').extract()
            item['last'] = sel.xpath('td[6]/text()').extract()
            item['bid'] = sel.xpath('td[9]/text()').extract()
            item['ask'] = sel.xpath('td[10]/text()').extract()
            item['vol'] = sel.xpath('td[11]/text()').extract()
            # NOTE: list + list yields a list, not a string -- this is the
            # cause of the error asked about on the next line.
            url = ["http://marketdata.set.or.th/mkt/"] + sel.xpath('td[1]/a[contains(@href,"ssoPageId")]/@href').extract()
            request = scrapy.Request(url, callback=self.parse_following_urls, dont_filter=True)  #Request url must be str or unicode, got list: How to solve this?
            request.meta['item'] = item
            yield item
            yield request

    def parse_following_urls(self, response):
        """Add the three detail-table values to the item carried in meta."""
        for sel in response.xpath('//table[3]/tbody'):
            item = response.meta['item']
            item['exp'] = sel.xpath('tr[1]/td[2]/text()').extract()
            item['exrat'] = sel.xpath('tr[2]/td[2]/text()').extract()
            item['exdat'] = sel.xpath('tr[3]/td[2]/text()').extract()
            yield item
I see what you are trying to do here; it's called chaining requests.
This means that you keep yielding Requests and keep carrying your partially filled Item along in the Request's
meta attribute.
In your case, all you need to do is yield a Request with the item in it instead of yielding the Item itself. Change your parse to:
def parse(self, response):
    """Yield one detail-page request per table row, carrying that row's item.

    The link is taken from the row itself, so each request is paired with the
    item built from the same row, and the URL is a single string rather than
    a list.
    """
    for sel in response.xpath('//table[@class]/tbody/tr'):
        link = sel.xpath('td[1]/a[contains(@href,"ssoPageId")]/@href').extract_first()
        if not link:
            # Rows without a detail link have nothing to chain to.
            continue
        item = SET()
        item['title'] = sel.xpath('td[1]/a[contains(@href,"ssoPageId")]/text()').extract()
        item['open'] = sel.xpath('td[3]/text()').extract()
        item['hi'] = sel.xpath('td[4]/text()').extract()
        item['lo'] = sel.xpath('td[5]/text()').extract()
        item['last'] = sel.xpath('td[6]/text()').extract()
        item['bid'] = sel.xpath('td[9]/text()').extract()
        item['ask'] = sel.xpath('td[10]/text()').extract()
        item['vol'] = sel.xpath('td[11]/text()').extract()
        # urljoin resolves the href against the current page URL.
        yield scrapy.Request(response.urljoin(link),
                             callback=self.parse_following_urls,
                             meta={'item': item})
I tried changing the fifth line from the end,
item = response.meta['item']
to
item = SET()
and then it works!
Admittedly, I don't fully understand your "meta" approach, since I have never used it to pass an item along.

Limit how much elements scrapy can collect

I am using Scrapy to collect some data. My Scrapy program collects 100 elements in one session. I need to limit it to 50, or to any other number. How can I do that? Any solution is welcome. Thanks in advance.
# -*- coding: utf-8 -*-
import re
import scrapy
class DmozItem(scrapy.Item):
    """One scraped posting: its URL, title, attribute tag and contact text."""

    link = scrapy.Field()   # URL of the posting page
    attr = scrapy.Field()   # text read from the reply page
    title = scrapy.Field()  # posting title text
    tag = scrapy.Field()    # attribute text from the posting page
class DmozSpider(scrapy.Spider):
    """Scrape postings: listing page -> posting page -> reply (contact) page."""

    name = "dmoz"
    allowed_domains = ["raleigh.craigslist.org"]
    start_urls = [
        "http://raleigh.craigslist.org/search/bab"
    ]
    BASE_URL = 'http://raleigh.craigslist.org/'

    def parse(self, response):
        """Request every posting linked from the search results."""
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            # urljoin copes with hrefs that start with '/' (plain
            # concatenation with BASE_URL would produce a double slash).
            yield scrapy.Request(response.urljoin(link), callback=self.parse_attr)

    def parse_attr(self, response):
        """Start an item from the posting page and request its reply page.

        Returns None when the posting id cannot be read from the URL.
        """
        match = re.search(r"(\w+)\.html", response.url)
        if not match:
            return None
        item_id = match.group(1)
        url = self.BASE_URL + "reply/ral/bab/" + item_id
        item = DmozItem()
        item["link"] = response.url
        item["title"] = "".join(response.xpath("//span[@class='postingtitletext']//text()").extract())
        # First match or '' -- indexing [0] would raise when nothing matches.
        item["tag"] = response.xpath("//p[@class='attrgroup']/span/b/text()").extract_first(default="")
        return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)

    def parse_contact(self, response):
        """Add the contact text to the item received via meta and return it."""
        item = response.meta['item']
        item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
        return item
This is what CloseSpider extension and CLOSESPIDER_ITEMCOUNT setting were made for:
An integer which specifies a number of items. If the spider scrapes
more than that amount of items and those items are passed by the item
pipeline, the spider will be closed with the reason
closespider_itemcount. If zero (or non set), spiders won’t be closed
by number of passed items.
I tried alecxe's answer, but I had to combine all three limits to make it work, so I'm leaving this here in case someone else has the same issue:
class GenericWebsiteSpider(scrapy.Spider):
    """This generic website spider extracts text from websites"""

    name = "generic_website"

    # Per-spider settings: the three limits are set together, all to 15.
    custom_settings = {
        'CLOSESPIDER_PAGECOUNT': 15,
        'CONCURRENT_REQUESTS': 15,
        'CLOSESPIDER_ITEMCOUNT': 15,
    }

    ...

remove the unicode from the output of JSON using scrapy

import scrapy
from ex.items import ExItem
class reddit(scrapy.Spider):
    """docstring for reddit"""

    name = "dmoz"
    allowed_domains = ["reddit.com"]
    start_urls = [
        "http://www.reddit.com/"]

    def parse(self, response):
        """Collect titles, ranks, vote counts and thumbnails from the whole page."""
        item = ExItem()
        item["title"] = response.xpath('//p[contains(@class,"title")]/a/text()').extract()
        item["rank"] = response.xpath('//span[contains(@class,"rank")]/text()').extract()
        item["votes_dislike"] = response.xpath('//div[contains(@class,"score dislikes")]/text()').extract()
        item["votes_unvoted"] = response.xpath('//div[contains(@class,"score unvoted")]/text()').extract()
        item["votes_likes"] = response.xpath('//div[contains(@class,"score likes")]/text()').extract()
        item["video_reference"] = response.xpath('//a[contains(@class,"thumbnail may-blank")]/@href').extract()
        item["image"] = response.xpath('//a[contains(@class,"thumbnail may-blank")]/img/@src').extract()
        # Hand the filled item back to Scrapy so it reaches the exporter.
        return item
I am able to convert this into JSON, but the output contains a bullet character. How can I remove it and still keep the JSON format?
There are hidden elements that you don't see in the browser. Scrapy sees them.
You just need to search for the data inside the relevant part of the page (div with id="siteTable"):
def parse(self, response):
    """Collect the listing fields, searching only inside the ``siteTable`` div.

    Restricting every query to that container leaves out matching elements
    elsewhere on the page.
    """
    # make a selector and search the fields inside it
    sel = response.xpath('//div[@id="siteTable"]')
    item = ExItem()
    # The leading './/' keeps each query relative to the container.
    item["title"] = sel.xpath('.//p[contains(@class,"title")]/a/text()').extract()
    item["rank"] = sel.xpath('.//span[contains(@class,"rank")]/text()').extract()
    item["votes_dislike"] = sel.xpath('.//div[contains(@class,"score dislikes")]/text()').extract()
    item["votes_unvoted"] = sel.xpath('.//div[contains(@class,"score unvoted")]/text()').extract()
    item["votes_likes"] = sel.xpath('.//div[contains(@class,"score likes")]/text()').extract()
    item["video_reference"] = sel.xpath('.//a[contains(@class,"thumbnail may-blank")]/@href').extract()
    item["image"] = sel.xpath('.//a[contains(@class,"thumbnail may-blank")]/img/@src').extract()
    return item
Tested, here is what I get for, for example, votes_likes:
'votes_likes': [u'5340',
u'4041',
u'4080',
u'5055',
u'4385',
u'4784',
u'3842',
u'3734',
u'4081',
u'3731',
u'4580',
u'5279',
u'2540',
u'4345',
u'2068',
u'3715',
u'3249',
u'4232',
u'4025',
u'522',
u'2993',
u'2789',
u'3529',
u'3450',
u'3533'],

Categories