How to write scraped data into a CSV file in Scrapy? - python

I am trying to scrape a website by extracting its sub-links and their titles, and then save the extracted titles and their associated links into a CSV file. When I run the following code, the CSV file is created but it is empty. Any help?
My Spider.py file looks like this:
from scrapy import cmdline
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor

class HyperLinksSpider(CrawlSpider):
    name = "linksSpy"
    allowed_domains = ["some_website"]
    start_urls = ["some_website"]
    rules = (Rule(LinkExtractor(allow=()), callback='parse_obj', follow=True),)

    def parse_obj(self, response):
        items = []
        for link in LinkExtractor(allow=(), deny=self.allowed_domains).extract_links(response):
            item = ExtractlinksItem()
            for sel in response.xpath('//tr/td/a'):
                item['title'] = sel.xpath('/text()').extract()
                item['link'] = sel.xpath('/@href').extract()
            items.append(item)
        return items

cmdline.execute("scrapy crawl linksSpy".split())
My pipelines.py is:
import csv

class ExtractlinksPipeline(object):

    def __init__(self):
        self.csvwriter = csv.writer(open('Links.csv', 'wb'))

    def process_item(self, item, spider):
        self.csvwriter.writerow((item['title'][0]), item['link'][0])
        return item
My items.py is:
import scrapy

class ExtractlinksItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    link = scrapy.Field()
    pass
I have also changed my settings.py:
ITEM_PIPELINES = {'extractLinks.pipelines.ExtractlinksPipeline': 1}

To output all scraped data, Scrapy has a built-in feature called Feed Exports.
To put it shortly, all you need are two settings in your settings.py file: FEED_FORMAT - the format in which the feed should be saved, csv in your case - and FEED_URI - the location where the feed should be saved, e.g. ~/my_feed.csv.
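For reference, a minimal sketch of those two settings (the file name here is just an example):

# settings.py
FEED_FORMAT = 'csv'        # serialize every yielded item as a CSV row
FEED_URI = 'Links.csv'     # any file path or URI where the feed should be written

With these in place, running scrapy crawl linksSpy writes the yielded items straight to Links.csv, with no custom pipeline needed.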
My related answer covers it in greater detail with a use case:
https://stackoverflow.com/a/41473241/3737009

Related

Why do my two scrapy program parts work separately, but not when I merge them?

My problem is that my two scrapy program parts do what they are supposed to do independently, but when I merge them I get no output.
The program is supposed to follow a large number of links at different depth levels. On the last and deepest level, the program should render the HTML page with the Python library scrapy_requests, extract the desired information with the help of the ItemLoader, and save it in a CSV file.
spider.py looks like this:
import scrapy
from scrapy_requests import HtmlRequest
from SR.items import SrItem
from scrapy.loader import ItemLoader

class SrspiderSpider(scrapy.Spider):
    name = 'SRspider'
    start_urls = ['https://www.microbiologyresearch.org/content/journal/ijsem/browse']

    def parse(self, response):
        # get all volume links and follow them
        for volume in response.css('h3.h5 a::attr(href)'):
            yield response.follow(volume.get(), callback=self.parse_issues)

    def parse_issues(self, response):
        # get all issue links within a volume and follow them
        for issue in response.css('li.issue a::attr(href)'):
            yield response.follow(issue.get(), callback=self.parse_articles)

    def parse_articles(self, response):
        # get all article links within an issue and follow them
        for article in response.css('span.articleTitle.js-articleTitle.title a::attr(href)'):
            yield response.follow(article.get(), callback=self.parse_html)

    def parse_html(self, response):
        url = response.request.url
        yield HtmlRequest(url=url, callback=self.parse_content, render=True, options={'sleep': 1})

    def parse_content(self, response):
        loader = ItemLoader(item=SrItem(), selector=response)
        loader.add_xpath('title', "//h1[@class='item-meta-data__item-title']")
        loader.add_xpath('abstract', "//div[@class='articleSection article-abstract']/p")
        loader.add_xpath('paragraph', "//div[@class='articleSection']/p")
        yield loader.load_item()
items.py looks like this:
import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags, replace_escape_chars

class SrItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field(input_processor=MapCompose(remove_tags, replace_escape_chars), output_processor=TakeFirst())
    abstract = scrapy.Field(input_processor=MapCompose(remove_tags, replace_escape_chars), output_processor=TakeFirst())
    paragraph = scrapy.Field(input_processor=MapCompose(remove_tags, replace_escape_chars))
If I just follow all the links, that part of the program works. If I just render a page and extract the needed data, that works too. But when I put the two parts together and try to do everything in one run, it doesn't work. What could be the reason? The page is JavaScript-based and unfortunately has to be rendered to get the information.
My output command in the shell is: scrapy crawl SRspider -o test.csv
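As a side note on the export step only (not the rendering problem): on Scrapy 2.1 or later, the same CSV feed can also be declared on the spider itself instead of being passed with -o; a minimal sketch:

class SrspiderSpider(scrapy.Spider):
    name = 'SRspider'
    # equivalent to running "scrapy crawl SRspider -o test.csv"
    custom_settings = {
        'FEEDS': {'test.csv': {'format': 'csv'}},
    }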

Network Graph output from Scrapy

I'm pretty new to using Scrapy and I'm having difficulties. I'm trying to work with scrapy to crawl a website and return a list of nodes and edges to build a network graph of internal and external websites from my start page to a depth of x (to be determined).
I have the following code and I'm having trouble figuring out what the issue is.
My items.py file looks like this:
from scrapy.item import Item, Field

class SitegraphItem(Item):
    url = Field()
    linkedurls = Field()
My graphspider.py file is as follows:
from scrapy.selector import HtmlXPathSelector
from scrapy.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.utils.url import urljoin_rfc
from sitegraph.items import SitegraphItem

class GraphspiderSpider(CrawlSpider):
    name = 'graphspider'
    allowed_domains = ['example.com']
    start_urls = ['https://www.example.com/products/']

    rules = (
        Rule(LinkExtractor(allow=r'/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        i = SitegraphItem()
        i['url'] = response.url
        i['http_status'] = response.status
        llinks = []
        for anchor in hxs.select('//a[@href]'):
            href = anchor.select('@href').extract()[0]
            if not href.lower().startswith("javascript"):
                llinks.append(urljoin_rfc(response.url, href))
        i['linkedurls'] = llinks
        return i
and I modified the settings.py file to include:
BOT_NAME = 'sitegraph'
SPIDER_MODULES = ['sitegraph.spiders']
NEWSPIDER_MODULE = 'sitegraph.spiders'
FEED_FORMAT="jsonlines"
FEED_URI="C:\\Users\Merrie\\Desktop\\testscrape\\sitegraph\\sitegraph.json"
When I run it I'm using the following code:
$ scrapy crawl graphspider -o attempt2.csv
And my output table is empty. It also keeps throwing this error: "KeyError: 'SitegraphItem does not support field: http_status'"
The missing http_status field in your items.py causes the error; please update it:
from scrapy.item import Item, Field

class SitegraphItem(Item):
    url = Field()
    linkedurls = Field()
    http_status = Field()

Scrapy saves scraped data in CSV on one line instead of multiple lines

I'm using Scrapy to scrape URLs from a website and save the results in a CSV file, but it is saving them on one line only instead of multiple lines. I tried to search for an answer on Stack Overflow, but in vain. Here is my file:
import scrapy
from scrapy.item import Field, Item
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from uomscraprbot.items import UomscraprbotItem

class uomsitelinks(scrapy.Spider):
    name = "uom"
    allowed_domains = ["uom.ac"]
    start_urls = [
        "http://www.uom.ac.mu/"]

    def parse(self, response):
        # print response.xpath('//body//li/a/@href').extract()
        item = UomscraprbotItem()
        item['url'] = response.xpath('//body//li/a/@href').extract()
        return item
I used: scrapy crawl uom -o uom.csv -t csv
I want it to save like this:
www.a.com,
www.b.com,
www.c.com
and not
www.a.com,www.b.com,www.c.com
Where did I go wrong in my code?
You need to process each URL separately:
def parse(self, response):
    # print response.xpath('//body//li/a/@href').extract()
    for item_url in response.xpath('//body//li/a/@href').extract():
        item = UomscraprbotItem()
        item['url'] = item_url
        yield item

CSV export is not working

This is my spider class:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors.sgml import SgmlLinkExtractor
import csv

class StackItem(scrapy.Item):
    job_role = scrapy.Field()
    company = scrapy.Field()
    location = scrapy.Field()
    desc = scrapy.Field()
    read_more = scrapy.Field()

class newJobSpider(CrawlSpider):
    name = "newFlaskSpider"
    allowed_domains = ["placementindia.com"]
    start_urls = ["http://jobs.placementindia.com/lucknow"]

    rules = (Rule(SgmlLinkExtractor(allow=('.*\?id1=.*',), restrict_xpaths=('//a[@class="prevNext next"]',)),
                  callback="parse_items", follow=True),)

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        posts = hxs.select("//article[@class='classified']")
        items = []
        for post in posts:
            item = StackItem()
            item["job_role"] = post.select("div[@class='uu mb2px']/a/strong/text()").extract()
            item["company"] = post.select("p[1]/text()").extract()
            item["location"] = post.select("p[@class='mb5px b red']/text()").extract()
            item["desc"] = post.select("details[@class='aj mb10px']/text()").extract()
            item["read_more"] = post.select("div[@class='uu mb2px']/a/@href").extract()
            items.append(item)
        for item in items:
            yield item
And this is my item pipeline:
class myExporter(object):

    def __init__(self):
        self.myCSV = csv.writer(open('output6.csv', 'wb'))
        self.myCSV.writerow([item['job_role'], item['company'], item['location'], item['desc'], item['read_more']])

    def process_item(self, item, spider):
        self.myCSV.writerow([item['job_role'], item['company'], item['location'], item['desc'], item['read_more']])
        return item
When they are in separate classes it runs fine and I get results in the CSV file. Due to my project requirements, I need the CSV exporter class inside the spider definition. How can these classes be combined?
There are two approaches to this problem:
1) Why nest a class at all?
If you want the export logic inside the spider class itself, there is no need for a separate exporter class at all. In that case the spider can write the items to the CSV file directly: open the file in the spider, write one row per scraped post in parse_items, and skip yielding items through a pipeline entirely, as sketched below.
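A rough sketch of that idea, reusing the imports, rule and XPaths from the spider above (the file handling is kept deliberately simple; treat it as an illustration, not a drop-in solution):

class newJobSpider(CrawlSpider):
    name = "newFlaskSpider"
    allowed_domains = ["placementindia.com"]
    start_urls = ["http://jobs.placementindia.com/lucknow"]
    rules = (Rule(SgmlLinkExtractor(allow=('.*\?id1=.*',), restrict_xpaths=('//a[@class="prevNext next"]',)),
                  callback="parse_items", follow=True),)

    def __init__(self, *args, **kwargs):
        super(newJobSpider, self).__init__(*args, **kwargs)
        # the spider owns the CSV writer, so no pipeline is registered at all
        self.csv_writer = csv.writer(open('output6.csv', 'wb'))  # Python 2 style, as in your exporter

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        for post in hxs.select("//article[@class='classified']"):
            # write one row per post instead of yielding items
            self.csv_writer.writerow([
                post.select("div[@class='uu mb2px']/a/strong/text()").extract(),
                post.select("p[1]/text()").extract(),
                post.select("p[@class='mb5px b red']/text()").extract(),
                post.select("details[@class='aj mb10px']/text()").extract(),
                post.select("div[@class='uu mb2px']/a/@href").extract(),
            ])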
2) Exporter in the same file as the spider
If you have your exporter in the same file as your spider, you have to point to this exporter class from your settings.py file. Imagine your project is located in the new_job folder and the Python file of your spider is called newjob.py. In this case you can add the following line to settings.py:
ITEM_PIPELINES = {'new_job.spiders.newjob.myExporter' : 90,}
By the way, when I copied your code I got some errors that I needed to fix, so I wonder whether it ever worked for you.

Scrapy XPath all the links on the page

I am trying to collect all the URLs under a domain using Scrapy. I was trying to use a CrawlSpider to start from the homepage and crawl the whole site. For each page, I want to use XPath to extract all the hrefs and store the data as key-value pairs:
Key: the current Url
Value: all the links on this page.
class MySpider(CrawlSpider):
    name = 'abc.com'
    allowed_domains = ['abc.com']
    start_urls = ['http://www.abc.com']

    rules = (Rule(SgmlLinkExtractor()), )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        item = AbcItem()
        item['key'] = response.url
        item['value'] = hxs.select('//a/@href').extract()
        return item
I defined my AbcItem() like below:
from scrapy.item import Item, Field

class AbcItem(Item):
    # key: url
    # value: list of links existing in the key url
    key = Field()
    value = Field()
    pass
And when I run my code like this:
nohup scrapy crawl abc.com -o output -t csv &
The robot seems to have started crawling, and I can see the nohup.out file being populated with all the configuration logs, but there is no information in my output file, which is what I am trying to collect. Can anyone help me with this? What might be wrong with my robot?
You should have defined a callback for a rule. Here's an example for getting all links from twitter.com main page (follow=False):
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.item import Item, Field

class MyItem(Item):
    url = Field()

class MySpider(CrawlSpider):
    name = 'twitter.com'
    allowed_domains = ['twitter.com']
    start_urls = ['http://www.twitter.com']

    rules = (Rule(SgmlLinkExtractor(), callback='parse_url', follow=False), )

    def parse_url(self, response):
        item = MyItem()
        item['url'] = response.url
        return item
Then, in the output file, I see:
http://status.twitter.com/
https://twitter.com/
http://support.twitter.com/forums/26810/entries/78525
http://support.twitter.com/articles/14226-how-to-find-your-twitter-short-code-or-long-code
...
Hope that helps.
If you don't set the callback function explicitly, Scrapy will use the parse method to process crawled pages. So you should either add parse_item as the callback of your rule, or rename the method to parse.
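Applied to the spider from the question, that is a one-line change (a sketch; everything else stays as it is):

# inside MySpider: give the rule a callback; keep follow=True explicitly,
# because follow defaults to False once a callback is set
rules = (Rule(SgmlLinkExtractor(), callback='parse_item', follow=True), )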
