I'm trying to print out a CSV file after scraping using pipelines, but the formatting is a bit weird: instead of writing one review per row, top to bottom, it dumps everything from page 1 into a single row (and then everything from page 2), with all the values crammed together in each column. I have attached pipelines.py and one line from the CSV output (quite large). How do I make it write the values row by row instead of all at once per page?
pipelines.py:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter
class CSVPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ['names', 'stars', 'subjects', 'reviews']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
and output.csv
names stars subjects
Vivek0388,NikhilVashisth,DocSharad,Abhimanyu_swarup,Suresh N,kaushalhkapadia,JyotiMallick,Nitin T,mhdMumbai,SunilTukrel(COLUMN 2) 5 of 5 stars,4 of 5 stars,1 of 5 stars,5 of 5 stars,3 of 5 stars,4 of 5 stars,5 of 5 stars,5 of 5 stars,4 of 5 stars,4 of 5 stars(COLUMN 3) Best Stay,Awesome View... Nice Experience!,Highly mismanaged and dishonest.,A Wonderful Experience,Good place with average front office,Honeymoon,Awesome Resort,Amazing,ooty's beauty!!,Good stay and food
It should look something like this
Vivek0388 5 of 5
NikhilVashisth 5 of 5
DocSharad 5 of 5
...so on
EDIT:
items = [{'reviews': "", 'subjects': "", 'names': "", 'stars': ""} for k in range(1000)]

if sites and len(sites) > 0:
    for site in sites:
        i += 1
        items[i]['names'] = item['names']
        items[i]['stars'] = item['stars']
        items[i]['subjects'] = item['subjects']
        items[i]['reviews'] = item['reviews']
        yield Request(url="http://tripadvisor.in" + site, callback=self.parse)

for k in range(1000):
    yield items[k]
Figured it out: zip() the lists, then loop over the result and write each row. This was MUCH less complicated once I read the csv docs.
import csv
import itertools
class CSVPipeline(object):

    def __init__(self):
        self.csvwriter = csv.writer(open('items.csv', 'wb'), delimiter=',')
        self.csvwriter.writerow(['names', 'stars', 'subjects', 'reviews'])

    def process_item(self, item, spider):
        rows = zip(item['names'], item['stars'], item['subjects'], item['reviews'])
        for row in rows:
            self.csvwriter.writerow(row)
        return item
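For anyone wondering why zip() fixes the layout: each item scraped here carries parallel lists (all names from a page, all star ratings, and so on), and zip() transposes them into one tuple per review, which is exactly what a CSV row should be. A minimal illustration using a couple of values from the sample output above:

names = ['Vivek0388', 'NikhilVashisth']
stars = ['5 of 5 stars', '4 of 5 stars']
subjects = ['Best Stay', 'Awesome View... Nice Experience!']

# zip() pairs up the i-th element of each list, giving one row per review
for row in zip(names, stars, subjects):
    print(row)
# ('Vivek0388', '5 of 5 stars', 'Best Stay')
# ('NikhilVashisth', '4 of 5 stars', 'Awesome View... Nice Experience!')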
Related
I am trying to write to a file using the Scrapy pipelines.py; the item is being parsed correctly and it shows in the terminal when I run the spider.
This is my pipelines.py:
import datetime,csv
class AmazonfullPipeline(object):
    keys = ["Product_Name", "Price", "Amazon_Stock", "rating", "ASIN", "Rank1", "Rank1_category",
            "Rank2", "Rank2_category", "UPC", "Item_Model_Number"]

    def __init__(self):
        now = datetime.datetime.now()
        current_date = now.strftime("%d%b")
        file_name = "TestFile"
        infile = open("{}_{}.csv".format(current_date, file_name), "w").close()
        dict_writer = csv.DictWriter(infile, self.keys)
        dict_writer.writeheader()

    def process_item(self, item, spider):
        self.dict_writer.writerow(item)
Error Message:
dict_writer = csv.DictWriter(infile, self.keys)
File "/usr/lib/python3.6/csv.py", line 140, in __init__
self.writer = writer(f, dialect, *args, **kwds)
TypeError: argument 1 must have a "write" method
You have several problems:
You close the file before you ever use it (open(...).close() returns None);
You never store the writer on the instance. Use self.dict_writer, not a local dict_writer, in __init__.
Check code:
import datetime,csv
class AmazonfullPipeline(object):
    keys = ["Product_Name", "Price", "Amazon_Stock", "rating", "ASIN", "Rank1", "Rank1_category",
            "Rank2", "Rank2_category", "UPC", "Item_Model_Number"]

    def __init__(self):
        now = datetime.datetime.now()
        current_date = now.strftime("%d%b")
        file_name = "TestFile"
        infile = open("{}_{}.csv".format(current_date, file_name), "w")  # <- removed .close() here
        self.dict_writer = csv.DictWriter(infile, self.keys)  # <- added self. here
        self.dict_writer.writeheader()  # <- added self. here

    def process_item(self, item, spider):
        self.dict_writer.writerow(item)
        return item  # pipelines should return the item so later pipelines can see it
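A side note, not part of the original answer: since the traceback shows Python 3.6, the csv module documentation recommends opening the file with newline='' so the writer does not emit blank lines between rows on some platforms, for example:

infile = open("{}_{}.csv".format(current_date, file_name), "w", newline="")  # avoids blank rows on Windows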
I'd like to export data to several columns in a CSV, but I always obtain this kind of file:
[screenshot of the CSV output]
I'd like to obtain two columns, one "articulo" and another one "precio".
My pipelines:
import scrapy
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter
import csv
class MercadoPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ['articulo', 'precio']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Can you help me please?
Here you are:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.exceptions import CloseSpider
from mercado.items import MercadoItem
class MercadoSpider(CrawlSpider):
    name = 'mercado'
    item_count = 0
    allowed_domain = ['www.autodoc.es']
    start_urls = ['https://www.autodoc.es/search?brandNo%5B0%5D=101']

    rules = {
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//span[@class="next"]/a'))),
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="ga-click"]')),
             callback='parse_item', follow=False)
    }

    def parse_item(self, response):
        ml_item = MercadoItem()
        # product info
        ml_item['articulo'] = response.xpath('normalize-space(//*[@id="content"]/div[4]/div[2]/div[1]/div[1]/div/span[1]/span/text())').extract()
        ml_item['precio'] = response.xpath('normalize-space(//*[@id="content"]/div[4]/div[3]/div[2]/p[2]/text())').extract()
        self.item_count += 1
        if self.item_count > 20:
            raise CloseSpider('item_exceeded')
        yield ml_item
There is nothing wrong with the output of your code.
You are getting the two csv columns you want, but the program you are using to view the data is not interpreting it correctly.
By default, CsvItemExporter uses , as the delimiter, and the program seems to expect something else (and possibly even different quoting).
There are two possibilities to solve your problem:
Change the program's settings so it reads the file correctly
Change the way CsvItemExporter exports data (it will pass any additional keyword arguments to the underlying csv.writer object)
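As a sketch of the second option, CsvItemExporter forwards extra keyword arguments straight to csv.writer, so the spider_opened method of the pipeline above could create the exporter like this (the ';' delimiter and QUOTE_ALL are only examples; use whatever the viewing program expects, and note the module needs import csv for QUOTE_ALL):

def spider_opened(self, spider):
    file = open('%s_items.csv' % spider.name, 'w+b')
    self.files[spider] = file
    # extra keyword arguments are passed through to the underlying csv.writer
    self.exporter = CsvItemExporter(file, delimiter=';', quoting=csv.QUOTE_ALL)
    self.exporter.fields_to_export = ['articulo', 'precio']
    self.exporter.start_exporting()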
I am using Scrapy to parse a table containing links and save it in JSON. The links from the table contain additional detail, which is fetched and stored in another JSON file (following this example: https://docs.scrapy.org/en/latest/topics/exporters.html).
To achieve this I am using a pipeline to check the item type and store the result in the appropriate JSON file. However, I am stuck on a weird error. Please see below:
from scrapy import signals
from scrapy.exporters import JsonItemExporter
from for_icu import items
class ListPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        print("spider_opened")
        file_table = open('%s_table.json' % spider.name, 'w+b')
        self.files[spider].append(file_table)
        self.exporter1 = JsonItemExporter(file_table)
        self.exporter1.start_exporting()
        file_detail = open('%s_detail.json' % spider.name, 'w+b')
        self.files[spider].append(file_detail)
        self.exporter2 = JsonItemExporter(file_detail)
        self.exporter2.start_exporting()

    def spider_closed(self, spider):
        print("spider_closed")
        self.exporter1.finish_exporting()
        self.exporter2.finish_exporting()
        for file in self.files.pop(spider):
            file.close()

    def process_item(self, item, spider):
        print("process_item")
        if isinstance(item, items.UniListItem):
            self.exporter1.export_item(item)
            return item
        elif isinstance(item, items.UniDetail):
            self.exporter22.export_item(item)
            return item
Error:
2017-12-27 11:41:15 [scrapy.core.scraper] ERROR: Error processing {'country': ('Finland',),
'country_code': ('fi ',),
'u_link': ('http://www.xxxxxxx.xxx/xxxxxxx/xxxx.htm',),
'u': (' pisto',)}
Traceback (most recent call last):
File "/usr/local/lib/python3.6/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/Users/website_scrapy/for_icu/for_icu/pipelines.py", line 31, in process_item
self.exporter.export_item(item)
AttributeError: 'ListPipeline' object has no attribute 'exporter1'
Please let me know what I am missing here... I have been stuck on this for the past couple of hours...
I was unable to get the exporters to work, so I used a simple file writer for the task:
import json

from for_icu import items


class ListPipeline(object):
    unilist = []
    unidetail = []

    def close_spider(self, spider):
        print("spider_closed")
        file_table = open('%s_table.json' % spider.name, 'w')
        line = json.dumps(self.unilist)
        file_table.write(line)
        file_table.close()
        file_detail = open('%s_detail.json' % spider.name, 'w')
        line = json.dumps(self.unidetail)
        file_detail.write(line)
        file_detail.close()
        self.unilist.clear()
        self.unidetail.clear()

    def process_item(self, item, spider):
        print("process_item")
        if isinstance(item, items.UniListItem):
            self.unilist.append(dict(item))
            return item
        elif isinstance(item, items.UniDetail):
            self.unidetail.append(dict(item))
            return item
This achieves what I want, but it would be better to use the built-in exporters. If someone knows how to make that work, please update.
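For what it's worth, the AttributeError in the original pipeline most likely comes from spider_opened failing before exporter1 is ever assigned: self.files starts as an empty dict, so self.files[spider].append(...) raises a KeyError, and process_item also refers to self.exporter22, which does not exist. A sketch of the exporter-based version with those two spots fixed, assuming the same UniListItem/UniDetail item classes from the question:

from scrapy import signals
from scrapy.exporters import JsonItemExporter

from for_icu import items


class ListPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # create the list for this spider before appending to it
        self.files[spider] = []

        file_table = open('%s_table.json' % spider.name, 'w+b')
        self.files[spider].append(file_table)
        self.exporter1 = JsonItemExporter(file_table)
        self.exporter1.start_exporting()

        file_detail = open('%s_detail.json' % spider.name, 'w+b')
        self.files[spider].append(file_detail)
        self.exporter2 = JsonItemExporter(file_detail)
        self.exporter2.start_exporting()

    def spider_closed(self, spider):
        self.exporter1.finish_exporting()
        self.exporter2.finish_exporting()
        for file in self.files.pop(spider):
            file.close()

    def process_item(self, item, spider):
        if isinstance(item, items.UniListItem):
            self.exporter1.export_item(item)   # table rows
        elif isinstance(item, items.UniDetail):
            self.exporter2.export_item(item)   # detail pages (was exporter22)
        return item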
I have been putting together a piece of code I found on Stack Overflow for splitting the scraped output into separate CSV files per URL, and I came up with the code below. However, I can no longer set fields_to_export in the code. How can I set the fields to export so that they come out like fields_to_export = ['itemA', 'itemB', 'itemC']?
from scrapy import signals
from scrapy.exporters import CsvItemExporter
import re
class appPipeline(object):
    urls = [l.strip() for l in open('data/listOfUrls.txt').readlines()]
    names = [name.group(1) for l in urls for name in [re.search(r'https://www.google.co.uk/', l, re.M | re.I)] if name]

    def __init__(self):
        self.files = {}
        self.exporters = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.files = dict([(name, open('results/' + name + '.csv', 'w+b')) for name in self.names])
        self.exporters = dict([(name, CsvItemExporter(self.files[name])) for name in self.names])
        ### this line is not working: self.exportes.fields_to_export = ['itemA','itemB','itemC']
        [e.start_exporting() for e in self.exporters.values()]

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        myItem = item['myItem']
        if myItem in set(self.names):
            self.exporters[myItem].export_item(item)
        return item
So far I have tried overriding the keys in items, serializing the items, and sorting the values in a dictionary by a list of keys. None of that worked.
Thanks for your help.
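For reference, fields_to_export is an attribute of each individual exporter, not of the dict that holds them, so it has to be set per exporter, either as a constructor argument or on each instance before start_exporting(). A sketch of the spider_opened above with that change (itemA/itemB/itemC are just the placeholder field names from the question):

def spider_opened(self, spider):
    self.files = {name: open('results/' + name + '.csv', 'w+b') for name in self.names}
    # pass fields_to_export to every exporter individually
    self.exporters = {
        name: CsvItemExporter(self.files[name], fields_to_export=['itemA', 'itemB', 'itemC'])
        for name in self.names
    }
    for e in self.exporters.values():
        e.start_exporting()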
I keep getting an error, but I don't see it.
I am new to programming, so if you explain the code, please don't assume I know too much.
#!/usr/bin/env python
# Name:
# Student number:
'''
This script crawls the IMDB top 250 movies.
'''
# Python standard library imports
import os
import sys
import csv
import codecs
import cStringIO
import errno
# Third party library imports:
import pattern
from pattern.web import URL, DOM
# --------------------------------------------------------------------------
# Constants:
TOP_250_URL = 'http://www.imdb.com/chart/top'
OUTPUT_CSV = 'top250movies.csv'
SCRIPT_DIR = os.path.split(os.path.realpath(__file__))[0]
BACKUP_DIR = os.path.join(SCRIPT_DIR, 'HTML_BACKUPS')
# --------------------------------------------------------------------------
# Unicode reading/writing functionality for the Python CSV module, taken
# from the Python.org csv module documentation (very slightly adapted).
# Source: http://docs.python.org/2/library/csv.html (retrieved 2014-03-09).
class UTF8Recoder(object):
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def __iter__(self):
return self
def next(self):
return self.reader.next().encode("utf-8")
class UnicodeReader(object):
"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)
def next(self):
row = self.reader.next()
return [unicode(s, "utf-8") for s in row]
def __iter__(self):
return self
class UnicodeWriter(object):
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([s.encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
# --------------------------------------------------------------------------
# Utility functions (no need to edit):
def create_dir(directory):
'''
Create directory if needed.
Args:
directory: string, path of directory to be made
Note: the backup directory is used to save the HTML of the pages you
crawl.
'''
try:
os.makedirs(directory)
except OSError as e:
if e.errno == errno.EEXIST:
# Backup directory already exists, no problem for this script,
# just ignore the exception and carry on.
pass
else:
# All errors other than an already exising backup directory
# are not handled, so the exception is re-raised and the
# script will crash here.
raise
def save_csv(filename, rows):
'''
Save CSV file with the top 250 most popular movies on IMDB.
Args:
filename: string filename for the CSV file
rows: list of rows to be saved (250 movies in this exercise)
'''
with open(filename, 'wb') as f:
writer = UnicodeWriter(f) # implicitly UTF-8
writer.writerow([
'title', 'runtime', 'genre(s)', 'director(s)', 'writer(s)',
'actor(s)', 'rating(s)', 'number of rating(s)'
])
writer.writerows(rows)
def make_backup(filename, html):
'''
Save HTML to file.
Args:
filename: absolute path of file to save
html: (unicode) string of the html file
'''
with open(filename, 'wb') as f:
f.write(html)
def main():
'''
Crawl the IMDB top 250 movies, save CSV with their information.
Note:
This function also makes backups of the HTML files in a sub-directory
called HTML_BACKUPS (those will be used in grading).
'''
# Create a directory to store copies of all the relevant HTML files (those
# will be used in testing).
print 'Setting up backup dir if needed ...'
create_dir(BACKUP_DIR)
# Make backup of the IMDB top 250 movies page
print 'Access top 250 page, making backup ...'
top_250_url = URL(TOP_250_URL)
top_250_html = top_250_url.download(cached=True)
make_backup(os.path.join(BACKUP_DIR, 'index.html'), top_250_html)
# extract the top 250 movies
print 'Scraping top 250 page ...'
url_strings = scrape_top_250(top_250_url)
# grab all relevant information from the 250 movie web pages
rows = []
for i, url in enumerate(url_strings): # Enumerate, a great Python trick!
print 'Scraping movie %d ...' % i
# Grab web page
movie_html = URL(url).download(cached=True)
# Extract relevant information for each movie
movie_dom = DOM(movie_html)
rows.append(scrape_movie_page(movie_dom))
# Save one of the IMDB's movie pages (for testing)
if i == 83:
html_file = os.path.join(BACKUP_DIR, 'movie-%03d.html' % i)
make_backup(html_file, movie_html)
# Save a CSV file with the relevant information for the top 250 movies.
print 'Saving CSV ...'
save_csv(os.path.join(SCRIPT_DIR, 'top250movies.csv'), rows)
This function below, should return the webpage links of the top 250 movies:
# --------------------------------------------------------------------------
# Functions to adapt or provide implementations for:
def scrape_top_250(url):
'''
Scrape the IMDB top 250 movies index page.
Args:
url: pattern.web.URL instance pointing to the top 250 index page
Returns:
A list of strings, where each string is the URL to a movie's page on
IMDB, note that these URLS must be absolute (i.e. include the http
part, the domain part and the path part).
'''
movie_urls = []
table_rows = dom.by_id('main').by_tag('table')[1].by_tag('tr')
for tr in table_rows[1:]:
a = tr.by_tag('a')[0]
movie_urls.append(clean_unicode(abs_url(a.attributes.get('href', ''), url.string)))
# YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
# URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
# return the list of URLs of each movie's page on IMDB
return movie_urls
#print scrape_top_250(url)
And finaly this function should return specific contents.
def scrape_movie_page(dom):
'''
Scrape the IMDB page for a single movie
Args:
dom: pattern.web.DOM instance representing the page of 1 single
movie.
Returns:
A list of strings representing the following (in order): title, year,
duration, genre(s) (semicolon separated if several), director(s)
(semicolon separated if several), writer(s) (semicolon separated if
several), actor(s) (semicolon separated if several), rating, number
of ratings.
'''
# YOUR SCRAPING CODE GOES HERE:
for p in movie_urls:
p_url = URL(p)
p_dom = DOM(p_url.download(cached=True))
title = clean_unicode(p_dom.by_class('header')[0].content)
title = plaintext(strip_between('<span', '</span>', title))
runtime = clean_unicode(p_dom.by_class('infobar')[0].by_tag('time')[0].content)
duration = runtime
genres = []
for genre in p_dom.by_class('infobar')[0].by_tag('a')[:-1]:
genres.append(clean_unicode(genre.content))
directors = []
writers = []
actors = []
text_blocks = p_dom.by_class('txt-block')[:3]
for t in text_blocks:
spans = t.by_tag('span')
for s in spans:
if s.attributes.get('itemprop') == 'director':
director = s.by_tag('span')[0].by_tag('a')[0].content
directors.append(clean_unicode(director))
if s.attributes.get('itemprop') == 'writer':
p_writer = s.by_tag('span')[0].by_tag('a')[0].content
writers.append(clean_unicode(p_writer))
if s.attributes.get('itemprop') == 'actors':
actor = s.by_tag('span')[0].by_tag('a')[0].content
actors.append(clean_unicode(actor))
rating = []
ratings_count = []
spans = p_dom.by_class('star-box-details')[0].by_tag('span')
for s in spans:
if s.attributes.get('itemprop') == 'ratingValue':
rating = clean_unicode(s.content)
if s.attributes.get('itemprop') == 'ratingCount':
ratings_count = clean_unicode(s.content)
# format the strings from lists
genres = concat_strings(genres)
directors = concat_strings(directors)
writers = concat_strings(writers)
actors = concat_strings(actors)
# Return everything of interest for this movie (all strings as specified
# in the docstring of this function).
return title, duration, genres, directors, writers, actors, rating, \
n_ratings
if __name__ == '__main__':
    main()  # call into the program
# If you want to test the functions you wrote, you can do that here:
# ...
It's just that (in the original revision) you forgot to indent the body of the function scrape_movie_page. The for loop is in module scope.
The most common cause of this error is not indenting the body of the function properly. Sometimes the code looks correctly indented yet still throws the same error; in my experience that comes from mixed indentation. If, within the same block, some lines are indented with tabs and others with spaces, the code can look fine visually but will still raise an indentation error.
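As an illustration of that last point (an aside, not from either answer): Python 2, which this script targets, can be asked to flag inconsistent tab/space use with the -tt flag, and Python 3 rejects it outright. A made-up two-line example:

def scrape_movie_page(dom):
    title = dom.by_class('header')[0].content   # indented with four spaces
	return title                                # indented with a tab; may look aligned in some editors

# Running this under Python 2 with `python -tt script.py` (or under any Python 3)
# rejects the mix with a TabError/IndentationError instead of silently guessing.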