I added a pipeline to a sample project; I found it as an answer on Stack Overflow.
It is:
import csv
from craiglist_sample import settings

def write_to_csv(item):
    writer = csv.writer(open(settings.csv_file_path, 'a'), lineterminator='\n')
    writer.writerow([item[key] for key in item.keys()])

class WriteToCsv(object):
    def process_item(self, item, spider):
        write_to_csv(item)
        return item
It writes correctly to a CSV file. Then I changed it to this one:
import csv
import sys
from craiglist_sample import settings
import datetime
import PyRSS2Gen

def write_to_csv(item):
    rss = PyRSS2Gen.RSS2(
        title = "Andrew's PyRSS2Gen feed",
        link = "http://www.dalkescientific.com/Python/PyRSS2Gen.html",
        description = "The latest news about PyRSS2Gen, a "
                      "Python library for generating RSS2 feeds",
        lastBuildDate = datetime.datetime.now(),
        items = [
            PyRSS2Gen.RSSItem(
                title = str(item['title']),
                link = str(item['link']),
                description = "Dalke Scientific today announced PyRSS2Gen-0.0, "
                              "a library for generating RSS feeds for Python. ",
                guid = PyRSS2Gen.Guid("http://www.dalkescientific.com/news/"
                                      "030906-PyRSS2Gen.html"),
                pubDate = datetime.datetime(2003, 9, 6, 21, 31)),
        ])
    rss.write_xml(open("pyrss2gen.xml", "w"))

class WriteToCsv(object):
    def process_item(self, item, spider):
        write_to_csv(item)
        return item
But the problem is that it writes only the last entry to the XML file. How can I fix this? Do I need to add a new line for each entry?
items.py is:
class CraiglistSampleItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = Field()
    link = Field()
Use a to append; you are overwriting each time with w, so you only get the last piece of data:
rss.write_xml(open("pyrss2gen.xml", "a"))
If you look at the original code you can see that it also uses a, not w.
You might want to use with when opening files, or at least close them.
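For example, the last line of write_to_csv could become something like this (a minimal sketch; the rest of the function stays the same):
# open in append mode and let the context manager close the file
with open("pyrss2gen.xml", "a") as f:
    rss.write_xml(f)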
I am trying to separate the output from Scrapy into separate lines in an Excel file, but I get something like this:
In other words, each output for variant id, price, and name should be placed on a separate line in Excel.
I am using the scrapy-xlsx 0.1.1 library to export the output to an xlsx file (it cannot be CSV).
Please tell me where the issue is.
import scrapy
from ..items import ZooplusItem
import re

class ZooplusDeSpider(scrapy.Spider):
    name = 'zooplus_de'
    allowed_domains = ['zooplus.de']
    start_urls = ['https://www.zooplus.de/shop/hunde/hundefutter_trockenfutter/diaetfutter']

    def parse(self, response):
        for link in response.css('.MuiGrid-root.MuiGrid-container.MuiGrid-spacing-xs-2.MuiGrid-justify-xs-flex-end'):
            items = ZooplusItem()
            redirect_urls = response.request.meta.get('redirect_urls')
            items['url'] = redirect_urls[0] if redirect_urls else response.request.url
            items['product_url'] = link.css('.MuiGrid-root.product-image a::attr(href)').getall()
            items['title'] = link.css('h3 a::text').getall()
            items['id'] = link.css('h3 a::attr(id)').getall()
            items['review'] = link.css('span.sc-fzoaKM.kVcaXm::text').getall()
            items['review'] = re.sub(r'\D', " ", str(items['review']))
            items['review'] = items['review'].replace(" ", "")
            #items['review'] = int(items['review'])
            items['rate'] = len(link.css('a.v3-link i[role=full-star]'))
            items['variant_id'] = [i.strip().split('/n') for i in link.css('.jss114.jss115::text').extract()]
            items['variant_name'] = [i.strip().split('/n') for i in link.css('.sc-fzqARJ.cHdpSy:not(.jss114.jss115)::text').extract()]
            items['variant_price'] = [i.strip().split('/n') for i in link.css('div.product__prices_col meta::attr(content)').extract()]
            yield items
If you want to store all the variants with the common information duplicated, then you need to loop through each variant and yield each one separately. You can copy the common information you've already collected and add to it.
In summary, replace
items['variant_id'] = [i.strip().split('/n') for i in link.css('.jss114.jss115::text').extract()]
items['variant_name'] = [i.strip().split('/n') for i in link.css('.sc-fzqARJ.cHdpSy:not(.jss114.jss115)::text').extract()]
items['variant_price'] = [i.strip().split('/n') for i in link.css('div.product__prices_col meta::attr(content)').extract()]
yield items
with something like
for i in link.css("[data-zta='product-variant']"):
    variant = items.copy()
    variant["variant_id"] = i.attrib["data-variant-id"]
    variant["variant_name"] = "".join(i.css(".title > div::text").getall()).strip()
    variant['variant_price'] = i.css("[itemprop='price']::attr(content)").get()
    yield variant
I have a program that is basically a library simulation; you can look up books, edit, delete, etc.
In my program I've initialized some default books into a class such as this:
class BookData:
    def __init__(self):
        self.bookTitle = ''
        self.isbn = ''
        self.author = ''
        self.publisher = ''
        self.dateAdded = ''
        self.quantity = 0.0
        self.wholesale = 0.0
        self.retail = 0.0

    def __str__(self):
        return 'Title: {} ISBN: {} Author: {} ' \
               'Publisher: {} Date Added: {} ' \
               'Quantity: {} Wholesale Value: {} ' \
               'Retail Value: {}'.format(self.bookTitle, self.isbn, self.author, self.publisher, self.dateAdded,
                                         self.quantity, self.wholesale, self.retail)
An example of a book I have stored in the program:
book0.bookTitle, book0.isbn, book0.author, book0.publisher, book0.dateAdded, book0.quantity, book0.wholesale, book0.retail = "INTRODUCING PYTHON", "978-1-4493-5936-2", "Bill Lubanovic", "O'Reilly Media, Inc.", "11/24/2014", 25, 39.95, 50.00
Each book then gets appended to a list.
What I want to do is store all the books in a separate file so that they can be updated and edited within that file, but I don't quite get how to properly open the file, read each part (such as title, ISBN, author), and then, in the main program, turn those into BookData objects and put them into a list.
I've considered a plain .txt document with commas as the format. I don't know if something like JSON or XML would make this easier.
Pseudocode example:
open(file):
    for word in file:
        create book with title, author, isbn, etc in file
        append to list of books
Python natively supports CSV (comma-separated values) files; see the csv module in the Python documentation.
An example would be:
import csv

books = []
with open('file.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        book = BookData()
        book.bookTitle = row[0]
        book.isbn = row[1]
However, that being said, it may be more constructive to change your constructor (ha ha) to take in a row, and then assign it directly:
def __init__(self, row):
    self.bookTitle = row[0]
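For example, a rough sketch of that constructor, assuming the CSV columns are ordered title, ISBN, author, publisher, date added, quantity, wholesale, retail (adjust the indices to your file):
import csv

class BookData:
    def __init__(self, row):
        # unpack one CSV row into the book's fields
        self.bookTitle = row[0]
        self.isbn = row[1]
        self.author = row[2]
        self.publisher = row[3]
        self.dateAdded = row[4]
        self.quantity = int(row[5])
        self.wholesale = float(row[6])
        self.retail = float(row[7])

books = []
with open('file.csv', newline='') as csvfile:
    for row in csv.reader(csvfile):
        books.append(BookData(row))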
Is a dictionary the correct way to be doing this? Ideally this will be more than 5 levels deep. Sorry, my only language experience is PowerShell, where I would just make an array of objects. I'm not looking for someone to write the code, I just want to know if there is a better way.
Thanks
Cody
My PowerShell way:
[$title1,$title2,$title3]
$titleX.comment = "comment here"
$titleX.comment.author = "bob"
$titleX.comment.author.karma = "200"
$titleX.comment.reply = "Hey Bob love your comment."
$titleX.comment.reply.author = "Alex"
$titleX.comment.reply.reply = "I disagree"
#
Python code (broken):
import praw

d = {}
reddit = praw.Reddit(client_id='XXXX',
                     client_secret='XXXX',
                     user_agent='android:com.example.myredditapp:'
                                'v1.2.3 (by /u/XXX)')

for submission in reddit.subreddit('redditdev').hot(limit=2):
    d[submission.id] = {}
    d[submission.id]['comment'] = {}
    d[submission.id]['title'] = {}
    d[submission.id]['comment']['author'] = {}
    d[submission.id]['title'] = submission.title
    mySubmission = reddit.submission(id=submission.id)
    mySubmission.comments.replace_more(limit=0)
    for comment in mySubmission.comments.list():
        d[submission.id]['comment'] = comment.body
        d[submission.id]['comment']['author'] = comment.author.name
        print(submission.title)
        print(comment.body)
        print(comment.author.name)

print(d)
File "C:/git/tensorflow/Reddit/pull.py", line 23, in <module>
d[submission.id]['comment']['author'] = comment.author.name
TypeError: 'str' object does not support item assignment
#
{'6xg24v': {'comment': 'Locking this version. Please comment on the [original post](https://www.reddit.com/r/changelog/comments/6xfyfg/an_update_on_the_state_of_the_redditreddit_and/)!', 'title': 'An update on the state of the reddit/reddit and reddit/reddit-mobile repositories'}}
I think your approach using a dictionary is okay, but you might also solve this by using a data structure for your posts. Instead of writing
d[submission.id] = {}
d[submission.id]['comment'] = {}
d[submission.id]['title']= {}
d[submission.id]['comment']['author']={}
d[submission.id]['title'] = submission.title
you could create a class Submission like this:
class Submission(object):
    def __init__(self, id, author, title, content):
        self.id = id
        self.author = author
        self.title = title
        self.content = content
        self.subSubmissions = {}

    def addSubSubmission(self, submission):
        self.subSubmissions[submission.id] = submission

    def getSubSubmission(self, id):
        return self.subSubmissions[id]
By using it you could change your code to this:
submissions = {}
for sm in reddit.subreddit('redditdev').hot(limit=2):
    submissions[sm.id] = Submission(sm.id, sm.author, sm.title, sm.content)
    # I am not quite sure what these lines are supposed to do, so you might be able to improve these, too
    mySubmission = reddit.submission(id=sm.id)
    mySubmission.comments.replace_more(limit=0)
    for cmt in mySubmission.comments.list():
        submissions[sm.id].addSubSubmission(Submission(cmt.id, cmt.author, cmt.title, cmt.body))
By using this approach you are also able to extract the code that reads out the comments/subSubmissions into a separate function which can call itself recursively, so that you can read arbitrary depths of comments, as sketched below.
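A rough sketch of such a recursive readout (print_tree is just an illustrative name; it assumes the Submission class above):
def print_tree(submission, depth=0):
    # print this submission, then recurse into its children
    print("  " * depth + str(submission.title))
    for child in submission.subSubmissions.values():
        print_tree(child, depth + 1)

for sub in submissions.values():
    print_tree(sub)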
I am trying to scrape a site that returns its data via JavaScript. The code I wrote using BeautifulSoup works pretty well, but at random points during scraping I get the following error:
Traceback (most recent call last):
File "scraper.py", line 48, in <module>
accessible = accessible[0].contents[0]
IndexError: list index out of range
Sometimes I can scrape 4 URLs, sometimes 15, but at some point the script eventually fails and gives me the above error. I can find no pattern behind the failures, so I'm really at a loss here - what am I doing wrong?
from bs4 import BeautifulSoup
import urllib
import urllib2
import jabba_webkit as jw
import csv
import string
import re
import time

countries = csv.reader(open("countries.csv", 'rb'), delimiter=",")
database = csv.writer(open("herdict_database.csv", 'w'), delimiter=',')

basepage = "https://www.herdict.org/explore/"
session_id = "indepth;jsessionid=C1D2073B637EBAE4DE36185564156382"
ccode = "#fc=IN"
end_date = "&fed=12/31/"
start_date = "&fsd=01/01/"
year_range = range(2009, 2011)
years = [str(year) for year in year_range]

def get_number(var):
    number = re.findall("(\d+)", var)
    if len(number) > 1:
        thing = number[0] + number[1]
    else:
        thing = number[0]
    return thing

def create_link(basepage, session_id, ccode, end_date, start_date, year):
    link = basepage + session_id + ccode + end_date + year + start_date + year
    return link

for ccode, name in countries:
    for year in years:
        link = create_link(basepage, session_id, ccode, end_date, start_date, year)
        print link
        html = jw.get_page(link)
        soup = BeautifulSoup(html, "lxml")
        accessible = soup.find_all("em", class_="accessible")
        inaccessible = soup.find_all("em", class_="inaccessible")
        accessible = accessible[0].contents[0]
        inaccessible = inaccessible[0].contents[0]
        acc_num = get_number(accessible)
        inacc_num = get_number(inaccessible)
        print acc_num
        print inacc_num
        database.writerow([name]+[year]+[acc_num]+[inacc_num])
        time.sleep(2)
You need to add error handling to your code. When scraping a lot of websites, some will be malformed or somehow broken. When that happens, you'll be trying to manipulate empty objects.
Look through the code, find all the places where you're assuming something worked, and check for errors.
For that specific case, I would do this:
if not inaccessible or not accessible:
    # malformed page
    continue
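That check has to happen after the find_all calls but before the indexing, so the relevant part of the loop would look roughly like this (a sketch; the rest of the loop stays unchanged):
accessible = soup.find_all("em", class_="accessible")
inaccessible = soup.find_all("em", class_="inaccessible")
if not accessible or not inaccessible:
    # malformed page, skip it and move on to the next link
    continue
accessible = accessible[0].contents[0]
inaccessible = inaccessible[0].contents[0]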
soup.find_all("em", class_="accessible") is probably returning an empty list. You can try:
if accessible:
    accessible = accessible[0].contents[0]
or more generally:
if accessible and inaccessible:
    accessible = accessible[0].contents[0]
    inaccessible = inaccessible[0].contents[0]
else:
    print 'Something went wrong!'
    continue
I want to add a links property to each CouchDB document, based on data in a CSV file.
The value of the links property is to be an array of dicts containing the CouchDB _id of the linked document and the linkType.
When I run the script I get a links error (see error info below).
I am not sure how to create the dict key links if it doesn't exist and add the link data, or otherwise append to the links array if it does exist.
An example of a document with the links will look like this:
{
    _id: p_3,
    name: 'Smurfette',
    links: [
        {to_id: p_2, linkType: 'knows'},
        {to_id: o_56, linkType: 'follows'}
    ]
}
Python script for processing the CSV file:
#!/usr/bin/python
# coding: utf-8
# Version 1
#
# csv fields: ID,fromType,fromID,toType,toID,LinkType,Directional

import csv, sys, couchdb

def csv2couchLinks(database, csvfile):
    # CouchDB Database Connection etc
    server = couchdb.Server()
    # assumes that couchdb runs on http://localhost:5984
    db = server[database]
    # assumes that db is already created

    # CSV file
    data = csv.reader(open(csvfile, "rb"))  # Read in the CSV file rb=read/binary
    csv_links = csv.DictReader(open(csvfile, "rb"))

    def makeLink(from_id, to_id, linkType):
        # get doc from db
        doc = db[from_id]
        # construct link object
        link = {'to_id': to_id, 'linkType': linkType}
        # add link reference to array at key 'links'
        if doc['links'] in doc:
            doc['links'].append(link)
        else:
            doc['links'] = [link]
        # update the record in the database
        db[doc.id] = doc

    # read each row in csv file
    for row in csv_links:
        # get entityTypes as lowercase and entityIDs
        fromType = row['fromType'].lower()
        fromID = row['fromID']
        toType = row['toType'].lower()
        toID = row['toID']
        linkType = row['LinkType']
        # concatenate 'entity type' and 'id' to make couch '_id'
        fromIDcouch = fromType[0] + '_' + fromID  # eg 'p_2' <= person 2
        toIDcouch = toType[0] + '_' + toID
        makeLink(fromIDcouch, toIDcouch, linkType)
        makeLink(toIDcouch, fromIDcouch, linkType)

# Run csv2couchLinks() if this is not an imported module
if __name__ == '__main__':
    DATABASE = sys.argv[1]
    CSVFILE = sys.argv[2]
    csv2couchLinks(DATABASE, CSVFILE)
Error info:
$ python LINKS_csv2couchdb_v1.py "qmhonour" "./tablesAsCsv/links.csv"
Traceback (most recent call last):
File "LINKS_csv2couchdb_v1.py", line 65, in <module>
csv2couchLinks(DATABASE,CSVFILE)
File "LINKS_csv2couchdb_v1.py", line 57, in csv2couchLinks
makeLink(fromIDcouch, toIDcouch, linkType)
File "LINKS_csv2couchdb_v1.py", line 33, in makeLink
if doc['links'] in doc:
KeyError: 'links'
Another option is condensing the if block to this:
doc.setdefault('links', []).append(link)
The dictionary's setdefault method checks to see if links exists in the dictionary, and if it doesn't, it creates a key and makes the value an empty list (the default). It then appends link to that list. If links does exist, it just appends link to the list.
def makeLink(from_id, to_id, linkType):
    # get doc from db
    doc = db[from_id]
    # construct link object
    link = {'to_id': to_id, 'linkType': linkType}
    # add link reference to array at key 'links'
    doc.setdefault('links', []).append(link)
    # update the record in the database
    db[doc.id] = doc
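A tiny standalone illustration of what setdefault does (using made-up data, not the CSV):
d = {}
d.setdefault('links', []).append({'to_id': 'p_2', 'linkType': 'knows'})
d.setdefault('links', []).append({'to_id': 'o_56', 'linkType': 'follows'})
# d is now {'links': [{'to_id': 'p_2', 'linkType': 'knows'}, {'to_id': 'o_56', 'linkType': 'follows'}]}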
Replace:
if doc['links'] in doc:
With:
if 'links' in doc:
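With that change the original branch works as intended (a sketch; the rest of makeLink stays the same):
if 'links' in doc:
    doc['links'].append(link)
else:
    doc['links'] = [link]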