How to consume XML from RESTful web services using Django / Python?

Should I use PyXML or what's in the standard library?

ElementTree is provided as part of the standard Python library. ElementTree is pure Python, and cElementTree is the faster C implementation:
# Try to use the C implementation first, falling back to pure Python
try:
    from xml.etree import cElementTree as ElementTree
except ImportError:
    from xml.etree import ElementTree
Here's an example usage, where I'm consuming XML from a RESTful web service:
def find(*args, **kwargs):
    """Find a book in the collection specified"""
    search_args = [('access_key', api_key),]
    if not is_valid_collection(kwargs['collection']):
        return None
    kwargs.pop('collection')
    for key in kwargs:
        # Only the first keyword is honored
        if kwargs[key]:
            search_args.append(('index1', key))
            search_args.append(('value1', kwargs[key]))
            break
    url = urllib.basejoin(api_url, '%s.xml' % 'books')
    data = urllib.urlencode(search_args)
    req = urllib2.urlopen(url, data)
    rdata = []
    chunk = 'xx'
    while chunk:
        chunk = req.read()
        if chunk:
            rdata.append(chunk)
    tree = ElementTree.fromstring(''.join(rdata))
    results = []
    for i, elem in enumerate(tree.getiterator('BookData')):
        results.append(
            {'isbn': elem.get('isbn'),
             'isbn13': elem.get('isbn13'),
             'title': elem.find('Title').text,
             'author': elem.find('AuthorsText').text,
             'publisher': elem.find('PublisherText').text,}
        )
    return results

I always prefer to use the standard library when possible. ElementTree is well known amongst pythonistas, so you should be able to find plenty of examples. Parts of it have also been optimized in C, so it's quite fast.
http://docs.python.org/library/xml.etree.elementtree.html
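For a quick look at the same ElementTree calls (fromstring, iteration, find, get) without the web-service plumbing, here is a minimal sketch against a made-up XML snippet (the element names mirror the example above; the ISBN is a placeholder):
from xml.etree import ElementTree

# Made-up response body, purely to illustrate the API
xml_data = """
<Books>
  <BookData isbn="0123456789" isbn13="9780123456786">
    <Title>Some Title</Title>
    <AuthorsText>Some Author</AuthorsText>
  </BookData>
</Books>
"""

tree = ElementTree.fromstring(xml_data)
for elem in tree.iter('BookData'):  # getiterator('BookData') on older versions
    print(elem.get('isbn'), elem.find('Title').text)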

There's also BeautifulSoup, which has an API some might prefer. Here's an example on how you can extract all tweets that have been favorited from Twitter's Public Timeline:
from BeautifulSoup import BeautifulStoneSoup
import urllib

url = urllib.urlopen('http://twitter.com/statuses/public_timeline.xml').read()
favorited = []
soup = BeautifulStoneSoup(url)
statuses = soup.findAll('status')
for status in statuses:
    if status.find('favorited').contents != [u'false']:
        favorited.append(status)

Related

How to automate RSS file retrieval in Python

I'm working on a system that scrapes news articles from RSS files and passes them to a sentiment analysis API.
It is my first time working on a project of that scale. I'm at a stage where I can get raw text out of links that are in an RSS file. I now need to put in place a system that can automatically fetch RSS files when they are updated.
Any high-level ideas of how this could be achieved?
Thanks
feedparser does a good job of sourcing RSS feeds. It also has features, not used in this first example, for efficiently fetching only new items (ETags and Last-Modified headers).
Google gave me the site https://blog.feedspot.com/world_news_rss_feeds/ as a source of multiple RSS news feeds. I just scraped this to get a dictionary. Then it's a simple case of looping over the RSS sources.
import requests
import feedparser
from bs4 import BeautifulSoup
import urllib.parse, xml.sax
import pandas as pd

# get some RSS feeds....
resp = requests.get("https://blog.feedspot.com/world_news_rss_feeds/")
soup = BeautifulSoup(resp.content.decode(), "html.parser")
rawfeeds = soup.find_all("h2")
feeds = {}
for rf in rawfeeds:
    a = rf.find("a")
    if a is not None:
        feeds[a.string.replace("RSS Feed", "").strip()] = urllib.parse.parse_qs(a['href'])["q"][0].replace("site:", "")

# now source them all into a dataframe
df = pd.DataFrame()
for k, url in feeds.items():
    try:
        df = pd.concat([df, pd.json_normalize(feedparser.parse(url)["entries"]).assign(Source=k)])
    except (Exception, xml.sax.SAXParseException):
        print(f"invalid xml: {url}")
To make this re-entrant:
- use the etag and modified capabilities of feedparser
- persist the dataframes so that, when run again, it takes off from where it left off

I would use threading so that the fetching is not purely sequential. Obviously with threading you need to think about synchronising your save points. Then you can just run it in a scheduler to periodically source new items in the RSS feeds and get the associated articles.
import feedparser, requests, newspaper
from bs4 import BeautifulSoup
import urllib.parse, xml.sax
from pathlib import Path
import pandas as pd

if not Path.cwd().joinpath("news").is_dir(): Path.cwd().joinpath("news").mkdir()
p = Path.cwd().joinpath("news")

# get some RSS feeds....
if p.joinpath("rss.pickle").is_file():
    dfrss = pd.read_pickle(p.joinpath("rss.pickle"))
else:
    resp = requests.get("https://blog.feedspot.com/world_news_rss_feeds/")
    soup = BeautifulSoup(resp.content.decode(), "html.parser")
    rawfeeds = soup.find_all("h2")
    feeds = []
    for rf in rawfeeds:
        a = rf.find("a")
        if a is not None:
            feeds.append({"name": a.string.replace("RSS Feed", "").strip(),
                          "url": urllib.parse.parse_qs(a['href'])["q"][0].replace("site:", ""),
                          "etag": "", "status": 0, "debug_msg": "", "modified": ""})
    dfrss = pd.DataFrame(feeds).set_index("url")

if p.joinpath("rssdata.pickle").is_file():
    df = pd.read_pickle(p.joinpath("rssdata.pickle"))
else:
    df = pd.DataFrame({"id": [], "link": []})

# now source them all into a dataframe. head() is there for testing purposes
for r in dfrss.head(5).itertuples():
    # print(r.Index)
    try:
        fp = feedparser.parse(r.Index, etag=r.etag, modified=r.modified)
        if fp.bozo == 1: raise Exception(fp.bozo_exception)
    except Exception as e:
        fp = feedparser.FeedParserDict(**{"etag": r.etag, "entries": [], "status": 500, "debug_message": str(e)})
    # keep meta information of what has already been sourced from an RSS feed
    if "etag" in fp.keys(): dfrss.loc[r.Index, "etag"] = fp.etag
    dfrss.loc[r.Index, "status"] = fp.status
    if "debug_message" in fp.keys(): dfrss.loc[r.Index, "debug_msg"] = fp.debug_message
    # 304 means up to date... getting 301 and entries hence test len...
    if len(fp["entries"]) > 0:
        dft = pd.json_normalize(fp["entries"]).assign(Source=r.Index)
        # don't capture items that have already been captured...
        df = pd.concat([df, dft[~dft["link"].isin(df["link"])]])

# save to make re-entrant...
dfrss.to_pickle(p.joinpath("rss.pickle"))
df.to_pickle(p.joinpath("rssdata.pickle"))

# finally get the text...
if p.joinpath("text.pickle").is_file():
    dftext = pd.read_pickle(p.joinpath("text.pickle"))
else:
    dftext = pd.DataFrame({"link": [], "text": []})

# head() is there for testing purposes
for r in df[~df["link"].isin(dftext["link"])].head(5).itertuples():
    a = newspaper.Article(r.link)
    a.download()
    a.parse()
    dftext = pd.concat([dftext, pd.DataFrame([{"link": r.link, "text": a.text}])], ignore_index=True)

dftext.to_pickle(p.joinpath("text.pickle"))
From here you can run whatever analysis you need on the data that has been retrieved.
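The threading mentioned earlier is not shown in the script above; the sketch below is one rough way the per-feed fetching could be parallelised, assuming a name-to-URL mapping like the feeds dict from the first snippet (the single entry here is a placeholder) and a lock around the shared dataframe and its save point:
import threading
from concurrent.futures import ThreadPoolExecutor

import feedparser
import pandas as pd

# Placeholder mapping; in practice this would be the scraped feeds/dfrss from above
feeds = {"Example feed": "https://example.com/feed.xml"}

df = pd.DataFrame()
df_lock = threading.Lock()  # synchronises access to the shared dataframe / save point

def fetch(name, url):
    """Parse one feed and merge its entries into the shared dataframe."""
    global df
    entries = feedparser.parse(url).get("entries", [])
    if not entries:
        return
    dft = pd.json_normalize(entries).assign(Source=name)
    with df_lock:  # only one thread mutates (or persists) df at a time
        df = pd.concat([df, dft])

with ThreadPoolExecutor(max_workers=8) as pool:
    for name, url in feeds.items():
        pool.submit(fetch, name, url)

df.to_pickle("rssdata.pickle")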

Parsing Javascript In Python

I usually use Beautiful Soup to parse the HTML I need, but I came across some JavaScript that I would like to get from here.
<script>
function Model(){
this.players = [{".....data......:""}];...etc
I tried to load it like...
import json
scrape_url = "https://swishanalytics.com/optimus/nba/daily-fantasy-projections?date=2016-12-15"
result = json.loads(scrape_url)
But I get "No Json Can Be Decoded". Not sure how to go about this.
You can extract JSON from arbitrary text with the jsonfinder library:
from jsonfinder import jsonfinder
import requests

scrape_url = "https://swishanalytics.com/optimus/nba/daily-fantasy-projections?date=2016-12-15"
content = requests.get(scrape_url).text

for _, __, obj in jsonfinder(content, json_only=True):
    if (obj and
            isinstance(obj, list) and
            isinstance(obj[0], dict) and
            {'player_id', 'event_id', 'name'}.issubset(obj[0])):
        break
else:
    raise ValueError('data not found')

# Now you can use obj
print(len(obj))
print(obj[0])
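If you would rather not add a dependency, a rough alternative is to slice the array literal out of the page source yourself. This is only a sketch: it assumes the this.players = [...]; assignment appears literally in the page and that the array is valid JSON (which jsonfinder locating it suggests):
import json
import re

import requests

scrape_url = "https://swishanalytics.com/optimus/nba/daily-fantasy-projections?date=2016-12-15"
content = requests.get(scrape_url).text

# Grab everything between "this.players =" and the closing "];"
match = re.search(r'this\.players\s*=\s*(\[.*?\])\s*;', content, re.DOTALL)
if match is None:
    raise ValueError('data not found')

players = json.loads(match.group(1))
print(len(players))
print(players[0])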

Universal Feed Parser issue

I am working on a python script to parse RSS links.
I use the Universal Feed Parser and I am encountering issues with some links, for example while trying to parse the FreeBSD Security Advisories
Here is the sample code:
feed = feedparser.parse(url)
items = feed["items"]
Basically, feed["items"] should return all the entries in the feed (the fields that start with item), but it always comes back empty.
I can also confirm that the following links are parsed as expected:
Ubuntu
Redhat
Is this an issue with the feeds, in that the ones from FreeBSD do not respect the standard?
EDIT:
I am using Python 2.7.
I ended up using feedparser in combination with BeautifulSoup, like Hai Vu proposed.
Here is the sample code I ended up with, slightly changed:
def rss_get_items_feedparser(self, webData):
    feed = feedparser.parse(webData)
    items = feed["items"]
    return items

def rss_get_items_beautifulSoup(self, webData):
    soup = BeautifulSoup(webData)
    for item_node in soup.find_all('item'):
        item = {}
        for subitem_node in item_node.findChildren():
            if subitem_node.name is not None:
                item[str(subitem_node.name)] = str(subitem_node.contents[0])
        yield item

def rss_get_items(self, webData):
    items = self.rss_get_items_feedparser(webData)
    if len(items) > 0:
        return items
    return self.rss_get_items_beautifulSoup(webData)

def parse(self, url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    webData = response.read()
    for item in self.rss_get_items(webData):
        pass  # parse items here
I also tried passing the response directly to rss_get_items, without reading it, but it throws an exception when BeautifulSoup tries to read:
File "bs4/__init__.py", line 161, in __init__
markup = markup.read()
TypeError: 'NoneType' object is not callable
I found out the problem was with the use of namespaces.
For FreeBSD's RSS feed:
<rss xmlns:atom="http://www.w3.org/2005/Atom"
xmlns="http://www.w3.org/1999/xhtml"
version="2.0">
For Ubuntu's feed:
<rss xmlns:atom="http://www.w3.org/2005/Atom"
version="2.0">
When I remove the extra namespace declaration from FreeBSD's feed, everything works as expected.
So what does it mean for you? I can think of a couple of different approaches:
- Use something else, such as BeautifulSoup. I tried it and it seems to work.
- Download the whole RSS feed, apply some search/replace to fix up the namespaces, then use feedparser.parse() afterward (a sketch follows below). This approach is a big hack; I would not use it myself.
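For completeness, the search/replace hack from the second option could look roughly like this sketch; it just strips the default XHTML namespace declaration before handing the document to feedparser (which accepts a string as well as a URL):
import urllib2
import feedparser

url = 'http://www.freebsd.org/security/rss.xml'
raw = urllib2.urlopen(url).read()

# Remove the default namespace declaration that trips up feedparser
raw = raw.replace('xmlns="http://www.w3.org/1999/xhtml"', '')

feed = feedparser.parse(raw)
print len(feed['items'])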
Update
Here is sample code for rss_get_items() which returns a list of items from an RSS feed. Each item is a dictionary with some standard keys such as title, pubdate, link, and guid.
from bs4 import BeautifulSoup
import urllib2

def rss_get_items(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    soup = BeautifulSoup(response)
    for item_node in soup.find_all('item'):
        item = {}
        for subitem_node in item_node.findChildren():
            key = subitem_node.name
            value = subitem_node.text
            item[key] = value
        yield item

if __name__ == '__main__':
    url = 'http://www.freebsd.org/security/rss.xml'
    for item in rss_get_items(url):
        print item['title']
        print item['pubdate']
        print item['link']
        print item['guid']
        print '---'
Output:
FreeBSD-SA-14:04.bind
Tue, 14 Jan 2014 00:00:00 PST
http://security.FreeBSD.org/advisories/FreeBSD-SA-14:04.bind.asc
http://security.FreeBSD.org/advisories/FreeBSD-SA-14:04.bind.asc
---
FreeBSD-SA-14:03.openssl
Tue, 14 Jan 2014 00:00:00 PST
http://security.FreeBSD.org/advisories/FreeBSD-SA-14:03.openssl.asc
http://security.FreeBSD.org/advisories/FreeBSD-SA-14:03.openssl.asc
---
...
Notes:
- I omit error checking for the sake of brevity.
- I recommend only using the BeautifulSoup API when feedparser fails. The reason is that feedparser is the right tool for the job. Hopefully, they will update it to be more forgiving in the future.

How to get all YouTube comments with Python's gdata module?

Looking to grab all the comments from a given video, rather than go one page at a time.
from gdata import youtube as yt
from gdata.youtube import service as yts
client = yts.YouTubeService()
client.ClientLogin(username, pwd) #the pwd might need to be application specific fyi
comments = client.GetYouTubeVideoComments(video_id='the_id')
a_comment = comments.entry[0]
The above code will let you grab a single comment, likely the most recent comment, but I'm looking for a way to grab all the comments at once. Is this possible with Python's gdata module?
Relevant references: the YouTube API docs for comments, the comment feed docs and the Python API docs.
The following achieves what you asked for using the Python YouTube API:
from gdata.youtube import service

USERNAME = 'username@gmail.com'
PASSWORD = 'a_very_long_password'
VIDEO_ID = 'wf_IIbT8HGk'

def comments_generator(client, video_id):
    comment_feed = client.GetYouTubeVideoCommentFeed(video_id=video_id)
    while comment_feed is not None:
        for comment in comment_feed.entry:
            yield comment
        next_link = comment_feed.GetNextLink()
        if next_link is None:
            comment_feed = None
        else:
            comment_feed = client.GetYouTubeVideoCommentFeed(next_link.href)

client = service.YouTubeService()
client.ClientLogin(USERNAME, PASSWORD)

for comment in comments_generator(client, VIDEO_ID):
    author_name = comment.author[0].name.text
    text = comment.content.text
    print("{}: {}".format(author_name, text))
Unfortunately the API limits the number of entries that can be retrieved to 1000. This was the error I got when I tried a tweaked version with a hand crafted GetYouTubeVideoCommentFeed URL parameter:
gdata.service.RequestError: {'status': 400, 'body': 'You cannot request beyond item 1000.', 'reason': 'Bad Request'}
Note that the same principle should apply to retrieve entries in other feeds of the API.
If you want to hand craft the GetYouTubeVideoCommentFeed URL parameter, its format is:
'https://gdata.youtube.com/feeds/api/videos/{video_id}/comments?start-index={start_index}&max-results={max_results}'
The following restrictions apply: start-index <= 1000 and max-results <= 50.
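For instance, a hand-crafted paging loop under those restrictions might look like this sketch (it assumes, as the next_link usage above suggests, that GetYouTubeVideoCommentFeed also accepts a full feed URI):
from gdata.youtube import service

VIDEO_ID = 'wf_IIbT8HGk'
URL_TEMPLATE = ('https://gdata.youtube.com/feeds/api/videos/%s/comments'
                '?start-index=%d&max-results=%d')

client = service.YouTubeService()
# client.ClientLogin(USERNAME, PASSWORD)  # as above, if authentication is required

max_results = 50
comments = []
for start_index in range(1, 1001, max_results):  # start-index must stay <= 1000
    feed = client.GetYouTubeVideoCommentFeed(URL_TEMPLATE % (VIDEO_ID, start_index, max_results))
    if not feed.entry:
        break
    comments.extend(feed.entry)

print('%d comments fetched' % len(comments))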
This is the only solution I've got for now; it doesn't use the API and gets slow when there are several thousand comments.
import bs4, re, urllib2

# grab the page source for the video
data = urllib2.urlopen(r'http://www.youtube.com/all_comments?v=video_id')  # example: XhFtHW4YB7M

# pull out comments
soup = bs4.BeautifulSoup(data)
cmnts = soup.findAll(attrs={'class': 'comment yt-tile-default'})

# do something with them, i.e. count them
print len(cmnts)
Note that because 'class' is a reserved word in Python you can't pass it as a regular keyword parameter, so you have to search via the attrs dict, which makes regex or lambda searches like startswith awkward. It also gets pretty slow due to BeautifulSoup, but it needs to be used because etree and minidom don't find the matching tags for some reason, even after prettify()-ing with bs4.

Extracting data from a URL result with special formatting

I have a URL:
http://somewhere.com/relatedqueries?limit=2&query=seedterm
where modifying the inputs, limit and query, will generate the wanted data. limit is the maximum number of terms possible and query is the seed term.
The URL provides text result formatted in this way:
oo.visualization.Query.setResponse({version:'0.5',reqId:'0',status:'ok',sig:'1303596067112929220',table:{cols:[{id:'score',label:'Score',type:'number',pattern:'#,##0.###'},{id:'query',label:'Query',type:'string',pattern:''}],rows:[{c:[{v:0.9894380670262618,f:'0.99'},{v:'newterm1'}]},{c:[{v:0.9894380670262618,f:'0.99'},{v:'newterm2'}]}],p:{'totalResultsCount':'7727'}}});
I'd like to write a Python script that takes two arguments (the limit number and the query seed), fetches the data online, parses the result and returns a list with the new terms, ['newterm1','newterm2'] in this case.
I'd love some help, especially with the URL fetching since I have never done this before.
It sounds like you can break this problem up into several subproblems.
Subproblems
There are a handful of problems that need to be solved before composing the completed script:
Forming the request URL: Creating a configured request URL from a template
Retrieving data: Actually making the request
Unwrapping JSONP: The returned data appears to be JSON wrapped in a JavaScript function call
Traversing the object graph: Navigating through the result to find the desired bits of information
Forming the request URL
This is just simple string formatting.
url_template = 'http://somewhere.com/relatedqueries?limit={limit}&query={seedterm}'
url = url_template.format(limit=2, seedterm='seedterm')
Python 2 Note
You will need to use the string formatting operator (%) here.
url_template = 'http://somewhere.com/relatedqueries?limit=%(limit)d&query=%(seedterm)s'
url = url_template % dict(limit=2, seedterm='seedterm')
Retrieving data
You can use the built-in urllib.request module for this.
import urllib.request
data = urllib.request.urlopen(url) # url from previous section
This returns a file-like object called data. You can also use a with-statement here:
with urllib.request.urlopen(url) as data:
    ...  # do processing here
Python 2 Note
Import urllib2 instead of urllib.request.
Unwrapping JSONP
The result you pasted looks like JSONP. Given that the wrapping function that is called (oo.visualization.Query.setResponse) doesn't change, we can simply strip this method call out.
result = data.read().decode('utf-8')  # bytes from the response, decoded to str

prefix = 'oo.visualization.Query.setResponse('
suffix = ');'
if result.startswith(prefix) and result.endswith(suffix):
    result = result[len(prefix):-len(suffix)]
Parsing JSON
The resulting result string is just JSON data. Parse it with the built-in json module.
import json
result_object = json.loads(result)
Traversing the object graph
Now, you have a result_object that represents the JSON response. The object itself will be a dict with keys like version, reqId, and so on. Based on your question, here is what you would need to do to create your list.
# Get the rows in the table, then get the second column's value for
# each row
terms = [row['c'][1]['v'] for row in result_object['table']['rows']]
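As a quick check of that indexing, here are the two rows from the sample response written out as Python literals:
rows = [
    {'c': [{'v': 0.9894380670262618, 'f': '0.99'}, {'v': 'newterm1'}]},
    {'c': [{'v': 0.9894380670262618, 'f': '0.99'}, {'v': 'newterm2'}]},
]
terms = [row['c'][1]['v'] for row in rows]
print(terms)  # ['newterm1', 'newterm2']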
Putting it all together
#!/usr/bin/env python3
"""A script for retrieving and parsing results from requests to
somewhere.com.

This script works as either a standalone script or as a library. To use
it as a standalone script, run it as `python3 scriptname.py`. To use it
as a library, use the `retrieve_terms` function."""
import urllib.request
import json
import sys

E_OPERATION_ERROR = 1
E_INVALID_PARAMS = 2

def parse_result(result):
    """Parse a JSONP result string and return a list of terms"""
    prefix = 'oo.visualization.Query.setResponse('
    suffix = ');'
    # Strip JSONP function wrapper
    if result.startswith(prefix) and result.endswith(suffix):
        result = result[len(prefix):-len(suffix)]
    # Deserialize JSON to Python objects
    result_object = json.loads(result)
    # Get the rows in the table, then get the second column's value
    # for each row
    return [row['c'][1]['v'] for row in result_object['table']['rows']]

def retrieve_terms(limit, seedterm):
    """Retrieves and parses data and returns a list of terms"""
    url_template = 'http://somewhere.com/relatedqueries?limit={limit}&query={seedterm}'
    url = url_template.format(limit=limit, seedterm=seedterm)
    try:
        with urllib.request.urlopen(url) as data:
            result = data.read().decode('utf-8')
    except:
        print('Could not request data from server', file=sys.stderr)
        exit(E_OPERATION_ERROR)
    return parse_result(result)

def main(limit, seedterm):
    """Retrieves and parses data and prints each term to standard output"""
    terms = retrieve_terms(limit, seedterm)
    for term in terms:
        print(term)

if __name__ == '__main__':
    try:
        limit = int(sys.argv[1])
        seedterm = sys.argv[2]
    except:
        error_message = '''{} limit seedterm
limit must be an integer'''.format(sys.argv[0])
        print(error_message, file=sys.stderr)
        exit(E_INVALID_PARAMS)
    exit(main(limit, seedterm))
Python 2.7 version
#!/usr/bin/env python2.7
"""A script for retrieving and parsing results from requests to
somewhere.com.

This script works as either a standalone script or as a library. To use
it as a standalone script, run it as `python2.7 scriptname.py`. To use it
as a library, use the `retrieve_terms` function."""
import urllib2
import json
import sys

E_OPERATION_ERROR = 1
E_INVALID_PARAMS = 2

def parse_result(result):
    """Parse a JSONP result string and return a list of terms"""
    prefix = 'oo.visualization.Query.setResponse('
    suffix = ');'
    # Strip JSONP function wrapper
    if result.startswith(prefix) and result.endswith(suffix):
        result = result[len(prefix):-len(suffix)]
    # Deserialize JSON to Python objects
    result_object = json.loads(result)
    # Get the rows in the table, then get the second column's value
    # for each row
    return [row['c'][1]['v'] for row in result_object['table']['rows']]

def retrieve_terms(limit, seedterm):
    """Retrieves and parses data and returns a list of terms"""
    url_template = 'http://somewhere.com/relatedqueries?limit=%(limit)d&query=%(seedterm)s'
    url = url_template % dict(limit=limit, seedterm=seedterm)
    try:
        data = urllib2.urlopen(url)
        result = data.read()
    except:
        sys.stderr.write('Could not request data from server\n')
        exit(E_OPERATION_ERROR)
    return parse_result(result)

def main(limit, seedterm):
    """Retrieves and parses data and prints each term to standard output"""
    terms = retrieve_terms(limit, seedterm)
    for term in terms:
        print term

if __name__ == '__main__':
    try:
        limit = int(sys.argv[1])
        seedterm = sys.argv[2]
    except:
        error_message = '''{} limit seedterm
limit must be an integer'''.format(sys.argv[0])
        sys.stderr.write('%s\n' % error_message)
        exit(E_INVALID_PARAMS)
    exit(main(limit, seedterm))
I didn't understand your problem well, because from the response you pasted it seems to me that you are using the Visualization API (it's the first time I've heard about it, by the way).
But if you are just searching for a way to fetch data from a web page, you can use urllib2; that is only for getting the data, and if you want to parse the retrieved data you will have to use a more appropriate library like BeautifulSoup.
If you are dealing with another web service (RSS, Atom, RPC) rather than web pages, you can find a bunch of Python libraries that deal with each service perfectly.
import urllib2
from BeautifulSoup import BeautifulSoup

result = urllib2.urlopen('http://somewhere.com/relatedqueries?limit=%s&query=%s' % (2, 'seedterm'))
htmltext = result.read()
result.close()

soup = BeautifulSoup(htmltext, convertEntities="html")
# you can parse your data now, check the BeautifulSoup API.
