Parse XML from URL into python object - python

The Goodreads website has this API for accessing a user's 'shelves': https://www.goodreads.com/review/list/20990068.xml?key=nGvCqaQ6tn9w4HNpW8kquw&v=2&shelf=toread
It returns XML. I'm building a Django project that shows the books on a shelf from this API, and I'd like to know how (or whether there is a better way than this) to write my view so I can pass an object to my template. Currently, this is what I'm doing:
from xml.dom.minidom import parseString
import urllib2

def homepage(request):
    file = urllib2.urlopen('https://www.goodreads.com/review/list/20990068.xml?key=nGvCqaQ6tn9w4HNpW8kquw&v=2&shelf=toread')
    data = file.read()
    file.close()
    dom = parseString(data)
I'm not entirely sure how to manipulate this object, or whether I'm doing this correctly. I'm following this tutorial.

I'd use xmltodict to turn the XML data structure into a Python dictionary and pass that dictionary to the template in the context:
import urllib2
import xmltodict

from django.shortcuts import render_to_response

def homepage(request):
    file = urllib2.urlopen('https://www.goodreads.com/review/list/20990068.xml?key=nGvCqaQ6tn9w4HNpW8kquw&v=2&shelf=toread')
    data = file.read()
    file.close()

    # parse the XML payload into a nested dict structure
    data = xmltodict.parse(data)
    return render_to_response('my_template.html', {'data': data})
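For reference, xmltodict maps child elements to dictionary keys, attributes to '@'-prefixed keys, and text content to '#text', which is what you'll navigate in the template. A tiny illustrative sketch (made-up input, not the Goodreads payload):
import xmltodict

doc = xmltodict.parse('<shelf name="toread"><book id="1">Dune</book></shelf>')
print(doc['shelf']['@name'])          # 'toread'  (attribute -> '@name')
print(doc['shelf']['book']['#text'])  # 'Dune'    (element text -> '#text')
print(doc['shelf']['book']['@id'])    # '1'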

xmltodict using requests
import requests
import xmltodict
url = "https://yoursite/your.xml"
response = requests.get(url)
data = xmltodict.parse(response.content)
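If the endpoint can fail, it's worth checking the status before parsing; raise_for_status() is the usual requests idiom for that (a minimal sketch around the same call):
response = requests.get(url, timeout=10)
response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx responses
data = xmltodict.parse(response.content)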

xmltodict using urllib3
import traceback

import urllib3
import xmltodict

def getxml():
    url = "https://yoursite/your.xml"
    http = urllib3.PoolManager()
    response = http.request('GET', url)

    data = None  # so the function returns None instead of raising if parsing fails
    try:
        data = xmltodict.parse(response.data)
    except Exception:
        print("Failed to parse xml from response (%s)" % traceback.format_exc())
    return data

Related

Mock/Monkeypatch BeautifulSoup html objects for Pytest

I'm working on a web scraping project in Python and trying to add automated testing with pytest. I'm not new to web scraping, but I'm very new to testing, and I believe the idea here is that I should mock the HTTP request, replacing it with some dummy HTML fixture code, to test whether the rest of the function works without having to request anything from the actual URL.
Below is my web scraping function.
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen

def get_player_stats_data():
    """
    Web Scrape function w/ BS4 that grabs aggregate season stats

    Args:
        None

    Returns:
        Pandas DataFrame of Player Aggregate Season stats
    """
    try:
        year_stats = 2022
        url = f"https://www.basketball-reference.com/leagues/NBA_{year_stats}_per_game.html"
        html = urlopen(url)
        soup = BeautifulSoup(html, "html.parser")

        headers = [th.getText() for th in soup.findAll("tr", limit=2)[0].findAll("th")]
        headers = headers[1:]

        rows = soup.findAll("tr")[1:]
        player_stats = [
            [td.getText() for td in rows[i].findAll("td")] for i in range(len(rows))
        ]

        stats = pd.DataFrame(player_stats, columns=headers)
        print(
            f"General Stats Extraction Function Successful, retrieving {len(stats)} updated rows"
        )
        return stats
    except BaseException as error:
        print(f"General Stats Extraction Function Failed, {error}")
        df = []
        return df
And here is what I'm using to grab the raw HTML of the page and save it to a file, so I can load it back in for testing.
from urllib.request import urlopen

url = "https://www.basketball-reference.com/leagues/NBA_2022_per_game.html"
html = urlopen(url)

# how you save it
with open('new_test/tests/fixture_csvs/stats_html.html', 'wb') as fp:
    while True:
        chunk = html.read(1024)
        if not chunk:
            break
        fp.write(chunk)

# how you open it
with open('new_test/tests/fixture_csvs/stats_html.html', "rb") as fp:
    stats_html = fp.read()
My question is: how do I mock/patch/monkeypatch the urlopen(url) call and use the saved HTML in its place to create a fixture? The pytest docs example creates a class and monkeypatches requests.get(), where get is an attribute of requests, which seems a little different from what I'm doing, and I haven't been able to get mine working. I think I'm supposed to use something other than monkeypatch.setattr? Below is what I tried.
@pytest.fixture(scope="session")
def player_stats_data_raw(monkeypatch):
    """
    Fixture to load web scrape html from an html file for testing.
    """
    fname = os.path.join(
        os.path.dirname(__file__), "fixture_csvs/stats_html.html"
    )
    with open(fname, "rb") as fp:
        html = fp.read()

    def mock_urlopen():
        return html

    monkeypatch.setattr(urlopen, "url", mock_urlopen)
    df = get_player_stats_data()
    return df
### The actual tests in a separate file
def test_raw_stats_rows(player_stats_data_raw):
    assert len(player_stats_data_raw) == 30

def test_raw_stats_schema(player_stats_data_raw):
    assert list(player_stats_data_raw.columns) == raw_stats_cols
The goal is to replace html = urlopen(url) in the web scraping function with this HTML I've previously saved.
The other option is to turn that URL into an input parameter for the function, where in production I call the actual URL as you see here (www.basketballreference.com/etc), and in testing I just read in that saved value. That's an option, but I'm curious to learn and apply this patching technique to a real example. If anyone has any thoughts, I'd appreciate it!
In your test file, you could try something like this:
import os

import pytest

from module.script import get_player_stats_data

@pytest.fixture(scope="session")
def urlopen(mocker):
    # same fixture file path as in the question
    fname = os.path.join(os.path.dirname(__file__), "fixture_csvs/stats_html.html")
    with open(fname, "rb") as fp:
        html = fp.read()

    # patch urlopen where it is used (module.script), not where it is defined
    urlopen = mocker.patch("module.script.urlopen")
    urlopen.return_value = html
    return urlopen

def test_raw_stats_rows(urlopen):
    df = get_player_stats_data()
    assert len(df) == 30

def test_raw_stats_schema(urlopen):
    df = get_player_stats_data()
    assert list(df.columns) == raw_stats_cols  # raw_stats_cols defined elsewhere
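The mocker fixture above comes from the pytest-mock plugin. If you'd rather stick with the built-in monkeypatch fixture the question started from, the same idea works by passing a dotted-path string, so urlopen is replaced in the namespace where the scraping module looks it up. A sketch, assuming the function lives in module.script (note monkeypatch is function-scoped, so this fixture can't be session-scoped):
import os

import pytest

from module.script import get_player_stats_data

@pytest.fixture
def player_stats_df(monkeypatch):
    fname = os.path.join(os.path.dirname(__file__), "fixture_csvs/stats_html.html")
    with open(fname, "rb") as fp:
        html = fp.read()

    def mock_urlopen(url):
        # stand-in for urllib.request.urlopen: takes the url and returns
        # the saved bytes (BeautifulSoup accepts bytes as well as file objects)
        return html

    # dotted-path form: patch the name where it is used, not where it is defined
    monkeypatch.setattr("module.script.urlopen", mock_urlopen)
    return get_player_stats_data()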

Printing Nested Json API data using Python

I was able to receive the data using an API GET request; now I just need help printing a few fields. I'm having trouble because the fields I need are nested pretty deeply. Fields I need:
- CVE ID
- URL ref
- description
- severity
JSON page: https://services.nvd.nist.gov/rest/json/cve/1.0/CVE-2021-40463/
import requests
import json
import pprint

url = "https://services.nvd.nist.gov/rest/json/cve/1.0/CVE-2021-40463/"
params = {"q": "CVE"}
response = requests.get(url, params)
data = json.loads(response.text)
pprint.pprint(data)
import requests
import json
import pprint

url = "https://services.nvd.nist.gov/rest/json/cve/1.0/CVE-2021-40463/"
params = {"q": "CVE"}
response = requests.get(url, params)
data = json.loads(response.content)
pprint.pprint(data)
response.content returns the raw content of the response. After that:
cve ID: pprint.pprint(data['result']['CVE_Items'][0]['cve']['CVE_data_meta']['ID'])
url ref: pprint.pprint(data['result']['CVE_Items'][0]['cve']['references']['reference_data'][0]['url'])
description: pprint.pprint(data['result']['CVE_Items'][0]['cve']['description']['description_data'][0]['value'])
severity: pprint.pprint(data['result']['CVE_Items'][0]['impact']['baseMetricV2']['severity'])
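Pulling those lookups together (same paths as above; the [0] indexes pick the first reference and first description, so loop if you want all of them):
item = data['result']['CVE_Items'][0]

cve_id = item['cve']['CVE_data_meta']['ID']
url_ref = item['cve']['references']['reference_data'][0]['url']
description = item['cve']['description']['description_data'][0]['value']
severity = item['impact']['baseMetricV2']['severity']
print(cve_id, url_ref, description, severity, sep='\n')

# all reference URLs, not just the first
for ref in item['cve']['references']['reference_data']:
    print(ref['url'])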

Parsing XML creates not well-formed error in python

I am attempting to parse an XML document from the url https://www.predictit.org/api/marketdata/all/, using the following code:
import xml.etree.ElementTree as ET
import urllib.request
url = 'https://www.predictit.org/api/marketdata/all/'
response = urllib.request.urlopen(url).read().decode('utf-8')
tree = ET.fromstring(response)
However, I am getting the error ParseError: not well-formed (invalid token): line 1, column 0
What do I need to do in order to convert this to a python object? I am sure this is an XML document, and it appears to parse fine when opened in a browser.
You're most likely getting back JSON. To verify, try printing the value of info() on the HTTPResponse object and look at the "Content-Type":
response = urllib.request.urlopen(url)
print(response.info())
To request XML, create a Request object and set the header (the tree is printed below for testing):
import xml.etree.ElementTree as ET
import urllib.request
url = "https://www.predictit.org/api/marketdata/all/"
request = urllib.request.Request(url, headers={"Content-Type": "application/xml"})
response = urllib.request.urlopen(request)
tree = ET.parse(response)
print(ET.tostring(tree.getroot()).decode())
This will print (truncated to fit SO):
<MarketList xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><Markets><MarketData><ID>2721</ID><Name>Which party will win the 2020 U.S....
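An equivalent sketch with requests, if that's already a dependency (same header as the urllib example above, then hand the body to ElementTree):
import xml.etree.ElementTree as ET
import requests

url = "https://www.predictit.org/api/marketdata/all/"
# same header as the urllib example; without it the API answers in JSON
response = requests.get(url, headers={"Content-Type": "application/xml"})
response.raise_for_status()

root = ET.fromstring(response.content)
print(root.tag)  # 'MarketList'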

python, webscraping, and writing a file

I am using 3 modules in this program, and I don't know if what I'm trying to do is even possible! I want to scrape some data off of Twitter and write it to a text file using Python. Can somebody please guide me and tell me why my code isn't writing the scraped data?
import urllib.request
from os import path
from bs4 import BeautifulSoup

# here I define the url, request the page, and create my soup
theurl = "https://twitter.com/realDonaldTrump"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")

def create_file(dest):
    """
    Creates a file for the user to write data in!
    :param dest:
    :return:
    """
    ## FileName == Month_Day_Year
    name = 'Data Scraped.txt'
    if not path.isfile(dest + name):
        f = open(dest + name, "w")
        f.write(soup.title.text)
        f.close()

if __name__ == '__main__':
    destination = 'C:\\Users\\edwin\\' \
                  'Desktop\\WebScrappin\\'
    create_file(destination)
    print("Your file has been created!!")
print("Your file has been created!!")
You're only writing the title of the document that you received:
f.write(soup.title.text)
Instead of scraping (which is against their ToS), you should gather your data from their RESTful API, or use a library like Twython.
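For reference, a minimal Twython sketch that fetches recent tweets through the API and writes them to a file; the credentials are placeholders you'd obtain from Twitter's developer portal, and get_user_timeline is Twython's wrapper around the user-timeline endpoint:
from twython import Twython

# placeholder credentials from the Twitter developer portal
APP_KEY = '<app key>'
APP_SECRET = '<app secret>'
OAUTH_TOKEN = '<oauth token>'
OAUTH_TOKEN_SECRET = '<oauth token secret>'

twitter = Twython(APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
tweets = twitter.get_user_timeline(screen_name='realDonaldTrump', count=10)

with open('Data Scraped.txt', 'w', encoding='utf-8') as f:
    for tweet in tweets:
        f.write(tweet['text'] + '\n')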

Download a binary file using Python requests module

I need to download a file from an external source. I am using Basic authentication to log in to the URL:
import requests

response = requests.get('<external url>', auth=('<username>', '<password>'))
data = response.json()
html = data['list'][0]['attachments'][0]['url']
print(html)

data = requests.get('<API URL to download the attachment>', auth=('<username>', '<password>'), stream=True)
print(data.content)
I am getting the output below:
<url to download the binary data>
\x00\x00\x13\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f\xcb\x00\x00\x1e\x00\x1e\x00\xbe\x07\x00\x00.\xcf\x05\x00\x00\x00'
I am expecting to download the Word document from the URL within the same session.
Working solution
import requests
import shutil

response = requests.get('<url>', auth=('<username>', '<password>'))
data = response.json()
html = data['list'][0]['attachments'][0]['url']
print(html)

data = requests.get('<url>', auth=('<username>', '<password>'), stream=True)
with open("C:/myfile.docx", 'wb') as f:
    data.raw.decode_content = True
    shutil.copyfileobj(data.raw, f)
I am able to download the file as it is.
When you want to download a file directly, you can use shutil.copyfileobj():
https://docs.python.org/2/library/shutil.html#shutil.copyfileobj
You're already passing stream=True to requests, which is what you need to get a file-like object back. Just pass that as the source to copyfileobj().
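As an aside, a common alternative that avoids touching response.raw (and the decode_content flag) is Response.iter_content(), which yields the body in decoded chunks you can write out directly. A minimal sketch with the same placeholder URL and credentials:
import requests

response = requests.get('<url>', auth=('<username>', '<password>'), stream=True)
response.raise_for_status()

with open('myfile.docx', 'wb') as f:
    # chunk_size is in bytes; iter_content handles transfer decoding
    for chunk in response.iter_content(chunk_size=8192):
        f.write(chunk)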
