I'm using python 3.6.1 and have the following code which successfully retrieves data in JSON format:
import urllib.request,json,pprint
# Address of the endpoint that serves the JSON document.
url = "https://someurl"
# The context manager closes the HTTP connection as soon as the body has been
# read, even when reading or parsing raises, instead of leaking the socket.
with urllib.request.urlopen(url) as response:
    data = json.loads(response.read())
pprint.pprint(data)
I want to wrap this in a function so I can reuse it. This is what I have tried in a file called getdata.py:
from urllib.request import urlopen
import json
def get_json_data(url):
    """Fetch *url* and return its body parsed as JSON.

    Args:
        url: Address to request; anything ``urlopen`` accepts.

    Returns:
        The decoded JSON document (dict, list, str, number, bool or None).

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
        ValueError: If the body cannot be decoded or is not valid JSON.
    """
    # Closing the response releases the connection deterministically.
    with urlopen(url) as response:
        # Honour the charset announced by the server, defaulting to UTF-8.
        charset = response.headers.get_content_charset() or "utf-8"
        # json.loads only accepts bytes from Python 3.6 on; decoding first
        # keeps the function usable on older Python 3 interpreters too.
        return json.loads(response.read().decode(charset))
and this is the error i get after importing the file and attempting to print out the response:
>>> import getdata
>>> print(getdata.get_json_data("https://someurl"))
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Users\Nick\getdata.py", line 6, in get_json_data
from urllib.request import urlopen
NameError: name 'urllib' is not defined
i also tried this and got the same error:
import urllib.request,json
def get_json_data(url):
    """Fetch *url* and return its body parsed as JSON.

    Args:
        url: Address to request; anything ``urllib.request.urlopen`` accepts.

    Returns:
        The decoded JSON document (dict, list, str, number, bool or None).

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
        ValueError: If the body cannot be decoded or is not valid JSON.
    """
    # Closing the response releases the connection deterministically.
    with urllib.request.urlopen(url) as response:
        # Honour the charset announced by the server, defaulting to UTF-8.
        charset = response.headers.get_content_charset() or "utf-8"
        # json.loads only accepts bytes from Python 3.6 on; decoding first
        # keeps the function usable on older Python 3 interpreters too.
        return json.loads(response.read().decode(charset))
What do i need to do to get this to work please?
cheers
It's working now! I think the problem was the Hydrogen add-on I have for the Atom editor. I uninstalled it, tried again and it worked. Thanks for looking.
Related
I have a small public repo with the following python code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib.request, urllib.parse, urllib.error
import json
from urllib.request import urlopen, Request
import os
def get_json_from_url(url):
    """Request *url* with the configured GitHub token and return the parsed JSON.

    The token is read from the ``token`` key of ``config.json``, which must
    sit in the same directory as this script. It is sent in the
    ``Authorization`` header together with the
    ``application/vnd.github.inertia-preview+json`` ``Accept`` header.

    Args:
        url: Address of the API resource to fetch.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If ``config.json`` cannot be read.
        KeyError: If ``config.json`` has no ``token`` entry.
        urllib.error.URLError: If the request fails.
        ValueError: If the config file or the response is not valid JSON.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    # Context managers close the config file and the HTTP response promptly
    # instead of leaving both open until garbage collection.
    with open(os.path.join(script_dir, 'config.json'), encoding='utf-8') as config_file:
        config = json.load(config_file)
    token = config['token']
    request = Request(url)
    request.add_header('Authorization', 'token %s' % token)
    request.add_header('Accept', "application/vnd.github.inertia-preview+json")
    with urlopen(request) as response:
        charset = response.headers.get_content_charset() or 'utf-8'
        # response.read() yields bytes; json.loads rejects bytes before
        # Python 3.6 ("the JSON object must be str, not 'bytes'"), so decode.
        return json.loads(response.read().decode(charset))
def process_cards(pri,url, tag=""):
    """Print one line per card found at *url*.

    Each line has the form ``(<pri>) <text> <tag>``. A card with a truthy
    ``note`` prints ``map project:<note>``; any other card prints
    ``Work on: <content_url>``.

    Args:
        pri: Priority label placed in parentheses at the start of each line.
        url: Address of the card list, fetched with ``get_json_from_url``.
        tag: Text appended to every line.
    """
    for entry in get_json_from_url(url):
        note = entry['note']
        if note:
            text = "map project:" + note
        else:
            # Cards without a note are described by their linked content.
            text = "Work on: " + entry['content_url']
        print("({}) {} {}".format(pri, text, tag))
def process_project_board(url,tag=""):
    """Print the cards of every column of the project board at *url*.

    Columns are taken in the order the API returns them and labelled
    ``*``, ``A``, ``B``, ``C``, ``D``, ``E`` in turn.

    Args:
        url: Address of the project board resource.
        tag: Text passed through to ``process_cards`` for every column.

    Raises:
        IndexError: If the board has more than six columns, because only
            six priority labels are defined.
    """
    labels = ['*', 'A', 'B', 'C', 'D', 'E']
    columns_url = get_json_from_url(url)["columns_url"]
    for position, column in enumerate(get_json_from_url(columns_url)):
        process_cards(labels[position], column['cards_url'], tag)
if __name__ == "__main__":
    # Each pair is (project board URL, tag appended to every printed line).
    boards = (
        ("https://api.github.com/projects/1613733", "+EQT"),
        ("https://api.github.com/projects/1659667", "+PersonalProjects"),
    )
    for board_url, board_tag in boards:
        process_project_board(board_url, board_tag)
It works perfectly from my desktop when I run it with
python3 vision.py
but when I clone it on a server, I get the following error:
Traceback (most recent call last):
File "vision.py", line 37, in <module>
process_project_board("https://api.github.com/projects/1613733","+EQT")
File "vision.py", line 29, in process_project_board
board= get_json_from_url(url)
File "vision.py", line 15, in get_json_from_url
return json.loads(response.read())
File "/usr/lib/python3.4/json/__init__.py", line 312, in loads
s.__class__.__name__))
TypeError: the JSON object must be str, not 'bytes'
What is causing the difference between behaviours? There are no missing library errors- the config.json file (where my Github Token is) is identical, and I've never had a similar problem moving things to the server...
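You must be using a Python version lower than 3.6 on your server (the traceback shows python3.4). Before Python 3.6, json.loads() accepts only str, not bytes, so the bytes returned by response.read() are rejected. Upgrade Python on the server, use a virtualenv so the desktop and the server run the same version, or decode the response first, e.g. json.loads(response.read().decode('utf-8')).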
Hi there. I'm building a simple scraping tool. Here's the code that I have for it.
from bs4 import BeautifulSoup
import requests
from lxml import html
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import datetime
# Scope requested for the service-account credentials.
scope = ['https://spreadsheets.google.com/feeds']
# The key-file name has to be one single-line string literal; a literal
# split across two physical lines is a syntax error.
credentials = ServiceAccountCredentials.from_json_keyfile_name(
    'Programming 4 Marketers-File-goes-here.json', scope)
site = 'http://nathanbarry.com/authority/'
# Sent with the request so it identifies itself with a browser-like agent.
hdr = {'User-Agent':'Mozilla/5.0'}
req = requests.get(site, headers=hdr)
# Name the parser explicitly: without it Beautiful Soup picks whichever
# parser happens to be installed, which can differ between machines.
soup = BeautifulSoup(req.content, 'lxml')
def _extract_price(soup, div_id):
    """Return the price shown in the link of the ``<div>`` with id *div_id*.

    The price is the text between the first ``$`` and the following ``<``
    in the string form of the div's first ``<a>`` element.

    Args:
        soup: Parsed page offering ``find_all(name, id=...)``.
        div_id: Value of the ``id`` attribute of the div to inspect.

    Returns:
        The price text without the leading ``$``.

    Raises:
        IndexError: If no such div exists or its link contains no ``$``.
    """
    divs = soup.find_all('div', id=div_id)
    if not divs:
        raise IndexError("no <div id=%r> found in the page" % div_id)
    # If several divs share the id, the last one is used.
    markup = str(divs[-1].a)
    _, dollar, after_dollar = markup.partition('$')
    if not dollar:
        raise IndexError("no '$' price found inside <div id=%r>" % div_id)
    return after_dollar.split('<', 1)[0]


def getFullPrice(soup):
    """Return the price listed in the ``complete-package`` div of *soup*.

    Raises:
        IndexError: If the div or a ``$`` price inside it is missing.
    """
    return _extract_price(soup, 'complete-package')


def getVideoPrice(soup):
    """Return the price listed in the ``video-package`` div of *soup*.

    Raises:
        IndexError: If the div or a ``$`` price inside it is missing.
    """
    return _extract_price(soup, 'video-package')
# Scrape both prices from the parsed page.
fullPrice = getFullPrice(soup)
videoPrice = getVideoPrice(soup)
# Today's date labels the new spreadsheet row.
date = datetime.date.today()
gc = gspread.authorize(credentials)
# Work on the `sheet1` worksheet of the "Authority Tracking" spreadsheet.
wks = gc.open("Authority Tracking").sheet1
# Target the row after the last value reported for column 1.
# NOTE(review): if col_values(1) also counts empty trailing cells, `row` may
# point past the end of the sheet -- a possible cause of an "Invalid query
# parameter value for cell_id" response; log `row` to confirm.
row = len(wks.col_values(1))+1
# One cell per column: date, full price, video price.
# NOTE(review): `date` is passed as a datetime.date object, not a string --
# confirm gspread serialises it as intended.
wks.update_cell(row, 1, date)
wks.update_cell(row, 2, fullPrice)
wks.update_cell(row, 3, videoPrice)
This script runs on my local machine. But, when I deploy it as a part of an app to Heroku and try to run it, I get the following error:
Traceback (most recent call last):
File "/app/.heroku/python/lib/python3.6/site-packages/gspread/client.py", line 219, in put_feed
r = self.session.put(url, data, headers=headers)
File "/app/.heroku/python/lib/python3.6/site-packages/gspread/httpsession.py", line 82, in put
return self.request('PUT', url, params=params, data=data, **kwargs)
File "/app/.heroku/python/lib/python3.6/site-packages/gspread/httpsession.py", line 69, in request
response.status_code, response.content))
gspread.exceptions.RequestError: (400, "400: b'Invalid query parameter value for cell_id.'")
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "AuthorityScraper.py", line 44, in <module>
wks.update_cell(row, 1, date)
File "/app/.heroku/python/lib/python3.6/site-packages/gspread/models.py", line 517, in update_cell
self.client.put_feed(uri, ElementTree.tostring(feed))
File "/app/.heroku/python/lib/python3.6/site-packages/gspread/client.py", line 221, in put_feed
if ex[0] == 403:
TypeError: 'RequestError' object does not support indexing
What do you think might be causing this error? Do you have any suggestions for how I can fix it?
There are a couple of things going on:
1) The Google Sheets API returned an error: "Invalid query parameter value for cell_id":
gspread.exceptions.RequestError: (400, "400: b'Invalid query parameter value for cell_id.'")
2) A bug in gspread caused an exception upon receipt of the error:
TypeError: 'RequestError' object does not support indexing
Python 3 removed __getitem__ from BaseException, which this gspread error handling relies on. This doesn't matter too much because it would have raised an UpdateCellError exception anyways.
My guess is that you are passing an invalid row number to update_cell. It would be helpful to add some debug logging to your script to show, for example, which row it is trying to update.
It may be better to start with a worksheet with zero rows and use append_row instead. However there does seem to be an outstanding issue in gspread with append_row, and it may actually be the same issue you are running into.
I encountered the same problem. BS4 works fine on a local machine. However, for some reason, it is far too slow on the Heroku server, which results in an error.
I switched to lxml and it is working fine now.
Install it by command:
pip install lxml
A sample code snippet is given below:
from lxml import html
import requests
# Download the page and parse the response body with lxml.
getpage = requests.get("https://url_here")
gethtmlcontent = html.fromstring(getpage.content)
# XPath addresses attributes with "@"; "#" is not valid XPath syntax.
data = gethtmlcontent.xpath('//div[@class = "class-name"]/text()')
#this is a sample for fetching data from the dummy div
data = data[0:n] # as per your requirement; n is a placeholder you must define
#now inject the data into django template.
So I'm accessing the poloniex API with python and this is my code:
from poloniex import Poloniex
import krakenex
import threading
import pprint
import urllib.request
import json
####POLONIEX####
#FUNCTIONS
polo = Poloniex()
def BTC_USDT_LAST_POLONIEX():
    """Print the last USDT_BTC price and schedule the next run.

    Each call builds its own ``Poloniex`` client, starts a
    ``threading.Timer`` that calls this function again after 1.0 second,
    and then prints the ``last`` value of the ``USDT_BTC`` ticker entry.
    """
    client = Poloniex()
    # Timer delays are in seconds, so the function re-runs one second from now.
    next_run = threading.Timer(1.0, BTC_USDT_LAST_POLONIEX)
    next_run.start()
    ticker = client('returnTicker')
    print("BTC Last Price = " + (ticker['USDT_BTC']['last']))
def POLONIEX_ASSET_LIST():
    """Pretty-print, in sorted order, the entries yielded by the ticker.

    Iterates the ``returnTicker`` response of the module-level ``polo``
    client (for a dict, iteration yields its keys).
    """
    ticker = polo('returnTicker')
    # sorted() builds a new list from any iterable on its own.
    pprint.pprint(sorted(ticker))
Everything is working so far, and I want to avoid using urllib as it's a pain to turn an HTTP request into a list. I'm trying to access the order book but get the following error:
>>> polo('returnOrderBook')
Traceback (most recent call last):
File "<pyshell#27>", line 1, in <module>
polo('returnOrderBook')
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/poloniex/retry.py", line 15, in wrapped
return function(*args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/poloniex/__init__.py", line 183, in __call__
return self.parseJson(ret.text)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/poloniex/__init__.py", line 197, in parseJson
raise PoloniexError(jsonout['error'])
poloniex.PoloniexError: Please specify a currency pair.
I've tried specifying the currency pair but have no idea how to plug it in.
Rewrite your code and use requests module instead of urllib:
import requests
# returnOrderBook needs a currencyPair query parameter; without it the API
# answers "Please specify a currency pair."
ret = requests.get('http://poloniex.com/public?command=returnOrderBook&currencyPair=BTC_BCN').json()
# print() with parentheses runs on Python 2 and Python 3 alike.
print(ret)
>>> {u'bids': [[u'0.00000034', 20629605.566027], [u'0.00000033', 43382683.465305], [u'0.00000032', 70007976.087993], [u'0.00000031', 49571221.248027], [u'0.00000030', 77520227.415484], [u'0.00000029', 46037827.046996], [u'0.00000028', 26267440.401662], [u'0.00000027', 22511987.85933], [u'0.00000026', 18885378.040015], [u'0.00000025', 13313109.292994], [u'0.00000024', 6243527.5236432], [u'0.00000023', 7504850.7832509], [u'0.00000022', 8443683.7997507], [u'0.00000021', 8996262.9826951], [u'0.00000020', 24601532.006268], [u'0.00000019', 26853346.478659], [u'0.00000018', 6027262.24889 etc....
CODE:
import networkx as net
from urllib.request import urlopen
def read_lj_friends(g, name):
# fetch the friend-list from LiveJournal
# NOTE(review): `from urllib.request import urlopen` binds only `urlopen`, not
# `urllib`, so the `urllib.` prefix on the next line raises NameError.
response=urllib.urlopen('http://www.livejournal.com/misc/fdata.bml?user='+name)
ERROR:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
NameError: name 'urllib' is not defined
You've imported urlopen directly, so you should refer to it like that rather than via urllib:
response = urlopen('...')
You can also try in Python 3:
from six.moves import urllib
# The call's result is unpacked as a pair; only the first element is kept.
# NOTE(review): `url` is not defined in this snippet -- supply the address to download.
temp_file, _ = urllib.request.urlretrieve(url)
Just put import urllib at the top of your code
Please try:
from urllib.request import urlopen
# The context manager closes the connection once the body has been read.
with urlopen("http://www.google.com/") as html:
    # read has to be called; printing html.read shows only the bound method.
    print(html.read()) # Content
For your case:
import networkx as net
from urllib.request import urlopen
def read_lj_friends(g, name):
# fetch the friend-list from LiveJournal
# `urlopen` is called by its bare name, matching `from urllib.request import urlopen`.
# NOTE(review): `g` is unused and `response` is never read in the lines shown --
# presumably the rest of the function is omitted.
response=urlopen('http://www.livejournal.com/misc/fdata.bml?user='+name)
I am trying to get the contents of a webpage. For some reason whenever I try urlopen it says there is "no such resource". I also can't use urllib2.
I would simply like to get the contents of a webpage such as http://www.example.com
import urllib
import re
# On Python 3 a bare "import urllib" does not provide urlopen; it lives in
# the urllib.request submodule, so import that explicitly for this script.
import urllib.request

# Captures the value of every href="..." / href='...' attribute.
_HREF_PATTERN = re.compile('''href=["'](.[^"']+)["']''', re.I)


def _fetch_links(page_url):
    """Return every href value found in the page at *page_url*."""
    with urllib.request.urlopen(page_url) as response:
        charset = response.headers.get_content_charset() or 'utf-8'
        # The body arrives as bytes; a str pattern can only search str.
        text = response.read().decode(charset, errors='replace')
    return _HREF_PATTERN.findall(text)


# The with-block closes depth_1.txt even if a download fails part-way.
with open('depth_1.txt', 'w') as textfile:
    print("Enter the URL you wish to crawl..")
    print('Usage - "http://phocks.org/stumble/creepy/" <-- With the double quotes')
    # input() returns the raw text on Python 3, so strip the surrounding
    # quotes that the usage line asks the user to type.
    myurl = input("#> ").strip().strip('"\'')
    for i in _fetch_links(myurl):
        print(i)
        # Second level: follow each link found on the start page.
        for ee in _fetch_links(i):
            print(ee)
            textfile.write(ee + '\n')
Here is the error:
Traceback (most recent call last):
File "/Users/austinhitt/Desktop/clases_example.py", line 8, in <module>
for i in re.findall('''href=["'](.[^"']+)["']''',
urllib.urlopen(myurl).read(), re.I):
AttributeError: module 'urllib' has no attribute 'urlopen'
If you only need the content, use requests; if you want to play around with the content, use Scrapy. For example:
import requests
# Issue a GET request and keep the returned response object in `r`.
r = requests.get('http://scrapy.org')
# Bare attribute expressions: an interactive session echoes each value,
# while a script run evaluates and discards them.
r.content
r.headers
r.status_code