Unable to parse JSON file, keep getting ValueError: Extra Data - python

So, leading on from my prior issue [found here][1], I'm attempting to parse a JSON file that I've managed to download with #SiHa's help. The JSON is structured like so:
{"properties": [{"property": "name", "value": "A random company name"}, {"property": "companyId", "value": 123456789}]}{"properties": [{"property": "name", "value": "Another random company name"}, {"property": "companyId", "value": 31415999}]}{"properties": [{"property": "name", "value": "Yet another random company"}, {"property": "companyId", "value": 10101010}]}
I've been able to get this by slightly modifying #SiHa's code:
def get_companies():
create_get_recent_companies_call = "https://api.hubapi.com/companies/v2/companies/?hapikey={hapikey}".format(hapikey=wta_hubspot_api_key)
headers = {'content-type': 'application/json'}
create_get_recent_companies_response = requests.get(create_get_recent_companies_call, headers=headers)
if create_get_recent_companies_response.status_code == 200:
while True:
for i in create_get_recent_companies_response.json()[u'companies']:
all_the_companies = { "properties": [
{ "property": "name", "value": i[u'properties'][u'name'][u'value'] },
{ "property": "companyId", "value": i[u'companyId'] }
]
}
with open("all_the_companies.json", "a") as myfile:
myfile.write(json.dumps(all_the_companies))
#print(companyProperties)
offset = create_get_recent_companies_response.json()[u'offset']
hasMore = create_get_recent_companies_response.json()[u'has-more']
if not hasMore:
break
else:
create_get_recent_companies_call = "https://api.hubapi.com/companies/v2/companies/?hapikey={hapikey}&offset={offset}".format(hapikey=wta_hubspot_api_key, offset=offset)
create_get_recent_companies_response = requests.get(create_get_recent_companies_call, headers=headers)
else:
print("Something went wrong, check the supplied field values.\n")
print(json.dumps(create_get_recent_companies_response.json(), sort_keys=True, indent=4))
So that was part one. Now I'm trying to use the code below to extract two things: 1) the name and 2) the companyId.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import os.path
import requests
import json
import csv
import glob2
import shutil
import time
import time as howLong
from time import sleep
from time import gmtime, strftime
# Local Testing Version
findCSV = glob2.glob('*contact*.csv')
theDate = time=strftime("%Y-%m-%d", gmtime())
theTime = time=strftime("%H:%M:%S", gmtime())
# Exception handling
try:
testData = findCSV[0]
except IndexError:
print ("\nSyncronisation attempted on {date} at {time}: There are no \"contact\" CSVs, please upload one and try again.\n").format(date=theDate, time=theTime)
print("====================================================================================================================\n")
sys.exit()
for theCSV in findCSV:
def process_companies():
with open('all_the_companies.json') as data_file:
data = json.load(data_file)
for i in data:
company_name = data[i][u'name']
#print(company_name)
if row[0].lower() == company_name.lower():
contact_company_id = data[i][u'companyId']
#print(contact_company_id)
return contact_company_id
else:
print("Something went wrong, check the \"get_companies()\" function.\n")
print(json.dumps(create_get_recent_companies_response.json(), sort_keys=True, indent=4))
if __name__ == "__main__":
start_time = howLong.time()
process_companies()
print("This operation took %s seconds.\n" % (howLong.time() - start_time))
sys.exit()
Unfortunately, it's not working - I'm getting the following traceback:
Traceback (most recent call last):
File "wta_parse_json.py", line 62, in <module>
process_companies()
File "wta_parse_json.py", line 47, in process_companies
data = json.load(data_file)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/__init__.py", line 290, in load
**kw)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/__init__.py", line 338, in loads
return _default_decoder.decode(s)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/decoder.py", line 369, in decode
raise ValueError(errmsg("Extra data", s, end, len(s)))
ValueError: Extra data: line 1 column 130 - line 1 column 1455831 (char 129 - 1455830)
I've made sure that I'm using json.dumps (not json.dump) when writing the file, but it's still not working. :(
I've now given up on JSON, and am trying to export a simple CSV with the code below:
def get_companies():
create_get_recent_companies_call = "https://api.hubapi.com/companies/v2/companies/?hapikey={hapikey}".format(hapikey=wta_hubspot_api_key)
headers = {'content-type': 'application/json'}
create_get_recent_companies_response = requests.get(create_get_recent_companies_call, headers=headers)
if create_get_recent_companies_response.status_code == 200:
while True:
for i in create_get_recent_companies_response.json()[u'companies']:
all_the_companies = "{name},{id}\n".format(name=i[u'properties'][u'name'][u'value'], id=i[u'companyId'])
all_the_companies.encode('utf-8')
with open("all_the_companies.csv", "a") as myfile:
myfile.write(all_the_companies)
#print(companyProperties)
offset = create_get_recent_companies_response.json()[u'offset']
hasMore = create_get_recent_companies_response.json()[u'has-more']
if not hasMore:
break
else:
create_get_recent_companies_call = "https://api.hubapi.com/companies/v2/companies/?hapikey={hapikey}&offset={offset}".format(hapikey=wta_hubspot_api_key, offset=offset)
create_get_recent_companies_response = requests.get(create_get_recent_companies_call, headers=headers)
[1]: http://stackoverflow.com/questions/36148346/unable-to-loop-through-paged-api-responses-with-python
But it looks like this isn't right either - even though I've read up on the formatting issues and have added the .encode('utf-8') call, I still end up getting the following traceback:
Traceback (most recent call last):
File "wta_get_companies.py", line 78, in <module>
get_companies()
File "wta_get_companies.py", line 57, in get_companies
all_the_companies = "{name},{id}\n".format(name=i[u'properties'][u'name'][u'value'], id=i[u'companyId'])
UnicodeEncodeError: 'ascii' codec can't encode character u'\ufffd' in position 3: ordinal not in range(128)

The JSON data has three Objects one after the other; simplified:
{ .. }{ .. }{ .. }
That's not something that's supported by the JSON standard. How is Python supposed to parse that? Automatically wrap it in an array? Assign it to three different variables? Just use the first one?
You probably want to wrap it in an array, simplified:
[{ .. },{ .. },{ .. }]
Or full:
[{"properties": [{"property": "name", "value": "A random company name"}, {"property": "companyId", "value": 123456789}]},{"properties": [{"property": "name", "value": "Another random company name"}, {"property": "companyId", "value": 31415999}]},{"properties": [{"property": "name", "value": "Yet another random company"}, {"property": "companyId", "value": 10101010}]}]

Related

How do I do multiple JSON entries with Python?

I'm trying to pull some data from a flight simulation JSON table. It's updated every 15 seconds and I've been trying to pull print(obj['pilots']['flight_plans']['cid']). However, I'm getting the error:
Traceback (most recent call last):
File "main.py", line 18, in <module>
print(obj['pilots']['flight_plans']['cid'])
TypeError: list indices must be integers or slices, not str
My code is below
import json
from urllib.request import urlopen
import urllib
# initial setup
URL = "https://data.vatsim.net/v3/vatsim-data.json"
# json entries
response = urllib.request.urlopen(URL)
str_response = response.read().decode('utf-8')
obj = json.loads(str_response)
# result is connections
# print(obj["general"]["connected_clients"])
print(obj['pilots']['flight_plans']['cid'])
The print(obj["general"]["connected_clients"]) does work.
Investigate your obj with print(json.dumps(obj, indent=2)). You'll find that the pilots key is a list of dictionaries, each containing flight_plan (not plural) and cid keys. Here are the first few lines:
{
"general": {
"version": 3,
"reload": 1,
"update": "20220301062202",
"update_timestamp": "2022-03-01T06:22:02.245318Z",
"connected_clients": 292,
"unique_users": 282
},
"pilots": [
{
"cid": 1149936,
"name": "1149936",
"callsign": "URO504",
"server": "UK",
"pilot_rating": 0,
"latitude": -23.39706,
"longitude": -46.3709,
"altitude": 9061,
"groundspeed": 327,
"transponder": "0507",
"heading": 305,
"qnh_i_hg": 29.97,
"qnh_mb": 1015,
"flight_plan": {
"flight_rules": "I",
"aircraft": "A346",
...
For example, iterate over the list of pilots and print name/cid:
for pilot in obj['pilots']:
print(pilot['name'],pilot['cid'])
Output:
1149936 1149936
Nick Aydin OTHH 1534423
Oguz Aydin 1429318
Marvin Steglich LSZR 1482019
Daniel Krol EPKK 1279199
... etc ...

Python Interpreter error while loading JSON file using json.load()

This is my python code for parsing a JSON file.
import os
import argparse
import json
import datetime
ResultsJson = "sample.json"
try:
with open(ResultsJson, 'r') as j:
jsonbuffer = json.load(j)
result_data = json.loads(jsonbuffer)
print("Just after loading json")
except Exception as e:
print(e, exc_info=True)
I get an error like in the snapshot attached below.
I'm also attaching the JSON file "sample.json" that I'm using here.
sample.json
{
"idx": 1,
"timestamp": 1562781093.1182132,
"machine_id": "tool_2",
"part_id": "af71ce94-e9b2-47c0-ab47-a82600616b6d",
"image_id": "14cfb9e9-1f38-4126-821b-284d7584b739",
"cam_sn": "camera-serial-number",
"defects": [
{
"type": 0,
"tl": [169, 776],
"br": [207, 799]
},
{
"type": 0,
"tl": [404, 224],
"br": [475, 228]
},
{
"type": 1,
"tl": [81, 765],
"br": [130, 782]
}
],
"display_info": [
{
"info": "DEFECT DETECTED",
"priority": 2
}
]
}
Not sure what I missed here. I'm very new to Python (Coming from C++ background). Please be easy on me if I've missed something basic.
You don't need this line:
result_data = json.loads(jsonbuffer)
...because jsonbuffer is the result of json.load, so it's already the result of parsing the JSON file. In your case it's a Python dictionary, but json.loads expects a string, so you get an error.
Also, as the second error message says, exc_info is not a valid keyword argument of the print function. If you wanted to print the exception, just do print(e).
You can do either:
with open(ResultsJson, 'r') as j:
result_data = json.load(j)
print("Just after loading json")
Or:
with open(ResultsJson, 'r') as j:
result_data = json.loads(j.read())
print("Just after loading json")
The json.load() internally calls the json.loads() function

Properly formatting http.client.HTTPSConnection in Python

Overall, I'm trying to invoke the MS Cognitive key phrases API from Python 3.5.1 :: Anaconda 4.0.0 (32-bit). I looked everywhere and tried to incorporate this stackoverflow response.
To call the API your account key below marked as ## needs to be added from here,
however to format the body correctly you probably don't need the account key. A good portion of the code below is from sample code.
Request body should look like
body = {
"documents": [
{
"language": "en",
"id": "1",
"text": "One line of text."
},
{
"language": "en",
"id": "2",
"text": "another line of text."
}
]
}
my code <it now works!!>
import sys
import os.path
import http.client
import urllib.request
import urllib.parse
import urllib.error
import base64
import json
subscription_key = '##'
headers = {
'Content-Type': 'application/json',
'Ocp-Apim-Subscription-Key': subscription_key
}
#input text is: ID | text to analyze. How my input file is formatted.
input_text = ["100|One line of text.", "101|another line of text."]
# Inputs holds the params to call the web service in bulk.
body = []
indx = 1
for line in input_text:
input_text = line.split("|")
print ('-----\n')
print ("Input text is:", input_text)
input_text_analyze = input_text[1]
print ('\nInput text to be analyzed:', input_text_analyze)
body.append({ "language" : "en", "id" : str(indx), "text" : input_text_analyze })
indx = indx + 1
print ('-----\n')
print ('\nBody has', body)
print ("Calling API to get keywords...")
body_documents = { 'documents': body }
print ("\nParams:", body_documents)
params = urllib.parse.urlencode({ })
try:
conn = http.client.HTTPSConnection('westus.api.cognitive.microsoft.com')
conn.request("POST", "/text/analytics/v2.0/keyPhrases?%s" % params, str(body_documents), headers)
response = conn.getresponse()
keyword_obj = response.read()
print("Returned keyword_obj is: ", keyword_obj)
conn.close()
except Exception as e:
print("[Errno {0}] {1}".format(e.errno, e.strerror))
I made 2 changes to the code above that allow it to work. 1) I was getting my params and body mixed up. 2) I needed to add str(body_documents) in my post. Both beginner mistakes.

Converting a dictionary to a string

I am having trouble with converting a dictionary to a string in python. I am trying to extract the information from one of my variables but cannot seem to remove the square brackets surrounding the information
for line in str(object):
if line.startswith ('['):
new_object = object.replace('[', '')
Is there a way to remove the square brackets or do I have to find another way of taking the information out of the dictionary?
Edit:
in more detail what i am trying to do here is the following
import requests
city = 'dublin'
country = 'ireland'
info = requests.get('http://api.openweathermap.org/data/2.5/weather?q='+city +','+ country +'&mode=json')
weather = info.json()['weather']
fh = open('/home/Ricky92d3/city.txt', 'w')
fh.write(str(weather))
fh.close()
fl = open('/home/Ricky92d3/city.txt')
Object = fl.read()
fl.close()
for line in str(Object):
if line.startswith ('['):
new_Object = Object.replace('[', '')
if line.startswith ('{'):
new_Object = Object.replace('{u', '')
print new_Object
I hope this makes what I am trying to do a little more clear.
The object returned by info.json() is a Python dictionary, so you can access its contents using normal Python syntax. I admit that it can get a little bit tricky, since JSON dictionaries often contain other dictionaries and lists, but it's generally not too hard to figure out what's what if you print the JSON object out in a nicely formatted way. The easiest way to do that is by using the dumps() function in the standard Python json module.
The code below retrieves the JSON data into a dict called data.
It then prints the 'description' string from the list in the 'weather' item of data.
It then saves all the data (not just the 'weather' item) as an ASCII-encoded JSON file.
It then reads the JSON data back in again to a new dict called newdata, and pretty-prints it.
Finally, it prints the weather description again, to verify that we got back what we saw earlier. :)
# Fetch the current weather as JSON, save it to disk, reload it, and confirm
# that the reloaded copy still contains the description printed earlier.
# Every print takes a single parenthesized argument, so the script produces
# the same output on Python 2 and Python 3.
import json

import requests

# The base URL of the weather service
endpoint = 'http://api.openweathermap.org/data/2.5/weather'

# Filename for saving JSON data to
fname = 'data.json'

city = 'dublin'
country = 'ireland'

# Let requests build and URL-encode the query string.
params = {
    'q': '%s,%s' % (city, country),
    'mode': 'json',
}

# Fetch the info
info = requests.get(endpoint, params=params)
data = info.json()
# Uncomment to inspect the whole response:
# print(json.dumps(data, indent=4))

# Extract the value of 'description' from the list in 'weather'.
# 'weather' is a list of dicts, hence the [0].
print('\ndescription: %s\n' % data['weather'][0]['description'])

# Save data
with open(fname, 'w') as f:
    json.dump(data, f, indent=4)

# Reload data
with open(fname, 'r') as f:
    newdata = json.load(f)

# Show all the data we just read in
print(json.dumps(newdata, indent=4))

# Verify the round trip: read the description from the RELOADED data, not
# from the original in-memory dict.
print('\ndescription: %s\n' % newdata['weather'][0]['description'])
output
description: light intensity shower rain
{
"clouds": {
"all": 75
},
"name": "Dublin",
"visibility": 10000,
"sys": {
"country": "IE",
"sunset": 1438374108,
"message": 0.0118,
"type": 1,
"id": 5237,
"sunrise": 1438317600
},
"weather": [
{
"description": "light intensity shower rain",
"main": "Rain",
"id": 520,
"icon": "09d"
}
],
"coord": {
"lat": 53.340000000000003,
"lon": -6.2699999999999996
},
"base": "stations",
"dt": 1438347600,
"main": {
"pressure": 1014,
"humidity": 62,
"temp_max": 288.14999999999998,
"temp": 288.14999999999998,
"temp_min": 288.14999999999998
},
"id": 2964574,
"wind": {
"speed": 8.1999999999999993,
"deg": 210
},
"cod": 200
}
description: light intensity shower rain
I'm not quite sure what you're trying to do here (without seeing your dictionary) but if you have a string like x = "[myString]" you can just do the following:
x = x.replace("[", "").replace("]", "")
If this isn't working, there is a high chance you're actually getting a list returned. Though if that was the case you should see an error like this:
>>> x = [1,2,3]
>>> x.replace("[", "")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
AttributeError: 'list' object has no attribute 'replace'
Edit 1:
I think there's a misunderstanding of what you're getting back here. If you're just looking for a csv output file with the weather from your api try this:
import requests
import csv
city = 'dublin'
country = 'ireland'
info = requests.get('http://api.openweathermap.org/data/2.5/weather?q='+city +','+ country +'&mode=json')
weather = info.json()['weather']
weather_fieldnames = ["id", "main", "description", "icon"]
with open('city.txt', 'w') as f:
csvwriter = csv.DictWriter(f, fieldnames=weather_fieldnames)
for w in weather:
csvwriter.writerow(w)
This works by looping through the list of items you're getting and using a csv.DictWriter to write it as a row in the csv file.
Bonus
Don't call your dictionary object - it's the name of a built-in type in the core language (not a reserved word, but your variable would shadow it).

How to parse complex json in python 2.7.5?

I am trying to list the names of my puppet classes from a Puppet Enterprise 3.7 puppet master, using Puppet's REST API.
Here is my script:
#!/usr/bin/env python
import requests
import json
url='https://ppt-001.example.com:4433/classifier-api/v1/groups'
headers = {"Content-Type": "application/json"}
data={}
cacert='/etc/puppetlabs/puppet/ssl/certs/ca.pem'
key='/etc/puppetlabs/puppet/ssl/private_keys/ppt-001.example.com.pem'
cert='/etc/puppetlabs/puppet/ssl/certs/ppt-001.example.com.pem'
result = requests.get(url,
data=data, #no data needed for this request
headers=headers, #dict {"Content-Type":"application/json"}
cert=(cert,key), #key/cert pair
verify=cacert
)
print json.dumps( result.json(), sort_keys=True, indent=4, separators=(',', ': '))
for i in result.json:
print i
Here is the error message I get when I execute the script:
Traceback (most recent call last):
File "./add-group.py", line 42, in <module>
for i in result.json:
TypeError: 'instancemethod' object is not iterable
Here is a sample of the data I get back from the REST API:
[
{
"classes": {},
"environment": "production",
"environment_trumps": false,
"id": "00000000-0000-4000-8000-000000000000",
"name": "default",
"parent": "00000000-0000-4000-8000-000000000000",
"rule": [
"and",
[
"~",
"name",
".*"
]
],
"variables": {}
},
{
"classes": {
"puppet_enterprise": {
"certificate_authority_host": "ppt-001.example.com",
"console_host": "ppt-001.example.com",
"console_port": "443",
"database_host": "ppt-001.example.com",
"database_port": "5432",
"database_ssl": true,
"mcollective_middleware_hosts": [
"ppt-001.example.com"
],
"puppet_master_host": "ppt-001.example.com",
"puppetdb_database_name": "pe-puppetdb",
"puppetdb_database_user": "pe-puppetdb",
"puppetdb_host": "ppt-001.example.com",
"puppetdb_port": "8081"
}
},
"environment": "production",
"environment_trumps": false,
"id": "52c479fe-3278-4197-91ea-9127ba12474e",
"name": "PE Infrastructure",
"parent": "00000000-0000-4000-8000-000000000000",
"variables": {}
},
.
.
.
How should I go about accessing the name key and getting the values like default and PE Infrastructure?
I have read the other answers here on SO saying that one should use json.loads() and I have tried using parsed_json = json.loads(result.json()) but results in this error message:
Traceback (most recent call last):
File "./add-group.py", line 38, in <module>
parsed_json = json.loads(result.json())
File "/usr/lib64/python2.7/json/__init__.py", line 338, in loads
return _default_decoder.decode(s)
File "/usr/lib64/python2.7/json/decoder.py", line 365, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
TypeError: expected string or buffer
print json.dumps( result.json(), sort_keys=True, indent=4, separators=(',', ': '))
This json.dumps line is not the problem: json.dumps accepts any JSON-serializable Python object. The TypeError you're getting (TypeError: expected string or buffer) comes from json.loads, whose first parameter must be a string or buffer.
Your variable result is an instance of Response, and the method .json() returns the already-parsed data (here, a list of dictionaries). Since you're passing the result of .json() to json.loads(), you're getting that error. You could either just use result.json() directly, which already corresponds to your response, or, if you really want to parse the text yourself, call json.loads(result.text), where result.text is your JSON result as a string/unicode. The first error (TypeError: 'instancemethod' object is not iterable) happens because result.json without parentheses is the method itself, not its return value; call it as result.json().
After the change, to access something like the name attribute, you could do something like:
# `result` is the requests Response from the question's script. .json() must
# be CALLED (with parentheses) to get the parsed list of group dictionaries.
for item in result.json():
    try:
        # Single-argument parenthesized print works on Python 2 and 3 alike.
        print(item['name'])
    except KeyError:
        # A group without a 'name' key is reported instead of aborting the loop.
        print("There is no 'name' attribute")

Categories