Can someone help me fix this? I don't know why I am getting this error.
I am trying to use a Python program someone made; I tried messing around with it, but I could not figure out the issue.
Error:
PS D:\Python> python .\quizlet.py
Traceback (most recent call last):
  File "D:\Python\quizlet.py", line 69, in <module>
    q = QuizletParser(website)
  File "D:\Python\quizlet.py", line 17, in QuizletParser
    data = json.loads(BeautifulSoup(session.get(link).content, features="lxml").find_all('script')[-6].string[44:-152])
  File "C:\Users\john\AppData\Local\Programs\Python\Python39\lib\json\__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "C:\Users\john\AppData\Local\Programs\Python\Python39\lib\json\decoder.py", line 340, in decode
    raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 1 column 14 (char 13)
I am trying to use the code I found here a while ago: https://github.com/daijro/python-quizlet
Source:
from requests_html import HTMLSession
from box import Box
import box
import json
from bs4 import BeautifulSoup
from difflib import SequenceMatcher

def FindFlashcard(flashcards: box.box_list.BoxList, match: str):
    similar = lambda a, b: SequenceMatcher(None, a, b).ratio()
    data = max(list(zip([similar(match, x.term) for x in flashcards], [x for x in range(len(flashcards))])))
    flashcard = flashcards[data[1]]
    flashcard.update({'similarity': data[0]})
    return flashcard
def QuizletParser(link: str):
    session = HTMLSession()
    data = json.loads(BeautifulSoup(session.get(link).content, features="lxml").find_all('script')[-6].string[44:-152])
    flashcards = []
    for i in list(data['termIdToTermsMap'].values()):
        i = {
            'index': i['rank'],
            'id': i['id'],
            'term': i['word'],
            'definition': i['definition'],
            'setId': i['setId'],
            'image': i['_imageUrl'],
            'termTts': 'https://quizlet.com' + i['_wordTtsUrl'],
            'termTtsSlow': 'https://quizlet.com' + i['_wordSlowTtsUrl'],
            'definitionTts': 'https://quizlet.com' + i['_definitionTtsUrl'],
            'definitionTtsSlow': 'https://quizlet.com' + i['_definitionSlowTtsUrl'],
            'lastModified': i['lastModified'],
        }
        flashcards.append(i)
    output = {
        'title': data['set']['title'],
        'flashcards': flashcards,
        'author': {
            'name': data['creator']['username'],
            'id': data['creator']['id'],
            'timestamp': data['creator']['timestamp'],
            'lastModified': data['creator']['lastModified'],
            'image': data['creator']['_imageUrl'],
            'timezone': data['creator']['timeZone'],
            'isAdmin': data['creator']['isAdmin'],
        },
        'id': data['set']['id'],
        'link': data['set']['_webUrl'],
        'thumbnail': data['set']['_thumbnailUrl'],
        'timestamp': data['set']['timestamp'],
        'lastModified': data['set']['lastModified'],
        'publishedTimestamp': data['set']['publishedTimestamp'],
        'authorsId': data['set']['creatorId'],
        'termLanguage': data['set']['wordLang'],
        'definitionLanguage': data['set']['defLang'],
        'description': data['set']['description'],
        'numTerms': data['set']['numTerms'],
        'hasImages': data['set']['hasImages'],
        'hasUploadedImage': data['hasUploadedImage'],
        'hasDiagrams': data['set']['hasDiagrams'],
    }
    return Box(output)
website = 'https://quizlet.com/475389316/python-web-scraping-flash-cards/'
text = 'Two popular parsers'
q = QuizletParser(website)
flashcard = FindFlashcard(q.flashcards, match=text)  # finds the flashcard most similar to the input
print(flashcard.term + " " + flashcard.definition)   # prints the matched term and its definition
Hard to give a solution without looking at the data.
A few tips for debugging JSON errors:
Check the input data passed to the JSON decoder. A trailing comma after the last key-value pair, or stray characters before or after the JSON value, are very common causes (a short demonstration follows below).
Check the data type. If your input data came from an external source, inspect it before parsing.
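For instance, here is a minimal reproduction of the "Extra data" error - valid JSON followed by leftover characters, which is exactly what a misplaced slice like [44:-152] can produce:

import json

# The decoder parses {"a": 1} successfully, then finds unexpected trailing text.
json.loads('{"a": 1} trailing-text')
# json.decoder.JSONDecodeError: Extra data: line 1 column 10 (char 9)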
I would suggest printing the exact string being passed to json.loads and pasting it here if possible:
input_data = BeautifulSoup(session.get(link).content, features="lxml").find_all('script')[-6].string[44:-152]
print(input_data)
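If the slice turns out to be cutting in the wrong place, one more robust option (a sketch only - the 'termIdToTermsMap' marker and the assumption that the data starts at the first brace are guesses about Quizlet's HTML, which may change) is to scan all script tags and let the JSON decoder stop at the end of the first object itself:

import json

from bs4 import BeautifulSoup
from requests_html import HTMLSession

def extract_set_data(link: str):
    # Sketch: find the script tag containing the set data instead of
    # hard-coding find_all('script')[-6] and the [44:-152] slice.
    session = HTMLSession()
    soup = BeautifulSoup(session.get(link).content, features="lxml")
    for script in soup.find_all('script'):
        text = script.string or ''
        if 'termIdToTermsMap' not in text:  # marker key we expect in the data
            continue
        start = text.index('{')  # assumed: the JSON blob starts at the first brace
        # raw_decode parses one JSON value and ignores whatever follows,
        # which sidesteps the "Extra data" error entirely.
        data, _ = json.JSONDecoder().raw_decode(text[start:])
        return data
    raise ValueError('set data not found in any script tag')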
I am trying to get all the websites from this JSON file.
Unfortunately, when I use this code:
import requests
response = requests.get("https://github.com/solana-labs/token-list/blob/main/src/tokens/solana.tokenlist.json")
output = response.json()
# Extract specific node content.
print(output['website'])
I get the following error:
Traceback (most recent call last):
  File "/Users/dusandev/Desktop/StreamFlowWebTests/extract.py", line 5, in <module>
    output = response.json()
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/requests/models.py", line 900, in json
    return complexjson.loads(self.text, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 7 column 1 (char 6)
Any help is appreciated. Thank you in advance
Use the raw URL to get raw JSON, then iterate over the 'tokens' key of the parsed response:

import requests

response = requests.get(
    "https://raw.githubusercontent.com/solana-labs/token-list/main/src/tokens/solana.tokenlist.json")
output = response.json()

for i in output['tokens']:
    if i.get('extensions'):
        print(i.get('extensions').get('website'))
The file https://github.com/solana-labs/token-list/blob/main/src/tokens/solana.tokenlist.json is not JSON - it is an HTML page. Use https://raw.githubusercontent.com/solana-labs/token-list/main/src/tokens/solana.tokenlist.json instead.
If you visit the url https://github.com/solana-labs/token-list/blob/main/src/tokens/solana.tokenlist.json in a browser, you'll get a fully rendered web page. In order to get just JSON you need to use the "view raw" link. That winds up being
https://raw.githubusercontent.com/solana-labs/token-list/main/src/tokens/solana.tokenlist.json
You will then have several thousand elements in the array attached to the "tokens" key in the response dictionary. To get the website element you'll need to iterate through the list and look at the "extensions" key:
>>> output["tokens"][0]
{'chainId': 101, 'address': 'CbNYA9n3927uXUukee2Hf4tm3xxkffJPPZvGazc2EAH1', 'symbol': 'agEUR', 'name': 'agEUR (Wormhole)', 'decimals': 8, 'logoURI': 'https://raw.githubusercontent.com/solana-labs/token-list/main/assets/mainnet/CbNYA9n3927uXUukee2Hf4tm3xxkffJPPZvGazc2EAH1/logo.png', 'tags': ['ethereum', 'wrapped', 'wormhole'], 'extensions': {'address': '0x1a7e4e63778B4f12a199C062f3eFdD288afCBce8', 'assetContract': 'https://etherscan.io/address/0x1a7e4e63778B4f12a199C062f3eFdD288afCBce8', 'bridgeContract': 'https://etherscan.io/address/0x3ee18B2214AFF97000D974cf647E7C347E8fa585', 'coingeckoId': 'ageur', 'description': 'Angle is the first decentralized, capital efficient and over-collateralized stablecoin protocol', 'discord': 'https://discord.gg/z3kCpTaKMh', 'twitter': 'https://twitter.com/AngleProtocol', 'website': 'https://www.angle.money'}}
>>> output["tokens"][0]["extensions"]["website"]
'https://www.angle.money'
This error usually means that the output cannot be parsed as JSON.
You have two options:
Use "https://raw.githubusercontent.com/solana-labs/token-list/main/src/tokens/solana.tokenlist.json" instead:
import requests

response = requests.get("https://raw.githubusercontent.com/solana-labs/token-list/main/src/tokens/solana.tokenlist.json")
output = response.json()
first_website = output["tokens"][0]["extensions"]["website"]

# all websites:
for token in output['tokens']:
    if extensions := token.get('extensions'):
        print(extensions.get('website'))

# output:
# 'https://www.angle.money'
Alternatively, you can parse the HTML page using BeautifulSoup - https://www.dataquest.io/blog/web-scraping-python-using-beautiful-soup/
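A quick way to confirm what you actually received (a generic sketch, not specific to this API) is to check the response's Content-Type header and the first few characters before calling .json():

import requests

response = requests.get("https://github.com/solana-labs/token-list/blob/main/src/tokens/solana.tokenlist.json")

# An HTML page comes back as text/html; real JSON is application/json.
print(response.headers.get("Content-Type"))  # e.g. 'text/html; charset=utf-8'
print(response.text[:60])                    # starts with '<!DOCTYPE html>' here, not '{'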
I am importing JSON data into Python from an API and ran into the following decode error:
JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Looking at online examples it's immediately clear my JSON data has ' where others have ".
Ideally, I'd like to know why it's being downloaded this way. It seems highly likely it's an error on my end, not theirs.
I decided it should be easy to correct the JSON format but I have failed here too. Please see the below code for how I obtain the JSON data and my attempt at fixing it.
#----------------------------------------
#---Read this only if you want to download
#---the data yourself.
#----------------------------------------
#Built from 'Towards Data Science' guide
#https://towardsdatascience.com/how-to-use-riot-api-with-python-b93be82dbbd6
#Must first have installed riotwatcher
#Info in my example is made up, I can't supply a real API code or
#I get in trouble. Sorry about this. You could obtain one from their website
#but this would be a lot of faff for what is probably a simple StackOverflow
#question
#If you were to get/have a key you could use the following information:
#<EUW> for region
#<Agurin> for name
#----------------------------------------
#---Code
#----------------------------------------
#--->Set Variables
#Get installed riotwatcher module for
#Python
import riotwatcher
#Import riotwatcher tools.
from riotwatcher import LolWatcher, ApiError
#Import JSON (to read the JSON API file)
import json
# Global variables
# Get new API from
# https://developer.riotgames.com/
api_key = 'RGAPI-XXXXXXX-XXX-XXXX-XXXX-XXXX'
watcher = LolWatcher(api_key)
my_region = 'MiddleEarth'
#need to give path to where records
#are to be stored
records_dir = "/home/solebaysharp/Projects/Riot API/Records"
#--->Obtain initial data, set up new variables
#Use 'watcher' to get basic stats and setup my account as a variable (me)
me = watcher.summoner.by_name(my_region, "SolebaySharp")
# Setup retrieval of recent match info
my_matches = watcher.match.matchlist_by_account(my_region, me["accountId"])
print(my_matches)
#--->Download the recent match data
#Define where the JSON data is going to go
recent_matches_index_json = (records_dir + "/recent_matches_index.json")
#get that JSON data
print ("Downloading recent match history data")
file_handle = open(recent_matches_index_json,"w+")
file_handle.write(str(my_matches))
file_handle.close()
#convert it to python
file_handle = open(recent_matches_index_json,)
recent_matches_index = json.load(file_handle)
Except this gives the following error...
JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
So instead, to correct this, I tried:
file_handle = open(recent_matches_index_json)
json_sanitised = json.loads(file_handle.replace("'", '"'))
This returns...
AttributeError: '_io.TextIOWrapper' object has no attribute 'replace'
For the sake of completeness, beneath is a sample of what the JSON looks like. I have added the line breaks to enhance readability; it does not come this way.
{'matches': [
{'platformId': 'NA1',
'gameId': 5687555181,
'champion': 235,
'queue': 400,
'season': 13,
'timestamp': 1598243995076,
'role': 'DUO_SUPPORT',
'lane': 'BOTTOM'
},
{'platformId': 'NA1',
'gameId': 4965733458,
'champion': 235,
'queue': 400,
'season': 13,
'timestamp': 1598240780841,
'role': 'DUO_SUPPORT',
'lane': 'BOTTOM'
},
{'platformId': 'NA1',
'gameId': 4583215645,
'champion': 111,
'queue': 400,
'season': 13,
'timestamp': 1598236666162,
'role': 'DUO_SUPPORT',
'lane': 'BOTTOM'
}],
'startIndex': 0,
'endIndex': 100,
'totalGames': 186}
This is occurring because str() converts the Python object to its string representation:
file_handle.write(str(my_matches))
Python's default representation of a dict uses single quotes, which is not valid JSON.
We can stop this from happening by using json.dumps. This partially (certainly not fully) answers the second part of the question as well, since we're now writing correctly formatted JSON.
The above-mentioned line simply has to be replaced with this:
file_handle.write(json.dumps(my_matches))
This will preserve the JSON formatting.
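A minimal illustration of the difference, using a snippet of the match data above:

import json

my_matches = {'matches': [{'platformId': 'NA1', 'gameId': 5687555181}]}

str(my_matches)         # "{'matches': [{'platformId': 'NA1', 'gameId': 5687555181}]}" - Python repr, not JSON
json.dumps(my_matches)  # '{"matches": [{"platformId": "NA1", "gameId": 5687555181}]}' - valid JSON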
I got a list in Python with Twitter user information and exported it with Pandas to an Excel file.
One row is one Twitter user with nearly all of the user's information (name, #-tag, location, etc.)
Here is my code to create the list and fill it with the user data:
def get_usernames(userids, api):
    fullusers = []
    u_count = len(userids)
    try:
        for i in range(int(u_count/100) + 1):
            end_loc = min((i + 1) * 100, u_count)
            fullusers.extend(
                api.lookup_users(user_ids=userids[i * 100:end_loc])
            )
        print('\n' + 'Done! We found ' + str(len(fullusers)) + ' follower in total for this account.' + '\n')
        return fullusers
    except:
        import traceback
        traceback.print_exc()
        print('Something went wrong, quitting...')
The only problem is that every row is a JSON object and therefore one long comma-separated string. I would like to create headers (no problem with Pandas) and write only parts of the string (i.e. ID or name) to columns.
Here is an example of a row from my output.xlsx:
User(_api=<tweepy.api.API object at 0x16898928>, _json={'id': 12345, 'id_str': '12345', 'name': 'Jane Doe', 'screen_name': 'jdoe', 'location': 'Nirvana, NI', 'description': 'Just some random descrition')
I have two ideas, but I don't know how to realize them due to my lack of skills and experience with Python.
Create a loop which saves certain parts ('id', 'name', etc.) from the JSON string in columns.
Cut off the User(_api=<tweepy.api.API object at 0x16898928>, _json={ at the beginning and ) at the end, so that I may export the file as CSV.
Could anyone help me out with one of my two solutions or suggest a "simple" way to do this?
fyi: I want to do this to gather data for my thesis.
The string in your output is not actually JSON - it is the repr of a tweepy User object, and json.loads would fail on its single quotes. However, each User object already exposes the underlying data as a plain dictionary through its _json attribute (you can see it in your sample row), so there is nothing to parse:

jsondict = fullusers[0]._json  # one element of the list returned by get_usernames
# type(jsondict) == dict

Now you can just extract the data you want from it:

id = jsondict["id"]
name = jsondict["name"]
newdict = {"id": id, "name": name}
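Building on that, a sketch of the full export (assuming pandas is installed and fullusers is the list returned by get_usernames; the column selection is illustrative):

import pandas as pd

# One row per user, keeping only the fields of interest.
rows = [
    {
        'id': user._json['id'],
        'name': user._json['name'],
        'screen_name': user._json['screen_name'],
        'location': user._json['location'],
    }
    for user in fullusers
]

df = pd.DataFrame(rows)
df.to_excel('output.xlsx', index=False)  # requires an Excel writer such as openpyxl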
I'm getting the error below when executing the following script:
Error Type: <type 'exceptions.TypeError'>
Error Contents: 'NoneType' object is not iterable
Traceback (most recent call last):
  File "addon.py", line 75, in <module>
    plugin.run()
  File "xbmcswift2/plugin.py", line 332, in run
    items = self._dispatch(self.request.path)
  File "/plugin.py", line 306, in _dispatch
    listitems = view_func(**items)
  File "/addon.py", line 42, in all_episodes
    items = thisiscriminal.compile_playable_podcast(playable_podcast)
  File "/lib/thisiscriminal.py", line 121, in compile_playable_podcast
    for podcast in playable_podcast:
TypeError: 'NoneType' object is not iterable
The code in question is as follows; any advice would be greatly appreciated, as I have no idea what I'm doing wrong:
def get_playable_podcast(soup):
    """
    #param: parsed html page
    """
    r = urllib.urlopen('https://thisiscriminal.com/wp-json/criminal/v1/episodes?posts=10000&page=1')
    data = json.loads(r.read().decode('utf-8'))
    for post in data['posts']:
        print post['title']
        print post['episodeNumber']
        print post['audioSource']
        print post['image']['medium']
        subjects = []
        item = {
            'title': post['title'],
            'audioSource': post['audioSource'],
            'episodeNumber': post['episodeNumber'],
            'medium': post['image']['medium']
        }
        subjects.append(item)
    print subjects

def compile_playable_podcast(playable_podcast):
    """
    #para: list containing dict of key/values pairs for playable podcasts
    """
    items = []
    for podcast in playable_podcast:
        items.append({
            post['title']: podcast['title']['episodeNumber'],
            post['audioSource']: podcast['audioSource'],
            post['image']['medium']: podcast['medium'],
            'is_playable': True,})
    return items
I assume your script does something like the following:
podcast = get_playable_podcast(soup)
compiled = compile_playable_podcast(podcast)
The problem is that get_playable_podcast has no return statement. In such a case, Python defaults to returning None - which you then pass into compile_playable_podcast. Since None is not iterable, compile_playable_podcast rightfully raises a TypeError.
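A minimal illustration of that default (a hypothetical function, just to show the behavior):

def build_list():
    items = [1, 2, 3]  # built, but never returned

result = build_list()
print(result)  # None - a function without a return statement returns None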
Now, the solution is of course to return the podcast list you're building in get_playable_podcast, like so,
def get_playable_podcast(soup):
    """
    #param: parsed html page
    """
    r = urllib.urlopen('https://thisiscriminal.com/wp-json/criminal/v1/episodes?posts=10000&page=1')
    data = json.loads(r.read().decode('utf-8'))

    subjects = []
    for post in data['posts']:
        print post['title']
        print post['episodeNumber']
        print post['audioSource']
        print post['image']['medium']

        item = {
            'title': post['title'],
            'audioSource': post['audioSource'],
            'episodeNumber': post['episodeNumber'],
            'medium': post['image']['medium']
        }
        subjects.append(item)

    print subjects
    return subjects
Besides this, it may be worthwhile to carefully check your script for unused parameters and/or duplicate code.
I'm trying to add the JSON output below into a dictionary, to be saved into a SQL database.
{'Parkirisca': [
{
'ID_Parkirisca': 2,
'zasedenost': {
'Cas': '2016-10-08 13:17:00',
'Cas_timestamp': 1475925420,
'ID_ParkiriscaNC': 9,
'P_kratkotrajniki': 350
}
}
]}
I am currently using the following code to add the value to a dictionary:
import scraperwiki
import json
import requests
import datetime
import time
from pprint import pprint

html = requests.get("http://opendata.si/promet/parkirisca/lpt/")
data = json.loads(html.text)

for carpark in data['Parkirisca']:
    zas = carpark['zasedenost']
    free_spaces = zas.get('P_kratkotrajniki')
    last_updated = zas.get('Cas_timestamp')
    parking_type = carpark.get('ID_Parkirisca')
    if parking_type == "Avtomatizirano":
        is_automatic = "Yes"
    else:
        is_automatic = "No"
    scraped = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
    savetodb = {
        'scraped': scraped,
        'id': carpark.get("ID_Parkirisca"),
        'total_spaces': carpark.get("St_mest"),
        'free_spaces': free_spaces,
        'last_updated': last_updated,
        'is_automatic': is_automatic,
        'lon': carpark.get("KoordinataX_wgs"),
        'lat': carpark.get("KoordinataY_wgs")
    }
    unique_keys = ['id']
    pprint savetodb
However, when I run this, it gets stuck at for zas in carpark["zasedenost"] and outputs the following error:
Traceback (most recent call last):
  File "./code/scraper", line 17, in <module>
    for zas in carpark["zasedenost"]:
KeyError: 'zasedenost'
I've been led to believe that zas is in fact now a string rather than a dictionary, but I'm new to Python and JSON, so I don't know what to search for to get a solution. I've also searched here on Stack Overflow for KeyError-when-key-exists questions, but they didn't help, and I believe this might be because it's inside a nested for loop.
Update: Now, when I swapped the double quotes for single quotes, I get the following error:
Traceback (most recent call last):
  File "./code/scraper", line 17, in <module>
    free_spaces = zas.get('P_kratkotrajniki')
AttributeError: 'unicode' object has no attribute 'get'
I fixed up your code:
Added required imports.
Fixed the pprint savetodb line, which isn't valid Python.
Didn't try to iterate over carpark['zasedenost'].
I then added another pprint statement in the for loop to see what's in carpark when the KeyError occurs. From there, the error is clear. (Not all the elements in the array in your JSON contain the 'zasedenost' key.)
Here's the code I used:
import datetime
import json
from pprint import pprint
import time

import requests

html = requests.get("http://opendata.si/promet/parkirisca/lpt/")
data = json.loads(html.text)

for carpark in data['Parkirisca']:
    pprint(carpark)
    zas = carpark['zasedenost']
    free_spaces = zas.get('P_kratkotrajniki')
    last_updated = zas.get('Cas_timestamp')
    parking_type = carpark.get('ID_Parkirisca')
    if parking_type == "Avtomatizirano":
        is_automatic = "Yes"
    else:
        is_automatic = "No"
    scraped = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
    savetodb = {
        'scraped': scraped,
        'id': carpark.get("ID_Parkirisca"),
        'total_spaces': carpark.get("St_mest"),
        'free_spaces': free_spaces,
        'last_updated': last_updated,
        'is_automatic': is_automatic,
        'lon': carpark.get("KoordinataX_wgs"),
        'lat': carpark.get("KoordinataY_wgs")
    }
    unique_keys = ['id']
    pprint(savetodb)
And here's the output on the iteration where the KeyError occurs:
{u'A_St_Mest': None,
u'Cena_dan_Eur': None,
u'Cena_mesecna_Eur': None,
u'Cena_splosno': None,
u'Cena_ura_Eur': None,
u'ID_Parkirisca': 7,
u'ID_ParkiriscaNC': 72,
u'Ime': u'P+R Studenec',
u'Invalidi_St_mest': 9,
u'KoordinataX': 466947,
u'KoordinataX_wgs': 14.567929171694901,
u'KoordinataY': 101247,
u'KoordinataY_wgs': 46.05457609543313,
u'Opis': u'2,40 \u20ac /dan',
u'St_mest': 187,
u'Tip_parkirisca': None,
u'U_delovnik': u'24 ur (ponedeljek - petek)',
u'U_sobota': None,
u'U_splosno': None,
u'Upravljalec': u'JP LPT d.o.o.'}
Traceback (most recent call last):
  File "test.py", line 14, in <module>
    zas = carpark['zasedenost']
KeyError: 'zasedenost'
As you can see, the error is quite accurate. There's no key 'zasedenost' in the dictionary. If you look through your JSON, you'll see that's true for a number of the elements in that array.
I'd suggest a fix, but I don't know what you want to do in the case where this dictionary key is absent. Perhaps you want something like this:
zas = carpark.get('zasedenost')
if zas is not None:
    free_spaces = zas.get('P_kratkotrajniki')
    last_updated = zas.get('Cas_timestamp')
else:
    free_spaces = None
    last_updated = None
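An equivalent, slightly more compact variant (assuming 'zasedenost', when present, always maps to a dict):

zas = carpark.get('zasedenost', {})
free_spaces = zas.get('P_kratkotrajniki')  # None whenever 'zasedenost' was absent
last_updated = zas.get('Cas_timestamp')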