Convert JSON list into dictionary in Python

The data I am using is the Twitter API's trending topics.
url_0 = 'https://api.twitter.com/1.1/trends/place.json?id=2459115'
res = requests.get(url_0, auth=auth)
print(res, res.status_code, res.headers['content-type'])
print(res.url)
top_trends_twitter = res.json()
data = top_trends_twitter[0]
This is what data looks like:
[{'as_of': '2017-02-13T21:59:32Z',
  'created_at': '2017-02-13T21:53:22Z',
  'locations': [{'name': 'New York', 'woeid': 2459115}],
  'trends': [{'name': 'Victor Cruz',
              'promoted_content': None,
              'query': '%22Victor+Cruz%22',
              'tweet_volume': 45690,
              'url': 'http://twitter.com/search?q=%22Victor+Cruz%22'},
             {'name': '#percussion',
              'promoted_content': None,
              'query': '%23percussion',
              'tweet_volume': None,
              'url': 'http://twitter.com/search?q=%23percussion'}, .....etc
Now, after I connect to the SQL server and create the database and table, an error appears. This is the part that is causing me trouble:
for entry in data:
    trendname = entry['trends']['name']
    url = entry['trends']['url']
    num_tweets = entry['trends']['trend_volume']
    date = entry['as_of']
    print("Inserting trend", trendname, "at", url)
    query_parameters = (trendname, url, num_tweets, date)
    cursor.execute(query_template, query_parameters)
con.commit()
cursor.close()
Then, I get this error:
TypeError Traceback (most recent call last)
<ipython-input-112-da3e17aadce0> in <module>()
29
30 for entry in data:
---> 31 trendname = entry['trends']['name']
32 url = entry['trends']['url']
33 num_tweets = entry['trends']['trend_volume']
TypeError: string indices must be integers
How can I get at these nested values as dictionaries, so that the for entry in data code works?

You need entry['trends'][0]['name']. entry['trends'] is a list, and you need an integer index to access items of a list.
Try it like so:
data = [{'as_of': '2017-02-13T21:59:32Z',
         'created_at': '2017-02-13T21:53:22Z',
         'locations': [{'name': 'New York', 'woeid': 2459115}],
         'trends': [{'name': 'Victor Cruz',
                     'promoted_content': None,
                     'query': '%22Victor+Cruz%22',
                     'tweet_volume': 45690,
                     'url': 'http://twitter.com/search?q=%22Victor+Cruz%22'},
                    {'name': '#percussion',
                     'promoted_content': None,
                     'query': '%23percussion',
                     'tweet_volume': None,
                     'url': 'http://twitter.com/search?q=%23percussion'}]}]
for entry in data:
    date = entry['as_of']
    for trend in entry['trends']:
        trendname = trend['name']
        url = trend['url']
        num_tweets = trend['tweet_volume']
        print(trendname, url, num_tweets, date)
Output:
Victor Cruz http://twitter.com/search?q=%22Victor+Cruz%22 45690 2017-02-13T21:59:32Z
#percussion http://twitter.com/search?q=%23percussion None 2017-02-13T21:59:32Z
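Applied back to the original insert loop, the fix would look like this; a sketch, where query_template, cursor, and con are assumed from the asker's setup:
# Sketch: the asker's insert loop with the nested-list fix applied.
# query_template, cursor, and con come from the surrounding code.
for entry in data:
    date = entry['as_of']
    for trend in entry['trends']:
        trendname = trend['name']
        url = trend['url']
        num_tweets = trend['tweet_volume']  # note: the key is 'tweet_volume', not 'trend_volume'
        print("Inserting trend", trendname, "at", url)
        query_parameters = (trendname, url, num_tweets, date)
        cursor.execute(query_template, query_parameters)
con.commit()
cursor.close()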

Related

Create a new dictionary from a nested JSON output after parsing

In Python 3 I need to get a JSON response from an API call,
and parse it so I get a dictionary that only contains the data I need.
The final dictionary I expect to get is as follows:
{'Severity Rules': ('cc55c459-eb1a-11e8-9db4-0669bdfa776e', ['cc637182-eb1a-11e8-9db4-0669bdfa776e']), 'auto_collector': ('57e9a4ec-21f7-4e0e-88da-f0f1fda4c9d1', ['0ab2470a-451e-11eb-8856-06364196e782'])}
The JSON response returns the following output:
{
    'RuleGroups': [{
        'Id': 'cc55c459-eb1a-11e8-9db4-0669bdfa776e',
        'Name': 'Severity Rules',
        'Order': 1,
        'Enabled': True,
        'Rules': [{
            'Id': 'cc637182-eb1a-11e8-9db4-0669bdfa776e',
            'Name': 'Severity Rule',
            'Description': 'Look for default severity text',
            'Enabled': False,
            'RuleMatchers': None,
            'Rule': '\\b(?P<severity>DEBUG|TRACE|INFO|WARN|ERROR|FATAL|EXCEPTION|[I|i]nfo|[W|w]arn|[E|e]rror|[E|e]xception)\\b',
            'SourceField': 'text',
            'DestinationField': 'text',
            'ReplaceNewVal': '',
            'Type': 'extract',
            'Order': 21520,
            'KeepBlockedLogs': False
        }],
        'Type': 'user'
    }, {
        'Id': '4f6fa7c6-d60f-49cd-8c3d-02dcdff6e54c',
        'Name': 'auto_collector',
        'Order': 4,
        'Enabled': True,
        'Rules': [{
            'Id': '2d6bdc1d-4064-11eb-8856-06364196e782',
            'Name': 'auto_collector',
            'Description': 'DO NOT CHANGE!! Created via API coralogix-blocker tool',
            'Enabled': False,
            'RuleMatchers': None,
            'Rule': 'AUTODISABLED',
            'SourceField': 'subsystemName',
            'DestinationField': 'subsystemName',
            'ReplaceNewVal': '',
            'Type': 'block',
            'Order': 1,
            'KeepBlockedLogs': False
        }],
        'Type': 'user'
    }]
}
I was able to create a dictionary that contains the Name and the RuleGroups Id, like this:
response = requests.get(url, headers=headers)
output = response.json()
outputlist = output["RuleGroups"]
groupRuleName = [li['Name'] for li in outputlist]
groupRuleID = [li['Id'] for li in outputlist]
# Create a dictionary of NAME + ID
ruleDic = {}
for key in groupRuleName:
    for value in groupRuleID:
        ruleDic[key] = value
        groupRuleID.remove(value)
        break
Which gave me a simple dictionary:
{'Severity Rules': 'cc55c459-eb1a-11e8-9db4-0669bdfa776e', 'Rewrites': 'ddbaa27e-1747-11e9-9db4-0669bdfa776e', 'Extract': '0cb937b6-2354-d23a-5806-4559b1f1e540', 'auto_collector': '4f6fa7c6-d60f-49cd-8c3d-02dcdff6e54c'}
but when I tried to parse the nested part of the JSON, things just didn't work.
In the end, I managed to create a function that returns this dictionary.
I do it by breaking the JSON into three lists for the elements I need (Name, Id, and Rules from the first level of nesting), then building another list from the nested JSON (everything under Rules) that collects only the 'Id' values.
Finally, I create the dictionary by zipping the lists created earlier.
def get_filtered_rules() -> dict:  # the function returns a dict, not List[dict]
    groupRuleName = [li['Name'] for li in outputlist]
    groupRuleID = [li['Id'] for li in outputlist]
    ruleIDList = [li['Rules'] for li in outputlist]
    ruleIDListClean = []
    ruleClean = []
    for sublist in ruleIDList:
        try:
            lstRule = [item['Rule'] for item in sublist]
            ruleClean.append(lstRule)
            ruleContent = list(zip(groupRuleName, ruleClean))
            ruleContentDictionary = dict(ruleContent)
            lstID = [item['Id'] for item in sublist]
            ruleIDListClean.append(lstID)
            # Create a dictionary of NAME -> (ID, [RuleIDs])
            ruleDic = dict(zip(groupRuleName, zip(groupRuleID, ruleIDListClean)))
        except Exception as e:
            print(e)
    return ruleDic
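For reference, the same NAME -> (ID, [RuleIDs]) dictionary can be built in one pass with a dict comprehension; this is a sketch assuming outputlist is output["RuleGroups"] as in the snippets above:
# Sketch: one-pass equivalent of get_filtered_rules().
# Assumes outputlist = output["RuleGroups"] as above.
ruleDic = {
    group['Name']: (group['Id'], [rule['Id'] for rule in group['Rules']])
    for group in outputlist
}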

How to flatten the nested dict '_source' column of a csv into a dataframe

I have a CSV with 500+ rows where one column, "_source", is stored as JSON, and I want to extract that into a pandas dataframe with each key as its own column. It is about 1 MB of online social media data (Facebook, Twitter, web-crawled, etc.): approximately 528 separate rows of posts/tweets/text, each having many dictionaries nested inside dictionaries. I am attaching a few steps from my Jupyter notebook below to give a more complete understanding. I need to turn all the key/value pairs, including those of the dictionaries inside dictionaries, into columns of a dataframe.
Thank you so much; this will be a huge help!
I have tried changing it to a dataframe by doing this:
source = pd.DataFrame.from_dict(source, orient='columns')
And it returns something like this... I thought it might unpack the dictionary, but it did not.
#source.head()
#_source
#0 {'sub_organization_id': 'default', 'uid': 'aba...
#1 {'sub_organization_id': 'default', 'uid': 'ab0...
#2 {'sub_organization_id': 'default', 'uid': 'ac0...
below is the shape
#source.shape (528, 1)
Below is what an actual "_source" row looks like stretched out. There are many dictionaries and key:value pairs, where each key needs to be its own column. Thanks! The actual links have been altered/scrambled for privacy reasons.
{'sub_organization_id': 'default',
 'uid': 'ac0fafe9ba98327f2d0c72ddc365ffb76336czsa13280b',
 'project_veid': 'default',
 'campaign_id': 'default',
 'organization_id': 'default',
 'meta': {
     'rule_matcher': [{
         'atribs': {'website': 'github.com/res',
                    'source': 'Explicit',
                    'version': '1.1',
                    'type': 'crawl'},
         'results': [{'rule_type': 'hashtag',
                      'rule_tag': 'Far',
                      'description': None,
                      'project_veid': 'A7180EA-7078-0C7F-ED5D-86AD7',
                      'campaign_id': '2A6DA0C-365BB-67DD-B05830920',
                      'value': '#Far',
                      'organization_id': None,
                      'sub_organization_id': None,
                      'appid': 'ray',
                      'project_id': 'CDE2F42-5B87-C594-C900E578C',
                      'rule_id': '1838',
                      'node_id': None,
                      'metadata': {'campaign_title': 'AF',
                                   'project_title': 'AF '}}]}],
     'render': [{
         'attribs': {'website': 'github.com/res',
                     'version': '1.0',
                     'type': 'Page Render'},
         'results': [{'render_status': 'success',
                      'path': 'https://east.amanaws.com/rays-ime-store/renders/b/b/70f7dffb8b276f2977f8a13415f82c.jpeg',
                      'image_hash': 'bb7674b8ea3fc05bfd027a19815f82c',
                      'url': 'https://discooprdapp.com/',
                      'load_time': 32}]}]},
 'norm_attribs': {'website': 'github.com/res',
                  'version': '1.1',
                  'type': 'crawl'},
 'project_id': 'default',
 'system_timestamp': '2019-02-22T19:04:53.569623',
 'doc': {'appid': 'subtter',
         'links': [],
         'response_url': 'https://discooprdapp.com',
         'url': 'https://discooprdapp.com/',
         'status_code': 200,
         'status_msg': 'OK',
         'encoding': 'utf-8',
         'attrs': {'uid': '2ab8f2651cb32261b911c990a8b'},
         'timestamp': '2019-02-22T19:04:53.963',
         'crawlid': '7fd95-785-4dd259-fcc-8752f'},
 'type': 'crawl',
 'norm': {'body': '\n',
          'domain': 'discordapp.com',
          'author': 'crawl',
          'url': 'https://discooprdapp.com',
          'timestamp': '2019-02-22T19:04:53.961283+00:00',
          'id': '7fc5-685-4dd9-cc-8762f'}}
Before you post, please make sure the actual code works for the data attached. Thanks!
I tried the code below, but it did not work; there was a syntax error that I could not figure out.
pd.io.json.json_normalize(source_data.[_source].apply(json.loads))
pd.io.json.json_normalize(source_data.[_source].apply(json.loads))
^
SyntaxError: invalid syntax
Whoever can help me with this will be a saint!
I had to do something like that a while back. Basically, I used a function that completely flattens out the JSON to identify the keys that would be turned into the columns, then iterated through the flattened JSON to reconstruct each row and append it to a "results" dataframe. With the data you provided, it created a 52-column row, and looking through it, it looks like every key got its own column. Anything nested, for example 'meta': {'rule_matcher': [{'atribs': {'website': ...}}]}, then gets a column name like meta.rule_matcher.atribs.website, where the '.' denotes the nested keys.
data_source = {
    'sub_organization_id': 'default',
    'uid': 'ac0fafe9ba98327f2d0c72ddc365ffb76336czsa13280b',
    'project_veid': 'default',
    'campaign_id': 'default',
    'organization_id': 'default',
    'meta': {
        'rule_matcher': [{
            'atribs': {'website': 'github.com/res',
                       'source': 'Explicit',
                       'version': '1.1',
                       'type': 'crawl'},
            'results': [{'rule_type': 'hashtag',
                         'rule_tag': 'Far',
                         'description': None,
                         'project_veid': 'A7180EA-7078-0C7F-ED5D-86AD7',
                         'campaign_id': '2A6DA0C-365BB-67DD-B05830920',
                         'value': '#Far',
                         'organization_id': None,
                         'sub_organization_id': None,
                         'appid': 'ray',
                         'project_id': 'CDE2F42-5B87-C594-C900E578C',
                         'rule_id': '1838',
                         'node_id': None,
                         'metadata': {'campaign_title': 'AF',
                                      'project_title': 'AF '}}]}],
        'render': [{
            'attribs': {'website': 'github.com/res',
                        'version': '1.0',
                        'type': 'Page Render'},
            'results': [{'render_status': 'success',
                         'path': 'https://east.amanaws.com/rays-ime-store/renders/b/b/70f7dffb8b276f2977f8a13415f82c.jpeg',
                         'image_hash': 'bb7674b8ea3fc05bfd027a19815f82c',
                         'url': 'https://discooprdapp.com/',
                         'load_time': 32}]}]},
    'norm_attribs': {'website': 'github.com/res',
                     'version': '1.1',
                     'type': 'crawl'},
    'project_id': 'default',
    'system_timestamp': '2019-02-22T19:04:53.569623',
    'doc': {'appid': 'subtter',
            'links': [],
            'response_url': 'https://discooprdapp.com',
            'url': 'https://discooprdapp.com/',
            'status_code': 200,
            'status_msg': 'OK',
            'encoding': 'utf-8',
            'attrs': {'uid': '2ab8f2651cb32261b911c990a8b'},
            'timestamp': '2019-02-22T19:04:53.963',
            'crawlid': '7fd95-785-4dd259-fcc-8752f'},
    'type': 'crawl',
    'norm': {'body': '\n',
             'domain': 'discordapp.com',
             'author': 'crawl',
             'url': 'https://discooprdapp.com',
             'timestamp': '2019-02-22T19:04:53.961283+00:00',
             'id': '7fc5-685-4dd9-cc-8762f'}
}
Code:
def flatten_json(y):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out
flat = flatten_json(data_source)

import pandas as pd
import re

results = pd.DataFrame()
special_cols = []
columns_list = list(flat.keys())
for item in columns_list:
    try:
        row_idx = re.findall(r'\_(\d+)\_', item)[0]
    except IndexError:
        # no numeric index in the key: a top-level (non-list) field
        special_cols.append(item)
        continue
    column = re.findall(r'\_\d+\_(.*)', item)[0]
    column = re.sub(r'\_\d+\_', '.', column)
    row_idx = int(row_idx)
    value = flat[item]
    results.loc[row_idx, column] = value

for item in special_cols:
    results[item] = flat[item]
Output:
print (results.to_string())
atribs_website atribs_source atribs_version atribs_type results.rule_type results.rule_tag results.description results.project_veid results.campaign_id results.value results.organization_id results.sub_organization_id results.appid results.project_id results.rule_id results.node_id results.metadata_campaign_title results.metadata_project_title attribs_website attribs_version attribs_type results.render_status results.path results.image_hash results.url results.load_time sub_organization_id uid project_veid campaign_id organization_id norm_attribs_website norm_attribs_version norm_attribs_type project_id system_timestamp doc_appid doc_response_url doc_url doc_status_code doc_status_msg doc_encoding doc_attrs_uid doc_timestamp doc_crawlid type norm_body norm_domain norm_author norm_url norm_timestamp norm_id
0 github.com/res Explicit 1.1 crawl hashtag Far NaN A7180EA-7078-0C7F-ED5D-86AD7 2A6DA0C-365BB-67DD-B05830920 #Far NaN NaN ray CDE2F42-5B87-C594-C900E578C 1838 NaN AF AF github.com/res 1.0 Page Render success https://east.amanaws.com/rays-ime-store/render... bb7674b8ea3fc05bfd027a19815f82c https://discooprdapp.com/ 32.0 default ac0fafe9ba98327f2d0c72ddc365ffb76336czsa13280b default default default github.com/res 1.1 crawl default 2019-02-22T19:04:53.569623 subtter https://discooprdapp.com https://discooprdapp.com/ 200 OK utf-8 2ab8f2651cb32261b911c990a8b 2019-02-22T19:04:53.963 7fd95-785-4dd259-fcc-8752f crawl \n discordapp.com crawl https://discooprdapp.com 2019-02-22T19:04:53.961283+00:00 7fc5-685-4dd9-cc-8762f
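As an aside, pandas ships a json_normalize helper that produces similar dot-joined column names for nested dicts; a minimal sketch with the same data_source (note that nested lists such as meta.rule_matcher stay as single object cells unless record_path is used to explode them):
import pandas as pd

# Sketch: pandas' built-in normalizer. Nested dict keys become
# dot-joined column names; lists are not exploded without record_path.
df = pd.json_normalize(data_source, sep='.')
print(df.columns.tolist())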

Dynamically assign obtained results to variables in Python

I have an API response that lists information for all volumes. I want to loop through the response, get the value of each name, and dynamically assign each one to its own URL.
This is my main API endpoint, which returns the following:
[{'source': None, 'serial': '23432', 'created': '2018-11-12T04:27:14Z',
  'name': 'v001', 'size': 456456},
 {'source': None, 'serial': '4364576', 'created': '2018-11-12T04:27:16Z',
  'name': 'v002', 'size': 345435},
 {'source': None, 'serial': '6445645', 'created': '2018-11-12T04:27:17Z',
  'name': 'v003', 'size': 23432},
 {'source': None, 'serial': 'we43235', 'created': '2018-11-12T04:27:20Z',
  'name': 'v004', 'size': 35435}]
I'm doing this to get the value of 'name':
test_url = 'https://0.0.0.0/api/1.1/volume'
test_data = json.loads(r.get(test_url, headers=headers,
                             verify=False).content.decode('UTF-8'))
new_data = [{'name': value['name']} for value in test_data]
final_data = [val['name'] for val in new_data]
for k in final_data:
    print(k)
k prints out all the values of name, but I'm stuck at the point where I want to use them to assign different API endpoints. Right now, k returns
v001
v002
v003
v004
I want to assign each one of them to a different endpoint, like below:
url_v001 = test_url + v001
url_v002 = test_url + v002
url_v003 = test_url + v003
url_v004 = test_url + v004
I want this to be dynamically done, because there may be more than 4 volume names returned by my main API.
It wouldn't be good to create variables dynamically; the best way is to use a dictionary:
d = {}
for k in final_data:
    d['url_' + k] = test_url + k
Or, much better, a dictionary comprehension:
d = {'url_' + k: test_url + k for k in final_data}
And now:
print(d)
Both produce:
{'url_v001': 'https://0.0.0.0/api/1.1/volumev001', 'url_v002': 'https://0.0.0.0/api/1.1/volumev002', 'url_v003': 'https://0.0.0.0/api/1.1/volumev003', 'url_v004': 'https://0.0.0.0/api/1.1/volumev004'}
To use d:
for k, v in d.items():
    print(k + ',', v)
Outputs:
url_v001, https://0.0.0.0/api/1.1/volumev001
url_v002, https://0.0.0.0/api/1.1/volumev002
url_v003, https://0.0.0.0/api/1.1/volumev003
url_v004, https://0.0.0.0/api/1.1/volumev004
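If the end goal is to call each volume's endpoint, the dictionary can then drive the requests directly; a sketch, where r (the requests alias), headers, and verify=False are assumptions carried over from the question:
# Sketch: iterate over the generated endpoints and call each one.
# 'r', 'headers', and verify=False come from the question's setup.
for name, volume_url in d.items():
    resp = r.get(volume_url, headers=headers, verify=False)
    print(name, resp.status_code)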

Web Scraping: getting KeyError when parsing JSON in Python

I want to extract the full address from the webpage and I'm using BeautifulSoup and JSON.
Here's my code:
import json
import requests
from bs4 import BeautifulSoup

url = 'xxxxxxxxxxxxxxxxx'
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, 'html.parser')
for i in soup.find_all('div', attrs={'data-integration-name': 'redux-container'}):
    info = json.loads(i.get('data-payload'))
I printed 'info' out:
{'storeName': None, 'props': {'locations': [{'dirty': False, 'updated_at': '2016-05-05T07:57:19.282Z', 'country_code': 'US', 'company_id': 106906, 'longitude': -74.0001954, 'address': '5 Crosby St 3rd Floor', 'state': 'New York', 'full_address': '5 Crosby St 3rd Floor, New York, 10013, New York, USA', 'country': 'United States', 'id': 17305, 'to_params': 'new-york-us', 'latitude': 40.719753, 'region': '', 'city': 'New York', 'description': '', 'created_at': '2015-01-19T01:32:16.317Z', 'zip_code': '10013', 'hq': True}]}, 'name': 'LocationsMapList'}
What I want is the "full_address" under "locations", so my code was:
info = json.loads(i.get('data-payload'))
for i in info['props']['locations']:
    print(i['full_address'])
But I got this error:
----> 5 for i in info['props']['locations']:
KeyError: 'locations'
I want to print the full address out, which is '5 Crosby St 3rd Floor, New York, 10013, New York, USA'.
Thanks a lot!
The data you are parsing seems to be inconsistent; the keys are not present in all objects.
If you still want to perform a loop, you need either a try/except statement to catch the exception, or the dict method get to set a fallback when you're looking for a key that may be missing.
info = json.loads(i.get('data-payload'))
for item in info['props'].get('locations', []):
    print(item.get('full_address', 'no address'))
get('locations', []) returns an empty list if the key 'locations' doesn't exist, so the loop doesn't run any iteration.
get('full_address', 'no address') returns "no address" in case there is no such key.
EDIT:
The data is inconsistent (never trust data). Some JSON objects have a props key with a null/None value. The next fix should correct that:
info = json.loads(i.get('data-payload'))
if info.get('props'):
    for item in info['props'].get('locations', []):
        print(item.get('full_address', 'no address'))
Your first object is fine, but it's clear that your second object has no locations key anywhere, nor full_address.
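For completeness, the try/except variant mentioned above would look like this; it behaves the same as the get-based version:
info = json.loads(i.get('data-payload'))
try:
    for item in info['props']['locations']:
        print(item.get('full_address', 'no address'))
except (KeyError, TypeError):
    # 'props' or 'locations' is missing, or 'props' is None
    pass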

Send location (lat & long) to API AI

I'm using API AI to make a Facebook bot. After sharing my location with the Facebook chat bot, I got JSON like this:
{'message': {'attachments': [{'payload': {'coordinates': {'lat': 52.335001190772,'long': 4.8887078680234}},'title': 'Holiday Inn','type': 'location','url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.bing.com%2Fmaps%2Fdefault.aspx%3Fv%3D2%26pc%3DFACEBK%26mid%3D8100%26where1%3DDe%2BBoelelaan%2B2%252C%2B1083%2BHJ%2BAmsterdam%252C%2BNetherlands%26FORM%3DFBKPL1%26mkt%3Den-US&h=mAQEt4NIX&s=1&enc=AZN97DQxpVq5xpkZqvgi3bMq2OVJNwWBOXOiIOW4FHx1-kgYHxTPKfFwRkUsl0ibr0K5GAquaEltxBMLGvjxmUbCa1AmptlN85rg4jLhDH6K0g'}],
According to that JSON, I have the lat and long values:
payload = message['message']['attachments'][0]['payload']
lat = payload['coordinates']['lat']
long = payload['coordinates']['long']
What I want is to send those values to parameters in API AI. So I wrote a method to post that:
def post_location(self, text, lat, long):
    return self.get(
        params={
            'query': text,
            'parameters': {
                'latitude': lat,
                'longitude': long,
            },
            'lang': 'en'
        }
    )
And this is what my get looks like:
def get(self, params):
    """
    Get connection with api ai
    :rtype: object
    """
    print(params)
    request = requests.get(
        url=self.url, params=params, headers=self.headers)
    content = request.content.decode('utf-8')
    try:
        return json.loads(content)
    except ValueError:
        print('Invalid JSON')
Finally, I call the post_location method where I handle the Facebook message and give it the values, but when I run it, only the text (query) is sent to API AI.
def _post_location_to_api(message, lat, long):
    ai_api = AIApi()
    return ai_api.post_location(message, lat, long)

location = _post_location_to_api(message['message']['attachments'][0]['type'], latitude, longitude)
print(location) gives me this:
{'id': 'd4374511-86ce-4ccb-b7b3-e813011a0998', 'sessionId': '00000000-0000-0000-0000-000000000000', 'timestamp': '2016-10-04T11:26:11.613Z', 'result': {'action': 'PlaceSearch', 'actionIncomplete': False, 'score': 0.51, 'contexts': [], 'fulfillment': {'speech': ''}, 'parameters': {'latitude': '', 'longitude': ''}, 'source': 'agent', 'resolvedQuery': 'location', 'metadata': {'intentName': 'I want to eat in Amsterdam', 'intentId': '89f515a6-f723-4df5-96b2-9e0f784747c6', 'webhookUsed': 'true', 'warning': 'IMPORTANT: Please use unique session ID for each client. Otherwise you may experience unstable behaviour.'}}, 'status': {'errorDetails': 'Webhook call failed. Status code 404. Error:404 Not Found', 'errorType': 'partial_content', 'code': 206}}
What did I do wrong?
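One thing worth checking, given that the response shows empty latitude and longitude: requests does not serialize nested dicts passed in params. When a value in params is itself a dict, requests iterates over it and keeps only its keys, so the coordinate values are silently dropped from the query string. A minimal sketch to see this, with example.com standing in for the real endpoint:
import requests

# Sketch: how requests encodes a nested dict inside params.
# Only the inner keys survive; the latitude/longitude values are lost.
req = requests.Request('GET', 'https://example.com/query',
                       params={'query': 'location',
                               'parameters': {'latitude': 52.335, 'longitude': 4.888},
                               'lang': 'en'})
print(req.prepare().url)
# https://example.com/query?query=location&parameters=latitude&parameters=longitude&lang=en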
