How to scrape attributes from JSON values - Python

I am trying to scrape some values from a JSON document that looks like this:
{
"attributes":{
"531":{
"id":"531",
"code":"taille",
"label":"taille",
"options":[
{
"id":"30",
"label":"40",
"is_in":"0"
},
{
"id":"31",
"label":"41",
"is_in":"1"
}
]
}
},
"template":"Helloworld"
}
My issue is that the number 531 is different in each JSON file I am scraping, and what I want to grab from this JSON are the label and is_in values.
What I have done so far is something like the following, but I am stuck and don't know what to do when the 531 changes to something else:
import json

getOption = '''{
"attributes":{
"531":{
"id":"531",
"code":"taille",
"label":"taille",
"options":[
{
"id":"30",
"label":"40",
"is_in":"0"
},
{
"id":"31",
"label":"41",
"is_in":"1"
}
]
}
},
"template":"Helloworld"
}'''

for att, values in json.loads(getOption).items():
    print(values)
So how can I scrape the label and is_in values?

I'm not sure whether you can have several keys like "531", but you can loop through whatever keys are present.
getOption = {
"attributes":{
"531":{
"id":"531",
"code":"taille",
"label":"taille",
"options":[
{
"id":"30",
"label":"40",
"is_in":"0"
},
{
"id":"31",
"label":"41",
"is_in":"1"
}
]
}
},
"template":"Helloworld"
}
attributes = getOption['attributes']
for key in attributes.keys():
    for item in attributes[key]['options']:
        print(item['label'], item['is_in'])
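If the JSON arrives as a raw string (for example read from a file or an HTTP response) rather than as an already-parsed dict, it has to go through json.loads first. A minimal sketch, assuming the same structure as above and iterating over values() so the actual "531" key never has to be known:

import json

raw = '{"attributes": {"531": {"options": [{"id": "30", "label": "40", "is_in": "0"}]}}, "template": "Helloworld"}'

data = json.loads(raw)  # parse the JSON string into a Python dict
for attribute in data['attributes'].values():  # works whatever the "531" key happens to be
    for option in attribute['options']:
        print(option['label'], option['is_in'])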

Related

Updating double nested object in MongoDB with pymongo

I have the following object in MongoDB:
{
"_id":"...",
"username":"XXX",
"keys":[
{
"key":"c443c2cc2754d3",
"ref":"Autos",
"lists":[
{
"list_name":"Toyota",
"key":"c443c2cc2754d3",
"broken_parts":{
"headlights":false,
"bonnet":true,
"interior":{
"dashboard":true,
"electronics":false
}
},
"timestamp":"2023-01-26T13:00:21.803Z",
"status":"parked"
},
{
"list_name":"Nissan",
"key":"c443c2cc2754d3",
"broken_parts":{
"headlights":true,
"bonnet":false,
"interior":{
"dashboard":false,
"electronics":true
}
},
"timestamp":"2023-01-26T13:00:21.803Z",
"status":"garage"
}
]
},
{
"key":"80d54bd834ff60",
"ref":"Trucks",
"lists":[
{
"list_name":"MAN",
"key":"c443c2cc2754d3",
"broken_parts":{
"headlights":false,
"bonnet":false,
"interior":{
"dashboard":false,
"electronics":false
}
},
"timestamp":"2023-01-26T13:00:21.803Z",
"status":"parked"
},
{
"list_name":"Toyota",
"key":"c443c2cc2754d3",
"broken_parts":{
"headlights":true,
"bonnet":false,
"interior":{
"dashboard":true,
"electronics":true
}
},
"timestamp":"2023-01-26T13:00:21.803Z",
"status":"leased"
}
]
}
]
}
and whenever I try to update the status of an item inside a list, only the first object gets updated.
For example, if I try to update the status of the 2nd object in the 1st list, it still only updates the first one:
self.users.update_one({"username": username,
"keys.key": key,
"keys.lists.list_name": listid
},
{"$set": {
"keys.0.lists.$.status": 'repaired'
}
}
)
What am I doing wrong, and how can I update the list item I want? Or even the 3rd-level nested fields (dashboard, electronics)?
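One approach commonly used for this kind of doubly nested update is MongoDB's filtered positional operator ($[identifier]) together with pymongo's array_filters argument, which names the array element to update instead of relying on the single positional $ operator (which only updates the first matching element and cannot traverse nested arrays). A minimal sketch, not the question author's confirmed solution; the database and collection names are hypothetical, and username, key and listid are assumed to be the same variables as in the question:

from pymongo import MongoClient

client = MongoClient()           # assumes a local MongoDB instance
users = client["mydb"]["users"]  # hypothetical database/collection names

users.update_one(
    {"username": username},
    {"$set": {"keys.$[k].lists.$[l].status": "repaired"}},
    array_filters=[
        {"k.key": key},           # select the matching element of "keys"
        {"l.list_name": listid},  # select the matching element of "lists"
    ],
)

Deeper fields can be addressed the same way, e.g. "keys.$[k].lists.$[l].broken_parts.interior.dashboard".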

Python: merge Nested Dictionary into one JSON

How do I merge the strings produced by a generator of nested dictionaries into one JSON document?
I get nested dictionaries from a yield generator, and my goal is to end up with a single JSON file.
The generator currently produces these correct nested-dictionary strings:
{"domain.com": {"Chrome": "19362.344607264396"}}
{"domain.com": {"ChromeMobile": "7177.498437391487"}}
{"another.com": {"MobileSafari": "6237.433155080214"}}
{"another.com": {"Safari": "5895.409403430795"}}
and I want to merge them into one JSON file like this:
[
{
"domain.com": {
"Chrome": "19362.344607264396"
}
},
{
"domain.com": {
"ChromeMobile": "7177.498437391487"
}
},
{
"another.com": {
"MobileSafari": "6237.433155080214"
}
},
{
"another.com": {
"Safari": "5895.409403430795"
}
}
]
or, ideally, to have a JSON document like this:
{
"browsers": [
{
"domain.com": {
"Chrome": "19362.344607264396",
"ChromeMobile": "7177.498437391487",
},
"another.com": {
"MobileSafari": "6237.433155080214",
"Safari": "5895.409403430795"
}
}
]
}
My code
# Cloudflare zone bandwidth total
def browser_map_page_views(domain_zone):
    cloudflare = prom.custom_query(
        query="topk(5, sum by(family) (increase(browser_map_page_views_count{job='cloudflare', zone='"f'{domain_zone}'"'}[10d])))"
    )
    for domain_z in cloudflare:
        user_agent = domain_z['metric']['family']
        value = domain_z['value'][1]
        yield {domain_zone: {user_agent: {'value': value}}}

# Get list of zones from Prometheus based on Host Tracker data
def domain_zones():
    zones_domain = prom.custom_query(
        query="host_tracker_uptime_percent{job='donodeexporter'}"
    )
    for domain_z in zones_domain:
        yield domain_z['metric']['zone']
And here is the code that produces the final output:
# Get the list of domains and substitute each one into a Prometheus query.
for domain_list in domain_zones():
    for result in browser_map_page_views(domain_zone=domain_list):
        print(json.dumps(result))
Something like this:
import json

DATA = [
    {"domain.com": {"Chrome": "19362.344607264396"}},
    {"domain.com": {"ChromeMobile": "7177.498437391487"}},
    {"another.com": {"MobileSafari": "6237.433155080214"}},
    {"another.com": {"Safari": "5895.409403430795"}}
]

def yield_data():
    for d in DATA:
        yield d

def merge1():
    return list(yield_data())

def merge2():
    output = {}
    for d in DATA:
        key = list(d.keys())[0]
        output[key] = output.get(key, {})
        output[key].update(d[key])
    return {"browsers": [output]}

print(json.dumps(merge1(), indent=2))
print(json.dumps(merge2(), indent=2))
merge1's output looks like:
[
{
"domain.com": {
"Chrome": "19362.344607264396"
}
},
{
"domain.com": {
"ChromeMobile": "7177.498437391487"
}
},
{
"another.com": {
"MobileSafari": "6237.433155080214"
}
},
{
"another.com": {
"Safari": "5895.409403430795"
}
}
]
merge2's output looks like:
{
"browsers": [
{
"domain.com": {
"Chrome": "19362.344607264396",
"ChromeMobile": "7177.498437391487"
},
"another.com": {
"MobileSafari": "6237.433155080214",
"Safari": "5895.409403430795"
}
}
]
}
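To get the second shape directly from the generators in the question (instead of from a pre-built DATA list), the same merging idea can be applied while iterating. A small sketch, assuming the domain_zones and browser_map_page_views generators from the question and that each yielded record has the flat {domain: {agent: value}} shape shown in the sample output:

import json
from collections import defaultdict

merged = defaultdict(dict)
for zone in domain_zones():
    for record in browser_map_page_views(domain_zone=zone):
        for domain, agents in record.items():
            merged[domain].update(agents)  # accumulate browser stats per domain

print(json.dumps({"browsers": [dict(merged)]}, indent=2))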

How to use only strings to pull JSON data

I keep running into an issue where I have to use an int index to find pieces of data in a JSON response. The code below works, but I want to print every 'name' in the JSON, and as written I would have to change the [0] to 1, then 2, etc. I tried incrementing it but ran into issues there too. This could just be me overlooking something, but let me know, thanks.
import json
import requests

def BaseTesting(TarLink):
    PayloadToSend = {}
    HeadersToSend = {  # make sure to change your token at least once every 30 mins
        'authorization': '',
        'user-agent': ''
    }
    ReqForFriends = requests.post(TarLink, headers=HeadersToSend, data=PayloadToSend).text
    LoadedJSONData = json.loads(ReqForFriends)
    print(LoadedJSONData['friends'][0]['name'])

BaseTesting(TarLink="")
JSON
{
"friends":[
{
"name":"test1",
"user_id":"1132",
"type":2,
"display":"true"
},
{
"name":"test2",
"user_id":"2341",
"type":1,
"display":"true"
},
{
"name":"test3",
"user_id":"1234",
"type":2,
"display":"true"
}
]
}
It seems to work properly:
LoadedJSONData = {
"friends":[
{
"name":"test1",
"user_id":"1132",
"type":2,
"display":"true",
},
{
"name":"test2",
"user_id":"2341",
"type":1,
"display":"true",
},
{
"name":"test3",
"user_id":"1234",
"type":2,
"display":"true",
},
]
}
for i in range(0, 3):
    print(LoadedJSONData['friends'][i]['name'])  # test1 test2 test3
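To avoid hard-coding the range entirely, you can also iterate over the list itself; a small sketch using the same LoadedJSONData dict as above:

# Loop over the friends list directly, so it works for any number of entries
for friend in LoadedJSONData['friends']:
    print(friend['name'])  # test1 test2 test3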

Unable to replicate post_filter query in elasticsearch-dsl

The query I would like to replicate in DSL is as below:
GET /_search
{
"query":{
"bool":{
"must":[
{
"term":{
"destination":"singapore"
}
},
{
"terms":{
"tag_ids":[
"tag_luxury"
]
}
}
]
}
},
"aggs":{
"max_price":{
"max":{
"field":"price_range_from.SGD"
}
},
"min_price":{
"min":{
"field":"price_range_from.SGD"
}
}
},
"post_filter":{
"range":{
"price_range_from.SGD":{
"gte":0.0,
"lte":100.0
}
}
}
}
The above query:
1. Matches the terms destination and tag_ids
2. Aggregates the results to find the max and min price from the field price_range_from.SGD
3. Applies a post_filter to restrict the result set to within the price limits
It works perfectly well in the Elastic/Kibana console.
I replicated the above query in elasticsearch-dsl as below:
es_query = []
es_query.append(Q("term", destination="singapore"))
es_query.append(Q("terms", tag_ids=["tag_luxury"]))
final_query = Q("bool", must=es_query)
es_conn = ElasticSearch.instance().get_client()
dsl_client = DSLSearch(using=es_conn, index=index).get_dsl_client()
dsl_client.query = final_query
dsl_client.aggs.metric("min_price", "min", field="price_range_from.SGD")
dsl_client.aggs.metric("max_price", "max", field="price_range_from.SGD")
q = Q("range", **{"price_range_from.SGD":{"gte": 0.0, "lte": 100.0}})
dsl_client.post_filter(q)
print(dsl_client.to_dict())
response = dsl_client.execute()
print(response.to_dict().get("hits", {}))
Although the aggregations are correct, products beyond the price range are also being returned. There is no error, but it seems the post_filter is simply not applied.
I dug into the dsl_client object to see whether my query was being captured correctly. I can see the query and aggs, but not the post_filter part. The query, converted to a dictionary with dsl_client.to_dict(), is as below:
{
"query":{
"bool":{
"must":[
{
"term":{
"destination":"singapore"
}
},
{
"terms":{
"tag_ids":[
"tag_luxury"
]
}
}
]
}
},
"aggs":{
"min_price":{
"min":{
"field":"price_range_from.SGD"
}
},
"max_price":{
"max":{
"field":"price_range_from.SGD"
}
}
}
}
Please help. Thanks!
You have to re-assign the dsl_client like:
dsl_client = dsl_client.post_filter(q)
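The reason is that Search objects in elasticsearch-dsl are immutable: methods such as post_filter() return a modified copy rather than changing the object in place, so the return value has to be kept. A minimal sketch of the corrected tail of the snippet above:

# post_filter() returns a new Search object; keep the returned copy
q = Q("range", **{"price_range_from.SGD": {"gte": 0.0, "lte": 100.0}})
dsl_client = dsl_client.post_filter(q)

print(dsl_client.to_dict())  # the post_filter section now appears in the dict
response = dsl_client.execute()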

Dictionary length is 3, but accessing an index raises a KeyError

I am attempting to parse a JSON response that looks like this:
{
"links": {
"next": "http://www.neowsapp.com/rest/v1/feed?start_date=2015-09-08&end_date=2015-09-09&detailed=false&api_key=xxx",
"prev": "http://www.neowsapp.com/rest/v1/feed?start_date=2015-09-06&end_date=2015-09-07&detailed=false&api_key=xxx",
"self": "http://www.neowsapp.com/rest/v1/feed?start_date=2015-09-07&end_date=2015-09-08&detailed=false&api_key=xxx"
},
"element_count": 22,
"near_earth_objects": {
"2015-09-08": [
{
"links": {
"self": "http://www.neowsapp.com/rest/v1/neo/3726710?api_key=xxx"
},
"id": "3726710",
"neo_reference_id": "3726710",
"name": "(2015 RC)",
"nasa_jpl_url": "http://ssd.jpl.nasa.gov/sbdb.cgi?sstr=3726710",
"absolute_magnitude_h": 24.3,
"estimated_diameter": {
"kilometers": {
"estimated_diameter_min": 0.0366906138,
"estimated_diameter_max": 0.0820427065
},
"meters": {
"estimated_diameter_min": 36.6906137531,
"estimated_diameter_max": 82.0427064882
},
"miles": {
"estimated_diameter_min": 0.0227984834,
"estimated_diameter_max": 0.0509789586
},
"feet": {
"estimated_diameter_min": 120.3760332259,
"estimated_diameter_max": 269.1689931548
}
},
"is_potentially_hazardous_asteroid": false,
"close_approach_data": [
{
"close_approach_date": "2015-09-08",
"close_approach_date_full": "2015-Sep-08 09:45",
"epoch_date_close_approach": 1441705500000,
"relative_velocity": {
"kilometers_per_second": "19.4850295284",
"kilometers_per_hour": "70146.106302123",
"miles_per_hour": "43586.0625520053"
},
"miss_distance": {
"astronomical": "0.0269230459",
"lunar": "10.4730648551",
"kilometers": "4027630.320552233",
"miles": "2502653.4316094954"
},
"orbiting_body": "Earth"
}
],
"is_sentry_object": false
}
]
}
}
I am trying to figure out how to parse through this to get the "miss_distance" dictionary values, but I am unable to wrap my head around it.
Here is what I have been able to do so far.
After I get a Response object from requests.get():
response = requests.get(url)
I convert the response object to a dictionary:
data = response.json()  # this returns a dictionary object
Then I try to parse the first level of the dictionary:
for i in data:
    if i == "near_earth_objects":
        dataset1 = data["near_earth_objects"]["2015-09-08"]
        # this returns the next object, which is of type list
Could someone please explain:
1. How to decipher this response in the first place?
2. How can I move forward in parsing the response object and get to the miss_distance dictionary?
Any pointers/help are appreciated.
Thank you
Your data will have multiple dictionaries for each date, near-earth object, and close approach:
near_earth_objects = data['near_earth_objects']
for date in near_earth_objects:
    objects = near_earth_objects[date]
    for object in objects:
        close_approach_data = object['close_approach_data']
        for close_approach in close_approach_data:
            print(close_approach['miss_distance'])
The code below gives you a list of (date, miss_distance) records for every object on every date:
import json
raw_json = '''
{
"near_earth_objects": {
"2015-09-08": [
{
"close_approach_data": [
{
"miss_distance": {
"astronomical": "0.0269230459",
"lunar": "10.4730648551",
"kilometers": "4027630.320552233",
"miles": "2502653.4316094954"
},
"orbiting_body": "Earth"
}
]
}
]
}
}
'''
if __name__ == "__main__":
parsed = json.loads(raw_json)
# assuming this json includes more than one near_earch_object spread across dates
near_objects = []
for date, near_objs in parsed['near_earth_objects'].items():
for obj in near_objs:
for appr in obj['close_approach_data']:
o = {
'date': date,
'miss_distances': appr['miss_distance']
}
near_objects.append(o)
print(near_objects)
output:
[
{'date': '2015-09-08',
'miss_distances': {
'astronomical': '0.0269230459',
'lunar': '10.4730648551',
'kilometers': '4027630.320552233',
'miles': '2502653.4316094954'
}
}
]
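Since the API returns the distances as strings, they usually need to be converted to numbers before doing any arithmetic. A small follow-up sketch, assuming the near_objects list built above:

# Convert the string distances to floats for numeric work
for entry in near_objects:
    km = float(entry['miss_distances']['kilometers'])
    print(entry['date'], round(km), 'km')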
