Synonym analyzer not working in elastic search with python - python

I have a scenario as depicted below in python code .
In this I am trying to explicitly define new york and ny as synonyms. But unfortunately it is not working. Can you please guide me as I am new to elastic search.
Also I am using custom analyzer.
I also have the file synonyms.txt having text:
ny,newyork,nyork
from datetime import datetime
from elasticsearch import Elasticsearch
es = Elasticsearch()
keywords = ['thousand eyes', 'facebook', 'superdoc', 'quora', 'your story', 'Surgery', 'lending club', 'ad roll',
'the honest company', 'Draft kings', 'newyork']
count = 1
doc_setting = {
"settings": {
"analysis": {
"analyzer": {
"my_analyzer_keyword": {
"type": "custom",
"tokenizer": "keyword",
"filter": [
"asciifolding",
"lowercase",
"synonym"
]
},
"my_analyzer_shingle": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"asciifolding",
"lowercase",
"synonym"
]
}
},
"filter": {
"synonym": {
"type": "synonym",
"synonyms_path": "synonyms.txt",
"ignore_case": "true"
}
}
}
}, "mappings": {
"your_type": {
"properties": {
"keyword": {
"type": "string",
"index_analyzer": "my_analyzer_keyword",
"search_analyzer": "my_analyzer_shingle"
}
}
}
}
}
validate=es.index(index='test', doc_type='your_type', body=doc_setting)
print(validate)
for keyword in keywords:
doc = {
'id': count,
'keyword': keyword
}
res = es.index(index="test", doc_type='your_type', id=count, body=doc)
print(res['result'])
count = count + 1
#res11 = es.get(index="test", doc_type='your_type', id=1)
#print(res11['_source'])
es.indices.refresh(index="test")
question = "I saw news on ny news channel of lending club on facebook, your story and quora"
print("Question asked: %s" % question)
res = es.search(index="test",`enter code here` doc_type='your_type', body={
"query": {"match": {"keyword": question}}})
print("Got %d Hits:" % res['hits']['total'])
for hit in res['hits']['hits']:
print(hit["_source"])

PUT /test_index
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer_keyword": {
"type": "custom",
"tokenizer": "keyword",
"filter": [
"asciifolding",
"lowercase",
"synonym"
]
},
"my_analyzer_shingle": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"asciifolding",
"lowercase",
"synonym"
]
}
},
"filter": {
"synonym" : {
"type" : "synonym",
"lenient": true,
"synonyms" : ["ny,newyork,nyork"]
}
}
}
}, "mappings": {
"your_type": {
"properties": {
"keyword": {
"type": "text",
"analyzer": "my_analyzer_keyword",
"search_analyzer": "my_analyzer_shingle"
}
}
}
}
}
Then Analyze using
POST /test_index/_analyze
{
"analyzer" : "my_analyzer_shingle",
"text" : "I saw news on ny news channel of lending club on facebook, your story and quorat"
}
The tokens I get are
{
"tokens": [
{
"token": "i",
"start_offset": 0,
"end_offset": 1,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "saw",
"start_offset": 2,
"end_offset": 5,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "news",
"start_offset": 6,
"end_offset": 10,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "on",
"start_offset": 11,
"end_offset": 13,
"type": "<ALPHANUM>",
"position": 3
},
{
"token": "ny",
"start_offset": 14,
"end_offset": 16,
"type": "<ALPHANUM>",
"position": 4
},
{
"token": "newyork",
"start_offset": 14,
"end_offset": 16,
"type": "SYNONYM",
"position": 4
},
{
"token": "nyork",
"start_offset": 14,
"end_offset": 16,
"type": "SYNONYM",
"position": 4
},
{
"token": "news",
"start_offset": 17,
"end_offset": 21,
"type": "<ALPHANUM>",
"position": 5
},
{
"token": "channel",
"start_offset": 22,
"end_offset": 29,
"type": "<ALPHANUM>",
"position": 6
},
{
"token": "of",
"start_offset": 30,
"end_offset": 32,
"type": "<ALPHANUM>",
"position": 7
},
{
"token": "lending",
"start_offset": 33,
"end_offset": 40,
"type": "<ALPHANUM>",
"position": 8
},
{
"token": "club",
"start_offset": 41,
"end_offset": 45,
"type": "<ALPHANUM>",
"position": 9
},
{
"token": "on",
"start_offset": 46,
"end_offset": 48,
"type": "<ALPHANUM>",
"position": 10
},
{
"token": "facebook",
"start_offset": 49,
"end_offset": 57,
"type": "<ALPHANUM>",
"position": 11
},
{
"token": "your",
"start_offset": 59,
"end_offset": 63,
"type": "<ALPHANUM>",
"position": 12
},
{
"token": "story",
"start_offset": 64,
"end_offset": 69,
"type": "<ALPHANUM>",
"position": 13
},
{
"token": "and",
"start_offset": 70,
"end_offset": 73,
"type": "<ALPHANUM>",
"position": 14
},
{
"token": "quorat",
"start_offset": 74,
"end_offset": 80,
"type": "<ALPHANUM>",
"position": 15
}
]
}
and the search produces
POST /test_index/_search
{
"query" : {
"match" : { "keyword" : "I saw news on ny news channel of lending club on facebook, your story and quora" }
}
}
{
"took": 36,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 1.6858001,
"hits": [
{
"_index": "test_index",
"_type": "your_type",
"_id": "4",
"_score": 1.6858001,
"_source": {
"keyword": "newyork"
}
},
{
"_index": "test_index",
"_type": "your_type",
"_id": "2",
"_score": 1.1727304,
"_source": {
"keyword": "facebook"
}
},
{
"_index": "test_index",
"_type": "your_type",
"_id": "5",
"_score": 0.6931472,
"_source": {
"keyword": "quora"
}
}
]
}
}

Related

JSON iteration with different format Python

I have a JSON feed i need to collect data from but I am running into some issues with iterating and parsing the API data.
The function below collects what I need but some horses get scratched (dont race) and the format of their JSON is different to horses who's status is "Starter".
def race_data():
for race_key in races_list():
data = requests.get(f'{Base_url}events/{race_key}.json?app_id={AppID}&app_key={AppKey}').text
# parse race data
data = json.loads(data)
for race_odds in data['competitors']:
horse_name = race_odds['name']
jockey_name = race_odds['jockey']
jockey_weight = race_odds['weight']
trainer_name = race_odds['trainer']
fixed_win_odds = race_odds['prices'][2]['price']
fixed_place_odds = race_odds['prices'][0]['price']
race_key = data['eventKey']
race_time = data['eventDateTimeUtc']
horse_status = race_odds['status']
print(f'{race_key} {horse_name} {jockey_name} {jockey_weight} {trainer_name} {fixed_win_odds} {fixed_place_odds} {race_time} {horse_status}')
if __name__ == "__main__":
race_data()
which produces
202210290200.T.AUS.quirindi.1 Ponte Pietra Jai Williams 0 Nikki Pollock 1.1 16.0 2022-10-29T02:55:00Z Starter
202210290200.T.AUS.quirindi.1 Gundawarra Vad Bolozhinskyi 0 G A O'brien 4.5 35.0 2022-10-29T02:55:00Z Starter
202210290200.T.AUS.quirindi.1 Cemented Ms Chelsea Hillier 0 Sally Torrens 2.5 35.0 2022-10-29T02:55:00Z Starter
202210290200.T.AUS.quirindi.1 Jack Duggan Ms Madeline Owen 0 J C Deamer 1.1 2.7 2022-10-29T02:55:00Z Starter
202210290200.T.AUS.quirindi.1 Northern Borders Ms Zara Lewis 0 W T Martyn 9.6 68.1 2022-10-29T02:55:00Z Starter
202210290200.T.AUS.quirindi.1 Iffland Ms Amelia Denby 0 K A Lees 1.9 19.4 2022-10-29T02:55:00Z Starter
202210290200.T.AUS.quirindi.1 La Brea Benjamin Osmond 0 K A Lees 2.1 14.0 2022-10-29T02:55:00Z Starter
202210290200.T.AUS.quirindi.1 My Gem Jacob Golden 0 Ms S Grills 2.8 16.1 2022-10-29T02:55:00Z Starter
202210290200.T.AUS.quirindi.1 Another Super Patrick Scorse 0 M J Dwyer 4.9 87.5 2022-10-29T02:55:00Z Starter
I run into an issue when one of the runners is scratched (not racing)
the problem is with the odds price field. When a horse is scratched (defined in horse_status)
fixed_win_odds = race_odds['prices'][2]['price']
fixed_place_odds = race_odds['prices'][0]['price']
The JSON below is for a horse that is scratched:
{
"status": "Scratched",
"startPos": 3,
"trainer": "Ms L Selby",
"shortForm": "0608247373",
"sequence": 7,
"weight": "0",
"name": "Bingo Banko",
"jockey": "Unknown",
"prices":
[
{
"betType": "FixedPlace",
"code": "FXD",
"flucs": [],
"price": 0
},
{
"betType": "FixedWin",
"code": "FXD",
"flucs":
[
{
"price": 0,
"productType": "Current"
},
{
"price": 16,
"productType": "Max"
},
{
"price": 16,
"productType": "Open"
},
{
"price": 15,
"productType": "Last"
},
{
"price": 16,
"productType": "Last"
}
],
"price": 0
}
],
"id": "competitor:202210290200.T.AUS.quirindi.1.bingo_banko",
"competitorKey": "202210290200.T.AUS.quirindi.1.bingo_banko"
}
and this is what the JSON looks like when the horse is a starter (not scratched)
{
"status": "Starter",
"startPos": 9,
"trainer": "M J Dwyer",
"shortForm": "7626280005",
"sequence": 6,
"weight": "0",
"name": "Another Super",
"jockey": "Patrick Scorse",
"prices":
[
{
"betType": "Win",
"code": "BT3",
"flucs": [],
"price": 85
},
{
"betType": "FixedPlace",
"code": "FXD",
"flucs": [],
"price": 3.9
},
{
"betType": "Place",
"code": "BT2P",
"flucs": [],
"price": 5.7
},
{
"betType": "FixedWin",
"code": "FXD",
"flucs":
[
{
"price": 17,
"productType": "Current"
},
{
"price": 20,
"productType": "Max"
},
{
"price": 19,
"productType": "Open"
},
{
"price": 17,
"productType": "Last"
},
{
"price": 18,
"productType": "Last"
},
{
"price": 17,
"productType": "Last"
},
{
"price": 16,
"productType": "Last"
},
{
"price": 15,
"productType": "Last"
}
],
"price": 17
}
],
"id": "competitor:202210290200.T.AUS.quirindi.1.another_super",
"competitorKey": "202210290200.T.AUS.quirindi.1.another_super"
}
I still need to collect the information on the horses that are scratched, so need some help with formatting the iteration to collect the data without getting key error on horses that are scratched.
you can view the full JSON for a race below for reference:
[
{
"status": "Starter",
"startPos": 5,
"trainer": "Nikki Pollock",
"shortForm": "x686623464",
"sequence": 3,
"weight": "0",
"name": "Ponte Pietra",
"jockey": "Jai Williams",
"prices":
[
{
"betType": "Win",
"code": "BT3",
"flucs": [],
"price": 15.6
},
{
"betType": "FixedPlace",
"code": "FXD",
"flucs": [],
"price": 2.6
},
{
"betType": "Place",
"code": "BT2P",
"flucs": [],
"price": 1.04
},
{
"betType": "FixedWin",
"code": "FXD",
"flucs":
[
{
"price": 10,
"productType": "Current"
},
{
"price": 16,
"productType": "Max"
},
{
"price": 16,
"productType": "Open"
},
{
"price": 10,
"productType": "Last"
},
{
"price": 9,
"productType": "Last"
},
{
"price": 9,
"productType": "Last"
},
{
"price": 8,
"productType": "Last"
},
{
"price": 8,
"productType": "Last"
}
],
"price": 10
}
],
"id": "competitor:202210290200.T.AUS.quirindi.1.ponte_pietra",
"competitorKey": "202210290200.T.AUS.quirindi.1.ponte_pietra"
},
{
"status": "Starter",
"startPos": 4,
"trainer": "G A O'brien",
"shortForm": "90x4491",
"sequence": 4,
"weight": "0",
"name": "Gundawarra",
"jockey": "Vad Bolozhinskyi",
"prices":
[
{
"betType": "Win",
"code": "BT3",
"flucs": [],
"price": 34
},
{
"betType": "FixedPlace",
"code": "FXD",
"flucs": [],
"price": 1.45
},
{
"betType": "Place",
"code": "BT2P",
"flucs": [],
"price": 11
},
{
"betType": "FixedWin",
"code": "FXD",
"flucs":
[
{
"price": 3,
"productType": "Current"
},
{
"price": 3,
"productType": "Max"
},
{
"price": 3,
"productType": "Open"
},
{
"price": 3,
"productType": "Last"
},
{
"price": 3,
"productType": "Last"
},
{
"price": 3,
"productType": "Last"
},
{
"price": 3,
"productType": "Last"
},
{
"price": 3,
"productType": "Last"
}
],
"price": 3.7
}
],
"id": "competitor:202210290200.T.AUS.quirindi.1.gundawarra",
"competitorKey": "202210290200.T.AUS.quirindi.1.gundawarra"
},
{
"status": "Starter",
"startPos": 6,
"trainer": "Sally Torrens",
"shortForm": "7816x64858",
"sequence": 1,
"weight": "0",
"name": "Cemented",
"jockey": "Ms Chelsea Hillier",
"prices":
[
{
"betType": "Win",
"code": "BT3",
"flucs": [],
"price": 85
},
{
"betType": "FixedPlace",
"code": "FXD",
"flucs": [],
"price": 3.1
},
{
"betType": "Place",
"code": "BT2P",
"flucs": [],
"price": 3
},
{
"betType": "FixedWin",
"code": "FXD",
"flucs":
[
{
"price": 13,
"productType": "Current"
},
{
"price": 19,
"productType": "Max"
},
{
"price": 19,
"productType": "Open"
},
{
"price": 13,
"productType": "Last"
},
{
"price": 14,
"productType": "Last"
},
{
"price": 16,
"productType": "Last"
},
{
"price": 15,
"productType": "Last"
},
{
"price": 16,
"productType": "Last"
}
],
"price": 13
}
],
"id": "competitor:202210290200.T.AUS.quirindi.1.cemented",
"competitorKey": "202210290200.T.AUS.quirindi.1.cemented"
},
{
"status": "Starter",
"startPos": 2,
"trainer": "J C Deamer",
"shortForm": "75x5456x73",
"sequence": 2,
"weight": "0",
"name": "Jack Duggan",
"jockey": "Ms Madeline Owen",
"prices":
[
{
"betType": "Win",
"code": "BT3",
"flucs": [],
"price": 3
},
{
"betType": "FixedPlace",
"code": "FXD",
"flucs": [],
"price": 1.14
},
{
"betType": "Place",
"code": "BT2P",
"flucs": [],
"price": 1.04
},
{
"betType": "FixedWin",
"code": "FXD",
"flucs":
[
{
"price": 1,
"productType": "Current"
},
{
"price": 1,
"productType": "Max"
},
{
"price": 1,
"productType": "Open"
},
{
"price": 1,
"productType": "Last"
},
{
"price": 1,
"productType": "Last"
},
{
"price": 1,
"productType": "Last"
},
{
"price": 1,
"productType": "Last"
},
{
"price": 1,
"productType": "Last"
}
],
"price": 1.8
}
],
"id": "competitor:202210290200.T.AUS.quirindi.1.jack_duggan",
"competitorKey": "202210290200.T.AUS.quirindi.1.jack_duggan"
},
{
"status": "Starter",
"startPos": 10,
"trainer": "W T Martyn",
"shortForm": "00409",
"sequence": 8,
"weight": "0",
"name": "Northern Borders",
"jockey": "Ms Zara Lewis",
"prices":
[
{
"betType": "Win",
"code": "BT3",
"flucs": [],
"price": 116
},
{
"betType": "FixedPlace",
"code": "FXD",
"flucs": [],
"price": 8
},
{
"betType": "Place",
"code": "BT2P",
"flucs": [],
"price": 9
},
{
"betType": "FixedWin",
"code": "FXD",
"flucs":
[
{
"price": 41,
"productType": "Current"
},
{
"price": 51,
"productType": "Max"
},
{
"price": 34,
"productType": "Open"
},
{
"price": 41,
"productType": "Last"
},
{
"price": 51,
"productType": "Last"
},
{
"price": 41,
"productType": "Last"
},
{
"price": 51,
"productType": "Last"
},
{
"price": 41,
"productType": "Last"
}
],
"price": 41
}
],
"id": "competitor:202210290200.T.AUS.quirindi.1.northern_borders",
"competitorKey": "202210290200.T.AUS.quirindi.1.northern_borders"
},
{
"status": "Starter",
"startPos": 8,
"trainer": "K A Lees",
"shortForm": "6",
"sequence": 9,
"weight": "0",
"name": "Iffland",
"jockey": "Ms Amelia Denby",
"prices":
[
{
"betType": "Win",
"code": "BT3",
"flucs": [],
"price": 33.5
},
{
"betType": "FixedPlace",
"code": "FXD",
"flucs": [],
"price": 3.3
},
{
"betType": "Place",
"code": "BT2P",
"flucs": [],
"price": 1.7
},
{
"betType": "FixedWin",
"code": "FXD",
"flucs":
[
{
"price": 14,
"productType": "Current"
},
{
"price": 20,
"productType": "Max"
},
{
"price": 17,
"productType": "Open"
},
{
"price": 14,
"productType": "Last"
},
{
"price": 16,
"productType": "Last"
},
{
"price": 14,
"productType": "Last"
},
{
"price": 16,
"productType": "Last"
},
{
"price": 14,
"productType": "Last"
}
],
"price": 14
}
],
"id": "competitor:202210290200.T.AUS.quirindi.1.iffland",
"competitorKey": "202210290200.T.AUS.quirindi.1.iffland"
},
{
"status": "Starter",
"startPos": 7,
"trainer": "K A Lees",
"shortForm": "55x570x58",
"sequence": 10,
"weight": "0",
"name": "La Brea",
"jockey": "Benjamin Osmond",
"prices":
[
{
"betType": "Win",
"code": "BT3",
"flucs": [],
"price": 43.3
},
{
"betType": "FixedPlace",
"code": "FXD",
"flucs": [],
"price": 2.6
},
{
"betType": "Place",
"code": "BT2P",
"flucs": [],
"price": 1.9
},
{
"betType": "FixedWin",
"code": "FXD",
"flucs":
[
{
"price": 10,
"productType": "Current"
},
{
"price": 13,
"productType": "Max"
},
{
"price": 11,
"productType": "Open"
},
{
"price": 10,
"productType": "Last"
},
{
"price": 9,
"productType": "Last"
},
{
"price": 9,
"productType": "Last"
},
{
"price": 8,
"productType": "Last"
},
{
"price": 9,
"productType": "Last"
}
],
"price": 10
}
],
"id": "competitor:202210290200.T.AUS.quirindi.1.la_brea",
"competitorKey": "202210290200.T.AUS.quirindi.1.la_brea"
},
{
"status": "Starter",
"startPos": 1,
"trainer": "Ms S Grills",
"shortForm": "673410x084",
"sequence": 5,
"weight": "0",
"name": "My Gem",
"jockey": "Jacob Golden",
"prices":
[
{
"betType": "Win",
"code": "BT3",
"flucs": [],
"price": 14.9
},
{
"betType": "FixedPlace",
"code": "FXD",
"flucs": [],
"price": 3.7
},
{
"betType": "Place",
"code": "BT2P",
"flucs": [],
"price": 3.2
},
{
"betType": "FixedWin",
"code": "FXD",
"flucs":
[
{
"price": 16,
"productType": "Current"
},
{
"price": 16,
"productType": "Max"
},
{
"price": 13,
"productType": "Open"
},
{
"price": 16,
"productType": "Last"
},
{
"price": 15,
"productType": "Last"
},
{
"price": 14,
"productType": "Last"
},
{
"price": 16,
"productType": "Last"
},
{
"price": 15,
"productType": "Last"
}
],
"price": 16
}
],
"id": "competitor:202210290200.T.AUS.quirindi.1.my_gem",
"competitorKey": "202210290200.T.AUS.quirindi.1.my_gem"
},
{
"status": "Starter",
"startPos": 9,
"trainer": "M J Dwyer",
"shortForm": "7626280005",
"sequence": 6,
"weight": "0",
"name": "Another Super",
"jockey": "Patrick Scorse",
"prices":
[
{
"betType": "Win",
"code": "BT3",
"flucs": [],
"price": 85
},
{
"betType": "FixedPlace",
"code": "FXD",
"flucs": [],
"price": 3.9
},
{
"betType": "Place",
"code": "BT2P",
"flucs": [],
"price": 5.7
},
{
"betType": "FixedWin",
"code": "FXD",
"flucs":
[
{
"price": 17,
"productType": "Current"
},
{
"price": 20,
"productType": "Max"
},
{
"price": 19,
"productType": "Open"
},
{
"price": 17,
"productType": "Last"
},
{
"price": 18,
"productType": "Last"
},
{
"price": 17,
"productType": "Last"
},
{
"price": 16,
"productType": "Last"
},
{
"price": 15,
"productType": "Last"
}
],
"price": 17
}
],
"id": "competitor:202210290200.T.AUS.quirindi.1.another_super",
"competitorKey": "202210290200.T.AUS.quirindi.1.another_super"
},
{
"status": "Scratched",
"startPos": 3,
"trainer": "Ms L Selby",
"shortForm": "0608247373",
"sequence": 7,
"weight": "0",
"name": "Bingo Banko",
"jockey": "Unknown",
"prices":
[
{
"betType": "FixedPlace",
"code": "FXD",
"flucs": [],
"price": 0
},
{
"betType": "FixedWin",
"code": "FXD",
"flucs":
[
{
"price": 0,
"productType": "Current"
},
{
"price": 16,
"productType": "Max"
},
{
"price": 16,
"productType": "Open"
},
{
"price": 15,
"productType": "Last"
},
{
"price": 16,
"productType": "Last"
}
],
"price": 0
}
],
"id": "competitor:202210290200.T.AUS.quirindi.1.bingo_banko",
"competitorKey": "202210290200.T.AUS.quirindi.1.bingo_banko"
}
]
thanks for your assistance.
Use a for loop to iterate through the prices, and check the betType key to see if it's either FixedWin or FixedPlace, that way the ordering of the different prices doesn't matter.
Also since the race_key and the race event are always the same you should define them at the top of the function so your implementation doesn't need to keep checking the dictionary for them.
def race_data():
for race_key in races_list():
data = requests.get(f'{Base_url}events/{race_key}.json?app_id={AppID}&app_key={AppKey}').text
# parse race data
data = json.loads(data)
race_key = data['eventKey']
race_time = data['eventDateTimeUtc']
for race_odds in data:
horse_name = race_odds['name']
jockey_name = race_odds['jockey']
jockey_weight = race_odds['weight']
trainer_name = race_odds['trainer']
for price in race_odds['prices']:
if price["betType"] == "FixedWin":
fixed_win_odds = price["price"]
elif price["betType"] == "FixedPlace":
fixed_place_odds = price["price"]
horse_status = race_odds['status']
print(f'{race_key} {horse_name} {jockey_name} {jockey_weight} {trainer_name} {fixed_win_odds} {fixed_place_odds} {race_time} {horse_status}')
if __name__ == "__main__":
race_data()

Amazon SP API Feeds API processing report: Invalid product type

I have been working with the feeds API. I have been able to create a feed document, encrypt and upload a payload to the returned URL, create the feed, and then check its status. Once status reaches the DONE state, I was also able to get the processing report: The processing report looks like this:
{
"header": {
"sellerId": "A1HGLY0OQBKE5U",
"version": "2.0",
"feedId": "50042018828"
},
"issues": [
{
"messageId": 1,
"code": "4000003",
"severity": "ERROR",
"message": "The Amazon product type specified is invalid or not supported."
},
{
"messageId": 2,
"code": "4000003",
"severity": "ERROR",
"message": "The Amazon product type specified is invalid or not supported."
},
{
"messageId": 3,
"code": "4000003",
"severity": "ERROR",
"message": "The Amazon product type specified is invalid or not supported."
},
{
"messageId": 4,
"code": "4000003",
"severity": "ERROR",
"message": "The Amazon product type specified is invalid or not supported."
},
{
"messageId": 5,
"code": "4000003",
"severity": "ERROR",
"message": "The Amazon product type specified is invalid or not supported."
},
{
"messageId": 6,
"code": "4000003",
"severity": "ERROR",
"message": "The Amazon product type specified is invalid or not supported."
},
{
"messageId": 7,
"code": "4000003",
"severity": "ERROR",
"message": "The Amazon product type specified is invalid or not supported."
},
{
"messageId": 8,
"code": "4000003",
"severity": "ERROR",
"message": "The Amazon product type specified is invalid or not supported."
},
{
"messageId": 9,
"code": "4000003",
"severity": "ERROR",
"message": "The Amazon product type specified is invalid or not supported."
},
{
"messageId": 10,
"code": "4000003",
"severity": "ERROR",
"message": "The Amazon product type specified is invalid or not supported."
},
{
"messageId": 11,
"code": "4000003",
"severity": "ERROR",
"message": "The Amazon product type specified is invalid or not supported."
},
{
"messageId": 12,
"code": "4000003",
"severity": "ERROR",
"message": "The Amazon product type specified is invalid or not supported."
},
{
"messageId": 13,
"code": "4000003",
"severity": "ERROR",
"message": "The Amazon product type specified is invalid or not supported."
},
{
"messageId": 14,
"code": "4000003",
"severity": "ERROR",
"message": "The Amazon product type specified is invalid or not supported."
},
{
"messageId": 15,
"code": "4000003",
"severity": "ERROR",
"message": "The Amazon product type specified is invalid or not supported."
},
{
"messageId": 16,
"code": "4000003",
"severity": "ERROR",
"message": "The Amazon product type specified is invalid or not supported."
},
{
"messageId": 17,
"code": "4000003",
"severity": "ERROR",
"message": "The Amazon product type specified is invalid or not supported."
},
{
"messageId": 18,
"code": "4000003",
"severity": "ERROR",
"message": "The Amazon product type specified is invalid or not supported."
},
{
"messageId": 19,
"code": "4000003",
"severity": "ERROR",
"message": "The Amazon product type specified is invalid or not supported."
}
],
"summary": {
"errors": 19,
"warnings": 0,
"messagesProcessed": 19,
"messagesAccepted": 0,
"messagesInvalid": 19
}
}
The Payload was this:
{
"header": {
"sellerId": "A1HGLY0OQBKE5U",
"version": "2.0"
},
"messages": [
{
"messageId": 1,
"operationType": "UPDATE",
"productType": "SHOES",
"attributes": {
"item_name": [
{
"value": "Awesome Bronze Car",
"language_tag": "en_US",
"marketplace_id": "A21TJRUUN4KGV"
}
],
"fulfillment_availability": [
{
"fulfillment_channel_code": "AMAZON_NA",
"quantity": 10
}
]
},
"sku": "sk77"
},
{
"messageId": 2,
"operationType": "UPDATE",
"productType": "SHOES",
"attributes": {
"item_name": [
{
"value": "Sleek Steel Chair",
"language_tag": "en_US",
"marketplace_id": "A21TJRUUN4KGV"
}
],
"fulfillment_availability": [
{
"fulfillment_channel_code": "AMAZON_NA",
"quantity": 10
}
]
},
"sku": "sk80"
},
{
"messageId": 3,
"operationType": "UPDATE",
"productType": "SHOES",
"attributes": {
"item_name": [
{
"value": "Aerodynamic Iron Gloves",
"language_tag": "en_US",
"marketplace_id": "A21TJRUUN4KGV"
}
],
"fulfillment_availability": [
{
"fulfillment_channel_code": "AMAZON_NA",
"quantity": 10
}
]
},
"sku": "sk81"
},
{
"messageId": 4,
"operationType": "UPDATE",
"productType": "SHOES",
"attributes": {
"item_name": [
{
"value": "Sleek Copper Clock",
"language_tag": "en_US",
"marketplace_id": "A21TJRUUN4KGV"
}
],
"fulfillment_availability": [
{
"fulfillment_channel_code": "AMAZON_NA",
"quantity": 10
}
]
},
"sku": "sk86"
},
{
"messageId": 5,
"operationType": "UPDATE",
"productType": "SHOES",
"attributes": {
"item_name": [
{
"value": "Mediocre Plastic Wallet",
"language_tag": "en_US",
"marketplace_id": "A21TJRUUN4KGV"
}
],
"fulfillment_availability": [
{
"fulfillment_channel_code": "AMAZON_NA",
"quantity": 10
}
]
},
"sku": "sk88"
},
{
"messageId": 6,
"operationType": "UPDATE",
"productType": "SHOES",
"attributes": {
"item_name": [
{
"value": "[Sample] 1 L Le Parfait Jar",
"language_tag": "en_US",
"marketplace_id": "A21TJRUUN4KGV"
}
],
"fulfillment_availability": [
{
"fulfillment_channel_code": "AMAZON_NA",
"quantity": 10
}
]
},
"sku": "SLLPJ-6088C959"
},
{
"messageId": 7,
"operationType": "UPDATE",
"productType": "SHOES",
"attributes": {
"item_name": [
{
"value": "[Sample] 1 L Le Parfait Jar",
"language_tag": "en_US",
"marketplace_id": "A21TJRUUN4KGV"
}
],
"fulfillment_availability": [
{
"fulfillment_channel_code": "AMAZON_NA",
"quantity": 10
}
]
},
"sku": "SLLPJ-20D88EFC"
},
{
"messageId": 8,
"operationType": "UPDATE",
"productType": "SHOES",
"attributes": {
"item_name": [
{
"value": "[Sample] 1 L Le Parfait Jar",
"language_tag": "en_US",
"marketplace_id": "A21TJRUUN4KGV"
}
],
"fulfillment_availability": [
{
"fulfillment_channel_code": "AMAZON_NA",
"quantity": 10
}
]
},
"sku": "SLLPJ-0BAF36BA"
},
{
"messageId": 9,
"operationType": "UPDATE",
"productType": "SHOES",
"attributes": {
"item_name": [
{
"value": "[Sample] 1 L Le Parfait Jar",
"language_tag": "en_US",
"marketplace_id": "A21TJRUUN4KGV"
}
],
"fulfillment_availability": [
{
"fulfillment_channel_code": "AMAZON_NA",
"quantity": 10
}
]
},
"sku": "SLLPJ-CD28D4F0"
},
{
"messageId": 10,
"operationType": "UPDATE",
"productType": "SHOES",
"attributes": {
"item_name": [
{
"value": "[Sample] 1 L Le Parfait Jar",
"language_tag": "en_US",
"marketplace_id": "A21TJRUUN4KGV"
}
],
"fulfillment_availability": [
{
"fulfillment_channel_code": "AMAZON_NA",
"quantity": 10
}
]
},
"sku": "SLLPJ-9A544946"
},
{
"messageId": 11,
"operationType": "UPDATE",
"productType": "SHOES",
"attributes": {
"item_name": [
{
"value": "[Sample] 1 L Le Parfait Jar",
"language_tag": "en_US",
"marketplace_id": "A21TJRUUN4KGV"
}
],
"fulfillment_availability": [
{
"fulfillment_channel_code": "AMAZON_NA",
"quantity": 10
}
]
},
"sku": "SLLPJ-F2FD045F"
},
{
"messageId": 12,
"operationType": "UPDATE",
"productType": "SHOES",
"attributes": {
"item_name": [
{
"value": "[Sample] 1 L Le Parfait Jar",
"language_tag": "en_US",
"marketplace_id": "A21TJRUUN4KGV"
}
],
"fulfillment_availability": [
{
"fulfillment_channel_code": "AMAZON_NA",
"quantity": 10
}
]
},
"sku": "SLLPJ-0138EE43"
},
{
"messageId": 13,
"operationType": "UPDATE",
"productType": "SHOES",
"attributes": {
"item_name": [
{
"value": "[Sample] 1 L Le Parfait Jar",
"language_tag": "en_US",
"marketplace_id": "A21TJRUUN4KGV"
}
],
"fulfillment_availability": [
{
"fulfillment_channel_code": "AMAZON_NA",
"quantity": 10
}
]
},
"sku": "SLLPJ-D334BA28"
},
{
"messageId": 14,
"operationType": "UPDATE",
"productType": "SHOES",
"attributes": {
"item_name": [
{
"value": "[Sample] 1 L Le Parfait Jar",
"language_tag": "en_US",
"marketplace_id": "A21TJRUUN4KGV"
}
],
"fulfillment_availability": [
{
"fulfillment_channel_code": "AMAZON_NA",
"quantity": 10
}
]
},
"sku": "SLLPJ-8650D0FD"
},
{
"messageId": 15,
"operationType": "UPDATE",
"productType": "SHOES",
"attributes": {
"item_name": [
{
"value": "[Sample] 1 L Le Parfait Jar",
"language_tag": "en_US",
"marketplace_id": "A21TJRUUN4KGV"
}
],
"fulfillment_availability": [
{
"fulfillment_channel_code": "AMAZON_NA",
"quantity": 10
}
]
},
"sku": "SLLPJ-8D93E1C3"
},
{
"messageId": 16,
"operationType": "UPDATE",
"productType": "SHOES",
"attributes": {
"item_name": [
{
"value": "Gorgeous Copper Hat",
"language_tag": "en_US",
"marketplace_id": "A21TJRUUN4KGV"
}
],
"fulfillment_availability": [
{
"fulfillment_channel_code": "AMAZON_NA",
"quantity": 10
}
]
},
"sku": "sk94"
},
{
"messageId": 17,
"operationType": "UPDATE",
"productType": "SHOES",
"attributes": {
"item_name": [
{
"value": "[Sample] Tiered Wire Basket",
"language_tag": "en_US",
"marketplace_id": "A21TJRUUN4KGV"
}
],
"fulfillment_availability": [
{
"fulfillment_channel_code": "AMAZON_NA",
"quantity": 10
}
]
},
"sku": "TWB"
},
{
"messageId": 18,
"operationType": "UPDATE",
"productType": "SHOES",
"attributes": {
"item_name": [
{
"value": "Heavy Duty Cotton Gloves",
"language_tag": "en_US",
"marketplace_id": "A21TJRUUN4KGV"
}
],
"fulfillment_availability": [
{
"fulfillment_channel_code": "AMAZON_NA",
"quantity": 10
}
]
},
"sku": "sk98"
},
{
"messageId": 19,
"operationType": "UPDATE",
"productType": "SHOES",
"attributes": {
"item_name": [
{
"value": "Ergonomic Steel Lamp",
"language_tag": "en_US",
"marketplace_id": "A21TJRUUN4KGV"
}
],
"fulfillment_availability": [
{
"fulfillment_channel_code": "AMAZON_NA",
"quantity": 10
}
]
},
"sku": "sk103"
}
]
}
As can be observed, I am using the product type SHOES. In a previous attempt, I had used the product type LUGGAGE because that was used in the use-case guides. But Amazon still does not like this. Could someone guide me to a list of valid product types?
I've meet the same issue and solve this one, and I'm just sending https://sellingpartnerapi-na.amazon.com/definitions/2020-09-01/productTypes?marketplaceIds=A2EUQ1WTGCTBG2 to get the productType is PRODUCT.
You can reference with https://github.com/amzn/selling-partner-api-docs/blob/main/references/product-type-definitions-api/definitionsProductTypes_2020-09-01.md#searchdefinitionsproducttypes to find out what is the productType it is in the seller centre without keywords paramters.
Therefore you can try it again using "productType": "PRODUCT".

Selecting nested dictionaries and turning them to a DataFrame in Python

Selecting nested dictionaries and turning them to a DataFrame in Python
From the nested 'biblio' data below, is there a way of sorting this into a data frame with each key as a column? For example, where 'classifications_cpc' is a column header with the codes as the subsequent values?
{
"publication_reference": {
"jurisdiction": "US",
"doc_number": "10236491",
"kind": "B2",
"date": "2019-03-19"
},
"application_reference": {
"jurisdiction": "US",
"doc_number": "201615053025",
"kind": "A",
"date": "2016-02-25"
},
"priority_claims": {
"claims": [
{
"jurisdiction": "JP",
"doc_number": "2015062114",
"kind": "A",
"date": "2015-03-25",
"sequence": 1
}
]
},
"invention_title": [
{
"text": "Lithium ion secondary battery",
"lang": "en"
}
],
"parties": {
"applicants": [
{
"residence": "JP",
"extracted_name": {
"value": "AUTOMOTIVE ENERGY SUPPLY CORP"
}
}
],
"inventors": [
{
"residence": "JP",
"sequence": 1,
"extracted_name": {
"value": "SAKAGUCHI SHINICHIRO"
}
},
{
"residence": "JP",
"sequence": 2,
"extracted_name": {
"value": "KIMURA AIKA"
}
},
{
"residence": "JP",
"sequence": 3,
"extracted_name": {
"value": "MIZUTA MASATOMO"
}
}
],
"agents": [
{
"extracted_name": {
"value": "Troutman Sanders LLP"
}
}
],
"owners_all": [
{
"recorded_date": "2016-02-25",
"execution_date": "2016-01-28",
"extracted_name": {
"value": "AUTOMOTIVE ENERGY SUPPLY CORPORATION"
},
"extracted_address": "10-1, HIRONODAI 2-CHOME, ZAMA-SHI, KANAGAWA, 252-0012",
"extracted_country": "JP"
}
]
},
"classifications_ipcr": {
"classifications": [
{
"symbol": "H01M2/02"
},
{
"symbol": "H01M2/14"
},
{
"symbol": "H01M2/18"
},
{
"symbol": "H01M10/0525"
},
{
"symbol": "H01M10/0585"
}
]
},
"classifications_cpc": {
"classifications": [
{
"symbol": "H01M10/0525"
},
{
"symbol": "H01M10/0525"
},
{
"symbol": "H01M50/463"
},
{
"symbol": "H01M10/0525"
},
{
"symbol": "H01M10/0585"
},
{
"symbol": "H01M10/0585"
},
{
"symbol": "H01M50/10"
},
{
"symbol": "H01M50/116"
},
{
"symbol": "H01M50/116"
},
{
"symbol": "H01M50/40"
},
{
"symbol": "H01M50/40"
},
{
"symbol": "H01M50/409"
},
{
"symbol": "H01M50/543"
},
{
"symbol": "H01M50/543"
},
{
"symbol": "Y02E60/10"
}
]
},
"references_cited": {
"citations": [
{
"sequence": 1,
"patcit": {
"document_id": {
"jurisdiction": "US",
"doc_number": "2011151307",
"kind": "A1",
"date": "2011-06-23"
},
"lens_id": "052-557-140-975-892"
}
},
{
"sequence": 2,
"patcit": {
"document_id": {
"jurisdiction": "US",
"doc_number": "2011287301",
"kind": "A1",
"date": "2011-11-24"
},
"lens_id": "050-516-769-883-801"
}
},
{
"sequence": 3,
"patcit": {
"document_id": {
"jurisdiction": "US",
"doc_number": "2014205887",
"kind": "A1",
"date": "2014-07-24"
},
"lens_id": "041-534-822-806-155"
}
},
{
"sequence": 4,
"patcit": {
"document_id": {
"jurisdiction": "US",
"doc_number": "2015056492",
"kind": "A1",
"date": "2015-02-26"
},
"lens_id": "101-776-463-080-028"
}
},
{
"sequence": 5,
"patcit": {
"document_id": {
"jurisdiction": "WO",
"doc_number": "2013047778",
"kind": "A1",
"date": "2013-04-04"
},
"lens_id": "135-661-134-273-324"
}
},
{
"sequence": 1,
"patcit": {
"document_id": {
"jurisdiction": "US",
"doc_number": "2011143183",
"kind": "A1",
"date": "2011-06-16"
},
"lens_id": "095-161-033-897-779"
}
},
{
"sequence": 2,
"patcit": {
"document_id": {
"jurisdiction": "US",
"doc_number": "2014349169",
"kind": "A1",
"date": "2014-11-27"
},
"lens_id": "075-950-005-288-26X"
}
},
{
"sequence": 3,
"patcit": {
"document_id": {
"jurisdiction": "US",
"doc_number": "2015050542",
"kind": "A1",
"date": "2015-02-19"
},
"lens_id": "003-582-946-821-435"
}
},
{
"sequence": 4,
"patcit": {
"document_id": {
"jurisdiction": "CN",
"doc_number": "102124591",
"kind": "A",
"date": "2011-07-13"
},
"lens_id": "157-805-739-981-807"
}
},
{
"sequence": 5,
"patcit": {
"document_id": {
"jurisdiction": "CN",
"doc_number": "104106155",
"kind": "A",
"date": "2014-10-15"
},
"lens_id": "003-865-201-672-551"
}
},
{
"sequence": 6,
"patcit": {
"document_id": {
"jurisdiction": "CN",
"doc_number": "104205416",
"kind": "A",
"date": "2014-12-10"
},
"lens_id": "182-508-848-265-100"
}
},
{
"sequence": 7,
"patcit": {
"document_id": {
"jurisdiction": "EP",
"doc_number": "2747167",
"kind": "A1",
"date": "2014-06-25"
},
"lens_id": "167-072-626-506-628"
}
},
{
"sequence": 8,
"patcit": {
"document_id": {
"jurisdiction": "JP",
"doc_number": "2009277397",
"kind": "A",
"date": "2009-11-26"
},
"lens_id": "061-699-339-033-165"
}
},
{
"sequence": 9,
"nplcit": {
"text": "Extended European Search Report dated Apr. 14, 2016 issued in corresponding European Patent Application No. 16157356.3."
}
}
],
"patent_count": 13,
"npl_count": 1
},
"cited_by": {}
}
Any Suggestions or Ideas?
Do you want a column for each and every key? or only specific ones? For example, the cited_by key has no value in it.
However, assign the data you provided to a variable names your_data and try this code:
import pandas as pd
list_for_df =[]
classifications = your_data["classifications_cpc"]
symbol_list = classifications["classifications"]
for symbol in symbol_list:
list_for_df.append(symbol["symbol"])
df = pd.DataFrame(list_for_df,columns=["classifications_cpc"])
The data frame will look like that:
classifications_cpc
0 H01M10/0525
1 H01M10/0525
2 H01M50/463
3 H01M10/0525
4 H01M10/0585
5 H01M10/0585
6 H01M50/10
7 H01M50/116
8 H01M50/116
9 H01M50/40
10 H01M50/40
11 H01M50/409
12 H01M50/543
13 H01M50/543
14 Y02E60/10
Let me try to approach your requirements. As the column names 'classifications_cpc' or 'parties' or 'classifications_ipcr' are each an array of unequal lengths, it would not make sense to put them together into a single DataFrame. Each resultant row will have unrelated fields grouped together.
What you might be looking for is to extract values using specific key in the nested dictionary or 'lists of dictionaries' . For example using recursive function to extract values using some key:
data = {...nested dictionary or 'lists of dictionaries'...}
def get_vals(nested, key):
result = []
if isinstance(nested, list) and nested != []: #non-empty list
for lis in nested:
result.extend(get_vals(lis, key))
elif isinstance(nested, dict) and nested != {}: #non-empty dict
for val in nested.values():
if isinstance(val, (list, dict)): #(list or dict) in dict
result.extend(get_vals(val, key))
if key in nested.keys(): #key found in dict
result.append(nested[key])
return result
get_vals(data, 'value')
Output
['AUTOMOTIVE ENERGY SUPPLY CORP',
'SAKAGUCHI SHINICHIRO',
'KIMURA AIKA',
'MIZUTA MASATOMO',
'Troutman Sanders LLP',
'AUTOMOTIVE ENERGY SUPPLY CORPORATION']
Or to look for the key 'classifications', you would get 2 lists from 'classifications_ipcr' and 'classifications_cpc':
get_vals(data, 'classifications')
[[{'symbol': 'H01M2/02'},
{'symbol': 'H01M2/14'},
{'symbol': 'H01M2/18'},
{'symbol': 'H01M10/0525'},
{'symbol': 'H01M10/0585'}],
[{'symbol': 'H01M10/0525'},
{'symbol': 'H01M10/0525'},
{'symbol': 'H01M50/463'},
{'symbol': 'H01M10/0525'},
{'symbol': 'H01M10/0585'},
{'symbol': 'H01M10/0585'},
{'symbol': 'H01M50/10'},
{'symbol': 'H01M50/116'},
{'symbol': 'H01M50/116'},
{'symbol': 'H01M50/40'},
{'symbol': 'H01M50/40'},
{'symbol': 'H01M50/409'},
{'symbol': 'H01M50/543'},
{'symbol': 'H01M50/543'},
{'symbol': 'Y02E60/10'}]]
Another way is to use built-in function pd.json_normalize(), but you have to identify specific keys linkage to arrive at the data you desire.
df = pd.json_normalize(data['classifications_cpc']['classifications'])
Output df
symbol
0 H01M10/0525
1 H01M10/0525
2 H01M50/463
3 H01M10/0525
4 H01M10/0585
5 H01M10/0585
6 H01M50/10
7 H01M50/116
8 H01M50/116
9 H01M50/40
10 H01M50/40
11 H01M50/409
12 H01M50/543
13 H01M50/543
14 Y02E60/10

Python - Get Nested Data from Multiple Levels

Wasn't sure how to title this question but I am working with the Quickbooks Online API and when querying a report like BalanceSheet or GeneralLedger the API returns data rows in multiple nested levels which is quite frustrating to parse through.
Example of the BalanceSheet return included below. I am only interested in the data from "Row" objects but as you can see that can be returned in 1, 2, 3 or more different levels of data. I am thinking of going through each level to check for Rows and then get each Row but that seems overly complex as I would need multiple for loops for each level.
I'm wondering if there is a better way to get each "Row" in that data without regard to which level it is on? Any ideas would be appreciated!
Here's an example of a return from their sandbox data:
{
"Header": {
"Time": "2021-04-28T14:12:17-07:00",
"ReportName": "BalanceSheet",
"DateMacro": "this calendar year-to-date",
"ReportBasis": "Accrual",
"StartPeriod": "2021-01-01",
"EndPeriod": "2021-04-28",
"SummarizeColumnsBy": "Month",
"Currency": "USD",
"Option": [
{
"Name": "AccountingStandard",
"Value": "GAAP"
},
{
"Name": "NoReportData",
"Value": "false"
}
]
},
"Columns": {
"Column": [
{
"ColTitle": "",
"ColType": "Account",
"MetaData": [
{
"Name": "ColKey",
"Value": "account"
}
]
},
{
"ColTitle": "Jan 2021",
"ColType": "Money",
"MetaData": [
{
"Name": "StartDate",
"Value": "2021-01-01"
},
{
"Name": "EndDate",
"Value": "2021-01-31"
},
{
"Name": "ColKey",
"Value": "Jan 2021"
}
]
},
{
"ColTitle": "Feb 2021",
"ColType": "Money",
"MetaData": [
{
"Name": "StartDate",
"Value": "2021-02-01"
},
{
"Name": "EndDate",
"Value": "2021-02-28"
},
{
"Name": "ColKey",
"Value": "Feb 2021"
}
]
},
{
"ColTitle": "Mar 2021",
"ColType": "Money",
"MetaData": [
{
"Name": "StartDate",
"Value": "2021-03-01"
},
{
"Name": "EndDate",
"Value": "2021-03-31"
},
{
"Name": "ColKey",
"Value": "Mar 2021"
}
]
},
{
"ColTitle": "Apr 1-28, 2021",
"ColType": "Money",
"MetaData": [
{
"Name": "StartDate",
"Value": "2021-04-01"
},
{
"Name": "EndDate",
"Value": "2021-04-28"
},
{
"Name": "ColKey",
"Value": "Apr 1-28, 2021"
}
]
}
]
},
"Rows": {
"Row": [
{
"Header": {
"ColData": [
{
"value": "ASSETS"
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
}
]
},
"Rows": {
"Row": [
{
"Header": {
"ColData": [
{
"value": "Current Assets"
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
}
]
},
"Rows": {
"Row": [
{
"Header": {
"ColData": [
{
"value": "Bank Accounts"
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
}
]
},
"Rows": {
"Row": [
{
"ColData": [
{
"value": "Checking",
"id": "35"
},
{
"value": "1201.00"
},
{
"value": "1201.00"
},
{
"value": "1201.00"
},
{
"value": "1201.00"
}
],
"type": "Data"
},
{
"ColData": [
{
"value": "Savings",
"id": "36"
},
{
"value": "800.00"
},
{
"value": "800.00"
},
{
"value": "800.00"
},
{
"value": "800.00"
}
],
"type": "Data"
}
]
},
"Summary": {
"ColData": [
{
"value": "Total Bank Accounts"
},
{
"value": "2001.00"
},
{
"value": "2001.00"
},
{
"value": "2001.00"
},
{
"value": "2001.00"
}
]
},
"type": "Section",
"group": "BankAccounts"
},
{
"Header": {
"ColData": [
{
"value": "Accounts Receivable"
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
}
]
},
"Rows": {
"Row": [
{
"ColData": [
{
"value": "Accounts Receivable (A/R)",
"id": "84"
},
{
"value": "5281.52"
},
{
"value": "5281.52"
},
{
"value": "5281.52"
},
{
"value": "5281.52"
}
],
"type": "Data"
}
]
},
"Summary": {
"ColData": [
{
"value": "Total Accounts Receivable"
},
{
"value": "5281.52"
},
{
"value": "5281.52"
},
{
"value": "5281.52"
},
{
"value": "5281.52"
}
]
},
"type": "Section",
"group": "AR"
},
{
"Header": {
"ColData": [
{
"value": "Other Current Assets"
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
}
]
},
"Rows": {
"Row": [
{
"ColData": [
{
"value": "Inventory Asset",
"id": "81"
},
{
"value": "596.25"
},
{
"value": "596.25"
},
{
"value": "596.25"
},
{
"value": "596.25"
}
],
"type": "Data"
},
{
"ColData": [
{
"value": "Undeposited Funds",
"id": "4"
},
{
"value": "2062.52"
},
{
"value": "2062.52"
},
{
"value": "2062.52"
},
{
"value": "2062.52"
}
],
"type": "Data"
}
]
},
"Summary": {
"ColData": [
{
"value": "Total Other Current Assets"
},
{
"value": "2658.77"
},
{
"value": "2658.77"
},
{
"value": "2658.77"
},
{
"value": "2658.77"
}
]
},
"type": "Section",
"group": "OtherCurrentAssets"
}
]
},
"Summary": {
"ColData": [
{
"value": "Total Current Assets"
},
{
"value": "9941.29"
},
{
"value": "9941.29"
},
{
"value": "9941.29"
},
{
"value": "9941.29"
}
]
},
"type": "Section",
"group": "CurrentAssets"
},
{
"Header": {
"ColData": [
{
"value": "Fixed Assets"
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
}
]
},
"Rows": {
"Row": [
{
"Header": {
"ColData": [
{
"value": "Truck",
"id": "37"
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
}
]
},
"Rows": {
"Row": [
{
"ColData": [
{
"value": "Original Cost",
"id": "38"
},
{
"value": "13495.00"
},
{
"value": "13495.00"
},
{
"value": "13495.00"
},
{
"value": "13495.00"
}
],
"type": "Data"
}
]
},
"Summary": {
"ColData": [
{
"value": "Total Truck"
},
{
"value": "13495.00"
},
{
"value": "13495.00"
},
{
"value": "13495.00"
},
{
"value": "13495.00"
}
]
},
"type": "Section"
}
]
},
"Summary": {
"ColData": [
{
"value": "Total Fixed Assets"
},
{
"value": "13495.00"
},
{
"value": "13495.00"
},
{
"value": "13495.00"
},
{
"value": "13495.00"
}
]
},
"type": "Section",
"group": "FixedAssets"
}
]
},
"Summary": {
"ColData": [
{
"value": "TOTAL ASSETS"
},
{
"value": "23436.29"
},
{
"value": "23436.29"
},
{
"value": "23436.29"
},
{
"value": "23436.29"
}
]
},
"type": "Section",
"group": "TotalAssets"
},
{
"Header": {
"ColData": [
{
"value": "LIABILITIES AND EQUITY"
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
}
]
},
"Rows": {
"Row": [
{
"Header": {
"ColData": [
{
"value": "Liabilities"
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
}
]
},
"Rows": {
"Row": [
{
"Header": {
"ColData": [
{
"value": "Current Liabilities"
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
}
]
},
"Rows": {
"Row": [
{
"Header": {
"ColData": [
{
"value": "Accounts Payable"
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
}
]
},
"Rows": {
"Row": [
{
"ColData": [
{
"value": "Accounts Payable (A/P)",
"id": "33"
},
{
"value": "1602.67"
},
{
"value": "1602.67"
},
{
"value": "1602.67"
},
{
"value": "1602.67"
}
],
"type": "Data"
}
]
},
"Summary": {
"ColData": [
{
"value": "Total Accounts Payable"
},
{
"value": "1602.67"
},
{
"value": "1602.67"
},
{
"value": "1602.67"
},
{
"value": "1602.67"
}
]
},
"type": "Section",
"group": "AP"
},
{
"Header": {
"ColData": [
{
"value": "Credit Cards"
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
}
]
},
"Rows": {
"Row": [
{
"ColData": [
{
"value": "Mastercard",
"id": "41"
},
{
"value": "157.72"
},
{
"value": "157.72"
},
{
"value": "157.72"
},
{
"value": "157.72"
}
],
"type": "Data"
}
]
},
"Summary": {
"ColData": [
{
"value": "Total Credit Cards"
},
{
"value": "157.72"
},
{
"value": "157.72"
},
{
"value": "157.72"
},
{
"value": "157.72"
}
]
},
"type": "Section",
"group": "CreditCards"
},
{
"Header": {
"ColData": [
{
"value": "Other Current Liabilities"
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
}
]
},
"Rows": {
"Row": [
{
"ColData": [
{
"value": "Arizona Dept. of Revenue Payable",
"id": "89"
},
{
"value": "0.00"
},
{
"value": "0.00"
},
{
"value": "0.00"
},
{
"value": "0.00"
}
],
"type": "Data"
},
{
"ColData": [
{
"value": "Board of Equalization Payable",
"id": "90"
},
{
"value": "370.94"
},
{
"value": "370.94"
},
{
"value": "370.94"
},
{
"value": "370.94"
}
],
"type": "Data"
},
{
"ColData": [
{
"value": "Loan Payable",
"id": "43"
},
{
"value": "4000.00"
},
{
"value": "4000.00"
},
{
"value": "4000.00"
},
{
"value": "4000.00"
}
],
"type": "Data"
}
]
},
"Summary": {
"ColData": [
{
"value": "Total Other Current Liabilities"
},
{
"value": "4370.94"
},
{
"value": "4370.94"
},
{
"value": "4370.94"
},
{
"value": "4370.94"
}
]
},
"type": "Section",
"group": "OtherCurrentLiabilities"
}
]
},
"Summary": {
"ColData": [
{
"value": "Total Current Liabilities"
},
{
"value": "6131.33"
},
{
"value": "6131.33"
},
{
"value": "6131.33"
},
{
"value": "6131.33"
}
]
},
"type": "Section",
"group": "CurrentLiabilities"
},
{
"Header": {
"ColData": [
{
"value": "Long-Term Liabilities"
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
}
]
},
"Rows": {
"Row": [
{
"ColData": [
{
"value": "Notes Payable",
"id": "44"
},
{
"value": "25000.00"
},
{
"value": "25000.00"
},
{
"value": "25000.00"
},
{
"value": "25000.00"
}
],
"type": "Data"
}
]
},
"Summary": {
"ColData": [
{
"value": "Total Long-Term Liabilities"
},
{
"value": "25000.00"
},
{
"value": "25000.00"
},
{
"value": "25000.00"
},
{
"value": "25000.00"
}
]
},
"type": "Section",
"group": "LongTermLiabilities"
}
]
},
"Summary": {
"ColData": [
{
"value": "Total Liabilities"
},
{
"value": "31131.33"
},
{
"value": "31131.33"
},
{
"value": "31131.33"
},
{
"value": "31131.33"
}
]
},
"type": "Section",
"group": "Liabilities"
},
{
"Header": {
"ColData": [
{
"value": "Equity"
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
}
]
},
"Rows": {
"Row": [
{
"ColData": [
{
"value": "Opening Balance Equity",
"id": "34"
},
{
"value": "-9337.50"
},
{
"value": "-9337.50"
},
{
"value": "-9337.50"
},
{
"value": "-9337.50"
}
],
"type": "Data"
},
{
"ColData": [
{
"value": "Retained Earnings",
"id": "2"
},
{
"value": "1642.46"
},
{
"value": "1642.46"
},
{
"value": "1642.46"
},
{
"value": "1642.46"
}
],
"type": "Data"
},
{
"ColData": [
{
"value": "Net Income"
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
},
{
"value": ""
}
],
"type": "Data",
"group": "NetIncome"
}
]
},
"Summary": {
"ColData": [
{
"value": "Total Equity"
},
{
"value": "-7695.04"
},
{
"value": "-7695.04"
},
{
"value": "-7695.04"
},
{
"value": "-7695.04"
}
]
},
"type": "Section",
"group": "Equity"
}
]
},
"Summary": {
"ColData": [
{
"value": "TOTAL LIABILITIES AND EQUITY"
},
{
"value": "23436.29"
},
{
"value": "23436.29"
},
{
"value": "23436.29"
},
{
"value": "23436.29"
}
]
},
"type": "Section",
"group": "TotalLiabilitiesAndEquity"
}
]
}
}

Extracting data from JSON File to CSV

I have a big JSON file with a very complex structure
you can look on it here: https://drive.google.com/file/d/1tBVJ2xYSCpTTUGPJegvAz2ZXbeN0bteX/view?usp=sharing
it contains more than 7 millions lines, and I want to extract only the "text" field
I have written a python code, to extra all the values of the "text" key or field in the whole file, and it extracted only 12 values! while when I open the JSON file on the Visualstudio, I have more than 19000 values!!
you can see the code here:
import json
import csv
with open("/Users/zahraa-maher/rasa-init-demo/venv/Tickie/external_data/frames2.json") as file:
data = json.load(file)
fname = "outputText8.csv"
with open(fname, "w") as file:
csv_file = csv.writer(file,lineterminator='\n')
csv_file.writerow(["text"])
for item in data[i]["turns"]:
csv_file.writerow([item['text']])
please take a look on the JSON file as it is very large one and with a complex structure, so I an not paste it here to see because it would be not understandable
also this is a part of the son file:
[
{
"user_id": "U22HTHYNP",
"turns": [
{
"text": "I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.",
"labels": {
"acts": [
{
"args": [
{
"val": "book",
"key": "intent"
}
],
"name": "inform"
},
{
"args": [
{
"val": "Atlantis",
"key": "dst_city"
},
{
"val": "Caprica",
"key": "or_city"
},
{
"val": "Saturday, August 13, 2016",
"key": "str_date"
},
{
"val": "8",
"key": "n_adults"
},
{
"val": "1700",
"key": "budget"
}
],
"name": "inform"
}
],
"acts_without_refs": [
{
"args": [
{
"val": "book",
"key": "intent"
}
],
"name": "inform"
},
{
"args": [
{
"val": "Atlantis",
"key": "dst_city"
},
{
"val": "Caprica",
"key": "or_city"
},
{
"val": "Saturday, August 13, 2016",
"key": "str_date"
},
{
"val": "8",
"key": "n_adults"
},
{
"val": "1700",
"key": "budget"
}
],
"name": "inform"
}
],
"active_frame": 1,
"frames": [
{
"info": {
"intent": [
{
"val": "book",
"negated": false
}
],
"budget": [
{
"val": "1700.0",
"negated": false
}
],
"dst_city": [
{
"val": "Atlantis",
"negated": false
}
],
"or_city": [
{
"val": "Caprica",
"negated": false
}
],
"str_date": [
{
"val": "august 13",
"negated": false
}
],
"n_adults": [
{
"val": "8",
"negated": false
}
]
},
"frame_id": 1,
"requests": [],
"frame_parent_id": null,
"binary_questions": [],
"compare_requests": []
}
]
},
"author": "user",
"timestamp": 1471272019730.0
},
{
"db": {
"result": [
[
{
"trip": {
"returning": {
"duration": {
"hours": 0,
"min": 51
},
"arrival": {
"hour": 10,
"year": 2016,
"day": 24,
"min": 51,
"month": 8
},
"departure": {
"hour": 10,
"year": 2016,
"day": 24,
"min": 0,
"month": 8
}
},
"seat": "ECONOMY",
"leaving": {
"duration": {
"hours": 0,
"min": 51
},
"arrival": {
"hour": 0,
"year": 2016,
"day": 16,
"min": 51,
"month": 8
},
"departure": {
"hour": 0,
"year": 2016,
"day": 16,
"min": 0,
"month": 8
}
},
"or_city": "Porto Alegre",
"duration_days": 9
},
"price": 2118.81,
"hotel": {
"gst_rating": 7.15,
"vicinity": [],
"name": "Scarlet Palms Resort",
"country": "Brazil",
"amenities": [
"FREE_BREAKFAST",
"FREE_PARKING",
"FREE_WIFI"
],
"dst_city": "Goiania",
"category": "3.5 star hotel"
}
},
{
"trip": {
"returning": {
"duration": {
"hours": 2,
"min": 37
},
"arrival": {
"hour": 12,
"year": 2016,
"day": 10,
"min": 37,
"month": 8
},
"departure": {
"hour": 10,
"year": 2016,
"day": 10,
"min": 0,
"month": 8
}
},
"seat": "ECONOMY",
"leaving": {
"duration": {
"hours": 2,
"min": 37
},
"arrival": {
"hour": 0,
"year": 2016,
"day": 4,
"min": 37,
"month": 8
},
"departure": {
"hour": 22,
"year": 2016,
"day": 3,
"min": 0,
"month": 8
}
},
"or_city": "Porto Alegre",
"duration_days": 7
},
"price": 2369.83,
"hotel": {
"gst_rating": 0,
"vicinity": [],
"name": "Sunway Hostel",
"country": "Argentina",
"amenities": [
"FREE_BREAKFAST",
"FREE_WIFI"
],
"dst_city": "Rosario",
"category": "2.0 star hotel"
}
},
{
"trip": {
"returning": {
"duration": {
"hours": 0,
"min": 51
},
"arrival": {
"hour": 10,
"year": 2016,
"day": 24,
"min": 51,
"month": 8
},
"departure": {
"hour": 10,
"year": 2016,
"day": 24,
"min": 0,
"month": 8
}
},
"seat": "BUSINESS",
"leaving": {
"duration": {
"hours": 0,
"min": 51
},
"arrival": {
"hour": 0,
"year": 2016,
"day": 16,
"min": 51,
"month": 8
},
"departure": {
"hour": 0,
"year": 2016,
"day": 16,
"min": 0,
"month": 8
}
},
"or_city": "Porto Alegre",
"duration_days": 9
},
"price": 2375.72,
"hotel": {
"gst_rating": 7.15,
"vicinity": [],
"name": "Scarlet Palms Resort",
"country": "Brazil",
"amenities": [
"FREE_BREAKFAST",
"FREE_PARKING",
"FREE_WIFI"
],
"dst_city": "Goiania",
"category": "3.5 star hotel"
}
},
{
"trip": {
"returning": {
"duration": {
"hours": 1,
"min": 30
},
"arrival": {
"hour": 11,
"year": 2016,
"day": 1,
"min": 30,
"month": 9
},
"departure": {
"hour": 10,
"year": 2016,
"day": 1,
"min": 0,
"month": 9
}
},
"seat": "BUSINESS",
"leaving": {
"duration": {
"hours": 1,
"min": 30
},
"arrival": {
"hour": 18,
"year": 2016,
"day": 19,
"min": 30,
"month": 8
},
"departure": {
"hour": 17,
"year": 2016,
"day": 19,
"min": 0,
"month": 8
}
},
"or_city": "Porto Alegre",
"duration_days": 13
},
"price": 2492.95,
"hotel": {
"gst_rating": 0,
"vicinity": [],
"name": "Hotel Mundo",
"country": "Brazil",
"amenities": [
"FREE_BREAKFAST",
"FREE_WIFI",
"FREE_PARKING"
],
"dst_city": "Manaus",
"category": "2.5 star hotel"
}
},
{
"trip": {
"returning": {
"duration": {
"hours": 0,
"min": 51
},
"arrival": {
"hour": 10,
"year": 2016,
"day": 31,
"min": 51,
"month": 8
},
"departure": {
"hour": 10,
"year": 2016,
"day": 31,
"min": 0,
"month": 8
}
},
"seat": "ECONOMY",
"leaving": {
"duration": {
"hours": 0,
"min": 51
},
"arrival": {
"hour": 19,
"year": 2016,
"day": 27,
"min": 51,
"month": 8
},
"departure": {
"hour": 19,
"year": 2016,
"day": 27,
"min": 0,
"month": 8
}
},
"or_city": "Porto Alegre",
"duration_days": 4
},
"price": 2538.0,
"hotel": {
"gst_rating": 8.22,
"vicinity": [],
"name": "The Glee",
"country": "Brazil",
"amenities": [
"FREE_BREAKFAST",
"FREE_WIFI"
],
"dst_city": "Recife",
"category": "4.0 star hotel"
}
}
],
[],
[],
[],
[],
[],
[]
],
"search": [
{
"ORIGIN_CITY": "Porto Alegre",
"PRICE_MIN": "2000",
"NUM_ADULTS": "2",
"timestamp": 1471271949.995,
"PRICE_MAX": "3000",
"ARE_DATES_FLEXIBLE": "true",
"NUM_CHILDREN": "5",
"START_TIME": "1470110400000",
"MAX_DURATION": 2592000000.0,
"DESTINATION_CITY": "Brazil",
"RESULT_LIMIT": "10",
"END_TIME": "1472616000000"
},
{
"ORIGIN_CITY": "Atlantis",
"NUM_ADULTS": "8",
"RESULT_LIMIT": "10",
"timestamp": 1471272148.124,
"PRICE_MAX": "1700",
"NUM_CHILDREN": "",
"ARE_DATES_FLEXIBLE": "true",
"START_TIME": "NaN",
"END_TIME": "NaN"
},
{
"ORIGIN_CITY": "Caprica",
"PRICE_MAX": "1700",
"NUM_ADULTS": "8",
"RESULT_LIMIT": "10",
"timestamp": 1471272189.07,
"DESTINATION_CITY": "Atlantis",
"NUM_CHILDREN": "",
"ARE_DATES_FLEXIBLE": "true",
"START_TIME": "1470715200000",
"END_TIME": "1472011200000"
},
{
"ORIGIN_CITY": "Caprica",
"PRICE_MAX": "1700",
"NUM_ADULTS": "8",
"RESULT_LIMIT": "10",
"timestamp": 1471272205.436,
"DESTINATION_CITY": "Atlantis",
"NUM_CHILDREN": "",
"ARE_DATES_FLEXIBLE": "true",
"START_TIME": "1470715200000",
"END_TIME": "1472011200000"
},
{
"ORIGIN_CITY": "Caprica",
"PRICE_MIN": "1700",
"NUM_ADULTS": "8",
"RESULT_LIMIT": "10",
"timestamp": 1471272278.72,
"DESTINATION_CITY": "Atlantis",
"NUM_CHILDREN": "",
"ARE_DATES_FLEXIBLE": "true",
"START_TIME": "1470715200000",
"END_TIME": "1472011200000"
},
{
"ORIGIN_CITY": "Caprica",
"PRICE_MIN": "1700",
"NUM_ADULTS": "8",
"RESULT_LIMIT": "10",
"timestamp": 1471272454.542,
"DESTINATION_CITY": "Atlantis",
"NUM_CHILDREN": "",
"ARE_DATES_FLEXIBLE": "true",
"START_TIME": "1471060800000",
"END_TIME": "1472011200000"
},
{
"ORIGIN_CITY": "Caprica",
"PRICE_MIN": "1700",
"NUM_ADULTS": "8",
"RESULT_LIMIT": "10",
"timestamp": 1471272466.008,
"DESTINATION_CITY": "Atlantis",
"NUM_CHILDREN": "",
"ARE_DATES_FLEXIBLE": "true",
"START_TIME": "1471060800000",
"END_TIME": "1472011200000"
}
]
},
How it could be modified to extract all the "text" values from the JSON file to a CSV file?
This is a potential solution using pandas:
import pandas as pd
#importing data
dj = pd.read_json("frames2.json")
dtext = dj[["user_id","turns"]]
#Saving text records in a list
list_ = []
for record in dtext["turns"].values:
for r in record:
list_.append(r["text"])
#Exporting the csv
out = pd.Series(list_,name="text")
out.to_csv("text.csv")
It gives the following output.
Try:
import json
import csv
with open("/Users/zahraa-maher/rasa-init-demo/venv/Tickie/external_data/frames2.json") as file:
data = json.load(file)
fname = "outputText8.csv"
with open(fname, "w") as file:
csv_file = csv.writer(file,lineterminator='\n')
csv_file.writerow(["text"])
for keys,values in data.items():
now it up to you which of the fields you want to save, if you user a debugger you can see the values and Keys

Categories