How to eliminate duplicate items while adding them to their own structure - python

I have a list of dictionary items, with each dictionary containing a list of presentation items. The sample dictionaries below are a small prototype of my real data set.
I need to remove duplicate presentations based on day (one presentation per day) and store them in a new dictionary with the same structure within the existing list.
So starting with:
[
{
"time": "04:00-20:59",
"category": 1,
"presentations": [
{
"presentation": "ABC",
"day": 7,
},
{
"presentation": "DEF",
"day": 7,
},
{
"presentation": "GHI",
"day": 8,
},
{
"presentation": "JKL",
"day": 8,
},
{
"presentation": "MNO",
"day": 9,
},
{
"presentation": "PQR",
"day": 9,
},
{
"presentation": "STU",
"day": 9,
}
]
} #only one dictionary item in the list for simplicity
]
The end result should be three dictionaries containing lists of presentations where there is one presentation for a given day:
[
{
"time": "04:00-20:59",
"category": 1,
"presentations": [
{
"presentation": "ABC",
"day": 7
},
{
"presentation": "DEF",
"day": 8
},
{
"presentation": "GHI",
"day": 9
}
]
},
{
"time": "04:00-20:59",
"category": 1,
"presentations": [
{
"presentation": "JKL",
"day": 7
},
{
"presentation": "MNO",
"day": 8
},
{
"presentation": "PQR",
"day": 9
}
]
},
{
"time": "04:00-20:59",
"category": 1,
"presentations": [
{
"presentation": "STU",
"day": 9
}
]
}
]
I don't know how to go about removing these duplicates (based on day) while adding them to their own dictionary.

Related

Extracting data from JSON File to CSV

I have a big JSON file with a very complex structure
you can look on it here: https://drive.google.com/file/d/1tBVJ2xYSCpTTUGPJegvAz2ZXbeN0bteX/view?usp=sharing
it contains more than 7 millions lines, and I want to extract only the "text" field
I have written a python code, to extra all the values of the "text" key or field in the whole file, and it extracted only 12 values! while when I open the JSON file on the Visualstudio, I have more than 19000 values!!
you can see the code here:
import json
import csv
with open("/Users/zahraa-maher/rasa-init-demo/venv/Tickie/external_data/frames2.json") as file:
data = json.load(file)
fname = "outputText8.csv"
with open(fname, "w") as file:
csv_file = csv.writer(file,lineterminator='\n')
csv_file.writerow(["text"])
for item in data[i]["turns"]:
csv_file.writerow([item['text']])
please take a look on the JSON file as it is very large one and with a complex structure, so I an not paste it here to see because it would be not understandable
also this is a part of the son file:
[
{
"user_id": "U22HTHYNP",
"turns": [
{
"text": "I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.",
"labels": {
"acts": [
{
"args": [
{
"val": "book",
"key": "intent"
}
],
"name": "inform"
},
{
"args": [
{
"val": "Atlantis",
"key": "dst_city"
},
{
"val": "Caprica",
"key": "or_city"
},
{
"val": "Saturday, August 13, 2016",
"key": "str_date"
},
{
"val": "8",
"key": "n_adults"
},
{
"val": "1700",
"key": "budget"
}
],
"name": "inform"
}
],
"acts_without_refs": [
{
"args": [
{
"val": "book",
"key": "intent"
}
],
"name": "inform"
},
{
"args": [
{
"val": "Atlantis",
"key": "dst_city"
},
{
"val": "Caprica",
"key": "or_city"
},
{
"val": "Saturday, August 13, 2016",
"key": "str_date"
},
{
"val": "8",
"key": "n_adults"
},
{
"val": "1700",
"key": "budget"
}
],
"name": "inform"
}
],
"active_frame": 1,
"frames": [
{
"info": {
"intent": [
{
"val": "book",
"negated": false
}
],
"budget": [
{
"val": "1700.0",
"negated": false
}
],
"dst_city": [
{
"val": "Atlantis",
"negated": false
}
],
"or_city": [
{
"val": "Caprica",
"negated": false
}
],
"str_date": [
{
"val": "august 13",
"negated": false
}
],
"n_adults": [
{
"val": "8",
"negated": false
}
]
},
"frame_id": 1,
"requests": [],
"frame_parent_id": null,
"binary_questions": [],
"compare_requests": []
}
]
},
"author": "user",
"timestamp": 1471272019730.0
},
{
"db": {
"result": [
[
{
"trip": {
"returning": {
"duration": {
"hours": 0,
"min": 51
},
"arrival": {
"hour": 10,
"year": 2016,
"day": 24,
"min": 51,
"month": 8
},
"departure": {
"hour": 10,
"year": 2016,
"day": 24,
"min": 0,
"month": 8
}
},
"seat": "ECONOMY",
"leaving": {
"duration": {
"hours": 0,
"min": 51
},
"arrival": {
"hour": 0,
"year": 2016,
"day": 16,
"min": 51,
"month": 8
},
"departure": {
"hour": 0,
"year": 2016,
"day": 16,
"min": 0,
"month": 8
}
},
"or_city": "Porto Alegre",
"duration_days": 9
},
"price": 2118.81,
"hotel": {
"gst_rating": 7.15,
"vicinity": [],
"name": "Scarlet Palms Resort",
"country": "Brazil",
"amenities": [
"FREE_BREAKFAST",
"FREE_PARKING",
"FREE_WIFI"
],
"dst_city": "Goiania",
"category": "3.5 star hotel"
}
},
{
"trip": {
"returning": {
"duration": {
"hours": 2,
"min": 37
},
"arrival": {
"hour": 12,
"year": 2016,
"day": 10,
"min": 37,
"month": 8
},
"departure": {
"hour": 10,
"year": 2016,
"day": 10,
"min": 0,
"month": 8
}
},
"seat": "ECONOMY",
"leaving": {
"duration": {
"hours": 2,
"min": 37
},
"arrival": {
"hour": 0,
"year": 2016,
"day": 4,
"min": 37,
"month": 8
},
"departure": {
"hour": 22,
"year": 2016,
"day": 3,
"min": 0,
"month": 8
}
},
"or_city": "Porto Alegre",
"duration_days": 7
},
"price": 2369.83,
"hotel": {
"gst_rating": 0,
"vicinity": [],
"name": "Sunway Hostel",
"country": "Argentina",
"amenities": [
"FREE_BREAKFAST",
"FREE_WIFI"
],
"dst_city": "Rosario",
"category": "2.0 star hotel"
}
},
{
"trip": {
"returning": {
"duration": {
"hours": 0,
"min": 51
},
"arrival": {
"hour": 10,
"year": 2016,
"day": 24,
"min": 51,
"month": 8
},
"departure": {
"hour": 10,
"year": 2016,
"day": 24,
"min": 0,
"month": 8
}
},
"seat": "BUSINESS",
"leaving": {
"duration": {
"hours": 0,
"min": 51
},
"arrival": {
"hour": 0,
"year": 2016,
"day": 16,
"min": 51,
"month": 8
},
"departure": {
"hour": 0,
"year": 2016,
"day": 16,
"min": 0,
"month": 8
}
},
"or_city": "Porto Alegre",
"duration_days": 9
},
"price": 2375.72,
"hotel": {
"gst_rating": 7.15,
"vicinity": [],
"name": "Scarlet Palms Resort",
"country": "Brazil",
"amenities": [
"FREE_BREAKFAST",
"FREE_PARKING",
"FREE_WIFI"
],
"dst_city": "Goiania",
"category": "3.5 star hotel"
}
},
{
"trip": {
"returning": {
"duration": {
"hours": 1,
"min": 30
},
"arrival": {
"hour": 11,
"year": 2016,
"day": 1,
"min": 30,
"month": 9
},
"departure": {
"hour": 10,
"year": 2016,
"day": 1,
"min": 0,
"month": 9
}
},
"seat": "BUSINESS",
"leaving": {
"duration": {
"hours": 1,
"min": 30
},
"arrival": {
"hour": 18,
"year": 2016,
"day": 19,
"min": 30,
"month": 8
},
"departure": {
"hour": 17,
"year": 2016,
"day": 19,
"min": 0,
"month": 8
}
},
"or_city": "Porto Alegre",
"duration_days": 13
},
"price": 2492.95,
"hotel": {
"gst_rating": 0,
"vicinity": [],
"name": "Hotel Mundo",
"country": "Brazil",
"amenities": [
"FREE_BREAKFAST",
"FREE_WIFI",
"FREE_PARKING"
],
"dst_city": "Manaus",
"category": "2.5 star hotel"
}
},
{
"trip": {
"returning": {
"duration": {
"hours": 0,
"min": 51
},
"arrival": {
"hour": 10,
"year": 2016,
"day": 31,
"min": 51,
"month": 8
},
"departure": {
"hour": 10,
"year": 2016,
"day": 31,
"min": 0,
"month": 8
}
},
"seat": "ECONOMY",
"leaving": {
"duration": {
"hours": 0,
"min": 51
},
"arrival": {
"hour": 19,
"year": 2016,
"day": 27,
"min": 51,
"month": 8
},
"departure": {
"hour": 19,
"year": 2016,
"day": 27,
"min": 0,
"month": 8
}
},
"or_city": "Porto Alegre",
"duration_days": 4
},
"price": 2538.0,
"hotel": {
"gst_rating": 8.22,
"vicinity": [],
"name": "The Glee",
"country": "Brazil",
"amenities": [
"FREE_BREAKFAST",
"FREE_WIFI"
],
"dst_city": "Recife",
"category": "4.0 star hotel"
}
}
],
[],
[],
[],
[],
[],
[]
],
"search": [
{
"ORIGIN_CITY": "Porto Alegre",
"PRICE_MIN": "2000",
"NUM_ADULTS": "2",
"timestamp": 1471271949.995,
"PRICE_MAX": "3000",
"ARE_DATES_FLEXIBLE": "true",
"NUM_CHILDREN": "5",
"START_TIME": "1470110400000",
"MAX_DURATION": 2592000000.0,
"DESTINATION_CITY": "Brazil",
"RESULT_LIMIT": "10",
"END_TIME": "1472616000000"
},
{
"ORIGIN_CITY": "Atlantis",
"NUM_ADULTS": "8",
"RESULT_LIMIT": "10",
"timestamp": 1471272148.124,
"PRICE_MAX": "1700",
"NUM_CHILDREN": "",
"ARE_DATES_FLEXIBLE": "true",
"START_TIME": "NaN",
"END_TIME": "NaN"
},
{
"ORIGIN_CITY": "Caprica",
"PRICE_MAX": "1700",
"NUM_ADULTS": "8",
"RESULT_LIMIT": "10",
"timestamp": 1471272189.07,
"DESTINATION_CITY": "Atlantis",
"NUM_CHILDREN": "",
"ARE_DATES_FLEXIBLE": "true",
"START_TIME": "1470715200000",
"END_TIME": "1472011200000"
},
{
"ORIGIN_CITY": "Caprica",
"PRICE_MAX": "1700",
"NUM_ADULTS": "8",
"RESULT_LIMIT": "10",
"timestamp": 1471272205.436,
"DESTINATION_CITY": "Atlantis",
"NUM_CHILDREN": "",
"ARE_DATES_FLEXIBLE": "true",
"START_TIME": "1470715200000",
"END_TIME": "1472011200000"
},
{
"ORIGIN_CITY": "Caprica",
"PRICE_MIN": "1700",
"NUM_ADULTS": "8",
"RESULT_LIMIT": "10",
"timestamp": 1471272278.72,
"DESTINATION_CITY": "Atlantis",
"NUM_CHILDREN": "",
"ARE_DATES_FLEXIBLE": "true",
"START_TIME": "1470715200000",
"END_TIME": "1472011200000"
},
{
"ORIGIN_CITY": "Caprica",
"PRICE_MIN": "1700",
"NUM_ADULTS": "8",
"RESULT_LIMIT": "10",
"timestamp": 1471272454.542,
"DESTINATION_CITY": "Atlantis",
"NUM_CHILDREN": "",
"ARE_DATES_FLEXIBLE": "true",
"START_TIME": "1471060800000",
"END_TIME": "1472011200000"
},
{
"ORIGIN_CITY": "Caprica",
"PRICE_MIN": "1700",
"NUM_ADULTS": "8",
"RESULT_LIMIT": "10",
"timestamp": 1471272466.008,
"DESTINATION_CITY": "Atlantis",
"NUM_CHILDREN": "",
"ARE_DATES_FLEXIBLE": "true",
"START_TIME": "1471060800000",
"END_TIME": "1472011200000"
}
]
},
How it could be modified to extract all the "text" values from the JSON file to a CSV file?
This is a potential solution using pandas:
import pandas as pd
#importing data
dj = pd.read_json("frames2.json")
dtext = dj[["user_id","turns"]]
#Saving text records in a list
list_ = []
for record in dtext["turns"].values:
for r in record:
list_.append(r["text"])
#Exporting the csv
out = pd.Series(list_,name="text")
out.to_csv("text.csv")
It gives the following output.
Try:
import json
import csv
with open("/Users/zahraa-maher/rasa-init-demo/venv/Tickie/external_data/frames2.json") as file:
data = json.load(file)
fname = "outputText8.csv"
with open(fname, "w") as file:
csv_file = csv.writer(file,lineterminator='\n')
csv_file.writerow(["text"])
for keys,values in data.items():
now it up to you which of the fields you want to save, if you user a debugger you can see the values and Keys

How to insert json from API to snowflake database using python?

I am getting data from Linkedin AD API using python.
I get the data as a json string.
How can I insert this json into Snowfalke table with a variant column?
Instead of variant, fields inside "elements" can also be inserted as a normal.
I am new to both json and python so would love to get some help on this.
Here is the sample json string I am getting.
{
"elements": [
{
"dateRange": {
"start": {
"month": 3,
"year": 2019,
"day": 3
},
"end": {
"month": 3,
"year": 2019,
"day": 3
}
},
"clicks": 11,
"impressions": 2453,
"pivotValues": [
"urn:li:sponsoredCampaign:1234567"
]
},
{
"dateRange": {
"start": {
"month": 3,
"year": 2019,
"day": 4
},
"end": {
"month": 3,
"year": 2019,
"day": 4
}
},
"clicks": 4,
"impressions": 816,
"pivotValues": [
"urn:li:sponsoredCampaign:1234567"
]
},
{
"dateRange": {
"start": {
"month": 3,
"year": 2019,
"day": 7
},
"end": {
"month": 3,
"year": 2019,
"day": 7
}
},
"clicks": 1,
"impressions": 629,
"pivotValues": [
"urn:li:sponsoredCampaign:1234565"
]
},
{
"dateRange": {
"start": {
"month": 3,
"year": 2019,
"day": 21
},
"end": {
"month": 3,
"year": 2019,
"day": 21
}
},
"clicks": 3,
"impressions": 154,
"pivotValues": [
"urn:li:sponsoredCampaign:1323516"
]
}
],
"paging": {
"count": 10,
"start": 0,
"links": []
}
}
The documentation might be helpful here.
In particular:
INSERT INTO myTable (myColumn)
SELECT ('{"key3": "value3", "key4": "value4"}'::VARIANT);
Just insert your JSON string in the appropriate place.
Here is an example in python of how to insert JSON data:
https://github.com/snowflakedb/snowflake-connector-python/blob/master/test/test_cursor.py#L456
I imagine you're missing the parse_json function from your insert.

Converting a nested JSON to CSV in Python

I would like to convert nested json to a csv file.
I am receiving the json from Rest API.
The fields in csv should look like following.
daterange_start,daterange_end,clicks,impressions,pivotvalues.
I am new to Python and JSON so would love to get some help.
Here is the sample json.
{
"elements": [
{
"dateRange": {
"start": {
"month": 3,
"year": 2019,
"day": 3
},
"end": {
"month": 3,
"year": 2019,
"day": 3
}
},
"clicks": 11,
"impressions": 2453,
"pivotValues": [
"urn:li:sponsoredCampaign:1234567"
]
},
{
"dateRange": {
"start": {
"month": 3,
"year": 2019,
"day": 7
},
"end": {
"month": 3,
"year": 2019,
"day": 7
}
},
"clicks": 1,
"impressions": 629,
"pivotValues": [
"urn:li:sponsoredCampaign:1234565"
]
},
{
"dateRange": {
"start": {
"month": 3,
"year": 2019,
"day": 21
},
"end": {
"month": 3,
"year": 2019,
"day": 21
}
},
"clicks": 3,
"impressions": 154,
"pivotValues": [
"urn:li:sponsoredCampaign:1323516"
]
}
],
"paging": {
"count": 10,
"start": 0,
"links": []
}
}
You could use json_normalize. The only issue is the "pivotValues" is a list. So not sure what you'd want there, or if there are more than 1 element within those lists. If it's just one element, you can just easily process that column. If it can have multiple elements, you can eaither create a new row for each element (meaning you have multiple rows with the same data, except different pivotValues, or you could extend each row to have each pivotValues, but then would have nulls with those lists as different lengths.
I also added on there (seeing that the pivotValues all have same prefix), splitting out hat value for you in case yo needed it.
Given:
data = {
"elements": [
{
"dateRange": {
"start": {
"month": 3,
"year": 2019,
"day": 3
},
"end": {
"month": 3,
"year": 2019,
"day": 3
}
},
"clicks": 11,
"impressions": 2453,
"pivotValues": [
"urn:li:sponsoredCampaign:1234567"
]
},
{
"dateRange": {
"start": {
"month": 3,
"year": 2019,
"day": 7
},
"end": {
"month": 3,
"year": 2019,
"day": 7
}
},
"clicks": 1,
"impressions": 629,
"pivotValues": [
"urn:li:sponsoredCampaign:1234565"
]
},
{
"dateRange": {
"start": {
"month": 3,
"year": 2019,
"day": 21
},
"end": {
"month": 3,
"year": 2019,
"day": 21
}
},
"clicks": 3,
"impressions": 154,
"pivotValues": [
"urn:li:sponsoredCampaign:1323516"
]
}
],
"paging": {
"count": 10,
"start": 0,
"links": []
}
}
Code:
import pandas as pd
from pandas.io.json import json_normalize
df = json_normalize(data['elements'])
df['pivotValues'] = df.pivotValues.apply(pd.Series).add_prefix('pivotValues_')
df['pivotValues_stripped'] = df['pivotValues'].str.rsplit(':',1, expand=True)[1]
df.to_csv('path/filename.csv', index=False)
Output:
print (results.to_string())
clicks dateRange.end.day dateRange.end.month dateRange.end.year dateRange.start.day dateRange.start.month dateRange.start.year impressions pivotValues pivotValues_stripped
0 11 3 3 2019 3 3 2019 2453 urn:li:sponsoredCampaign:1234567 1234567
1 1 7 3 2019 7 3 2019 629 urn:li:sponsoredCampaign:1234565 1234565
2 3 21 3 2019 21 3 2019 154 urn:li:sponsoredCampaign:1323516 1323516
You can load and parse the json in python with:
import json
y = json.loads(x)
y will be a python dict. Now loop over y['elements'] and create a list with your desired fields. For example extract the year of start and end dates:
list_for_csv=[]
for e in y['elements']:
list_for_csv.append([e['daterange']['start']['year'],e['daterange']['end']['year']])
Then use numpy to save as csv:
import numpy as np
for_csv = np.asarray(list_for_csv)
np.savetxt("your_file.csv", for_csv, delimiter=",")

Correctly parsing data with jq

I have the following data:
[
{
"M": [
{
"id": 1,
"nk": "MATH$$SPRING$$INST1$$2",
"section": {
"nk": "MATH$$SPRING$$INST1",
"course": 1,
"id": 1
},
"location": {
"id": 1,
"nk": "mcu$$101",
"campus": {
"id": 1,
"nk": "mcu",
"name": "Main Campus"
},
"address": "1 st",
"building": "1",
"room": "101"
},
"day_of_week": 2,
"start_time": "09:00:00",
"end_time": "10:00:00"
},
{
"id": 3,
"nk": "ENG$$SPRING$$INST2$$2",
"section": {
"nk": "ENG$$SPRING$$INST2",
"course": 2,
"id": 4
},
"location": {
"id": 2,
"nk": "mcu$$201",
"campus": {
"id": 1,
"nk": "mcu",
"name": "Main Campus"
},
"address": "1 st",
"building": "1",
"room": "201"
},
"day_of_week": 2,
"start_time": "09:00:00",
"end_time": "10:00:00"
},
{
"id": 4,
"nk": "ENG$$SPRING$$INST2$$22",
"section": {
"nk": "ENG$$SPRING$$INST2",
"course": 2,
"id": 4
},
"location": {
"id": 2,
"nk": "mcu$$201",
"campus": {
"id": 1,
"nk": "mcu",
"name": "Main Campus"
},
"address": "1 st",
"building": "1",
"room": "201"
},
"day_of_week": 2,
"start_time": "10:00:00",
"end_time": "11:00:00"
}
]
},
{
"W": [
{
"id": 2,
"nk": "MATH$$SPRING$$INST1$$4",
"section": {
"nk": "MATH$$SPRING$$INST2",
"course": 1,
"id": 2
},
"location": {
"id": 2,
"nk": "mcu$$201",
"campus": {
"id": 1,
"nk": "mcu",
"name": "Main Campus"
},
"address": "1 st",
"building": "1",
"room": "201"
},
"day_of_week": 4,
"start_time": "08:00:00",
"end_time": "10:00:00"
}
]
}
]
I'm trying to extract "W"'s list.
When i do: jq('[.[].W][]').transform(data) i get None, But when i do jq('[.[].M][]').transform(data) I get the desired result. Why im i experiencing this?
I'm trying to extract "W"'s list.
OK, so let's first deal with jq, and then with the python interface.
jq
.[] yields all the items in the top-level array, and therefore
.[] | .W will yield two items:
null (because the first item does not have .W), and
the desired list
To extract just "W"'s list, you could use any of the following filters,
depending on your precise requirements:
.[] | select(has("W")) | .W
.[] | .W | select(.)
.[] | .W // empty
.[1].W
from jq import jq
As the documentation at https://pypi.org/project/pyjq/ says:
If multiple_output is False (the default), then the first output is used
For example:
print jq('1,2').transform(data)
yields just 1.
In summary
Depending on the precise requirements, you can use any of the filters given above, for example:
jq('.[] | .W // empty').transform(data)
Moral
If there's a moral to this tale, it might be that, when in doubt, one should consider using jq (the command-line executable) or jqplay to make sure your jq filter is doing what you want.

converting list of dictionary to dictionary tree based on parent id

I want to make a list of dictionary that way, every element which has a parent id, it should be child of the parent element.
Let's say we have a python list, which contains multiple dictionaries.
[{
"id": 1,
"title": "node1",
"parent": null
},
{
"id": 2,
"title": "node2",
"parent": 1
},
{
"id": 3,
"title": "node3",
"parent": 1
},
{
"id": 4,
"title": "node4",
"parent": 2
},
{
"id": 5,
"title": "node5",
"parent": 2
}]
And I want to convert this list to tree based on parent key. like,
[{
'id':1,
'title':'node1',
'childs':[
{
'id':2,
'title':'node2'
'childs':[
{
'id':4,
'title':'node4',
'childs': []
},
{
'id':5,
'title':'node5',
'childs': []
}
]
},
{
'id':3,
'title':'node3'
'childs':[]
}
]
}]
data = [{
"id": 1,
"title": "node1",
"parent": "null"
},
{ "id": 2,
"title": "node2",
"parent": "null"
},
{
"id": 2,
"title": "node2",
"parent": 1
},
{
"id": 3,
"title": "node3",
"parent": 1
},
{
"id": 4,
"title": "node4",
"parent": 2
},
{
"id": 5,
"title": "node5",
"parent": 2
}]
parent_data=[]
for keys in data:
if keys['parent'] == "null":
keys['childs']=[]
parent_data.append(keys)
for keys in data:
for key in parent_data:
if key['id'] == keys['parent']:
key['childs'].append(keys)
print parent_data
k = [{
"id": 1,
"title": "node1",
"parent": "null"
},
{
"id": 2,
"title": "node2",
"parent": 1
},
{
"id": 3,
"title": "node3",
"parent": 1
},
{
"id": 4,
"title": "node4",
"parent": 2
},
{
"id": 5,
"title": "node5",
"parent": 2
}]
result, t = [], {}
for i in k:
i['childs'] = []
if i['parent'] == 'null':
del i['parent']
result.append(i)
t[1] = result[0]
else:
t[i['parent']]['childs'].append(i)
t[i['id']] = t[i['parent']]['childs'][-1]
del t[i['parent']]['childs'][-1]['parent']
print result

Categories