MongoDB collection to pandas Dataframe - python

My MongoDB document structure is as follows, and some of the factors are NaN.
_id: ObjectId("5feddb959297bb2625db1450")
factors: Array
    0: Object
        factorId: "C24"
        Index: 0
        weight: 1
    1: Object
        factorId: "C25"
        Index: 1
        weight: 1
    2: Object
        factorId: "C26"
        Index: 2
        weight: 1
name: "Growth Led Momentum"
I want to convert it to a pandas DataFrame like this, using pymongo and pandas:

| name                | factorId | Index | weight |
|---------------------|----------|-------|--------|
| Growth Led Momentum | C24      | 0     | 1      |
| Growth Led Momentum | C25      | 1     | 1      |
| Growth Led Momentum | C26      | 2     | 1      |
Thank you

Update
I broke out the ol' Python to give this a crack, and the following code works flawlessly!
from pymongo import MongoClient
import pandas as pd

uri = "mongodb://<your_mongo_uri>:27017"
database_name = "<your_database_name>"
collection_name = "<your_collection_name>"

mongo_client = MongoClient(uri)
database = mongo_client[database_name]
collection = database[collection_name]

# I used this code to insert a doc into a test collection
# before querying (just in case you wanted to know lol)
"""
data = {
    "_id": 1,
    "name": "Growth Lead Momentum",
    "factors": [
        {
            "factorId": "C24",
            "index": 0,
            "weight": 1
        },
        {
            "factorId": "D74",
            "index": 7,
            "weight": 9
        }
    ]
}
insert_result = collection.insert_one(data)
print(insert_result)
"""
# This is the query that answers your question.
results = collection.aggregate([
    {
        "$unwind": "$factors"
    },
    {
        "$project": {
            "_id": 1,  # Change to 0 if you wish to ignore the "_id" field.
            "name": 1,
            "factorId": "$factors.factorId",
            "index": "$factors.index",
            "weight": "$factors.weight"
        }
    }
])
# This is how we turn the results into a DataFrame.
# We can simply pass `list(results)` into `DataFrame(..)`,
# due to how our query works.
results_as_dataframe = pd.DataFrame(list(results))
print(results_as_dataframe)
Which outputs:
   _id                  name factorId  index  weight
0    1  Growth Lead Momentum      C24      0       1
1    1  Growth Lead Momentum      D74      7       9
Original Answer
You could use the aggregation pipeline to unwind factors and then project the fields you want.
Something like this should do the trick.
Database Structure
[
    {
        "_id": 1,
        "name": "Growth Lead Momentum",
        "factors": [
            {
                factorId: "C24",
                index: 0,
                weight: 1
            },
            {
                factorId: "D74",
                index: 7,
                weight: 9
            }
        ]
    }
]
Query
db.collection.aggregate([
    {
        $unwind: "$factors"
    },
    {
        $project: {
            _id: 1,
            name: 1,
            factorId: "$factors.factorId",
            index: "$factors.index",
            weight: "$factors.weight"
        }
    }
])
Results
(.csv friendly)
[
    {
        "_id": 1,
        "factorId": "C24",
        "index": 0,
        "name": "Growth Lead Momentum",
        "weight": 1
    },
    {
        "_id": 1,
        "factorId": "D74",
        "index": 7,
        "name": "Growth Lead Momentum",
        "weight": 9
    }
]

Wonderful answer by Matt! In case you want to do the flattening in pandas, use this after you have retrieved the document from the db:

import pandas as pd

df = pd.json_normalize(data)
df = df['factors'].explode().apply(pd.Series).join(df).drop(columns=['factors'])
Output:
  factorId  Index  weight                 name
0      C24      0       1  Growth Led Momentum
0      C25      1       1  Growth Led Momentum
0      C26      2       1  Growth Led Momentum
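
Alternatively, a minimal sketch using json_normalize's record_path and meta arguments (assuming a document shaped like the one in the question) flattens the array in a single call:

import pandas as pd

# Hypothetical document shaped like the one in the question.
doc = {
    "name": "Growth Led Momentum",
    "factors": [
        {"factorId": "C24", "Index": 0, "weight": 1},
        {"factorId": "C25", "Index": 1, "weight": 1},
        {"factorId": "C26", "Index": 2, "weight": 1},
    ],
}

# record_path walks into the "factors" array; meta copies "name"
# onto every flattened row.
df = pd.json_normalize(doc, record_path="factors", meta=["name"])
print(df)
#   factorId  Index  weight                 name
# 0      C24      0       1  Growth Led Momentum
# 1      C25      1       1  Growth Led Momentum
# 2      C26      2       1  Growth Led Momentum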

Related

dictionaries to pandas dataframe

I'm trying to extract data from dictionaries; here's an example for one dictionary, and what I have so far (probably not the greatest solution).
import pandas as pd

def common():
    ab = {
        "names": ["Brad", "Chad"],
        "org_name": "Leon",
        "missing": 0.3,
        "con": {
            "base": "abx",
            "conditions": {"func": "**", "ref": 0},
            "results": 4,
        },
        "change": [{"func": "++", "ref": 50, "res": 31},
                   {"func": "--", "ref": 22, "res": 11}]
    }
    data = []
    if "missing" in ab.keys():
        data.append(
            {
                "names": ab["names"],
                "org_name": ab["org_name"],
                "func": "missing",
                "ref": "",
                "res": ab["missing"],
            }
        )
    if "con" in ab.keys():
        data.append(
            {
                "names": ab["names"],
                "org_name": ab["con"]["base"],
                "func": ab["con"]["conditions"]["func"],
                "ref": ab["con"]["conditions"]["ref"],
                "res": ab["con"]["results"],
            }
        )
    df = pd.DataFrame(data)
    print(df)
    return df
Output:
          names org_name     func ref  res
0  [Brad, Chad]     Leon  missing      0.3
1  [Brad, Chad]      abx       **   0  4.0
What I would like the output to look like:
          names org_name     func ref  res
0  [Brad, Chad]     Leon  missing      0.3
1  [Brad, Chad]      abx       **   0    4
2  [Brad, Chad]     Leon       ++  50   31
3  [Brad, Chad]     Leon       --  22   11
The dictionaries can be of different lengths, and ultimately a list of several dictionaries will be passed. I'm not sure how to repeat the names and org_name values for each ref/res pair... I don't want to keep adding rows one by one; a dynamic solution is always preferred.
Try:
import pandas as pd

ab = {
    "names": ["Brad", "Chad"],
    "org_name": "Leon",
    "missing": 0.3,
    "con": {
        "base": "abx",
        "conditions": {"func": "**", "ref": 0},
        "results": 4,
    },
    "change": [{"func": "++", "ref": 50, "res": 31},
               {"func": "--", "ref": 22, "res": 11}]
}

out = []
if 'change' in ab:
    for ch in ab['change']:
        out.append({'names': ab['names'], 'org_name': ab['org_name'], **ch})
if 'con' in ab:
    out.append({'names': ab['names'], 'org_name': ab['con']['base'],
                **ab['con']['conditions'], 'res': ab['con']['results']})
if 'missing' in ab:
    out.append({'names': ab['names'], 'org_name': ab['org_name'],
                'func': 'missing', 'res': ab['missing']})

print(pd.DataFrame(out).fillna(''))
Prints:
          names org_name     func   ref   res
0  [Brad, Chad]     Leon       ++  50.0  31.0
1  [Brad, Chad]     Leon       --  22.0  11.0
2  [Brad, Chad]      abx       **   0.0   4.0
3  [Brad, Chad]     Leon  missing         0.3
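
Since the dictionaries ultimately arrive as a list, here is a small sketch wrapping the same logic in a (hypothetical) helper so it scales to any number of them:

def rows_for(ab):
    # Same extraction logic as above, applied to one dictionary.
    out = []
    if 'change' in ab:
        for ch in ab['change']:
            out.append({'names': ab['names'], 'org_name': ab['org_name'], **ch})
    if 'con' in ab:
        out.append({'names': ab['names'], 'org_name': ab['con']['base'],
                    **ab['con']['conditions'], 'res': ab['con']['results']})
    if 'missing' in ab:
        out.append({'names': ab['names'], 'org_name': ab['org_name'],
                    'func': 'missing', 'res': ab['missing']})
    return out

dicts = [ab]  # in practice, the full list of dictionaries
df = pd.DataFrame([row for d in dicts for row in rows_for(d)]).fillna('')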

Python - Grab specific value from known key inside large json

I need to get just 2 entries from inside a very large JSON array. I don't know the array position, but I do know a key:value pair of each entry I want to find and which value I want from that entry.
In this example there are only 4 entries, but the original has over 1,000, and I need only 2 of them, for which I know "name" and "symbol". I need to get the value of quotes->ASK->time.
import requests

x = requests.get('http://example.org/data.json')
parsed = x.json()
gettime = str(parsed[0]["quotes"]["ASK"]["time"])
print(gettime)
I know that I can get it that way and then loop through it a thousand times, but that seems like overkill for just 2 values. Is there a way to do something like parsed["symbol":"kalo"]["quotes"]["ASK"]["time"], which would give me kalo's time, without looping through all thousand entries?
[
    {
        "id": "nem-cri",
        "name": "nemlaaoo",
        "symbol": "nem",
        "rank": 27,
        "owner": "marcel",
        "quotes": {
            "ASK": {
                "price": 19429,
                "time": 319250866,
                "duration": 21
            }
        }
    },
    {
        "id": "kalo-lo-leek",
        "name": "kalowaaa",
        "symbol": "kalo",
        "rank": 122,
        "owner": "daniel",
        "quotes": {
            "ASK": {
                "price": 12928,
                "time": 937282932,
                "duration": 9
            }
        }
    },
    {
        "id": "reewmaarwl",
        "name": "reeqooow",
        "symbol": "reeq",
        "rank": 4,
        "owner": "eric",
        "quotes": {
            "ASK": {
                "price": 9989,
                "time": 124288222,
                "duration": 19
            }
        }
    },
    {
        "id": "sharkooaksj",
        "name": "sharkmaaa",
        "symbol": "shark",
        "rank": 22,
        "owner": "eric",
        "quotes": {
            "ASK": {
                "price": 11122,
                "time": 482773882,
                "duration": 22
            }
        }
    }
]
If you are OK with using pandas I would just create a DataFrame.
import pandas as pd
df = pd.json_normalize(parsed)
print(df)
             id       name symbol  rank   owner  quotes.ASK.price  \
0       nem-cri   nemlaaoo    nem    27  marcel             19429
1  kalo-lo-leek   kalowaaa   kalo   122  daniel             12928
2    reewmaarwl   reeqooow   reeq     4    eric              9989
3   sharkooaksj  sharkmaaa  shark    22    eric             11122

   quotes.ASK.time  quotes.ASK.duration
0        319250866                   21
1        937282932                    9
2        124288222                   19
3        482773882                   22
If you want the kalo time, then:
print(df[df['symbol'] == 'kalo']['quotes.ASK.time'])  # -> 937282932
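
If you'd rather not pull in pandas at all, here is a minimal sketch using next() on a generator expression (assuming parsed is the list shown above); it stops scanning at the first match instead of walking all thousand entries:

# Find the first entry whose "symbol" matches, then read its ASK time.
kalo = next(item for item in parsed if item["symbol"] == "kalo")
print(kalo["quotes"]["ASK"]["time"])  # -> 937282932

# For exactly two known symbols, one pass over the list is enough:
wanted = {"kalo", "shark"}
times = {item["symbol"]: item["quotes"]["ASK"]["time"]
         for item in parsed if item["symbol"] in wanted}
print(times)  # -> {'kalo': 937282932, 'shark': 482773882}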

Python: How to insert an array of data into MongoDB using pymongo from a dataframe

I have a dataframe with these values:
df
 name  rank subject  marks  age
  tom   123    math     25   10
 mark   124    math     50   10
How do I insert the dataframe into MongoDB using pymongo so that the first two columns become regular fields and the other three become an array?
{
    "_id": "507f1f77bcf86cd799439011",
    "name": "tom",
    "rank": "123",
    "scores": [{
        "subject": "math",
        "marks": 25,
        "age": 10
    }]
}
{
    "_id": "507f1f77bcf86cd799439012",
    "name": "mark",
    "rank": "124",
    "scores": [{
        "subject": "math",
        "marks": 50,
        "age": 10
    }]
}
I tried this:
convert_dict = df.to_dict("records")
mydb.school_data.insert_many(convert_dict)
I used this solution:
convert_dict = df.to_dict(orient="records")
mydb.school_data.insert_many(convert_dict)
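
Note that to_dict(orient="records") inserts each row flat; it does not build the nested scores array shown above. Here is a minimal sketch of one way to get that shape before calling insert_many (column names as in the question; _id is left for MongoDB to generate):

import pandas as pd

df = pd.DataFrame({
    "name": ["tom", "mark"],
    "rank": [123, 124],
    "subject": ["math", "math"],
    "marks": [25, 50],
    "age": [10, 10],
})

# Wrap the last three columns into a one-element "scores" array per row.
# The int() casts guard against numpy integer types, which pymongo
# cannot encode.
docs = [
    {
        "name": row["name"],
        "rank": str(row["rank"]),
        "scores": [{"subject": row["subject"],
                    "marks": int(row["marks"]),
                    "age": int(row["age"])}],
    }
    for row in df.to_dict(orient="records")
]
print(docs)

# mydb is the database handle from the question:
# mydb.school_data.insert_many(docs)

If several rows belong to the same student, a groupby on name and rank would let you collect all of that student's rows into a single scores array instead.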

from dataframe to json

Hi, how are things? I have a dataframe that looks like a recursive table, and my idea is to transform it into a nested (matryoshka-style) JSON. I'm using Python.
My example:
Dataframe:
| id | name       | relations |
|----|------------|-----------|
| 1  | config     | 0         |
| 2  | buttons    | 1         |
| 3  | accept     | 2         |
| 4  | delete     | 2         |
| 5  | descripton | 1         |
| 6  | title      | 1         |
| 7  | juan       | 0         |
and the JSON that I want is:
[
    {
        "id": "1",
        "name": "config",
        "relations": [
            {
                "id": "2",
                "name": "buttons",
                "relations": [
                    {
                        "id": "3",
                        "name": "accept",
                        "relations": []
                    },
                    {
                        "id": "4",
                        "name": "delete",
                        "relations": []
                    }
                ]
            },
            {
                "id": "5",
                "name": "descripton",
                "relations": []
            },
            {
                "id": "6",
                "name": "title",
                "relations": []
            }
        ]
    },
    {
        "id": "7",
        "name": "juan",
        "relations": []
    }
]
As you can see, the "relations" column joins a row to its parent's id (0 means the node has no parent).
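
Here is a minimal sketch of one way to build this: treat "relations" as a parent pointer and recurse from the roots (assuming, as the table suggests, that 0 marks a root node):

import json
import pandas as pd

df = pd.DataFrame({
    "id": [1, 2, 3, 4, 5, 6, 7],
    "name": ["config", "buttons", "accept", "delete", "descripton", "title", "juan"],
    "relations": [0, 1, 2, 2, 1, 1, 0],
})

def build(parent_id):
    # Collect every row whose "relations" points at parent_id,
    # then recurse to attach each node's own children.
    rows = df[df["relations"] == parent_id]
    return [
        {"id": str(r["id"]), "name": r["name"], "relations": build(r["id"])}
        for r in rows.to_dict("records")
    ]

print(json.dumps(build(0), indent=2))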

Pandas json_normalize with timestamps as keys

I'm trying to read this JSON data
{
    "values": [
        {
            "1510122047": [
                35.7,
                256
            ]
        },
        {
            "1510125000": [
                41.7,
                7
            ]
        },
        {
            "1510129000": [
                31.7,
                0
            ]
        }
    ]
}
and normalize it into a pandas data frame with one row per timestamp and the two readings as separate columns.
I tried json_normalize but was not able to get the result I need.
Here is what I tried; it works, but it's not very efficient. I would like to find a solution that uses pandas' built-in functions. I'd appreciate ideas!
import pandas
import json

s = """
{"values": [
    {
        "1510122047": [35.7, 256]
    },
    {
        "1510125000": [41.7, 7]
    },
    {
        "1510129000": [31.7, 0]
    }
]}
"""
data = json.loads(s)

normalized_data = []
for value in data['values']:
    timestamp = list(value.keys())[0]
    normalized_data.append({'timestamp': timestamp,
                            'value_1': value[timestamp][0],
                            'value_2': value[timestamp][1]})
pandas.DataFrame(normalized_data)
Thanks
EDIT
Thanks for your suggestions. Unfortunately none were faster than the solution in the original post. Here is what I did to generate a bigger payload and test for speed.
I guess it's just the nature of JSON to be slow for this application.
import pandas
import json
import time

s1 = """{
    "1510122047": [35.7, 256]
},
{
    "1510125000": [41.7, 7]
},
{
    "1510129000": [31.7, 0]
}"""

s = """
{"values": [
    {"1510122047": [35.7, 256]},
    {"1510125000": [41.7, 7]},
    {"1510129000": [31.7, 0]},
""" + ",".join([s1] * 1000000) + "]}"

data = json.loads(s)

tic = time.time()
normalized_data = []
for value in data['values']:
    timestamp = list(value.keys())[0]
    normalized_data.append({'timestamp': timestamp,
                            'value_1': value[timestamp][0],
                            'value_2': value[timestamp][1]})
print(time.time() - tic)

pandas.DataFrame(normalized_data)
This is one approach using a nested list comprehension.
Ex:

import pandas as pd

df = pd.DataFrame([[key] + value
                   for item in data['values']
                   for key, value in item.items()],
                  columns=["Timestamp", "Val_1", "Val_2"])
print(df)
Output:
    Timestamp  Val_1  Val_2
0  1510122047   35.7    256
1  1510125000   41.7      7
2  1510129000   31.7      0
import pandas as pd
from pandas import json_normalize

data = {'values': [{'1510122047': [35.7, 256]},
                   {'1510125000': [41.7, 7]},
                   {'1510129000': [31.7, 0]}]}

dfn = json_normalize(data, 'values').stack()
df_output = pd.DataFrame(dfn.tolist(), index=dfn.index)
df_output = df_output.reset_index().iloc[:, 1:]

# rename columns
df_output.columns = 'value_' + df_output.columns.astype(str)
df_output.rename(columns={'value_level_1': 'timestamp'}, inplace=True)
print(df_output)

#     timestamp  value_0  value_1
# 0  1510122047     35.7      256
# 1  1510125000     41.7        7
# 2  1510129000     31.7        0
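
If speed is the main concern, one more variant worth trying (a sketch, not guaranteed to win; data as defined above) is to feed tuples straight into the DataFrame constructor so no per-row dict is built:

import pandas as pd

# Generator of (timestamp, value_1, value_2) tuples; the column
# names are attached once, at construction time.
records = ((ts, v[0], v[1])
           for item in data['values']
           for ts, v in item.items())
df = pd.DataFrame(records, columns=['timestamp', 'value_1', 'value_2'])
print(df)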
