How can I query a MongoDB database with different child levels? - python

I'm new to MongoDB, using pymongo. I'm trying to query a collection and also get a specific child level of a field. This is what I tried:
import pymongo
import csv
from pymongo import MongoClient

connection = MongoClient()
db = connection.database
collection1 = db.data1
collection2 = db.data2

writer = csv.writer(open("Result_example.csv", "w"))
with open('Data_example.csv') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=';')
    for row in spamreader:
        for rows in collection1.find({"_id": row[0]}, {"childs.first.name": 1}):
            writer.writerow([row[0], rows.get("childs.first.name")])
The database structure is like this:
child
    first
        name
What I want to get is the name...Any ideas?
Thanks!!!

Other than the field being pluralized relative to the example structure provided (childs vs. child), the query looks fine:
for rows in collection1.find({"_id": row[0]}, {"child.first.name": 1}):
Note that the child field is singular.
rows is a reference to a dictionary object like the one below:
{
    'child': {
        'first': {
            'name': 'Vorname'
        }
    }
}
rows.get("childs.first.name") returns None in writer.writerow([row[0], rows.get("childs.first.name")])
You can retrieve the name using
rows.get('child').get('first').get('name')
Or
rows['child']['first']['name']
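Note that the chained .get() form raises AttributeError if an intermediate key is missing; supplying empty-dict defaults keeps the lookup safe:
name = rows.get('child', {}).get('first', {}).get('name')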
You can avoid these nested key accesses altogether by running an aggregation that returns the document id and first name, in place of collection1.find({"_id": row[0]}, {"child.first.name": 1}).
from bson import ObjectId

children_names = db.collection1.aggregate([
    {
        '$match': {'_id': ObjectId(row[0])}
    },
    {
        '$replaceRoot': {'newRoot': {'_id': '$_id', 'first_name': '$child.first.name'}}
    },
])
Key access could then be done once.
for rows in children_names:
    writer.writerow([row[0], rows.get("first_name")])

Related

how to call a python function by getting mongodb collection values

How to create a document and collection in MongoDB to hold Python code configuration? Get the attribute name, datatype, and function to be called from MongoDB.
Sample MongoDB collection:
db.attributes.insertMany([
    { attributes_names: "email", attributes_datype: "string", attributes_isNull: "false", attributes_std_function: "email_valid" },
    { attributes_names: "address", attributes_datype: "string", attributes_isNull: "false", attributes_std_function: "address_valid" }
]);
Python script and function:
from pyspark.sql.functions import expr, lower, regexp_replace

def email_valid(df):
    df1 = df.withColumn(df.columns[0],
                        regexp_replace(lower(df.columns[0]), "^a-zA-Z0-9#\._\-| ", ""))
    extract_expr = expr(
        "regexp_extract_all(emails, '(\\\w+([\\\.-]?\\\w+)*#\\[A-Za-z\-\.]+([\\\.-]?\\\w+)*(\\\.\\\w{2,3})+)', 0)")
    df2 = df1.withColumn(df.columns[0], extract_expr) \
        .select(df.columns[0])
    return df2
How to get all the MongoDB values into the Python script and call the function according to the attributes?
To create a MongoDB collection from a Python script:
import pymongo
from bson import ObjectId
from random_object_id import generate

# connect to your mongodb client
client = pymongo.MongoClient(connection_url)

# connect to the database
db = client[database_name]

# get the collection
mycol = db[collection_name]

# create a sample dictionary for the collection data
mydict = { "_id": ObjectId(generate()),
           "attributes_names": "email",
           "attributes_datype": "string",
           "attributes_isNull": "false",
           "attributes_std_function": "email_valid" }

# insert the dictionary into the collection
mycol.insert_one(mydict)
To insert multiple values into MongoDB, use insert_many() instead of insert_one() and pass it a list of dictionaries. Your list of dictionaries will look like this:
mydict = [{ "_id": ObjectId(generate()),
            "attributes_names": "email",
            "attributes_datype": "string",
            "attributes_isNull": "false",
            "attributes_std_function": "email_valid" },
          { "_id": ObjectId(generate()),
            "attributes_names": "email",
            "attributes_datype": "string",
            "attributes_isNull": "false",
            "attributes_std_function": "email_valid" }]
To get all the data from a MongoDB collection into the Python script:
import pandas as pd

data = list()
for x in mycol.find():
    data.append(x)

data = pd.json_normalize(data)
pd.json_normalize returns a pandas DataFrame, so access a value by column name and row index rather than as a list of dictionaries:
value = data["attributes_names"][0]
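To then call the matching function for each attribute document, which is the second part of the question, one option is a plain dispatch dictionary. This is a sketch that assumes the validator functions from the question (email_valid, address_valid) are defined or imported in the script:
dispatch = {
    "email_valid": email_valid,
    "address_valid": address_valid,
}

for doc in mycol.find():
    func = dispatch.get(doc["attributes_std_function"])
    if func is not None:
        result = func(df)  # df: the (hypothetical) Spark DataFrame to validate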

how to remove the _id field while extracting a mongodb document to json in python

How to remove the _id field while extracting a MongoDB document to JSON in Python? I have written the code but get nothing in the JSON output.
The MongoDB document looks like:
db.collection.find().pretty()
{
    "_id" : ObjectId("612334997e2f032b9f077eb7"),
    "sourceAttribute" : "first_name",
    "domainAttribute" : "First_Name"
}
Code tried:
import pymongo
from bson.json_util import dumps

myclient = pymongo.MongoClient('mongodb://localhost:27017/')
mydb = myclient["guid"]
mycol = mydb["mappedfields"]

cursor = mydb.mycol.find({}, {'_id': False})
list_cur = list(cursor)
json_data = dumps(list_cur, indent=1)

with open('mapping_files/mapping_from_mongodb.json', 'w') as file:
    file.write(json_data)
Output I am getting:
[]
Expected output:
[
{
"sourceAttribute": "first_name",
"domainAttribute": "First_Name"
}
]
cursor = mycol.find({}, {'_id': False})
Use the mycol variable directly: mydb.mycol looks up a collection literally named "mycol", which is empty, hence the [] output. The {'_id': False} projection belongs in the second argument, as shown.
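Put together, the corrected extraction, using only the question's own variables:
cursor = mycol.find({}, {'_id': False})
list_cur = list(cursor)
json_data = dumps(list_cur, indent=1)

with open('mapping_files/mapping_from_mongodb.json', 'w') as file:
    file.write(json_data)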

Google BigQuery: In Python, column addition makes all the other columns Nullable

I have a table that already exists with the following schema:
{
    "schema": {
        "fields": [
            {
                "mode": "required",
                "name": "full_name",
                "type": "string"
            },
            {
                "mode": "required",
                "name": "age",
                "type": "integer"
            }
        ]
    }
}
It already contains entries like:
{'full_name': 'John Doe',
'age': int(33)}
I want to insert a new record with a new field and have the load job automatically add the new column as it loads. The new format looks like this:
record = {'full_name': 'Karen Walker',
'age': int(48),
'zipcode': '63021'}
My code is as follows:
from google.cloud import bigquery

client = bigquery.Client(project=projectname)
table = client.get_table(table_id)

config = bigquery.LoadJobConfig()
config.autodetect = True
config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
config.schema_update_options = [
    bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION,
]

job = client.load_table_from_json([record], table, job_config=config)
job.result()
This results in the following error:
400 Provided Schema does not match Table my_project:my_dataset:mytable. Field age has changed mode from REQUIRED to NULLABLE
I can fix this by changing config.schema_update_options as follows:
config.schema_update_options = [
    bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION,
    bigquery.SchemaUpdateOption.ALLOW_FIELD_RELAXATION,
]
This allows me to insert the new record, with zipcode added to the schema, but it causes both full_name and age to become NULLABLE, which is not the behavior I want. Is there a way to prevent schema auto-detect from changing the existing columns?
If you need to add fields to your schema, you can do the following:
from google.cloud import bigquery
client = bigquery.Client()
table = client.get_table("your-project.your-dataset.your-table")
original_schema = table.schema # Get your current table's schema
new_schema = original_schema[:] # Creates a copy of the schema.
# Add new field to schema
new_schema.append(bigquery.SchemaField("new_field", "STRING"))
# Set new schema in your table object
table.schema = new_schema
# Call API to update your table with the new schema
table = client.update_table(table, ["schema"])
After updating your table's schema, you can load your new records with the additional field without any schema update options, so the existing REQUIRED columns keep their mode.
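For example, the follow-up load then needs no schema_update_options at all; this is a sketch reusing record, table, and client from the question:
config = bigquery.LoadJobConfig()
config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND

job = client.load_table_from_json([record], table, job_config=config)
job.result()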

How to insert a document with collection.update_many() into a collection (MongoDB) using Pymongo (no duplicates)

I insert documents into a collection with collection.update() because each document has a distinct postID. I want it so that when I run the script, a post that is already in MongoDB gets updated instead of inserted again with a duplicate postID. This is the structure of my data:
comment1 = [
    {
        'commentParentId': parent_content.text,
        'parentId': parent_ID,
        'posted': child_time.text,
        'postID': child_ID,
        'author': {
            'name': child_name.text
        },
        'content': child_content.text
    },
    ...............
]
This is the code I used to insert the data:
client = MongoClient()
db = client['comment_data2']
db.collection_data = db['comments']

for i in data_comment:
    db.collection_data.update_many(
        {db.collection_data.find({"postID": {"$in": i["postID"]}})},
        {"$set": i},
        {'upsert': True}
    )
But I get an error: TypeError: filter must be an instance of dict, bson.son.SON, or other type that inherits from collections.Mapping, pointing at the {'upsert': True} line. And is {db.collection_data.find({"postID": {"$in": i["postID"]}})} right?
The filter must be a plain dict, not a find() call wrapped in a set, and upsert is a keyword argument rather than a positional dict. You can use this code:
db.collection_data.update_many(
    {"postID": i["postID"]},
    {"$set": i},
    upsert=True
)
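As an aside, if postID is meant to be unique, a unique index makes the no-duplicates guarantee explicit at the database level; a sketch using the comments collection from the question:
# Enforce uniqueness so an accidental plain insert cannot duplicate a postID.
db['comments'].create_index('postID', unique=True)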

How to parse nested JSON and construct relational database columns from dict values using Python

Below is my sample JSON. I am trying to extract the "attributes" part of the JSON and insert it into a relational database, but I need to turn the "name" values into relational columns and insert the "value" values into the table. For example, from {"name":"ID","value":"528BE6D9FD"}, "ID" becomes a column and 528BE6D9FD is inserted under "ID". It's just the beginning of my Python learning, so I'm not sure how to construct columns from dictionary values.
import json

d = 'C:/adapters/sample1.json'
json_data = open(d).read()
json_file = json.loads(json_data)

for children in json_file["events"]:
    #print (children)
    for grandchildren in children["attributes"]:
        #print(grandchildren)
        for key, value in grandchildren.iteritems():
            #if key == 'name':
            print value
{
    "events": [
        {
            "timestamp": "2010-11-20T11:08:00.978Z",
            "code": "Event",
            "namespace": null,
            "version": null,
            "attributes": [
                { "name": "ID", "value": "528BE6D9FD" },
                { "name": "Total", "value": 67 },
                { "name": "PostalCode", "value": "6064" },
                { "name": "Category", "value": "More" },
                { "name": "State", "value": "QL" },
                { "name": "orderDateTime", "value": "2010-07-20T12:08:13Z" },
                { "name": "CategoryID", "value": "1091" },
                { "name": "billingCountry", "value": "US" },
                { "name": "shipping", "value": "Go" },
                { "name": "orderFee", "value": 77 },
                { "name": "Name", "value": "Roy" }
            ]
        }
    ]
}
As far as extracting the attributes list from your JSON data, I would do that like so:
import json

json_path = "c:\\adapters\\sample1.json"
with open(json_path) as json_file:
    json_dict = json.load(json_file)

attributes = json_dict['events'][0]['attributes']
Now, I don't know which database system you are using, but regardless, you can extract the names and values with list comprehensions like so:
names = [attr['name'] for attr in attributes]
values = [attr['value'] for attr in attributes]
And now just create a table if needed, insert names as the column headers, and insert values as a single row aligned with those names, as in the sketch below.
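For a concrete (hypothetical) example, here is how that could look with sqlite3 from the standard library; the database file and table name are made up for the sketch:
import sqlite3

# Sketch only: database file and table name are hypothetical.
conn = sqlite3.connect("events.db")
cur = conn.cursor()

# Build quoted column names from the extracted attribute names,
# and one placeholder per value for the parameterized insert.
columns = ", ".join(f'"{name}"' for name in names)
placeholders = ", ".join("?" for _ in values)

cur.execute(f'CREATE TABLE IF NOT EXISTS attributes ({columns})')
cur.execute(f'INSERT INTO attributes ({columns}) VALUES ({placeholders})', values)

conn.commit()
conn.close()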
