how to create document and collection in mongodb to make python code configuration. Get attribute name, datatype, function to be called from mongodb ?
mongodb collection sample example
db.attributes.insertMany([
{ attributes_names: "email", attributes_datype: "string", attributes_isNull="false", attributes_std_function = "email_valid" }
{ attributes_names: "address", attributes_datype: "string", attributes_isNull="false", attributes_std_function = "address_valid" }
]);
Python script and function
def email_valid(df):
df1 = df.withColumn(df.columns[0], regexp_replace(lower(df.columns[0]), "^a-zA-Z0-9#\._\-| ", ""))
extract_expr = expr(
"regexp_extract_all(emails, '(\\\w+([\\\.-]?\\\w+)*#\\[A-Za-z\-\.]+([\\\.-]?\\\w+)*(\\\.\\\w{2,3})+)', 0)")
df2 = df1.withColumn(df.columns[0], extract_expr) \
.select(df.columns[0])
return df2
How to get all the mongodb values in python script and call the function according to attribues.
To create MongoDB collection from a python script :
import pymongo
# connect to your mongodb client
client = pymongo.MongoClient(connection_url)
# connect to the database
db = client[database_name]
# get the collection
mycol = db[collection_name]
from bson import ObjectId
from random_object_id import generate
# create a sample dictionary for the collection data
mydict = { "_id": ObjectId(generate()),
"attributes_names": "email",
"attributes_datype": "string",
"attributes_isNull":"false",
"attributes_std_function" : "email_valid" }
# insert the dictionary into the collection
mycol.insert_one(mydict)
To insert multiple values in the MongoDB, use insert_many() instead of insert_one() and pass the list of dictionary to it. So your list of dictionary will look like this
mydict = [{ "_id": ObjectId(generate()),
"attributes_names": "email",
"attributes_datype": "string",
"attributes_isNull":"false",
"attributes_std_function" : "email_valid" },
{ "_id": ObjectId(generate()),
"attributes_names": "email",
"attributes_datype": "string",
"attributes_isNull":"false",
"attributes_std_function" : "email_valid" }]
To get all the data from MongoDB collection into python script :
data = list()
for x in mycol.find():
data.append(x)
import pandas as pd
data = pd.json_normalize(data)
And then access the data as you access an element of a list of dictionaries:
value = data[0]["attributes_names"]
Related
I have a mongodb collection data in the following format
[{"name":"axe1","base-url":"www.example1.com","date":"2022-06-22 11:20", "ml_pred":"Invalid","hum_pred":"valid"},
{"name":"axe2","base-url":"www.example2.com","date":"2022-06-22 12:20", "ml_pred":"Valid","hum_pred":"null"},
{"name":"axe1","base-url":"www.example1.com","date":"2022-06-22 22:20", "ml_pred":"Invalid","hum_pred":"valid"},
{"name":"axe3","base-url":"www.example3.com","date":"2022-06-22 02:20", "ml_pred":"Valid","hum_pred":"null"},
{"name":"axe2","base-url":"www.example2.com","date":"2022-06-22 06:20", "ml_pred":"Invalid","hum_pred":"valid"},
{"name":"axe1","base-url":"www.example1.com","date":"2022-06-22 14:20", "ml_pred":"Invalid","hum_pred":"null"},
{"name":"axe1","base-url":"www.example1.com","date":"2022-06-22 10:20", "ml_pred":"Invalid","hum_pred":"invalid"},
{"name":"axe1","base-url":"www.example1.com","date":"2022-06-22 01:20", "ml_pred":"Invalid","hum_pred":"null"}]
I am trying to get unique base-url and name as a response. For that I use pymongo distinct like below
filter_stuff = {'base-url': 1, 'name':1,'_id': 0}
data = list(crawlcol.find({},filter_stuff).distinct("base-url"))
which returned me a list of base urls. But I am expecting an output like
[{"name":"axe1","base-url":"www.example1.com"},
{"name":"axe2","base-url":"www.example2.com"},
{"name":"axe3","base-url":"www.example3.com"}]
How this can be obtained
This will give the result as required
result = list(crawlcol.aggregate(
[
{"$group": { "_id": { "base-url": "$base-url", "name": "$name" } } }
]
))
I have a table that already exists with the following schema:
{
"schema": {
"fields": [
{
"mode": "required",
"name": "full_name",
"type": "string"
},
{
"mode": "required",
"name": "age",
"type": "integer"
}]
}
}
It already contains entries like:
{'full_name': 'John Doe',
'age': int(33)}
I want to insert a new record with a new field and have the load job automatically add the new column as it loads. The new format looks like this:
record = {'full_name': 'Karen Walker',
'age': int(48),
'zipcode': '63021'}
My code is as follows:
from google.cloud import bigquery
client = bigquery.Client(project=projectname)
table = client.get_table(table_id)
config = bigquery.LoadJobConfig()
config.autoedetect = True
config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
config.schema_update_options = [
bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION,
]
job = client.load_table_from_json([record], table, job_config=config)
job.result()
This results in the following error:
400 Provided Schema does not match Table my_project:my_dataset:mytable. Field age has changed mode from REQUIRED to NULLABLE
I can fix this by changing config.schema_update_options as follows:
bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION,
bigquery.SchemaUpdateOption.ALLOW_FIELD_RELAXATION
]
This allows me to insert the new record, with zipcode added to the schema, but it causes both full_name and age to become NULLABLE, which is not the behavior I want. Is there a way to prevent schema auto-detect from changing the existing columns?
If you need to add fields to your schema, you can do the following:
from google.cloud import bigquery
client = bigquery.Client()
table = client.get_table("your-project.your-dataset.your-table")
original_schema = table.schema # Get your current table's schema
new_schema = original_schema[:] # Creates a copy of the schema.
# Add new field to schema
new_schema.append(bigquery.SchemaField("new_field", "STRING"))
# Set new schema in your table object
table.schema = new_schema
# Call API to update your table with the new schema
table = client.update_table(table, ["schema"])
After updating your table's schema you can load your new records with this additional field ignoring any schema configurations.
I have a small json file, with the following lines:
{
"IdTitulo": "Jaws",
"IdDirector": "Steven Spielberg",
"IdNumber": 8,
"IdDecimal": "2.33"
}
An there is a schema in my db collection, named test_dec. This is what I've used to create the schema:
db.createCollection("test_dec",
{validator: {
$jsonSchema: {
bsonType: "object",
required: ["IdTitulo","IdDirector"],
properties: {
IdTitulo: {
"bsonType": "string",
"description": "string type, nombre de la pelicula"
},
IdDirector: {
"bsonType": "string",
"description": "string type, nombre del director"
},
IdNumber : {
"bsonType": "int",
"description": "number type to test"
},
IdDecimal : {
"bsonType": "decimal",
"description": "decimal type"
}
}
}}
})
I've made multiple attempts to insert the data. The problem is in the IdDecimal field value.
Some of the trials, replacing the IdDecimal line by:
"IdDecimal": 2.33
"IdDecimal": {"$numberDecimal": "2.33"}
"IdDecimal": NumberDecimal("2.33")
None of them work. The second one is the formal solution provided by MongoDB manuals (mongodb-extended-json) adn the error is the output I've placed in my question: bson.errors.InvalidDocument: key'$numberDecimal' must not start with '$'.
I am currently using a python to load the json. I've been playing around with this file:
import os,sys
import re
import io
import json
from pymongo import MongoClient
from bson.raw_bson import RawBSONDocument
from bson.json_util import CANONICAL_JSON_OPTIONS,dumps,loads
import bsonjs as bs
#connection
client = MongoClient('localhost',27018,document_class=RawBSONDocument)
db = client['myDB']
coll = db['test_dec']
other_col = db['free']
for fname in os.listdir('/mnt/win/load'):
num = re.findall("\d+", fname)
if num:
with io.open(fname, encoding="ISO-8859-1") as f:
doc_data = loads(dumps(f,json_options=CANONICAL_JSON_OPTIONS))
print(doc_data)
test = '{"idTitulo":"La pelicula","idRelease":2019}'
raw_bson = bs.loads(test)
load_raw = RawBSONDocument(raw_bson)
db.other_col.insert_one(load_raw)
client.close()
I am using a json file. If I try to parse anything like Decimal128('2.33') the output is "ValueError: No JSON object could be decoded", because my json has an invalid format.
The result of
db.other_col.insert_one(load_raw)
Is that the content of "test" is inserted.
But I cannot use doc_data with RawBSONDocument, because it goes like that. It says:
TypeError: unpack_from() argument 1 must be string or buffer, not list:
When I manage to parse the json directly to the RawBSONDocument I got all the trash within and the record in database looks like the sample here:
{
"_id" : ObjectId("5eb2920a34eea737626667c2"),
"0" : "{\n",
"1" : "\t\"IdTitulo\": \"Gremlins\",\n",
"2" : "\t\"IdDirector\": \"Joe Dante\",\n",
"3" : "\t\"IdNumber\": 6,\n",
"4" : "\"IdDate\": {\"$date\": \"2010-06-18T:00.12:00Z\"}\t\n",
"5" : "}\n"
}
It seems it is not that simple to load a extended json into MongoDB. The extended version is because I want to use schema validation.
Oleg pointed out that is numberDecimal and not NumberDecimal as I had it before. I've fixed the json file, but nothing changed.
Executed:
with io.open(fname, encoding="ISO-8859-1") as f:
doc_data = json.load(f)
coll.insert(doc_data)
And the json file:
{
"IdTitulo": "Gremlins",
"IdDirector": "Joe Dante",
"IdNumber": 6,
"IdDecimal": {"$numberDecimal": "3.45"}
}
One more roll of the dice from me. If you are using schema validation as you are, I would recommend defining a class and being explicit with defining each field and how you propose to convert the field to the relevant python datatypes. While your solution is generic, the data structure has to be rigid to match the validation.
IMO this is clearer and you have control over any errors etc within the class.
Just to confirm I ran the schema validation and this works with the supplied validation.
from pymongo import MongoClient
import bson.json_util
import dateutil.parser
import json
class Film:
def __init__(self, file):
data = file.read()
loaded = json.loads(data)
self.IdTitulo = loaded.get('IdTitulo')
self.IdDirector = loaded.get('IdDirector')
self.IdDecimal = bson.json_util.Decimal128(loaded.get('IdDecimal'))
self.IdNumber = int(loaded.get('IdNumber'))
self.IdDateTime = dateutil.parser.parse(loaded.get('IdDateTime'))
def insert_one(self, collection):
collection.insert_one(self.__dict__)
client = MongoClient()
mycollection = client.mydatabase.test_dec
with open('c:/temp/1.json', 'r') as jfile:
film = Film(jfile)
film.insert_one(mycollection)
gives:
> db.test_dec.findOne()
{
"_id" : ObjectId("5eba79eabf951a15d32843ae"),
"IdTitulo" : "Jaws",
"IdDirector" : "Steven Spielberg",
"IdDecimal" : NumberDecimal("2.33"),
"IdNumber" : 8,
"IdDateTime" : ISODate("2020-05-12T10:08:21Z")
}
>
JSON file used:
{
"IdTitulo": "Jaws",
"IdDirector": "Steven Spielberg",
"IdNumber": 8,
"IdDecimal": "2.33",
"IdDateTime": "2020-05-12T11:08:21+0100"
}
JSON with type information is called Extended JSON. Following the examples, construct extended json for your data:
ext_json = '''
{
"IdTitulo": "Jaws",
"IdDirector": "Steven Spielberg",
"IdNumber": 8,
"IdDecimal": {"$numberDecimal":"2.33"}
}
'''
In Python, use json_util to load extended json into a Python dictionary:
from bson.json_util import loads
doc = loads(ext_json)
print(doc)
# {u'IdTitulo': u'Jaws', u'IdDirector': u'Steven Spielberg', u'IdDecimal': Decimal128('2.33'), u'IdNumber': 8}
The result of this load is sometimes referred to as a "BSON document" but it is not BSON, which is binary. "BSON" in this context really means that some values are not of python standard library types. The "document" part basically means the object is a dictionary.
You will notice that IdNumber is of a non-standard library type:
print type(doc['IdDecimal'])
# <class 'bson.decimal128.Decimal128'>
To insert this dictionary into MongoDB, follow pymongo tutorial:
from pymongo import MongoClient
client = MongoClient('localhost', 14420)
db = client.test_database
collection = db.test_collection
collection.insert_one(doc)
print(doc)
Finally, I've got the solution and it is using RawBSONDocument.
First the json file:
{
"IdTitulo": "Dead Snow",
"IdDirector": "Tommy Wirkola",
"IdNumber": 11,
"IdDecimal": {"$numberDecimal": "2.22"}
}
& the validation schema file:
db.createCollection("test_dec",
{validator: {
$jsonSchema: {
bsonType: "object",
required: ["IdTitulo","IdDirector"],
properties: {
IdTitulo: {
"bsonType": "string",
"description": "string type, nombre de la pelicula"
},
IdDirector: {
"bsonType": "string",
"description": "string type, nombre del director"
},
IdNumber : {
"bsonType": "int",
"description": "number type to test"
},
IdDecimal : {
"bsonType": "decimal",
"description": "decimal type"
}
}
}}
})
So, the collection in this case is "test_dec".
And the python script that opens the file ".json", reads it and parses it to be imported into MongoDB.
import json
from bson.raw_bson import RawBSONDocument
from pymongo import MongoClient
import bsonjs
#connection
client = MongoClient('localhost',27018)
db = client['movieDB']
coll = db['test_dec']
#open an read file
with open('1.json', 'r') as jfile:
data = jfile.read()
loaded = json.loads(data)
dumped = json.dumps(loaded, indent=4)
bson_bytes = bsonjs.loads(dumped)
coll.insert_one(RawBSONDocument(bson_bytes))
client.close()
The inserted document:
{
"_id" : ObjectId("5eb971ec6fbab859dfae8a6f"),
"IdTitulo" : "Dead Snow",
"IdDirector" : "Toomy Wirkola",
"IdDecimal" : NumberDecimal("2.22"),
"IdNumber" : 11
}
I don't know how it flipped the fields IdDecimal and IdNumber, but it passes the validation and I am really happy.
I tried a document with 'hello' instead of a number in NumberDecimal and the insertion resulted in:
{
"_id" : ObjectId("5eb973b76fbab859dfae8ecd"),
"IdTitulo" : "Shining",
"IdDirector" : "Stanley Kubrick",
"IdDecimal" : NumberDecimal("NaN"),
"IdNumber" : 19
}
Thanks to all that tried to help. Specially Oleg!!! Thank you for being so patient.
Could you not just use bson.decimal128.Decimal128? Ot am I missing something?
from pymongo import MongoClient
from bson.decimal128 import Decimal128
db = MongoClient()['mydatabase']
data = {
"IdTitulo": "Jaws",
"IdDirector": "Steven Spielberg",
"IdNumber": 8,
"IdDecimal": "2.33"
}
data['IdDecimal'] = Decimal128(data['IdDecimal'])
db.other_col.insert_one(data)
I have some JSON I'm looping through in the following format. I need to create an object for each unique primary key found in the source data and append to an array. I'm not sure how I would create the object on first encounter of the key and append to it on the next encounter. My initial attempt just creates a new object for each object in the source. Wasn't able to find an example in python only js.
Source data format:
[
...
{
"Id": "NOT NEEDED DATA",
"Client": {
"Id": "KEY",
"Name": "NOT NEEDED DATA"
},
"Name": "DESIRED DATAPOINT"
},
...
]
Desired format:
[
...
{
"client_id": "KEY",
"locations": ["DATA", "DATA"]
}
...
]
pseudocode
for i in sourcedata:
client_id = i['Client']['Id']
location_name = i['Name']
obj = {
"client_id": client_id,
"locations": [location_name]
}
new_array.append(obj)
You can first iterate and build a dictionary for then creating a list of dictionaries as specified in your output format.
from collections import defaultdict
# create and populate the dictionary
d = defaultdict(list)
for i in sourcedata:
client_id = i['Client']['Id']
location_name = i['Name']
d[client_id].append(location_name)
# build the result
res = [{"client_id": k, "locations": v} for k,v in d.items()]
I wrote a Flask REST implementation to receive the following data.
After checking the API key from the client, the server should store the data which comes in the following API definition. The issue, I am facing is, I have got many strings under the same field 'services' where I would appreciate any help.
{
"id": "string",
"termsAndConditions": "string",
"offererBranchId": "string",
"requesterBranchId": "string",
"accepted": "2017-05-24T10:06:31.012Z",
"services": [
{
"id": "string",
"name": "string",
"aggregationLevel": [
"string"
],
"aggregationMethod": [
"string"
],
"timestep": [
"string"
]
]
}
}
My code is below, if the field name 'services' has a single string with it, like the other ones (i.e "id","termsAndConditions" etc.).
from flask_pymongo import PyMongo
import json
app = Flask(__name__)
app.config['MONGO_DBNAME'] = 'demo'
app.config['MONGO_URI'] = 'mongodb://xxxx#xxxx.mlab.com:xxxx/demo'
mongo = PyMongo(app)
users = mongo.db.users
#app.route('/service-offer/confirmed/REQUESTER',methods=['POST'])
def serviceofferconfirmed():
key = request.headers.get('X-API-Key')
users=mongo.db.users
api_record=users.find_one({'name':"apikey"})
actual_API_key=api_record['X-API-Key']
if key==actual_API_key:
offer={"id": request.json["id"],
"termsAndConditions":request.json["termsAndConditions"],
"offererBranchId":request.json["offererBranchId"],
"requesterBranchId": request.json["requesterBranchId"],
"accepted":request.json["accepted"],
"services":request.json["services"] # Here I need help to match the schema.
}
users.insert(offer)
return "Service Data Successfully Stored"
return jsonify("Pleae check your API Key or URL")
I wish to receive the whole data which are many strings and store the data under the field name 'services'.
you can use isinstance("str", request.json["services"])
if you don't want value as string for services
if not isinstance("str", request.json["services"]):
// your code..........