MongoEngine not deleting all documents - python

I have some unit tests which submit info to a server, which saves the info into a MongoDB document via MongoEngine. At the end of the tests, I want to delete all of the documents created by the tests:
@router.delete("/all", summary="Delete all jobs in an organization")
async def delete_all_jobs(job_data: AuthorizedResource = Depends(CanActOnResource("delete", "jobs"))):
    MongoJob.objects(organization=job_data.organization).delete()
However, when I run this endpoint, some of the documents are only partially deleted.
This is what the JSON looks like before being deleted:
{
    "_id" : "242d07ac-eafb-4875-a8f4-8ec89c7bc21f",
    "_cls" : "MongoJob",
    "_created_by" : "tom.mclean",
    "_date_created" : ISODate("2022-02-24T08:23:50.943Z"),
    "_date_modified" : ISODate("2022-02-24T08:25:02.062Z"),
    "_modified_by" : "tom.mclean",
    "client_info" : {
        "protocol" : "tcp",
        "interface" : "0.0.0.0",
        "port" : 0
    },
    "grib_data" : {
        "grib_dir_clim" : "X:\\Weather_Files\\Climatology",
        "grib_dir_wind" : "X:\\Weather_Files\\NOAA_Forcasts",
        "grib_dir_wave" : "X:\\Weather_Files\\NOAA_Forcasts"
    },
    "organization" : "8b50d3f2-03fe-4aca-9cf6-9922854f2989",
    "output_dir" : "C:\\Users\\Tom.Mclean\\src\\routingserver\\WeatherRouting\\WeatherRouting\\..\\output",
    "polars" : [
        "5d19d7d0-eba2-49e5-8719-760d352d50dc"
    ],
    "result" : {
        "costs" : {
            "total_cost" : null,
            "fuel_cost" : null,
            "hire_cost" : null
        }
    },
    "route_form" : {
        "waypoints" : [
            {
                "type" : "Waypoint",
                "lon" : -7.25,
                "lat" : 49.42,
                "normal_deviation" : 0.2
            },
            {
                "type" : "Waypoint",
                "lon" : -50.0,
                "lat" : 40.0,
                "normal_deviation" : 0.0
            }
        ],
        "start_time" : ISODate("2022-02-24T08:23:50.842Z"),
        "arrival_window" : {
            "early" : null,
            "late" : null
        },
        "max_tws" : 40.0,
        "max_lat" : 65.0,
        "min_lat" : -40.0,
        "max_speed" : 16.0,
        "min_speed" : 8.0,
        "great_circle" : false,
        "objective_funcs" : [
            {
                "hire_cost" : 16000.0,
                "fuel_cost" : 550.0
            }
        ],
        "decision_time" : 24.0,
        "course_change_angle" : 15.0,
        "speed_step" : 0.5
    },
    "ship" : "f2775ef8-c58d-4aa3-a6a0-b82539535e88",
    "status" : "FAILED",
    "wave_data" : false
}
And then after running that endpoint of the API, some of the documents are deleted; however, some are left with just three fields:
{
    "_id" : "5f04ffc3-45a3-4652-a79d-68b37e737268",
    "_date_modified" : ISODate("2022-02-24T15:13:28.013Z"),
    "status" : "FAILED"
}
If I run the unit tests in debug mode, pause on the line which calls the delete endpoint, and then resume later on, all of the documents are deleted correctly:
@classmethod
def tearDownClass(cls) -> None:
    # TODO Once jobs can be deleted, clear test jobs from the routing server
    loop = asyncio.new_event_loop()
    loop.run_until_complete(cls.oauth.get_new_access_token())
    organization_path = cls.api._organization_path
    pathname = f"{organization_path}/jobs/all"
    loop.run_until_complete(cls.api.delete(pathname, token=cls.oauth.access_token))  # <- PAUSE HERE
How can I safely ensure that all of the documents are deleted? I could add a pause to the unit test before calling the delete endpoint, but that does not feel right; I would rather fix the underlying issue first.
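For what it's worth, one way to make the teardown deterministic without a blind sleep would be to retry the delete until the collection is actually empty for the test organization. This is only a sketch of that workaround (it assumes the test process can query the MongoJob model directly and knows the test organization id), not a fix for whatever is re-creating the documents:
import asyncio
import time

async def delete_all_jobs_and_wait(api, token, pathname, organization_id, timeout=10.0, interval=0.5):
    """Call the delete endpoint, then retry until no MongoJob documents remain."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        await api.delete(pathname, token=token)
        # If a concurrent update re-created (or partially re-created) a document,
        # it will still be visible here and the delete is retried.
        if MongoJob.objects(organization=organization_id).count() == 0:
            return
        await asyncio.sleep(interval)
    raise AssertionError(f"Jobs for organization {organization_id} were not fully deleted")
In the tearDownClass above, this could replace the final run_until_complete call.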

Related

Issues with turning json data into a variable in python

I have been working on an educational project, and a small part of it requires me to convert a single line of JSON data, which I receive from Domoticz (an external open-source program), into a variable in Python 3. However, due to my skill level with JSON, I have experienced some issues and I am not exactly sure what I'm doing wrong. I do get a 200 response every time, so I assume the connection isn't the issue but rather the Python code. (I censored the addresses, but they are correct.)
The code I'm using:
import time
import re
import requests
from ctypes import c_int, c_char_p, byref, sizeof, c_uint16, c_int32, c_byte
from ctypes import c_void_p
from datetime import datetime
import os
import urllib.request
import json
import logging
import sys
from requests.exceptions import HTTPError
logger = logging.getLogger(__name__)
domoticzserver='ip'
switchid='3'
device='5'
tempbed=str(4)
def domoticzrequest(url):
    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    return response.read()

import urllib.request, json
with urllib.request.urlopen("http://domoticzip/json.htm?type=devices&rid=4") as url:
    data = json.loads(url.read().decode())
    print(data)
The JSON I get back, which I can see by clicking the URL:
{
"ActTime" : 1606722346,
"AstrTwilightEnd" : "18:37",
"AstrTwilightStart" : "06:23",
"CivTwilightEnd" : "17:14",
"CivTwilightStart" : "07:47",
"DayLength" : "08:08",
"NautTwilightEnd" : "17:56",
"NautTwilightStart" : "07:04",
"ServerTime" : "2020-11-30 08:45:46",
"SunAtSouth" : "12:30",
"Sunrise" : "08:26",
"Sunset" : "16:34",
"app_version" : "2020.2",
"result" :
[
{
"AddjMulti" : 1.0,
"AddjMulti2" : 1.0,
"AddjValue" : 0.0,
"AddjValue2" : 0.0,
"BatteryLevel" : 255,
"CustomImage" : 0,
"Data" : "Normal",
"Description" : "",
"Favorite" : 0,
"HardwareID" : 1,
"HardwareName" : "Domoticz Internal",
"HardwareType" : "Domoticz Internal interface",
"HardwareTypeVal" : 67,
"HaveDimmer" : false,
"HaveGroupCmd" : false,
"HaveTimeout" : false,
"ID" : "148702",
"LastUpdate" : "2020-10-19 15:10:02",
"MaxDimLevel" : 0,
"Name" : "Domoticz Security Panel",
"Notifications" : "false",
"PlanID" : "0",
"PlanIDs" :
[
0
],
"Protected" : false,
"ShowNotifications" : true,
"SignalLevel" : "-",
"Status" : "Normal",
"StrParam1" : "",
"StrParam2" : "",
"SubType" : "Security Panel",
"SwitchType" : "Security",
"SwitchTypeVal" : 0,
"Timers" : "false",
"Type" : "Security",
"TypeImg" : "security",
"Unit" : 0,
"Used" : 0,
"XOffset" : "0",
"YOffset" : "0",
"idx" : "2"
},
{
"AddjMulti" : 1.0,
"AddjMulti2" : 1.0,
"AddjValue" : 0.0,
"AddjValue2" : 0.0,
"BatteryLevel" : 255,
"CustomImage" : 0,
"Data" : "-5.0 C",
"Description" : "",
"Favorite" : 1,
"HardwareID" : 2,
"HardwareName" : "Test sensor",
"HardwareType" : "Dummy (Does nothing, use for virtual switches only)",
"HardwareTypeVal" : 15,
"HaveTimeout" : true,
"ID" : "14053",
"LastUpdate" : "2020-11-09 09:03:34",
"Name" : "Temperatuur Kachel",
"Notifications" : "false",
"PlanID" : "0",
"PlanIDs" :
[
0
],
"Protected" : false,
"ShowNotifications" : true,
"SignalLevel" : "-",
"SubType" : "LaCrosse TX3",
"Temp" : -5.0,
"Timers" : "false",
"Type" : "Temp",
"TypeImg" : "temperature",
"Unit" : 1,
"Used" : 1,
"XOffset" : "0",
"YOffset" : "0",
"idx" : "3",
"trend" : 0
},
{
"AddjMulti" : 1.0,
"AddjMulti2" : 1.0,
"AddjValue" : 0.0,
"AddjValue2" : 0.0,
"BatteryLevel" : 255,
"CustomImage" : 0,
"Data" : "17.5",
"Description" : "",
"Favorite" : 1,
"HardwareID" : 3,
"HardwareName" : "Test switch",
"HardwareType" : "Dummy (Does nothing, use for virtual switches only)",
"HardwareTypeVal" : 15,
"HaveTimeout" : true,
"ID" : "0014054",
"LastUpdate" : "2020-11-06 11:51:09",
"Name" : "Temperatuur gewenst",
"Notifications" : "false",
"PlanID" : "0",
"PlanIDs" :
[
0
],
"Protected" : false,
"SetPoint" : "17.5",
"ShowNotifications" : true,
"SignalLevel" : "-",
"SubType" : "SetPoint",
"Timers" : "false",
"Type" : "Thermostat",
"TypeImg" : "override_mini",
"Unit" : 1,
"Used" : 1,
"XOffset" : "0",
"YOffset" : "0",
"idx" : "4"
}
],
"status" : "OK",
"title" : "Devices"
}
Basically, I want SetPoint (in the last block of the JSON above, in this instance 17.5) as a variable in Python, and I will make the Python code loop so it grabs that JSON URL every time to update the status of SetPoint. But I'm having issues grabbing only the 17.5 to make it into a variable. I end up getting the entire JSON, like this code is doing, or I end up getting everything past and including the SetPoint if I change some things. Does anyone know what I'm doing wrong and possibly where I should be looking for a solution? I am a bit inexperienced with the JSON part of Python and I have no clue where to get started, as the code I found and have tried seems to not work or gives me errors.
Thank you very much for your time!
json.loads returns a Python dictionary, so maybe something like this would do:
result = data['result']  # data is the dictionary returned by json.loads in the question
set_point = 0.0
for res in result:
    if 'SetPoint' in res:
        set_point = res['SetPoint']
You are getting your data stored in data = {"key": value} as a dictionary.
If you want to access a certain value, you have to ask for it by key. In your case:
SetPoint = float(data["result"][-1]["SetPoint"])
To break it down:
data["result"]   # gives you a list whose elements are dictionaries
[-1]             # selects the last element of the list, which contains the SetPoint
["SetPoint"]     # then reads the SetPoint value, which is the string "17.5"
float(...)       # converts the string to a float value
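Note that relying on [-1] only works while the thermostat stays the last device in the result list. A small sketch that instead looks the device up by its name from the sample JSON ("Temperatuur gewenst"); the URL is the placeholder one from the question:
import json
import urllib.request

URL = "http://domoticzip/json.htm?type=devices&rid=4"

with urllib.request.urlopen(URL) as response:
    data = json.loads(response.read().decode())

set_point = None
for device in data.get("result", []):
    # "Temperatuur gewenst" is the thermostat device in the sample JSON
    if device.get("Name") == "Temperatuur gewenst" and "SetPoint" in device:
        set_point = float(device["SetPoint"])
        break

print(set_point)  # 17.5 for the sample data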

Trouble understanding mongo aggregation

I am trying to list all the virtual machines (VMs) in my Mongo database that use a certain data store, EMC_123.
I have this script, but it lists VMs that do not use the data store EMC_123.
#!/usr/bin/env python
import pprint
import pymongo

def run_query():
    server = '127.0.0.1'
    client = pymongo.MongoClient("mongodb://%s:27017/" % server)
    db = client["data_center_test"]
    collection = db["data_centers"]
    pipeline = [
        { "$match": { "clusters.hosts.vms.data_stores.name" : "EMC_123"}},
        { "$group": { "_id" : "$clusters.hosts.vms.name" }}
    ]
    for doc in list(db.data_centers.aggregate(pipeline)):
        pp = pprint.PrettyPrinter()
        pp.pprint(doc)
    pp.pprint(db.command('aggregate', 'data_centers', pipeline=pipeline, explain=True))

def main():
    run_query()
    return 0

# Start program
if __name__ == "__main__":
    main()
I assume there is something wrong with my pipeline.
Here is the plan that gets printed out:
{u'ok': 1.0,
u'stages': [{u'$cursor': {u'fields': {u'_id': 0,
u'clusters.hosts.vms.name': 1},
u'query': {u'clusters.hosts.vms.data_stores.name': u'EMC_123'},
u'queryPlanner': {u'indexFilterSet': False,
u'namespace': u'data_center_test.data_centers',
u'parsedQuery': {u'clusters.hosts.vms.data_stores.name': {u'$eq': u'EMC_123'}},
u'plannerVersion': 1,
u'rejectedPlans': [],
u'winningPlan': {u'direction': u'forward',
u'filter': {u'clusters.hosts.vms.data_stores.name': {u'$eq': u'EMC_123'}},
u'stage': u'COLLSCAN'}}}},
{u'$group': {u'_id': u'$clusters.hosts.vms.name'}}]}
UPDATE:
Here is a skeleton of what the document looks like:
{
    "name" : "data_center_name",
    "clusters" : [
        {
            "hosts" : [
                {
                    "name" : "esxi-hostname",
                    "vms" : [
                        {
                            "data_stores" : [ { "name" : "EMC_123" } ],
                            "name" : "vm-name1",
                            "networks" : [ { "name" : "vlan334" } ]
                        },
                        {
                            "data_stores" : [ { "name" : "some_other_data_store" } ],
                            "name" : "vm-name2",
                            "networks" : [ { "name" : "vlan334" } ]
                        }
                    ]
                }
            ],
            "name" : "cluster_name"
        }
    ]
}
The problem I am seeing is that vm-name2 shows up in the results when it doesn't have EMC_123 as a data store.
Update 2:
OK, I am able to write a mongo shell query that does what I want. It is a little ugly:
db.data_centers.aggregate({$unwind: '$clusters'}, {$unwind: '$clusters.hosts'}, {$unwind: '$clusters.hosts.vms'}, {$unwind: '$clusters.hosts.vms.data_stores'}, {$match: {"clusters.hosts.vms.data_stores.name": "EMC_123"}})
I came about this in the second answer of this SO question: MongoDB Projection of Nested Arrays
Based on the answers in MongoDB Projection of Nested Arrays, I had to change my pipeline to this ($match on its own only selects whole documents that contain at least one matching array element, so the arrays have to be unwound first to filter at the VM level):
pipeline = [
    {'$unwind': '$clusters'},
    {'$unwind': '$clusters.hosts'},
    {'$unwind': '$clusters.hosts.vms'},
    {'$unwind': '$clusters.hosts.vms.data_stores'},
    {'$match': {"clusters.hosts.vms.data_stores.name": "EMC_123"}}
]
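If the goal is still just the names of the matching VMs (as in the original $group stage), a sketch of the full pipeline with the $group re-added after the unwinds might look like this:
pipeline = [
    {'$unwind': '$clusters'},
    {'$unwind': '$clusters.hosts'},
    {'$unwind': '$clusters.hosts.vms'},
    {'$unwind': '$clusters.hosts.vms.data_stores'},
    # After unwinding, each document represents a single (vm, data_store) pair,
    # so this $match keeps only the pairs that actually reference EMC_123.
    {'$match': {'clusters.hosts.vms.data_stores.name': 'EMC_123'}},
    # Group to return each matching VM name once.
    {'$group': {'_id': '$clusters.hosts.vms.name'}},
]

for doc in db.data_centers.aggregate(pipeline):
    print(doc['_id'])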

Python Mongo "Sort operation used more than the maximum" when skip is high

I have code that runs a rather simple query with skip, limit, and sort.
I'm encountering a phenomenon I'm having a hard time explaining.
On "small" skip value - everything is fine.
On "high" skip value (>18000) - I can't get a result with limit higher then 20 without getting the following error:
OperationFailure: Executor error during find command: OperationFailed: Sort operation used more than the maximum 33554432 bytes of RAM. Add an index, or specify a smaller limit.
The question is - why is this happening only with large skip count? How can I solve this?
Running it in the mongo shell (even with DBQuery.shellBatchSize = 300) works, and it seems to be using the index:
db.my_collection.find({'foo':false}).skip(19000).limit(100).sort({'meta_data.created_at':-1}).explain()
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "bla.my_collection",
"indexFilterSet" : false,
"parsedQuery" : {
"foo" : {
"$eq" : false
}
},
"winningPlan" : {
"stage" : "LIMIT",
"limitAmount" : 100,
"inputStage" : {
"stage" : "SKIP",
"skipAmount" : 9000,
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"foo" : 1,
"meta_data.created_at" : -1
},
"indexName" : "foo_1_meta_data.created_at_-1",
"isMultiKey" : false,
"multiKeyPaths" : {
"foo" : [ ],
"meta_data.created_at" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"foo" : [
"[false, false]"
],
"meta_data.created_at" : [
"[MaxKey, MinKey]"
]
}
}
}
}
},
}
More info:
It seems that sorting is indeed done in memory - a SORT stage appears in the rejected plans.
So what can be done?
"rejectedPlans" : [
{
"stage" : "SKIP",
"skipAmount" : 19000,
"inputStage" : {
"stage" : "SORT",
"sortPattern" : {
"meta_data.created_at" : -1
},
"limitAmount" : 19100,
"inputStage" : {
"stage" : "SORT_KEY_GENERATOR",
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"foo" : 1,
"_id" : 1
},
"indexName" : "foo_1__id_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"foo" : [
"[false, false]"
],
"_id" : [
"[MinKey, MaxKey]"
]
}
}
}
}
}
}
And another question: why is it happening only at a large skip count? Why does it matter?
The question is - why is this happening only with large skip count?
This is because the sort is happening in memory. When you provide sort and limit, the number of documents that must be maintained in memory is equal to the limit. When there is skip and limit, the number kept in memory must be skip + limit; for skip(19000) and limit(100) that is 19,100 documents, which matches the limitAmount shown in the rejected in-memory SORT plan above and can easily exceed the 33554432-byte sort memory limit.
How can I solve this?
You can make sure there is an index that supports the sort as well as the filter, and if there is one but it is not getting picked, you can use hint to tell the query which index to use.
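For example, a PyMongo sketch that forces the compound index from the explain output above (the database, collection, filter, and index name are taken from the question; adjust to your setup):
from pymongo import MongoClient, DESCENDING

client = MongoClient("mongodb://127.0.0.1:27017/")
collection = client["bla"]["my_collection"]

cursor = (
    collection.find({"foo": False})
    .sort("meta_data.created_at", DESCENDING)
    .skip(19000)
    .limit(100)
    # Force the index that covers both the filter and the sort,
    # so no in-memory SORT stage is needed.
    .hint("foo_1_meta_data.created_at_-1")
)

docs = list(cursor)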
Why is it happening only at a large skip count?
The "best" plan is picked first time you run a particular query which is then cached (remembered) in the future. When the skip count gets large enough, the best plan which may have worked fine with small number no longer works best.

MongoDB find in array of objects

I want to query MongoDB: find all users that have 'artist' = 'Iowa' in any item of an array of objects.
Here is a Robomongo screenshot of my collection:
In Python I'm doing:
Vkuser._get_collection().find({
    'member_of_group': 20548570,
    'my_music': {
        'items': {
            '$elemMatch': {
                'artist': 'Iowa'
            }
        }
    }
})
but this returns nothing. I also tried this:
{'member_of_group': 20548570, 'my_music': {'$elemMatch': {'$.artist': 'Iowa'}}}
and that didn't work either.
Here is part of the document with the array:
"can_see_audio" : 1,
"my_music" : {
"items" : [
{
"name" : "Anastasia Plotnikova",
"photo" : "http://cs620227.vk.me/v620227451/9c47/w_okXehPbYc.jpg",
"id" : "864451",
"name_gen" : "Anastasia"
},
{
"title" : "Ain't Talkin' 'Bout Dub",
"url" : "http://cs4964.vk.me/u14671028/audios/c5b8a0735224.mp3?extra=jgV4ZQrFrsfxZCJf4gsRgnKWvdAfIqjE0M6eMtxGFpj2yp4vjs5DYgAGImPMp4mCUSUGJzoyGeh2Es6L-H51TPa3Q_Q",
"lyrics_id" : 24313846,
"artist" : "Apollo 440",
"genre_id" : 18,
"id" : 344280392,
"owner_id" : 864451,
"duration" : 279
},
{
"title" : "Animals",
"url" : "http://cs1316.vk.me/u4198685/audios/4b9e4536e1be.mp3?extra=TScqXzQ_qaEFKHG8trrwbFyNvjvJKEOLnwOWHJZl_cW5EA6K3a9vimaMpx-Yk5_k41vRPywzuThN_IHT8mbKlPcSigw",
"lyrics_id" : 166037,
"artist" : "Nickelback",
"id" : 344280351,
"owner_id" : 864451,
"duration" : 186
},
The following query should work. You can use dot notation to query into subdocuments and arrays.
Vkuser._get_collection().find({
    'member_of_group': 20548570,
    'my_music.items.artist': 'Iowa'
})
The following query worked for me in the mongo shell
db.collection1.find({ "my_music.items.artist" : "Iowa" })
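For completeness, the $elemMatch form the question attempted also works if it is anchored at the array path itself rather than at the parent document; a sketch using the same model and group id as in the question:
Vkuser._get_collection().find({
    'member_of_group': 20548570,
    # $elemMatch must be applied to the array field ('my_music.items'),
    # not to the embedded 'my_music' document.
    'my_music.items': {'$elemMatch': {'artist': 'Iowa'}}
})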

Parsing json in python

I am having trouble parsing the following JSON file. I am trying to parse it using Logstash and Python.
{
"took" : 153,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 946,
"max_score" : 1.0,
"hits" : [ {
"_index" : "incoming_bytes",
"_type" : "logs",
"_id" : "lZSq4mBRSVSxO0kyTwh3fQ",
"_score" : 1.0, "_source" : {"user_id":"86c8c25d81a448c49e3d3924ea5ceddf","name":"network.incoming.bytes","resource_id":"instance-00000001-c8be5ca1-116b-45b3-accb-1b40050abc90-tapaf1e421f-c8","timestamp":"2013-11-02T07:32:36Z","resource_metadata":{"name":"tapaf1e421f-c8","parameters":{},"fref":null,"instance_id":"c8be5ca1-116b-45b3-accb-1b40050abc90","instance_type":"5c014e54-ee16-43a8-a763-54e243bd8969","mac":"fa:16:3e:67:39:29"},"volume":557462,"source":"openstack","project_id":"9ac587404bdd4fcdafe41c0b10f9f8ae","type":"cumulative","id":"f1eb19aa-4390-11e3-8bac-000c2973cfb1","unit":"B","#timestamp":"2013-11-02T07:32:38.276Z","#version":"1","host":"127.0.0.1","tags":["_grokparsefailure"],"priority":13,"severity":5,"facility":1,"facility_label":"user-level","severity_label":"Notice","#type":"%{appdeliveryserver}"}
}, {
"_index" : "incoming_bytes",
"_type" : "logs",
"_id" : "073URWt5Sc-krLACxQnI3g",
"_score" : 1.0, "_source" : {"user_id":"86c8c25d81a448c49e3d3924ea5ceddf","name":"network.incoming.bytes","resource_id":"instance-00000001-c8be5ca1-116b-45b3-accb-1b40050abc90-tapaf1e421f-c8","timestamp":"2013-11-02T07:32:38Z","resource_metadata":{"name":"tapaf1e421f-c8","parameters":{},"fref":null,"instance_id":"c8be5ca1-116b-45b3-accb-1b40050abc90","instance_type":"5c014e54-ee16-43a8-a763-54e243bd8969","mac":"fa:16:3e:67:39:29"},"volume":562559,"source":"openstack","project_id":"9ac587404bdd4fcdafe41c0b10f9f8ae","type":"cumulative","id":"f31e38d4-4390-11e3-8bac-000c2973cfb1","unit":"B","#timestamp":"2013-11-02T07:32:39.001Z","#version":"1","host":"127.0.0.1","tags":["_grokparsefailure"],"priority":13,"severity":5,"facility":1,"facility_label":"user-level","severity_label":"Notice","#type":"%{appdeliveryserver}"}
}]
}
}
I have used the following Logstash configuration; however, it does not do what I expect, which is to parse the individual fields in the JSON document and output them to stdout.
input {
    stdin {}
    file {
        path => ["/home/****/Downloads/throughput"]
        codec => "json"
    }
}
filter {
    json {
        source => "message"
        target => "throughput"
    }
}
output {
    stdout { codec => rubydebug }
}
In Python, I am trying to access the individual volume and source (IP address) fields.
I tried the following code, with the goal of mapping the individual fields of each record, and I would like to know how to traverse the list and extract individual elements.
import json
from pprint import pprint
json_data=open('throughput')
data = json.load(json_data)
pprint(data["hits"])
json_data.close()
Thanks
Parsed JSON is a dictionary; you can use itemgetter to drill down.
For example, for volume:
>>> for hits in data['hits']['hits']:
...     print hits['_source']['volume']
...
557462
562559
or you can use map to get a list:
>>> from operator import itemgetter
>>> map(itemgetter('volume'), map(itemgetter('_source'), data['hits']['hits']))
[557462, 562559]
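The snippets above are Python 2; in Python 3 (which the question's code otherwise looks like), print is a function and map returns an iterator, so a list comprehension is the more idiomatic equivalent. A small self-contained sketch, assuming the file is named throughput as in the question:
import json

with open('throughput') as json_data:
    data = json.load(json_data)

# One volume value per hit; the host/IP lives in the same _source object.
volumes = [hit['_source']['volume'] for hit in data['hits']['hits']]
hosts = [hit['_source']['host'] for hit in data['hits']['hits']]

print(volumes)  # [557462, 562559] for the sample document
print(hosts)    # ['127.0.0.1', '127.0.0.1']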
