Group all elements in arrays from MongoDB - Python

I have data in MongoDB with many fields; one of them is the content of a tweet that I scraped. All I want is to extract every hashtag from the content and then group them.
My data looks like this:
{
    "_id" : NumberLong(1564531556487659520),
    "content" : "Wie hat die #Corona-Pandemie den Arbeitsmarkt in Deutschland verändert? – #JansenAnika und Paula Risius vom #iw_koeln geben auf unserem Blog einen",
    "likes" : NumberInt(0),
    "replies" : NumberInt(0),
    "retweet" : NumberInt(0)
},
{
    "_id" : NumberLong(1564531463999168512),
    "content" : "Start-ups noch pessimistischer als im #Corona-#Krisenjahr 2020",
    "likes" : NumberInt(0),
    "replies" : NumberInt(0),
    "retweet" : NumberInt(0)
},
{
    "_id" : NumberLong(1564531140802789381),
    "content" : "Gesundheitsminister #klausholetschek fürchtet das Sinken der Hemmschwelle bei der #Legalisierung von #Cannabis. Ab Mitte September erleben wir in #München wieder das Absinken ganz anderer Hemmschwellen, #Corona-Hotspot inklusive.",
    "likes" : NumberInt(1),
    "replies" : NumberInt(1),
    "retweet" : NumberInt(0)
}
After running the code below:
data = db.tweets.aggregate([
    {
        "$project": {
            "content": {
                "$regexFindAll": {
                    "input": "$content",
                    "regex": r'[#]\w+'
                }
            }
        }
    },
    {
        "$group": {
            "_id": "$content.match",
            "count": {
                "$sum": 1
            }
        }
    }
])
the result was different from what I want: it gives me dictionaries, and each dictionary's "_id" contains the whole list of hashtags collected from that tweet.
My results:
{'_id': ['#Gersemann', '#Corona'], 'count': 1},
{'_id': ['#MAH', '#CORONA', '#CASES'], 'count': 3},
{'_id': ['#corona', '#coronalanding', '#coronasymptoms', '#coronawordpresstheme', '#coronavirus', '#coronavirusprevention', '#covid', '#covid19', '#covid19theme', '#covid19', '#healthbeauty', '#healthcare', '#imithemes', '#medical'], 'count': 1},
{'_id': ['#China', '#Covid', '#Corona', '#SarsCoV2'], 'count': 1},
{'_id': ['#Gehorsam', '#Staat', '#Unterdr', '#Corona', '#Covid', '#Masken', '#Manie', '#Deutschen', '#Coronauten'], 'count': 1},
{'_id': ['#Maskenregeln', '#Corona', '#COVID19', '#Maske'], 'count': 1},
{'_id': ['#Pandemie', '#GBD', '#Medienversagen', '#Corona'], 'count': 1},
{'_id': ['#Herbst', '#Covid', '#Gesundheit', '#Corona', '#Maskenpflicht', '#Bundesregierung', '#Krankheit', '#Pandemie', '#Wochenblatt', '#WochenblattMedia', '#WochenblattNews'], 'count': 1},
{'_id': ['#COVID19', '#SARSCoV2', '#CORONA'], 'count': 1}]
What I want is to count each hashtag individually, grouped by hashtag.

You can use $unwind to split your content list:
[
    {
        "$project": {
            "content": {
                "$regexFindAll": {
                    "input": "$content",
                    "regex": "[#]\\w+"
                }
            }
        }
    },
    {
        "$unwind": "$content"
    },
    {
        "$group": {
            "_id": "$content.match",
            "count": {
                "$sum": 1
            }
        }
    }
]
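For reference, a minimal PyMongo sketch of running this pipeline from Python (assuming db is the same database handle as in the question, with a tweets collection; the extra $sort stage is optional):
pipeline = [
    {"$project": {"content": {"$regexFindAll": {"input": "$content", "regex": r"[#]\w+"}}}},
    {"$unwind": "$content"},
    {"$group": {"_id": "$content.match", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},  # optional: most frequent hashtags first
]

for doc in db.tweets.aggregate(pipeline):
    print(doc["_id"], doc["count"])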

Related

Custom sorting in ag-grid using justpy

I want to implement custom sorting in ag-grid using justpy, but when I add 'comparator' to columnDefs and run the code, it does not work; the browser console reports an error at l.column.getColDef(...).comparator.
Can anyone help me?
import justpy as jp

grid_options = {
    'getDataPath': '''function(data) { return data.orgHierarchy; }''',
    'treeData': True,
    'defaultColDef': {
        'filter': True,
        'sortable': True,
        'resizable': True,
    },
    'columnDefs': [
        {'headerName': "job title", 'field': "jobTitle"},
        {'headerName': "employment type",
         'field': "employmentType",
         'comparator': '''function(valueA, valueB) {
             console.log('valuea', valueA)
             if (valueA == valueB) return 0;
             return (valueA > valueB) ? 1 : -1;
         }'''
        },
    ],
    'rowData': [
        {'orgHierarchy': ['Erica'], 'jobTitle': "CEO", 'employmentType': "1"},
        {'orgHierarchy': ['Erica', 'Malcolm'], 'jobTitle': "VP", 'employmentType': "2"},
        {'orgHierarchy': ['Erica', 'Bob'], 'jobTitle': "SVP", 'employmentType': "3"},
        {'orgHierarchy': ['Erica', 'Bob', 'jo'], 'jobTitle': "eVP", 'employmentType': "4"}
    ]
}

def grid_test():
    wp = jp.WebPage()
    grid = jp.AgGrid(a=wp, options=grid_options)
    print(grid)
    grid.evaluate = ['getDataPath']
    return wp

jp.justpy(grid_test)

How to get some original JSON fields after mapping geo-point with Python Elasticsearch?

I'm trying to get some of the original data along with the geo-point mapping. I need to get satname and timestamp along with "geo".
I get the data from a RESTful API and index it with the Python Elasticsearch client.
settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "document": {
            "properties": {
                "geo": {
                    "type": "geo_point"
                }
            }
        }
    }
}

es.indices.create(index="new", body=settings)

def collect_data():
    data = requests.get(url=URL).json()
    del data['positions'][1]
    new_data = ({'geo': {'lat': data['positions'][0]['satlatitude'],
                         'lon': data['positions'][0]['satlongitude']}},
                {data['info'][0]['satname']},
                {data['positions'][0]['timestamp']})
    es.index(index='new', doc_type='document', body=new_data)

schedule.every(10).seconds.do(collect_data)

while True:
    schedule.run_pending()
    time.sleep(1)
Error received:
SerializationError: (({'geo': {'lat': 37.43662067, 'lon': -26.09384821}}, {1591391688}),
TypeError("Unable to serialize {1591391688} (type: <class 'set'>)"))
RESTful JSON data sample:
{'info': {'satname': 'SPACE STATION', 'satid': 25544, 'transactionscount': 0},
 'positions': [{'satlatitude': 28.89539607, 'satlongitude': 90.44547739, 'sataltitude': 420.36,
                'azimuth': 12.46, 'elevation': -52.81, 'ra': 215.55022984, 'dec': -5.00234017,
                'timestamp': 1591196844, 'eclipsed': True}]}
I need to have "geo", "satname" and "timestamp". I'm wondering how I could obtain the correct result.
It looks like you were setting the timestamp and satname without keys. Try this to process the data:
import json
from datetime import datetime

response_json = '''
{
    "info": {
        "satname": "SPACE STATION",
        "satid": 25544,
        "transactionscount": 0
    },
    "positions": [
        {
            "satlatitude": 28.89539607,
            "satlongitude": 90.44547739,
            "sataltitude": 420.36,
            "azimuth": 12.46,
            "elevation": -52.81,
            "ra": 215.55022984,
            "dec": -5.00234017,
            "timestamp": 1591196844,
            "eclipsed": true
        }
    ]
}
'''

response_data = json.loads(response_json)

def process_data(data):
    return {
        'satname': data['info']['satname'],
        # convert Unix timestamp to ISO time
        'timestamp': datetime.fromtimestamp(data['positions'][0]['timestamp']).isoformat(),
        'geo': {
            'lat': data['positions'][0]['satlatitude'],
            'lon': data['positions'][0]['satlongitude']
        }
    }

print(process_data(response_data))
Output:
{'satname': 'SPACE STATION', 'timestamp': '2020-06-03T15:07:24', 'geo': {'lat': 28.89539607, 'lon': 90.44547739}}
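A minimal sketch of how this could plug back into the question's collect_data (assuming es, requests, URL, and the index created above are set up as in the question; doc_type mirrors the question's older client and is not needed on newer Elasticsearch versions):
def collect_data():
    data = requests.get(url=URL).json()
    doc = process_data(data)  # a single dict with named keys, so it serializes cleanly
    es.index(index='new', doc_type='document', body=doc)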

How to add a new key into a dictionary nested like this [{ {]? It looks more like a dictionary inside a list

I would like to add a new key into the list of dictionaries. Example:
"label" : [] (with an empty list)
[
    {
        "Next" : {
            "seed" : [
                {
                    "Argument" : [
                        {
                            "id" : 4,
                            "label" : "org"
                        },
                        {
                            "id" : "I"
                        },
                        {
                            "word" : "He",
                            "seed" : 2,
                            "id" : 3,
                            "label" : "object"
                        },
                        {
                            "word" : "Gets",
                            "seed" : 9,
                            "id" : 2,
                            "label" : "verb"
                        }
                    ]
                }
            ],
            "Next" : "he,get",
            "time" : ""
        }
    }
]
I tried to loop into "seed" and then into "Argument" and use .update({"label": []}) inside the loop, but it doesn't work. Can anyone please give me an example of using a for loop to traverse from the beginning and then add these new "label" keys?
My preferred goal (to have extra "label" entries within the dictionary according to my input). Example:
[
    {
        "Next" : {
            "seed" : [
                {
                    "Argument" : [
                        {
                            "id" : 4,
                            "label" : "org"
                        },
                        {
                            "id" : "I"
                        },
                        {
                            "word" : "He",
                            "seed" : 2,
                            "id" : 3,
                            "label" : "object"
                        },
                        {
                            "word" : "Gets",
                            "seed" : 9,
                            "id" : 2,
                            "label" : "verb"
                        },
                        {
                            "id" : 5,
                            "label" : "EXTRA"
                        },
                        {
                            "id" : 6,
                            "label" : "EXTRA"
                        },
                        {
                            "id" : 7,
                            "label" : "EXTRA"
                        }
                    ]
                }
            ],
            "Next" : "he,get",
            "time" : ""
        }
    }
]
I am new to dictionaries, so I really need help with this.
If I understand your problem correctly, you want to add 'label' to each dict in Argument that has no label. You could do it like this:
for i in x[0]['Next']['seed'][0]['Argument']:
    if 'label' not in i:
        i['label'] = []
Where x is your dict. But what is x[0]['Next']['seed'][0]['Argument']?
Let's simplify your dict:
x = [{'Next': {'Next': 'he,get',
               'seed': [{'Argument': [{these}, {are}, {your}, {dicts}]}],
               'time': ''}}]
How did we reach that path? Let's see:
x = [{'Next' (parent dict): {'Next' (child of the previous 'Next'): 'he,get',
                             'seed' (child of the previous 'Next'): [{these}, {are}, {your}, {dicts}] (a list of dicts)}}]
I hope that makes sense. And to add more dictionaries to Argument:
# create a function that returns a dict
import random  # I don't know how you want the ids, so using random ones

def create_dicts():
    return {"id": random.randint(1, 10), "label": ""}

for i in range(3):  # 3 is how many dicts you want to push into Argument
    x[0]['Next']['seed'][0]['Argument'].append(create_dicts())
Now your dict will become -
[{'Next': {'Next': 'he,get',
'seed': [{'Argument': [{'id': 4, 'label': 'org'},
{'id': 'I'},
{'id': 3, 'label': 'object', 'seed': 2, 'word': 'He'},
{'id': 2, 'label': 'verb', 'seed': 9, 'word': 'Gets'},
{'id': 1, 'label': ''},
{'id': 4, 'label': ''},
{'id': 4, 'label': ''}]}],
'time': ''}}]
First things first: access the list of dicts that needs to be updated.
According to your given structure, that's l[0]["Next"]["seed"][0]["Argument"].
Then iterate over that list and check whether label already exists; if it does not, add it as an empty list.
This can be done by explicit checking:
if "label" not in i:
i["label"] = []
or by re-assigning:
i["label"] = i.get("label", [])
Full Code:
import pprint

l = [{
    "Next": {
        "seed": [{
            "Argument": [{
                "id": 4,
                "label": "org"
            }, {
                "id": "I"
            }, {
                "word": "He",
                "seed": 2,
                "id": 3,
                "label": "object"
            }, {
                "word": "Gets",
                "seed": 9,
                "id": 2,
                "label": "verb"
            }]
        }],
        "Next": "he,get",
        "time": ""
    }
}]

# access the list of dicts that needs to be updated
l2 = l[0]["Next"]["seed"][0]["Argument"]

for i in l2:
    i["label"] = i.get("label", [])  # use the existing label or add an empty list

pprint.pprint(l)
Output:
[{'Next': {'Next': 'he,get',
'seed': [{'Argument': [{'id': 4, 'label': 'org'},
{'id': 'I', 'label': []},
{'id': 3,
'label': 'object',
'seed': 2,
'word': 'He'},
{'id': 2,
'label': 'verb',
'seed': 9,
'word': 'Gets'}]}],
'time': ''}}]
You have a list with one nested dictionary. Get the list of inner dicts and iterate over it. Assuming your initial data structure is named data:
dict_list = data[0]['Next']['seed'][0]['Argument']
for item in dict_list:
    item['label'] = input()

MongoDB query takes too long

I have the following documents in my MongoDB collection:
{'name' : 'abc-1','parent':'abc', 'price': 10}
{'name' : 'abc-2','parent':'abc', 'price': 5}
{'name' : 'abc-3','parent':'abc', 'price': 9}
{'name' : 'abc-4','parent':'abc', 'price': 11}
{'name' : 'efg', 'parent':'', 'price': 10}
{'name' : 'efg-1','parent':'efg', 'price': 5}
{'name' : 'abc-2','parent':'efg','price': 9}
{'name' : 'abc-3','parent':'efg','price': 11}
I want to perform the following actions:
a. Group by distinct parent
b. Sort all the groups based on price
c. For each group, select the document with the minimum price:
i. Check whether the record's parent sku exists as a value in the name field of some record
ii. If the name exists, do nothing
iii. If it does not exist, insert a document with parent empty and the other values taken from the record selected previously (the minimum-price one).
I tried to use forEach as follows:
db.file.find().sort([("price", 1)]).forEach(function(doc){
    cnt = db.file.count({"sku": {"$eq": doc.parent}});
    if (cnt < 1){
        newdoc = doc;
        newdoc.name = doc.parent;
        newdoc.parent = "";
        delete newdoc["_id"];
        db.file.insertOne(newdoc);
    }
});
The problem is that it takes too much time. What is wrong here? How can it be optimized? Would an aggregation pipeline be a good solution, and if so, how can it be done?
Retrieve a set of product names ✔
def product_names():
    for product in db.file.aggregate([{"$group": {"_id": "$name"}}]):
        yield product['_id']

product_names = set(product_names())
Retrieve the product with the minimum price from each group ✔
result_set = db.file.aggregate([
    {
        '$sort': {
            'price': 1,
        }
    },
    {
        '$group': {
            '_id': '$parent',
            'name': {
                '$first': '$name',
            },
            'price': {
                '$min': '$price',
            }
        }
    },
    {
        '$sort': {
            'price': 1,
        }
    }
])
Insert the products retrieved in step 2 if their name is not in the set of product names retrieved in step 1. ✔
from pymongo.operations import InsertOne

def insert_request(product):
    return InsertOne({
        'name': product['name'],
        'price': product['price'],
        'parent': ''
    })

requests = (
    insert_request(product)
    for product in result_set
    if product['name'] not in product_names
)

db.file.bulk_write(list(requests))
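One small caveat worth hedging: PyMongo's bulk_write raises InvalidOperation when given an empty list of operations, so if every grouped product already exists it may be safer to guard the call:
ops = list(requests)
if ops:  # bulk_write rejects an empty operations list
    db.file.bulk_write(ops)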
Steps 2 and 3 can be implemented in the aggregation pipeline.
db.file.aggregate([
    {
        '$sort': {'price': 1}
    },
    {
        '$group': {
            '_id': '$parent',
            'name': {
                '$first': '$name'
            },
            'price': {
                '$min': '$price'
            },
        }
    },
    {
        '$sort': {
            'price': 1
        }
    },
    {
        '$project': {
            'name': 1,
            'price': 1,
            '_id': 0,
            'parent': ''
        }
    },
    {
        '$match': {
            'name': {
                '$nin': list(product_names())
            }
        }
    },
    {
        '$out': 'file'
    }
])
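A note worth hedging about the last stage: $out replaces the entire target collection with the pipeline output, so writing back to 'file' here would drop the original documents. If the goal is to append only the missing parent documents, one possible sketch (hypothetical staging collection 'missing_parents', reusing product_names from step 1) is:
db.file.aggregate([
    {'$sort': {'price': 1}},
    {'$group': {'_id': '$parent', 'name': {'$first': '$name'}, 'price': {'$min': '$price'}}},
    {'$sort': {'price': 1}},
    {'$project': {'name': 1, 'price': 1, '_id': 0, 'parent': ''}},
    {'$match': {'name': {'$nin': list(product_names())}}},
    {'$out': 'missing_parents'}  # hypothetical staging collection instead of overwriting 'file'
])
docs = list(db.missing_parents.find({}, {'_id': 0}))
if docs:
    db.file.insert_many(docs)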

Logical error in Python dictionary traversal

One of my MongoDB queries through PyMongo returns:
{ "_id" : { "origin" : "ABE", "destination" : "DTW", "carrier" : "EV" }, "Ddelay" : -5.333333333333333,
"Adelay" : -12.666666666666666 }
{ "_id" : { "origin" : "ABE", "destination" : "ORD", "carrier" : "EV" }, "Ddelay" : -4, "Adelay" : 14 }
{ "_id" : { "origin" : "ABE", "destination" : "ATL", "carrier" : "EV" }, "Ddelay" : 6, "Adelay" : 14 }
I am traversing the result as shown below in my Python module, but I am not getting all 3 results, only two. I believe I should not use len(results) as I currently do. Can you please help me traverse the result correctly? I need to display all three results in the resulting JSON document on the web UI.
Thank you.
code:
pipe = [
    {'$match': {'origin': {"$in": [origin_ID]}}},
    {"$group": {'_id': {'origin': "$origin", 'destination': "$dest", 'carrier': "$carrier"},
                "Ddelay": {'$avg': "$dep_delay"}, "Adelay": {'$avg': "$arr_delay"}}},
    {"$limit": 4}
]
results = connect.aggregate(pipeline=pipe)
#pdb.set_trace()
DATETIME_FORMAT = '%Y-%m-%d'
for x in range(len(results)):
    origin = (results['result'][x])['_id']['origin']
    destination = (results['result'][x])['_id']['destination']
    carrier = (results['result'][x])['_id']['carrier']
    Adelay = (results['result'][x])['Adelay']
    Ddelay = (results['result'][x])['Ddelay']
    obj = {'Origin': origin,
           'Destination': destination,
           'Carrier': carrier,
           'Avg Arrival Delay': Adelay,
           'Avg Dep Delay': Ddelay}
    json_result.append(obj)
return json.dumps(json_result, indent=2, sort_keys=False, separators=(',', ':'))
PyMongo returns the result in this format:
{u'ok': 1.0, u'result': [...]}
So you should iterate over result:
for x in results['result']:
    ...
In your code you try to calculate the length of the dict, not the length of the result container.
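Applied to the question's loop, a minimal sketch of the corrected traversal (field names taken from the original code):
# assuming `results` is the dict returned by connect.aggregate(pipeline=pipe)
json_result = []
for doc in results['result']:
    json_result.append({
        'Origin': doc['_id']['origin'],
        'Destination': doc['_id']['destination'],
        'Carrier': doc['_id']['carrier'],
        'Avg Arrival Delay': doc['Adelay'],
        'Avg Dep Delay': doc['Ddelay'],
    })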
