How can I merge these 2 queries into one:
Query 1
db.Response.aggregate([
  {
    "$match": {
      "$and": [
        { "job_details.owner_id": 428 },
        { "job_details.owner_type": 'searches' }
      ]
    }
  },
  {
    "$group": {
      "_id": "$candidate_city_name_string",
      "count": { "$sum": 1 }
    }
  }
])
Query 2
db.Response.aggregate([
  {
    "$match": {
      "$and": [
        { "job_details.owner_id": 428 },
        { "job_details.owner_type": 'searches' }
      ]
    }
  },
  {
    "$group": {
      "_id": "$skill",
      "count": { "$sum": 1 }
    }
  }
])
The results of these queries look like this:
output 1:
{
"result": [
{ _id: 'Bangalore', count: 8 },
{ _id: 'cochi', count: 9 }
],
"ok":1
}
output 2:
{
"result": [
{ _id: 'java', count: 7 },
{ _id: 'python', count: 10 }
],
"ok":1
}
How can I get these 2 results in one query?
I need an output like this:
Expected output:
{
"result": [
"candidate_city_name_string": [
{ _id: 'Bangalore', count: 8 },
{ _id: 'cochi', count: 9 }
],
"skill": [
{ _id: 'java', count: 7 },
{ _id: 'python', count: 10 }
]
},
"ok":1
}
Is this possible? I saw something about $facet somewhere, but I didn't understand it.
db.Response.aggregate([
  { "$match": { "$and": [ { "job_details.owner_id": 428 }, { "job_details.owner_type": 'searches' } ] } },
  { "$facet": {
    "candidate_city_name_string": [
      { "$group": { "_id": "$candidate_city_name_string", "count": { "$sum": 1 } } }
    ],
    "skill": [
      { "$group": { "_id": "$skill", "count": { "$sum": 1 } } }
    ]
  }}
])
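For what it's worth, here is a minimal pymongo sketch of the same pipeline (the connection string, database name, and variable names are assumptions, not from the question). $facet runs each named sub-pipeline over the same matched documents, so both groupings come back inside a single result document.

# Minimal pymongo sketch; connection string and database name are assumptions.
from pymongo import MongoClient

db = MongoClient("mongodb://localhost:27017")["mydb"]  # hypothetical database name

pipeline = [
    {"$match": {"job_details.owner_id": 428, "job_details.owner_type": "searches"}},
    {"$facet": {
        # each $facet key is an independent sub-pipeline over the matched documents
        "candidate_city_name_string": [
            {"$group": {"_id": "$candidate_city_name_string", "count": {"$sum": 1}}}
        ],
        "skill": [
            {"$group": {"_id": "$skill", "count": {"$sum": 1}}}
        ],
    }},
]

doc = next(db.Response.aggregate(pipeline))  # $facet yields a single document
print(doc["candidate_city_name_string"])
print(doc["skill"])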
I have the following DataFrame with MultiIndex rows in pandas.
                time  available_slots     status
month day
1     1     10:00:00                1  AVAILABLE
      1     12:00:00                1  AVAILABLE
      1     14:00:00                1  AVAILABLE
      1     16:00:00                1  AVAILABLE
      1     18:00:00                1  AVAILABLE
      2     10:00:00                1  AVAILABLE
...              ...              ...        ...
2     28    12:00:00                1  AVAILABLE
      28    14:00:00                1  AVAILABLE
      28    16:00:00                1  AVAILABLE
      28    18:00:00                1  AVAILABLE
      28    20:00:00                1  AVAILABLE
And I need to transform it into a hierarchical, nested JSON like this:
[
{
"month": 1,
"days": [
{
"day": 1,
"slots": [
{
"time": "10:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "12:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
...
]
},
{
"day": 2,
"slots": [
...
]
}
]
},
{
"month": 2,
"days":[
{
"day": 1,
"slots": [
...
]
}
]
},
...
]
Unfortunately, it is not as easy as doing df.to_json(orient="index").
Does anyone know if there is a method in pandas to perform this kind of transformation, or how I could iterate over the DataFrame to build the final object?
Here's one way. Basically repeated groupby + apply(to_dict) + reset_index until we get the desired shape:
out = (df.groupby(level=[0, 1])                                   # group by (month, day)
         .apply(lambda x: x.to_dict('records'))                   # each (month, day) -> list of slot dicts
         .reset_index()
         .rename(columns={0: 'slots'})
         .groupby('month')                                        # regroup the day-level rows by month
         .apply(lambda x: x[['day', 'slots']].to_dict('records'))
         .reset_index()
         .rename(columns={0: 'days'})
         .to_json(orient='records', indent=True)
       )
Output:
[
{
"month":1,
"days":[
{
"day":1,
"slots":[
{
"time":"10:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"12:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"14:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"16:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"18:00:00",
"available_slots":1,
"status":"AVAILABLE"
}
]
},
{
"day":2,
"slots":[
{
"time":"10:00:00",
"available_slots":1,
"status":"AVAILABLE"
}
]
}
]
},
{
"month":2,
"days":[
{
"day":28,
"slots":[
{
"time":"12:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"14:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"16:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"18:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"20:00:00",
"available_slots":1,
"status":"AVAILABLE"
}
]
}
]
}
]
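One small follow-up, in case the Python objects are needed rather than the string (this is an assumption about how the result will be used, not part of the original answer): to_json returns a plain str, so it can be parsed back with the standard library.

import json

records = json.loads(out)   # out is the JSON string built above
print(records[0]["month"], len(records[0]["days"]))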
You can use a double loop for each level of your index:
data = []
for month, df1 in df.groupby(level=0):           # outer loop over the month level
    data.append({'month': month, 'days': []})
    for day, df2 in df1.groupby(level=1):        # inner loop over the day level within the month
        data[-1]['days'].append({'day': day, 'slots': df2.to_dict('records')})
Output:
import json
print(json.dumps(data, indent=2))
[
{
"month": 1,
"days": [
{
"day": 1,
"slots": [
{
"time": "10:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "12:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "14:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "16:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "18:00:00",
"available_slots": 1,
"status": "AVAILABLE"
}
]
},
{
"day": 2,
"slots": [
{
"time": "10:00:00",
"available_slots": 1,
"status": "AVAILABLE"
}
]
}
]
},
{
"month": 2,
"days": [
{
"day": 28,
"slots": [
{
"time": "12:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "14:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "18:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "20:00:00",
"available_slots": 1,
"status": "AVAILABLE"
}
]
}
]
}
]
I have a LinkedIn dataset of follower statistics in the following JSON (I removed many key-value pairs to keep it easy to understand). Each key has a different number of inner key-value pairs.
Can somebody help me convert this to CSV output using Python?
{
"paging": { "start": 0, "count": 10, "links": [] },
"elements": [
{
"followerCountsByAssociationType": [
{
"followerCounts": {
"organicFollowerCount": 2775,
"paidFollowerCount": 0
}
},
{
"followerCounts": {
"organicFollowerCount": 13,
"paidFollowerCount": 0
},
"associationType": "EMPLOYEE"
}
],
"followerCountsByRegion": [
{
"region": "urn:li:region:7312",
"followerCounts": {
"organicFollowerCount": 2,
"paidFollowerCount": 0
}
},
{
"region": "urn:li:region:6981",
"followerCounts": {
"organicFollowerCount": 2,
"paidFollowerCount": 0
}
},
{
"region": "urn:li:region:620",
"followerCounts": {
"organicFollowerCount": 2,
"paidFollowerCount": 0
}
}
],
"followerCountsBySeniority": [
{
"followerCounts": {
"organicFollowerCount": 12,
"paidFollowerCount": 0
},
"seniority": "urn:li:seniority:8"
},
{
"followerCounts": {
"organicFollowerCount": 5,
"paidFollowerCount": 0
},
"seniority": "urn:li:seniority:9"
},
{
"followerCounts": {
"organicFollowerCount": 2,
"paidFollowerCount": 0
},
"seniority": "urn:li:seniority:1"
}
],
"followerCountsByIndustry": [
{
"followerCounts": {
"organicFollowerCount": 1,
"paidFollowerCount": 0
},
"industry": "urn:li:industry:51"
},
{
"followerCounts": {
"organicFollowerCount": 1,
"paidFollowerCount": 0
},
"industry": "urn:li:industry:74"
},
{
"followerCounts": {
"organicFollowerCount": 1,
"paidFollowerCount": 0
},
"industry": "urn:li:industry:77"
},
{
"followerCounts": {
"organicFollowerCount": 1,
"paidFollowerCount": 0
},
"industry": "urn:li:industry:78"
}
],
"followerCountsByFunction": [
{
"followerCounts": {
"organicFollowerCount": 3,
"paidFollowerCount": 0
},
"function": "urn:li:function:14"
},
{
"followerCounts": {
"organicFollowerCount": 3,
"paidFollowerCount": 0
},
"function": "urn:li:function:21"
},
{
"followerCounts": {
"organicFollowerCount": 2,
"paidFollowerCount": 0
},
"function": "urn:li:function:11"
},
{
"followerCounts": {
"organicFollowerCount": 2,
"paidFollowerCount": 0
},
"function": "urn:li:function:17"
},
{
"followerCounts": {
"organicFollowerCount": 2,
"paidFollowerCount": 0
},
"function": "urn:li:function:1"
}
],
"followerCountsByStaffCountRange": [
{
"followerCounts": {
"organicFollowerCount": 267,
"paidFollowerCount": 0
},
"staffCountRange": "SIZE_1001_TO_5000"
},
{
"followerCounts": {
"organicFollowerCount": 185,
"paidFollowerCount": 0
},
"staffCountRange": "SIZE_201_TO_500"
},
{
"followerCounts": {
"organicFollowerCount": 131,
"paidFollowerCount": 0
},
"staffCountRange": "SIZE_501_TO_1000"
},
{
"followerCounts": {
"organicFollowerCount": 81,
"paidFollowerCount": 0
},
"staffCountRange": "SIZE_5001_TO_10000"
},
{
"followerCounts": {
"organicFollowerCount": 74,
"paidFollowerCount": 0
},
"staffCountRange": "SIZE_2_TO_10"
},
{
"followerCounts": {
"organicFollowerCount": 10,
"paidFollowerCount": 0
},
"staffCountRange": "SIZE_1"
}
],
"followerCountsByCountry": [
{
"followerCounts": {
"organicFollowerCount": 1,
"paidFollowerCount": 0
},
"country": "urn:li:country:es"
},
{
"followerCounts": {
"organicFollowerCount": 1,
"paidFollowerCount": 0
},
"country": "urn:li:country:ph"
},
{
"followerCounts": {
"organicFollowerCount": 1,
"paidFollowerCount": 0
},
"country": "urn:li:country:ng"
}
],
"organizationalEntity": "urn:li:organization:28849398"
}
]
}
I tried using json_normalize(data['elements']), but that only flattens the top level and leaves each followerCountsBy* column as a raw list of dicts.
I am sure that there must be some parameter in json_normalize() that can simplify the inner nesting.
The desired output is as follows:

FollowerCountByAssociation_organicFollowerCount | FollowerCountByAssociation_paidFollowerCount | AssociationType | Region             | Region_organicFollowerCount | Region_paidFollowerCount
2775                                            | 0                                            | Employee        | urn:li:region:7312 | 2                           | 0
                                                |                                              |                 | urn:li:region:6981 | 2                           | 0
...and so on

I have only sketched a small part of the output; the columns continue for as many entries as each list has (and are null for the rows that don't have them).
Would appreciate any help! Thanks!
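A hedged sketch of one direction to try, assuming the JSON above is loaded into a variable named data: json_normalize can flatten one nested list at a time via record_path, carrying parent fields along with meta, and each breakdown can then be written to its own CSV (the output filename here is just an example).

import pandas as pd

# Flatten one of the nested lists; repeat (or loop) for the other followerCountsBy* keys.
regions = pd.json_normalize(
    data["elements"],                       # note the lowercase "elements" key
    record_path="followerCountsByRegion",   # one row per region entry
    meta=["organizationalEntity"],          # carry the parent organization id onto each row
)
# Resulting columns: region, followerCounts.organicFollowerCount,
# followerCounts.paidFollowerCount, organizationalEntity
regions.to_csv("follower_counts_by_region.csv", index=False)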
I need to search for documents whose BusinessArea is Research or Accounting (an OR across the array of values).
I also need Role to be Developer or Tester (again an OR across the array).
I want the count of masterid for each BusinessArea, Designation, and Role name.
The name filter is "Group1".
Below is the sample data:
test = [
    {'masterid': '1', 'name': 'Group1', 'BusinessArea': ['Accounting', 'Research'], 'Designation': ['L1', 'L2']},
    {'masterid': '2', 'name': 'Group1', 'BusinessArea': ['Research', 'Accounting'],
     'Role': [{'id': '5032', 'name': 'Tester'}, {'id': '5033', 'name': 'Developer'}],
     'Designation': ['L1', 'L2']},
    {'masterid': '3', 'name': 'Group1', 'BusinessArea': ['Engineering'],
     'Role': [{'id': '5032', 'name': 'Developer'}, {'id': '5033', 'name': 'Developer', 'parentname': ''}],
     'Designation': ['L1', 'L2']}
]
Below is the aggregation function
{
"size": 0,
"aggs": {
"countNames": {
"terms": {
"field": "BusinessArea.keyword"
}
},
"designationNames": {
"terms": {
"field": "Designation.keyword"
}
},
"Role": {
"terms": {
"field": "Role.name.keyword"
}
}
}
}
Below is the filtering query
{
"query": {
"bool": {
"must": [
{
"terms": {
"BusinessArea.keyword": [
"Research",
"Accounting"
]
}
},
{
"terms": {
"Role.name.keyword": [
"Developer",
"Tester"
]
}
}
]
}
}
}
"filter": [
"term": {
"name.keyword": "Group1"}]
I need to merge both queries so that the output contains results from both.
Nice start! Now you can simply combine all those snippets like this:
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"term": {
"name.keyword": "Group1"
}
},
{
"terms": {
"BusinessArea.keyword": [
"Research",
"Accounting"
]
}
},
{
"terms": {
"Role.name.keyword": [
"Developer",
"Tester"
]
}
}
]
}
},
"aggs": {
"countNames": {
"terms": {
"field": "BusinessArea.keyword"
}
},
"designationNames": {
"terms": {
"field": "Designation.keyword"
}
},
"Role": {
"terms": {
"field": "Role.name.keyword"
}
}
}
}
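If it helps, here is a hedged sketch of running this from the Python client and reading the counts back; the client object es, the index name, and the variable combined_query are assumptions, but the aggregations/buckets structure of the response is standard.

resp = es.search(index="my-index", body=combined_query)  # combined_query = the request body above

# With "size": 0 there are no hits; the counts live under "aggregations".
for agg_name in ("countNames", "designationNames", "Role"):
    for bucket in resp["aggregations"][agg_name]["buckets"]:
        print(agg_name, bucket["key"], bucket["doc_count"])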
I have a field in each of my documents, like so:
'some_field': 3, 5, 10
But each document can contain a varying number of values:
'some_field': 3, 5, 10 # Doc 1
'some_field': 5 # Doc 2
'some_field': 3, 5, 10, 20, 9 # Doc 3
Is there a way to query and sort by the length so that my results would be arranged like so:
'some_field': 3, 5, 10, 20, 9 # Doc 3
'some_field': 3, 5, 10 # Doc 1
'some_field': 5 # Doc 2
My current query, which sorts by _id at the moment:
es_object.search(index='index', size=500, body={
"sort": [
{"_id": "desc"}
],
"query": {
"bool": {
"must": [
{
"match_all": {}
},
{
"exists": {
"field": "some_field"
}
}
],
"filter": [],
"should": [],
"must_not": []
}
}})
You can use a script-based sort:
{
"sort": {
"_script": {
"script": "doc['some_field'].value.length()",
"type": "number",
"order": "asc"
}
},
"query": {
"bool": {
"must": [{
"match_all": {}
},
{
"exists": {
"field": "some_field"
}
}
],
"filter": [],
"should": [],
"must_not": []
}
}
}
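For completeness, a hedged sketch of plugging that sort into the es_object.search call from the question (index name and size copied from there): doc['some_field'].size() counts how many values the field holds, and "order": "desc" puts the longest arrays first.

resp = es_object.search(index='index', size=500, body={
    "sort": {
        "_script": {
            "script": "doc['some_field'].size()",  # number of values in the array
            "type": "number",
            "order": "desc"                         # longest arrays first
        }
    },
    "query": {
        "bool": {
            "must": [
                {"exists": {"field": "some_field"}}  # only documents that have the field
            ]
        }
    }
})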
I would like to set different boost terms according to the year of a publication, for example:
"boost_term": 10.0 for articles produced after 2015
"boost_term": 5.0 for articles produced between 2010 and 2015
"boost_term": 3.0 for articles produced between 2005 and 2010
and so on...
Current code:
res = es.search(body={
"query": {
"dis_max": {
"queries": [
{
"more_like_this" : {
"fields": [
"article.name",
"article.year"
],
"like" : {
"_index" : "test-index",
"_type" : "researcher",
"_id" : "idResearcher,
},
"min_term_freq" : 1,
"min_doc_freq": 1,
"boost_terms": 5.0
}
},
]
}
}
})
Try something like:
{
"query": {
"bool": {
"must": [
{
"more_like_this": {
"fields": [
"article.name",
"article.year"
],
"like" : {
"_index" : "test-index",
"_type" : "researcher",
"_id" : "idResearcher",
},
"min_term_freq": 1,
"min_doc_freq": 1
}
}
],
"should": [
{
"range": {
"producedYear" : {
"gte" : "2015",
"boost" : 10.0
}
}
},
{
"range": {
"producedYear" : {
"gte" : "2010",
"lt" : "2015"
"boost" : 10.0
}
}
},{
"range": {
"producedYear" : {
"gte" : "2005",
"lt" : "2010"
"boost" : 3.0
}
}
}
]
}
}
}
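A small hedged sketch of keeping those year boosts in one place on the Python side (the helper name and the producedYear field are assumptions; depending on your mapping the field may need to be article.year). Each should clause that matches adds its boost on top of the more_like_this score from the must clause.

# Hypothetical helper: builds the clauses for the "should" list of the bool query above.
def year_boost_clauses(field="producedYear"):
    return [
        {"range": {field: {"gte": "2015", "boost": 10.0}}},
        {"range": {field: {"gte": "2010", "lt": "2015", "boost": 5.0}}},
        {"range": {field: {"gte": "2005", "lt": "2010", "boost": 3.0}}},
    ]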