Sort query by length of field in Elstic Search - python

I have a field in each of my documents like so
'some_field': 3, 5, 10
But each document could have a very length of numbers,
'some_field': 3, 5, 10 # Doc 1
'some_field': 5 # Doc 2
'some_field': 3, 5, 10, 20, 9 # Doc 3
Is there a way to query and sort by the length so that my results would be arranged as so:
'some_field': 3, 5, 10, 20, 9 # Doc 3
'some_field': 3, 5, 10 # Doc 1
'some_field': 5 # Doc 2
My current query, sorting by _id at the moment
es_object.search(index='index', size=500, body={
"sort": [
{"_id": "desc"}
],
"query": {
"bool": {
"must": [
{
"match_all": {}
},
{
"exists": {
"field": "some_field"
}
}
],
"filter": [],
"should": [],
"must_not": []
}
}})

You can do script sort
{
"sort": {
"_script": {
"script": "doc['some_field'].value.length()",
"type": "number",
"order": "asc"
}
},
"query": {
"bool": {
"must": [{
"match_all": {}
},
{
"exists": {
"field": "some_field"
}
}
],
"filter": [],
"should": [],
"must_not": []
}
}
}

Related

Convert a MultiIndex pandas DataFrame to a nested JSON

I have the following Dataframe with MultiIndex rows in pandas.
time available_slots status
month day
1 1 10:00:00 1 AVAILABLE
1 12:00:00 1 AVAILABLE
1 14:00:00 1 AVAILABLE
1 16:00:00 1 AVAILABLE
1 18:00:00 1 AVAILABLE
2 10:00:00 1 AVAILABLE
... ... ... ...
2 28 12:00:00 1 AVAILABLE
28 14:00:00 1 AVAILABLE
28 16:00:00 1 AVAILABLE
28 18:00:00 1 AVAILABLE
28 20:00:00 1 AVAILABLE
And I need to transform it to a hierarchical nested JSON as this:
[
{
"month": 1,
"days": [
{
"day": 1,
"slots": [
{
"time": "10:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "12:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
...
]
},
{
"day": 2,
"slots": [
...
]
}
]
},
{
"month": 2,
"days":[
{
"day": 1,
"slots": [
...
]
}
]
},
...
]
Unfortunately, it is not as easy as doing df.to_json(orient="index").
Does anyone know if there is a method in pandas to perform this kind of transformations? or in what way I could iterate over the DataFrame to build the final object?
Here's one way. Basically repeated groupby + apply(to_dict) + reset_index until we get the desired shape:
out = (df.groupby(level=[0,1])
.apply(lambda x: x.to_dict('records'))
.reset_index()
.rename(columns={0:'slots'})
.groupby('month')
.apply(lambda x: x[['day','slots']].to_dict('records'))
.reset_index()
.rename(columns={0:'days'})
.to_json(orient='records', indent=True)
)
Output:
[
{
"month":1,
"days":[
{
"day":1,
"slots":[
{
"time":"10:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"12:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"14:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"16:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"18:00:00",
"available_slots":1,
"status":"AVAILABLE"
}
]
},
{
"day":2,
"slots":[
{
"time":"10:00:00",
"available_slots":1,
"status":"AVAILABLE"
}
]
}
]
},
{
"month":2,
"days":[
{
"day":28,
"slots":[
{
"time":"12:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"14:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"16:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"18:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"20:00:00",
"available_slots":1,
"status":"AVAILABLE"
}
]
}
]
}
]
You can use a double loop for each level of your index:
data = []
for month, df1 in df.groupby(level=0):
data.append({'month': month, 'days': []})
for day, df2 in df1.groupby(level=1):
data[-1]['days'].append({'day': day, 'slots': df2.to_dict('records')})
Output:
import json
print(json.dumps(data, indent=2))
[
{
"month": 1,
"days": [
{
"day": 1,
"slots": [
{
"time": "10:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "12:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "14:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "16:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "18:00:00",
"available_slots": 1,
"status": "AVAILABLE"
}
]
},
{
"day": 2,
"slots": [
{
"time": "10:00:00",
"available_slots": 1,
"status": "AVAILABLE"
}
]
}
]
},
{
"month": 2,
"days": [
{
"day": 28,
"slots": [
{
"time": "12:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "14:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "18:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "20:00:00",
"available_slots": 1,
"status": "AVAILABLE"
}
]
}
]
}
]

Python Fattening Complex JSON

I have a LinkedIn dataset of follower statistics in the following JSON (Removed many key pair values for easy understanding). In this, each key has a different number of inner key pair values.
Can somebody help convert this to a CSV output using python?
{
"paging": { "start": 0, "count": 10, "links": [] },
"elements": [
{
"followerCountsByAssociationType": [
{
"followerCounts": {
"organicFollowerCount": 2775,
"paidFollowerCount": 0
}
},
{
"followerCounts": {
"organicFollowerCount": 13,
"paidFollowerCount": 0
},
"associationType": "EMPLOYEE"
}
],
"followerCountsByRegion": [
{
"region": "urn:li:region:7312",
"followerCounts": {
"organicFollowerCount": 2,
"paidFollowerCount": 0
}
},
{
"region": "urn:li:region:6981",
"followerCounts": {
"organicFollowerCount": 2,
"paidFollowerCount": 0
}
},
{
"region": "urn:li:region:620",
"followerCounts": {
"organicFollowerCount": 2,
"paidFollowerCount": 0
}
}
],
"followerCountsBySeniority": [
{
"followerCounts": {
"organicFollowerCount": 12,
"paidFollowerCount": 0
},
"seniority": "urn:li:seniority:8"
},
{
"followerCounts": {
"organicFollowerCount": 5,
"paidFollowerCount": 0
},
"seniority": "urn:li:seniority:9"
},
{
"followerCounts": {
"organicFollowerCount": 2,
"paidFollowerCount": 0
},
"seniority": "urn:li:seniority:1"
}
],
"followerCountsByIndustry": [
{
"followerCounts": {
"organicFollowerCount": 1,
"paidFollowerCount": 0
},
"industry": "urn:li:industry:51"
},
{
"followerCounts": {
"organicFollowerCount": 1,
"paidFollowerCount": 0
},
"industry": "urn:li:industry:74"
},
{
"followerCounts": {
"organicFollowerCount": 1,
"paidFollowerCount": 0
},
"industry": "urn:li:industry:77"
},
{
"followerCounts": {
"organicFollowerCount": 1,
"paidFollowerCount": 0
},
"industry": "urn:li:industry:78"
},
],
"followerCountsByFunction": [
{
"followerCounts": {
"organicFollowerCount": 3,
"paidFollowerCount": 0
},
"function": "urn:li:function:14"
},
{
"followerCounts": {
"organicFollowerCount": 3,
"paidFollowerCount": 0
},
"function": "urn:li:function:21"
},
{
"followerCounts": {
"organicFollowerCount": 2,
"paidFollowerCount": 0
},
"function": "urn:li:function:11"
},
{
"followerCounts": {
"organicFollowerCount": 2,
"paidFollowerCount": 0
},
"function": "urn:li:function:17"
},
{
"followerCounts": {
"organicFollowerCount": 2,
"paidFollowerCount": 0
},
"function": "urn:li:function:1"
},
],
"followerCountsByStaffCountRange": [
{
"followerCounts": {
"organicFollowerCount": 267,
"paidFollowerCount": 0
},
"staffCountRange": "SIZE_1001_TO_5000"
},
{
"followerCounts": {
"organicFollowerCount": 185,
"paidFollowerCount": 0
},
"staffCountRange": "SIZE_201_TO_500"
},
{
"followerCounts": {
"organicFollowerCount": 131,
"paidFollowerCount": 0
},
"staffCountRange": "SIZE_501_TO_1000"
},
{
"followerCounts": {
"organicFollowerCount": 81,
"paidFollowerCount": 0
},
"staffCountRange": "SIZE_5001_TO_10000"
},
{
"followerCounts": {
"organicFollowerCount": 74,
"paidFollowerCount": 0
},
"staffCountRange": "SIZE_2_TO_10"
},
{
"followerCounts": {
"organicFollowerCount": 10,
"paidFollowerCount": 0
},
"staffCountRange": "SIZE_1"
}
],
"followerCountsByCountry": [
{
"followerCounts": {
"organicFollowerCount": 1,
"paidFollowerCount": 0
},
"country": "urn:li:country:es"
},
{
"followerCounts": {
"organicFollowerCount": 1,
"paidFollowerCount": 0
},
"country": "urn:li:country:ph"
},
{
"followerCounts": {
"organicFollowerCount": 1,
"paidFollowerCount": 0
},
"country": "urn:li:country:ng"
}
],
"organizationalEntity": "urn:li:organization:28849398"
}
]
}
I tried using the json_normalize(data['Elements']) but that gives the following Output
I am sure that there must be some parameter in json_normalize() that can simplify the inner nesting.
The desired output is as follows-
FollwerCounByAssociationorganicfollowecount
FollwerCounByAssociationpaidfollowecount
AssociationType
Region
RegionOrganicFollowercount
RegionpaidFollowercount
2775
0
Employee
urn:li:region:7312
2
0
urn:li:region:6981
2
0
.......And so on
Now I have only made a small part of the output but largely for as many entries in the last the columns will go on (while for the others it will be null)
Would appreciate any help possible! Thanks!

How to eliminate duplicate items while adding them to their own structure

I have a list of dictionary items, with each dictionary containing a list of presentation items. The sample dictionaries below are a small prototype of my real data set.
I need to remove duplicate presentations based on day (one presentation per day) and store them in a new dictionary with the same structure within the existing list.
So starting with:
[
{
"time": "04:00-20:59",
"category": 1,
"presentations": [
{
"presentation": "ABC",
"day": 7,
},
{
"presentation": "DEF",
"day": 7,
},
{
"presentation": "GHI",
"day": 8,
},
{
"presentation": "JKL",
"day": 8,
},
{
"presentation": "MNO",
"day": 9,
},
{
"presentation": "PQR",
"day": 9,
},
{
"presentation": "STU",
"day": 9,
}
]
} #only one dictionary item in the list for simplicity
]
The end result should be three dictionaries containing lists of presentations where there is one presentation for a given day:
[
{
"time": "04:00-20:59",
"category": 1,
"presentations": [
{
"presentation": "ABC",
"day": 7
},
{
"presentation": "DEF",
"day": 8
},
{
"presentation": "GHI",
"day": 9
}
]
},
{
"time": "04:00-20:59",
"category": 1,
"presentations": [
{
"presentation": "JKL",
"day": 7
},
{
"presentation": "MNO",
"day": 8
},
{
"presentation": "PQR",
"day": 9
}
]
},
{
"time": "04:00-20:59",
"category": 1,
"presentations": [
{
"presentation": "STU",
"day": 9
}
]
}
]
I don't know how to go about removing these duplicates (based on day) while adding them to their own dictionary.

How to merge two DSL query for aggregation and filter

I need to search BusinessArea which is Research or Accounting this is array of fields(OR) statement
I need to search Role is Developer or Tester condition this is array of fields(OR) statement
I want to get the count of masterid of BusinessArea, designationNames, Role which is all the names
Name filter is "Group1"
Below is the dictionary
test= [ { 'masterid': '1', 'name': 'Group1', 'BusinessArea': [ 'Accounting','Research'], 'Designation': [ 'L1' 'L2' ] }, { 'masterid': '2', 'name': 'Group1', 'BusinessArea': ['Research','Accounting' ], 'Role': [ { 'id': '5032', 'name': 'Tester' }, { 'id': '5033', 'name': 'Developer' } ], 'Designation': [ 'L1' 'L2' ]}, { 'masterid': '3', 'name': 'Group1', 'BusinessArea': [ 'Engineering' ], 'Role': [ { 'id': '5032', 'name': 'Developer' }, { 'id': '5033', 'name': 'Developer', 'parentname': '' } ], 'Designation': [ 'L1' 'L2' ]}]
Below is the aggregation function
{
"size": 0,
"aggs": {
"countNames": {
"terms": {
"field": "BusinessArea.keyword"
}
},
"designationNames": {
"terms": {
"field": "Designation.keyword"
}
},
"Role": {
"terms": {
"field": "Role.name.keyword"
}
}
}
}
Below is the filtering query
{
"query": {
"bool": {
"must": [
{
"terms": {
"BusinessArea.keyword": [
"Research",
"Accounting"
]
}
},
{
"terms": {
"Role.name.keyword": [
"Developer",
"Tester"
]
}
}
]
}
}
}
"filter": [
"term": {
"name.keyword": "Group1"}]
I need to merge both query and output will be having from the both
Nice start !!! Now you can simply combine all those snippets like this:
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"term": {
"name.keyword": "Group1"
}
},
{
"terms": {
"BusinessArea.keyword": [
"Research",
"Accounting"
]
}
},
{
"terms": {
"Role.name.keyword": [
"Developer",
"Tester"
]
}
}
]
}
},
"aggs": {
"countNames": {
"terms": {
"field": "BusinessArea.keyword"
}
},
"designationNames": {
"terms": {
"field": "Designation.keyword"
}
},
"Role": {
"terms": {
"field": "Role.name.keyword"
}
}
}
}

How to merge two aggregation queries into one

How can I merge these 2 queries into one:
Query 1
db.Response.aggregate([
{
"$and": [
{ "job_details.owner_id" : 428 },
{ "job_details.owner_type" : 'searches' }
]
},
{
"$group": {
"_id": "$candidate_city_name_string",
"count": { "$sum": 1 }
}
}
])
Query 2
db.Response.aggregate([
{
"$and": [
{ "job_details.owner_id" : 428 },
{ "job_details.owner_type" : 'searches' }
]
},
{
"$group": {
"_id": "$skill",
"count": { "$sum": 1 }
}
}
])
The result of this query like this
output 1:
{
"result": [
{ _id: 'Bangalore', count: 8 },
{ _id: 'cochi', count: 9 }
]
"ok":1
}
output 2:
{
"result": [
{ _id: 'java', count: 7 },
{ _id: 'python', count: 10 }
],
"ok":1
}
How can I get these 2 results in one query?
I need an output like this:
Expected output:
{
"result": [
"candidate_city_name_string": [
{ _id: 'Bangalore', count: 8 },
{ _id: 'cochi', count: 9 }
],
"skill": [
{ _id: 'java', count: 7 },
{ _id: 'python', count: 10 }
]
],
"ok":1
}
Is it possible? Somewhere I saw something about $facet but I didn't understand that.
db.Response.aggregate([
{"$match":{"$and":[{"job_details.owner_id" : 482},{"job_details.owner_type" : 'searches'}]}},
{$facet: {
"candidate_sublocation_name_string": [
{"$group": {"_id":"$candidate_sublocation_name_string","count": {"$sum": 1 }}}
],
"skill": [
{"$group": {"_id":"$skill","count": {"$sum": 1 }}}
]
}}])

Categories