I have a field in each of my documents like so
'some_field': 3, 5, 10
But each document could have a very length of numbers,
'some_field': 3, 5, 10 # Doc 1
'some_field': 5 # Doc 2
'some_field': 3, 5, 10, 20, 9 # Doc 3
Is there a way to query and sort by the length so that my results would be arranged as so:
'some_field': 3, 5, 10, 20, 9 # Doc 3
'some_field': 3, 5, 10 # Doc 1
'some_field': 5 # Doc 2
My current query, sorting by _id at the moment
es_object.search(index='index', size=500, body={
"sort": [
{"_id": "desc"}
],
"query": {
"bool": {
"must": [
{
"match_all": {}
},
{
"exists": {
"field": "some_field"
}
}
],
"filter": [],
"should": [],
"must_not": []
}
}})
You can do script sort
{
"sort": {
"_script": {
"script": "doc['some_field'].value.length()",
"type": "number",
"order": "asc"
}
},
"query": {
"bool": {
"must": [{
"match_all": {}
},
{
"exists": {
"field": "some_field"
}
}
],
"filter": [],
"should": [],
"must_not": []
}
}
}
Related
I have the following Dataframe with MultiIndex rows in pandas.
time available_slots status
month day
1 1 10:00:00 1 AVAILABLE
1 12:00:00 1 AVAILABLE
1 14:00:00 1 AVAILABLE
1 16:00:00 1 AVAILABLE
1 18:00:00 1 AVAILABLE
2 10:00:00 1 AVAILABLE
... ... ... ...
2 28 12:00:00 1 AVAILABLE
28 14:00:00 1 AVAILABLE
28 16:00:00 1 AVAILABLE
28 18:00:00 1 AVAILABLE
28 20:00:00 1 AVAILABLE
And I need to transform it to a hierarchical nested JSON as this:
[
{
"month": 1,
"days": [
{
"day": 1,
"slots": [
{
"time": "10:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "12:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
...
]
},
{
"day": 2,
"slots": [
...
]
}
]
},
{
"month": 2,
"days":[
{
"day": 1,
"slots": [
...
]
}
]
},
...
]
Unfortunately, it is not as easy as doing df.to_json(orient="index").
Does anyone know if there is a method in pandas to perform this kind of transformations? or in what way I could iterate over the DataFrame to build the final object?
Here's one way. Basically repeated groupby + apply(to_dict) + reset_index until we get the desired shape:
out = (df.groupby(level=[0,1])
.apply(lambda x: x.to_dict('records'))
.reset_index()
.rename(columns={0:'slots'})
.groupby('month')
.apply(lambda x: x[['day','slots']].to_dict('records'))
.reset_index()
.rename(columns={0:'days'})
.to_json(orient='records', indent=True)
)
Output:
[
{
"month":1,
"days":[
{
"day":1,
"slots":[
{
"time":"10:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"12:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"14:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"16:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"18:00:00",
"available_slots":1,
"status":"AVAILABLE"
}
]
},
{
"day":2,
"slots":[
{
"time":"10:00:00",
"available_slots":1,
"status":"AVAILABLE"
}
]
}
]
},
{
"month":2,
"days":[
{
"day":28,
"slots":[
{
"time":"12:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"14:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"16:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"18:00:00",
"available_slots":1,
"status":"AVAILABLE"
},
{
"time":"20:00:00",
"available_slots":1,
"status":"AVAILABLE"
}
]
}
]
}
]
You can use a double loop for each level of your index:
data = []
for month, df1 in df.groupby(level=0):
data.append({'month': month, 'days': []})
for day, df2 in df1.groupby(level=1):
data[-1]['days'].append({'day': day, 'slots': df2.to_dict('records')})
Output:
import json
print(json.dumps(data, indent=2))
[
{
"month": 1,
"days": [
{
"day": 1,
"slots": [
{
"time": "10:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "12:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "14:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "16:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "18:00:00",
"available_slots": 1,
"status": "AVAILABLE"
}
]
},
{
"day": 2,
"slots": [
{
"time": "10:00:00",
"available_slots": 1,
"status": "AVAILABLE"
}
]
}
]
},
{
"month": 2,
"days": [
{
"day": 28,
"slots": [
{
"time": "12:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "14:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "18:00:00",
"available_slots": 1,
"status": "AVAILABLE"
},
{
"time": "20:00:00",
"available_slots": 1,
"status": "AVAILABLE"
}
]
}
]
}
]
I have a LinkedIn dataset of follower statistics in the following JSON (Removed many key pair values for easy understanding). In this, each key has a different number of inner key pair values.
Can somebody help convert this to a CSV output using python?
{
"paging": { "start": 0, "count": 10, "links": [] },
"elements": [
{
"followerCountsByAssociationType": [
{
"followerCounts": {
"organicFollowerCount": 2775,
"paidFollowerCount": 0
}
},
{
"followerCounts": {
"organicFollowerCount": 13,
"paidFollowerCount": 0
},
"associationType": "EMPLOYEE"
}
],
"followerCountsByRegion": [
{
"region": "urn:li:region:7312",
"followerCounts": {
"organicFollowerCount": 2,
"paidFollowerCount": 0
}
},
{
"region": "urn:li:region:6981",
"followerCounts": {
"organicFollowerCount": 2,
"paidFollowerCount": 0
}
},
{
"region": "urn:li:region:620",
"followerCounts": {
"organicFollowerCount": 2,
"paidFollowerCount": 0
}
}
],
"followerCountsBySeniority": [
{
"followerCounts": {
"organicFollowerCount": 12,
"paidFollowerCount": 0
},
"seniority": "urn:li:seniority:8"
},
{
"followerCounts": {
"organicFollowerCount": 5,
"paidFollowerCount": 0
},
"seniority": "urn:li:seniority:9"
},
{
"followerCounts": {
"organicFollowerCount": 2,
"paidFollowerCount": 0
},
"seniority": "urn:li:seniority:1"
}
],
"followerCountsByIndustry": [
{
"followerCounts": {
"organicFollowerCount": 1,
"paidFollowerCount": 0
},
"industry": "urn:li:industry:51"
},
{
"followerCounts": {
"organicFollowerCount": 1,
"paidFollowerCount": 0
},
"industry": "urn:li:industry:74"
},
{
"followerCounts": {
"organicFollowerCount": 1,
"paidFollowerCount": 0
},
"industry": "urn:li:industry:77"
},
{
"followerCounts": {
"organicFollowerCount": 1,
"paidFollowerCount": 0
},
"industry": "urn:li:industry:78"
},
],
"followerCountsByFunction": [
{
"followerCounts": {
"organicFollowerCount": 3,
"paidFollowerCount": 0
},
"function": "urn:li:function:14"
},
{
"followerCounts": {
"organicFollowerCount": 3,
"paidFollowerCount": 0
},
"function": "urn:li:function:21"
},
{
"followerCounts": {
"organicFollowerCount": 2,
"paidFollowerCount": 0
},
"function": "urn:li:function:11"
},
{
"followerCounts": {
"organicFollowerCount": 2,
"paidFollowerCount": 0
},
"function": "urn:li:function:17"
},
{
"followerCounts": {
"organicFollowerCount": 2,
"paidFollowerCount": 0
},
"function": "urn:li:function:1"
},
],
"followerCountsByStaffCountRange": [
{
"followerCounts": {
"organicFollowerCount": 267,
"paidFollowerCount": 0
},
"staffCountRange": "SIZE_1001_TO_5000"
},
{
"followerCounts": {
"organicFollowerCount": 185,
"paidFollowerCount": 0
},
"staffCountRange": "SIZE_201_TO_500"
},
{
"followerCounts": {
"organicFollowerCount": 131,
"paidFollowerCount": 0
},
"staffCountRange": "SIZE_501_TO_1000"
},
{
"followerCounts": {
"organicFollowerCount": 81,
"paidFollowerCount": 0
},
"staffCountRange": "SIZE_5001_TO_10000"
},
{
"followerCounts": {
"organicFollowerCount": 74,
"paidFollowerCount": 0
},
"staffCountRange": "SIZE_2_TO_10"
},
{
"followerCounts": {
"organicFollowerCount": 10,
"paidFollowerCount": 0
},
"staffCountRange": "SIZE_1"
}
],
"followerCountsByCountry": [
{
"followerCounts": {
"organicFollowerCount": 1,
"paidFollowerCount": 0
},
"country": "urn:li:country:es"
},
{
"followerCounts": {
"organicFollowerCount": 1,
"paidFollowerCount": 0
},
"country": "urn:li:country:ph"
},
{
"followerCounts": {
"organicFollowerCount": 1,
"paidFollowerCount": 0
},
"country": "urn:li:country:ng"
}
],
"organizationalEntity": "urn:li:organization:28849398"
}
]
}
I tried using the json_normalize(data['Elements']) but that gives the following Output
I am sure that there must be some parameter in json_normalize() that can simplify the inner nesting.
The desired output is as follows-
FollwerCounByAssociationorganicfollowecount
FollwerCounByAssociationpaidfollowecount
AssociationType
Region
RegionOrganicFollowercount
RegionpaidFollowercount
2775
0
Employee
urn:li:region:7312
2
0
urn:li:region:6981
2
0
.......And so on
Now I have only made a small part of the output but largely for as many entries in the last the columns will go on (while for the others it will be null)
Would appreciate any help possible! Thanks!
I have a list of dictionary items, with each dictionary containing a list of presentation items. The sample dictionaries below are a small prototype of my real data set.
I need to remove duplicate presentations based on day (one presentation per day) and store them in a new dictionary with the same structure within the existing list.
So starting with:
[
{
"time": "04:00-20:59",
"category": 1,
"presentations": [
{
"presentation": "ABC",
"day": 7,
},
{
"presentation": "DEF",
"day": 7,
},
{
"presentation": "GHI",
"day": 8,
},
{
"presentation": "JKL",
"day": 8,
},
{
"presentation": "MNO",
"day": 9,
},
{
"presentation": "PQR",
"day": 9,
},
{
"presentation": "STU",
"day": 9,
}
]
} #only one dictionary item in the list for simplicity
]
The end result should be three dictionaries containing lists of presentations where there is one presentation for a given day:
[
{
"time": "04:00-20:59",
"category": 1,
"presentations": [
{
"presentation": "ABC",
"day": 7
},
{
"presentation": "DEF",
"day": 8
},
{
"presentation": "GHI",
"day": 9
}
]
},
{
"time": "04:00-20:59",
"category": 1,
"presentations": [
{
"presentation": "JKL",
"day": 7
},
{
"presentation": "MNO",
"day": 8
},
{
"presentation": "PQR",
"day": 9
}
]
},
{
"time": "04:00-20:59",
"category": 1,
"presentations": [
{
"presentation": "STU",
"day": 9
}
]
}
]
I don't know how to go about removing these duplicates (based on day) while adding them to their own dictionary.
I need to search BusinessArea which is Research or Accounting this is array of fields(OR) statement
I need to search Role is Developer or Tester condition this is array of fields(OR) statement
I want to get the count of masterid of BusinessArea, designationNames, Role which is all the names
Name filter is "Group1"
Below is the dictionary
test= [ { 'masterid': '1', 'name': 'Group1', 'BusinessArea': [ 'Accounting','Research'], 'Designation': [ 'L1' 'L2' ] }, { 'masterid': '2', 'name': 'Group1', 'BusinessArea': ['Research','Accounting' ], 'Role': [ { 'id': '5032', 'name': 'Tester' }, { 'id': '5033', 'name': 'Developer' } ], 'Designation': [ 'L1' 'L2' ]}, { 'masterid': '3', 'name': 'Group1', 'BusinessArea': [ 'Engineering' ], 'Role': [ { 'id': '5032', 'name': 'Developer' }, { 'id': '5033', 'name': 'Developer', 'parentname': '' } ], 'Designation': [ 'L1' 'L2' ]}]
Below is the aggregation function
{
"size": 0,
"aggs": {
"countNames": {
"terms": {
"field": "BusinessArea.keyword"
}
},
"designationNames": {
"terms": {
"field": "Designation.keyword"
}
},
"Role": {
"terms": {
"field": "Role.name.keyword"
}
}
}
}
Below is the filtering query
{
"query": {
"bool": {
"must": [
{
"terms": {
"BusinessArea.keyword": [
"Research",
"Accounting"
]
}
},
{
"terms": {
"Role.name.keyword": [
"Developer",
"Tester"
]
}
}
]
}
}
}
"filter": [
"term": {
"name.keyword": "Group1"}]
I need to merge both query and output will be having from the both
Nice start !!! Now you can simply combine all those snippets like this:
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"term": {
"name.keyword": "Group1"
}
},
{
"terms": {
"BusinessArea.keyword": [
"Research",
"Accounting"
]
}
},
{
"terms": {
"Role.name.keyword": [
"Developer",
"Tester"
]
}
}
]
}
},
"aggs": {
"countNames": {
"terms": {
"field": "BusinessArea.keyword"
}
},
"designationNames": {
"terms": {
"field": "Designation.keyword"
}
},
"Role": {
"terms": {
"field": "Role.name.keyword"
}
}
}
}
How can I merge these 2 queries into one:
Query 1
db.Response.aggregate([
{
"$and": [
{ "job_details.owner_id" : 428 },
{ "job_details.owner_type" : 'searches' }
]
},
{
"$group": {
"_id": "$candidate_city_name_string",
"count": { "$sum": 1 }
}
}
])
Query 2
db.Response.aggregate([
{
"$and": [
{ "job_details.owner_id" : 428 },
{ "job_details.owner_type" : 'searches' }
]
},
{
"$group": {
"_id": "$skill",
"count": { "$sum": 1 }
}
}
])
The result of this query like this
output 1:
{
"result": [
{ _id: 'Bangalore', count: 8 },
{ _id: 'cochi', count: 9 }
]
"ok":1
}
output 2:
{
"result": [
{ _id: 'java', count: 7 },
{ _id: 'python', count: 10 }
],
"ok":1
}
How can I get these 2 results in one query?
I need an output like this:
Expected output:
{
"result": [
"candidate_city_name_string": [
{ _id: 'Bangalore', count: 8 },
{ _id: 'cochi', count: 9 }
],
"skill": [
{ _id: 'java', count: 7 },
{ _id: 'python', count: 10 }
]
],
"ok":1
}
Is it possible? Somewhere I saw something about $facet but I didn't understand that.
db.Response.aggregate([
{"$match":{"$and":[{"job_details.owner_id" : 482},{"job_details.owner_type" : 'searches'}]}},
{$facet: {
"candidate_sublocation_name_string": [
{"$group": {"_id":"$candidate_sublocation_name_string","count": {"$sum": 1 }}}
],
"skill": [
{"$group": {"_id":"$skill","count": {"$sum": 1 }}}
]
}}])