Fuzzywuzzy for a list of dictionaries - python

I have a list of dictionaries (API response) and I use the following function to search for certain nations:
def nation_search(self):
    result = next((item for item in nations_v2 if (item["nation"]).lower() == (f"{self}").lower()), False)
    if result:
        return result
    else:
        return next((item for item in nations_v2 if (item["leader"]).lower() == (f"{self}").lower()), False)
Two examples:
nations_v2 = [{'nation_id': 5270, 'nation': 'Indo-Froschtia', 'leader': 'Saxplayer', 'continent': 2, 'war_policy': 4, 'domestic_policy': 2, 'color': 15, 'alliance_id': 790, 'alliance': 'Rose', 'alliance_position': 3, 'cities': 28, 'offensive_wars': 0, 'defensive_wars': 0, 'score': 4945, 'v_mode': False, 'v_mode_turns': 0, 'beige_turns': 0, 'last_active': '2020-08-10 04:04:48', 'founded': '2014-08-05 00:09:31', 'soldiers': 0, 'tanks': 0, 'aircraft': 2100, 'ships': 0, 'missiles': 0, 'nukes': 0},
{'nation_id': 582, 'nation': 'Nightsilver Woods', 'leader': 'Luna', 'continent': 4, 'war_policy': 4, 'domestic_policy': 2, 'color': 10, 'alliance_id': 615, 'alliance': 'Seven Kingdoms', 'alliance_position': 2, 'cities': 23, 'offensive_wars': 0, 'defensive_wars': 0, 'score': 3971.25, 'v_mode': False, 'v_mode_turns': 0, 'beige_turns': 0, 'last_active': '2020-08-10 00:22:16', 'founded': '2014-08-05 00:09:35', 'soldiers': 0, 'tanks': 0, 'aircraft': 1725, 'ships': 115, 'missiles': 0, 'nukes': 0}]
I want to add a fuzzy search using fuzzywuzzy to get 5 possible matches in case there's a spelling error in the argument passed into the function, but I can't seem to figure it out.
I only want to search in the values of nation and leader.

If you need 5 possible matches, use process.extract:
from fuzzywuzzy import process

def nation_search(self):
    nations_only = [v2['nation'].lower() for v2 in nations_v2]
    leaders_only = [v2['leader'].lower() for v2 in nations_v2]
    matched_nations = process.extract((f"{self}").lower(), nations_only, limit=5)
    matched_leaders = process.extract((f"{self}").lower(), leaders_only, limit=5)
    return matched_nations, matched_leaders
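If you then want the matching dictionaries rather than just the matched strings, one possible approach (a sketch, with helper names of my own choosing) is to build a lookup from the lowercased nation/leader values back to the original dicts and map the matches through it:

from fuzzywuzzy import process

def fuzzy_nation_search(query, limit=5):
    # Return up to `limit` (candidate dict, score) pairs matched on nation or leader.
    query = query.lower()
    # Map each lowercased searchable value back to its source dictionary.
    lookup = {}
    for v2 in nations_v2:
        lookup.setdefault(v2['nation'].lower(), v2)
        lookup.setdefault(v2['leader'].lower(), v2)
    matches = process.extract(query, list(lookup.keys()), limit=limit)
    return [(lookup[name], score) for name, score in matches]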


Group list of dictionaries based on key

I have an array that looks like this:
comp_data = [
    {
        "product_id": 432263,
        "price_zone_id": 1,
        "oreilly_price": 0,
        "amazon_price": 0,
        "az_price": 0,
        "rockauto_price": 0,
        "napa_price": 0,
        "oreilly_index": 0,
        "amazon_index": 0,
        "az_index": 0,
        "rockauto_index": 0,
        "napa_index": 0,
        "weighted_comp_price": 0,
        "weighted_comp_index": 0,
        "week": None
    }
]
Skipping the fields product_id and price_zone_id, I want to create a list of dictionaries by grouping the keys on their name prefix. For example, you can see amazon_price and amazon_index; ultimately I want a list that looks like this:
[
    {
        'amazon_price': 0,
        'amazon_index': 0,
        'name': 'Amazon'   # simply the prefix with .title() applied
    },
    {
        'az_price': 0,
        'az_index': 0,
        'name': 'Az'       # simply the prefix with .title() applied
    },
    {
        'orielly_price': 0,
        'orielly_index': 0,
        'name': 'ORielly'  # simply the prefix with .title() applied
    }
]
My current code looks like this and generates the wrong output:
stores_data = {}
for data in comp_data:
    dict_keys = data.keys()
    for keys in dict_keys:
        if keys != 'product_id' and keys != 'price_zone_id':
            store_name = keys.split('_')[0]
            value_type = keys.split('_')[-1]
            stores[store_name][value_type] = {}
Stores are essentially the string in front of _index or _price; e.g. for amazon_index, the store would be amazon.
For the key "weighted_comp_price", the store would be Weighted Comp.
For each dict: find the _price keys, verify you have the corresponding _index key, and save it.
stores_data = []
for data in comp_data:
    for price_key in (k for k in data if k.endswith("_price")):
        name = price_key.rsplit("_", maxsplit=1)[0]
        index_key = f'{name}_index'
        if index_key in data:
            stores_data.append({price_key: data[price_key], index_key: data[index_key], 'name': name.title()})
Giving
[{'oreilly_price': 0, 'oreilly_index': 0, 'name': 'Oreilly'},
{'az_price': 0, 'az_index': 0, 'name': 'Az'},
{'napa_price': 0, 'napa_index': 0, 'name': 'Napa'},
{'amazon_price': 0, 'amazon_index': 0, 'name': 'Amazon'},
{'rockauto_price': 0, 'rockauto_index': 0, 'name': 'Rockauto'},
{'weighted_comp_price': 0, 'weighted_comp_index': 0, 'name': 'Weighted_Comp'}]
Your code is creating "price" and "index" instead of the whole name. Which did you want?
comp_data = [
    {
        "product_id": 432263,
        "price_zone_id": 1,
        "oreilly_price": 0,
        "amazon_price": 0,
        "az_price": 0,
        "rockauto_price": 0,
        "napa_price": 0,
        "oreilly_index": 0,
        "amazon_index": 0,
        "az_index": 0,
        "rockauto_index": 0,
        "napa_index": 0,
        "weighted_comp_price": 0,
        "weighted_comp_index": 0,
        "week": None
    }
]
stores = {}
for data in comp_data:
    for key, value in data.items():
        if key not in ('product_id', 'price_zone_id', 'week'):
            words = key.split('_')
            store_name = '_'.join(words[:-1])
            value_type = words[-1]
            if store_name not in stores:
                stores[store_name] = {'name': store_name.title()}
            stores[store_name][key] = value

from pprint import pprint
pprint(stores)
Output:
{'amazon': {'amazon_index': 0, 'amazon_price': 0, 'name': 'Amazon'},
'az': {'az_index': 0, 'az_price': 0, 'name': 'Az'},
'napa': {'name': 'Napa', 'napa_index': 0, 'napa_price': 0},
'oreilly': {'name': 'Oreilly', 'oreilly_index': 0, 'oreilly_price': 0},
'rockauto': {'name': 'Rockauto', 'rockauto_index': 0, 'rockauto_price': 0},
'weighted_comp': {'name': 'Weighted_Comp',
'weighted_comp_index': 0,
'weighted_comp_price': 0}}

OperationFailure: unknown top level operator: $ne (MongoDB)

What is wrong with this code? When I try to run it I get OperationFailure: unknown top level operator: $ne. Full error: {'ok': 0.0, 'errmsg': 'unknown top level operator: $ne', 'code': 2, 'codeName': 'BadValue'}.
Any ideas what this means? Thank you in advance :)
import pandas as pd

def length_vs_references(articles):
    res = {"1-5": 0, "6-10": 0, "11-15": 0, "16-20": 0, "21-25": 0, "25-30": 0, ">30": 0}
    n = {"1-5": 0, "6-10": 0, "11-15": 0, "16-20": 0, "21-25": 0, "25-30": 0, ">30": 0}
    cursor = articles.aggregate([
        {'$match': {'$and': [{'references': {'$exists': False}},
                             {'$ne': ['$page_end', '']},
                             {'$ne': ['$page_start', '']}]}},
        {'$project': {'len_refernces': {"$size": '$references'},
                      'pages': {'$subtract': [{"$toInt": 'page_end'},
                                              {"$toInt": 'page_start'}]}}},
        {'$bucket': {
            '$groupBy': '$pages',
            'boundaries': [0, 6, 11, 16, 21, 26, 31, 1000000],
            'default': 'Other',
            'key': {
                'output': {"average": {"$avg": '$len_references'}},
            }
        }}
    ])
    return cursor

print(length_vs_references(articles))
Reading between the lines I suspect you want:
cursor = articles.aggregate([
    {'$match': {'references': {'$exists': False}, 'page_end': {'$ne': ''}, 'page_start': {'$ne': ''}}},
    {'$project': {'len_refernces': {"$size": '$references'},
                  'pages': {'$subtract': [{"$toInt": '$page_end'},
                                          {"$toInt": '$page_start'}]}}},
    {'$bucket': {
        'groupBy': '$pages',
        'boundaries': [0, 6, 11, 16, 21, 26, 31, 1000000],
        'default': 'Other'
    }}
])
You don't need to $and your match filters, as they are ANDed by default. I'm guessing you are trying to filter out blank page_end and page_start items; if not, please describe what you are trying to do.
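For completeness, a minimal pymongo sketch of running a pipeline in this shape; the connection string, database name and collection name are placeholders, and since $size errors on a missing field, the match below assumes you actually want documents where references exists:

from pymongo import MongoClient

# Placeholder connection details - adjust to your own deployment.
client = MongoClient("mongodb://localhost:27017")
articles = client["mydb"]["articles"]

pipeline = [
    # Field-level $ne operators; the filters are ANDed implicitly.
    {'$match': {'references': {'$exists': True},
                'page_end': {'$ne': ''},
                'page_start': {'$ne': ''}}},
    {'$project': {'len_references': {'$size': '$references'},
                  'pages': {'$subtract': [{'$toInt': '$page_end'},
                                          {'$toInt': '$page_start'}]}}},
    {'$bucket': {'groupBy': '$pages',
                 'boundaries': [0, 6, 11, 16, 21, 26, 31, 1000000],
                 'default': 'Other',
                 'output': {'average': {'$avg': '$len_references'}}}},
]

for doc in articles.aggregate(pipeline):
    print(doc)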

Python elegant way to order an array of dictionaries

I have the data in this format -
[
    {'gstin_code': 'A',
     'price_effective': 1199.0,
     'company_id': 489,
     'month': datetime.datetime(2018, 6, 1, 0, 0),
     'year': datetime.datetime(2018, 1, 1, 0, 0)},
    {'gstin_code': 'B',
     'price_effective': 1199.0,
     'company_id': 489,
     'month': datetime.datetime(2018, 6, 1, 0, 0),
     'year': datetime.datetime(2018, 1, 1, 0, 0)},
    {'gstin_code': 'C',
     'price_effective': 1199.0,
     'company_id': 489,
     'month': datetime.datetime(2018, 6, 1, 0, 0),
     'year': datetime.datetime(2018, 1, 1, 0, 0)}
]
The output expected is this -
{
    "2": {
        "2018": {
            "3": {
                "27AA": 1799
            },
            "4": {
                "03AA": 1299,
                "04AA": 1499,
                "05AA": 699,
                "06AA": 599,
                "07AA": 199,
                "09AA": 499,
                "19AA": 599,
                "23AA": 770,
                "27AA": 420,
                "27AA": 499
            },
            "5": {
                "03AA": 1399,
                "27AA": 399,
                "27AA": 640
            }
        }
    }
}
i.e. {company_id: {year: {month: {gstin_code: price_effective}}}}
Those values can also come through as None; company_id, year, month and gstin_code may all be None.
The solution which works is this -
for f_bag in data:
    if f_bag['company_id'] in result and f_bag['company_id'] is not None:
        if f_bag['year'].year in result[f_bag['company_id']] and f_bag['year'].year is not None:
            if f_bag['month'].month in result[f_bag['company_id']][f_bag['year'].year] and f_bag['month'].month is not None:
                if f_bag['gstin_code'] in result[f_bag['company_id']][f_bag['year'].year][f_bag['month'].month] and f_bag['gstin_code'] is not None:
                    pass
                else:
                    result[f_bag['company_id']][f_bag['year'].year][f_bag['month'].month][f_bag['gstin_code']] = f_bag['price_effective']
            else:
                result[f_bag['company_id']][f_bag['year'].year][f_bag['month'].month] = {}
        else:
            result[f_bag['company_id']][f_bag['year'].year] = {}
    else:
        result[f_bag['company_id']] = {}
Any elegant way to avoid these if else statements?
With collections.defaultdict you can remove the checks for whether a key exists in the dictionary:
from collections import defaultdict

def nested_dict():
    return defaultdict(nested_dict)

def toNested(data):
    nd = nested_dict()
    for aa in data:
        nd[aa['company_id']][aa['year'].year][aa['month'].month][aa['gstin_code']] = aa['price_effective']
    return nd

import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(toNested(data))
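If you need a plain dict at the end (e.g. for JSON serialization), or want to skip records whose grouping values are None (the question mentions these), a small sketch along these lines might help; the helper names are mine and it reuses the nested_dict helper from the answer above:

from collections import defaultdict

def to_plain_dict(d):
    # Recursively convert nested defaultdicts back into ordinary dicts.
    if isinstance(d, defaultdict):
        return {k: to_plain_dict(v) for k, v in d.items()}
    return d

def to_nested_clean(data):
    nd = nested_dict()
    for aa in data:
        # Skip rows where any of the grouping values is missing.
        if None in (aa['company_id'], aa['year'], aa['month'], aa['gstin_code']):
            continue
        nd[aa['company_id']][aa['year'].year][aa['month'].month][aa['gstin_code']] = aa['price_effective']
    return to_plain_dict(nd)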

parse data from Dictionary in python

So I have this list of dictionaries, "runs":
[{
'id': 12,
'suite_id': 2,
'name': 'name',
'description': "desc.",
'nice_id': 3,
'joku_id': None,
'onko': False,
'eikai': False,
'tehty': None,
'config': None,
'config_ids': [],
'passed_count': 1,
'blocked_count': 2,
'untested_count': 3,
'retest_count': 4,
'failed_count': 5,
'custom_status1_count': 0,
'custom_status2_count': 0,
'custom_status3_count': 0,
'custom_status4_count': 0,
'custom_status5_count': 0,
'custom_status6_count': 0,
'custom_status7_count': 0,
'projekti_id': 1,
'plan_id': None,
'created_on': 12343214,
'created_by': 11,
'url': 'google.com'
}, {
'id': 16,
'suite_id': 2,
'name': 'namae)',
'description': "desc1",
'nice_id': 5,
'joku_id': None,
'onko': False,
'eikai': False,
'tehty': None,
'config': None,
'config_ids': [],
'passed_count': 100,
'blocked_count': 1,
'untested_count': 3,
'retest_count': 2,
'failed_count': 5,
'custom_status1_count': 0,
'custom_status2_count': 0,
'custom_status3_count': 0,
'custom_status4_count': 0,
'custom_status5_count': 0,
'custom_status6_count': 0,
'custom_status7_count': 0,
'prokti_id': 7,
'plan_id': None,
'created_on': 4321341644,
'created_by': 11,
'url': 'google.com/2' }]
there is "id" for about 50 times. that is just a part of it.
i need to find all "id":s (Not joku_ids, ncie_ids etc. Only "id") and make a string/dict of them
and same for name, and description
I have tried:
j = json.load(run)
ids = (j["id"])
j = json.load(run)
names = (j["name"])
j = json.load(run)
descriptions = (j["description"])
but it returns:
AttributeError: 'list' object has no attribute 'read'
I also need to send a request with a specific id; in this case the specific id is marked by o, so id[o].
The request code is below:
test = client.send_get('get_tests/1/ ')
So I need to have id[o] instead of the 1.
I have tried:
test = client.send_get('get_tests/' + id[o] + '/ ')
but it returns:
TypeError: 'int' object is not subscriptable
Maybe this can help you:
id = []
for i in runs:
    id.append(i.get('id'))
[12, 16]
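To use one of those collected ids in the request (client.send_get and the index o come from the question), build the URL from the integer; the TypeError above most likely came from id being a single integer rather than a list, and plain string concatenation with an int would also fail, so format it explicitly:

# Assuming `id` is the list built above and `o` is a valid index into it.
test = client.send_get(f'get_tests/{id[o]}/')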
You are trying to pass a list to the json.load function. Please read the docs. load() does not accept lists; it accepts:
a .read()-supporting file-like object containing a JSON document
If you want your result as a list of dictionaries, then:
result = [{x: y} for i in range(len(data)) for x, y in data[i].items() if x in ('id', 'name', 'description')]
output:
[{'name': 'name'}, {'id': 12}, {'description': 'desc.'}, {'name': 'namae)'}, {'id': 16}, {'description': 'desc1'}]
Here data is your list of dictionaries.
Hope this answer is helpful for you.
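If instead you want one dictionary per run that carries all three fields together, a small sketch (not from the answers above; the variable name is mine) would be:

summary = [
    {'id': r['id'], 'name': r['name'], 'description': r['description']}
    for r in runs
]
# [{'id': 12, 'name': 'name', 'description': 'desc.'},
#  {'id': 16, 'name': 'namae)', 'description': 'desc1'}]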

limit() and sort() order pymongo and mongodb

Despite reading people's answers stating that the sort is done first, the evidence seems to show something different: that the limit is applied before the sort. Is there a way to force the sort to always run first?
views = mongo.db.view_logging.find().sort([('count', 1)]).limit(10)
Whether I use .sort().limit() or .limit().sort(), the limit takes precedence. I wonder if this has something to do with pymongo...
According to the documentation, regardless of which goes first in your chain of commands, sort() will always be applied before limit().
You can also study the .explain() results of your query and look at the execution stages - you will find that the sorting input stage examines all of the filtered documents (in your case, all documents in the collection) and only then is the limit applied.
Let's go through an example.
Imagine there is a foo database with a test collection having 6 documents:
>>> col = db.foo.test
>>> for doc in col.find():
... print(doc)
{'time': '2016-03-28 12:12:00', '_id': ObjectId('56f9716ce4b05e6b92be87f2'), 'value': 90}
{'time': '2016-03-28 12:13:00', '_id': ObjectId('56f971a3e4b05e6b92be87fc'), 'value': 82}
{'time': '2016-03-28 12:14:00', '_id': ObjectId('56f971afe4b05e6b92be87fd'), 'value': 75}
{'time': '2016-03-28 12:15:00', '_id': ObjectId('56f971b7e4b05e6b92be87ff'), 'value': 72}
{'time': '2016-03-28 12:16:00', '_id': ObjectId('56f971c0e4b05e6b92be8803'), 'value': 81}
{'time': '2016-03-28 12:17:00', '_id': ObjectId('56f971c8e4b05e6b92be8806'), 'value': 90}
Now, let's execute queries with different order of sort() and limit() and check the results and the explain plan.
Sort and then limit:
>>> from pprint import pprint
>>> cursor = col.find().sort([('time', 1)]).limit(3)
>>> sort_limit_plan = cursor.explain()
>>> pprint(sort_limit_plan)
{u'executionStats': {u'allPlansExecution': [],
u'executionStages': {u'advanced': 3,
u'executionTimeMillisEstimate': 0,
u'inputStage': {u'advanced': 6,
u'direction': u'forward',
u'docsExamined': 6,
u'executionTimeMillisEstimate': 0,
u'filter': {u'$and': []},
u'invalidates': 0,
u'isEOF': 1,
u'nReturned': 6,
u'needFetch': 0,
u'needTime': 1,
u'restoreState': 0,
u'saveState': 0,
u'stage': u'COLLSCAN',
u'works': 8},
u'invalidates': 0,
u'isEOF': 1,
u'limitAmount': 3,
u'memLimit': 33554432,
u'memUsage': 213,
u'nReturned': 3,
u'needFetch': 0,
u'needTime': 8,
u'restoreState': 0,
u'saveState': 0,
u'sortPattern': {u'time': 1},
u'stage': u'SORT',
u'works': 13},
u'executionSuccess': True,
u'executionTimeMillis': 0,
u'nReturned': 3,
u'totalDocsExamined': 6,
u'totalKeysExamined': 0},
u'queryPlanner': {u'indexFilterSet': False,
u'namespace': u'foo.test',
u'parsedQuery': {u'$and': []},
u'plannerVersion': 1,
u'rejectedPlans': [],
u'winningPlan': {u'inputStage': {u'direction': u'forward',
u'filter': {u'$and': []},
u'stage': u'COLLSCAN'},
u'limitAmount': 3,
u'sortPattern': {u'time': 1},
u'stage': u'SORT'}},
u'serverInfo': {u'gitVersion': u'6ce7cbe8c6b899552dadd907604559806aa2e9bd',
u'host': u'h008742.mongolab.com',
u'port': 53439,
u'version': u'3.0.7'}}
Limit and then sort:
>>> cursor = col.find().limit(3).sort([('time', 1)])
>>> limit_sort_plan = cursor.explain()
>>> pprint(limit_sort_plan)
{u'executionStats': {u'allPlansExecution': [],
u'executionStages': {u'advanced': 3,
u'executionTimeMillisEstimate': 0,
u'inputStage': {u'advanced': 6,
u'direction': u'forward',
u'docsExamined': 6,
u'executionTimeMillisEstimate': 0,
u'filter': {u'$and': []},
u'invalidates': 0,
u'isEOF': 1,
u'nReturned': 6,
u'needFetch': 0,
u'needTime': 1,
u'restoreState': 0,
u'saveState': 0,
u'stage': u'COLLSCAN',
u'works': 8},
u'invalidates': 0,
u'isEOF': 1,
u'limitAmount': 3,
u'memLimit': 33554432,
u'memUsage': 213,
u'nReturned': 3,
u'needFetch': 0,
u'needTime': 8,
u'restoreState': 0,
u'saveState': 0,
u'sortPattern': {u'time': 1},
u'stage': u'SORT',
u'works': 13},
u'executionSuccess': True,
u'executionTimeMillis': 0,
u'nReturned': 3,
u'totalDocsExamined': 6,
u'totalKeysExamined': 0},
u'queryPlanner': {u'indexFilterSet': False,
u'namespace': u'foo.test',
u'parsedQuery': {u'$and': []},
u'plannerVersion': 1,
u'rejectedPlans': [],
u'winningPlan': {u'inputStage': {u'direction': u'forward',
u'filter': {u'$and': []},
u'stage': u'COLLSCAN'},
u'limitAmount': 3,
u'sortPattern': {u'time': 1},
u'stage': u'SORT'}},
u'serverInfo': {u'gitVersion': u'6ce7cbe8c6b899552dadd907604559806aa2e9bd',
u'host': u'h008742.mongolab.com',
u'port': 53439,
u'version': u'3.0.7'}}
As you can see, in both cases the sort is applied first and affects all 6 documents, and then the limit restricts the results to 3.
And, the execution plans are exactly the same:
>>> from copy import deepcopy # just in case
>>> cursor = col.find().sort([('time', 1)]).limit(3)
>>> sort_limit_plan = deepcopy(cursor.explain())
>>> cursor = col.find().limit(3).sort([('time', 1)])
>>> limit_sort_plan = deepcopy(cursor.explain())
>>> sort_limit_plan == limit_sort_plan
True
Also see:
How do you tell Mongo to sort a collection before limiting the results?
The mongodb documentation states that the skip() method controls the starting point of the result set, followed by sort(), and ending with the limit() method.
This is regardless of the order in your code. The reason is that mongo collects all the methods for the query, orders the skip-sort-limit methods in that exact order, and then runs the query.
I suspect you're passing the wrong key in the sort parameter, something like "$key_name" instead of just "key_name".
Refer to How do you tell Mongo to sort a collection before limiting the results? - a solution for the same problem as yours.
Logically it should be whichever comes first in the pipeline, but MongoDB always sorts before limiting.
In my test the sort operation does take precedence regardless of whether it comes before the limit or after. However, it appears to be very strange behavior to me.
My sample dataset is:
[
{
"_id" : ObjectId("56f845fea524b4d098e0ef81"),
"number" : 48.98052410874508
},
{
"_id" : ObjectId("56f845fea524b4d098e0ef82"),
"number" : 50.98747461471063
},
{
"_id" : ObjectId("56f845fea524b4d098e0ef83"),
"number" : 81.32911244349772
},
{
"_id" : ObjectId("56f845fea524b4d098e0ef84"),
"number" : 87.95549919039071
},
{
"_id" : ObjectId("56f845fea524b4d098e0ef85"),
"number" : 81.63582683594402
},
{
"_id" : ObjectId("56f845fea524b4d098e0ef86"),
"number" : 43.25696270026136
},
{
"_id" : ObjectId("56f845fea524b4d098e0ef87"),
"number" : 88.22046335409453
},
{
"_id" : ObjectId("56f845fea524b4d098e0ef88"),
"number" : 64.00556739160076
},
{
"_id" : ObjectId("56f845fea524b4d098e0ef89"),
"number" : 16.09353150244296
},
{
"_id" : ObjectId("56f845fea524b4d098e0ef8a"),
"number" : 17.46667776660574
}
]
Python test code:
import pymongo
client = pymongo.MongoClient("mongodb://localhost:27017")
database = client.get_database("test")
collection = database.get_collection("collection")
print("----------------[limit -> sort]--------------------------")
result = collection.find().limit(5).sort([("number", pymongo.ASCENDING)])
for r in result:
print(r)
print("----------------[sort -> limit]--------------------------")
result = collection.find().sort([("number", pymongo.ASCENDING)]).limit(5)
for r in result:
print(r)
Result:
----------------[limit -> sort]--------------------------
{u'_id': ObjectId('56f845fea524b4d098e0ef89'), u'number': 16.09353150244296}
{u'_id': ObjectId('56f845fea524b4d098e0ef8a'), u'number': 17.46667776660574}
{u'_id': ObjectId('56f845fea524b4d098e0ef86'), u'number': 43.25696270026136}
{u'_id': ObjectId('56f845fea524b4d098e0ef81'), u'number': 48.98052410874508}
{u'_id': ObjectId('56f845fea524b4d098e0ef82'), u'number': 50.98747461471063}
----------------[sort -> limit]--------------------------
{u'_id': ObjectId('56f845fea524b4d098e0ef89'), u'number': 16.09353150244296}
{u'_id': ObjectId('56f845fea524b4d098e0ef8a'), u'number': 17.46667776660574}
{u'_id': ObjectId('56f845fea524b4d098e0ef86'), u'number': 43.25696270026136}
{u'_id': ObjectId('56f845fea524b4d098e0ef81'), u'number': 48.98052410874508}
{u'_id': ObjectId('56f845fea524b4d098e0ef82'), u'number': 50.98747461471063}
The accepted answer didn't work for me, but this does:
last5 = db.collection.find( {'key': "YOURKEY"}, sort=[( '_id', pymongo.DESCENDING )] ).limit(5)
with the limit outside and the sort inside of the find argument.
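Note that this sort-before-limit behavior applies to cursor modifiers on find(); in an aggregation pipeline the stage order is honored, so a $limit placed before a $sort really does truncate the input first. A small sketch (the collection and field names follow the example above):

import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017")
collection = client["test"]["collection"]

# Stages run in the order given: the first 5 documents (in natural order)
# are taken, and only those 5 are then sorted.
limit_then_sort = collection.aggregate([
    {"$limit": 5},
    {"$sort": {"number": pymongo.ASCENDING}},
])

# Reversing the stages sorts the whole collection first, then keeps the
# 5 smallest values of "number" - usually what you actually want.
sort_then_limit = collection.aggregate([
    {"$sort": {"number": pymongo.ASCENDING}},
    {"$limit": 5},
])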
