parse data using regex and converting it into tuple - python

I need to find a city with the highest population using regex, data is presented in such way:
data = ["id,name,poppulation,is_capital",
"3024,eu_kyiv,24834,y",
"3025,eu_volynia,20231,n",
"3026,eu_galych,23745,n",
"4892,me_medina,18038,n",
"4401,af_cairo,18946,y",
"4700,me_tabriz,13421,n",
"4899,me_bagdad,22723,y",
"6600,af_zulu,09720,n"]
I've done this so far:
def max_population(data):
lst = []
for items in data:
a = re.findall(r',\S+_\S+,[0-9]+', items)
lst += [[b for b in i.split(',') if b] for i in a]
return max(lst, key=lambda x:int(x[1]))
But function should return (str, int) tuple, is it possible to change my code in a way that it will return tuple without iterating list once again?

All your strings are separated by a comma. You could get the max value using split and check if the third value is a digit and is greater than the first value of the tuple.
If it is, set it as the new highest value.
def max_population(data):
result = None
for s in data:
parts = s.split(",")
if not parts[2].isdigit():
continue
tup = (parts[1], int(parts[2]))
if result is None or tup[1] > result[1]:
result = tup
return result
print(max_population(items))
Output
('eu_kyiv', 24834)
Python demo

The following long line get the wanted result (str, int) tuple:
def max_population(data):
p=max([(re.findall(r"(\w*),\d*,\w$",i)[0],int(re.findall(r"(\d*),\w$",i)[0])) for n,i in enumerate(data) if n>0],key=lambda x:int(x[1]) )
return p
in this line,enumerate(data) and n>0 were used to skip the header "id,name,poppulation,is_capital". But if data has no-header the, line would be:
def max_population(data):
p=max([(re.findall(r"(\w*),\d*,\w$",i)[0],int(re.findall(r"(\d*),\w$",i)[0])) for i in data],key=lambda x:int(x[1]) )
return p
The result for both is ('eu_kyiv', 24834)

Create a list of tuples instead of a list of lists.
import re
data = ["id,name,poppulation,is_capital",
"3024,eu_kyiv,24834,y",
"3025,eu_volynia,20231,n",
"3026,eu_galych,23745,n",
"4892,me_medina,18038,n",
"4401,af_cairo,18946,y",
"4700,me_tabriz,13421,n",
"4899,me_bagdad,22723,y",
"6600,af_zulu,09720,n"]
def max_population(data):
lst = []
for items in data:
a = re.findall(r',\S+_\S+,[0-9]+', items)
lst += [tuple(b for b in i.split(',') if b) for i in a]
return max(lst, key=lambda x:int(x[1]))
print(max_population(data))

You could create a mapping function to map the types to the data and use the operator.itemgetter function as your key in max:
from operator import itemgetter
def f(row):
# Use a tuple of types to cast str to the desired type
types = (str, int)
# slice here to get the city and population values
return tuple(t(val) for t, val in zip(types, row.split(',')[1:3]))
# Have max consume a map on the data excluding the
# header row (hence the slice)
max(map(f, data[1:]), key=itemgetter(1))
('eu_kyiv', 24834)

Related

sort array based on custom alphabet order python

I want to write a code to sort arr based on customAl order, and not use sorted function.
customAl = [dshbanfmg]
arr = [bba,abb,baa,mggfba,mffgh......]
psudo code:
def sortCA(arr, customAl):
dt = {}
generate dt order based on customAl
look up and sort arr
return result
newArr = [bba,baa,abb,mffgh,mggfba......]
I know there's a similiar question but the answer is wrapped in sorted function which I don't wish to use. anyone has a better solution than unsorted, or dictionary which takes space?
Sorting string values according to a custom alphabet in Python
In my opinion, programming is a trade-off, it depends on which part you care most.
Specifically, in this scenario, you can choose to trade time for space by str.index, or you can trade space for time with an extra index dict:
customAl = 'dshbanfmg'
arr = ['bba', 'abb', 'baa', 'mggfba', 'mffgh']
# trade time for space
# no extra space but, but O(n) to index
def sortCA1(arr, customAl):
return sorted(arr, key=lambda x: [customAl.index(c) for c in x])
# trade space for time
# extra space O(n), but O(1) to index
def sortCA2(arr, customAl):
dt = {c: i for i, c in enumerate(customAl)}
return sorted(arr, key=lambda x: [dt[c] for c in x])
# output: ['bba', 'baa', 'abb', 'mffgh', 'mggfba']
Here is a version which not use sorted function, we can use a bucket based on custom alphabet order. split the arr by 1st char, if one bucket has multiple elements then split by 2nd char recursively...kind of radix sort:
one thing to mention, the length is different, so we should add a bucket to record none index str.
def sortCA3(arr, customAl):
dt = {c: i + 1 for i, c in enumerate(customAl)} # keep 0 for none bucket
def bucket_sort(arr, start):
new_arr = []
buckets = [[] for _ in range(len(customAl) + 1)]
for s in arr:
if start < len(s):
buckets[dt[s[start]]].append(s)
else:
buckets[0].append(s)
for bucket in buckets:
if len(bucket) == 1:
new_arr += bucket
elif len(bucket) > 1:
new_arr += bucket_sort(bucket, start+1)
return new_arr
return bucket_sort(arr, 0)
test and output
customAl = 'dshbanfmg'
arr = ['bba', 'bb', 'abb', 'baa', 'mggfba', 'mffgh'] # add `bb` for test
print(sortCA4(arr, customAl))

Given a linear order completely represented by a list of tuples of strings, output the order as a list of strings

Given pairs of items of form [(a,b),...] where (a,b) means a > b, for example:
[('best','better'),('best','good'),('better','good')]
I would like to output a list of form:
['best','better','good']
This is very hard for some reason. Any thoughts?
======================== code =============================
I know why it doesn't work.
def to_rank(raw):
rank = []
for u,v in raw:
if u in rank and v in rank:
pass
elif u not in rank and v not in rank:
rank = insert_front (u,v,rank)
rank = insert_behind(v,u,rank)
elif u in rank and v not in rank:
rank = insert_behind(v,u,rank)
elif u not in rank and v in rank:
rank = insert_front(u,v,rank)
return [[r] for r in rank]
# #Use: insert word u infront of word v in list of words
def insert_front(u,v,words):
if words == []: return [u]
else:
head = words[0]
tail = words[1:]
if head == v: return [u] + words
else : return ([head] + insert_front(u,v,tail))
# #Use: insert word u behind word v in list of words
def insert_behind(u,v,words):
words.reverse()
words = insert_front(u,v,words)
words.reverse()
return words
=================== Update ===================
Per suggestion of many, this is a straight forward topological sort setting, I ultimately decided to use the code from this source: algocoding.wordpress.com/2015/04/05/topological-sorting-python/
which solved my problem.
def go_topsort(graph):
in_degree = { u : 0 for u in graph } # determine in-degree
for u in graph: # of each node
for v in graph[u]:
in_degree[v] += 1
Q = deque() # collect nodes with zero in-degree
for u in in_degree:
if in_degree[u] == 0:
Q.appendleft(u)
L = [] # list for order of nodes
while Q:
u = Q.pop() # choose node of zero in-degree
L.append(u) # and 'remove' it from graph
for v in graph[u]:
in_degree[v] -= 1
if in_degree[v] == 0:
Q.appendleft(v)
if len(L) == len(graph):
return L
else: # if there is a cycle,
return []
RockBilly's solution also work in my case, because in my setting, for every v < u, we are guaranteed to have a pair (u,v) in our list. So his answer is not very "computer-sciency", but it gets the job done in this case.
If you have a complete grammar specified then you can simply count up the items:
>>> import itertools as it
>>> from collections import Counter
>>> ranks = [('best','better'),('best','good'),('better','good')]
>>> c = Counter(x for x, y in ranks)
>>> sorted(set(it.chain(*ranks)), key=c.__getitem__, reverse=True)
['best', 'better', 'good']
If you have an incomplete grammar then you can build a graph and dfs all paths to find the longest. This isn't very inefficient, as I haven't thought about that yet :):
def dfs(graph, start, end):
stack = [[start]]
while stack:
path = stack.pop()
if path[-1] == end:
yield path
continue
for next_state in graph.get(path[-1], []):
if next_state in path:
continue
stack.append(path+[next_state])
def paths(ranks):
graph = {}
for n, m in ranks:
graph.setdefault(n,[]).append(m)
for start, end in it.product(set(it.chain(*ranks)), repeat=2):
yield from dfs(graph, start, end)
>>> ranks = [('black', 'dark'), ('black', 'dim'), ('black', 'gloomy'), ('dark', 'gloomy'), ('dim', 'dark'), ('dim', 'gloomy')]
>>> max(paths(ranks), key=len)
['black', 'dim', 'dark', 'gloomy']
>>> ranks = [('a','c'), ('b','a'),('b','c'), ('d','a'), ('d','b'), ('d','c')]
>>> max(paths(ranks), key=len)
['d', 'b', 'a', 'c']
What you're looking for is topological sort. You can do this in linear time using depth-first search (pseudocode included in the wiki I linked)
Here is one way. It is based on using the complete pairwise rankings to make an old-style (early Python 2) cmp function and then using functools.cmp_to_key to convert it to a key suitable for the Python 3 approach to sorting:
import functools
def sortByRankings(rankings):
def cmp(x,y):
if x == y:
return 0
elif (x,y) in rankings:
return -1
else:
return 1
items = list({x for y in rankings for x in y})
items.sort(key = functools.cmp_to_key(cmp))
return items
Tested like:
ranks = [('a','c'), ('b','a'),('b','c'), ('d','a'), ('d','b'), ('d','c')]
print(sortByRankings(ranks)) #prints ['d', 'b', 'a', 'c']
Note that to work correctly, the parameter rankings must contain an entry for each pair of distinct items. If it doesn't, you would first need to compute the transitive closure of the pairs that you do have before you feed it to this function.
You can take advantage of the fact that the lowest ranked item in the list will never appear at the start of any tuple. You can extract this lowest item, then remove all elements which contain this lowest item from your list, and repeat to get the next lowest.
This should work even if you have redundant elements, or have a sparser list than some of the examples here. I've broken it up into finding the lowest ranked item, and then the grunt work of using this to create a final ranking.
from copy import copy
def find_lowest_item(s):
#Iterate over set of all items
for item in set([item for sublist in s for item in sublist]):
#If an item does not appear at the start of any tuple, return it
if item not in [x[0] for x in s]:
return item
def sort_by_comparison(s):
final_list = []
#Make a copy so we don't mutate original list
new_s = copy(s)
#Get the set of all items
item_set = set([item for sublist in s for item in sublist])
for i in range(len(item_set)):
lowest = find_lowest_item(new_s)
if lowest is not None:
final_list.insert(0, lowest)
#For the highest ranked item, we just compare our current
#ranked list with the full set of items
else:
final_list.insert(0,set(item_set).difference(set(final_list)).pop())
#Update list of ranking tuples to remove processed items
new_s = [x for x in new_s if lowest not in x]
return final_list
list_to_compare = [('black', 'dark'), ('black', 'dim'), ('black', 'gloomy'), ('dark', 'gloomy'), ('dim', 'dark'), ('dim', 'gloomy')]
sort_by_comparison(list_to_compare)
['black', 'dim', 'dark', 'gloomy']
list2 = [('best','better'),('best','good'),('better','good')]
sort_by_comparison(list2)
['best', 'better', 'good']
list3 = [('best','better'),('better','good')]
sort_by_comparison(list3)
['best', 'better', 'good']
If you do sorting or create a dictionary from the list items, you are going to miss the order as #Rockybilly mentioned in his answer. I suggest you to create a list from the tuples of the original list and then remove duplicates.
def remove_duplicates(seq):
seen = set()
seen_add = seen.add
return [x for x in seq if not (x in seen or seen_add(x))]
i = [(5,2),(1,3),(1,4),(2,3),(2,4),(3,4)]
i = remove_duplicates(list(x for s in i for x in s))
print(i) # prints [5, 2, 1, 3, 4]
j = [('excellent','good'),('excellent','great'),('great','good')]
j = remove_duplicates(list(x for s in j for x in s))
print(j) # prints ['excellent', 'good', 'great']
See reference: How do you remove duplicates from a list in whilst preserving order?
For explanation on the remove_duplicates() function, see this stackoverflow post.
If the list is complete, meaning has enough information to do the ranking(Also no duplicate or redundant inputs), this will work.
from collections import defaultdict
lst = [('best','better'),('best','good'),('better','good')]
d = defaultdict(int)
for tup in lst:
d[tup[0]] += 1
d[tup[1]] += 0 # To create it in defaultdict
print sorted(d, key = lambda x: d[x], reverse=True)
# ['best', 'better', 'good']
Just give them points, increment the left one each time you encounter it in the list.
Edit: I do think the OP has a determined type of input. Always have tuple count of combination nCr(n, 2). Which makes this a correct solution. No need to complain about the edge cases, which I already knew posting the answer(and mentioned it).

Python Tulpe Key For Dict Partial Lookup

I have a dictionary with a tuple of 5 values as a key. For example:
D[i,j,k,g,h] = value.
Now i need to process all elements with a certain partial key pair (i1,g1):
I need now for each pair (i1,g1) all values that have i == i1 and g == g1 in the full key.
What is an pythonic and efficient way to retrieve this, knowing that i need the elements for all pairs and each full key belongs to exactly one partial key?
Is there a more appropriate data structure than dictionaries?
One reference implementation is this:
results = {}
for i in I:
for g in G:
results[i,g] = []
for i,j,k,g,h in D:
if i1 == i and g1 == g:
results[i,g].append(D[i,j,k,g,h])
Assuming you know all the valid values for the different indices you can get all possible keys using itertools.product:
import itertools
I = [3,6,9]
J = range(10)
K = "abcde"
G = ["first","second"]
H = range(10,20)
for tup in itertools.product(I,J,K,G,H):
my_dict[tup] = 0
To restrict the indices generated just put a limit on one / several of the indices that gets generated, for instance all of the keys where i = 6 would be:
itertools.product((6,), J,K,G,H)
A function to let you specify you want all the indices where i==6 and g =="first" would look like this:
def partial_indices(i_vals=I, j_vals=J, k_vals=K, g_vals = G, h_vals = H):
return itertools.product(i_vals, j_vals, k_vals, g_vals, h_vals)
partial_indices(i_vals=(6,), g_vals=("first",))
Or assuming that not all of these are present in the dictionary you can also pass the dictionary as an argument and check for membership before generating the keys:
def items_with_partial_indices(d, i_vals=I, j_vals=J, k_vals=K, g_vals = G, h_vals = H):
for tup in itertools.product(i_vals, j_vals, k_vals, g_vals, h_vals):
try:
yield tup, d[tup]
except KeyError:
pass
for k,v in D.iteritems():
if i in k and p in k:
print v

In Python, How can I get the next and previous key:value of a particular key in a dictionary?

Okay, so this is a little hard to explain, but here goes:
I have a dictionary, which I'm adding content to. The content is a hashed username (key) with an IP address (value).
I was putting the hashes into an order by running them against base 16, and then using Collection.orderedDict.
So, the dictionary looked a little like this:
d = {'1234': '8.8.8.8', '2345':'0.0.0.0', '3213':'4.4.4.4', '4523':'1.1.1.1', '7654':'1.3.3.7', '9999':'127.0.0.1'}
What I needed was a mechanism that would allow me to pick one of those keys, and get the key/value item one higher and one lower. So, for example, If I were to pick 2345, the code would return the key:value combinations '1234:8.8.8.8' and '3213:4.4.4.4'
So, something like:
for i in d:
while i < len(d)
if i == '2345':
print i.nextItem
print i.previousItem
break()
Edit: OP now states that they are using OrderedDicts but the use case still requires this sort of approach.
Since dicts are not ordered you cannot directly do this. From your example, you are trying to reference the item like you would use a linked list.
A quick solution would be instead to extract the keys and sort them then iterate over that list:
keyList=sorted(d.keys())
for i,v in enumerate(keyList):
if v=='eeee':
print d[keyList[i+1]]
print d[keyList[i-1]]
The keyList holds the order of your items and you have to go back to it to find out what the next/previous key is to get the next/previous value. You also have to check for i+1 being greater than the list length and i-1 being less than 0.
You can use an OrderedDict similarly but I believe that you still have to do the above with a separate list as OrderedDict doesn't have next/prev methods.
As seen in the OrderedDict source code,
if you have a key and you want to find the next and prev in O(1) here's how you do that.
>>> from collections import OrderedDict
>>> d = OrderedDict([('aaaa', 'a',), ('bbbb', 'b'), ('cccc', 'c'), ('dddd', 'd'), ('eeee', 'e'), ('ffff', 'f')])
>>> i = 'eeee'
>>> link_prev, link_next, key = d._OrderedDict__map['eeee']
>>> print 'nextKey: ', link_next[2], 'prevKey: ', link_prev[2]
nextKey: ffff prevKey: dddd
This will give you next and prev by insertion order. If you add items in random order then just keep track of your items in sorted order.
You could also use the list.index() method.
This function is more generic (you can check positions +n and -n), it will catch attempts at searching a key that's not in the dict, and it will also return None if there's nothing before of after the key:
def keyshift(dictionary, key, diff):
if key in dictionary:
token = object()
keys = [token]*(diff*-1) + sorted(dictionary) + [token]*diff
newkey = keys[keys.index(key)+diff]
if newkey is token:
print None
else:
print {newkey: dictionary[newkey]}
else:
print 'Key not found'
keyshift(d, 'bbbb', -1)
keyshift(d, 'eeee', +1)
Try:
pos = 0
d = {'aaaa': 'a', 'bbbb':'b', 'cccc':'c', 'dddd':'d', 'eeee':'e', 'ffff':'f'}
for i in d:
pos+=1
if i == 'eeee':
listForm = list(d.values())
print(listForm[pos-1])
print(listForm[pos+1])
As in #AdamKerz's answer enumerate seems pythonic, but if you are a beginner this code might help you understand it in an easy way.
And I think its faster + smaller compared to sorting followed by building list & then enumerating
You could use a generic function, based on iterators, to get a moving window (taken from this question):
import itertools
def window(iterable, n=3):
it = iter(iterable)
result = tuple(itertools.islice(it, n))
if len(result) == n:
yield result
for element in it:
result = result[1:] + (element,)
yield result
l = range(8)
for i in window(l, 3):
print i
Using the above function with OrderedDict.items() will give you three (key, value) pairs, in order:
d = collections.OrderedDict(...)
for p_item, item, n_item in window(d.items()):
p_key, p_value = p_item
key, value = item
# Or, if you don't care about the next value:
n_key, _ = n_item
Of course using this function the first and last values will never be in the middle position (although this should not be difficult to do with some adaptation).
I think the biggest advantage is that it does not require table lookups in the previous and next keys, and also that it is generic and works with any iterable.
Maybe it is an overkill, but you can keep Track of the Keys inserted with a Helper Class and according to that list, you can retrieve the Key for Previous or Next. Just don't forget to check for border conditions, if the objects is already first or last element. This way, you will not need to always resort the ordered list or search for the element.
from collections import OrderedDict
class Helper(object):
"""Helper Class for Keeping track of Insert Order"""
def __init__(self, arg):
super(Helper, self).__init__()
dictContainer = dict()
ordering = list()
#staticmethod
def addItem(dictItem):
for key,value in dictItem.iteritems():
print key,value
Helper.ordering.append(key)
Helper.dictContainer[key] = value
#staticmethod
def getPrevious(key):
index = (Helper.ordering.index(key)-1)
return Helper.dictContainer[Helper.ordering[index]]
#Your unordered dictionary
d = {'aaaa': 'a', 'bbbb':'b', 'cccc':'c', 'dddd':'d', 'eeee':'e', 'ffff':'f'}
#Create Order over keys
ordered = OrderedDict(sorted(d.items(), key=lambda t: t[0]))
#Push your ordered list to your Helper class
Helper.addItem(ordered)
#Get Previous of
print Helper.getPrevious('eeee')
>>> d
You can store the keys and values in temp variable in prior, and can access previous and next key,value pair using index.
It is pretty dynamic, will work for any key you query. Please check this code :
d = {'1234': '8.8.8.8', '2345':'0.0.0.0', '3213':'4.4.4.4', '4523':'1.1.1.1', '7654':'1.3.3.7', '9999':'127.0.0.1'}
ch = raw_input('Pleasure Enter your choice : ')
keys = d.keys()
values = d.values()
#print keys, values
for k,v in d.iteritems():
if k == ch:
ind = d.keys().index(k)
print keys[ind-1], ':',values[ind-1]
print keys[ind+1], ':',values[ind+1]
I think this is a nice Pythonic way of resolving your problem using a lambda and list comprehension, although it may not be optimal in execution time:
import collections
x = collections.OrderedDict([('a','v1'),('b','v2'),('c','v3'),('d','v4')])
previousItem = lambda currentKey, thisOrderedDict : [
list( thisOrderedDict.items() )[ z - 1 ] if (z != 0) else None
for z in range( len( thisOrderedDict.items() ) )
if (list( thisOrderedDict.keys() )[ z ] == currentKey) ][ 0 ]
nextItem = lambda currentKey, thisOrderedDict : [
list( thisOrderedDict.items() )[ z + 1 ] if (z != (len( thisOrderedDict.items() ) - 1)) else None
for z in range( len( thisOrderedDict.items() ) )
if (list( thisOrderedDict.keys() )[ z ] == currentKey) ][ 0 ]
assert previousItem('c', x) == ('b', 'v2')
assert nextItem('c', x) == ('d', 'v4')
assert previousItem('a', x) is None
assert nextItem('d',x) is None
Another way that seems simple and straight forward: this function returns the key which is offset positions away from k
def get_shifted_key(d:dict, k:str, offset:int) -> str:
l = list(d.keys())
if k in l:
i = l.index(k) + offset
if 0 <= i < len(l):
return l[i]
return None
i know how to get next key:value of a particular key in a dictionary:
flag = 0
for k, v in dic.items():
if flag == 0:
code...
flag += 1
continue
code...{next key and value in for}
if correct :
d = { "a": 1, "b":2, "c":3 }
l = list( d.keys() ) # make a list of the keys
k = "b" # the actual key
i = l.index( k ) # get index of the actual key
for the next :
i = i+1 if i+1 < len( l ) else 0 # select next index or restart 0
n = l [ i ]
d [ n ]
for the previous :
i = i-1 if i-1 >= 0 else len( l ) -1 # select previous index or go end
p = l [ i ]
d [ p ]

How to join array based on position and datatype in Python?

I have a few arrays containing integer and strings. For example:
myarray1 = [1,2,3,"ab","cd",4]
myarray2 = [1,"a",2,3,"bc","cd","e",4]
I'm trying to combine only the strings in an array that are next to each other. So I want the result to be:
newarray1= [1,2,3,"abcd",4]
newarray2= [1,"a",2,3,"bccde",4]
Does anyone know how to do this? Thank you!
The groupby breaks the list up into runs of strings and runs of integers. The ternary operation joins the groups of strings and puts them into a temporary sequence. The chain re-joins the strings and the runs of integers.
from itertools import groupby, chain
def joinstrings(iterable):
return list(chain.from_iterable(
(''.join(group),) if key else group
for key, group in
groupby(iterable, key=lambda elem: isinstance(elem, basestring))))
>>> myarray1 = [1,2,3,"ab","cd",4]
>>> newarray1 = [myarray1[0]]
>>> for item in myarray1[1:]:
... if isinstance(item, str) and isinstance(newarray1[-1], str):
... newarray1[-1] = newarray1[-1] + item
... else:
... newarray1.append(item)
>>> newarray1
[1, 2, 3, 'abcd', 4]
reduce(lambda x, (tp, it): tp and x + ["".join(it)] or x+list(it), itertools.groupby( myarray1, lambda x: isinstance(x, basestring) ), [])
a = [1,2,3,"ab","cd",4]
b = [1,a,2,3,"bc","cd","e",4]
def func(a):
ret = []
s = ""
for x in a:
if isinstance(x, basestring):
s = s + x
else:
if s:
ret.append(s)
s = ""
ret.append(x)
return ret
print func(a)
print func(b)

Categories