How do I merge a dictionary based on a condition? - python

Say I have the following list of dictionaries:
x = [{
'218': {
'text': 'profit',
'start': 0,
'end': 21
}
}, {
'312': {
'text': 'for',
'start': 30,
'end': 60
}
}, {
'350': {
'text': 'year',
'start': 70,
'end': 85
}
}, {
'370': {
'text': 'next column',
'start': 120,
'end': 130
}
}, {
'385': {
'text': 'next_column',
'start': 160,
'end': 169
}
}]
I want to merge some of the dictionaries. The condition is: whenever the end of one dict and the start of the next dict differ by less than 20, I need to merge those dicts and concatenate their text.
The output should look like this:
x_new = [{
'218,312,350': {
'text': 'profit for year',
'start': 0,
'end': 85
}
}, {
'370': {
'text': 'next column',
'start': 120,
'end': 130
}
}, {
'385': {
'text': 'next_column',
'start': 160,
'end': 169
}
}]
I have already solved it with the basic approach, but it does not look good, is there any solution using itertools or something like that?
What I have tried:
# NOTE: Python 2 only -- dict.values()/dict.keys() are indexed directly and
# print is used as a statement.
# Order the single-entry dicts by their (only) inner dict.
x_updated=sorted(x, key=lambda x: x.values()[0])
# final_merge collects lists of keys; each inner list is one group to merge.
final_merge=[]
# merge holds the keys of the group currently being built.
merge=[]
# Walk over adjacent pairs (first, second).
for first, second in zip(x_updated, x_updated[1:]):
# NOTE(review): the threshold here is 25, while the description above asks
# for a difference of less than 20.
if abs(second.values()[0]['start']-first.values()[0]['end'])<25:
print "its belong to the same column"
merge=merge+[first.keys()[0]]
else:
# Gap too large: `first` closes the current group.
merge=merge+[first.keys()[0]]
final_merge=final_merge+[merge]
merge=[]
# NOTE(review): indentation was lost in this listing; according to the text
# below, the last element is added manually after the loop, so the next two
# statements presumably run once the loop has finished -- confirm.
merge=merge+[second.keys()[0]]
final_merge=final_merge+[merge]
Once I have final_merge, which tells me which values to merge, it is easy to combine them. But is there a simpler way to write the above code? Also, after the loop I manually added the last dict, because in my situation the last one is always a different column; but what if it belongs to the same column as the previous one?

This is what I would do:
first I would make some helper functions:
def merge(d1, d2):
    """Merge two single-entry dictionaries into one single-entry dictionary.

    Each argument maps one id string to an inner dict with 'text', 'start'
    and 'end' keys. Only the first entry of each argument is used.

    Returns:
        A new dict whose key is the two ids joined with ',' and whose value
        holds the two texts joined with a space, the 'start' of d1 and the
        'end' of d2. Neither argument is modified.
    """
    # Unpack each entry once instead of rebuilding list(d.values()) per field.
    key1, inner1 = next(iter(d1.items()))
    key2, inner2 = next(iter(d2.items()))
    return {
        ",".join([key1, key2]): {
            'text': " ".join([inner1['text'], inner2['text']]),
            'start': inner1['start'],
            'end': inner2['end'],
        }
    }
def should_merge(d1, d2, threshold=20):
    """Tell whether two neighbouring single-entry dictionaries should merge.

    Args:
        d1: the earlier entry, or None.
        d2: the later entry, or None.
        threshold: the entries merge when the absolute difference between
            the 'end' of d1 and the 'start' of d2 is strictly below this
            value. Defaults to 20.

    Returns:
        False if either argument is None, otherwise the result of the
        distance check.
    """
    if d1 is None or d2 is None:
        return False
    end_of_first = next(iter(d1.values()))['end']
    start_of_second = next(iter(d2.values()))['start']
    return abs(end_of_first - start_of_second) < threshold
The first function merges two dictionaries
The second returns True if two dictionaries should merge.
All that's left is the actual merge function:
from itertools import zip_longest
def merged_dicts(x):
    """Merge runs of adjacent entries of *x* that should_merge() accepts.

    Walks over x as pairs (x[i], x[i + 1]); the last entry is paired with
    None. Consecutive entries accepted by should_merge() are folded into a
    single entry with merge(); all other entries are kept as they are.

    The merged list is printed and also returned, so the result can be used
    by the caller.
    """
    actual_merge = []
    # True while d1 has already been folded into actual_merge[-1].
    last_merged = False
    for d1, d2 in zip_longest(x, x[1:], fillvalue=None):
        if should_merge(d1, d2):
            # Extend the open group if there is one, otherwise start one at d1.
            left = actual_merge.pop() if last_merged else d1
            actual_merge.append(merge(left, d2))
            last_merged = True
        else:
            # d1 closes a group (already stored) or stands on its own.
            if not last_merged:
                actual_merge.append(d1)
            last_merged = False
    print(actual_merge)
    return actual_merge
That is a little more readable, though it doesn't use any "fancy" itertools functions.
I would also consider changing the id of the dict to be inside the inner dict:
d= {'id': '385',
'text': 'next_column',
'start': 160,
'end': 169
}
That is a little less complicated and cleaner.

I would create a class for these objects you use:
class my_dict:
    """One text fragment with an id and a start/end position."""

    def __init__(self, id, text, start, end):
        self.id = id
        self.text = text
        self.start = start
        self.end = end

    def merge(self, other):
        """Fold *other* into this object in place.

        The ids are joined with ',', the texts with a space, and the end
        position is taken from *other*; the start position is kept.
        """
        self.id = "{},{}".format(self.id, other.id)
        self.text = "{} {}".format(self.text, other.text)
        self.end = other.end
And then the main code loop will be:
# NOTE(review): presumably x is a list of my_dict objects here, not the list
# of plain dicts from the question -- confirm.
# Seed the result with the first object, if any (the slice is empty for an
# empty x, so no IndexError is raised).
x_new = list(x[:1])
for obj in x[1:]:
    last = x_new[-1]
    if obj.start - last.end >= 20:
        # A gap of 20 or more starts a new group.
        x_new.append(obj)
    else:
        # A gap below 20: fold obj into the current group (changes `last`
        # in place).
        last.merge(obj)

Try this:
x=[{'218':{'text':'profit','start':0,'end':21}},
{'312':{'text':'for','start':30,'end':60}},
{'350':{'text':'year','start':70,'end':85}},
{'370':{'text':'next column','start':120,'end':130}},
{'385':{'text':'next_column','start':160,'end':169}}]
x_new = []
d_keys = []
first_start_value = 0
def merge_dict(d_keys, x, i, first_start_value, current_index_dict_key):
    """Build the merged single-entry dict for a finished group of entries.

    Args:
        d_keys: keys of the entries that belong to the group; may contain
            duplicates.
        x: the full list of single-entry dicts the keys were taken from.
        i: index in x of the last entry of the group.
        first_start_value: 'start' value to give to the merged entry.
        current_index_dict_key: key of x[i].

    Returns:
        A single-entry dict. Its key is the group's keys, de-duplicated,
        sorted numerically and joined with ','. Its value is a copy of the
        inner dict of x[i] with 'start' replaced by first_start_value and
        'text' replaced by the texts of the whole group joined with spaces.
        x itself is not modified.
    """
    # remove duplicates and sort the keys numerically
    ordered_keys = sorted(set(d_keys), key=int)
    # index every entry by key so the texts of the whole group can be joined
    inner_by_key = {key: inner for entry in x for key, inner in entry.items()}
    # copy instead of writing into x[i], so the input list is left untouched
    merged = dict(x[i][current_index_dict_key])
    merged['start'] = first_start_value
    merged['text'] = ' '.join(inner_by_key[key]['text'] for key in ordered_keys)
    return {','.join(ordered_keys): merged}
last_index = len(x) - 1
for i, entry in enumerate(x):
    current_index_dict_key = next(iter(entry))
    current = entry[current_index_dict_key]
    # only compare with the next entry when there is one
    if i < last_index:
        next_index_dict_key = next(iter(x[i + 1]))
        # gap between the end of the current entry and the start of the next
        diff = x[i + 1][next_index_dict_key]['start'] - current['end']
        if diff < 20:
            if not d_keys:
                # a new group opens here: remember where it starts
                first_start_value = current['start']
            d_keys.append(current_index_dict_key)
            d_keys.append(next_index_dict_key)
            continue
    # the current entry is not merged with its successor (or is the last
    # one): emit the pending group, or the entry itself
    if d_keys:
        # merge dictionary
        dict1 = merge_dict(d_keys, x, i, first_start_value, current_index_dict_key)
        d_keys = []
    else:
        dict1 = {current_index_dict_key: current}
    x_new.append(dict1)
print(x_new)
O/P:
[
{
'218,312,350': {
'text': 'year',
'start': 0,
'end': 85
}
},
{
'370': {
'text': 'next column',
'start': 120,
'end': 130
}
},
{
'385': {
'text': 'next_column',
'start': 160,
'end': 169
}
}
]

Related

Aggregate certain values in array of dictionary based on key/value criteria

I have the below JSON of forum posts.
What would be the pythonic way of creating a resulting JSON of aggregated Positive/Negative ratings per forum?
Input Json:
{"Posting_Stats":{
"Posts":[
{
"Date":"2020-03-29 12:41:00",
"Forum":"panorama",
"Positive":2,
"Negative":0
},
{
"Date":"2020-03-29 12:37:00",
"Forum":"web",
"Positive":6,
"Negative":0
},
{
"Date":"2020-03-29 12:37:00",
"Forum":"web",
"Positive":2,
"Negative":2
},...]}
Output should be:
{"Forum_Stats" : [{"Forum" : "panorama",
"Positive":2,
"Negative":0},
{"Forum" : "web",
"Positive":8,
"Negative":2},...]
}
]
Cannot think of a different way:
posts = inputData['Posting_Stats']['Posts']
# Running Positive/Negative totals, keyed by forum name.
postAggregator = {}
for post in posts:
try:
# Forum already present: add this post's ratings (a missing rating counts as 0).
postAggregator[post['Forum']]['Positive'] += post.get('Positive',0)
postAggregator[post['Forum']]['Negative'] += post.get('Negative',0)
except KeyError:
# The lookup above raised KeyError: start this forum's totals from this post.
postAggregator.update({post['Forum']:{"Positive":post.get('Positive',0), "Negative":post.get('Negative',0)}})
# Reshape the totals into a list of {"Forum", "Positive", "Negative"} records.
outputData = {"Forum_Stats": []}
for key, value in postAggregator.items():
outputData['Forum_Stats'].append({"Forum":key , "Positive":value['Positive'],"Negative":value['Negative']})
print(outputData)
Output:
{'Forum_Stats': [{'Forum': 'panorama', 'Positive': 2, 'Negative': 0}, {'Forum': 'web', 'Positive': 8, 'Negative': 2}]}
This may be one way of solving:
# taking the input in a dictionary
d = {
    "Posting_Stats": {
        "Posts": [
            {
                "Date": "2020-03-29 12:41:00",
                "Forum": "panorama",
                "Positive": 2,
                "Negative": 0,
            },
            {
                "Date": "2020-03-29 12:37:00",
                "Forum": "web",
                "Positive": 6,
                "Negative": 0,
            },
            {
                "Date": "2020-03-29 12:37:00",
                "Forum": "web",
                "Positive": 2,
                "Negative": 2,
            },
        ]
    }
}

# iterating over the posts to get their sums, using the forum as key
temp = {}
for post in d['Posting_Stats']['Posts']:
    # setdefault creates the zeroed totals the first time a forum is seen
    totals = temp.setdefault(post.get('Forum'), {'Positive': 0, 'Negative': 0})
    # a post without a rating contributes 0 instead of raising on None
    totals['Positive'] += post.get('Positive', 0)
    totals['Negative'] += post.get('Negative', 0)
Finally converting the output into the required format
output = [{'Forum': i , **temp[i] } for i in temp]
print(output)
#[{'Forum': 'panorama', 'Positive': 2, 'Negative': 0},
#{'Forum': 'web', 'Positive': 8, 'Negative': 2}]

Compare previous value in loop and append to string if within tolerance

I have a list like below:
word_list = '''
[{'bottom': Decimal('58.650'),
'text': 'Welcome'
{'bottom': Decimal('74.101'),
'text': 'This'
},
{'bottom': Decimal('74.101'),
'text': 'is'
},
{'bottom': Decimal('77.280'),
'text': 'Oliver'
}]
'''
that represents a series of words: Contact Name is Oliver, which is extracted from a PDF file. The bottom value is the distance from bottom to the top of the page.
The list is sorted by the bottom key:
words = sorted(word_list, key=itemgetter('bottom'))
I am trying to iterate the list and each word to see if the word belongs on the same line - or it should be appended to a new line.
The way I am thinking of doing this is to compare the bottom value in each iteration, with a tolerance of xx. For example, the words This is Oliver are all on the same line within the PDF file - but their bottom values are not equal (hence the tolerance level).
Expected output
What I am trying to end up with, is something like:
[{'text': 'Welcome',
'line': 1
},
{'text': 'This is Oliver',
'line': 2
}]
This is what I have so far:
for i, word in enumerate(word_list):
previous_element = word_list[i-1] if i > 0 else None
current_element = word
next_element = word_list[i +1] if i < len(word_list) - 1 else None
if math.isclose(current_element['bottom'], next_element['bottom'], abs_tol=5):
# Append the word to the line
I am a bit stuck in the above loop. I can't seem to figure out if the math.isclose() is correct and how to actually append the line[i] and the actual word to create a line sentence.
I don't think you need to use a math function; you could just check the threshold yourself. Maybe like this:
from decimal import Decimal
word_list = [
    {
        'bottom': Decimal('58.650'),
        'text': 'Welcome',
    },
    {
        'bottom': Decimal('74.101'),
        'text': 'This',
    },
    {
        'bottom': Decimal('77.280'),
        'text': 'Oliver',
    },
    {
        'bottom': Decimal('74.101'),
        'text': 'is',
    },
]

# sort by 'bottom' so that words with close values end up next to each other
word_list = sorted(word_list, key=lambda word: word['bottom'])
threshold = Decimal('10')

row_list = []
current_row = None  # no row is open yet; this also covers an empty word_list
for word in word_list:
    if (current_row is not None
            and abs(current_row[-1]['bottom'] - word['bottom']) <= threshold):
        # distance to the previous word is small, use same row
        current_row.append(word)
    else:
        # first word, or distance is big: create new row
        current_row = [word]
        row_list.append(current_row)

print('final output')
for i, row in enumerate(row_list):
    data = {
        'line': i,
        'text': ' '.join(elem['text'] for elem in row),
    }
    print(data)
The output from this code is:
final output
{'line': 0, 'text': 'Welcome'}
{'line': 1, 'text': 'This is Oliver'}
# NOTE(review): assumes word_list is already sorted by 'bottom' in ascending
# order, as described in the question -- confirm.
line_sentence_map = {}  # line number -> list of the words on that line
tolerance = 5
line = 1
previous_line_threshold = None  # 'bottom' of the first word of the current line
for word in word_list:
    bottom = word['bottom']
    if previous_line_threshold is None:
        # very first word: open line 1
        previous_line_threshold = bottom
        line_sentence_map[line] = []
    elif bottom - previous_line_threshold > tolerance:
        # more than `tolerance` past the first word of the current line:
        # open the next line
        line += 1
        previous_line_threshold = bottom
        line_sentence_map[line] = []
    line_sentence_map[line].append(word['text'])

# one record per line, in the order the lines were opened
what_you_want = [
    {"line": number, "text": ' '.join(words)}
    for number, words in line_sentence_map.items()
]
Here, what_you_want will give you what you want -
[{'text': 'Welcome', 'line': 1}, {'text': 'This is Oliver', 'line': 2}]
Cheers!

Use list comprehension to return a new list

I have a list of dicts from which I want to extract specific attributes and build a new list of dicts based on those attributes.
I'm trying to use a list comprehension to process every row instead of using a traditional loop. Is it possible to do that?
from datetime import datetime
from dateutil.parser import parse
def _format_string_to_timestamp(dt, output):
    """Re-format a date/time string with the strftime pattern *output*.

    Args:
        dt: the date/time string to convert; None and float values are
            treated as "no value".
        output: strftime format string for the result.

    Returns:
        "" when dt is None or a float (including float subclasses),
        otherwise dt parsed with parse() and formatted with *output*.
    """
    # NOTE(review): floats are presumably NaN placeholders for missing
    # values -- confirm against the data source.
    if dt is None or isinstance(dt, float):
        return ""
    return parse(dt).strftime(output)
def extract_tickets_tags_history(audit):
    """Build one record per tag from the last 'tags' event of *audit*.

    Args:
        audit: dict with an 'events' list and, when a 'tags' event exists,
            the keys 'created_at', 'ticket_id' and 'author_id'.

    Returns:
        A list with one dict per tag in the 'value' of the last event whose
        'field_name' is 'tags', or None when there is no such event.
    """
    sync = "1234"
    tags = [d for d in audit['events'] if d.get('field_name', '') == 'tags']
    if not tags:
        return None
    # The timestamp is the same for every tag, so format it only once.
    updated = _format_string_to_timestamp(audit['created_at'], "%Y-%m-%d %H:%M:%S")
    return [
        {
            'tag': tag,
            'updated': updated,
            'ticket_id': audit['ticket_id'],
            'tagged': False,
            'user_id': audit['author_id'],
            'sync': sync,
        }
        for tag in tags[-1]['value']
    ]
audit = {
'ticket_id': 123,
'author_id': 654,
'created_at': '2019-04-07T01:09:40Z',
'events': [
{
'field_name': 'tags',
'value': ['taga', 'tagb']
}
]
}
example = [
{
'id': 123,
'data': [audit]
}
]
result = [extract_tickets_tags_history(data) for data in x['data'] for x in example]
I'm getting an error NameError: name 'x' is not defined
...
And the result should be something like [{"tag": "...", "updated": "...", ...}]
You swapped the two for loops in your list comprehension
result = [extract_tickets_tags_history(data) for x in example for data in x['data'] ]
which is equivalent to
result = []
for x in example:
for data in x['data']:
result.append(extract_tickets_tags_history(data))

python change value in nested dictionary if condition is met

I know similar questions have already been asked before, but I am really having problems implementing them for my special case:
Let's say I have a dictionary with varying depths, for example:
dicti = {'files':
{'a':{'offset':100, 'start': 0},
'b':{
'c':{'offset':50, 'start':0}
'd':{'offset':70, 'start':0}
}
'e':{
'f':{'offset':80, 'start':0}
'g':{'offset':30, 'start':0}
'h':{'offset':20, 'start':0}
}
}
}
etc... (with a lot more different levels and entries)
so now I want a copy of that dictionary with basically the same structure and keys, but if 'offset' (at any level) is greater than let's say 50 'offset' should be changed to 0
I guess some kind of iterative function would be the best, but I cannot get my head around that...
You might use the standard machinery for the copy and then modify the copied dictionary (solution #1 in my example), or you might do copying and modification in the same function (solution #2).
In either case, you're looking for a recursive function.
import copy
from pprint import pprint
dicti = {'files':
{'a':{'offset':100, 'start': 0},
'b':{
'c':{'offset':50, 'start':0},
'd':{'offset':70, 'start':0},
},
'e':{
'f':{'offset':80, 'start':0},
'g':{'offset':30, 'start':0},
'h':{'offset':20, 'start':0},
}
}
}
# Solution 1, two passes
def modify(d):
    """Reset every 'offset' greater than 50 to 0, in place, at any depth.

    Values that are not dicts are left alone. Returns None; the change is
    made directly in *d* and in the dicts nested inside it.
    """
    if not isinstance(d, dict):
        return
    if d.get('offset', 0) > 50:
        d['offset'] = 0
    # Only the values are needed for the recursion.
    for value in d.values():
        modify(value)
dictj = copy.deepcopy(dicti)
modify(dictj)
pprint(dictj)
# Solution 2, copy and modify in one pass
def copy_and_modify(d):
    """Return a copy of *d* in which every 'offset' above 50 is set to 0.

    Nested dicts are copied recursively; any other value is returned as is.
    The argument itself is not modified.
    """
    if not isinstance(d, dict):
        return d
    d2 = {k: copy_and_modify(v) for k, v in d.items()}
    # Default to 0 so that dicts without an 'offset' key (such as the
    # top-level one) do not compare None with an int.
    if d2.get('offset', 0) > 50:
        d2['offset'] = 0
    return d2
dictj = copy_and_modify(dicti)
pprint(dictj)
A recursive solution is going to be more intuitive. You want something like the following pseudocode:
def copy(dict):
new_dict = {}
for key, value in dict:
if value is a dictionary:
new_dict[key] = copy(value)
else if key == 'offset' and value > 50:
new_dict[key] = 0
else:
new_dict[key] = value
return new_dict
d = {'files':
{'a':{'offset':100, 'start': 0},
'b':{
'c':{'offset':50, 'start':0},
'd':{'offset':70, 'start':0}
},
'e':{
'f':{'offset':80, 'start':0},
'g':{'offset':30, 'start':0},
'h':{'offset':20, 'start':0}
}
}
}
def transform(item):
    """Return a shallow copy of *item* with an 'offset' of 80 marked.

    The copy's 'offset' is replaced by the string 'CHANGED' when it equals
    80; otherwise the copy is returned unchanged. *item* is not modified.
    """
    new_item = item.copy()  # consider usage of deepcopy if needed
    if new_item['offset'] == 80:
        new_item['offset'] = 'CHANGED'
    return new_item
def visit(item):
    """Rebuild a nested dict, passing every dict with an 'offset' to transform().

    A dict that has an 'offset' key is handed to transform() and its result
    is used; any other dict is rebuilt with its values visited recursively.
    Values that are not dicts are returned as they are.
    """
    if not isinstance(item, dict):
        return item
    # Test for the key itself: an 'offset' of 0 is falsy but still a leaf.
    if 'offset' in item:
        return transform(item)
    return {k: visit(v) for k, v in item.items()}
result = visit(d)
print(result)
Output:
{
'files': {
'b': {
'd': {
'offset': 70,
'start': 0
},
'c': {
'offset': 50,
'start': 0
}
},
'e': {
'g': {
'offset': 30,
'start': 0
},
'h': {
'offset': 20,
'start': 0
},
'f': {
'offset': 'CHANGED',
'start': 0
}
},
'a': {
'offset': 100,
'start': 0
}
}
}
You can revise some links regarding stuff which is used in the answer:
Recursion
Visitor pattern
You could call a recursive function to change its value once condition is met:
dicti = {'files':
{'a':{'offset':100, 'start': 0},
'b':{
'c':{'offset':50, 'start':0},
'd':{'offset':70, 'start':0}
},
'e':{
'f':{'offset':80, 'start':0},
'g':{'offset':30, 'start':0},
'h':{'offset':20, 'start':0}
}
}
}
def dictLoop(dt):
    """Set every 'offset' greater than 50 to 0, in place, at any depth.

    Nested dicts are processed recursively; values that are neither dicts
    nor numbers are left untouched. Returns the same dict object that was
    passed in.
    """
    for k, v in dt.items():
        if isinstance(v, dict):
            # Only dicts can be descended into.
            dictLoop(v)
        elif k == 'offset' and isinstance(v, (int, float)) and v > 50:
            # Replacing the value of an existing key is safe while iterating.
            dt[k] = 0
    return dt
# print() call form, so the line runs on Python 3 as well as Python 2
print(dictLoop(dicti))

How do you find the number of occurrence within a list of dictionaries

I'm trying to find how many times per case_id does email_response occurs in the example below:
json_obj = [{
'case_id': 1000,
'type': 'email',
'customer_id': 42571,
'date': '2015-01-20',
},
{
'case_id': 1000,
'type': 'email_response',
'customer_id': 42571,
'date': '2015-01-21',
},
{
'case_id': 1021,
'type': 'email',
'customer_id': 88686,
'date': '2015-01-24',
}]
So in this case, the answer would be 1 for case_id = 1000 and 0 for case_id = 1021.
You can create another dictionary and keep on updating the count like this
>>> result = {}
>>> for obj in json_obj:
... if obj['type'] == 'email_response':
... result[obj['case_id']] = result.get(obj['case_id'], 0) + 1
...
>>> result
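{1000: 1}
Since we pass 0 as the second parameter, the dict.get method will return 0 if it cannot find the key being retrieved, otherwise the actual value corresponding to the key. Note that this loop only touches case ids that have at least one email_response, so 1021 does not appear in the result. To get an explicit 0 for such case ids as well, you can do it like this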
>>> result = {}
>>> for obj in json_obj:
... result[obj['case_id']] = result.get(obj['case_id'], 0) + (obj['type'] == 'email_response')
...
>>> result
{1000: 1, 1021: 0}
Since Python's booleans are subclasses of int, True will be 1 and False will be 0. So, the result of (obj['type'] == 'email_response') will be added with the current value of case_id in the result dictionary.

Categories