Compare previous value in loop and append to string if within tolerance - python

I have a list like below:
word_list = [
    {'bottom': Decimal('58.650'),
     'text': 'Welcome'
     },
    {'bottom': Decimal('74.101'),
     'text': 'This'
     },
    {'bottom': Decimal('74.101'),
     'text': 'is'
     },
    {'bottom': Decimal('77.280'),
     'text': 'Oliver'
     }]
that represents a series of words, Welcome This is Oliver, extracted from a PDF file. The bottom value is the distance from the top of the page to the bottom of the word.
The list is sorted by the bottom key:
from operator import itemgetter
words = sorted(word_list, key=itemgetter('bottom'))
I am trying to iterate over the list and check each word to see whether it belongs on the same line as the previous word or should start a new line.
The way I am thinking to do this is to compare the bottom values on each iteration, with a tolerance of xx. For example, the words This is Oliver are all on the same line within the PDF file, but their bottom values are not equal (hence the tolerance level).
Expected output
What I am trying to end up with is something like:
[{'text': 'Welcome',
  'line': 1
  },
 {'text': 'This is Oliver',
  'line': 2
  }]
This is what I have so far:
for i, word in enumerate(word_list):
    previous_element = word_list[i-1] if i > 0 else None
    current_element = word
    next_element = word_list[i+1] if i < len(word_list) - 1 else None
    if math.isclose(current_element['bottom'], next_element['bottom'], abs_tol=5):
        # Append the word to the line
I am a bit stuck in the above loop. I can't figure out whether the math.isclose() check is correct, or how to append the word to line[i] to build up a sentence for each line.

I don't think you need to use a math function; you could just check the threshold yourself. Maybe like this:
from decimal import Decimal

word_list = [
    {
        'bottom': Decimal('58.650'),
        'text': 'Welcome',
    },
    {
        'bottom': Decimal('74.101'),
        'text': 'This',
    },
    {
        'bottom': Decimal('77.280'),
        'text': 'Oliver',
    },
    {
        'bottom': Decimal('74.101'),
        'text': 'is',
    },
]

word_list = sorted(word_list, key=lambda x: x['bottom'])

threshold = Decimal('10')
current_row = [word_list[0], ]
row_list = [current_row, ]

for word in word_list[1:]:
    if abs(current_row[-1]['bottom'] - word['bottom']) <= threshold:
        # distance is small, use same row
        current_row.append(word)
    else:
        # distance is big, create new row
        current_row = [word, ]
        row_list.append(current_row)

print('final output')
for i, row in enumerate(row_list):
    data = {
        'line': i,
        'text': ' '.join(elem['text'] for elem in row),
    }
    print(data)
The output from this code is:
final output
{'line': 0, 'text': 'Welcome'}
{'line': 1, 'text': 'This is Oliver'}

line_sentence_map = {}
tolerance = 5
line = 1
what_you_want = []
for i in range(len(word_list)):
    if i == 0:
        previous_line_threshold = word_list[i]['bottom']
        line_sentence_map[line] = []
    if word_list[i]['bottom'] - previous_line_threshold > tolerance:
        what_you_want.append({"line": line, "text": ' '.join(line_sentence_map[line])})
        line += 1
        previous_line_threshold = word_list[i]['bottom']
        line_sentence_map[line] = []
    line_sentence_map[line].append(word_list[i]['text'])
    if i == len(word_list) - 1:
        what_you_want.append({"line": line, "text": ' '.join(line_sentence_map[line])})
Here, what_you_want will give you what you want -
[{'text': 'Welcome', 'line': 1}, {'text': 'This is Oliver', 'line': 2}]
Cheers!
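For completeness, the math.isclose() idea from the question also works if each word is compared with the previous one instead of the next, which avoids running off the end of the list. A minimal sketch, assuming word_list is the sorted list of dicts from above:

import math

# group words into lines by comparing each word with its predecessor
lines = [[word_list[0]]]
for prev, word in zip(word_list, word_list[1:]):
    # Decimal converts to float inside math.isclose
    if math.isclose(prev['bottom'], word['bottom'], abs_tol=5):
        lines[-1].append(word)   # same line as the previous word
    else:
        lines.append([word])     # start a new line

result = [{'line': i + 1, 'text': ' '.join(w['text'] for w in line)}
          for i, line in enumerate(lines)]
print(result)
# [{'line': 1, 'text': 'Welcome'}, {'line': 2, 'text': 'This is Oliver'}]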

Related

Parse only selected records from empty-line separated file

I have a file with the following structure:
SE|text|Baz
SE|entity|Bla
SE|relation|Bla
SE|relation|Foo

SE|text|Bla
SE|entity|Foo

SE|text|Zoo
SE|relation|Bla
SE|relation|Baz
Records (i.e., blocks) are separated by an empty line. Each line in a block starts with an SE tag. A text tag always occurs in the first line of each block.
I wonder how to properly extract only blocks with a relation tag, which is not necessarily present in each block. My attempt is pasted below:
from itertools import groupby

with open('test.txt') as f:
    for nonempty, group in groupby(f, bool):
        if nonempty:
            process_block()  ## ?
Desired output is a json dump:
{
  "result": [
    {
      "text": "Baz",
      "relation": ["Bla", "Foo"]
    },
    {
      "text": "Zoo",
      "relation": ["Bla", "Baz"]
    }
  ]
}
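For what it's worth, the groupby attempt from the question can be completed directly. One catch: "empty" lines read from a file still contain a newline, so bool alone will not detect them; the key has to strip first. A sketch, assuming test.txt matches the sample above:

from itertools import groupby
import json

result = []
with open('test.txt') as f:
    # a blank line still holds '\n', so strip before testing truthiness
    for nonempty, group in groupby(f, key=lambda line: bool(line.strip())):
        if not nonempty:
            continue
        block = [line.strip().split('|') for line in group]
        relations = [value for _, tag, value in block if tag == 'relation']
        if relations:
            text = next(value for _, tag, value in block if tag == 'text')
            result.append({'text': text, 'relation': relations})

print(json.dumps({'result': result}, indent=2))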
I have a proposed solution in pure Python that returns a block if it contains the value in any position. This could most likely be done more elegantly in a proper framework like pandas.
from pprint import pprint

fname = 'ex.txt'
# extract blocks
with open(fname, 'r') as f:
    blocks = [[]]
    for line in f:
        if len(line) == 1:
            # a bare newline marks a block boundary
            blocks.append([])
        else:
            blocks[-1] += [line.strip().split('|')]
# remove blocks that don't contain 'relation'
blocks = [block for block in blocks
          if any('relation' == x[1] for x in block)]
pprint(blocks)
# [[['SE', 'text', 'Baz'],
#   ['SE', 'entity', 'Bla'],
#   ['SE', 'relation', 'Bla'],
#   ['SE', 'relation', 'Foo']],
#  [['SE', 'text', 'Zoo'], ['SE', 'relation', 'Bla'], ['SE', 'relation', 'Baz']]]
# To export to proper json format the following can be done
import pandas as pd
import json

results = []
for block in blocks:
    df = pd.DataFrame(block)
    json_dict = {}
    json_dict['text'] = list(df[2][df[1] == 'text'])
    json_dict['relation'] = list(df[2][df[1] == 'relation'])
    results.append(json_dict)
print(json.dumps(results))
# '[{"text": ["Baz"], "relation": ["Bla", "Foo"]}, {"text": ["Zoo"], "relation": ["Bla", "Baz"]}]'
Let's go through it:
Read the file into a list, dividing it into blocks at blank lines and splitting columns at the | character.
Go through each block in the list and filter out any that do not contain relation.
Print the output.
You cannot store the same key twice in a dictionary, as mentioned in the comments.
You can read your file, split it at '\n\n' into blocks, split blocks into lines at '\n', and split lines into data at '|'.
You can then put it into a suitable data structure and parse it into a string using the json module:
Create data file:
with open("f.txt","w")as f:
f.write('''SE|text|Baz
SE|entity|Bla
SE|relation|Bla
SE|relation|Foo
SE|text|Bla
SE|entity|Foo
SE|text|Zoo
SE|relation|Bla
SE|relation|Baz''')
Read data and process it:
with open("f.txt") as f:
all_text = f.read()
as_blocks = all_text.split("\n\n")
# skip SE when splitting and filter only with |relation|
with_relation = [[k.split("|")[1:]
for k in b.split("\n")]
for b in as_blocks if "|relation|" in b]
print(with_relation)
Create a suitable data structure, grouping repeated keys into a list:
result = []
for inner in with_relation:
    result.append({})
    for k, v in inner:
        # add as simple key
        if k not in result[-1]:
            result[-1][k] = v
        # got key 2nd time, convert the value to a list
        elif k in result[-1] and not isinstance(result[-1][k], list):
            result[-1][k] = [result[-1][k], v]
        # got it a 3rd+ time, add to list
        else:
            result[-1][k].append(v)
print(result)
Create json from data structure:
import json
print(json.dumps({"result": result}, indent=4))
Output:
# with_relation
[[['text', 'Baz'], ['entity', 'Bla'], ['relation', 'Bla'], ['relation', 'Foo']],
[['text', 'Zoo'], ['relation', 'Bla'], ['relation', 'Baz']]]
# result
[{'text': 'Baz', 'entity': 'Bla', 'relation': ['Bla', 'Foo']},
{'text': 'Zoo', 'relation': ['Bla', 'Baz']}]
# json string
{
    "result": [
        {
            "text": "Baz",
            "entity": "Bla",
            "relation": [
                "Bla",
                "Foo"
            ]
        },
        {
            "text": "Zoo",
            "relation": [
                "Bla",
                "Baz"
            ]
        }
    ]
}
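If it is acceptable for every value to be a list, even singletons such as text, collections.defaultdict shortens the grouping step. A sketch reusing with_relation from above:

from collections import defaultdict

result = []
for inner in with_relation:
    grouped = defaultdict(list)
    for k, v in inner:
        grouped[k].append(v)   # repeated keys accumulate automatically
    result.append(dict(grouped))
print(result)
# [{'text': ['Baz'], 'entity': ['Bla'], 'relation': ['Bla', 'Foo']},
#  {'text': ['Zoo'], 'relation': ['Bla', 'Baz']}]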
In my opinion this is a very good case for a small parser.
This solution uses a PEG parser called parsimonious but you could totally use another one:
from parsimonious.grammar import Grammar
from parsimonious.nodes import NodeVisitor
import json

data = """
SE|text|Baz
SE|entity|Bla
SE|relation|Bla
SE|relation|Foo

SE|text|Bla
SE|entity|Foo

SE|text|Zoo
SE|relation|Bla
SE|relation|Baz
"""

class TagVisitor(NodeVisitor):
    grammar = Grammar(r"""
        content = (ws / block)+
        block   = line+
        line    = ~".+" nl?
        nl      = ~"[\n\r]"
        ws      = ~"\s+"
    """)

    def generic_visit(self, node, visited_children):
        return visited_children or node

    def visit_content(self, node, visited_children):
        filtered = [child[0] for child in visited_children if isinstance(child[0], dict)]
        return {"result": filtered}

    def visit_block(self, node, visited_children):
        text, relations = None, []
        for child in visited_children:
            if child[1] == "text" and not text:
                text = child[2].strip()
            elif child[1] == "relation":
                relations.append(child[2])
        if relations:
            return {"text": text, "relation": relations}

    def visit_line(self, node, visited_children):
        tag1, tag2, text = node.text.split("|")
        return tag1, tag2, text.strip()

tv = TagVisitor()
result = tv.parse(data)
print(json.dumps(result))
This yields
{"result":
[{"text": "Baz", "relation": ["Bla", "Foo"]},
{"text": "Zoo", "relation": ["Bla", "Baz"]}]
}
The idea is to define a grammar, build an abstract syntax tree out of it, and return each block's content in a suitable data format.

How do I merge a dictionary based on a condition?

Say I have the following list of dictionaries:
x = [{
    '218': {
        'text': 'profit',
        'start': 0,
        'end': 21
    }
}, {
    '312': {
        'text': 'for',
        'start': 30,
        'end': 60
    }
}, {
    '350': {
        'text': 'year',
        'start': 70,
        'end': 85
    }
}, {
    '370': {
        'text': 'next column',
        'start': 120,
        'end': 130
    }
}, {
    '385': {
        'text': 'next_column',
        'start': 160,
        'end': 169
    }
}]
I want to merge some of the dictionaries. The condition: whenever the end of one dict and the start of the next dict differ by less than 20, merge those dicts and concatenate their text.
The output should look like this:
x_new = [{
    '218,312,350': {
        'text': 'profit for year',
        'start': 0,
        'end': 85
    }
}, {
    '370': {
        'text': 'next column',
        'start': 120,
        'end': 130
    }
}, {
    '385': {
        'text': 'next_column',
        'start': 160,
        'end': 169
    }
}]
I have already solved it with a basic approach, but it does not look good. Is there any solution using itertools or something like that?
What I have tried:
x_updated = sorted(x, key=lambda x: x.values()[0])
final_merge = []
merge = []
for first, second in zip(x_updated, x_updated[1:]):
    if abs(second.values()[0]['start'] - first.values()[0]['end']) < 25:
        print "its belong to the same column"
        merge = merge + [first.keys()[0]]
    else:
        merge = merge + [first.keys()[0]]
        final_merge = final_merge + [merge]
        merge = []
merge = merge + [second.keys()[0]]
final_merge = final_merge + [merge]
And once I have final_merge, which tells me which values to merge, it is easy to add the values. But is there a simpler way than the above code? Also, at the end, after the loop, I manually added the last dict because in my situation the last one is always a separate column, but what if it belongs to the same one?
This is what I would do:
First I would make some helper functions:
def merge(d1, d2):
    key1, key2 = list(d1)[0], list(d2)[0]
    val1, val2 = d1[key1], d2[key2]
    return {
        ",".join([key1, key2]): {
            'text': " ".join([val1['text'], val2['text']]),
            'start': val1['start'],
            'end': val2['end'],
        }
    }
def should_merge(d1, d2):
    if (d1 is None) or (d2 is None):
        return False
    return abs(list(d1.values())[0]['end'] - list(d2.values())[0]['start']) < 20
The first function merges two dictionaries; the second returns True if two dictionaries should be merged.
All that's left is the actual merge function:
from itertools import zip_longest

def merged_dicts(x):
    actual_merge = []
    last_merged = False
    for d1, d2 in zip_longest(x, x[1:], fillvalue=None):
        if should_merge(d1, d2) and last_merged:
            # extend the dict we just merged with the next one
            actual_merge.append(merge(actual_merge.pop(), d2))
        elif should_merge(d1, d2):
            actual_merge.append(merge(d1, d2))
            last_merged = True
        elif last_merged:
            # d1 was already consumed by the previous merge
            last_merged = False
        else:
            actual_merge.append(d1)
            last_merged = False
    print(actual_merge)
That is a little more readable, though apart from zip_longest it doesn't use any "fancy" itertools functions.
I would also consider changing the id of the dict to be inside the inner dict:
d = {'id': '385',
     'text': 'next_column',
     'start': 160,
     'end': 169
     }
That is a little less complicated and cleaner.
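For illustration, with that flat structure the helper functions above collapse to plain key access. A sketch, assuming the outer key has been moved to an 'id' field as suggested:

def merge(d1, d2):
    # flat-dict variant of the helper above
    return {'id': ','.join([d1['id'], d2['id']]),
            'text': ' '.join([d1['text'], d2['text']]),
            'start': d1['start'],
            'end': d2['end']}

def should_merge(d1, d2):
    if d1 is None or d2 is None:
        return False
    return abs(d1['end'] - d2['start']) < 20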
I would create a class for these objects you use:
class my_dict:
    def __init__(self, id, text, start, end):
        self.id = id
        self.text = text
        self.start = start
        self.end = end

    def merge(self, other):
        self.id = "{},{}".format(self.id, other.id)
        self.text = "{} {}".format(self.text, other.text)
        self.end = other.end
And then the main code loop will be:
# assumes x is a list of my_dict objects rather than plain dicts
x_new = [x[0]]
for obj in x[1:]:
    last = x_new[-1]
    if obj.start - last.end > 20:
        x_new.append(obj)
    else:
        last.merge(obj)
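A hypothetical adapter to build those objects from the original list of single-key dicts, so the loop above can run over them instead of the raw x:

# build my_dict objects, moving the outer key into the id attribute
objects = [my_dict(key, inner['text'], inner['start'], inner['end'])
           for d in x for key, inner in d.items()]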
Try this:
x = [{'218': {'text': 'profit', 'start': 0, 'end': 21}},
     {'312': {'text': 'for', 'start': 30, 'end': 60}},
     {'350': {'text': 'year', 'start': 70, 'end': 85}},
     {'370': {'text': 'next column', 'start': 120, 'end': 130}},
     {'385': {'text': 'next_column', 'start': 160, 'end': 169}}]
x_new = []
d_keys = []
first_start_value = 0

def merge_dict(d_keys, x, i, first_start_value, current_index_dict_key):
    # remove duplicate keys
    d_keys = list(set(d_keys))
    # sort keys numerically
    d_keys.sort(key=int)
    new_key = ','.join(d_keys)
    # concatenate the text of every merged dict, in key order
    lookup = {k: v for d in x for k, v in d.items()}
    merged = dict(x[i][current_index_dict_key])
    merged['text'] = ' '.join(lookup[k]['text'] for k in d_keys)
    # update start value
    merged['start'] = first_start_value
    return {new_key: merged}

for i in range(0, len(x)):
    current_index_dict_key = list(x[i].keys())[0]
    # check next index of list is valid
    if i + 1 > len(x) - 1:
        if len(d_keys) > 0:
            # merge dictionary
            dict1 = merge_dict(d_keys, x, i, first_start_value, current_index_dict_key)
            x_new.append(dict1)
            break
        dict1 = {current_index_dict_key: x[i][current_index_dict_key]}
        x_new.append(dict1)
        break
    next_index_dict_key = list(x[i + 1].keys())[0]
    start = x[i + 1][next_index_dict_key]['start']
    end = x[i][current_index_dict_key]['end']
    diff = start - end
    # compare current and next dict's end and start values
    if diff < 20:
        if len(d_keys) == 0:
            # remember where the merged group starts
            first_start_value = x[i][current_index_dict_key]['start']
        d_keys.append(current_index_dict_key)
        d_keys.append(next_index_dict_key)
    else:
        if len(d_keys) > 0:
            # merge dictionary
            dict1 = merge_dict(d_keys, x, i, first_start_value, current_index_dict_key)
            d_keys = []
        else:
            dict1 = {current_index_dict_key: x[i][current_index_dict_key]}
        x_new.append(dict1)
print(x_new)
O/P:
[
{
'218,312,350': {
'text': 'year',
'start': 0,
'end': 85
}
},
{
'370': {
'text': 'next column',
'start': 120,
'end': 130
}
},
{
'385': {
'text': 'next_column',
'start': 160,
'end': 169
}
}
]

Use list comprehension to return a new list

I have a list of dicts from which I want to pull a specific attribute and create a new list of dicts based on that attribute.
I'm trying to use a list comprehension to parse every row instead of a traditional loop. Is it possible to do this?
from datetime import datetime
from dateutil.parser import parse

def _format_string_to_timestamp(dt, output):
    if dt is None or type(dt) == float:
        return ""
    origin_dt = parse(dt)
    return origin_dt.strftime(output)

def extract_tickets_tags_history(audit):
    tag_history = []
    sync = "1234"
    tags = [d for d in audit['events'] if d.get('field_name', '') == 'tags']
    if len(tags) > 0:
        return [
            {
                'tag': tag,
                'updated': _format_string_to_timestamp(audit['created_at'], "%Y-%m-%d %H:%M:%S"),
                'ticket_id': audit['ticket_id'],
                'tagged': False,
                'user_id': audit['author_id'],
                'sync': sync
            }
            for tag in tags[-1]['value']]
    return None
audit = {
    'ticket_id': 123,
    'author_id': 654,
    'created_at': '2019-04-07T01:09:40Z',
    'events': [
        {
            'field_name': 'tags',
            'value': ['taga', 'tagb']
        }
    ]
}
example = [
    {
        'id': 123,
        'data': [audit]
    }
]
result = [extract_tickets_tags_history(data) for data in x['data'] for x in example]
I'm getting an error NameError: name 'x' is not defined
...
And the result should be something like [{"tag": "...", "updated": "...", ...}]
You swapped the two for loops in your list comprehension:
result = [extract_tickets_tags_history(data) for x in example for data in x['data']]
which is equivalent to
result = []
for x in example:
    for data in x['data']:
        result.append(extract_tickets_tags_history(data))
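A quick way to remember the ordering: the for clauses in a comprehension nest left to right, exactly like the statement form. A tiny sketch:

# outer loop (rows) comes first, inner loop (items) second
matrix = [[1, 2], [3, 4]]
flat = [n for row in matrix for n in row]
print(flat)  # [1, 2, 3, 4]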

Why is it not looping?

I'm learning Python and I'm not sure why the output of the code below is only one "False" and not five, given that I created a loop and the list of dicts has 5 elements.
I was expecting an output like:
"False"
"False"
"False"
"False"
"False"
movies = [{
    "name": "Usual Suspects"
}, {
    "name": "Hitman",
}, {
    "name": "Dark Knight",
}, {
    "name": "The Choice",
}, {
    "name": "Colonia",
}]
def peliMayor(p):
    index = -1
    for n in movies:
        index = index + 1
        if (movies[index]['name'] == p):
            return print("True")
        else:
            return print("False")

peli = "Thriller"
peliMayor(peli)
You should remove return from your for loop, as it exits the function on the first iteration.
Also, the if-else statement is not required. You can print the boolean value directly from the comparison movies[index]['name'] == p:
def peliMayor(p):
    index = -1
    for n in movies:
        index = index + 1
        print(movies[index]['name'] == p)

movies = [{'name': 'Usual Suspects'}, {'name': 'Hitman'},
          {'name': 'Dark Knight'}, {'name': 'The Choice'},
          {'name': 'Colonia'}]
peli = 'Thriller'
peliMayor(peli)
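As a side note, if a single overall True/False is enough rather than one line per movie, any() with a generator expression is a common shortcut (a sketch):

def peliMayor(p):
    # True as soon as any movie name matches, printed once
    print(any(movie['name'] == p for movie in movies))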
You should remove return from the loop. The following code will help you. Thanks.
movies = [{'name': 'Usual Suspects'}, {'name': 'Hitman'},
          {'name': 'Dark Knight'}, {'name': 'The Choice'},
          {'name': 'Colonia'}]

def peliMayor(p):
    index = -1
    for n in movies:
        index = index + 1
        if movies[index]['name'] == p:
            print('True')
        else:
            print('False')

peli = 'Thriller'
peliMayor(peli)

Formatting a string in required format in Python

I have data in the format:
id1 id2 value
Something like
1 234 0.2
1 235 0.1
and so on.
I want to convert it to JSON format:
{
    "nodes": [
        {"name": "1"},    # first element
        {"name": "234"},  # second element
        {"name": "235"}   # third element
    ],
    "links": [
        {"source": 1, "target": 2, "value": 0.2},
        {"source": 1, "target": 3, "value": 0.1}
    ]
}
So, from the original data to the above format: nodes contains the set of all distinct names present in the original data, and each link refers to the positions of its source and target within the list of values held by nodes.
For example, for the line
1 234 0.2
1 is the first element in the list of values held by the key "nodes",
234 is the second element in the list of values held by the key "nodes".
Hence the link dictionary is {"source": 1, "target": 2, "value": 0.2}.
How do I do this efficiently in Python? I am sure there must be a better way than what I am doing, which is messy :(
Here is what I am doing:
from collections import defaultdict

def open_file(filename, output=None):
    f = open(filename, "r")
    offset = 3429
    data_dict = {}
    node_list = []
    node_dict = {}
    link_list = []
    num_lines = 0
    line_ids = []
    for line in f:
        line = line.strip()
        tokens = line.split()
        mod_wid = int(tokens[1]) + offset
        if not node_dict.has_key(tokens[0]):
            d = {"name": tokens[0], "group": 1}
            node_list.append(d)
            node_dict[tokens[0]] = True
            line_ids.append(tokens[0])
        if not node_dict.has_key(mod_wid):
            d = {"name": str(mod_wid), "group": 1}
            node_list.append(d)
            node_dict[mod_wid] = True
            line_ids.append(mod_wid)
        link_d = {"source": line_ids.index(tokens[0]),
                  "target": line_ids.index(mod_wid),
                  "value": tokens[2]}
        link_list.append(link_d)
        if num_lines > 10000:
            break
        num_lines += 1
    data_dict = {"nodes": node_list, "links": link_list}
    print "{\n"
    for k, v in data_dict.items():
        print '"' + k + '"' + ":\n [ \n "
        for each_v in v:
            print each_v, ","
        print "\n],"
    print "}"

open_file("lda_input.tsv")
I'm assuming by "efficiently" you're talking about programmer efficiency—how easy it is to read, maintain, and code the logic—rather than runtime speed efficiency. If you're worried about the latter, you're probably worried for no reason. (But the code below will probably be faster anyway.)
The key to coming up with a better solution is to think more abstractly. Think about rows in a CSV file, not lines in a text file; create a dict that can be rendered in JSON rather than trying to generate JSON via string processing; wrap things up in functions if you want to do them repeatedly; etc. Something like this:
import csv
import json
import sys

def parse(inpath, namedict):
    lastname = [0]
    def lookup_name(name):
        try:
            print('Looking up {} in {}'.format(name, namedict))
            return namedict[name]
        except KeyError:
            lastname[0] += 1
            print('Adding {} as {}'.format(name, lastname[0]))
            namedict[name] = lastname[0]
            return lastname[0]
    with open(inpath) as f:
        reader = csv.reader(f, delimiter=' ', skipinitialspace=True)
        for id1, id2, value in reader:
            yield {'source': lookup_name(id1),
                   'target': lookup_name(id2),
                   'value': value}

for inpath in sys.argv[1:]:
    names = {}
    links = list(parse(inpath, names))
    # dicts preserve insertion order (Python 3.7+), so nodes[i] is the
    # name that lookup_name numbered i + 1
    nodes = [{'name': name} for name in names]
    outpath = inpath + '.json'
    with open(outpath, 'w') as f:
        json.dump({'nodes': nodes, 'links': links}, f, indent=4)
Don't construct the JSON manually. Make it out of an existing Python object with the json module:
def parse(data):
    nodes = set()
    links = set()
    for line in data.split('\n'):
        fields = line.split()
        id1, id2 = map(int, fields[:2])
        value = float(fields[2])
        nodes.update((id1, id2))
        links.add((id1, id2, value))
    return {
        'nodes': [{
            'name': node
        } for node in nodes],
        'links': [{
            'source': link[0],
            'target': link[1],
            'value': link[2]
        } for link in links]
    }
Now, you can use json.dumps to get a string:
>>> import json
>>> data = '1 234 0.2\n1 235 0.1'
>>> parsed = parse(data)
>>> parsed
{'links': [{'source': 1, 'target': 235, 'value': 0.1},
{'source': 1, 'target': 234, 'value': 0.2}],
'nodes': [{'name': 1}, {'name': 234}, {'name': 235}]}
>>> json.dumps(parsed)
'{"nodes": [{"name": 1}, {"name": 234}, {"name": 235}], "links": [{"source": 1, "target": 235, "value": 0.1}, {"source": 1, "target": 234, "value": 0.2}]}'
