Matching a list's item with another list in python - python

I have list1 let's say:
items=['SETTLEMENT DATE:', 'CASH ACCOUNT:', 'ISIN:', 'TRADE DATE:', 'PRICE CFA', 'CASH ACCOUNT:', 'SECURITY NAME:']
I have a list2 let's say:
split_t=['{1:F01SCBLMUMUXSSU0438794344}{2:O5991054200218SCBLGHACXSSU04387943442002181454N}{3:{108:2175129}}{4:', ':20:EPACK', 'SALE', 'CDI', ':21:EPACK', 'SALE', 'CDI', ':79:ATTN:MU', 'TEAM', 'KINDLY', 'ACCEPT', 'THIS', 'AS', 'AUTHORISATION', 'TO', 'SETTLE', 'TRADE', 'WITH', 'DETAILS', 'BELOW', 'MARKET:', 'COTE', 'DIVOIRE', 'CLIENT', 'NAME:', 'EPACK', 'OFFSHORE', 'ACCOUNT', 'NAME:', 'STANDARD', 'CHARTERED', 'GHANA', 'NOMINEE', 'RE', 'DATABANK', 'EPACK', 'INVESTMENT', 'FUND', 'LTD', 'IVORY', 'COAST', 'TRADE', 'TYPE:', 'DELIVER', 'AGAINST', 'PAYMENT', 'SCA:', '2CEPACKIVO', 'CASH', 'ACCOUNT:', '420551901501', 'TRADE', 'DETAILS:', 'TRADE', 'DATE:', '17.02.2020', 'SETTLEMENT', 'DATE:', '20.02.2020', 'SECURITY', 'NAME:', 'SONATEL', 'ISIN:', 'SN0000000019', 'CLEARING', 'BIC:', 'SCBLCIABSSUXXX', 'QUANTITY:', '10,500', 'PRICE', 'CFA', '14,500.4667', 'CONSIDERATION', 'CFA', '152,254,900.00', 'TOTAL', 'FEES', '1,796,608.00', 'SETTLEMENT', 'AMOUNT', 'CFA', '150,458,292.35', 'CURRENCY:', 'CFA', 'AC:', 'CI0000010373', 'REGARDS', 'STANDARD', 'CHARTERED', 'BANK', '-}']
I want to search contiguously the items of list1 in list2 and return the immediate next element of list2 when there's a match.
As you can see, one item of list1 may correspond to two contiguous items in list2.
For example, the 1st element of list1, 'SETTLEMENT DATE:', There's a match in list2 and I want to return the next element of the match in list2, '20.02.2020'.
I have written my python function accordingly:
def test(items, split_t):
phrases = [w for w in items]
for i, t in enumerate(split_t):
to_match = split_t[i+1: i+1+len(phrases)]
if to_match and all(p == m for p,m in zip(phrases, to_match)):
return [*map(lambda x:split_t[i])]
Which is returning None even when it has matches as you can see. I might be wrong in implementing the *map in the return statement which I'm failing to understand from debugging. Any help is highly appreciated.

One way is:
>>> import re
>>> def test(items, split_t):
... split_t_str = ' '.join(split_t)
... res = {}
... for i in items:
... m = re.search(rf'(?<={i})\s(.*?)\s', split_t_str)
... res[i] = m.group(1)
... return res
...
>>> test(items, split_t)
{'SETTLEMENT DATE:': '20.02.2020', 'CASH ACCOUNT:': '420551901501', 'ISIN:': 'SN0000000019', 'TRADE DATE:': '17.02.2020', 'PRICE CFA': '14,500.4667', 'SECURITY NAME:': 'SONATEL'}
The above:
creates a str from split_t, i.e., split_t_str,
iterates over items using each element to construct a regex for performing a positive lookbehind assertion (see re's docs) against split_t_str,
stores each element as key in a dict, called res, and the corresponding match as value, and
returns the dict

If the items of "list 2" contain no spaces, you can do it this way:
def match(l1, l2):
    """Return, for each label in *l1*, the token that follows it in *l2*.

    A label is split on whitespace and compared against runs of whole,
    contiguous tokens of *l2*, so ``'ISIN:'`` never matches inside a longer
    token such as ``'XISIN:'``.

    Args:
        l1: label strings; a label may span several tokens
            (e.g. ``'SETTLEMENT DATE:'``).
        l2: the tokens to search, in order.

    Returns:
        A list with the token following the first occurrence of each label,
        in the order of *l1*. Labels that are empty, not found, or found only
        at the very end (nothing follows them) contribute no entry.
    """
    tokens = list(l2)  # list slices below must compare equal to a list
    result = []
    for label in l1:
        words = label.split()
        if not words:
            continue
        width = len(words)
        # Stopping at len(tokens) - width guarantees tokens[start + width]
        # exists, i.e. a value always follows the matched window.
        for start in range(len(tokens) - width):
            if tokens[start:start + width] == words:
                result.append(tokens[start + width])
                break
    return result
print(match(items, split_t))
Output:
['20.02.2020', '420551901501', 'SN0000000019', '17.02.2020', '14,500.4667', '420551901501', 'SONATEL']

Related

How to split the given 'key-value' list into two lists separated as 'keys' and 'values' with python

This is my List
List = ['function = function1', 'string = string1', 'hello = hello1', 'new = new1', 'test = test1']
I need to separate the List into two different lists, 'keys' and 'values':
List = ['function = function1', 'string = string1', 'hello = hello1', 'new = new1', 'test = test1']
KeyList
KeyList = ['function', 'string', 'hello', 'new', 'test']
ValueList
ValueList = ['function1', 'string1', 'hello1', 'new1', 'test1']
There are several possible approaches. One is the method proposed by Tim, but if you are not familiar with re you could also do:
List = ['function = function1', 'string = string1', 'hello = hello1', 'new = new1', 'test = test1']

# Split every "key = value" entry once, then peel off each side.
pairs = [entry.split(' = ') for entry in List]
KeyList = [pair[0] for pair in pairs]
ValueList = [pair[1] for pair in pairs]

print(KeyList)
print(ValueList)
and the output is:
['function', 'string', 'hello', 'new', 'test']
['function1', 'string1', 'hello1', 'new1', 'test1']
You can simply use split(" = ") and unzip the list of key-value pairs to two tuples:
# Split each "key = value" entry, then transpose the pairs with zip(*...).
keys, values = zip(*(entry.split(" = ") for entry in List))
# keys
# >>> ('function', 'string', 'hello', 'new', 'test')
# values
# >>> ('function1', 'string1', 'hello1', 'new1', 'test1')
This is based on the fact that zip(*a_zipped_iterable) works as an unzipping function.
We can use re.findall here:
import re

inp = ['function = function1', 'string = string1', 'hello = hello1', 'new = new1', 'test = test1']

# Capture both sides of " = " in a single pass: with two groups, findall
# returns (key, value) tuples, and [0] takes the first such tuple per entry.
pairs = [re.findall(r'(\w+) = (\w+)', x)[0] for x in inp]
keys = [pair[0] for pair in pairs]
values = [pair[1] for pair in pairs]
vals = values  # same list, also available under the shorter name

Retrieving the first list only (JSON-Python)

I have a complicated JSON object with dictionary information about words and I want to get only the synonyms. I managed to retrieve them, but some words have two or more lists of synonyms (since they can be for example a verb and a noun at the same time). I would like to get only the first list of synonyms. Here is what I've done:
import requests
import json
with open(r'C:\Users...') as file:
list = []
for line in file.readlines():
list += line.split()
for keyword in list:
print(keyword)
ship_api_url = "https://..."
request_data = requests.get(ship_api_url)
data = request_data.text
parsed = json.loads(data)
# print(json.dumps(parsed, indent=3))
for item in parsed:
print(item['meta']['syns'][0])
And here's what I get - note that the word 'watch' has three lists of synonyms, the word 'create' has only one list of synonyms and the word 'created' has two lists of synonyms:
watch
['custodian', 'guard', 'guardian', 'keeper', 'lookout', 'minder', 'picket', 'sentinel', 'sentry', 'warden', 'warder', 'watcher', 'watchman']
['eye', 'follow', 'observe']
['anticipate', 'await', 'expect', 'hope (for)']
create
['beget', 'breed', 'bring', 'bring about', 'bring on', 'catalyze', 'cause', 'do', 'draw on', 'effect', 'effectuate', 'engender', 'generate', 'induce', 'invoke', 'make', 'occasion', 'produce', 'prompt', 'result (in)', 'spawn', 'translate (into)', 'work', 'yield']
created
['begot', 'bred', 'brought', 'brought about', 'brought on', 'catalyzed', 'caused', 'did', 'drew on', 'effected', 'effectuated', 'engendered', 'generated', 'induced', 'invoked', 'made', 'occasioned', 'produced', 'prompted', 'resulted (in)', 'spawned', 'translated (into)', 'worked', 'yielded']
['beget', 'breed', 'bring', 'bring about', 'bring on', 'catalyze', 'cause', 'do', 'draw on', 'effect', 'effectuate', 'engender', 'generate', 'induce', 'invoke', 'make', 'occasion', 'produce', 'prompt', 'result (in)', 'spawn', 'translate (into)', 'work', 'yield']
If I add another [0] after the [0] I already have, I get the first word of each list, not the first whole list as I need...
If I got it right, you want to do something like this:
import requests
import json

# Collect every whitespace-separated word of the input file.
# (Named `keywords` rather than `list` so the built-in type is not shadowed.)
keywords = []
with open(r'C:\Users...') as file:
    for line in file:
        keywords += line.split()

for keyword in keywords:
    print(keyword)
    ship_api_url = "https://..."
    request_data = requests.get(ship_api_url)
    parsed = json.loads(request_data.text)
    # print(json.dumps(parsed, indent=3))
    # `parsed` can hold several entries for one keyword, each with its own
    # synonym lists. Print the first list of the first entry and stop, so
    # exactly one list is shown per keyword.
    for item in parsed:
        print(item['meta']['syns'][0])
        break
Also, don't name your variable list: it is not a reserved word, but the name shadows Python's built-in list type.
As suggested in a comment by martineau, I solved the problem by adding a break statement after the print(item['meta']['syns'][0]) to stop the loop.

Replace multiple instances of a sub-string with items in a list

I have a string like below:
e = "how are you how do you how are they how"
My expected output is:
out = "how1 are you how2 do you how3 are they how4"
I'm trying in the following way:
def givs(y,x):
tt = []
va = [i+1 for i in list(range(y.count(x)))]
for i in va:
tt.append(x+str(i))
return tt
ls = givs(e, 'how')
ls = ['how1', 'how2', 'how3', 'how4']
fg = []
for i in e.split(' '):
fg.append(i)
fg = ['how', 'are', 'you', 'how', 'do', 'you', 'how', 'are', 'they', 'how']
For every instance of 'how' in 'fg' I want to replace with items in 'ls' and finally use join function to get the required output.
expected_output = ['how1', 'are', 'you', 'how2', 'do', 'you', 'how3', 'are',
'they', 'how4']
so that I can join the items by:
' '.join(expected_output)
to get:
out = "how1 are you how2 do you how3 are they how4"
You could use itertools.count:
from itertools import count

e = "how are you how do you how are they how"
counter = count(1)

# Walk the words in order; every "how" takes the next number from the counter.
numbered = []
for word in e.split():
    if word == "how":
        word += str(next(counter))
    numbered.append(word)

result = ' '.join(numbered)
print(result)
Output
how1 are you how2 do you how3 are they how4
There is no need to make your code complex, just add a counter and add it to every "how". At the end make the new string.
e = "how are you how do you how are they how"
count = 1

# Collect the words in a list and join once at the end: this avoids both the
# trailing space and the repeated string concatenation of `e_ok += i + " "`.
words = []
for word in e.split():
    if word == "how":
        word += str(count)
        count += 1
    words.append(word)

e_ok = " ".join(words)
print(e_ok)

Python - nested dictionary. Where is the bug?

I have a CSV file that I've filtered into a list and grouped. Example:
52713
['52713', '', 'Vmax', '', 'Start Value', '', '\n']
['52713', '', 'Vmax', '', 'ECNumber', '1.14.12.17', '\n']
['52713', 'O2', 'Km', 'M', 'Start Value', '3.5E-5', '\n']
['52713', 'O2', 'Km', 'M', 'ECNumber', '1.14.12.17', '\n']
52714
['52714', '', 'Vmax', '', 'Start Value', '', '\n']
['52714', '', 'Vmax', '', 'ECNumber', '1.14.12.17', '\n']
['52714', 'O2', 'Km', 'M', 'Start Value', '1.3E-5', '\n']
['52714', 'O2', 'Km', 'M', 'ECNumber', '1.14.12.17', '\n']
From this, I create a nested dictionary with the structure:
dict = ID number:{Km:n, Kcat:n, ECNumber:n}
...for every ID in the list.
I use the following code to create this dictionary
dict = {}
for key, items in groupby(FilteredTable1[1:], itemgetter(0)):
#print key
for subitem in items:
#print subitem
dict[subitem[EntryID]] = {}
dict[subitem[EntryID]]['EC'] = []
dict[subitem[EntryID]]['Km'] = []
dict[subitem[EntryID]]['Kcat'] = []
if 'ECNumber' in subitem:
dict[subitem[EntryID]]['EC'] = subitem[value]
if 'Km' in subitem and 'Start Value' in subitem:
dict[subitem[EntryID]]['Km'] = subitem[value]
#print subitem
This works for the ECNumber value, but not the Km value. It can print the line, showing that it identifies the Km value as being present, but doesn't put it in the dictionary.
Example output:
{'Km': [], 'EC': '1.14.12.17', 'Kcat': []}
Any ideas?
Ben
The problem is that your inner for loop keeps reinitializing dict[subitem[EntryID]] even though it may already exist. That's fixed in the following by explicitly checking to see if it's already there:
dict = {}
for key, rows in groupby(FilteredTable1[1:], itemgetter(0)):
    for row in rows:
        # Create the record only the first time this ID is seen, so values
        # stored by earlier rows with the same ID are not wiped out.
        if row[EntryID] not in dict:
            dict[row[EntryID]] = {}
            dict[row[EntryID]]['EC'] = []
            dict[row[EntryID]]['Km'] = []
            dict[row[EntryID]]['Kcat'] = []
        if 'ECNumber' in row:
            dict[row[EntryID]]['EC'] = row[value]
        if 'Km' in row and 'Start Value' in row:
            dict[row[EntryID]]['Km'] = row[value]
However this code could be made more efficient by using something like the following instead, which avoids recomputing values and double dictionary lookups. It also doesn't use the name of a built-in type for a variable name, which goes against the guidelines given in the PEP8 - Style Guide for Python Code. It also suggests using CamelCase only for class names, not for variable names like FilteredTable1 — but I didn't change that.
adict = {}
for _key, rows in groupby(FilteredTable1[1:], itemgetter(0)):
    for row in rows:
        row_id = row[EntryID]
        # Fetch the record for this ID, creating an empty one on first sight.
        try:
            record = adict[row_id]
        except KeyError:
            record = adict[row_id] = {'EC': [], 'Km': [], 'Kcat': []}
        if 'ECNumber' in row:
            record['EC'] = row[value]
        if 'Km' in row and 'Start Value' in row:
            record['Km'] = row[value]
Actually, since you're building a dictionary of dictionaries, it's not clear that there's any advantage to using groupby to do so.
I'm posting this to follow-up and extend on my previous answer.
For starters, you could streamline the code a little further by eliminating the need to check for preexisting entries, simply by making the dictionary being created a collections.defaultdict (a dict subclass) instead of a regular one:
from collections import defaultdict


def _new_entry():
    """Return a fresh, empty record for one entry ID."""
    return {'EC': [], 'Km': [], 'Kcat': []}


# Looking up an unseen ID creates its empty record automatically.
adict = defaultdict(_new_entry)
for _key, rows in groupby(FilteredTable1[1:], itemgetter(0)):
    for row in rows:
        record = adict[row[EntryID]]
        if 'ECNumber' in row:
            record['EC'] = row[value]
        if 'Km' in row and 'Start Value' in row:
            record['Km'] = row[value]
Secondly, as I mentioned in the other answer, I don't think you're gaining anything by using itertools.groupby() to do this — except making the process more complicated than needed. This is because what you're basically doing is making a dictionary-of-dictionaries whose entries can all be randomly accessed, so there's no benefit in going to the trouble of grouping them before doing so. The code below shows this (in conjunction with using a defaultdict as shown above):
# Every row is filed directly under its ID; no grouping pass is needed
# because the defaultdict creates the empty record on first access.
adict = defaultdict(lambda: {'EC': [], 'Km': [], 'Kcat': []})
for row in FilteredTable1[1:]:
    record = adict[row[EntryID]]
    has_ec = 'ECNumber' in row
    has_km_start = 'Km' in row and 'Start Value' in row
    if has_ec:
        record['EC'] = row[value]
    if has_km_start:
        record['Km'] = row[value]

Failing to append to dictionary. Python

I am experiencing a strange faulty behaviour, where a dictionary is only appended once and I can not add more key value pairs to it.
My code reads in a multi-line string and extracts substrings via split(), to be added to a dictionary. I make use of conditional statements. Strangely only the key:value pairs under the first conditional statement are added.
Therefore I can not complete the dictionary.
How can I solve this issue?
Minimal code:
#I hope the '\n' is sufficient or use '\r\n'
example = "Name: Bugs Bunny\nDOB: 01/04/1900\nAddress: 111 Jokes Drive, Hollywood Hills, CA 11111, United States"
def format(data):
dic = {}
for line in data.splitlines():
#print('Line:', line)
if ':' in line:
info = line.split(': ', 1)[1].rstrip() #does not work with files
#print('Info: ', info)
if ' Name:' in info: #middle name problems! /maiden name
dic['F_NAME'] = info.split(' ', 1)[0].rstrip()
dic['L_NAME'] = info.split(' ', 1)[1].rstrip()
elif 'DOB' in info: #overhang
dic['DD'] = info.split('/', 2)[0].rstrip()
dic['MM'] = info.split('/', 2)[1].rstrip()
dic['YY'] = info.split('/', 2)[2].rstrip()
elif 'Address' in info:
dic['STREET'] = info.split(', ', 2)[0].rstrip()
dic['CITY'] = info.split(', ', 2)[1].rstrip()
dic['ZIP'] = info.split(', ', 2)[2].rstrip()
return dic
if __name__ == '__main__':
x = format(example)
for v, k in x.iteritems():
print v, k
Your code doesn't work, at all. You split off the name before the colon and discard it, looking only at the value after the colon, stored in info. That value never contains the names you are looking for; Name, DOB and Address all are part of the line before the :.
Python lets you assign to multiple names at once; make use of this when splitting:
def format(data):
    """Parse ``Label: value`` lines of *data* into a flat dict.

    Lines without a colon, and lines whose label is not ``Name``, ``DOB`` or
    ``Address``, are ignored. Recognised lines fill these keys:

    * ``Name``    -> ``F_NAME``, ``L_NAME`` (split on the first whitespace run)
    * ``DOB``     -> ``DD``, ``MM``, ``YY`` (split on ``/``)
    * ``Address`` -> ``STREET``, ``CITY``, ``ZIP`` (split on ``', '``; the
      third part keeps everything after the second separator)

    Raises:
        ValueError: if a recognised line does not split into the expected
            number of parts.
    """
    parsed = {}
    for raw_line in data.splitlines():
        # partition() returns an empty separator when the line has no colon.
        label, colon, rest = raw_line.partition(':')
        if not colon:
            continue
        label = label.strip()
        if label == 'Name':
            # split(None, 1) also discards the whitespace around the parts.
            first, last = rest.split(None, 1)
            parsed.update(F_NAME=first, L_NAME=last)
        elif label == 'DOB':
            day, month, year = [piece.strip() for piece in rest.split('/', 2)]
            parsed.update(DD=day, MM=month, YY=year)
        elif label == 'Address':
            street, city, zip_code = [piece.strip() for piece in rest.split(', ', 2)]
            parsed.update(STREET=street, CITY=city, ZIP=zip_code)
    return parsed
I used str.partition() here rather than limit str.split() to just one split; it is slightly faster that way.
For your sample input this produces:
>>> format(example)
{'CITY': 'Hollywood Hills', 'ZIP': 'CA 11111, United States', 'L_NAME': 'Bunny', 'F_NAME': 'Bugs', 'YY': '1900', 'MM': '04', 'STREET': '111 Jokes Drive', 'DD': '01'}
>>> from pprint import pprint
>>> pprint(format(example))
{'CITY': 'Hollywood Hills',
'DD': '01',
'F_NAME': 'Bugs',
'L_NAME': 'Bunny',
'MM': '04',
'STREET': '111 Jokes Drive',
'YY': '1900',
'ZIP': 'CA 11111, United States'}

Categories