Extracting text after string - python

I want to extract the string after "name=" from the following text. I have written the following regular expression but it isn't really working. The desired output is [Taal, Muntinlupa city]
text = [ "id='00e5885868b4d7ed', url='https://api.twitter.com/1.1/geo/id/00e5885868b4d7ed.json', place_type='city', name='Taal', full_name='Taal, Calabarzon', country_code='PH', country='Republic of the Philippines'",
"id='00c699d656122ebe', url='https://api.twitter.com/1.1/geo/id/00c699d656122ebe.json', place_type='city', name='Muntinlupa City', full_name='Muntinlupa City, National Capital Region', country_code='PH', country='Republic of the Philippines']
matched_vals = [re.findall(r'(?<=name\=).*(?=\s)',tweet) for tweet in text]

Use pattern r"name='(.+?)'"
Ex:
import re
text = [ "id='00e5885868b4d7ed', url='https://api.twitter.com/1.1/geo/id/00e5885868b4d7ed.json', place_type='city', name='Taal', full_name='Taal, Calabarzon', country_code='PH', country='Republic of the Philippines'",
"id='00c699d656122ebe', url='https://api.twitter.com/1.1/geo/id/00c699d656122ebe.json', place_type='city', name='Muntinlupa City', full_name='Muntinlupa City, National Capital Region', country_code='PH', country='Republic of the Philippines'"
]
for i in text:
print(re.search(r"name='(.+?)'", i).group(1))
Output:
Taal
Muntinlupa City

Create a dictionary out of each string, and then take the value of the key 'name':
dicts = []
for dic in text:
dicts.append(ast.literal_eval(dic))
and then you can use these names (and the other data) very efficiently:
for d in dicts:
print(d['name'])

Related

Using regular expression to remove commonly used company suffixes from a list of companies

I have the following code that I use to generate a list of common company suffixes below:
import re
from cleanco import typesources,
import string
def generate_common_suffixes():
    """Return the lower-cased, de-duplicated company suffixes plus 'holding'.

    Every entry of every group returned by ``typesources()`` is lower-cased
    and kept the first time it is seen, preserving encounter order. The
    extra word ``'holding'`` is always appended at the end.
    """
    # presumably typesources() yields groups (iterables) of suffix strings;
    # verify against the cleanco version in use
    suffix_groups = typesources()
    lowered = (entry.lower() for group in suffix_groups for entry in group)
    # dict.fromkeys drops repeats while keeping first-seen order
    suffixes = list(dict.fromkeys(lowered))
    suffixes.append('holding')
    return suffixes
I'm then trying to use the following code to remove those suffixes from a list of company names
company_name = ['SAMSUNG ÊLECTRONICS Holding, LTD', 'Apple inc',
'FIIG Securities Limited Asset Management Arm',
'First Eagle Alternative Credit, LLC', 'Global Credit
Investments','Seatown', 'Sona Asset Management']
suffixes = generate_common_suffixes()
cleaned_names = []
for company in company_name:
for suffix in suffixes:
new = re.sub(r'\b{}\b'.format(re.escape(suffix)), '', company)
cleaned_names.append(new)
I keep getting a list of unchanged company names despite knowing that the suffixes are there.
Alternate Attempt
I've also tried an alternate method where I'd look for the word and replace it without regex, but i couldn't figure out why it was removing parts of the company name itself - for example, it would remove the first 3 letters in Samsung
for word in common_words:
name = name.replace(word, "")
Any help is greatly appreciated!
import unicodedata
from cleanco import basename
import re
company_names = ['SAMSUNG ÊLECTRONICS Holding, LTD',
                 'Apple inc',
                 'FIIG Securities Limited Asset Management Arm',
                 'First Eagle Alternative Credit, LLC',
                 'Global Credit Investments',
                 'Seatown',
                 'Sona Asset Management']
suffix = ["holding"]  # "Common words"? You can add more
# One pre-compiled pattern per common word, built once instead of per company.
# re.escape keeps entries that contain regex metacharacters (e.g. "co.")
# literal instead of letting them act as pattern syntax.
common_word_patterns = [re.compile(fr"\b{re.escape(word)}\b") for word in suffix]
cleaned_names = []
for company_name in company_names:
    # To lower
    company_name = company_name.lower()
    # Fix unicode: decompose accented characters, then drop whatever is not ASCII
    company_name = unicodedata.normalize('NFKD', company_name).encode('ASCII', 'ignore').decode()
    # Remove punctuation
    company_name = re.sub(r'[^\w\s]', '', company_name)
    # Remove suffixes
    company_name = basename(company_name)
    # Remove common words
    for pattern in common_word_patterns:
        company_name = pattern.sub('', company_name)
    # Save
    cleaned_names.append(company_name)
print(cleaned_names)
Output:
['samsung electronics ', 'apple', 'fiig securities limited asset management arm', 'first eagle alternative credit', 'global credit investments', 'seatown', 'sona asset management']

How to retrieve information in the first section of the raw data only by regular expressions?

Below is a sample of the raw data which my code will process by regular expressions:
raw_data = '''
name : John
age : 26
gender : male
occupation : teacher
Father
---------------------
name : Bill
age : 52
gender : male
Mother
---------------------
name : Mary
age : 48
gender : female
'''
I want to retrieve the following part of information from the raw data and store it in a dictionary:
dict(name = 'John', age = 26, gender = 'male', occupation = 'teacher')
However, when I run my code as follows, it does not work as I expect:
import re
p = re.compile('[^-]*?^([^:\-]+?):([^\r\n]*?)$', re.M)
rets = p.findall(raw_data)
infoAboutJohnAsDict = {}
if rets != []:
for ret in rets:
infoAboutJohnAsDict[ret[0]] = ret[1]
else:
print("Not match.")
print(f'rets = {rets}')
print(f'infoAboutJohnAsDict = {infoAboutJohnAsDict}')
Can anyone give me any suggestion about how I should modify my code to achieve what I intend to do?
Here is one approach using regular expressions. We can first trim off the latter portion of the input which you don't want using re.sub. Then, use re.findall to find all key value pairs for John, and convert to a dictionary.
# Cut the text at the first "<label> / dashed rule" pair: with re.S the
# trailing .* swallows everything from that point to the end of the input.
raw_data = re.sub(r'\s+\w+\s+-+.*', '', raw_data, flags=re.S)
# Each match is a (key, value) tuple, which dict() accepts directly.
matches = re.findall(r'(\w+)\s*:\s*(\w+)', raw_data)
d = dict(matches)
print(d)
# {'gender': 'male', 'age': '26', 'name': 'John', 'occupation': 'teacher'}

Remove punctuation from every value in a Python dictionary

I have a long dictionary which looks like this:
name = 'Barack.'
name_last = 'Obama!'
street_name = "President Streeet?"
list_of_slot_names = {'name':name, 'name_last':name_last, 'street_name':street_name}
I want to remove the punctuation from every slot (name, name_last, ...).
I could do it this way:
name = name.translate(str.maketrans('', '', string.punctuation))
name_last = name_last.translate(str.maketrans('', '', string.punctuation))
street_name = street_name.translate(str.maketrans('', '', string.punctuation))
Do you know a shorter (more compact) way to write this?
Result:
>>> print(name, name_last, street_name)
>>> Barack Obama President Streeet
Use a loop / dictionary comprehension
{k: v.translate(str.maketrans('', '', string.punctuation)) for k, v in list_of_slot_names.items()}
You can either assign this back to list_of_slot_names if you want to overwrite existing values or assign to a new variable
You can also then print via
print(*list_of_slot_names.values())
name = 'Barack.'
name_last = 'Obama!'
empty_slot = None
street_name = "President Streeet?"
print([str_.strip('.?!') for str_ in (name, name_last, empty_slot, street_name) if str_ is not None])
-> Barack Obama President Streeet
Unless you also want to remove them from the middle. Then do this
import re
name = 'Barack.'
name_last = 'Obama!'
empty_slot = None
street_name = "President Streeet?"
print([re.sub('[.?!]+',"",str_) for str_ in (name, name_last, empty_slot, street_name) if str_ is not None])
import re, string
s = 'hell:o? wor!d.'
clean = re.sub(rf"[{string.punctuation}]", "", s)
print(clean)
output
hello world

Python Re: Overwrite Issue

I am having an issue with replacing a part of a string using the code below. My goal is, for every string that includes a key in this dictionary:
mapping = { "St": "Street",
"St.": "Street",
'Rd': 'Road',
'Rd.': 'Road',
'Ave': 'Avenue',
'Ave.': 'Avenue',
'Ln':'Lane',
'Ln.':'Lane',
'Dr':'Drive',
'Dr.':'Drive',
'Pl':'Place',
'Pl.':'Place',
'Pkwy':'Parkway',
'Blvd.': 'Boulevard',
'Blvd': 'Boulevard'
}
To replace that part of the string with the value in the dictionary.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
# For each key of `mapping` that occurs in `name` as a substring, substitute
# the match of the module-level `street_type_re` with the mapped value and
# print the before/after pair.
# `street_type_re` is anchored with `$`, so the substitution targets the
# trailing run of non-whitespace characters in `name`, not the key that was found.
# NOTE(review): Python 2 syntax (`iteritems`, `print` statement).
# NOTE(review): indentation was lost in this listing, so the exact nesting of
# the print/return lines cannot be confirmed from here.
def update_name(name, mapping):
for key,value in mapping.iteritems():
# Plain substring test, not a whole-word match.
if key in name:
newname = re.sub(street_type_re,value,name)
print name,'==>',newname
# NOTE(review): returns the original `name`; `newname` is only printed.
return name
Right now the code is doing stuff like this
National Rd SW ==> National Rd Road
I need to fix it so that it returns this
National Rd SW ==> National Road SW
newname = re.sub(key,value,name)
You can simply replace key instead of matching it with precompiled regex or
newname = re.sub(r"\b"+key+r"\b",value,name)
Your pattern replaces only the last word because it is anchored with $.

Failing to append to dictionary. Python

I am experiencing a strange faulty behaviour, where a dictionary is only appended once and I can not add more key value pairs to it.
My code reads in a multi-line string and extracts substrings via split(), to be added to a dictionary. I make use of conditional statements. Strangely only the key:value pairs under the first conditional statement are added.
Therefore I can not complete the dictionary.
How can I solve this issue?
Minimal code:
#I hope the '\n' is sufficient or use '\r\n'
example = "Name: Bugs Bunny\nDOB: 01/04/1900\nAddress: 111 Jokes Drive, Hollywood Hills, CA 11111, United States"
# Build a dict of name / date-of-birth / address parts from the "Key: value"
# lines of `data`.
# NOTE(review): the function name shadows the built-in `format`.
# NOTE(review): indentation was lost in this listing, so the exact nesting of
# the branches below cannot be confirmed from here.
def format(data):
dic = {}
for line in data.splitlines():
#print('Line:', line)
if ':' in line:
# `info` keeps only the text after the first ': '; the key before it is discarded.
info = line.split(': ', 1)[1].rstrip() #does not work with files
#print('Info: ', info)
# NOTE(review): the three tests below look for the key names inside `info`,
# i.e. inside the value part of the line, not inside the key.
if ' Name:' in info: #middle name problems! /maiden name
dic['F_NAME'] = info.split(' ', 1)[0].rstrip()
dic['L_NAME'] = info.split(' ', 1)[1].rstrip()
elif 'DOB' in info: #overhang
dic['DD'] = info.split('/', 2)[0].rstrip()
dic['MM'] = info.split('/', 2)[1].rstrip()
dic['YY'] = info.split('/', 2)[2].rstrip()
elif 'Address' in info:
dic['STREET'] = info.split(', ', 2)[0].rstrip()
dic['CITY'] = info.split(', ', 2)[1].rstrip()
dic['ZIP'] = info.split(', ', 2)[2].rstrip()
return dic
if __name__ == '__main__':
x = format(example)
for v, k in x.iteritems():
print v, k
Your code doesn't work, at all. You split off the name before the colon and discard it, looking only at the value after the colon, stored in info. That value never contains the names you are looking for; Name, DOB and Address all are part of the line before the :.
Python lets you assign to multiple names at once; make use of this when splitting:
def _split_padded(value, sep, count):
    """Split *value* into exactly *count* stripped parts, padding with ''."""
    parts = [part.strip() for part in value.split(sep, count - 1)]
    parts.extend([''] * (count - len(parts)))
    return parts


def format(data):
    """Parse ``Key: value`` lines into a flat dict of name, DOB and address parts.

    Recognised keys (the text before the first colon, stripped):

    * ``Name``    -> ``F_NAME``, ``L_NAME`` (split on the first whitespace run)
    * ``DOB``     -> ``DD``, ``MM``, ``YY`` (split on ``/``)
    * ``Address`` -> ``STREET``, ``CITY``, ``ZIP`` (split on ``', '``; anything
      after the second separator stays in ``ZIP``)

    Lines without a colon and lines with any other key are ignored. A value
    with fewer parts than expected (for example a one-word name) fills the
    missing fields with ``''`` instead of raising ``ValueError``.

    Returns the resulting dictionary; it is empty when nothing matched.
    """
    dic = {}
    for line in data.splitlines():
        # partition() splits on the first colon only; an empty separator
        # means the line had no colon at all.
        name, colon, value = line.partition(':')
        if not colon:
            continue
        name = name.strip()
        if name == 'Name':
            dic['F_NAME'], dic['L_NAME'] = _split_padded(value, None, 2)
        elif name == 'DOB':
            dic['DD'], dic['MM'], dic['YY'] = _split_padded(value, '/', 3)
        elif name == 'Address':
            dic['STREET'], dic['CITY'], dic['ZIP'] = _split_padded(value, ', ', 3)
    return dic
I used str.partition() here rather than limit str.split() to just one split; it is slightly faster that way.
For your sample input this produces:
>>> format(example)
{'CITY': 'Hollywood Hills', 'ZIP': 'CA 11111, United States', 'L_NAME': 'Bunny', 'F_NAME': 'Bugs', 'YY': '1900', 'MM': '04', 'STREET': '111 Jokes Drive', 'DD': '01'}
>>> from pprint import pprint
>>> pprint(format(example))
{'CITY': 'Hollywood Hills',
'DD': '01',
'F_NAME': 'Bugs',
'L_NAME': 'Bunny',
'MM': '04',
'STREET': '111 Jokes Drive',
'YY': '1900',
'ZIP': 'CA 11111, United States'}

Categories