Getting rid of white space between name, number and height - python

I have txt file like this;
name lastname 17 189cm
How do I get it to be like this?
name lastname, 17, 189cm

Using str.strip and str.split:
>>> my_string = 'name lastname 17 189cm'
>>> s = list(map(str.strip, my_string.split()))
>>> ', '.join([' '.join(s[:2]), *s[2:] ])
'name lastname, 17, 189cm'

You can use regex to replace multiple spaces (or tabs) with a comma:
import re
text = 'name lastname 17 189cm'
re.sub(r'\s\s+|\t', ', ', text)

text = 'name lastname 17 189cm'
out = ', '.join(text.rsplit(maxsplit=2)) # if sep is not provided then any consecutive whitespace is a separator
print(out) # name lastname, 17, 189cm

You could use re.sub:
import re
s = "name lastname 17 189cm"
re.sub("[ ]{2,}",", ", s)
PS: for the first problem you proposed, I had the following solution:
s = "name lastname 17 189cm"
s[::-1].replace(" ",",", 2)[::-1]

Related

Delete one by one the elements of a list, and then restore the previous ones

I need it to extract the word that starts with a capital letter, if and only if, this word is preceded by the beginning of the sentence or by one of these options (?:,and|and|her friends|,or |or |,)
import re
match_names = ""
input_sense = "Susan gave some cosmetic gifts to her friends Lisa, Veronica and Katy, but only Katy thanked her"
#I concatenate a series of characters that probably nobody uses so that it searches at the beginning
input_sense = "rlt99ll" + input_sense
if match := re.findall(r"(?:rlt99ll|,and|and|her friends|,or |or |,)\s*([A-Z].*?\b)", input_sense):
match_names = match
print("match names: ")
print(match_names)
input_sense = input_sense.replace("rlt99ll", "") #I add this aux-string only for the pattern
n = 0
print("match_auxs : ")
for name in match_names:
match_aux = match_names
for m in match_aux:
if (m == name):
match_aux[n] = ""
n += 1
n = 0
print(match_aux)
I need that output lists:
match names:
['Susan', 'Lisa', 'Veronica', 'Katy']
match_auxs :
['','Lisa', 'Veronica', 'Katy']
['Susan', '', 'Veronica', 'Katy']
['Susan', 'Lisa', '', 'Katy']
['Susan', 'Lisa', 'Veronica', '']
But I get this ( and it's wrong)...
match names:
['Susan', 'Lisa', 'Veronica', 'Katy']
match_auxs :
['', 'Lisa', 'Veronica', 'Katy']
['', '', 'Veronica', 'Katy']
['', '', '', 'Katy']
['', '', '', '']
As said in the comments, assigning a list to another variable doesn't create a copy of it. Along with this, your code can be simplified by using functions like enumerate:
import re
match_names = ""
input_sense = "Susan gave some cosmetic gifts to her friends Lisa, Veronica and Katy, but only Katy thanked her"
#I concatenate a series of characters that probably nobody uses so that it searches at the beginning
input_sense = "rlt99ll" + input_sense
if match_names := re.findall(r"(?:rlt99ll|,and|and|her friends|,or |or |,)\s*([A-Z].*?\b)", input_sense):
print(f"match names: {match_names}")
input_sense = input_sense.replace("rlt99ll", "") #I add this aux-string only for the pattern
n = 0
print("match_auxs: ")
for index, name in enumerate(match_names):
match_aux = match_names.copy()
match_aux[index] = ""
n = 0
print(match_aux)
If you don't want to use copy on the list (for speed), this code will also work:
import re
match_names = ""
input_sense = "Susan gave some cosmetic gifts to her friends Lisa, Veronica and Katy, but only Katy thanked her"
#I concatenate a series of characters that probably nobody uses so that it searches at the beginning
input_sense = "rlt99ll" + input_sense
if match_names := re.findall(r"(?:rlt99ll|,and|and|her friends|,or |or |,)\s*([A-Z].*?\b)", input_sense):
print(f"match names: {match_names}")
input_sense = input_sense.replace("rlt99ll", "") #I add this aux-string only for the pattern
n = 0
print("match_auxs: ")
prev = ""
for index, name in enumerate(match_names):
if index > 0:
match_names[index - 1] = prev
prev = match_names[index]
match_names[index] = ""
print(match_names)
match_names[-1] = prev

How to retrieve information in the first section of the raw data only by regular expressions?

Below is a sample of the raw data which my code will process by regular expressions:
raw_data = '''
name : John
age : 26
gender : male
occupation : teacher
Father
---------------------
name : Bill
age : 52
gender : male
Mother
---------------------
name : Mary
age : 48
gender : female
'''
I want to retrieve the following part of information from the raw data and store it in a dictionary:
dict(name = 'John', age = 26, gender = 'male', occupation = 'teacher')
However, when I run my code as follows, it does not work as I expect:
import re
p = re.compile('[^-]*?^([^:\-]+?):([^\r\n]*?)$', re.M)
rets = p.findall(raw_data)
infoAboutJohnAsDict = {}
if rets != []:
for ret in rets:
infoAboutJohnAsDict[ret[0]] = ret[1]
else:
print("Not match.")
print(f'rets = {rets}')
print(f'infoAboutJohnAsDict = {infoAboutJohnAsDict}')
Can anyone give me any suggestion about how I should modify my code to achieve what I intend to do?
Here is one approach using regular expressions. We can first trim off the latter portion of the input which you don't want using re.sub. Then, use re.findall to find all key value pairs for John, and convert to a dictionary.
raw_data = re.sub(r'\s+\w+\s+-+.*', '', raw_data, flags=re.S)
matches = re.findall(r'(\w+)\s*:\s*(\w+)', raw_data)
d = dict()
for m in matches:
d[m[0]] = m[1]
print(d)
# {'gender': 'male', 'age': '26', 'name': 'John', 'occupation': 'teacher'}

Remove punctation from every value in Python dictionary

I have a long dictionary which looks like this:
name = 'Barack.'
name_last = 'Obama!'
street_name = "President Streeet?"
list_of_slot_names = {'name':name, 'name_last':name_last, 'street_name':street_name}
I want to remove the punctation for every slot (name, name_last,...).
I could do it this way:
name = name.translate(str.maketrans('', '', string.punctuation))
name_last = name_last.translate(str.maketrans('', '', string.punctuation))
street_name = street_name.translate(str.maketrans('', '', string.punctuation))
Do you know a shorter (more compact) way to write this?
Result:
>>> print(name, name_last, street_name)
>>> Barack Obama President Streeet
Use a loop / dictionary comprehension
{k: v.translate(str.maketrans('', '', string.punctuation)) for k, v in list_of_slot_names.items()}
You can either assign this back to list_of_slot_names if you want to overwrite existing values or assign to a new variable
You can also then print via
print(*list_of_slot_names.values())
name = 'Barack.'
name_last = 'Obama!'
empty_slot = None
street_name = "President Streeet?"
print([str_.strip('.?!') for str_ in (name, name_last, empty_slot, street_name) if str_ is not None])
-> Barack Obama President Streeet
Unless you also want to remove them from the middle. Then do this
import re
name = 'Barack.'
name_last = 'Obama!'
empty_slot = None
street_name = "President Streeet?"
print([re.sub('[.?!]+',"",str_) for str_ in (name, name_last, empty_slot, street_name) if str_ is not None])
import re, string
s = 'hell:o? wor!d.'
clean = re.sub(rf"[{string.punctuation}]", "", s)
print(clean)
output
hello world

prevent pandas from removing spaces in numbers in text columns

I'm trying to load CSV file into pandas dataframe. CSV is semicolon delimited. Values in text columns are in double quotation marks.
File in question: https://www.dropbox.com/s/1xv391gebjzmmco/file_01.csv?dl=0
In one of the text columns ('TYTUL') i have following value:
"00 307 1457 212"
I specify the column as str but when i print or export results to excel I get
003071457212
instead of
00 307 1457 212
How do I prevent pandas from removing spaces?
Here is my code:
import pandas
df = pandas.read_csv(r'file_01.csv'
,sep = ';'
,quotechar = '"'
,names = ['DATA_OPERACJI'
,'DATA_KSIEGOWANIA'
,'OPIS_OPERACJI'
,'TYTUL'
,'NADAWCA_ODBIORCA'
,'NUMER_KONTA'
,'KWOTA'
,'SALDO_PO_OPERACJI'
,'KOLUMNA_9']
,usecols = [0,1,2,3,4,5,6,7]
,skiprows = 38
,skipfooter = 3
,encoding = 'cp1250'
,thousands = ' '
,decimal = ','
,parse_dates = [0,1]
,converters = {'OPIS_OPERACJI': str
,'TYTUL': str
,'NADAWCA_ODBIORCA': str
,'NUMER_KONTA': str}
,engine = 'python'
)
df.TYTUL.replace([' +', '^ +', ' +$'], [' ', '', ''],regex=True,inplace=True) #this only removes excessive spaces
print(df.TYTUL)
I also came up with a workaround (comment #workaround) but I would like to ask if there is a better way.
import pandas
df = pandas.read_csv(r'file_01.csv'
,sep = ';'
,quotechar = '?' #workaround
,names = ['DATA_OPERACJI'
,'DATA_KSIEGOWANIA'
,'OPIS_OPERACJI'
,'TYTUL'
,'NADAWCA_ODBIORCA'
,'NUMER_KONTA'
,'KWOTA'
,'SALDO_PO_OPERACJI'
,'KOLUMNA_9']
,usecols = [0,1,2,3,4,5,6,7]
,skiprows = 38
,skipfooter = 3
,encoding = 'cp1250'
,thousands = ' '
,decimal = ','
,parse_dates = [0,1]
,converters = {'OPIS_OPERACJI': str
,'TYTUL': str
,'NADAWCA_ODBIORCA': str
,'NUMER_KONTA': str}
,engine = 'python'
)
df.TYTUL.replace([' +', '^ +', ' +$'], [' ', '', ''],regex=True,inplace=True) #this only removes excessive spaces
df.TYTUL.replace(['^"', '"$'], ['', ''],regex=True,inplace=True) #workaround
print(df.TYTUL)
remove this line from your read_csv code
,thousands = ' '
I tested it, the output is correct without this option
'00 307 1457 212'

Multiple split of input?

I know that you can use split() to split a user input into two, but how would you split input that consists of multiple variables ? For example:
User input:
Shawn=14:soccer#2991842
What I would like to do:
name = Shawn
age = 14
course = soccer
idnumber = 2991842
What's the best way to do such thing ?
str = 'Shawn=14:soccer#2991842'
keys = ['name', 'age', 'course', 'idnumber']
values = re.split('[=:#]', str)
print dict(zip(keys, values))
Out[114]: {'age': '14', 'course': 'soccer', 'idnumber': '2991842', 'name': 'Shawn'}
I think Regex will work best here:
>>> from re import split
>>> mystr = "Shawn=14:soccer#2991842"
>>> split("\W", mystr)
['Shawn', '14', 'soccer', '2991842']
>>> lst = split("\W", mystr)
>>> name = lst[0]
>>> name
'Shawn'
>>> age = lst[1]
>>> age
'14'
>>> course = lst[2]
>>> course
'soccer'
>>> idnumber = lst[3]
>>> idnumber
'2991842'
>>>
Also, the above is a step-by-step demonstration. You can actually just do:
name, age, course, idnumber = split("\W", mystr)
Here's how I would do it.
def splitStr(str):
temp = str.split(':')
temp_nameAge = temp[0].split('=')
temp_courseId = temp[1].split('#')
name = temp_nameAge[0]
age = int(temp_nameAge[1])
course = temp_courseId[0]
idnumber = int(temp_courseId[1])
print 'Name = %s, age = %i, course = %s, id_number = %i' % (name, age, course, idnumber)
Another thing you can do is use split like: string.split(":").
Then you can change the format to "name:age:course:number"
You could just keep splitting the splits...
text2split = "Shawn=14:soccer#2991842"
name = text2split.split('=')[0]
age = text2split.split('=')[1].split(':')[0]
course = text2split.split('=')[1].split(':')[1].split('#')[0]
idnumber = text2split.split('=')[1].split(':')[1].split('#')[1]
This isn't the most elegant way to do it, but it'll work so long as text2split always has the same delimeters.
If you are ok with storing them under dictionary keys, you could use named group references
import re
x='shawn=14:soccer#2991842'
re.match(r'(?P<name>.*?)=(?P<age>.*):(?P<course>.*?)#(?P<idnumber>.*)', x).groupdict()
{'idnumber': '2991842', 'course': 'soccer', 'age': '14', 'name': 'shawn

Categories