Removing unwanted characters from a string in Python - python

I have some strings from which I want to delete some unwanted characters.
For example: Adam'sApple ----> AdamsApple.(case insensitive)
Can someone help me? I need the fastest way to do it, because I have a couple of million records that have to be polished.
Thanks

One simple way:
>>> s = "Adam'sApple"
>>> x = s.replace("'", "")
>>> print x
'AdamsApple'
... or take a look at regex substitutions.

Here is a function that removes all the irritating ascii characters, the only exception is "&" which is replaced with "and". I use it to police a filesystem and ensure that all of the files adhere to the file naming scheme I insist everyone uses.
def cleanString(incomingString):
    """Return *incomingString* with irritating ASCII punctuation removed.

    The one exception is "&", which is replaced with the word "and"
    rather than deleted.  Used to police a filesystem so that file names
    adhere to a naming scheme.
    """
    # Special case first: "&" becomes "and" (the replacement text itself
    # contains no characters from the deletion set, so order is safe).
    newstring = incomingString.replace("&", "and")
    # Delete all remaining unwanted characters in a single C-level pass
    # instead of 25 chained .replace() calls (the original also listed
    # "#" twice).  Third maketrans argument = characters to delete.
    return newstring.translate(str.maketrans("", "", "!#$%^*()+=?'\"{}[]<>~`:;|\\/"))

Any characters in the 2nd argument of the translate method are deleted:
>>> "Adam's Apple!".translate(None,"'!")
'Adams Apple'
NOTE: translate requires Python 2.6 or later to use None for the first argument, which otherwise must be a translation string of length 256. string.maketrans('','') can be used in place of None for pre-2.6 versions.

Try:
"Adam'sApple".replace("'", '')
One step further, to replace multiple characters with nothing:
import re
print re.sub(r'''['"x]''', '', '''a'"xb''')
Yields:
ab

str.replace("'","");

As has been pointed out several times now, you have to either use replace or regular expressions (most likely you don't need regexes though), but if you also have to make sure that the resulting string is plain ASCII (doesn't contain funky characters like é, ò, µ, æ or φ), you could finally do
>>> u'(like é, ò, µ, æ or φ)'.encode('ascii', 'ignore')
'(like , , , or )'

An alternative that will take in a string and an array of unwanted chars
# function that removes unwanted signs from str
#Pass the string to the function and an array of unwanted chars
def removeSigns(str, arrayOfChars):
    """Return *str* with every character listed in *arrayOfChars* removed.

    Note: the first parameter shadows the builtin ``str``; the name is kept
    for backward compatibility with existing callers.
    """
    # Build the set once: O(1) membership test replaces the original
    # inner loop + charFound flag (O(n*m)), and join() replaces the
    # quadratic string concatenation.
    unwanted = set(arrayOfChars)
    return "".join(letter for letter in str if letter not in unwanted)

Let's say we have the following list:
states = [' Alabama ', 'Georgia!', 'Georgia', 'georgia', 'south carolina##', 'West virginia?']
Now we will define a function clean_strings()
import re
def clean_strings(strings):
    """Return a new list where each entry of *strings* has been stripped of
    surrounding whitespace, had the characters !, # and ? removed, and been
    title-cased."""
    return [re.sub('[!#?]', '', raw.strip()).title() for raw in strings]
When we call the function clean_strings(states)
The result will look like:
['Alabama',
'Georgia',
'Georgia',
'Georgia',
'South Carolina',
'West Virginia']

I am probably late for the answer, but I think the code below would also do (to an extreme end).
It will remove all the unnecessary chars:
a = '; niraj kale 984wywn on 2/2/2017'
a= re.sub('[^a-zA-Z0-9.?]',' ',a)
a = a.replace('  ', ' ').lstrip().rstrip()
which will give
'niraj kale 984wywn on 2 2 2017'

Related

Format the string with missing elements - Python

I have the following string:
string1 = "1/0/1/A1,A2"
string2 = "1/1/A1,A2"
string3 = "0/A1,A2"
In the above strings I have to replace the character with zero if it does not exist. The default structure will be "number/number/number/any_character`", if any of number is missing It has to replace with zero. The answer will be as follows.
print(string1) = "1/0/1/A1,A2"
print(string2) = "1/1/0/A1,A2"
print(string3) = "0/0/0/A1,A2"
You can use str.split:
def pad_string(_input, _add='0'):
    """Pad the '/'-separated string so it has three leading segments,
    inserting *_add* segments just before the final (non-numeric) part."""
    parts = _input.split('/')
    numbers, tail = parts[:-1], parts[-1]
    padding = [_add] * (3 - len(numbers))
    return '/'.join(numbers + padding + [tail])
results = list(map(pad_string, ['1/0/1/A1,A2', '1/1/A1,A2', '0/A1,A2']))
Output:
['1/0/1/A1,A2', '1/1/0/A1,A2', '0/0/0/A1,A2']
You can easily fill missing elements from the left:
def fillZeros(item):
    """Prepend '0' segments on the left until the string has four
    '/'-separated chunks."""
    chunks = item.split('/')
    missing = 4 - len(chunks)
    return '/'.join(['0'] * missing + chunks)
string1 = "1/0/1/A1,A2"
string2 = "1/1/A1,A2"
string3 = "0/A1,A2"
for myString in (string1, string2, string3):
print fillZeros(myString)
Prints:
1/0/1/A1,A2
0/1/1/A1,A2
0/0/0/A1,A2
But for you string2 example you need to identify which element is missing: 1/1/A1,A2. Is the first or the third element missing ?!
If you want to use just string manipulation and loops, try this
# NOTE(review): this approach guesses how many numeric segments are present
# by counting occurrences of "0/" and "1/", so it only works when every
# numeric segment is 0 or 1, and the insertion offsets ([:4] / [:2]) are
# hard-coded for these example inputs.  As the text below admits, the
# two-numbers-missing case comes out as '0/0/A1,A2', not '0/0/0/A1,A2'.
strings_list = []
for string in [string1, string2, string3]: # make list containing all strings
    strings_list.append(string)
new_strings = [] # make list containing the new strings
for string in strings_list:
    if string.count("0/") + string.count("1/") == 3:
        # identify the strings not missing a number
        new_strings.append(string)
    if string.count("0/") + string.count("1/") == 2:
        # identify the strings missing 1 number
        string = string[:4] + "0/" + string[4:]
        new_strings.append(string)
    if string.count("0/") + string.count("1/") == 1:
        # identify the strings missing 2 numbers
        string = string[:2] + "0/" + string[2:]
        new_strings.append(string)
print(new_strings)
This results in ['1/0/1/A1,A2', '1/1/0/A1,A2', '0/0/A1,A2'].

Split string with delimiter as whitespace, but preserve whitespace within double quotes (and the double quotes themselves) in Python

Split a string on whitespace, but preserve whitespace that appears within double quotes — and keep the double quotes themselves — in Python.
a='Append ",","te st1",input To output'
output list should be like below
['Append', '",","te st1",input', 'To', 'output']
I found a solution using regular expressions:
re.findall("(?:\".*?\"|\S)+", a)
gives
['Append', '",","te st1",input', 'To', 'output']
Update: Improved pattern that includes escaping:
re.findall("(?:\".*?[^\\\"]\"|\S)+", a)
Please note that this also matches the empty string "" by means of the \S part of the pattern.
Note: Old answer below for archival purposes:
The obvious answer would be to use shlex like this:
>>> shlex.split('Append ",","te st1",input To output')
['Append', ',,te st1,input', 'To', 'output']
This will remove the quotes, unfortunately. Anyways, this kind of problem can be solved with a simple state machine. The performance might be sub-par, but it works:
#!/usr/bin/env python2
import string
def split_string_whitespace(s):
    """Split *s* on whitespace, but keep whitespace (and the quote marks)
    inside double-quoted spans.  A trailing sentinel space flushes the
    final token; an unterminated quote therefore loses its token, matching
    the original state machine."""
    tokens = []
    buf = []
    in_quotes = False
    for ch in s + " ":
        if in_quotes:
            # Inside quotes: keep everything, leave on the closing quote.
            buf.append(ch)
            if ch == '"':
                in_quotes = False
        elif ch in string.whitespace:
            # Outside quotes: whitespace terminates the current token.
            if buf:
                tokens.append("".join(buf))
                buf = []
        else:
            buf.append(ch)
            if ch == '"':
                in_quotes = True
    return tokens
print split_string_whitespace('Append ",","te st1",input To output')
The script yields:
['Append', '",","te st1",input', 'To', 'output']
I'm pretty sure one could construct something with the re submodule, so I'm waiting for that answer, too :)
A very simple generator function, maintaining the current "quotation state":
def splitter(s):
    """Yield the space-separated tokens of *s*, treating double-quoted
    spans (quotes kept) as unbreakable; the appended sentinel space
    flushes the final token."""
    start = 0
    inside = False
    for pos, ch in enumerate(s + ' '):
        if ch == '"':
            inside = not inside
        elif ch == ' ' and not inside:
            if pos > start:
                yield s[start:pos]
            start = pos + 1
list(splitter(a))
# ['Append', '",","te st1",input', 'To', 'output']

How to copy spaces from one string to another in Python?

I need a way to copy all of the positions of the spaces of one string to another string that has no spaces.
For example:
string1 = "This is a piece of text"
string2 = "ESTDTDLATPNPZQEPIE"
output = "ESTD TD L ATPNP ZQ EPIE"
Insert characters as appropriate into a placeholder list and concatenate it after using str.join.
it = iter(string2)
output = ''.join(
[next(it) if not c.isspace() else ' ' for c in string1]
)
print(output)
'ESTD TD L ATPNP ZQ EPIE'
This is efficient as it avoids repeated string concatenation.
You need to iterate over the indexes and characters in string1 using enumerate().
On each iteration, if the character is a space, add a space to the output string (note that this is inefficient as you are creating a new object as strings are immutable), otherwise add the character in string2 at that index to the output string.
So that code would look like:
# Copy string1's spaces into string2: walk string1 keeping si = number of
# spaces seen so far, so the matching character of string2 (which has no
# spaces) sits at index i - si.  Note: += on a string is the inefficient
# variant discussed in the surrounding text.
output = ''
si = 0
for i, c in enumerate(string1):
    if c == ' ':
        si += 1
        output += ' '
    else:
        output += string2[i - si]
However, it would be more efficient to use a very similar method, but with a generator and then str.join. This removes the slow concatenations to the output string:
def chars(s1, s2):
    """Yield s2's characters laid out in s1's shape: a space wherever s1
    has one, otherwise the next unused character of s2."""
    offset = 0
    for pos, ch in enumerate(s1):
        if ch == ' ':
            offset += 1
            yield ' '
        else:
            yield s2[pos - offset]
output = ''.join(chars(string1, string2))
You can try insert method :
string1 = "This is a piece of text"
string2 = "ESTDTDLATPNPZQEPIE"
# Work on a mutable copy of string2 and splice a space in at every index
# where string1 carries one; processing left to right keeps the insertion
# indices valid as the list grows.
string3 = list(string2)
for idx, ch in enumerate(string1):
    if ch == ' ':
        string3.insert(idx, ' ')
print("".join(string3))
output:
ESTD TD L ATPNP ZQ EPIE

How do I split a comma delimited string in Python except for the commas that are within quotes

I am trying to split a comma delimited string in python. The tricky part for me here is that some of the fields in the data themselves have a comma in them and they are enclosed within quotes (" or '). The resulting split string should also have the quotes around the fields removed. Also, some fields can be empty.
Example:
hey,hello,,"hello,world",'hey,world'
needs to be split into 5 parts like below
['hey', 'hello', '', 'hello,world', 'hey,world']
Any ideas/thoughts/suggestions/help with how to go about solving the above problem in Python would be much appreciated.
Thank You,
Vish
Sounds like you want the CSV module.
(Edit: The original answer had trouble with empty fields on the edges due to the way re.findall works, so I refactored it a bit and added tests.)
import re
def parse_fields(text):
    r"""Yield the comma-separated fields of *text*, honouring single- or
    double-quoted fields (quotes stripped from the result).

    >>> list(parse_fields('hey,hello,,"hello,world",\'hey,world\''))
    ['hey', 'hello', '', 'hello,world', 'hey,world']
    >>> list(parse_fields('hey,hello,,"hello,world",\'hey,world\','))
    ['hey', 'hello', '', 'hello,world', 'hey,world', '']
    >>> list(parse_fields(',hey,hello,,"hello,world",\'hey,world\','))
    ['', 'hey', 'hello', '', 'hello,world', 'hey,world', '']
    >>> list(parse_fields(''))
    ['']
    >>> list(parse_fields(','))
    ['', '']
    >>> list(parse_fields('testing,quotes not at "the" beginning \'of\' the,string'))
    ['testing', 'quotes not at "the" beginning \'of\' the', 'string']
    >>> list(parse_fields('testing,"unterminated quotes'))
    ['testing', '"unterminated quotes']
    """
    # Group 1: optional opening quote; group 2: the field (non-greedy);
    # backreference \1 closes the same quote; group 3: separator or EOL.
    field_pattern = re.compile(r"""(['"]?)(.*?)\1(,|$)""")
    pos = 0
    while True:
        match = field_pattern.search(text, pos)
        yield match.group(2)
        if not match.group(3):
            # No separator matched: we consumed the end of the string.
            return
        pos = match.end(0)
if __name__ == "__main__":
    import doctest
    doctest.testmod()
(['"]?) matches an optional single- or double-quote.
(.*?) matches the string itself. This is a non-greedy match, to match as much as necessary without eating the whole string. This is assigned to result, and it's what we actually yield as a result.
\1 is a backreference, to match the same single- or double-quote we matched earlier (if any).
(,|$) matches the comma separating each entry, or the end of the line. This is assigned to separator.
If separator is false (eg. empty), that means there's no separator, so we're at the end of the string--we're done. Otherwise, we update the new start position based on where the regex finished (m.end(0)), and continue the loop.
The csv module won't handle the scenario of " and ' being quotes at the same time. Absent a module that provides that kind of dialect, one has to get into the parsing business. To avoid reliance on a third party module, we can use the re module to do the lexical analysis, using the re.MatchObject.lastindex gimmick to associate a token type with the matched pattern.
The following code when run as a script passes all the tests shown, with Python 2.7 and 2.2.
import re
# lexical token symbols
# NOTE(review): Python 2 code (xrange); each symbol tags one lexeme class.
DQUOTED, SQUOTED, UNQUOTED, COMMA, NEWLINE = xrange(5)
_pattern_tuples = (
    (r'"[^"]*"', DQUOTED),
    (r"'[^']*'", SQUOTED),
    (r",", COMMA),
    (r"$", NEWLINE), # matches end of string OR \n just before end of string
    (r"[^,\n]+", UNQUOTED), # order in the above list is important
)
# One alternation regex: capturing group k corresponds to _pattern_tuples[k-1],
# so MatchObject.lastindex identifies which lexeme class matched.
_matcher = re.compile(
    '(' + ')|('.join([i[0] for i in _pattern_tuples]) + ')',
    ).match
_toktype = [None] + [i[1] for i in _pattern_tuples]
# need dummy at start because re.MatchObject.lastindex counts from 1
def csv_split(text):
    """Split a csv string into a list of fields.
    Fields may be quoted with " or ' or be unquoted.
    An unquoted string can contain both a " and a ', provided neither is at
    the start of the string.
    A trailing \n will be ignored if present.
    """
    # NOTE(review): Python 2 code (the error branch below uses a print
    # statement).  _matcher/_toktype are the module-level lexer tables.
    fields = []
    pos = 0
    # want_field is True when the next token must be a field (at the start
    # of the string or just after a comma), False when a separator is due.
    want_field = True
    while 1:
        m = _matcher(text, pos)
        if not m:
            raise ValueError("Problem at offset %d in %r" % (pos, text))
        # lastindex selects which regex alternative matched; map to a type.
        ttype = _toktype[m.lastindex]
        if want_field:
            if ttype in (DQUOTED, SQUOTED):
                # Strip the surrounding quote characters from the field.
                fields.append(m.group(0)[1:-1])
                want_field = False
            elif ttype == UNQUOTED:
                fields.append(m.group(0))
                want_field = False
            elif ttype == COMMA:
                # Two separators in a row: record an empty field.
                fields.append("")
            else:
                assert ttype == NEWLINE
                # End of input while expecting a field: empty last field.
                fields.append("")
                break
        else:
            if ttype == COMMA:
                want_field = True
            elif ttype == NEWLINE:
                break
            else:
                print "*** Error dump ***", ttype, repr(m.group(0)), fields
                raise ValueError("Missing comma at offset %d in %r" % (pos, text))
        pos = m.end(0)
    return fields
if __name__ == "__main__":
    # Python 2 self-test harness: each entry pairs an input line with its
    # expected field list; results are printed rather than asserted.
    tests = (
        ("""hey,hello,,"hello,world",'hey,world'\n""", ['hey', 'hello', '', 'hello,world', 'hey,world']),
        ("""\n""", ['']),
        ("""""", ['']),
        ("""a,b\n""", ['a', 'b']),
        ("""a,b""", ['a', 'b']),
        (""",,,\n""", ['', '', '', '']),
        ("""a,contains both " and ',c""", ['a', 'contains both " and \'', 'c']),
        ("""a,'"starts with "...',c""", ['a', '"starts with "...', 'c']),
    )
    for text, expected in tests:
        result = csv_split(text)
        print
        print repr(text)
        print repr(result)
        print repr(expected)
        print result == expected
I fabricated something like this. Very redundant I suppose, but it does the job for me. You have to adapt it a bit to your specifications:
def csv_splitter(line):
    """Split *line* on commas that lie outside double-quoted spans, then
    strip a leading comma and every double-quote character from each piece."""
    cut_points = [0]
    outside = True
    for idx, ch in enumerate(line):
        # A double quote toggles between inside/outside a quoted span.
        if ch == '"':
            outside = not outside
        # Only commas outside quotes mark a cut point.
        if ch == ',' and outside:
            cut_points.append(idx)
    cut_points.append(len(line) + 1)
    # Consecutive cut points delimit the raw pieces; clean each one up.
    return [re.sub("^,|\"", "", line[a:b])
            for a, b in zip(cut_points, cut_points[1:])]

collapsing whitespace in a string

I have a string that kind of looks like this:
"stuff . // : /// more-stuff .. .. ...$%$% stuff -> DD"
and I want to strip off all punctuation, make everything uppercase and collapse all whitespace so that it looks like this:
"STUFF MORE STUFF STUFF DD"
Is this possible with one regex or do I need to combine more than two? This is what I have so far:
def normalize(string):
    """Uppercase *string*, turn every non-word character into a space and
    collapse each run of two or more whitespace characters into one space."""
    import re
    string = string.upper()
    rex = re.compile(r'\W')        # any non-word character
    rex_s = re.compile(r'\s{2,}')  # two or more whitespace characters
    result = rex.sub(' ', string)  # this produces a string with tons of whitespace padding
    # Bug fix: the second pass must use rex_s (not rex again) and must
    # substitute a single space (not ''), otherwise word gaps vanish.
    result = rex_s.sub(' ', result)
    return result
The only thing that doesn't work is the whitespace collapsing. Any ideas?
Here's a single-step approach (but the uppercasing actually uses a string method -- much simpler!):
rex = re.compile(r'\W+')
result = rex.sub(' ', strarg).upper()
where strarg is the string argument (don't use names that shadow builtins or standard library modules, please).
s = "$$$aa1bb2 cc-dd ee_ff ggg."
re.sub(r'\W+', ' ', s).upper()
# ' AA1BB2 CC DD EE_FF GGG '
Is _ punctuation?
re.sub(r'[_\W]+', ' ', s).upper()
# ' AA1BB2 CC DD EE FF GGG '
Don't want the leading and trailing space?
re.sub(r'[_\W]+', ' ', s).strip().upper()
# 'AA1BB2 CC DD EE FF GGG'
result = rex.sub(' ', string) # this produces a string with tons of whitespace padding
result = rex.sub('', result) # this reduces all those spaces
Because you typo'd and forgot to use rex_s for the second call instead. Also, you need to substitute at least one space back in or you'll end up with any multiple-space gap becoming no gap at all, instead of a single-space gap.
result = rex.sub(' ', string) # this produces a string with tons of whitespace padding
result = rex_s.sub(' ', result) # this reduces all those spaces
Do you have to use regular expressions? Do you feel you must do it in one line?
>>> import string
>>> s = "stuff . // : /// more-stuff .. .. ...$%$% stuff -> DD"
>>> s2 = ''.join(c for c in s if c in string.letters + ' ')
>>> ' '.join(s2.split())
'stuff morestuff stuff DD'
Works in Python 3. This will retain the same whitespace character you collapsed, so if you have a tab and a space next to each other they won't collapse into a single character.
def collapse_whitespace_characters(raw_text):
    """Collapse each run of *identical* whitespace characters into one
    occurrence; mixed runs (e.g. tab then space) keep one of each."""
    if len(raw_text) <= 1:
        return raw_text
    kept = [raw_text[0]]
    prev = raw_text[0]
    for ch in raw_text[1:]:
        # Drop ch only when it repeats the previous char AND is whitespace.
        if ch != prev or not ch.isspace():
            kept.append(ch)
        prev = ch
    return ''.join(kept)
this one will collapse whitespace sets into the first whitespace character it sees
def collapse_whitespace(raw_text):
    """Collapse every whitespace run into its first whitespace character,
    regardless of which whitespace characters make up the run."""
    if len(raw_text) <= 1:
        return raw_text
    squeezed = [raw_text[0]]
    prev = raw_text[0]
    for ch in raw_text[1:]:
        # Drop ch only when both it and its predecessor are whitespace.
        if not (ch.isspace() and prev.isspace()):
            squeezed.append(ch)
        prev = ch
    return ''.join(squeezed)
>>> collapse_whitespace_characters('we like spaces and\t\t TABS AND WHATEVER\xa0\xa0IS')
'we like spaces and\t TABS\tAND WHATEVER\xa0IS'
>>> collapse_whitespace('we like spaces and\t\t TABS AND WHATEVER\xa0\xa0IS')
'we like spaces and\tTABS\tAND WHATEVER\xa0IS'
for punctuation
def collapse_punctuation(raw_text):
    """Collapse each run of identical non-alphanumeric characters into a
    single occurrence; alphanumeric characters are never collapsed."""
    if len(raw_text) <= 1:
        return raw_text
    kept = [raw_text[0]]
    prev = raw_text[0]
    for ch in raw_text[1:]:
        # Drop ch only when it is non-alphanumeric and repeats prev.
        if ch.isalnum() or ch != prev:
            kept.append(ch)
        prev = ch
    return ''.join(kept)
to actually answer the question
orig = 'stuff . // : /// more-stuff .. .. ...$%$% stuff -> DD'
collapse_whitespace(''.join([(c.upper() if c.isalnum() else ' ') for c in orig]))
as said, the regexp would be something like
re.sub('\W+', ' ', orig).upper()
One can use regular expression to substitute reoccurring white spaces.
White space is given by \s with \s+ meaning: at least one.
import re
rex = re.compile(r'\s+')
test = " x y z z"
res = rex.sub(' ', test)
print(f">{res}<")
> x y z z<
Note this also affects/includes carriage return, etc.

Categories