Run length encoding in Python - python

I'm trying to write a simple Python algorithm to solve this problem. Can you please help me figure out how to do this?
If any character is repeated more than 4 times, the entire set of
repeated characters should be replaced with a slash '/', followed by a
2-digit number which is the length of this run of repeated characters,
and the character. For example, "aaaaa" would be encoded as "/05a".
Runs of 4 or less characters should not be replaced since performing
the encoding would not decrease the length of the string.

I see many great solutions here, but none that feels very Pythonic to my eyes, so I'm contributing an implementation I wrote myself today for this problem.
def run_length_encode(data: str) -> Iterator[Tuple[str, int]]:
    """Lazily produce one ``(character, run_length)`` pair per run in ``data``.

    Consecutive equal characters are grouped with ``itertools.groupby``; each
    group is counted by iterating it, so no run is materialised as a list.
    """
    runs = groupby(data)
    return ((symbol, sum(1 for _ in members)) for symbol, members in runs)
This will return a generator of Tuples with the character and number of instances, but can easily be modified to return a string as well. A benefit of doing it this way is that it's all lazy evaluated and won't consume more memory or cpu than needed if you don't need to exhaust the entire search space.
If you still want string encoding the code can quite easily be modified for that use case like this:
def run_length_encode(data: str) -> str:
    """Return ``data`` run-length encoded as ``<char><count>`` pairs.

    Every run is encoded, including runs of length one, so ``"aab"`` becomes
    ``"a2b1"``.
    """
    encoded_runs = []
    for symbol, members in groupby(data):
        run_length = sum(1 for _ in members)
        encoded_runs.append(f"{symbol}{run_length}")
    return "".join(encoded_runs)
This is a more generic run length encoding for all lengths, and not just for those of over 4 characters. But this could also quite easily be adapted with a conditional for the string if wanted.

Rosetta Code has a lot of implementations, that should easily be adaptable to your usecase.
Here is Python code with regular expressions:
from re import sub
def encode(text):
    '''
    Run-length encode ``text`` as ``<count><char>`` pairs.

    Doctest:
    >>> encode('WWWWWWWWWWWWBWWWWWWWWWWWWBBBWWWWWWWWWWWWWWWWWWWWWWWWBWWWWWWWWWWWWWW')
    '12W1B12W3B24W1B14W'
    '''
    def _count_then_char(match):
        # group(0) is the whole run, group(1) the character being repeated.
        return str(len(match.group(0))) + match.group(1)

    return sub(r'(.)\1*', _count_then_char, text)
def decode(text):
    '''
    Expand a ``<count><char>`` run-length encoded string.

    Doctest:
    >>> decode('12W1B12W3B24W1B14W')
    'WWWWWWWWWWWWBWWWWWWWWWWWWBBBWWWWWWWWWWWWWWWWWWWWWWWWBWWWWWWWWWWWWWW'
    '''
    def _expand(match):
        # The pattern captures the decimal count first, then one non-digit.
        count, char = match.groups()
        return char * int(count)

    return sub(r'(\d+)(\D)', _expand, text)
textin = "WWWWWWWWWWWWBWWWWWWWWWWWWBBBWWWWWWWWWWWWWWWWWWWWWWWWBWWWWWWWWWWWWWW"
assert decode(encode(textin)) == textin

Aside from setting a=i after encoding a sequence and setting a width for your int when it is printed into the string, you could also do the following, which takes advantage of Python's groupby. It's also a good idea to use format when constructing strings.
from itertools import groupby
def runLengthEncode (plainText):
    """Encode runs longer than four characters as ``/NNc``.

    Runs of four or fewer characters are copied through unchanged. The count
    is zero-padded to at least two digits.
    """
    pieces = []
    for char, members in groupby(plainText):
        length = sum(1 for _ in members)
        if length <= 4:
            # Too short to be worth encoding: emit the run verbatim.
            pieces.append(char * length)
            continue
        pieces.append("/{:02}{}".format(length, char))
    return "".join(pieces)

Just observe the behaviour:
>>> runLengthEncode("abcd")
'abc'
Last character is ignored. You have to append what you've collected.
>>> runLengthEncode("abbbbbcd")
'a/5b/5b'
Oops, problem after encoding. You should set a=i even if you found a long enough sequence.

I know this is not the most efficient solution, but we haven't studied functions like groupby() yet so here's what I did:
def _encode_run(char, length):
    """Return the output for a run of ``length`` copies of ``char``."""
    if length > 4:
        # Slash, count zero-padded to two digits, then the character.
        return "/{:02d}{}".format(length, char)
    return char * length


def runLengthEncode (plainText):
    """Encode runs longer than four characters as ``/NNc``.

    Runs of four or fewer characters are left as they are, since encoding
    them would not shorten the string.

    Args:
        plainText: the string to encode.

    Returns:
        The encoded string; ``""`` for empty input.
    """
    pieces = []
    current = ''   # character of the run being collected
    length = 0     # number of times it has been seen so far
    for char in plainText:
        if char == current:
            length += 1
            continue
        # A different character ends the run: flush it and start a new one.
        pieces.append(_encode_run(current, length))
        current, length = char, 1
    # Flush the final run (a no-op for empty input, where length is 0).
    pieces.append(_encode_run(current, length))
    return "".join(pieces)

# Print every run of identical characters as "<char> <count>", one per line.
from itertools import groupby

text = input("Enter string: ")
for char, run in groupby(text):
    print(char, sum(1 for _ in run))
# A trailing blank line terminates the listing.
print()
this is much easier and works everytime

def RLE_comp_encode(text):
    """Run-length encode ``text`` as ``<count><char>`` pairs.

    Every run is emitted, including a final run of length one, so ``"aab"``
    encodes to ``"2a1b"``.

    Args:
        text: the string to encode.

    Returns:
        The encoded string; ``""`` for empty input.
    """
    from itertools import groupby

    return "".join(
        str(sum(1 for _ in run)) + char for char, run in groupby(text)
    )
This worked for me,

You can use the groupby() function combined with a list/generator comprehension:
from itertools import groupby, imap
# Runs of up to four characters are copied through unchanged (all of their
# characters, not just one); longer runs become "/NNc". A generator expression
# is used for the (char, length) pairs because `imap` exists only in Python 2.
''.join(
    char * reps if reps <= 4 else "/%02d%s" % (reps, char)
    for char, reps in ((key, len(list(run))) for key, run in groupby(s))
)

An easy solution to run-length encoding which I can think of:
For encoding a string into the form "a4b5c6d7...":
def encode(s):
    """Run-length encode ``s`` as ``<char><count>`` pairs, e.g. ``"a3b2"``.

    Counts are taken per consecutive run, not per distinct character, so a
    character that reappears later starts a new pair (``"aabaa"`` becomes
    ``"a2b1a2"``) and the original order can be recovered.

    Args:
        s: the string to encode.

    Returns:
        The encoded string; ``""`` for empty input.
    """
    from itertools import groupby

    return "".join(
        char + str(sum(1 for _ in run)) for char, run in groupby(s)
    )
For decoding back into a string like "aaaaaabbbdddddccccc....":
def decode(s):
    """Expand ``<char><count>`` pairs back into the repeated characters.

    Characters are read from the even positions and counts from the odd
    positions, so every count must be a single digit.
    """
    chars = s[::2]
    counts = s[1::2]
    return "".join(char * int(count) for char, count in zip(chars, counts))
Fairly easy to read and simple.

# Collect (character, run_length) tuples for the text typed by the user.
text = input("Please enter the string to encode")
encoded = []
index, amount = 0, 1
while index < len(text):
    # The run continues only if there is a next character and it matches.
    run_continues = index + 1 < len(text) and text[index] == text[index + 1]
    if run_continues:
        amount += 1
    else:
        encoded.append((text[index], amount))
        amount = 1
    index += 1
print(encoded)

Related

Convert list of similar ints to tuple of int and occurances [duplicate]

I'm trying to write a simple Python algorithm to solve this problem. Can you please help me figure out how to do this?
If any character is repeated more than 4 times, the entire set of
repeated characters should be replaced with a slash '/', followed by a
2-digit number which is the length of this run of repeated characters,
and the character. For example, "aaaaa" would be encoded as "/05a".
Runs of 4 or less characters should not be replaced since performing
the encoding would not decrease the length of the string.
I see many great solutions here but none that feels very pythonic to my eyes. So I'm contributing with a implementation I wrote myself today for this problem.
def run_length_encode(data: str) -> Iterator[Tuple[str, int]]:
"""Returns run length encoded Tuples for string"""
# A memory efficient (lazy) and pythonic solution using generators
return ((x, sum(1 for _ in y)) for x, y in groupby(data))
This will return a generator of Tuples with the character and number of instances, but can easily be modified to return a string as well. A benefit of doing it this way is that it's all lazy evaluated and won't consume more memory or cpu than needed if you don't need to exhaust the entire search space.
If you still want string encoding the code can quite easily be modified for that use case like this:
def run_length_encode(data: str) -> str:
"""Returns run length encoded string for data"""
# A memory efficient (lazy) and pythonic solution using generators
return "".join(f"{x}{sum(1 for _ in y)}" for x, y in groupby(data))
This is a more generic run length encoding for all lengths, and not just for those of over 4 characters. But this could also quite easily be adapted with a conditional for the string if wanted.
Rosetta Code has a lot of implementations, that should easily be adaptable to your usecase.
Here is Python code with regular expressions:
from re import sub
def encode(text):
'''
Doctest:
>>> encode('WWWWWWWWWWWWBWWWWWWWWWWWWBBBWWWWWWWWWWWWWWWWWWWWWWWWBWWWWWWWWWWWWWW')
'12W1B12W3B24W1B14W'
'''
return sub(r'(.)\1*', lambda m: str(len(m.group(0))) + m.group(1),
text)
def decode(text):
'''
Doctest:
>>> decode('12W1B12W3B24W1B14W')
'WWWWWWWWWWWWBWWWWWWWWWWWWBBBWWWWWWWWWWWWWWWWWWWWWWWWBWWWWWWWWWWWWWW'
'''
return sub(r'(\d+)(\D)', lambda m: m.group(2) * int(m.group(1)),
text)
textin = "WWWWWWWWWWWWBWWWWWWWWWWWWBBBWWWWWWWWWWWWWWWWWWWWWWWWBWWWWWWWWWWWWWW"
assert decode(encode(textin)) == textin
Aside for setting a=i after encoding a sequence and setting a width for your int when printed into the string. You could also do the following which takes advantage of pythons groupby. Its also a good idea to use format when constructing strings.
from itertools import groupby
def runLengthEncode (plainText):
res = []
for k,i in groupby(plainText):
run = list(i)
if(len(run) > 4):
res.append("/{:02}{}".format(len(run), k))
else:
res.extend(run)
return "".join(res)
Just observe the behaviour:
>>> runLengthEncode("abcd")
'abc'
Last character is ignored. You have to append what you've collected.
>>> runLengthEncode("abbbbbcd")
'a/5b/5b'
Oops, problem after encoding. You should set a=i even if you found a long enough sequence.
I know this is not the most efficient solution, but we haven't studied functions like groupby() yet so here's what I did:
def runLengthEncode (plainText):
res=''
a=''
count = 0
for i in plainText:
count+=1
if a.count(i)>0:
a+=i
else:
if len(a)>4:
if len(a)<10:
res+="/0"+str(len(a))+a[0][:1]
else:
res+="/" + str(len(a)) + a[0][:1]
a=i
else:
res+=a
a=i
if count == len(plainText):
if len(a)>4:
if len(a)<10:
res+="/0"+str(len(a))+a[0][:1]
else:
res+="/" + str(len(a)) + a[0][:1]
else:
res+=a
return(res)
Split=(list(input("Enter string: ")))
Split.append("")
a = 0
for i in range(len(Split)):
try:
if (Split[i] in Split) >0:
a = a + 1
if Split[i] != Split[i+1]:
print(Split[i],a)
a = 0
except IndexError:
print()
this is much easier and works everytime
def RLE_comp_encode(text):
    """Run-length encode ``text`` as ``<count><char>`` pairs.

    Every run is emitted, including a final run of length one, so ``"aab"``
    encodes to ``"2a1b"``.

    Args:
        text: the string to encode.

    Returns:
        The encoded string; ``""`` for empty input.
    """
    from itertools import groupby

    return "".join(
        str(sum(1 for _ in run)) + char for char, run in groupby(text)
    )
This worked for me,
You can use the groupby() function combined with a list/generator comprehension:
from itertools import groupby, imap
# Runs of up to four characters are copied through unchanged (all of their
# characters, not just one); longer runs become "/NNc". A generator expression
# is used for the (char, length) pairs because `imap` exists only in Python 2.
''.join(
    char * reps if reps <= 4 else "/%02d%s" % (reps, char)
    for char, reps in ((key, len(list(run))) for key, run in groupby(s))
)
An easy solution to run-length encoding which I can think of:
For encoding a string like "a4b5c6d7...":
def encode(s):
    """Run-length encode ``s`` as ``<char><count>`` pairs, e.g. ``"a3b2"``.

    Counts are taken per consecutive run, not per distinct character, so a
    character that reappears later starts a new pair (``"aabaa"`` becomes
    ``"a2b1a2"``) and the original order can be recovered.

    Args:
        s: the string to encode.

    Returns:
        The encoded string; ``""`` for empty input.
    """
    from itertools import groupby

    return "".join(
        char + str(sum(1 for _ in run)) for char, run in groupby(s)
    )
For decoding a string like "aaaaaabbbdddddccccc....":
def decode(s):
return "".join((map(lambda tup: tup[0] * int(tup[1]), zip(s[0:len(s):2], s[1:len(s):2]))))
Fairly easy to read and simple.
text=input("Please enter the string to encode")
encoded=[]
index=0
amount=1
while index<=(len(text)-1):
if index==(len(text)-1) or text[index]!=text[(index+1)]:
encoded.append((text[index],amount))
amount=1
else:
amount=amount+1
index=index+1
print(encoded)

Regex replace in Spyder with case conversion [duplicate]

This question's answers are a community effort. Edit existing answers to improve this post. It is not currently accepting new answers or interactions.
Example:
>>> convert('CamelCase')
'camel_case'
Camel case to snake case
import re
name = 'CamelCaseName'
name = re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower()
print(name) # camel_case_name
If you do this many times and the above is slow, compile the regex beforehand:
pattern = re.compile(r'(?<!^)(?=[A-Z])')
name = pattern.sub('_', name).lower()
To handle more advanced cases specially (this is not reversible anymore):
def camel_to_snake(name):
    """Convert a CamelCase identifier to snake_case with two regex passes."""
    # Pass 1: put an underscore before a capitalised word that follows any
    # character ("HTTPResponse" -> "HTTP_Response").
    word_split = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    # Pass 2: separate a lowercase letter or digit from a following capital.
    fully_split = re.sub('([a-z0-9])([A-Z])', r'\1_\2', word_split)
    return fully_split.lower()
print(camel_to_snake('camel2_camel2_case')) # camel2_camel2_case
print(camel_to_snake('getHTTPResponseCode')) # get_http_response_code
print(camel_to_snake('HTTPResponseCodeXYZ')) # http_response_code_xyz
To add also cases with two underscores or more:
def to_snake_case(name):
    """Convert CamelCase to snake_case, collapsing doubled underscores.

    Works like the two-pass conversion but adds a middle pass so that an
    underscore already present before a capital is not duplicated.
    """
    substitutions = (
        ('(.)([A-Z][a-z]+)', r'\1_\2'),    # split before a capitalised word
        ('__([A-Z])', r'_\1'),             # undo a doubled underscore
        ('([a-z0-9])([A-Z])', r'\1_\2'),   # split lower/digit from capital
    )
    for pattern, replacement in substitutions:
        name = re.sub(pattern, replacement, name)
    return name.lower()
Snake case to pascal case
name = 'snake_case_name'
name = ''.join(word.title() for word in name.split('_'))
print(name) # SnakeCaseName
There's an inflection library in the package index that can handle these things for you. In this case, you'd be looking for inflection.underscore():
>>> inflection.underscore('CamelCase')
'camel_case'
I don't know why these are all so complicating.
for most cases, the simple expression ([A-Z]+) will do the trick
>>> re.sub('([A-Z]+)', r'_\1','CamelCase').lower()
'_camel_case'
>>> re.sub('([A-Z]+)', r'_\1','camelCase').lower()
'camel_case'
>>> re.sub('([A-Z]+)', r'_\1','camel2Case2').lower()
'camel2_case2'
>>> re.sub('([A-Z]+)', r'_\1','camelCamelCase').lower()
'camel_camel_case'
>>> re.sub('([A-Z]+)', r'_\1','getHTTPResponseCode').lower()
'get_httpresponse_code'
To ignore the first character, simply add a negative lookahead (?!^):
>>> re.sub('(?!^)([A-Z]+)', r'_\1','CamelCase').lower()
'camel_case'
>>> re.sub('(?!^)([A-Z]+)', r'_\1','CamelCamelCase').lower()
'camel_camel_case'
>>> re.sub('(?!^)([A-Z]+)', r'_\1','Camel2Camel2Case').lower()
'camel2_camel2_case'
>>> re.sub('(?!^)([A-Z]+)', r'_\1','getHTTPResponseCode').lower()
'get_httpresponse_code'
If you want to separate ALLCaps to all_caps and expect numbers in your string you still don't need to do two separate runs just use | This expression ((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z])) can handle just about every scenario in the book
>>> a = re.compile('((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))')
>>> a.sub(r'_\1', 'getHTTPResponseCode').lower()
'get_http_response_code'
>>> a.sub(r'_\1', 'get2HTTPResponseCode').lower()
'get2_http_response_code'
>>> a.sub(r'_\1', 'get2HTTPResponse123Code').lower()
'get2_http_response123_code'
>>> a.sub(r'_\1', 'HTTPResponseCode').lower()
'http_response_code'
>>> a.sub(r'_\1', 'HTTPResponseCodeXYZ').lower()
'http_response_code_xyz'
It all depends on what you want so use the solution that best suits your needs as it should not be overly complicated.
nJoy!
Avoiding libraries and regular expressions:
def camel_to_snake(s):
    """Convert CamelCase to snake_case without regular expressions.

    Each uppercase character is replaced by an underscore plus its lowercase
    form; all leading underscores are then stripped from the result.
    """
    pieces = []
    for char in s:
        if char.isupper():
            pieces.append('_' + char.lower())
        else:
            pieces.append(char)
    return ''.join(pieces).lstrip('_')
>>> camel_to_snake('ThisIsMyString')
'this_is_my_string'
stringcase is my go-to library for this; e.g.:
>>> from stringcase import pascalcase, snakecase
>>> snakecase('FooBarBaz')
'foo_bar_baz'
>>> pascalcase('foo_bar_baz')
'FooBarBaz'
I think this solution is more straightforward than previous answers:
import re
def convert (camel_input):
    """Split ``camel_input`` into words and join them lowercase with ``_``.

    The pattern recognises three kinds of word: a lowercase run with an
    optional leading capital, a run of two or more capitals (an acronym),
    and a run of digits. Anything else is dropped.
    """
    pattern = r'[A-Z]?[a-z]+|[A-Z]{2,}(?=[A-Z][a-z]|\d|\W|$)|\d+'
    lowered_words = [word.lower() for word in re.findall(pattern, camel_input)]
    return '_'.join(lowered_words)
# Let's test it
test_strings = [
'CamelCase',
'camelCamelCase',
'Camel2Camel2Case',
'getHTTPResponseCode',
'get200HTTPResponseCode',
'getHTTP200ResponseCode',
'HTTPResponseCode',
'ResponseHTTP',
'ResponseHTTP2',
'Fun?!awesome',
'Fun?!Awesome',
'10CoolDudes',
'20coolDudes'
]
for test_string in test_strings:
print(convert(test_string))
Which outputs:
camel_case
camel_camel_case
camel_2_camel_2_case
get_http_response_code
get_200_http_response_code
get_http_200_response_code
http_response_code
response_http
response_http_2
fun_awesome
fun_awesome
10_cool_dudes
20_cool_dudes
The regular expression matches three patterns:
[A-Z]?[a-z]+: Consecutive lower-case letters that optionally start with an upper-case letter.
[A-Z]{2,}(?=[A-Z][a-z]|\d|\W|$): Two or more consecutive upper-case letters. It uses a lookahead to exclude the last upper-case letter if it is followed by a lower-case letter.
\d+: Consecutive numbers.
By using re.findall we get a list of individual "words" that can be converted to lower-case and joined with underscores.
Personally I am not sure how anything using regular expressions in python can be described as elegant. Most answers here are just doing "code golf" type RE tricks. Elegant coding is supposed to be easily understood.
def to_snake_case(not_snake_case):
    """Convert CamelCase / space- or underscore-separated text to snake_case.

    Each uppercase letter becomes ``_`` plus its lowercase form. A space or
    underscore becomes a single ``_`` unless the next character would itself
    produce an underscore (another separator or an uppercase letter), or it
    is the last character; in those cases it is dropped. One leading
    underscore is removed from the result.

    >>> to_snake_case("RegularExpressionsAre Funky")
    'regular_expressions_are_funky'

    Args:
        not_snake_case: the text to convert.

    Returns:
        The converted string; ``""`` for empty input.
    """
    separators = (" ", "_")
    pieces = []
    last_index = len(not_snake_case) - 1
    for i, item in enumerate(not_snake_case):
        if item in separators:
            if i == last_index:
                # Trailing separator: nothing follows it, so drop it.
                continue
            following = not_snake_case[i + 1]
            if following in separators or following.isupper():
                # The next character emits the underscore; avoid doubling.
                continue
            pieces.append("_")
        elif item.isupper():
            pieces.append("_" + item.lower())
        else:
            pieces.append(item)
    final = "".join(pieces)
    # startswith is safe on an empty result, unlike indexing final[0].
    if final.startswith("_"):
        final = final[1:]
    return final
>>> to_snake_case("RegularExpressionsAreFunky")
'regular_expressions_are_funky'
>>> to_snake_case("RegularExpressionsAre Funky")
'regular_expressions_are_funky'
>>> to_snake_case("RegularExpressionsAre_Funky")
'regular_expressions_are_funky'
''.join('_'+c.lower() if c.isupper() else c for c in "DeathToCamelCase").strip('_')
re.sub("(.)([A-Z])", r'\1_\2', 'DeathToCamelCase').lower()
Here's my solution:
def un_camel(text):
    """ Converts a CamelCase name into an under_score name.

        An uppercase letter is prefixed with an underscore when it is not the
        first character and either follows a lowercase letter or is followed
        by one (so the last capital of an acronym starts a new word).

        >>> un_camel('CamelCase')
        'camel_case'
        >>> un_camel('getHTTPResponseCode')
        'get_http_response_code'
        >>> un_camel('aBc')
        'a_bc'
    """
    result = []
    for pos, char in enumerate(text):
        if not char.isupper():
            result.append(char)
            continue
        # `pos > 0` (not `pos - 1 > 0`) so a capital at index 1 is handled.
        after_lower = pos > 0 and text[pos - 1].islower()
        before_lower = (
            pos > 0 and pos + 1 < len(text) and text[pos + 1].islower()
        )
        if after_lower or before_lower:
            result.append("_%s" % char.lower())
        else:
            result.append(char.lower())
    return "".join(result)
It supports those corner cases discussed in the comments. For instance, it'll convert getHTTPResponseCode to get_http_response_code like it should.
I don't get idea why using both .sub() calls? :) I'm not regex guru, but I simplified function to this one, which is suitable for my certain needs, I just needed a solution to convert camelCasedVars from POST request to vars_with_underscore:
def myFunc(...):
return re.sub('(.)([A-Z]{1})', r'\1_\2', "iTriedToWriteNicely").lower()
It does not work with such names like getHTTPResponse, cause I heard it is bad naming convention (should be like getHttpResponse, it's obviously, that it's much easier memorize this form).
For the fun of it:
>>> def un_camel(input):
... output = [input[0].lower()]
... for c in input[1:]:
... if c in ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
... output.append('_')
... output.append(c.lower())
... else:
... output.append(c)
... return str.join('', output)
...
>>> un_camel("camel_case")
'camel_case'
>>> un_camel("CamelCase")
'camel_case'
Or, more for the fun of it:
>>> un_camel = lambda i: i[0].lower() + str.join('', ("_" + c.lower() if c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ" else c for c in i[1:]))
>>> un_camel("camel_case")
'camel_case'
>>> un_camel("CamelCase")
'camel_case'
Using regexes may be the shortest, but this solution is way more readable:
def to_snake_case(s):
    """Convert CamelCase to snake_case by prefixing capitals with ``_``.

    Every uppercase character becomes an underscore plus its lowercase form;
    a single leading underscore, if present, is then removed.
    """
    converted = []
    for char in s:
        converted.append("_" + char.lower() if char.isupper() else char)
    snake = "".join(converted)
    if snake.startswith("_"):
        return snake[1:]
    return snake
This is not a elegant method, is a very 'low level' implementation of a simple state machine (bitfield state machine), possibly the most anti pythonic mode to resolve this, however re module also implements a too complex state machine to resolve this simple task, so i think this is a good solution.
def splitSymbol(s):
si, ci, state = 0, 0, 0 # start_index, current_index
'''
state bits:
0: no yields
1: lower yields
2: lower yields - 1
4: upper yields
8: digit yields
16: other yields
32 : upper sequence mark
'''
for c in s:
if c.islower():
if state & 1:
yield s[si:ci]
si = ci
elif state & 2:
yield s[si:ci - 1]
si = ci - 1
state = 4 | 8 | 16
ci += 1
elif c.isupper():
if state & 4:
yield s[si:ci]
si = ci
if state & 32:
state = 2 | 8 | 16 | 32
else:
state = 8 | 16 | 32
ci += 1
elif c.isdigit():
if state & 8:
yield s[si:ci]
si = ci
state = 1 | 4 | 16
ci += 1
else:
if state & 16:
yield s[si:ci]
state = 0
ci += 1 # eat ci
si = ci
print(' : ', c, bin(state))
if state:
yield s[si:ci]
def camelcaseToUnderscore(s):
return '_'.join(splitSymbol(s))
splitSymbol can parse all case types: UpperSEQUENCEInterleaved, under_score, BIG_SYMBOLS and camelCasedMethods
I hope it is useful
Take a look at the excellent Schematics lib
https://github.com/schematics/schematics
It allows you to created typed data structures that can serialize/deserialize from python to Javascript flavour, eg:
class MapPrice(Model):
price_before_vat = DecimalType(serialized_name='priceBeforeVat')
vat_rate = DecimalType(serialized_name='vatRate')
vat = DecimalType()
total_price = DecimalType(serialized_name='totalPrice')
So many complicated methods...
Just find all "Titled" group and join its lower cased variant with underscore.
>>> import re
>>> def camel_to_snake(string):
... groups = re.findall('([A-z0-9][a-z]*)', string)
... return '_'.join([i.lower() for i in groups])
...
>>> camel_to_snake('ABCPingPongByTheWay2KWhereIsOurBorderlands3???')
'a_b_c_ping_pong_by_the_way_2_k_where_is_our_borderlands_3'
If you don't want make numbers like first character of group or separate group - you can use ([A-z][a-z0-9]*) mask.
A horrendous example using regular expressions (you could easily clean this up :) ):
def f(s):
return s.group(1).lower() + "_" + s.group(2).lower()
p = re.compile("([A-Z]+[a-z]+)([A-Z]?)")
print p.sub(f, "CamelCase")
print p.sub(f, "getHTTPResponseCode")
Works for getHTTPResponseCode though!
Alternatively, using lambda:
p = re.compile("([A-Z]+[a-z]+)([A-Z]?)")
print p.sub(lambda x: x.group(1).lower() + "_" + x.group(2).lower(), "CamelCase")
print p.sub(lambda x: x.group(1).lower() + "_" + x.group(2).lower(), "getHTTPResponseCode")
EDIT: It should also be pretty easy to see that there's room for improvement for cases like "Test", because the underscore is unconditionally inserted.
Lightely adapted from https://stackoverflow.com/users/267781/matth
who use generators.
def uncamelize(s):
    """Split ``s`` before every uppercase letter and join with underscores.

    The result is lowercased. Each capital starts its own word, so a run of
    capitals is split letter by letter.
    """
    words = []
    current = ''
    for char in s:
        if char.isupper() and current:
            # A capital closes the word collected so far and opens a new one.
            words.append(current)
            current = char
        else:
            current += char
    words.append(current)
    return '_'.join(words).lower()
This simple method should do the job:
import re
def convert(name):
    """Convert CamelCase to snake_case with a single regex substitution.

    Each match is an optional run of capitals followed by a capitalised
    word; an underscore is written after the run (when present) and after
    the word. Trailing underscores are stripped and the result lowercased.
    """
    def _with_separators(match):
        acronym, word = match.group(1), match.group(2)
        prefix = acronym + '_' if acronym else ''
        return prefix + word + '_'

    separated = re.sub(r'([A-Z]*)([A-Z][a-z]+)', _with_separators, name)
    return separated.rstrip('_').lower()
We look for capital letters that are preceded by any number of (or zero) capital letters, and followed by any number of lowercase characters.
An underscore is placed just before the occurrence of the last capital letter found in the group, and one can be placed before that capital letter in case it is preceded by other capital letters.
If there are trailing underscores, remove those.
Finally, the whole result string is changed to lower case.
(taken from here, see working example online)
Here's something I did to change the headers on a tab-delimited file. I'm omitting the part where I only edited the first line of the file. You could adapt it to Python pretty easily with the re library. This also includes separating out numbers (but keeps the digits together). I did it in two steps because that was easier than telling it not to put an underscore at the start of a line or tab.
Step One...find uppercase letters or integers preceded by lowercase letters, and precede them with an underscore:
Search:
([a-z]+)([A-Z]|[0-9]+)
Replacement:
\1_\l\2/
Step Two...take the above and run it again to convert all caps to lowercase:
Search:
([A-Z])
Replacement (that's backslash, lowercase L, backslash, one):
\l\1
I was looking for a solution to the same problem, except that I needed a chain; e.g.
"CamelCamelCamelCase" -> "Camel-camel-camel-case"
Starting from the nice two-word solutions here, I came up with the following:
"-".join(x.group(1).lower() if x.group(2) is None else x.group(1) \
for x in re.finditer("((^.[^A-Z]+)|([A-Z][^A-Z]+))", "stringToSplit"))
Most of the complicated logic is to avoid lowercasing the first word. Here's a simpler version if you don't mind altering the first word:
"-".join(x.group(1).lower() for x in re.finditer("(^[^A-Z]+|[A-Z][^A-Z]+)", "stringToSplit"))
Of course, you can pre-compile the regular expressions or join with underscore instead of hyphen, as discussed in the other solutions.
Concise without regular expressions, but HTTPResponseCode=> httpresponse_code:
def from_camel(name):
    """
    ThisIsCamelCase ==> this_is_camel_case

    Existing underscores are removed first. An underscore is then inserted
    after every non-uppercase character that is directly followed by an
    uppercase one, and the result is lowercased. Runs of capitals are not
    split, so HTTPResponseCode ==> httpresponse_code.
    """
    name = name.replace("_", "")
    # Indices of characters after which a new word starts. Pairing the whole
    # string with itself shifted by one also covers the boundary between the
    # first and second characters.
    breaks = {
        index
        for index, (current, following) in enumerate(zip(name, name[1:]))
        if not current.isupper() and following.isupper()
    }
    return "".join(
        ch + "_" if index in breaks else ch
        for index, ch in enumerate(name.lower())
    )
Without any library :
def camelify(out):
    """Convert CamelCase (including acronyms) to snake_case.

    An underscore goes before a capital that is followed by a lowercase
    letter, and after a lowercase letter that is followed by a capital.
    Leading underscores are stripped and doubled underscores collapsed.
    """
    pieces = []
    last = len(out) - 1
    for i, x in enumerate(out):
        lowered = x.lower()
        if i < last and x.isupper() and out[i + 1].islower():
            pieces.append("_" + lowered)
        elif i < last and x.islower() and out[i + 1].isupper():
            pieces.append(lowered + "_")
        else:
            pieces.append(lowered)
    return "".join(pieces).lstrip('_').replace('__', '_')
A bit heavy, but
CamelCamelCamelCase -> camel_camel_camel_case
HTTPRequest -> http_request
GetHTTPRequest -> get_http_request
getHTTPRequest -> get_http_request
Very nice RegEx proposed on this site:
(?<!^)(?=[A-Z])
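Python's re.split accepts this pattern as well (splitting on zero-width matches requires Python 3.7 or later); for example, re.split(r'(?<!^)(?=[A-Z])', 'loremIpsum') gives ['lorem', 'Ipsum'].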
In Java:
String s = "loremIpsum";
words = s.split("(?<!^)(?=[A-Z])");
Just in case someone needs to transform a complete source file, here is a script that will do it.
# Copy and paste your camel case code in the string below
camelCaseCode ="""
cv2.Matx33d ComputeZoomMatrix(const cv2.Point2d & zoomCenter, double zoomRatio)
{
auto mat = cv2.Matx33d::eye();
mat(0, 0) = zoomRatio;
mat(1, 1) = zoomRatio;
mat(0, 2) = zoomCenter.x * (1. - zoomRatio);
mat(1, 2) = zoomCenter.y * (1. - zoomRatio);
return mat;
}
"""
import re
def snake_case(name):
s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
def lines(str):
return str.split("\n")
def unlines(lst):
return "\n".join(lst)
def words(str):
return str.split(" ")
def unwords(lst):
return " ".join(lst)
def map_partial(function):
return lambda values : [ function(v) for v in values]
import functools
def compose(*functions):
return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)
snake_case_code = compose(
unlines ,
map_partial(unwords),
map_partial(map_partial(snake_case)),
map_partial(words),
lines
)
print(snake_case_code(camelCaseCode))
Wow I just stole this from django snippets. ref http://djangosnippets.org/snippets/585/
Pretty elegant
camelcase_to_underscore = lambda str: re.sub(r'(?<=[a-z])[A-Z]|[A-Z](?=[^A-Z])', r'_\g<0>', str).lower().strip('_')
Example:
camelcase_to_underscore('ThisUser')
Returns:
'this_user'
REGEX DEMO
def convert(name):
    """Convert CamelCase to snake_case by prefixing capitals with ``_``.

    The first character is never prefixed; every later uppercase character
    gets an underscore in front of it, and the result is lowercased.

    Args:
        name: the CamelCase string.

    Returns:
        The snake_case string; empty input is returned unchanged.
    """
    if not name:
        return name
    head, tail = name[0], name[1:]
    body = "".join("_" + char if char.isupper() else char for char in tail)
    return (head + body).lower()
And if we need to cover a case with already-un-cameled input:
def convert(name):
    """Convert CamelCase to snake_case, tolerating existing underscores.

    Every uppercase character after the first gets an underscore in front of
    it unless the preceding character already is one, so input that is
    partly or fully snake_case is not given doubled underscores.

    Args:
        name: the string to convert.

    Returns:
        The lowercased snake_case string; empty input is returned unchanged.
    """
    if not name:
        return name
    pieces = [name[0]]
    for previous, char in zip(name, name[1:]):
        if char.isupper() and previous != '_':
            pieces.append('_')
        pieces.append(char)
    return "".join(pieces).lower()
Not in the standard library, but I found this module that appears to contain the functionality you need.
If you use Google's (nearly) deterministic Camel case algorithm, then one does not need to handle things like HTMLDocument since it should be HtmlDocument, then this regex based approach is simple. It replace all capitals or numbers with an underscore. Note does not handle multi digit numbers.
import re
def to_snake_case(camel_str):
    """Prefix every capital letter and digit with ``_`` and lowercase.

    Leading underscores are stripped afterwards. Each digit is prefixed
    individually, so multi-digit numbers are split apart.
    """
    prefixed = re.sub('([A-Z0-9])', r'_\1', camel_str)
    return prefixed.lower().lstrip('_')
def convert(camel_str):
    """Convert CamelCase to snake_case.

    Every character that is not a lowercase letter is preceded by an
    underscore, except at the very start of the string, so ``'CamelCase'``
    becomes ``'camel_case'`` with no leading underscore.

    Args:
        camel_str: the CamelCase string.

    Returns:
        The lowercased, underscore-separated string.
    """
    pieces = []
    for index, letter in enumerate(camel_str):
        if index > 0 and not letter.islower():
            pieces.append('_')
        pieces.append(letter)
    return "".join(pieces).lower()
Use: str.capitalize() to convert first letter of the string (contained in variable str) to a capital letter and returns the entire string.
Example:
Command: "hello".capitalize()
Output: Hello

How do I reverse words in a string with Python

I am trying to reverse words of a string, but having difficulty, any assistance will be appreciated:
S = " what is my name"
def reversStr(S):
for x in range(len(S)):
return S[::-1]
break
What I get now is: eman ym si tahw
However, I am trying to get: tahw is ym eman (individual words reversed)
def reverseStr(s):
    """Reverse each space-separated word of ``s`` while keeping word order.

    Splitting on a literal space keeps empty fields, so runs of spaces and
    leading or trailing spaces are preserved.
    """
    reversed_words = []
    for word in s.split(' '):
        reversed_words.append(word[::-1])
    return ' '.join(reversed_words)
orig = "what is my name"
# Reverse every word in place; joining the pieces (rather than prepending to
# an accumulator) avoids a stray leading space in the result.
reverse = " ".join(word[::-1] for word in orig.split())
print(reverse)
Since everyone else's covered the case where the punctuation moves, I'll cover the one where you don't want the punctuation to move.
import re
def reverse_words(sentence):
    """Reverse each run of ASCII letters in ``sentence``.

    Everything that is not a letter (spaces, digits, punctuation) stays
    where it is.
    """
    def _reversed_match(match):
        return match.group()[::-1]

    return re.sub(r'[a-zA-Z]+', _reversed_match, sentence)
Breaking this down.
re is python's regex module, and re.sub is the function in that module that handles substitutions. It has three required parameters.
The first is the regex you're matching by. In this case, I'm using r'[a-zA-Z]+'. The r denotes a raw string, [a-zA-Z] matches all letters, and + means "at least one".
The second is either a string to substitute in, or a function that takes in a re.MatchObject and outputs a string. I'm using a lambda (or nameless) function that simply outputs the matched string, reversed.
The third is the string you want to do a find in a replace in.
So "What is my name?" -> "tahW si ym eman?"
Addendum:
I considered a regex of r'\w+' initially, because better unicode support (if the right flags are given), but \w also includes numbers and underscores. Matching - might also be desired behavior: the regexes would be r'[a-zA-Z-]+' (note trailing hyphen) and r'[\w-]+' but then you'd probably want to not match double-dashes (ie --) so more regex modifications might be needed.
The built-in reversed outputs a reversed object, which you have to cast back to string, so I generally prefer the [::-1] option.
inplace refers to modifying the object without creating a copy. Yes, like many of us has already pointed out that python strings are immutable. So technically we cannot reverse a python string datatype object inplace. However, if you use a mutable datatype, say bytearray for storing the string characters, you can actually reverse it inplace
#slicing creates copy; implies not-inplace reversing
def rev(x):
    """Return a reversed copy of the sequence ``x`` (the input is untouched)."""
    reversed_copy = x[::-1]
    return reversed_copy
# inplace reversing, if input is bytearray datatype
def rev_inplace(x: bytearray):
    """Reverse ``x`` in place by swapping mirrored elements, and return it.

    The same object that was passed in is returned; no copy is made.
    """
    size = len(x)
    for left in range(size // 2):
        right = size - 1 - left
        x[left], x[right] = x[right], x[left]
    return x
Input:
x = bytearray(b'some string to reverse')
rev_inplace(x)
Output:
bytearray(b'esrever ot gnirts emos')
Try splitting each word in the string into a list (see: https://docs.python.org/2/library/stdtypes.html#str.split).
Example:
>>string = "This will be split up"
>>string_list = string.split(" ")
>>string_list
>>['This', 'will', 'be', 'split', 'up']
Then iterate through the list and reverse each constituent list item (i.e. word) which you have working already.
def reverse_in_place(phrase):
    """Return ``phrase`` with each space-separated word reversed.

    Despite the name, a new string is built; word order and spacing are kept
    because the text is split and re-joined on a single space.
    """
    return " ".join(word[::-1] for word in phrase.split(" "))
[thread has been closed, but IMO, not well answered]
the python string.lib doesn't include an in place str.reverse() method.
So use the built in reversed() function call to accomplish the same thing.
>>> S = " what is my name"
>>> ("").join(reversed(S))
'eman ym si tahw'
There is no obvious way of reversing a string "truly" in-place with Python. However, you can do something like:
def reverse_string_inplace(string):
    """Return ``string`` reversed.

    Python strings are immutable, so the reversal necessarily builds a new
    string; a reverse slice does that in a single pass.

    Args:
        string: the text to reverse.

    Returns:
        The reversed text; ``""`` for empty input.
    """
    return string[::-1]
Hope this makes sense.
In Python, strings are immutable. This means you cannot change the string once you have created it. So in-place reverse is not possible.
There are many ways to reverse the string in python, but memory allocation is required for that reversed string.
print(' '.join(word[::-1] for word in string))
s1 = input("Enter a string with multiple words:")
print(f'Original:{s1}')
print(f'Reverse is:{s1[::-1]}')
each_word_new_list = []
s1_split = s1.split()
for i in range(0,len(s1_split)):
each_word_new_list.append(s1_split[i][::-1])
print(f'New Reverse as List:{each_word_new_list}')
each_word_new_string=' '.join(each_word_new_list)
print(f'New Reverse as String:{each_word_new_string}')
If the sentence contains multiple spaces then usage of split() function will cause trouble because you won't know then how many spaces you need to rejoin after you reverse each word in the sentence. Below snippet might help:
# Sentence having multiple spaces
given_str = "I know this country runs by mafia "
tmp = ""
tmp_list = []
for i in given_str:
if i != ' ':
tmp = tmp + i
else:
if tmp == "":
tmp_list.append(i)
else:
tmp_list.append(tmp)
tmp_list.append(i)
tmp = ""
print(tmp_list)
rev_list = []
for x in tmp_list:
rev = x[::-1]
rev_list.append(rev)
print(rev_list)
print(''.join(rev_list))
output:
def rev(a):
    """Return the string ``a`` reversed, built recursively.

    Each call moves the first character behind the reversed remainder;
    the empty string is the base case.
    """
    if a != "":
        return rev(a[1:]) + a[0]
    return ""
Reverse string --> gnirts esreveR
def rev(k):
    """Print ``k`` with every whitespace-separated word reversed.

    Word order is kept, e.g. "Reverse string" prints "esreveR gnirts".
    The previous body called ``rev(k)`` on itself unconditionally, which
    recursed forever, and used the Python 2 ``print`` statement.  Reversing
    each word directly gives the same output as reversing the whole string
    and then emitting its words back to front.
    """
    print(" ".join(word[::-1] for word in k.split()))
-->esreveR gnirts

Using a function in Python to return a substring

I have a feeling my question is pretty basic, as I am a first semester computer science student.
I have been asked to return the substring formed before a digit in a string similar to "abcd5efgh". The idea is to use a function to give me "abcd". I think I need to use .isdigit, but I'm not sure how to turn it into a function. Thank you in advance!
It could be done with regexp, but if you already discovered isdigit, why not use it in this case?
You can modify the last return s line to return something else if no digit is found:
def string_before_digit(s):
    """Return the part of ``s`` that precedes its first digit.

    If ``s`` contains no digit, the whole string is returned unchanged.
    """
    # Index of the first character for which str.isdigit() is true, if any.
    cut = next((idx for idx, ch in enumerate(s) if ch.isdigit()), None)
    if cut is not None:
        return s[:cut]
    return s  # no digit found


print(string_before_digit("abcd5efgh"))
I am also currently a student, and this is how I would approach this problem:
*At my school we are not allowed to use built-in functions like that in Python :/
def parse(string):
    """Print and return the part of ``string`` before its first ASCII digit.

    Deliberately avoids ``str.isdigit`` and compares characters against
    the range "0".."9" by hand.  If no digit is present, the whole string
    is printed and returned.
    """
    newstring = ""
    for i in string:
        if "0" <= i <= "9":
            break
        newstring += i
    # print() as a function works on Python 3; the old statement form was
    # a SyntaxError there.  The value is also returned so that other
    # functions can use it.
    print(newstring)
    return newstring


parse("abcd5efgh")
Hope this helps
A functional approach :)
>>> from itertools import compress, count, imap
>>> text = "abcd5efgh"
>>> text[:next(compress(count(), imap(str.isdigit, text)), len(text))]
'abcd'
The code below will give you the first non-digit part by using a regular expression.
import re

# \D* matches the (possibly empty) run of non-digit characters at the
# start of the string.  The earlier character class [a-zA-Z]* stopped at
# the first non-letter (space, underscore, punctuation), not at the first
# digit, so it returned too little for inputs such as "ab_cd5".
myPattern = re.compile(r'\D*')
firstNonDigitPart = myPattern.match('abcd5efgh')
firstNonDigitPart.group()
>>> 'abcd'
If you are not allowed to use regexes, maybe because they told you to do it explicitly by hand, you can do it like this:
def digit_index(s):
    """Helper function: return the index of the first digit in ``s``.

    Returns -1 when ``s`` contains no digit.  The earlier version ignored
    ``s`` and always scanned the hard-coded string "abc123".
    """
    # next(..., -1) takes the first index produced by the generator and
    # falls back to -1 when the generator is empty (no digit found).
    return next(
        (n for n, ch in enumerate(s) if ch.isdigit()),
        -1)
def before_digit(s):
    """Return the part of ``s`` before the index reported by digit_index.

    When digit_index returns -1, ``s`` is returned unchanged.
    """
    di = digit_index(s)
    return s if di == -1 else s[:di]
should give you your wanted result.
A quite simple one-liner, using isdigit :)
>>> s = 'abcd5efgh'
>>> s[:[i for i, j in enumerate([_ for _ in s]) if j.isdigit()][0]]
'abcd'
An itertools approach:
>>> from itertools import takewhile
>>> s="abcd5efgh"
>>> ''.join(takewhile(lambda x: not x.isdigit(), s))
'abcd'

How can I simplify this conversion from underscore to camelcase in Python?

I have written the function below that converts underscore to camelcase with first word in lowercase, i.e. "get_this_value" -> "getThisValue". Also I have requirement to preserve leading and trailing underscores and also double (triple etc.) underscores, if any, i.e.
"_get__this_value_" -> "_get_ThisValue_".
The code:
def underscore_to_camelcase(value):
    """Convert an underscore_separated name to camelCase.

    The first non-empty word is lower-cased and every later word is
    capitalized.  Each empty piece produced by splitting on "_" (from
    leading, trailing or doubled underscores) is emitted as one "_".
    """
    pieces = []
    seen_word = False
    for chunk in value.split("_"):
        if chunk == "":
            pieces.append("_")
        elif seen_word:
            pieces.append(chunk.capitalize())
        else:
            pieces.append(chunk.lower())
            seen_word = True
    return "".join(pieces)
I am feeling the code above as written in non-Pythonic style, though it works as expected, so looking how to simplify the code and write it using list comprehensions etc.
This one works except for leaving the first word as lowercase.
def convert(word):
    """Capitalize every underscore-separated piece of ``word`` and join them.

    Empty pieces (from leading, trailing or doubled underscores) become a
    single "_".  The first word is capitalized as well.
    """
    parts = []
    for segment in word.split('_'):
        parts.append(segment.capitalize() if segment else '_')
    return ''.join(parts)
(I know this isn't exactly what you asked for, and this thread is quite old, but since it's quite prominent when searching for such conversions on Google I thought I'd add my solution in case it helps anyone else).
Your code is fine. The problem I think you're trying to solve is that if first_word_passed looks a little bit ugly.
One option for fixing this is a generator. We can easily make this return one thing for first entry and another for all subsequent entries. As Python has first-class functions we can get the generator to return the function we want to use to process each word.
We then just need to use the conditional operator so we can handle the blank entries returned by double underscores within a list comprehension.
So if we have a word we call the generator to get the function to use to set the case, and if we don't we just use _ leaving the generator untouched.
def underscore_to_camelcase(value):
    """Convert an underscore_separated name to camelCase.

    The first non-empty word is lower-cased, later words are capitalized,
    and every empty piece from the split is rendered as a single "_".
    """
    def camelcase():
        # Hands out the case function for each successive word:
        # str.lower once, then str.capitalize forever.
        yield str.lower
        while True:
            yield str.capitalize

    c = camelcase()
    # next(c) replaces the Python 2-only c.next() method.  It is only
    # evaluated for non-empty pieces, so empty pieces do not advance the
    # generator.
    return "".join(next(c)(x) if x else '_' for x in value.split("_"))
I prefer a regular expression, personally. Here's one that is doing the trick for me:
import re
def to_camelcase(s):
    """Convert underscore_separated ``s`` to camelCase with one regex pass.

    Every "_<ASCII letter>" pair that does not start at the beginning of
    the string is replaced by the upper-cased letter.  Other underscores
    are left in place.
    """
    def _upper_letter(match):
        return match.group(1).upper()

    return re.sub(r'(?!^)_([a-zA-Z])', _upper_letter, s)
Using unutbu's tests:
# (input, expected output) pairs exercising single, doubled, leading and
# trailing underscores.
tests = [
    ('get__this_value', 'get_ThisValue'),
    ('_get__this_value', '_get_ThisValue'),
    ('_get__this_value_', '_get_ThisValue_'),
    ('get_this_value', 'getThisValue'),
    ('get__this__value', 'get_This_Value'),
]
# Raises AssertionError as soon as one conversion does not match.
assert all(to_camelcase(source) == wanted for source, wanted in tests)
Here's a simpler one. Might not be perfect for all situations, but it meets my requirements, since I'm just converting python variables, which have a specific format, to camel-case. This does capitalize all but the first word.
def underscore_to_camelcase(text):
    """
    Convert underscore_delimited_text to camelCase.

    The first piece is kept exactly as given; every later piece is passed
    through str.title().  Underscores themselves are dropped.
    Useful for JSON output.
    """
    head, *tail = text.split('_')
    return head + ''.join(piece.title() for piece in tail)
I think the code is fine. You've got a fairly complex specification, so if you insist on squashing it into the Procrustean bed of a list comprehension, then you're likely to harm the clarity of the code.
The only changes I'd make would be:
To use the join method to build the result in O(n) space and time, rather than repeated applications of += which is O(n²).
To add a docstring.
Like this:
def underscore_to_camelcase(s):
    """Return a camelCase equivalent of the underscore-separated string s.

    The first non-empty word is lower-cased and the following ones are
    capitalized.  Every empty piece produced by the split (leading,
    trailing or doubled underscores) is emitted as one underscore.
    """
    def _convert(parts):
        seen_word = False
        for part in parts:
            if part:
                yield part.capitalize() if seen_word else part.lower()
                seen_word = True
            else:
                yield "_"

    return ''.join(_convert(s.split('_')))
Depending on the application, another change I would consider making would be to memoize the function. I presume you're automatically translating source code in some way, and you expect the same names to occur many times. So you might as well store the conversion instead of re-computing it each time. An easy way to do that would be to use the @memoized decorator from the Python decorator library.
This algorithm also handles digits well:
import re
# Splits on an underscore that is not the first character and is
# immediately followed by an ASCII letter.
PATTERN = re.compile(r'''
(?<!\A) # not at the start of the string
_
(?=[a-zA-Z]) # followed by a letter
''', re.X)


def camelize(value):
    """Convert ``value`` to camelCase, splitting on PATTERN.

    The first token is lower-cased and each following token is
    capitalized.  Underscores that PATTERN does not match (leading ones,
    or ones not followed by a letter) stay in the output.
    """
    head, *rest = PATTERN.split(value)
    return head.lower() + ''.join(piece.capitalize() for piece in rest)
Examples:
>>> camelize('Foo')
'foo'
>>> camelize('_Foo')
'_foo'
>>> camelize('Foo_')
'foo_'
>>> camelize('Foo_Bar')
'fooBar'
>>> camelize('Foo__Bar')
'foo_Bar'
>>> camelize('9')
'9'
>>> camelize('9_foo')
'9Foo'
>>> camelize('foo_9')
'foo_9'
>>> camelize('foo_9_bar')
'foo_9Bar'
>>> camelize('foo__9__bar')
'foo__9_Bar'
Here's mine, relying mainly on list comprehension, split, and join. Plus optional parameter to use different delimiter:
def underscore_to_camel(in_str, delim="_"):
    """Join the ``delim``-separated pieces of ``in_str`` in camelCase.

    The first piece is kept as given; every later piece goes through
    str.title().  The delimiter itself is dropped from the result.
    """
    first, *others = in_str.split(delim)
    return first + "".join(part.title() for part in others)
Also, for sake of completeness, including what was referenced earlier as solution from another question as the reverse (NOT my own code, just repeating for easy reference):
# Inserts "_" before a capitalized word (upper-case letter followed by
# lower-case letters) that is preceded by any character.
first_cap_re = re.compile('(.)([A-Z][a-z]+)')
# Inserts "_" between a lower-case letter or digit and an upper-case letter.
all_cap_re = re.compile('([a-z0-9])([A-Z])')


def camel_to_underscore(in_str):
    """Convert camelCase ``in_str`` to lower-case underscore_separated form.

    The earlier body read an undefined variable ``name`` instead of the
    ``in_str`` parameter and raised NameError on every call.
    """
    s1 = first_cap_re.sub(r'\1_\2', in_str)
    return all_cap_re.sub(r'\1_\2', s1).lower()
I agree with Gareth that the code is ok. However, if you really want a shorter, yet readable approach you could try something like this:
def underscore_to_camelcase(value):
    """Convert an underscore_separated name to camelCase.

    Every word is capitalized first, with empty pieces from the split
    turned into "_".  The first real word is then lower-cased.
    """
    parts = [w.capitalize() if w else '_' for w in value.split('_')]
    # Position of the first entry that is a word rather than a kept underscore.
    first_word = next((i for i, p in enumerate(parts) if p != '_'), None)
    if first_word is not None:
        parts[first_word] = parts[first_word].lower()
    return "".join(parts)
The problem calls for a function that returns a lowercase word the first time, but capitalized words afterwards. You can do that with an if clause, but then the if clause has to be evaluated for every word. An appealing alternative is to use a generator. It can return one thing on the first call, and something else on successive calls, and it does not require as many ifs.
def lower_camelcase(seq):
    """Yield the words of ``seq`` cased for lowerCamelCase output.

    Words are yielded lower-cased up to and including the first one for
    which str.isalnum() is true.  Every word after that is yielded
    capitalized.
    """
    words = iter(seq)
    for token in words:
        yield token.lower()
        if token.isalnum():
            break
    # The same iterator continues here, so only the remaining words are seen.
    yield from (token.capitalize() for token in words)
def underscore_to_camelcase(text):
    """Convert underscore_separated ``text`` using lower_camelcase.

    Empty pieces from the split are passed on as "_" so that they appear
    in the result.
    """
    pieces = (word or '_' for word in text.split('_'))
    return ''.join(lower_camelcase(pieces))
Here is some test code to show that it works:
# (input, expected output) pairs covering single, doubled, leading and
# trailing underscores.
tests = [
    ('get__this_value', 'get_ThisValue'),
    ('_get__this_value', '_get_ThisValue'),
    ('_get__this_value_', '_get_ThisValue_'),
    ('get_this_value', 'getThisValue'),
    ('get__this__value', 'get_This_Value'),
]
for test, answer in tests:
    result = underscore_to_camelcase(test)
    # Compare explicitly instead of assert/except AssertionError: asserts
    # are stripped under ``python -O``, which would make this check
    # silently report nothing.
    if result != answer:
        print('{r!r} != {a!r}'.format(r=result, a=answer))
Here is a list comprehension style generator expression.
from itertools import count
def underscore_to_camelcase(value):
    """Convert an underscore_separated name to camelCase.

    Empty pieces from the split become "_".  The first non-empty word is
    kept exactly as given and every later word is capitalized.
    """
    words = value.split('_')
    counter = count()
    # next(counter) replaces the Python 2-only counter.next().  It is only
    # evaluated for non-empty words and yields 0 (falsy) for the first one,
    # which is how the first word is told apart from the rest.
    return ''.join(
        '_' if w == '' else (w.capitalize() if next(counter) else w)
        for w in words
    )
def convert(word):
    """Convert an underscore_separated string to camelCase.

    Non-string input is returned unchanged.  One leading underscore is
    stripped, the first piece is kept as given and all later pieces are
    capitalized.  Underscores are dropped from the result.
    """
    if not isinstance(word, str):
        return word
    trimmed = word[1:] if word.startswith("_") else word
    head, *tail = trimmed.split("_")
    return head + ''.join(part.capitalize() for part in tail)
This is the most compact way to do it:
def underscore_to_camelcase(value):
    """Convert an underscore_separated name to camelCase.

    Every piece is capitalized, then the first one is lower-cased again.
    Underscores are dropped from the result.
    """
    capitalized = list(map(str.capitalize, value.split('_')))
    return capitalized[0].lower() + "".join(capitalized[1:])
Another regexp solution:
import re
def conv(s):
    """Convert underscore-separated strings to camelCase equivalents.

    Leading, trailing and repeated underscores are each reduced to one
    underscore; single underscores between words are removed.

    >>> conv('get')
    'get'
    >>> conv('_get')
    '_get'
    >>> conv('get_this_value')
    'getThisValue'
    >>> conv('__get__this_value_')
    '_get_ThisValue_'
    >>> conv('_get__this_value__')
    '_get_ThisValue_'
    >>> conv('___get_this_value')
    '_getThisValue'
    """
    def _lower_match(match):
        return match.group(1).lower()

    # Convert case: title-case every word, then lower the first capital
    # (together with any underscores directly before it).
    cased = re.sub(r'(_*[A-Z])', _lower_match, s.title(), count=1)
    # Remove/normalize underscores: mark the runs to keep with '|', drop
    # the remaining single underscores, then turn the markers back into '_'.
    marked = re.sub(r'__+|^_+|_+$', '|', cased)
    without_singles = marked.replace('_', '')
    return without_singles.replace('|', '_')


if __name__ == "__main__":
    import doctest
    doctest.testmod()
It works for your examples, but it might fail for names containing digits - it depends on how you would capitalize them.
For regexp sake !
import re
def underscore_to_camelcase(value):
    """Convert an underscore_separated name to camelCase with one regex pass.

    A word matched at the very start of the string keeps its leading
    underscores and is lower-cased.  Every later word is capitalized and
    loses one of the underscores in front of it, so "a_b" becomes "aB" and
    "a__b" becomes "a_B".  Trailing underscores are not matched and are
    therefore left as they are.

    The earlier version appended "_" after the first word and capitalized
    later words without their underscores, which returned "get_ThisValue"
    for "get_this_value" and lost an underscore in "get__this__value".
    """
    def rep(m):
        underscores, word = m.group(2), m.group(3)
        # group(1) is the optional "^" anchor: it is None unless this
        # match begins at the start of the string.
        if m.group(1) is not None:
            return underscores + word.lower()
        return underscores[1:] + word.capitalize()

    return re.sub(r'(^)?(_*)([a-zA-Z]+)', rep, value)
A slightly modified version:
import re
def underscore_to_camelcase(value):
    """Convert an underscore_separated name to camelCase.

    The string is scanned as (underscores, word) pairs.  The first pair
    is copied through unchanged.  For later pairs, one underscore is
    dropped and the word goes through str.title(); pairs with no word
    (trailing underscores) are kept as they are.
    """
    pieces = []
    pairs = re.findall('([_]*)([^_]*)', value)
    for position, (underscores, word) in enumerate(pairs):
        if position == 0:
            pieces.append(underscores + word)
        elif not word:  # trailing underscores
            pieces.append(underscores)
        else:  # trim an underscore and capitalize
            pieces.append(underscores[:-1] + word.title())
    return ''.join(pieces)
I know this has already been answered, but I came up with some syntactic sugar that handles a special case that the selected answer does not (words with dunders in them i.e. "my_word__is_____ugly" to "myWordIsUgly"). Obviously this can be broken up into multiple lines but I liked the challenge of getting it on one. I added line breaks for clarity.
def underscore_to_camel(in_string):
    """Convert ``in_string`` to camelCase, treating any run of "_" as one separator.

    The first piece is lower-cased.  Each later piece gets its first
    character upper-cased and the rest left as given.  All underscores are
    dropped, e.g. "my_word__is_____ugly" becomes "myWordIsUgly".

    The earlier one-liner indexed ``word[0]`` on every later piece, so a
    trailing underscore (which yields an empty final piece) raised
    IndexError.  Slicing with ``word[:1]`` is safe on empty pieces.
    """
    first, *others = re.split(r"_+", in_string)
    return first.lower() + "".join(
        word[:1].upper() + word[1:] for word in others
    )
Maybe, pydash works for this purpose (https://pydash.readthedocs.io/en/latest/)
>>> from pydash.strings import snake_case
>>> snake_case('needToBeSnakeCased')
'need_to_be_snake_cased'
>>> from pydash.strings import camel_case
>>>camel_case('_get__this_value_')
'getThisValue'

Categories