Python regular expression to extract optional number at the end of string - python

I'm trying to write a Python regular expression that can parse strings of the type "<name>(<number>)", where <number> is optional.
For example, if I pass 'sclkout', then there is no number at the end, so it should just match 'sclkout'. If the input is 'line7', then it should match 'line' and '7'. The name can also contain numbers inside it, so if I give it 'dx3f', then the output should be 'dx3f', but for 'dx3b0' it should match 'dx3b' and '0'.
This is what I first tried:
import re
def do_match(signal):
match = re.match('(\w+)(\d+)?', signal)
assert match
print "Input = " + signal
print "group1 = " + match.group(1)
if match.lastindex == 2:
print "group2 = " + match.group(2)
print ""
# should match 'sclkout'
do_match("sclkout")
# should match 'line' and '7'
do_match("line7")
# should match 'dx4f'
do_match("dx4f")
# should match 'dx3b' and '0'
do_match("dx3b0")
This is of course wrong because of greedy matching in the (\w+) group, so I tried setting that to non-greedy:
match = re.match('(\w+?)(\d+)?', signal)
This however only matches the first letter of the string.

You don't need regex for this:
from itertools import takewhile
def do_match(s):
    """Split *s* into ``(name, number)``, where number is the trailing digit run.

    The number part is an empty string when *s* does not end in a digit.
    """
    # Walk the string from its right-hand end; the digits come out reversed.
    trailing_reversed = takewhile(str.isdigit, reversed(s))
    num = ''.join(trailing_reversed)[::-1]
    # The last occurrence of the digit run marks where the name part ends.
    cut = s.rindex(num)
    return s[:cut], num
...
>>> do_match('sclkout')
('sclkout', '')
>>> do_match('line7')
('line', '7')
>>> do_match('dx4f')
('dx4f', '')
>>> do_match('dx3b0')
('dx3b', '0')

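You can use a lazy (non-greedy) quantifier anchored with ^ and $, like this. Note that Python spells named groups (?P<name>...) instead of (?<name>...):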
^(?<name>\w+?)(?<number>\d+)?$
Or ^(\w+?)(\d+)?$, if you don't want the named capture groups.
See live demo here: http://rubular.com/r/44Ntc4mLDY

([a-zA-Z0-9]*[a-zA-Z]+)([0-9]*) is what you want.
import re
test = ["sclkout", "line7", "dx4f", "dx3b0"]
ans = [("sclkout", ""), ("line", "7"), ("dx4f", ""), ("dx3b", "0")]
for t, a in zip(test, ans):
m = re.match(r'([a-zA-Z0-9]*[a-zA-Z]+)([0-9]*)', t)
if m.groups() == a:
print "OK"
else:
print "NG"
output:
OK
OK
OK
OK

Related

Replace ip partially with x in python

I have several ip addresses like
162.1.10.15
160.15.20.222
145.155.222.1
I am trying to replace the ip's like below.
162.x.xx.xx
160.xx.xx.xxx
145.xxx.xxx.x
How to achieve this in python.
Here’s a slightly simpler solution
import re
txt = "192.1.2.3"
x = txt.split(".", 1) # ['192', '1.2.3']
y = x[0] + "." + re.sub(r"\d", "x", x[1])
print(y) # 192.x.x.x
We can use re.sub with a callback function here:
def repl(m):
    """re.sub callback: keep group 1 and mask every character of groups 2-4 with 'x'."""
    pieces = [m.group(1)]
    for group_no in (2, 3, 4):
        # One 'x' per character keeps the original width of each number.
        pieces.append(re.sub(r'.', 'x', m.group(group_no)))
    return '.'.join(pieces)
inp = "160.15.20.222"
output = re.sub(r'\b(\d+)\.(\d+)\.(\d+)\.(\d+)\b', repl, inp)
print(output) # 160.xx.xx.xxx
In the callback, the idea is to use re.sub to surgically replace each digit by x. This keeps the same width of each original number.
This is not the most optimized solution, but it works for me.
import re
Ip_string = "160.15.20.222"
Ip_string = Ip_string.split('.')
# Keep the first octet readable and mask every digit of the remaining octets.
# The pattern is a raw string so that \d is not treated as a string escape.
masked_octets = [re.sub(r'\d', 'x', num) for num in Ip_string[1:]]
Ip_String_x = '.'.join(Ip_string[:1] + masked_octets)
Solution 1
Other answers are good, and this single regex works, too:
import re
strings = [
'162.1.10.15',
'160.15.20.222',
'145.155.222.1',
]
for string in strings:
print(re.sub(r'(?:(?<=\.)|(?<=\.\d)|(?<=\.\d\d))\d', 'x', string))
output:
162.x.xx.xx
160.xx.xx.xxx
145.xxx.xxx.x
Explanation
(?<=\.) means preceded by a single dot.
(?<=\.\d) means preceded by a dot and a single digit.
(?<=\.\d\d) means preceded by a dot and two digits.
\d means a digit.
So every digit that is preceded by a dot followed by zero, one or two digits is replaced with 'x'.
(?<=\.\d{0,2}) or similar patterns are not allowed since a look-behind ((?<=...)) must have a fixed width.
Solution 2
Without re module and regex,
for string in strings:
first, *rest = string.split('.')
print('.'.join([first, *map(lambda x: 'x' * len(x), rest)]))
above code has same result.
There are multiple ways to go about this. Regex is the most versatile and fancy way to write string manipulation codes. But you can also do it by same old for-loops with split and join functions.
ip = "162.1.10.15"
# Split the IPv4 address into its octets using '.' as the delimiter
octets = ip.split(".")
# Keep the first octet; replace each later octet with one 'x' per character
masked = octets[:1] + ["x" * len(octet) for octet in octets[1:]]
# Combine the pieces back into a dotted address
ip = ".".join(masked)
print(ip)
I highly recommend checking Regex but this is also a valid way to go about this task.
Hope you find this useful!
Pass an array of IPs to this function:
def replace_ips(ip_list):
    """Return a new list in which every part after the first is masked with 'x'.

    Each address is split on at most three dots; the leading piece is kept and
    every remaining piece is replaced by 'x' repeated to the same length.
    """
    masked_ips = []
    for address in ip_list:
        head, *tail = address.split(".", 3)
        masked_ips.append(".".join([head] + ["x" * len(part) for part in tail]))
    return masked_ips
In case of your example:
print(replace_ips(["162.1.10.15","160.15.20.222","145.155.222.1"]))#==> expected output: ["162.x.xx.xx","160.xx.xx.xxx","145.xxx.xxx.x"]
Oneliner FYI:
import re
ips = ['162.1.10.15', '160.15.20.222', '145.155.222.1']
pattern = r'\d{1,3}'
replacement_sign = 'x'
# Reverse each address so that count=3 hits the LAST three octets, mask them,
# then reverse back. The callable emits one sign per digit so every octet keeps
# its width; a plain string replacement would collapse each octet to one sign.
res = [
    re.sub(pattern, lambda m: replacement_sign * len(m.group()), ip[::-1], count=3)[::-1]
    for ip in ips
]
print(res)

Is there a regex script that can be used to extract text by defining a start and an end in a text file [duplicate]

Let's say I have a string 'gfgfdAAA1234ZZZuijjk' and I want to extract just the '1234' part.
I only know the few characters directly before the part I am interested in (AAA) and directly after it (ZZZ); the part I want is 1234.
With sed it is possible to do something like this with a string:
echo "$STRING" | sed -e "s|.*AAA\(.*\)ZZZ.*|\1|"
And this will give me 1234 as a result.
How to do the same thing in Python?
Using regular expressions - documentation for further reference
import re
text = 'gfgfdAAA1234ZZZuijjk'
m = re.search('AAA(.+?)ZZZ', text)
if m:
found = m.group(1)
# found: 1234
or:
import re
text = 'gfgfdAAA1234ZZZuijjk'
try:
found = re.search('AAA(.+?)ZZZ', text).group(1)
except AttributeError:
# AAA, ZZZ not found in the original string
found = '' # apply your error handling
# found: 1234
>>> s = 'gfgfdAAA1234ZZZuijjk'
>>> start = s.find('AAA') + 3
>>> end = s.find('ZZZ', start)
>>> s[start:end]
'1234'
Then you can use regexps with the re module as well, if you want, but that's not necessary in your case.
regular expression
import re
re.search(r"(?<=AAA).*?(?=ZZZ)", your_text).group(0)
The above as-is will fail with an AttributeError if there are no "AAA" and "ZZZ" in your_text
string methods
your_text.partition("AAA")[2].partition("ZZZ")[0]
The above will return an empty string if either "AAA" or "ZZZ" don't exist in your_text.
PS Python Challenge?
Surprised that nobody has mentioned this which is my quick version for one-off scripts:
>>> x = 'gfgfdAAA1234ZZZuijjk'
>>> x.split('AAA')[1].split('ZZZ')[0]
'1234'
you can do using just one line of code
>>> import re
>>> re.findall(r'\d{1,5}','gfgfdAAA1234ZZZuijjk')
>>> ['1234']
result will receive list...
import re
print re.search('AAA(.*?)ZZZ', 'gfgfdAAA1234ZZZuijjk').group(1)
You can use re module for that:
>>> import re
>>> re.compile(".*AAA(.*)ZZZ.*").match("gfgfdAAA1234ZZZuijjk").groups()
('1234',)
In python, extracting substring form string can be done using findall method in regular expression (re) module.
>>> import re
>>> s = 'gfgfdAAA1234ZZZuijjk'
>>> ss = re.findall('AAA(.+)ZZZ', s)
>>> print ss
['1234']
text = 'I want to find a string between two substrings'
left = 'find a '
right = 'between two'
print(text[text.index(left)+len(left):text.index(right)])
Gives
string
>>> s = '/tmp/10508.constantstring'
>>> s.split('/tmp/')[1].split('constantstring')[0].strip('.')
With sed it is possible to do something like this with a string:
echo "$STRING" | sed -e "s|.*AAA\(.*\)ZZZ.*|\1|"
And this will give me 1234 as a result.
You could do the same with re.sub function using the same regex.
>>> re.sub(r'.*AAA(.*)ZZZ.*', r'\1', 'gfgfdAAA1234ZZZuijjk')
'1234'
In basic sed, capturing groups are written as \(..\), but in Python they are written as (..).
You can find first substring with this function in your code (by character index). Also, you can find what is after a substring.
def FindSubString(strText, strSubString, Offset=None):
    """Locate the first occurrence of strSubString in strText.

    Returns:
        -1 when strSubString does not occur or the arguments are unusable;
        the start index of the match when Offset == 0;
        all text after the match when Offset is None;
        otherwise the int(Offset) characters that follow the match.
    """
    try:
        Start = strText.find(strSubString)
        if Start == -1:
            return -1  # Not found
        if Offset is None:
            return strText[Start + len(strSubString):]
        if Offset == 0:
            return Start
        AfterSubString = Start + len(strSubString)
        return strText[AfterSubString:AfterSubString + int(Offset)]
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Best-effort contract kept from the original: wrong argument types or
        # a non-numeric Offset yield -1. Unlike a bare except, this no longer
        # swallows KeyboardInterrupt/SystemExit or unrelated bugs.
        return -1
# Example:
Text = "Thanks for contributing an answer to Stack Overflow!"
subText = "to"
print("Start of first substring in a text:")
start = FindSubString(Text, subText, 0)
print(start); print("")
print("Exact substring in a text:")
print(Text[start:start+len(subText)]); print("")
print("What is after substring \"%s\"?" %(subText))
print(FindSubString(Text, subText))
# Your answer:
Text = "gfgfdAAA1234ZZZuijjk"
subText1 = "AAA"
subText2 = "ZZZ"
AfterText1 = FindSubString(Text, subText1, 0) + len(subText1)
BeforText2 = FindSubString(Text, subText2, 0)
print("\nYour answer:\n%s" %(Text[AfterText1:BeforText2]))
Using PyParsing
import pyparsing as pp
word = pp.Word(pp.alphanums)
s = 'gfgfdAAA1234ZZZuijjk'
rule = pp.nestedExpr('AAA', 'ZZZ')
for match in rule.searchString(s):
print(match)
which yields:
[['1234']]
One liner with Python 3.8 if text is guaranteed to contain the substring:
text[text.find(start:='AAA')+len(start):text.find('ZZZ')]
Just in case somebody will have to do the same thing that I did. I had to extract everything inside parenthesis in a line. For example, if I have a line like 'US president (Barack Obama) met with ...' and I want to get only 'Barack Obama' this is solution:
regex = '.*\((.*?)\).*'
matches = re.search(regex, line)
line = matches.group(1) + '\n'
I.e. you need to escape the parentheses with a backslash (\). Though this is a question about regular expressions more than about Python.
Also, in some cases you may see an 'r' prefix before the regex string. If there is no r prefix, you need to double the backslashes, as in C. Here is more discussion on that.
Also, you can find all combinations with the function below:
s = 'Part 1. Part 2. Part 3 then more text'
def find_all_places(text, word):
    """Return the start index of every non-overlapping occurrence of word in text.

    An empty word yields an empty list (it would otherwise match at the same
    position forever).
    """
    if not word:
        return []
    word_places = []
    i = 0
    while True:
        word_place = text.find(word, i)
        if word_place < 0:
            break
        word_places.append(word_place)
        # Resume right after this match. Assigning (rather than adding to the
        # running offset) keeps later matches from being skipped, and checking
        # for "not found" first keeps a match at the very end of the text.
        i = word_place + len(word)
    return word_places
def find_all_combination(text,start,end):
    """Collect text[s:e] for every start position s that lies before an end position e.

    Positions come from find_all_places; each candidate pair is printed
    before it is tested.
    """
    starts = find_all_places(text, start)
    ends = find_all_places(text, end)
    combination_list = []
    for start_place in starts:
        for end_place in ends:
            print(start_place)
            print(end_place)
            # Only keep spans whose start lies strictly before their end.
            if start_place < end_place:
                combination_list.append(text[start_place:end_place])
    return combination_list
find_all_combination(s,"Part","Part")
result:
['Part 1. ', 'Part 1. Part 2. ', 'Part 2. ']
In case you want to look for multiple occurences.
content ="Prefix_helloworld_Suffix_stuff_Prefix_42_Suffix_andsoon"
strings = []
for c in content.split('Prefix_'):
spos = c.find('_Suffix')
if spos!=-1:
strings.append( c[:spos])
print( strings )
Or more quickly :
strings = [ c[:c.find('_Suffix')] for c in content.split('Prefix_') if c.find('_Suffix')!=-1 ]
Here's a solution without regex that also accounts for scenarios where the first substring contains the second substring. This function will only find a substring if the second marker is after the first marker.
def find_substring(string, start, end):
    """Return the text between the first *start* marker and the next *end* marker.

    The *end* marker is searched for only after the *start* marker, so an
    earlier occurrence of *end* is ignored. Returns '' when either marker
    cannot be found.
    """
    start_at = string.find(start)
    if start_at == -1:
        # Without this guard the -1 from find() would be used as an index and
        # an unrelated slice of the string would be returned.
        return ''
    content_from = start_at + len(start)
    content_to = string.find(end, content_from)
    if content_to == -1:
        return ''
    return string[content_from:content_to]
Another way of doing it is using lists (supposing the substring you are looking for is made of numbers, only) :
string = 'gfgfdAAA1234ZZZuijjk'
numbersList = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
output = []
for char in string:
if char in numbersList: output.append(char)
print(f"output: {''.join(output)}")
### output: 1234
Typescript. Gets string in between two other strings.
Searches shortest string between prefixes and postfixes
prefixes - string / array of strings / null (means search from the start).
postfixes - string / array of strings / null (means search until the end).
/**
 * Returns the part of `str` that lies between one of `prefixes` and one of
 * `postfixes`, trimmed so that no further prefix remains inside the result.
 *
 * A single string passed as `prefixes`/`postfixes` is wrapped into a
 * one-element array. `null` prefixes start the search at position 0; `null`
 * postfixes end it at `str.length`.
 *
 * Throws an Error when `str` is empty, or when the text between the markers
 * is empty.
 *
 * NOTE(review): relies on `this.indexOf`, which is defined outside this
 * block. From its use here it presumably returns `{ pos, sub }` for the
 * matched marker, takes an optional start position as third argument, and
 * throws when nothing is found - confirm against its definition.
 */
public getStringInBetween(str: string, prefixes: string | string[] | null,
postfixes: string | string[] | null): string {
if (typeof prefixes === 'string') {
prefixes = [prefixes];
}
if (typeof postfixes === 'string') {
postfixes = [postfixes];
}
if (!str || str.length < 1) {
throw new Error(str + ' should contain ' + prefixes);
}
// Locate the opening marker, then look for the closing marker only after it.
let start = prefixes === null ? { pos: 0, sub: '' } : this.indexOf(str, prefixes);
const end = postfixes === null ? { pos: str.length, sub: '' } : this.indexOf(str, postfixes, start.pos + start.sub.length);
let value = str.substring(start.pos + start.sub.length, end.pos);
if (!value || value.length < 1) {
throw new Error(str + ' should contain string in between ' + prefixes + ' and ' + postfixes);
}
// Keep cutting at any prefix still found inside the candidate; the loop ends
// when this.indexOf throws.
// NOTE(review): when prefixes is null this still calls this.indexOf(value, null);
// what happens then depends on the helper - confirm.
while (true) {
try {
start = this.indexOf(value, prefixes);
} catch (e) {
break;
}
value = value.substring(start.pos + start.sub.length);
if (!value || value.length < 1) {
throw new Error(str + ' should contain string in between ' + prefixes + ' and ' + postfixes);
}
}
return value;
}
a simple approach could be the following:
string_to_search_in = 'could be anything'
start = string_to_search_in.find(str("sub string u want to identify"))
length = len("sub string u want to identify")
First_part_removed = string_to_search_in[start:]
end_coord = length
Extracted_substring=First_part_removed[:end_coord]
One liners that return other string if there was no match.
Edit: improved version uses next function, replace "not-found" with something else if needed:
import re
res = next( (m.group(1) for m in [re.search("AAA(.*?)ZZZ", "gfgfdAAA1234ZZZuijjk" ),] if m), "not-found" )
My other method to do this, less optimal, uses regex 2nd time, still didn't found a shorter way:
import re
res = ( ( re.search("AAA(.*?)ZZZ", "gfgfdAAA1234ZZZuijjk") or re.search("()","") ).group(1) )

Python - Remove All Occurences Of A Substring Within A String

There are 2 main rules of note for the function I am trying to make:
No use of modules are allowed
The substring must be obtained by a 'begin' and 'end' string.
The aim is to take a base, begin, and end string. Then, remove all text between those strings. This has to be for each occurrence, not just the first.
eg:
base is "yes_and_no___yes_and_no",
begin is "yes",
end is "no"
output: "yesno___yesno"
This is my code so far, however it only works for the first occurrence. Would a recursive implementation be ideal?
def extractFromString(baseStr, extStr1, extStr2):
if extStr1 and extStr2 in baseStr:
# >1. Get start/end indices
start = baseStr.find(extStr1) + len(extStr1)
end = baseStr.find(extStr2)
# >2. Get first/second halves
firstHalf = baseStr[:start]
secondHalf = baseStr[end:]
# >3. Combine and return
result = firstHalf + secondHalf
return result
extStr1 = "yes"
extStr2 = "no"
def extractFromString(baseStr, extStr1, extStr2):
    """Remove the text between each extStr1 and the next extStr2 that follows it.

    The markers themselves are kept. The string is processed front to back:
    once a begin/end pair has been collapsed, the remainder after that end
    marker is handled by a recursive call.
    """
    # Nothing to strip unless both markers occur somewhere in the text.
    if extStr1 not in baseStr or extStr2 not in baseStr:
        return baseStr
    keep_until = baseStr.find(extStr1) + len(extStr1)
    end_marker_at = baseStr.find(extStr2, keep_until)
    if end_marker_at < 0:
        # An end marker exists, but none after the begin marker.
        return baseStr
    resume_at = end_marker_at + len(extStr2)
    collapsed = baseStr[:keep_until] + baseStr[end_marker_at:resume_at]
    return collapsed + extractFromString(baseStr[resume_at:], extStr1, extStr2)
for exampleStr in exampleStrs:
print("input:")
print(exampleStr)
print("output:")
print(extractFromString(exampleStr, extStr1, extStr2))
print("\n")
gives the following output:
input:
yes_and_no___yes_and_no
output:
yesno___yesno
input:
aha_no_yes_deleteThis_no_no_no_yes
output:
aha_no_yesno_no_no_yes
input:
yes_yes_aha_no_no_yes_no_no
output:
yesno_no_yesno_no
input:
yes_yes_no_no
output:
yesno_no
this is done by splitting the string and recursively calling the function.
Check for the last example if this is the behaviour you want tho.
There's a problem with your if. if extStr1 and extStr2 in baseStr doesn't do what you think it does. You need to check if each substring is in the base string individually like if extStr1 in baseStr and extStr2 in baseStr
Instead of using loops or recursion, I'd suggest using regular expressions and re.sub()
First, we build a regex to match yes, then as few of any character as possible, and then no: yes.*?no Try it
Remember to escape() the input strings in case they contain special characters.
Next, we replace all occurrences of this regex with yesno.
import re
def extractFromString(baseStr, extStr1, extStr2):
    """Collapse every shortest 'extStr1 ... extStr2' span to extStr1 + extStr2.

    Both markers are escaped, so they are matched literally. Because '.' does
    not match a newline by default, a span that crosses a line break is left
    untouched.
    """
    rexp = re.compile(f"{re.escape(extStr1)}.*?{re.escape(extStr2)}")
    # A callable replacement inserts the markers verbatim; passing them as a
    # replacement string would have their backslashes parsed as template
    # escapes (wrong output or a "bad escape" error).
    return rexp.sub(lambda _match: extStr1 + extStr2, baseStr)
Running this with a bunch of inputs
extractFromString("yes_and_no___yes_and_no", "yes", "no")
# Output: 'yesno___yesno'
extractFromString("aha_no_yes_deleteThis_no_no_no_yes", "yes", "no")
# Output: 'aha_no_yesno_no_no_yes'
extractFromString("yes_yes_aha_no_no_yes_no_no", "yes", "no")
# Output: 'yesno_no_yesno_no'
extractFromString("yes_yes_no_no", "yes", "no")
# Output: 'yesno_no'
You can split the base string at every occurrence of your extStr2 first and then split it at the occurrence of extStr1
def extractFromString(baseStr, extStr1, extStr2):
    """Remove the text between each extStr1 and the next extStr2, keeping both markers.

    The string is split at every extStr2; inside each piece that was followed
    by an extStr2, everything after the first extStr1 is dropped. Text after
    the last extStr2 is returned unchanged.

    Limitation: because the split happens on extStr2 first, a begin marker
    that itself contains the end marker is never recognised.
    """
    if not extStr1 or not extStr2:
        # str.split rejects an empty separator, and an empty begin marker
        # would give nothing to anchor on.
        return baseStr
    *closed_segments, tail = baseStr.split(extStr2)
    pieces = []
    for segment in closed_segments:
        # partition keeps the text before the first begin marker plus the
        # marker itself; when the marker is absent the segment is kept whole.
        head, marker, _removed = segment.partition(extStr1)
        pieces.append(head + marker + extStr2)
    pieces.append(tail)
    return "".join(pieces)
I haven't run this, but this might work for your case

Replacing a certain number of characters only

I was wondering if anyone could help provide some insight on the following problem that I am currently struggling with.
Let's assume that you have a file that contains the following characters:
|**********|
You have another file that contains a pattern, such as:
-
/-\
/---\
/-----\
/-------\
How would you go about replacing the characters in the pattern with the characters from the first file BUT at the same time - you can only print the specific number of *'s that are in the first file.
Once you have printed say the 10 stars, in total, you have to STOP printing.
So it would be something like:
*
***
*****
*
Any hints or tips or help would be greatly appreciated.
I have been using .replace() to replace all of the characters in the pattern with the '*' but I am unable to print the specific amount only.
for ch in ['-', '/', '\\']:
if ch in form:
form = form.replace(ch, '*')
Here's my asterisk file (aestricks.txt), which contains:
************
And pattern file (pattern.txt), which contains:
-
/-\
/---\
/-----\
/-------\
And here's the code. I know it can be optimized a little more, but I am posting the basic one:
file1 = open("aestricks.txt","r")
file1 = file1.read()
t_c = len(file1)
form = open("pattern.txt","r")
form = form.read()
form1 = form
count = 0
for ch in form1:
if ch in ['-','/', '\\']:
form = form.replace(ch, '*', 1)
count += 1
if count == t_c:
break
for ch in form1:
if ch in ['-','/', '\\']:
form = form.replace(ch, '')
print(form)
OUTPUT:
*
***
*****
***
You can use regular expressions and sub() function from re module.
sub() takes an optional count argument that indicates the maximal number of pattern occurrences to be replaced.
import re
with open('asterisks.txt') as asterisks_file, open('ascii_art.txt') as ascii_art_file:
pattern = re.compile(r'[' # match one from within square brackets:
r'\\' # either backslash
r'/' # or slash
r'-' # or hyphen
r']')
# let n be the number of asterisks from the first file
n = asterisks_file.read().count('*')
# replace first n matches of our pattern (one of /\- characters)
replaced_b = pattern.sub('*', ascii_art_file.read(), n)
# replace rest of the /\- characters with spaces (based on your example)
result = pattern.sub(' ', replaced_b)
print(result)
OUTPUT:
*
***
*****
*
Instead of replacing every character at once you can replace items one at a time and use some count on number of replacements.
But str object doesn't support item assignment at specific index, so you have to convert the str object into list first. Then do your operations and convert back to str again.
you can write something like this.
characters = ['-', '/', '\\']
count = 0
a = list(form) # convert your string to list
for i in range(len(a)):
if a[i] in characters and count < 10: # iterate through each character
a[i] = '*' # replace with '*'
count += 1 # increment count
result = "".join(a) # convert list back into str
print(result)
import re
# Count the asterisks available in the first file.
with open("file1.txt", "r") as file1:
    s = file1.read()
starcount = s.count('*')
with open("file2.txt", "r") as file2:
    line = file2.read()
# Inside [...] a '|' is a literal pipe, not alternation, so the class lists
# only the three pattern characters.
if starcount:
    # re.sub treats count=0 as "replace all", hence the guard for zero stars.
    line = re.sub(r'[-\\/]', r'*', line, count=starcount)
# Drop the pattern characters that did not receive an asterisk.
line = re.sub(r'[-\\/]', r'', line)
print(line)
Syntax of sub
>>> import re
>>> help(re.sub)
Help on function sub in module re:
sub(pattern, repl, string, count=0, flags=0)
Return the string obtained by replacing the leftmost
non-overlapping occurrences of the pattern in string by the
replacement repl. repl can be either a string or a callable;
if a string, backslash escapes in it are processed. If it is
a callable, it's passed the match object and must return
a replacement string to be used.
Output
*
***
*****
*
Demo
https://repl.it/repls/ObeseNoteworthyDevelopment
You just need to keep track of the number of * in the input line and then continue to replace the dashes until the counter runs out. Once the counter runs out then replace the remaining dashes with empty strings.
def replace(p, s):
    """Fill pattern *p* with as many '*' as *s* contains.

    The pattern characters '\\', '/' and '-' are turned into '*' from the
    start of *p* until the asterisks in *s* are used up; the remaining
    pattern characters are removed. All other characters (spaces, newlines)
    are kept as they are.
    """
    # Count the actual asterisks instead of assuming two delimiter characters;
    # this also makes "no asterisks" produce no stars at all.
    remaining = s.count('*')
    out = []
    for c in p:
        if c in ('\\', '/', '-'):
            if remaining > 0:
                out.append('*')
                remaining -= 1
            # Pattern characters beyond the budget are dropped.
        else:
            out.append(c)
    return ''.join(out)
if __name__ == '__main__':
stars = '|**********|'
pyramid = r'''
-
/-\
/---\
/-----\
/-------\ '''
print(pyramid)
print(replace(pyramid, stars))
OUTPUT
*
***
*****
*
import re
with open('stars.txt', 'r') as stars_file:
    inp = stars_file.read()
# Count the asterisks themselves, so delimiters and a trailing newline in the
# file cannot inflate the number.
count = inp.count('*')
with open('pattern.txt', 'r') as pattern_file:
    pattern = pattern_file.read()  # read the entire pattern
out = pattern
if count:
    # Replace the first `count` occurrences of '-', '/' or '\' with '*'.
    # re.sub treats count=0 as "replace all", hence the guard for zero stars.
    out = re.sub(r'-|/|\\', r'*', out, count=count)
out = re.sub(r'-|/|\\', r'', out)  # remove the pattern characters left over
print(out)
Added one more re.sub line to remove the left out characters if any.

How do I split a comma delimited string in Python except for the commas that are within quotes

I am trying to split a comma delimited string in python. The tricky part for me here is that some of the fields in the data themselves have a comma in them and they are enclosed within quotes (" or '). The resulting split string should also have the quotes around the fields removed. Also, some fields can be empty.
Example:
hey,hello,,"hello,world",'hey,world'
needs to be split into 5 parts like below
['hey', 'hello', '', 'hello,world', 'hey,world']
Any ideas/thoughts/suggestions/help with how to go about solving the above problem in Python would be much appreciated.
Thank You,
Vish
Sounds like you want the CSV module.
(Edit: The original answer had trouble with empty fields on the edges due to the way re.findall works, so I refactored it a bit and added tests.)
import re
def parse_fields(text):
    r"""Yield each comma-separated field of *text*.

    A field wrapped in a matching pair of single or double quotes is yielded
    without the quotes and may contain commas.

    >>> list(parse_fields('hey,hello,,"hello,world",\'hey,world\''))
    ['hey', 'hello', '', 'hello,world', 'hey,world']
    >>> list(parse_fields('hey,hello,,"hello,world",\'hey,world\','))
    ['hey', 'hello', '', 'hello,world', 'hey,world', '']
    >>> list(parse_fields(',hey,hello,,"hello,world",\'hey,world\','))
    ['', 'hey', 'hello', '', 'hello,world', 'hey,world', '']
    >>> list(parse_fields(''))
    ['']
    >>> list(parse_fields(','))
    ['', '']
    >>> list(parse_fields('testing,quotes not at "the" beginning \'of\' the,string'))
    ['testing', 'quotes not at "the" beginning \'of\' the', 'string']
    >>> list(parse_fields('testing,"unterminated quotes'))
    ['testing', '"unterminated quotes']
    """
    # group 1: optional opening quote, group 2: field text,
    # \1: the same closing quote, group 3: separator (',' or end of string).
    field_re = re.compile(r"""(['"]?)(.*?)\1(,|$)""")
    cursor = 0
    match = field_re.search(text, cursor)
    while True:
        yield match.group(2)
        if not match.group(3):
            # An empty separator means the end of the string was reached.
            return
        cursor = match.end(0)
        match = field_re.search(text, cursor)
if __name__ == "__main__":
import doctest
doctest.testmod()
(['"]?) matches an optional single- or double-quote.
(.*?) matches the string itself. This is a non-greedy match, to match as much as necessary without eating the whole string. This is assigned to result, and it's what we actually yield as a result.
\1 is a backreference, to match the same single- or double-quote we matched earlier (if any).
(,|$) matches the comma separating each entry, or the end of the line. This is assigned to separator.
If separator is false (eg. empty), that means there's no separator, so we're at the end of the string--we're done. Otherwise, we update the new start position based on where the regex finished (m.end(0)), and continue the loop.
The csv module won't handle the scenario of " and ' being quotes at the same time. Absent a module that provides that kind of dialect, one has to get into the parsing business. To avoid reliance on a third party module, we can use the re module to do the lexical analysis, using the re.MatchObject.lastindex gimmick to associate a token type with the matched pattern.
The following code when run as a script passes all the tests shown, with Python 2.7 and 2.2.
import re
# lexical token symbols
DQUOTED, SQUOTED, UNQUOTED, COMMA, NEWLINE = range(5)

_pattern_tuples = (
    (r'"[^"]*"', DQUOTED),
    (r"'[^']*'", SQUOTED),
    (r",", COMMA),
    (r"$", NEWLINE),  # matches end of string OR \n just before end of string
    (r"[^,\n]+", UNQUOTED),  # order in the above list is important
)
# One alternation with a capture group per token pattern; the index of the
# group that matched (lastindex) identifies the token type.
_matcher = re.compile(
    '(' + ')|('.join([i[0] for i in _pattern_tuples]) + ')',
).match
_toktype = [None] + [i[1] for i in _pattern_tuples]
# need dummy at start because re.MatchObject.lastindex counts from 1


def csv_split(text):
    r"""Split a csv string into a list of fields.

    Fields may be quoted with " or ' or be unquoted.
    An unquoted string can contain both a " and a ', provided neither is at
    the start of the string.
    A trailing \n will be ignored if present.

    Raises:
        ValueError: if no token matches at the current offset, or if a field
            is followed by something other than a comma or the end of input.
    """
    fields = []
    pos = 0
    want_field = True  # True while a field (possibly empty) is expected next
    while True:
        m = _matcher(text, pos)
        if not m:
            raise ValueError("Problem at offset %d in %r" % (pos, text))
        ttype = _toktype[m.lastindex]
        if want_field:
            if ttype in (DQUOTED, SQUOTED):
                fields.append(m.group(0)[1:-1])  # strip the surrounding quotes
                want_field = False
            elif ttype == UNQUOTED:
                fields.append(m.group(0))
                want_field = False
            elif ttype == COMMA:
                # A comma where a field was expected means an empty field.
                fields.append("")
            else:
                assert ttype == NEWLINE
                fields.append("")
                break
        else:
            if ttype == COMMA:
                want_field = True
            elif ttype == NEWLINE:
                break
            else:
                print("*** Error dump ***", ttype, repr(m.group(0)), fields)
                raise ValueError("Missing comma at offset %d in %r" % (pos, text))
        pos = m.end(0)
    return fields
if __name__ == "__main__":
    # (input text, expected list of fields)
    tests = (
        ("""hey,hello,,"hello,world",'hey,world'\n""", ['hey', 'hello', '', 'hello,world', 'hey,world']),
        ("""\n""", ['']),
        ("""""", ['']),
        ("""a,b\n""", ['a', 'b']),
        ("""a,b""", ['a', 'b']),
        (""",,,\n""", ['', '', '', '']),
        ("""a,contains both " and ',c""", ['a', 'contains both " and \'', 'c']),
        ("""a,'"starts with "...',c""", ['a', '"starts with "...', 'c']),
    )
    for text, expected in tests:
        result = csv_split(text)
        # print() calls with a single argument behave the same on Python 2 and 3.
        print("")
        print(repr(text))
        print(repr(result))
        print(repr(expected))
        print(result == expected)
I fabricated something like this. Very redundant I suppose, but it does the job for me. You have to adapt it a bit to your specifications:
def csv_splitter(line):
    """Split *line* at the commas that are not inside double quotes.

    Every double quote is removed from the resulting fields, together with
    the comma that starts each field after the first.
    """
    boundaries = [0]
    outside_quotes = True
    for index, char in enumerate(line):
        if char == "\"":
            # Each double quote toggles between "inside" and "outside".
            outside_quotes = not outside_quotes
        if char == "," and outside_quotes:
            boundaries.append(index)
    boundaries.append(len(line)+1)
    # Each slice after the first starts at its separating comma; the regex
    # strips that leading comma and all double quotes.
    return [
        re.sub("^,|\"","",line[lo:hi])
        for lo, hi in zip(boundaries, boundaries[1:])
    ]

Categories