Double quote string manipulation - python

I have some input data from ASCII files which uses double quotes to encapsulate strings, while also using double quotes inside those strings, for example:
"Reliable" "Africa" 567.87 "Bob" "" "" "" "S 05`56'21.844"" "No Shift"
Notice the double quote used in the coordinate.
So I have been using:
valList = shlex.split(line)
But shlex gets confused by the second double quote in the coordinate.
I've been doing a find and replace on '\"\"' to '\\\"\"'. This of course turns empty strings into \"" as well, so I then do a find and replace (this time with spaces) on ' \\\"\" ' to ' \"\"" '. Not exactly the most efficient way of doing it!
Any suggestions on handling this double quote in the coordinate?

I would do it this way:
I would treat this line of text as a CSV record. Then, according to RFC 4180:
If double-quotes are used to enclose fields, then a double-quote
appearing inside a field must be escaped by preceding it with
another double quote. For example:
"aaa","b""bb","ccc"
Then all you would need to do is add another " to your coordinates. So it would look like this: "S 05`56'21.844""" (note the extra quote). Then you can use the standard `csv` module to break it apart and extract the necessary information.
>>> from StringIO import StringIO
>>> import csv
>>>
>>> test = '''"Reliable" "Africa" 567.87 "Bob" "" "" "" "S 05`56'21.844""" "No Shift"'''
>>> test_obj = StringIO(test)
>>> reader = csv.reader(test_obj, delimiter=' ', quotechar='"', quoting=csv.QUOTE_ALL)
>>> for i in reader:
...     print i
...
The output would be:
['Reliable', 'Africa', '567.87', 'Bob', '', '', '', 'S 05`56\'21.844"', 'No Shift']
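If fixing the data by hand isn't practical, here is a rough sketch of a pre-processing step (my own addition, assuming data quotes like the seconds mark never touch whitespace, while field-delimiting quotes always do) that doubles the embedded quotes before handing the line to csv:
import re
import csv
from StringIO import StringIO

line = '''"Reliable" "Africa" 567.87 "Bob" "" "" "" "S 05`56'21.844"" "No Shift"'''
# Double any quote that has non-whitespace on both sides; field-opening and
# field-closing quotes sit next to whitespace (or the line ends), so they are left alone.
fixed = re.sub(r'(?<=\S)"(?=\S)', '""', line)
reader = csv.reader(StringIO(fixed), delimiter=' ', quotechar='"', quoting=csv.QUOTE_ALL)
print list(reader)[0]
# ['Reliable', 'Africa', '567.87', 'Bob', '', '', '', 'S 05`56\'21.844"', 'No Shift']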

I'm not good with regexes, but this non-regex suggestion might help ...
INPUT = ('"Reliable" "Africa" 567.87 "Bob" "" "" "" "S 05`56'
"'"
'21.844"" "No Shift"')
def main(input):
output = input
surrounding_quote_symbol = '<!>'
if input.startswith('"'):
output = '%s%s' % (surrounding_quote_symbol, output[1:])
if input.endswith('"'):
output = '%s%s' % (output[:-1], surrounding_quote_symbol)
output = output.replace('" ', '%s ' % surrounding_quote_symbol)
output = output.replace(' "', ' %s' % surrounding_quote_symbol)
print "Stage 1:", output
output = output.replace('"', '\"')
output = output.replace(surrounding_quote_symbol, '"')
return output
if __name__ == "__main__":
output = main(INPUT)
print "End results:", output


issues when using re.finditer with + sign character in string

I am using the following code to find the start index of some strings, as well as a temperature, all of which are read from a text file.
The array searchString contains what I'm looking for. It does locate the index of the first character of each string. The issue is that unless I put a backslash in front of the string +25°C, finditer gives an error.
(Alternatively, if I remove the + sign, it works - but I need to look for the specific +25). My question is: am I correctly escaping the + sign, since the line: print('Looking for: ' + headerName + ' in the file: ' + filename )
displays: Looking for: \+25°C in the file: 123.txt (with the slash showing in front of the +)
Am I just 'getting away with this', or is this escaping as it should be?
thanks
import re

path = 'C:\mypath\\'
searchString = ["Power", "Cal", "test", "Frequency", "Max", "\+25°C"]
filename = '123.txt'  # file name to check for text

def search_str(file_path):
    with open(file_path, 'r') as file:
        content = file.read()
        for headerName in searchString:
            print('Looking for: ' + headerName + ' in the file: ' + filename)
            match = re.finditer(headerName, content)
            sub_indices = []
            for temp in match:
                index = temp.start()
                sub_indices.append(index)
            print(sub_indices, '\n')
You should use the re.escape() function to escape your string pattern. It will escape all the special characters in a given string, for example:
>>> print(re.escape('+25°C'))
\+25°C
>>> print(re.escape('my_pattern with specials+&$#('))
my_pattern\ with\ specials\+\&\$\#\(
So keep plain literal strings in your searchString (e.g. "+25°C" rather than "\+25°C") and let re.escape() do the escaping:
def search_str(file_path):
    with open(file_path, 'r') as file:
        content = file.read()
        for headerName in searchString:
            print('Looking for: ' + headerName + ' in the file: ' + filename)
            match = re.finditer(re.escape(headerName), content)
            sub_indices = []
            for temp in match:
                index = temp.start()
                sub_indices.append(index)
            print(sub_indices, '\n')

Get rid of parentheses in output

I think this is an easy question for you, as I am a beginner with Python 3.
When printing the header of a fasta file it contains parentheses. How can I remove them?
import sys
from Bio import Entrez
from Bio import SeqIO

#define email for entrez login
db = "nuccore"
Entrez.email = "someone#email.com"

#load accessions from arguments
if len(sys.argv[1:]) > 1:
    accs = sys.argv[1:]
else: #load accesions from stdin
    accs = [ l.strip() for l in sys.stdin if l.strip() ]

#fetch
sys.stderr.write("Fetching %s entries from GenBank: %s\n" % (len(accs), ", ".join(accs[:10])))
for i, acc in enumerate(accs):
    try:
        sys.stderr.write(" %9i %s \r" % (i+1, acc))
        handle = Entrez.efetch(db=db, rettype="fasta", id=acc)
        seq_record = SeqIO.read(handle, "fasta")
        if (len(seq_record.seq) > 0):
            header = ">" + seq_record.description + " Len:" , len(seq_record.seq)
            print(header)
            print(seq_record.seq)
    except:
        sys.stderr.write("Error! Cannot fetch: %s \n" % acc)
./acc2fasta.py 163345 303239
It will return
(">M69206.1 Bovine MHC class I AW10 mRNA (haplotype AW10), 3' end Len:", 1379)
TCCTGCTGCTCTCGGGGGTCCTGGTCCTGACCGAGACCCGGGCTGGCTCCCACTCGATGAGGTATTTCAGCACCGCCGTGTCCCGGCCCGGCCTCGGGGAGCCCCGGTACCTGGAAGTCGGCTACGTGGACGACACGCAGTTCGTGCGGTTTGACAGCGACGCCCCGAATCCGAGGATGGAGCCGCGGGCGCGGTGGGTGGAGCAGGAGGGGCCGGAGTATTGGGATCGGGAGACGCAAAGGGCCAAGGGCAACGCACAATTTTTCCGAGTGAGCCTGAACAACCTGCGCGGCTACTACAACCAGAGCGAGGCCGGGTCTCACACCCTCCAGTGGATGTCCGGCTGCTACGTGGGGCCGGACGGGCGTCCTCCGCGCGGGTTCATGCAGTTCGGCTACGACGGCAGAGATTACCTCGCCCTGAACGAGGACCTGCGCTCCTGGACCGCGGTGGAGACGATGGCTCAGATCTCCAAACGCAAGATGGAGGCGGCCGGTGAAGCTGAGGTACAGAGGAACTACCTGGAGGGCCGGTGCGTGGAGTGGCTCCGCAGATACCTGGAGAACGGGAAGGACACGCTGCTGCGCGCAGACCCTCCAAAGGCACATGTGACCCGTCACCCGATCTCTGGTCGTGAGGTCACCCTGAGGTGCTGGGCCCTGGGCTTCTACCCTGAAGAGATCTCACTGACCTGGCAGCGCAATGGGGAGGACCAGACCCAGGACATGGAGCTTGTGGAGACCAGGCCTTCAGGGGACGGAAACTTCCAGAAGTGGGCGGCCCTGTTGGTGCCTTCTGGAGAGGAGCAGAAATACACATGCCAAGTGCAGCACGAGGGGCTTCAGGAGCCCCTCACCCTGAAATGGGAACCTCCTCAGCCCTCCTTCCTCACCATGGGCATCATTGTTGGCCTGGTTCTCCTCGTGGTCACTGGAGCTGTGGTGGCTGGAGTTGTGATCTGCATGAAGAAGCGCTCAGGTGAAAAACGAGGGACTTATATCCAGGCTTCAAGCAGTGACAGTGCCCAGGGCTCTGATGTGTCTCTCACGGTTCCTAAAGTGTGAGACACCTGCCTTCGGGGGACTGAGTGATGCTTCATCCCGCTATGTGACATCAGATCCCCGGAACCCCTTTTTCTGCAGCTGCATCTGAATGTGTCAGTGCCCCTATTCGCATAAGTAGGAGTTAGGGAGACTGGCCCACCCATGCCCACTGCTGCCCTTCCCCACTGCCGTCCCTCCCCACCCTGACCTGTGTTCTCTTCCCTGATCCACTGTCCTGTTCCAGCAGAGACGAGGCTGGACCATGTCTATCCCTGTCTTTGCTTTATATGCACTGAAAAATGATATCTTCTTTCCTTATTGAAAATAAAATCTGTC
Error! Cannot fetch: 303239
How do I get rid of the parentheses in the output?
header = ">" + seq_record.description + " Len:" , len(seq_record.seq)
print(header)
You're printing the representation of the tuple by doing so, with commas (expected) but also parentheses (unwanted).
The best way would be to join the data instead, so a comma is inserted between the string fields but the tuple representation is left out:
print(",".join(header))
In your case it's a little trickier: you have to convert non-string arguments to strings (the tuple representation did the conversion, but join doesn't):
print(",".join([str(x) for x in header]))
result:
>M69206.1 Bovine MHC class I AW10 mRNA (haplotype AW10), 3' end Len:,1379
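As a small alternative sketch (my suggestion, not part of the answer above): build the header as a single string inside the question's loop, so there is no tuple to print in the first place:
# instead of building a tuple in the loop:
header = ">" + seq_record.description + " Len:" + str(len(seq_record.seq))
print(header)
print(seq_record.seq)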

How to replace quote ( `”` ) with normal quote (`"`)

I have some large content in a text file like this:
1. name="user1” age="21”
2. name="user2” age="25”
....
As you can notice, I have this special type of quote ( ” ) at the end of each value.
I just want to replace that quote ( ” ) with a normal quote (").
Code:
import codecs

f = codecs.open('myfile.txt', encoding='utf-8')
for line in f:
    print "str text : ", line
    a = repr(line)
    print "repr text : ", a
    x = a.replace(u'\u201d', '"')
    print "new text : ", x
Output:
str text : 1. name="user1” age="21”
repr text : u'1. name="user1\u201d age="21\u201d\n'
new text : u'1. name="user1\u201d age="21\u201d\n'
But it's not working. What am I missing here?
Update :
I just tried this:
import codecs

f = codecs.open('one.txt')
for line in f:
    print "str text : ", line
    y = line.replace("\xe2\x80\x9d", '"')
    print "ynew text : ", y
and it is working now.
Still I want to know what was wrong with x = a.replace(u'\u201d', '"')
a is the repr of the line, which does not contain the character ”, but instead contains the six characters \, u, 2, 0, 1, d.
So changing a = repr(line) to a = line will fix the problem.
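For completeness, a minimal sketch of the corrected loop (your original code, just operating on the line itself rather than its repr):
import codecs

f = codecs.open('myfile.txt', encoding='utf-8')
for line in f:
    # replace on the decoded line directly, not on repr(line)
    x = line.replace(u'\u201d', u'"')
    print "new text : ", x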

Python shlex.split(), ignore single quotes

How, in Python, can I use shlex.split() or similar to split strings, preserving only double quotes? For example, if the input is "hello, world" is what 'i say' then the output would be ["hello, world", "is", "what", "'i", "say'"].
import shlex

def newSplit(value):
    lex = shlex.shlex(value)
    lex.quotes = '"'
    lex.whitespace_split = True
    lex.commenters = ''
    return list(lex)

print newSplit('''This string has "some double quotes" and 'some single quotes'.''')
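For reference, the default (non-POSIX) shlex keeps the quote characters in the tokens, so I would expect this to print something like:
['This', 'string', 'has', '"some double quotes"', 'and', "'some", 'single', "quotes'."]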
You can use shlex.quotes to control which characters will be considered string quotes. You'll need to modify shlex.wordchars as well, to keep the ' with the i and the say.
import shlex
input = '"hello, world" is what \'i say\''
lexer = shlex.shlex(input)
lexer.quotes = '"'
lexer.wordchars += '\''
output = list(lexer)
# ['"hello, world"', 'is', 'what', "'i", "say'"]

Python: Replace tags but preserve inner text V2

I've got a script to do search and replace. It's based on a script here.
It was modified to accept a file as input, but it does not seem to handle regexes well.
The script:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys, os
import re
import glob

_replacements = {
    '[B]': '**',
    '[/B]': '**',
    '[I]': '//',
    '[/I]': '//',
}

def _do_replace(match):
    return _replacements.get(match.group(0))

def replace_tags(text, _re=re.compile('|'.join((r) for r in _replacements))):
    return _re.sub(_do_replace, text)

def getfilecont(FN):
    if not glob.glob(FN): return -1 # No such file
    text = open(FN, 'rt').read()
    text = replace_tags(text, re.compile('|'.join(re.escape(r) for r in _replacements)))
    return replace_tags(text)

scriptName = os.path.basename(sys.argv[0])
if sys.argv[1:]:
    srcfile = glob.glob(sys.argv[1])[0]
else:
    print """%s: Error you must specify file, to convert forum tages to wiki tags!
Type %s FILENAME """ % (scriptName, scriptName)
    exit(1)
dstfile = os.path.join('.', os.path.basename(srcfile)+'_wiki.txt')
converted = getfilecont(srcfile)
try:
    open(dstfile, 'wt+').write(converted)
    print 'Done.'
except:
    print 'Error saving file %s' % dstfile
    print converted
#print replace_tags("This is an [[example]] sentence. It is [[{{awesome}}]].")
What I want is to replace
'[B]': '**',
'[/B]': '**',
with only one regex line like this:
\[B\](.*?)\[\/B\] : **\1**
That would be very helpful with BBCode tags like this:
[FONT=Arial]Hello, how are you?[/FONT]
Then I can use something like this
\[FONT=(.*?)\](.*?)\[\/FONT\] : ''\2''
But I cannot seem to do that with this script. There are other ways to do regex search and replace in the original source of this script, but they work on one tag at a time using re.sub. Another advantage of this script is that I can add as many lines as I want, so I can update it later.
For starters, you're escaping the patterns on this line:
text = replace_tags(text, re.compile('|'.join(re.escape(r) for r in _replacements)))
re.escape takes a string and escapes it in such a way that if the new string were used as a regex it would match exactly the input string.
Removing the re.escape won't entirely solve your problem, however, since you find the replacement by just doing a lookup of the matched text in your dict on this line:
return _replacements.get(match.group(0))
To fix this you could make each pattern into its own capture group:
text = replace_tags(text, re.compile('|'.join('(%s)' % r for r in _replacements)))
You'll also need to know which pattern goes with which substitution. Something like this might work:
_replacements_dict = {
    '[B]': '**',
    '[/B]': '**',
    '[I]': '//',
    '[/I]': '//',
}
_replacements, _subs = zip(*_replacements_dict.items())
def _do_replace(match):
    for i, group in enumerate(match.groups()):
        if group:
            return _subs[i]
Note that this changes _replacements into a list of the patterns, and creates a parallel array _subs for the actual replacements. (I would have named them regexes and replacements, but didn't want to have to re-edit every occurrence of "_replacements").
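Putting those pieces together, a minimal sketch of the idea might look like this (assuming, as in the original dict, that the patterns contain no capture groups of their own; inner groups would shift the group-to-index mapping used below):
import re

_replacements_dict = {
    r'\[B\]': '**',
    r'\[/B\]': '**',
    r'\[I\]': '//',
    r'\[/I\]': '//',
}
_replacements, _subs = zip(*_replacements_dict.items())
# each pattern becomes its own capture group in one combined regex
_re = re.compile('|'.join('(%s)' % r for r in _replacements))

def _do_replace(match):
    # return the substitution belonging to whichever group actually matched
    for i, group in enumerate(match.groups()):
        if group:
            return _subs[i]

print _re.sub(_do_replace, "[B]bold[/B] and [I]italic[/I]")
# -> **bold** and //italic//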
Someone has done it here.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys, os
import re
import glob

_replacements_dict = {
    '\[B\]': '**',
    '\[\/B\]': '**',
    '\[I\]': '//',
    '\[\/I\]': '//',
    '\[IMG\]': '{{',
    '\[\/IMG\]': '}}',
    '\[URL=(.*?)\]\s*(.*?)\s*\[\/URL\]': r'[[\1|\2]]',
    '\[URL\]\s*(.*?)\s*\[\/URL\]': r'[[\1]]',
    '\[FONT=(.*?)\]': '',
    '\[color=(.*?)\]': '',
    '\[SIZE=(.*?)\]': '',
    '\[CENTER]': '',
    '\[\/CENTER]': '',
    '\[\/FONT\]': '',
    '\[\/color\]': '',
    '\[\/size\]': '',
}
_replacements, _subs = zip(*_replacements_dict.items())

def replace_tags(text):
    for i, _s in enumerate(_replacements):
        tag_re = re.compile(r'' + _s, re.I)
        text, n = tag_re.subn(r'' + _subs[i], text)
    return text

def getfilecont(FN):
    if not glob.glob(FN): return -1 # No such file
    text = open(FN, 'rt').read()
    return replace_tags(text)

scriptName = os.path.basename(sys.argv[0])
if sys.argv[1:]:
    srcfile = glob.glob(sys.argv[1])[0]
else:
    print """%s: Error you must specify file, to convert forum tages to wiki tags!
Type %s FILENAME """ % (scriptName, scriptName)
    exit(1)
dstfile = os.path.join('.', os.path.basename(srcfile)+'_wiki.txt')
converted = getfilecont(srcfile)
try:
    open(dstfile, 'wt+').write(converted)
    print 'Done.'
except:
    print 'Error saving file %s' % dstfile
    #print converted
#print replace_tags("This is an [[example]] sentence. It is [[{{awesome}}]].")
http://pastie.org/1447448
