String from file to string utf-8 in python - python

So I am reading and manipulate a file with :
base_file = open(path+'/'+base_name, "r")
lines = base_file.readlines()
After this I search and find the "raw_data" start of line.
if re.match("\s{0,100}raw_data: ",line):
split_line = line.split("raw_data:")
print(split_line)
raw_string = split_line[1]
One example of raw_data is:
raw_data: "&\276!\300\307 =\277\"O\271\277vH9?j?\345?#\243\264=\350\034\345\277\260\345\033\300\023\017(#z|\273\277L\}\277\210\\031\300\213\263z\277\302\241\033\300\000\207\323\277\247Oh>j\354\215#\364\305\201\276\361+\202#t:\304\277\344\231\243#\225k\002\300vw\262\277\362\220j\300\"(\337\276\354b8\300\230\347H\300\201\320\204\300S;N\300Z0G\300>j\210\000#\034\014\220#\231\330J#\223\025\236#\006\332\230\276\227\273\n\277\353#,#\202\205\215\277\340\356\022\300/\223\035\277\331\277\362\276a\350\013#)\353\276\277v6\316\277K\326\207\300`2)\300\004\014Q\300\340\267\271\300MV\305\300\327\010\207\300j\346o\300\377\260\216\300[\332g\300\336\266\003\300\320S\272?6\300Y#\356\250\034\300\367\277&\300\335Uq>o\010&\300r\277\252\300U\314\243\300\253d\377\300"
And raw_string will be
print(raw_data)
"&\276!\300\307
=\277\"O\271\277vH9?j?\345?#\243\264=\350\034\345\277\260\345\033\300\023\017(#z|\273\277L\}\277\210\\031\300\213\263z\277\302\241\033\300\000\207\323\277\247Oh>j\354\215#\364\305\201\276\361+\202#t:\304\277\344\231\243#\225k\002\300vw\262\277\362\220j\300\"(\337\276\354b8\300\230\347H\300\201\320\204\300S;N\300Z0G\300>j\210\000#\034\014\220#\231\330J#\223\025\236#\006\332\230\276\227\273\n\277\353#,#\202\205\215\277\340\356\022\300/\223\035\277\331\277\362\276a\350\013#)\353\276\277v6\316\277K\326\207\300`2)\300\004\014Q\300\340\267\271\300MV\305\300\327\010\207\300j\346o\300\377\260\216\300[\332g\300\336\266\003\300\320S\272?6\300Y#\356\250\034\300\367\277&\300\335Uq>o\010&\300r\277\252\300U\314\243\300\253d\377\300"
If I tried to read this file I will obtain one char to one char even for escape characters.
So, my question is how to transform this plain text to utf-8 string so that I can have one character when reading \300 and not 4 characters.
I tried to pass "encondig =utf-8" in open file method but does not work.
I have made the same example passing raw_data as variable and it works properly.
RAW_DATA = "&\276!\300\307 =\277\"O\271\277vH9?j?\345?#\243\264=\350\034\345\277\260\345\033\300\023\017(#z|\273\277L\\}\277\210\\\031\300\213\263z\277\302\241\033\300\000\207\323\277\247Oh>j\354\215#\364\305\201\276\361+\202#t:\304\277\344\231\243#\225k\002\300vw\262\277\362\220j\300\"(\337\276\354b8\300\230\347H\300\201\320\204\300S;N\300Z0G\300<I>>j\210\000#\034\014\220#\231\330J#\223\025\236#\006\332\230\276\227\273\n\277\353#,#\202\205\215\277\340\356\022\300/\223\035\277\331\277\362\276a\350\013#)\353\276\277v6\316\277K\326\207\300`2)\300\004\014Q\300\340\267\271\300MV\305\300\327\010\207\300j\346o\300\377\260\216\300[\332g\300\336\266\003\300\320S\272?6\300Y#\356\250\034\300\367\277&\300\335Uq>o\010&\300r\277\252\300U\314\243\300\253d\377\300"
print(f"Qnt -> {len(RAW_DATA)}") # Qnt -> 256
print(type(RAW_DATA))
at = 0
total = 0
while at < len(RAW_DATA):
fin = at+4
substrin = RAW_DATA[at:fin]
resu = FourString_float(substrin)
at = fin
For this example \300 is only one char.
Hope someone can help me.

The problem is that on the read file the escape \ symbols are coming in as \, but in the example you've provided they are being evaluated as part of the numerics that follow it. ie, \276 is read as a single character.
If you run:
RAW_DATA = r"&\276!\300\307 =\277\"O\271\277vH9?j?\345?#\243\264=\350\034\345\277\260\345\033\300\023\017(#z|\273\277L\\}\277\210\\\031\300\213\263z\277\302\241\033\300\000\207\323\277\247Oh>j\354\215#\364\305\201\276\361+\202#t:\304\277\344\231\243#\225k\002\300vw\262\277\362\220j\300\"(\337\276\354b8\300\230\347H\300\201\320\204\300S;N\300Z0G\300<I>>j\210\000#\034\014\220#\231\330J#\223\025\236#\006\332\230\276\227\273\n\277\353#,#\202\205\215\277\340\356\022\300/\223\035\277\331\277\362\276a\350\013#)\353\276\277v6\316\277K\326\207\300`2)\300\004\014Q\300\340\267\271\300MV\305\300\327\010\207\300j\346o\300\377\260\216\300[\332g\300\336\266\003\300\320S\272?6\300Y#\356\250\034\300\367\277&\300\335Uq>o\010&\300r\277\252\300U\314\243\300\253d\377\300"
print(f"Qnt -> {len(RAW_DATA)}") # Qnt -> 256
print(type(RAW_DATA))
at = 0
total = 0
while at < len(RAW_DATA):
fin = at+4
substrin = RAW_DATA[at:fin]
resu = FourString_float(substrin)
at = fin
You would should be getting the same error that you were getting originally. Notice that we are using the raw-string literal instead of regular string literal. This will ensure that the \ don't get escaped.
You would need to evaluate the RAW_DATA to force it to evaluate the \.
You can do something like RAW_DATA = eval(f'"{RAW_DATA}"') or
import ast
RAW_DATA = ast.literal_eval(f'"{RAW_DATA}"')
Note, the second option is a bit more secure that doing a straight eval as you are limiting the scope of what can be executed.

Related

Read Null terminated string in python

I'm trying to read a null terminated string but i'm having issues when unpacking a char and putting it together with a string.
This is the code:
def readString(f):
str = ''
while True:
char = readChar(f)
str = str.join(char)
if (hex(ord(char))) == '0x0':
break
return str
def readChar(f):
char = unpack('c',f.read(1))[0]
return char
Now this is giving me this error:
TypeError: sequence item 0: expected str instance, int found
I'm also trying the following:
char = unpack('c',f.read(1)).decode("ascii")
But it throws me:
AttributeError: 'tuple' object has no attribute 'decode'
I don't even know how to read the chars and add it to the string, Is there any proper way to do this?
Here's a version that (ab)uses __iter__'s lesser-known "sentinel" argument:
with open('file.txt', 'rb') as f:
val = ''.join(iter(lambda: f.read(1).decode('ascii'), '\x00'))
How about:
myString = myNullTerminatedString.split("\x00")[0]
For example:
myNullTerminatedString = "hello world\x00\x00\x00\x00\x00\x00"
myString = myNullTerminatedString.split("\x00")[0]
print(myString) # "hello world"
This works by splitting the string on the null character. Since the string should terminate at the first null character, we simply grab the first item in the list after splitting. split will return a list of one item if the delimiter doesn't exist, so it still works even if there's no null terminator at all.
It also will work with byte strings:
myByteString = b'hello world\x00'
myStr = myByteString.split(b'\x00')[0].decode('ascii') # "hello world" as normal string
If you're reading from a file, you can do a relatively larger read - estimate how much you'll need to read to find your null string. This is a lot faster than reading byte-by-byte. For example:
resultingStr = ''
while True:
buf = f.read(512)
resultingStr += buf
if len(buf)==0: break
if (b"\x00" in resultingStr):
extraBytes = resultingStr.index(b"\x00")
resultingStr = resultingStr.split(b"\x00")[0]
break
# now "resultingStr" contains the string
f.seek(0 - extraBytes,1) # seek backwards by the number of bytes, now the pointer will be on the null byte in the file
# or f.seek(1 - extraBytes,1) to skip the null byte in the file
(edit version 2, added extra way at the end)
Maybe there are some libraries out there that can help you with this, but as I don't know about them lets attack the problem at hand with what we know.
In python 2 bytes and string are basically the same thing, that change in python 3 where string is what in py2 is unicode and bytes is its own separate type, which mean that you don't need to define a read char if you are in py2 as no extra work is required, so I don't think you need that unpack function for this particular case, with that in mind lets define the new readString
def readString(myfile):
chars = []
while True:
c = myfile.read(1)
if c == chr(0):
return "".join(chars)
chars.append(c)
just like with your code I read a character one at the time but I instead save them in a list, the reason is that string are immutable so doing str+=char result in unnecessary copies; and when I find the null character return the join string. And chr is the inverse of ord, it will give you the character given its ascii value. This will exclude the null character, if its needed just move the appending...
Now lets test it with your sample file
for instance lets try to read "Sword_Wea_Dummy" from it
with open("sword.blendscn","rb") as archi:
#lets simulate that some prior processing was made by
#moving the pointer of the file
archi.seek(6)
string=readString(archi)
print "string repr:", repr(string)
print "string:", string
print ""
#and the rest of the file is there waiting to be processed
print "rest of the file: ", repr(archi.read())
and this is the output
string repr: 'Sword_Wea_Dummy'
string: Sword_Wea_Dummy
rest of the file: '\xcd\xcc\xcc=p=\x8a4:\xa66\xbfJ\x15\xc6=\x00\x00\x00\x00\xeaQ8?\x9e\x8d\x874$-i\xb3\x00\x00\x00\x00\x9b\xc6\xaa2K\x15\xc6=;\xa66?\x00\x00\x00\x00\xb8\x88\xbf#\x0e\xf3\xb1#ITuB\x00\x00\x80?\xcd\xcc\xcc=\x00\x00\x00\x00\xcd\xccL>'
other tests
>>> with open("sword.blendscn","rb") as archi:
print readString(archi)
print readString(archi)
print readString(archi)
sword
Sword_Wea_Dummy
ÍÌÌ=p=Š4:¦6¿JÆ=
>>> with open("sword.blendscn","rb") as archi:
print repr(readString(archi))
print repr(readString(archi))
print repr(readString(archi))
'sword'
'Sword_Wea_Dummy'
'\xcd\xcc\xcc=p=\x8a4:\xa66\xbfJ\x15\xc6='
>>>
Now that I think about it, you mention that the data portion is of fixed size, if that is true for all files and the structure on all of them is as follow
[unknow size data][know size data]
then that is a pattern we can exploit, we only need to know the size of the file and we can get both part smoothly as follow
import os
def getDataPair(filename,knowSize):
size = os.path.getsize(filename)
with open(filename, "rb") as archi:
unknown = archi.read(size-knowSize)
know = archi.read()
return unknown, know
and by knowing the size of the data portion, its use is simple (which I get by playing with the prior example)
>>> strins_data, data = getDataPair("sword.blendscn", 80)
>>> string_data, data = getDataPair("sword.blendscn", 80)
>>> string_data
'sword\x00Sword_Wea_Dummy\x00'
>>> data
'\xcd\xcc\xcc=p=\x8a4:\xa66\xbfJ\x15\xc6=\x00\x00\x00\x00\xeaQ8?\x9e\x8d\x874$-i\xb3\x00\x00\x00\x00\x9b\xc6\xaa2K\x15\xc6=;\xa66?\x00\x00\x00\x00\xb8\x88\xbf#\x0e\xf3\xb1#ITuB\x00\x00\x80?\xcd\xcc\xcc=\x00\x00\x00\x00\xcd\xccL>'
>>> string_data.split(chr(0))
['sword', 'Sword_Wea_Dummy', '']
>>>
Now to get each string a simple split will suffice and you can pass the rest of the file contained in data to the appropriated function to be processed
Doing file I/O one character at a time is horribly slow.
Instead use readline0, now on pypi: https://pypi.org/project/readline0/ . Or something like it.
In 3.x, there's a "newline" argument to open, but it doesn't appear to be as flexible as readline0.
Here is my implementation:
import struct
def read_null_str(f):
r_str = ""
while 1:
back_offset = f.tell()
try:
r_char = struct.unpack("c", f.read(1))[0].decode("utf8")
except:
f.seek(back_offset)
temp_char = struct.unpack("<H", f.read(2))[0]
r_char = chr(temp_char)
if ord(r_char) == 0:
return r_str
else:
r_str += r_char

Unicode text represented as u'xxxx instead of Japanese in Python 2.7

I've had many struggles with Unicode in Python over the years as I work with many text files in Japanese, so I'm familiar with using .encode("utf-8") to get Japanese text back into Japanese display from u'xxxx. I am NOT getting any encoding/decoding errors. But text I'm reading from a unicode file, manipulating, then writing back into a new file is being represented as strings of u'xxxx instead of the original Japanese text. I have tried .encode() and .decode() in multiple places, and also not using them at all, every time with the same result. Any suggestions are welcome.
Specifically, I am using the Scrapy library to write a spider that takes text from a file it crawls, extracts bits of text to construct the filename of a new file, and then writes the first div of the HTML file as a string into that new file.
What is even more confusing to me is that the bits of text I'm using to create the filename all render in Japanese, as does the filename itself. Is it because I am using str() on the div that I am getting u'xxxx as the content of my file? Please toward the end of the code to see this line.
Here is my complete code (and please ignore how hacky some of it is):
def parse_item(self, response):
original = 0
author = "noauthor"
title = "notitle"
year = "xxxx"
publisher = "xxxx"
typer = "xxxx"
ispub = 0
filename = response.url.split("/")[-1]
if "_" in filename:
filename = filename.split("_")[0]
if filename.isdigit():
title = response.xpath("//h1/text()").extract()[0].encode("utf-8")
author = response.xpath("//h2/text()").extract()[0].encode("utf-8")
ID = filename
bibliographic_info = response.xpath("//div[2]/text()").extract()
for subyear in bibliographic_info:
ispub = 0
subyear = subyear.encode("utf-8").strip()
if "初出:" in subyear:
publisher = subyear.split(":")[1]
original = 1
ispub = 1
if "入力:" in subyear:
typer = subyear.split(":")[1]
if len(subyear) > 1 and (original == 1) and (ispub == 0):
counter = 0
while counter < len(subyear):
if subyear[counter].isdigit():
break
counter+=1
if counter != len(subyear):
year = subyear[counter:(counter+4)]
original = 0
body = str(response.xpath("//div[1]/text()").extract())
new_filename = author + "_" + title + "_" + publisher + "_" + year + "_" + typer + ".html"
file = open(new_filename, "a")
file.write(body.encode("utf-8")
file.close()
# -*- coding: utf-8 -*-
# u'初出' and u'\u521d\u51fa' are different ways to specify *the same* string
assert u'初出' == u'\u521d\u51fa'
#XXX don't mix Unicode and bytes!!!
assert u'初出' != '初出' and u'初出' != '\u521d\u51fa'
Don't use str() at all with a Unicode string as an argument, use the explicit .encode() instead.
Do not call .encode(), .decode() unless necessary; use Unicode sandwich instead:
decode bytes that you receive from outside world into Unicode
keep it Unicode inside your script
encode into bytes at the end to save to a file, send over a network.
Both the first and the last step might be implicit i.e., your program might only see Unicode text.
Note, these are three different things:
the way a string looks like in the source code when you specify it using a string literal (unicode escapes, source code encoding, raw string literals)
the content of the string
how it looks like if you print it (repr(), 'backslashreplace' error handler)
If you see u'...' in the output; it means that at some point repr(unicode_string) is called. It may be implicit e.g., via print([unicode_string]) because repr() is called on items of the list while it is converted to string.
print(u'\u521d\u51fa') # -> 初出 #NOTE: no u'', \u..
print(repr(u'\u521d\u51fa')) # -> u'\u521d\u51fa'

Two regex functions together do not work

I am trying to get the index for the start of a tag and the end of another tag. However, when I use one regex it works absolutely fine but for two regex functions, it gives an error for the second one.
Kindly help in explaining the reason
The below code works fine:
import re
f = open('C:/Users/Jyoti/Desktop/PythonPrograms/try.xml','r')
opentag = re.search('<TEXT>',f.read())
begin = opentag.start()+6
print begin
But when I add another similar regex it give me the error
AttributeError: 'NoneType' object has no attribute 'start'
which I understand is due to the start() function returning None
Below is the code:
import re
f = open('C:/Users/Jyoti/Desktop/PythonPrograms/try.xml','r')
opentag = re.search('<TEXT>',f.read())
begin = opentag.start()+6
print begin
closetag = re.search('</TEXT>',f.read())
end = closetag.start() - 1
print end
Please provide a solution to how can I get this working. Also I am a newbie here so please don't mind if I ask more questions on the solution.
You are reading the file in f.read() which reads the whole file, and so the file descriptor moves forward, which means the text can't be read again when you do f.read() the next time.
If you need to search on the same text again, save the output of f.read(), and then do a regular expression search on it as below:
import re
f = open('C:/Users/Jyoti/Desktop/PythonPrograms/try.xml','r')
text = f.read()
opentag = re.search('<TEXT>',text)
begin = opentag.start()+6
print begin
closetag = re.search('</TEXT>',text)
end = closetag.start() - 1
print end
f.read() reads the whole file. So there's nothing left to read on the second f.read() call.
See https://docs.python.org/2/tutorial/inputoutput.html#methods-of-file-objects
First of all you have to know that f.read() after read file sets the pointer to the EOF so if you again use f.read() it gives you empty string ''. Secondly you should use r before string passed as a pattern of re.search function, which means raw, and automatically escapes special characters. So you have to do something like this:
import re
f = open('C:/Users/Jyoti/Desktop/PythonPrograms/try.xml','r')
data = f.read()
opentag = re.search(r'<TEXT>',data)
begin = opentag.start()+6
print begin
closetag = re.search(r'</TEXT>',data)
end = closetag.start() - 1
print end
gl & hf with Python :)

Python: converting hex values, stored as string, to hex data

(Answer found. Close the topic)
I'm trying to convert hex values, stored as string, in to hex data.
I have:
data_input = 'AB688FB2509AA9D85C239B5DE16DD557D6477DEC23AF86F2AABD6D3B3E278FF9'
I need:
data_output = '\xAB\x68\x8F\xB2\x50\x9A\xA9\xD8\x5C\x23\x9B\x5D\xE1\x6D\xD5\x57\xD6\x47\x7D\xEC\x23\xAF\x86\xF2\xAA\xBD\x6D\x3B\x3E\x27\x8F\xF9'
I was trying data_input.decode('hex'), binascii.unhexlify(data_input) but all they return:
"\xabh\x8f\xb2P\x9a\xa9\xd8\\#\x9b]\xe1m\xd5W\xd6G}\xec#\xaf\x86\xf2\xaa\xbdm;>'\x8f\xf9"
What should I write to receive all bytes in '\xFF' view?
updating:
I need representation in '\xFF' view to write this data to a file (I'm opening file with 'wb') as:
«hЏІPљ©Ш\#›]бmХWЦG}м#Ї†тЄЅm;>'Џщ
update2
Sorry for bothering. An answer lies under my nose all the time:
data_output = data_input.decode('hex')
write_file(filename, data_output) #just opens a file 'wb', ant write a data in it
gives the same result as I need
I like chopping strings into fixed-width chunks using re.findall
print '\\x' + '\\x'.join(re.findall('.{2}', data_input))
If you want to actually convert the string into a list of ints, you can do that like this:
data = [int(x, 16) for x in re.findall('.{2}', data_input)]
It's an inefficient solution, but there's always:
flag = True
data_output = ''
for char in data_input:
if flag:
buffer = char
flag = False
else:
data_output = data_output + '\\x' + buffer + char
flag = True
EDIT HOPEFULLY THE LAST: Who knew I could mess up in so many different ways on that simple a loop? Should actually run now...
>>> int('0x10AFCC', 16)
1093580
>>> hex(1093580)
'0x10afcc'
So prepend your string with '0x' then do the above

Python RegEx nested search and replace

I need to to a RegEx search and replace of all commas found inside of quote blocks.
i.e.
"thing1,blah","thing2,blah","thing3,blah",thing4
needs to become
"thing1\,blah","thing2\,blah","thing3\,blah",thing4
my code:
inFile = open(inFileName,'r')
inFileRl = inFile.readlines()
inFile.close()
p = re.compile(r'["]([^"]*)["]')
for line in inFileRl:
pg = p.search(line)
# found comment block
if pg:
q = re.compile(r'[^\\],')
# found comma within comment block
qg = q.search(pg.group(0))
if qg:
# Here I want to reconstitute the line and print it with the replaced text
#print re.sub(r'([^\\])\,',r'\1\,',pg.group(0))
I need to filter only the columns I want based on a RegEx, filter further,
then do the RegEx replace, then reconstitute the line back.
How can I do this in Python?
The csv module is perfect for parsing data like this as csv.reader in the default dialect ignores quoted commas. csv.writer reinserts the quotes due to the presence of commas. I used StringIO to give a file like interface to a string.
import csv
import StringIO
s = '''"thing1,blah","thing2,blah","thing3,blah"
"thing4,blah","thing5,blah","thing6,blah"'''
source = StringIO.StringIO(s)
dest = StringIO.StringIO()
rdr = csv.reader(source)
wtr = csv.writer(dest)
for row in rdr:
wtr.writerow([item.replace('\\,',',').replace(',','\\,') for item in row])
print dest.getvalue()
result:
"thing1\,blah","thing2\,blah","thing3\,blah"
"thing4\,blah","thing5\,blah","thing6\,blah"
General Edit
There was
"thing1\\,blah","thing2\\,blah","thing3\\,blah",thing4
in the question, and now it is not there anymore.
Moreover, I hadn't remarked r'[^\\],'.
So, I completely rewrite my answer.
"thing1,blah","thing2,blah","thing3,blah",thing4
and
"thing1\,blah","thing2\,blah","thing3\,blah",thing4
being displays of strings (I suppose)
import re
ss = '"thing1,blah","thing2,blah","thing3\,blah",thing4 '
regx = re.compile('"[^"]*"')
def repl(mat, ri = re.compile('(?<!\\\\),') ):
return ri.sub('\\\\',mat.group())
print ss
print repr(ss)
print
print regx.sub(repl, ss)
print repr(regx.sub(repl, ss))
result
"thing1,blah","thing2,blah","thing3\,blah",thing4
'"thing1,blah","thing2,blah","thing3\\,blah",thing4 '
"thing1\blah","thing2\blah","thing3\,blah",thing4
'"thing1\\blah","thing2\\blah","thing3\\,blah",thing4 '
You can try this regex.
>>> re.sub('(?<!"),(?!")', r"\\,",
'"thing1,blah","thing2,blah","thing3,blah",thing4')
#Gives "thing1\,blah","thing2\,blah","thing3\,blah",thing4
The logic behind this is to substitute a , with \, if it is not immediately both preceded and followed by a "
I came up with an iterative solution using several regex functions:
finditer(), findall(), group(), start() and end()
There's a way to turn all this into a recursive function that calls itself.
Any takers?
outfile = open(outfileName,'w')
p = re.compile(r'["]([^"]*)["]')
q = re.compile(r'([^\\])(,)')
for line in outfileRl:
pg = p.finditer(line)
pglen = len(p.findall(line))
if pglen > 0:
mpgstart = 0;
mpgend = 0;
for i,mpg in enumerate(pg):
if i == 0:
outfile.write(line[:mpg.start()])
qg = q.finditer(mpg.group(0))
qglen = len(q.findall(mpg.group(0)))
if i > 0 and i < pglen:
outfile.write(line[mpgend:mpg.start()])
if qglen > 0:
for j,mqg in enumerate(qg):
if j == 0:
outfile.write( mpg.group(0)[:mqg.start()] )
outfile.write( re.sub(r'([^\\])(,)',r'\1\\\2',mqg.group(0)) )
if j == (qglen-1):
outfile.write( mpg.group(0)[mqg.end():] )
else:
outfile.write(mpg.group(0))
if i == (pglen-1):
outfile.write(line[mpg.end():])
mpgstart = mpg.start()
mpgend = mpg.end()
else:
outfile.write(line)
outfile.close()
have you looked into str.replace()?
str.replace(old, new[, count])
Return a copy of the string with all occurrences of substring old
replaced by new. If the optional argument count is given, only the
first count occurrences are replaced.
here is some documentation
hope this helps

Categories