Custom encoding and decoding UTF special characters - python

Just for fun I've been embedding text in images. The following code is a distillation and demonstration of the encoding and decoding mechanism I am using.
class encChar:
def __init__(self,char):
self.p = self.enc(char)
def enc(self,char):
d = bin(ord(char)).split('b')[1]
while len(d)<8:
d = "0"+d
rdif = int(d[0])*4 + int(d[1])*2 + int(d[2])*1
gdif = int(d[3])*2 + int(d[4])*1
bdif = int(d[5])*4 + int(d[6])*2 + int(d[7])*1
return (rdif,gdif,bdif)
def dec(self):
dmap = {0:"000",1:"001",2:"010",3:"011",4:"100",5:"101",6:"110",7:"111"}
r = dmap[self.p[0]]
g = dmap[self.p[1]][1:]
b = dmap[self.p[2]]
n = int(r+g+b,2)
return chr(int(r+g+b,2))
testStr = """Languages
Deutsch
Español
Français
한국어
Italiano
Русский
Tagalog
Tiếng Việt
中文"""
result= ""
for line in testStr.split("\n"):
result+=line+"\n"
print(line)
print("".join([encChar(k).dec() for k in line]))
result+="".join([encChar(k).dec() for k in line])+"\n"
print()
result+="\n"
with open("errorop.txt","w",encoding="utf8") as op:
op.write(result)
Which produces the following document:
Languages
Languages
Deutsch
Deutsch
Español
Español
Français
Français
한국어
Õ­Å
Italiano
Italiano
Русский
Tagalog
Tagalog
Tiếng Việt
Tiõng Viöt
中文
Ë
As you can see several runes are altered by the process and I'm wondering how I can preserve them through this process.

Related

matching names across multiple lists

the following code is what I've tried to do so far:
import json
uids = {'483775843796': '"jared trav"','483843796': '"azu jared"', '483843996': '"hello azu"', '44384376': '"bitten virgo"', '48384326': '"bitten hello"', '61063868': '"charm feline voxela derp virgo"', '11136664': '"jessica"', '11485423': '"yukkixxtsuki"', '10401438': '"howen"', '29176667': '"zaku ramba char"', '36976082': '"bulma zelda dame prince"', '99661300': '"voxela"', '76923817': '"juniperrose"', '16179876': '"gnollfighter"', '45012369': '"pianist fuzz t travis blunt trav ttttttttttttttttttyt whole ryann lol tiper cuz"', '62797501': '"asriel"', '73647929': '"voxela"', '95019796': '"dao daoisms"', '70094978': '"mort"', '16233382': '"purrs"', '89270209': '"apocalevie waify"', '42873540': '"tear slash peaches attitude maso lyra juvia innocent"', '61284894': '"pup"', '68487075': '"ninja"', '66451758': '"az"', '23492247': '"vegeta"', '77980169': '"virus"'}
def _whois(string):
a = []
for i in uids:
i = json.loads(uids[i])
i = i.split()
if string in i:
a += i
for i in uids:
i = json.loads(uids[i])
i = i.split()
if bool(set(i) & set(a)) == True:
a += i
return list(set(a))
def whois(string):
a = []
ret = _whois(string)
for i in ret:
a += _whois(i)
return list(set(a))
print(whois("charm"))
I am trying to match a search term with accounts that share an id with the term in it, and then match each of those other accounts that are with the id to other accounts on other ids and so on and basically see all of the linked accounts that start from a single term.
For example, if I searched "charm" it would return: "charm feline voxela derp virgo bitten hello" from the example uids above.
After a certain way down the line of connected accounts it stops matching. How would I successfully do this so that it matches all accounts potentially infinitely?
i think i got it to work:
import json
terms = {'4837759863453450996': '"mamma riyoken"','4833480984509580996': '"mamma heika"','483775980980996': '"nemo heika"','4867568843796': '"control nemo"','4956775843796': '"t control"','483775843796': '"jared trav"','483843796': '"azu jared"', '483843996': '"hello azu"', '44384376': '"bitten virgo"', '48384326': '"bitten hello"', '61063868': '"charm feline voxela derp virgo"', '11136664': '"jessica"', '11485423': '"yukkixxtsuki"', '10401438': '"howen"', '29176667': '"zaku ramba char"', '36976082': '"bulma zelda dame prince"', '99661300': '"voxela"', '76923817': '"juniperrose"', '16179876': '"gnollfighter"', '45012369': '"pianist fuzz t travis blunt trav ttttttttttttttttttyt whole ryann lol tiper cuz"', '62797501': '"asriel"', '73647929': '"voxela"', '95019796': '"dao daoisms"', '70094978': '"mort"', '16233382': '"purrs"', '89270209': '"apocalevie waify"', '42873540': '"tear slash peaches attitude maso lyra juvia innocent"', '61284894': '"pup"', '68487075': '"ninja"', '66451758': '"az"', '23492247': '"vegeta"', '77980169': '"virus"'}
def _search(string):
a = []
for i in terms:
i = json.loads(terms[i])
i = i.split()
if string in i:
a += i
return list(set(a))
def search(string):
a = []
a.append(string)
while True:
l = len(a)
for n in a:
a += _search(n)
a = list(set(a))
if l == len(a):
break
return a
print(search("charm"))
Try this:
ids = {'483775843796': '"jared trav"','483843796': '"azu jared"', '483843996': '"hello azu"', '44384376': '"bitten virgo"', '48384326': '"bitten hello"', '61063868': '"charm feline voxela derp virgo"', '11136664': '"jessica"', '11485423': '"yukkixxtsuki"', '10401438': '"howen"', '29176667': '"zaku ramba char"', '36976082': '"bulma zelda dame prince"', '99661300': '"voxela"', '76923817': '"juniperrose"', '16179876': '"gnollfighter"', '45012369': '"pianist fuzz t travis blunt trav ttttttttttttttttttyt whole ryann lol tiper cuz"', '62797501': '"asriel"', '73647929': '"voxela"', '95019796': '"dao daoisms"', '70094978': '"mort"', '16233382': '"purrs"', '89270209': '"apocalevie waify"', '42873540': '"tear slash peaches attitude maso lyra juvia innocent"', '61284894': '"pup"', '68487075': '"ninja"', '66451758': '"az"', '23492247': '"vegeta"', '77980169': '"virus"'}
def find_word(word,dict):
for i,j in dict.items():
if word.lower() in j.lower():
print(i,j)
find_word('jared', ids)
Result:
483775843796 "jared trav"
483843796 "azu jared"

I am trying to extract sequences from a file, but getting following error

Code to extract sequences
from Bio import SeqIO
def get_cds_feature_with_qualifier_value(seq_record, name, value):
for feature in genome_record.features:
if feature.type == "CDS" and value in feature.qualifiers.get(name, []):
return feature
return None
genome_record = SeqIO.read("470.8208.gbk", "genbank")
db_xref = ['fig|470.8208.peg.2198', 'fig|470.8208.peg.2200', 'fig|470.8208.peg.2203', 'fig|470.8208.peg.2199', 'fig|470.8208.peg.2201', 'fig|470.8208.peg.2197', 'fig|470.8208.peg.2202', 'fig|470.8208.peg.2501', 'fig|470.8208.peg.2643', 'fig|470.8208.peg.2193', 'fig|470.8208.peg.2670', 'fig|470.8208.peg.2695', 'fig|470.8208.peg.2696', 'fig|470.8208.peg.2189', 'fig|470.8208.peg.2458', 'fig|470.8208.peg.2191', 'fig|470.8208.peg.2190', 'fig|470.8208.peg.2188', 'fig|470.8208.peg.2192', 'fig|470.8208.peg.2639', 'fig|470.8208.peg.3215', 'fig|470.8208.peg.2633', 'fig|470.8208.peg.2682', 'fig|470.8208.peg.3186', 'fig|470.8208.peg.2632', 'fig|470.8208.peg.2683', 'fig|470.8208.peg.3187', 'fig|470.8208.peg.2764', 'fig|470.8208.peg.2686', 'fig|470.8208.peg.2638', 'fig|470.8208.peg.2680', 'fig|470.8208.peg.2685', 'fig|470.8208.peg.2684', 'fig|470.8208.peg.2633', 'fig|470.8208.peg.2682', 'fig|470.8208.peg.3186', 'fig|470.8208.peg.2632', 'fig|470.8208.peg.2683', 'fig|470.8208.peg.3187', 'fig|470.8208.peg.2640', 'fig|470.8208.peg.3221', 'fig|470.8208.peg.3222', 'fig|470.8208.peg.3389', 'fig|470.8208.peg.2764', 'fig|470.8208.peg.2653', 'fig|470.8208.peg.3216', 'fig|470.8208.peg.3231', 'fig|470.8208.peg.2641', 'fig|470.8208.peg.2638', 'fig|470.8208.peg.2680', 'fig|470.8208.peg.2637', 'fig|470.8208.peg.2642', 'fig|470.8208.peg.2679', 'fig|470.8208.peg.3230', 'fig|470.8208.peg.2676', 'fig|470.8208.peg.2677', 'fig|470.8208.peg.1238', 'fig|470.8208.peg.2478', 'fig|470.8208.peg.2639', 'fig|470.8208.peg.854', 'fig|470.8208.peg.382', 'fig|470.8208.peg.383']
with open("nucleotides.fasta", "w") as nt_output, open("proteins.fasta", "w") as aa_output:
for xref in db_xref:
print ("Looking at " + xref)
cds_feature = get_cds_feature_with_qualifier_value (genome_record, "db_xref", xref)
gene_sequence = cds_feature.extract(genome_record.seq)
protein_sequence = gene_sequence.translate(table=11, cds=True)
# This is asking Python to halt if the translation does not match:
assert protein_sequence == cds_feature.qualifiers["translation"][0]
# Output FASTA records - note \n means insert a new line.
# This is a little lazy as it won't line wrap the sequence:
nt_output.write(">%s\n%s\n" % (xref, gene_sequence))
aa_output.write(">%s\n%s\n" % (xref, gene_sequence))
print("Done")
getting following error
/usr/local/lib/python3.7/dist-packages/Bio/GenBank/Scanner.py:1394: BiopythonParserWarning: Truncated LOCUS line found - is this correct?
:'LOCUS CP027704 3430798 bp DNA linear UNK \n'
BiopythonParserWarning,
Looking at fig|470.8208.peg.2198
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-32-323ff320990a> in <module>()
15 print ("Looking at " + xref)
16 cds_feature = get_cds_feature_with_qualifier_value (genome_record, "db_xref", xref)
---> 17 gene_sequence = cds_feature.extract(genome_record.seq)
18 protein_sequence = gene_sequence.translate(table=11, cds=True)
19
AttributeError: 'NoneType' object has no attribute 'extract'
You have a space between the get_cds_feature_with_qualifier_value call and its parameters (genome_record, "db_xref", xref), so the call is probably failing, leaving cds_feature as None.
Generally, you should provide reproducible examples so that someone else (who doesn't have the gbk file you're referencing) can still reproduce and troubleshoot your error.
Solved the problem.
'''
from Bio import SeqIO
def get_cds_feature_with_qualifier_value(seq_record, name, value):
for feature in genome_record.features:
if feature.type == "CDS" and value in feature.qualifiers.get(name, []):
return feature
return None
genome_record = SeqIO.read("470.8208.gbk", "genbank")
da_xref = ['fig|470.8208.peg.2198', 'fig|470.8208.peg.2200', 'fig|470.8208.peg.2203', 'fig|470.8208.peg.2199', 'fig|470.8208.peg.2201', 'fig|470.8208.peg.2197', 'fig|470.8208.peg.2202', 'fig|470.8208.peg.2501', 'fig|470.8208.peg.2643', 'fig|470.8208.peg.2193', 'fig|470.8208.peg.2670', 'fig|470.8208.peg.2695', 'fig|470.8208.peg.2696', 'fig|470.8208.peg.2189', 'fig|470.8208.peg.2458', 'fig|470.8208.peg.2191', 'fig|470.8208.peg.2190', 'fig|470.8208.peg.2188', 'fig|470.8208.peg.2192', 'fig|470.8208.peg.2639', 'fig|470.8208.peg.3215', 'fig|470.8208.peg.2633', 'fig|470.8208.peg.2682', 'fig|470.8208.peg.3186', 'fig|470.8208.peg.2632', 'fig|470.8208.peg.2683', 'fig|470.8208.peg.3187', 'fig|470.8208.peg.2764', 'fig|470.8208.peg.2686', 'fig|470.8208.peg.2638', 'fig|470.8208.peg.2680', 'fig|470.8208.peg.2685', 'fig|470.8208.peg.2684', 'fig|470.8208.peg.2633', 'fig|470.8208.peg.2682', 'fig|470.8208.peg.3186', 'fig|470.8208.peg.2632', 'fig|470.8208.peg.2683', 'fig|470.8208.peg.3187', 'fig|470.8208.peg.2640', 'fig|470.8208.peg.3221', 'fig|470.8208.peg.3222', 'fig|470.8208.peg.3389', 'fig|470.8208.peg.2764', 'fig|470.8208.peg.2653', 'fig|470.8208.peg.3216', 'fig|470.8208.peg.3231', 'fig|470.8208.peg.2641', 'fig|470.8208.peg.2638', 'fig|470.8208.peg.2680', 'fig|470.8208.peg.2637', 'fig|470.8208.peg.2642', 'fig|470.8208.peg.2679', 'fig|470.8208.peg.3230', 'fig|470.8208.peg.2676', 'fig|470.8208.peg.2677', 'fig|470.8208.peg.1238', 'fig|470.8208.peg.2478', 'fig|470.8208.peg.2639', 'fig|470.8208.peg.854', 'fig|470.8208.peg.382', 'fig|470.8208.peg.383']
db_xref=[]
for xref in da_xref:
db_xref.append('SEED:' + xref)
with open("nucleotides.fasta", "w") as nt_output, open("proteins.fasta", "w") as aa_output:
for xref in db_xref:
print ("Looking at", xref)
cds_feature = get_cds_feature_with_qualifier_value(genome_record, "db_xref", xref)
gene_sequence = cds_feature.extract(genome_record.seq)
protein_sequence = gene_sequence.translate(table=11, cds=True)
# This is asking Python to halt if the translation does not match:
assert protein_sequence == cds_feature.qualifiers["translation"][0]
# Output FASTA records - note \n means insert a new line.
# This is a little lazy as it won't line wrap the sequence:
nt_output.write(">%s\n%s\n" % (xref, gene_sequence))
aa_output.write(">%s\n%s\n" % (xref, gene_sequence))
print("Done")
'''

accented characters in a regex with Python

This is my code
# -*- coding: utf-8 -*-
import json
import re
with open("/Users/paul/Desktop/file.json") as json_file:
file = json.load(json_file)
print file["desc"]
key="capacità"
result = re.findall("((?:[\S,]+\s+){0,3})"+key+"\s+((?:[\S,]+\s*){0,3})", file["desc"], re.IGNORECASE)
print result
This is the content of the file
{
"desc": "Frigocongelatore, capacit\u00e0 di 215 litri, h 122 cm, classe A+"
}
My result is []
but what I want is result = "capacità"
You need to treat your string as an Unicode string, like this:
str = u"Frigocongelatore, capacit\u00e0 di 215 litri, h 122 cm, classe A+"
And as you can see if you print str.encode('utf-8') you'll get:
Frigocongelatore, capacità di 215 litri, h 122 cm, classe A+
The same way you can make your regex string an unicode or raw string with u or r respectively.
You can use this function to display different encodings.
The default encoding on your editor should be UTF-8. Check you settings with sys.getdefaultencoding().
def find_context(word_, n_before, n_after, string_):
# finds the word and n words before and after it
import re
b= '\w+\W+' * n_before
a= '\W+\w+' * n_after
pattern = '(' + b + word_ + a + ')'
return re.search(pattern, string_).groups(1)[0]
s = "Frigocongelatore, capacità di 215 litri, h 122 cm, classe A+"
# find 0 words before and 3 after the word capacità
print(find_context('capacità',0,3,s) )
capacità di 215 litri
print(find_context(' capacit\u00e0',0,3,s) )
capacità di 215 litri

Regex with unicode and str

I have a list of regex and a replace function.
regex function
replacement_patterns = [(ur'\\u20ac', ur' euros'),(ur'\xe2\x82\xac', r' euros'),(ur'\b[eE]?[uU]?[rR]\b', r' euros'), (ur'\b([0-9]+)[eE][uU]?[rR]?[oO]?[sS]?\b',ur' \1 euros')]
class RegexpReplacer(object):
def __init__(self, patterns=replacement_patterns):
self.patterns = [(re.compile(regex, re.UNICODE | re.IGNORECASE), repl) for (regex, repl) in patterns]
def replace(self, text):
s = text
for (pattern, repl) in self.patterns:
(s, count) = re.subn(pattern, repl, s)
return s
If I write the string as bellow:
string='730\u20ac.\r\n\n ropa surf ... 5,10 muy buen estado..... 170 \u20ac\r\n\nPack 850\u20ac, reparaci\u00f3n. \r\n\n'
replacer = RegexpReplacer()
texto= replacer.replace(string)
I get perfect results.
But if I call the function when iterating over a JSON file I have just loaded, it does not work (no error but no replacement)
What seems to happen is that when I call the function over the typed variable the function receives a STR, and when I call it from the JSON iteration it receives a unicode.
My question is why my regex is not working on the unicode, wouldnt it be supposed to?
Maybe you need something like this
import re
regex = re.compile("^http://.+", re.UNICODE)
And if you need more than one, you can do like this
regex = re.compile("^http://.+", re.UNICODE | re.IGNORECASE)
Get the example
>>> r = re.compile("^http://.+", re.UNICODE | re.IGNORECASE)
>>> r.match('HTTP://ыыы')
<_sre.SRE_Match object at 0x7f572455d648>
Does it correct result?
>>> class RegexpReplacer(object):
... def __init__(self, patterns=replacement_patterns):
... self.patterns = [(re.compile(regex, re.UNICODE | re.IGNORECASE), repl) for (regex, repl) in patterns]
... def replace(self, text):
... s = text
... for (pattern, repl) in self.patterns:
... (s, count) = re.subn(pattern, repl, s)
... return s
...
>>> string='730\u20ac.\r\n\n ropa surf ... 5,10 muy buen estado..... 170 \u20ac\r\n\nPack 850\u20ac, reparaci\u00f3n. \r\n\n'
>>> replacer = RegexpReplacer()
>>> texto= replacer.replace(string)
>>> texto
u'730 euros.\r\n\n ropa surf ... 5,10 muy buen estado..... 170 euros\r\n\nPack 850 euros, reparaci\\u00f3n. \r\n\n'
If you want Unicode replacement patterns, you need also be operating on Unicode strings. JSON should be returning Unicode as well.
Change the following by removing \\ and removing UTF-8 (won't see in a Unicode string). Also you compile with IGNORE_CASE so no need for [eE], etc.:
replacement_patterns = [(ur'\u20ac', ur' euros'),(ur'\be?u?r\b', r' euros'), (ur'\b([0-9]+)eu?r?o?s?\b',ur' \1 euros')]
Make the following a Unicode string (add u):
string = u'730\u20ac.\r\n\n ropa surf ... 5,10 muy buen estado..... 170 \u20ac\r\n\nPack 850\u20ac, reparaci\u00f3n. \r\n\n'
Then it should operator on Unicode JSON as well.

UnicodeWarning: special characters in Tkinter

I have written a program in Tkinter (Python 2.7), a scrabblehelper in Norwegian which contains some special characters (æøå), which means my wordlist (ordliste) contains words with special characters.
When I run my function finnord(c*), it returns 'cd'. I am using an entry.get() to get the word to put in my function.
My problem is with the encoding of entry.get(). I have local coding UTF-8, but I get an UniCodeError when I am writing any special characters in my entrybox and matching them to my wordliste.
Here is my output.
Warning (from warnings module):
File "C:\pythonprog\scrabble\feud.py", line 46
if s not in liste and s in ordliste:
UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode -
interpreting them as being unequal
When i write in my shell:
> ordinn.get()
u'k\xf8**e'
> ordinn.get().encode('utf-8')
'k\xc3\xb8**e'
> print ordinn.get()
kø**e
> print ordinn.get().encode('utf-8')
kø**e
Anyone knows why I can't match ordinn.get() (entry) to my wordlist ?
I can reproduce the error this way:
% python
Python 2.7.2+ (default, Oct 4 2011, 20:03:08)
[GCC 4.6.1] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> 'k\xf8**e' in [u'k\xf8**e']
__main__:1: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
False
So perhaps s is a str object, and liste or ordliste contains unicode, or (as eryksun points out in the comments) vice versa. The solution is to decode the str objects (most likely with the utf-8 codec) to make them unicode.
If that does not help, please print out and post the output of
print(repr(s))
print(repr(liste))
print(repr(ordliste))
I believe the problem can be avoided by converting all strings to unicode.
When you generate ordliste from norsk.txt, use
codecs.open('norsk.txt','r','utf-8'):
encoding = sys.stdin.encoding
with codecs.open('norsk.txt','r','utf-8') as fil:
ordliste = [line.rstrip(u'\n') for line in fil]
Convert all user input to unicode as soon as possible:
def get_unicode(widget):
streng = widget.get()
try:
streng = streng.decode('utf-8')
except UnicodeEncodeError:
pass
return streng
So perhaps try this:
import Tkinter as tk
import tkMessageBox
import codecs
import itertools
import sys
alfabetet = (u"abcdefghijklmnopqrstuvwxyz"
u"\N{LATIN SMALL LETTER AE}"
u"\N{LATIN SMALL LETTER O WITH STROKE}"
u"\N{LATIN SMALL LETTER A WITH RING ABOVE}")
encoding = sys.stdin.encoding
with codecs.open('norsk.txt','r',encoding) as fil:
ordliste = set(line.rstrip(u'\n') for line in fil)
def get_unicode(widget):
streng = widget.get()
if isinstance(streng,str):
streng = streng.decode('latin-1')
return streng
def siord():
alfa=lagtabell()
try:
streng = get_unicode(ordinn)
ordene=finnord(streng,alfa)
if len(ordene) == 0:
# There are no words that match
tkMessageBox.showinfo('Dessverre..','Det er ingen ord som passer...')
else:
# Done: The words that fit the pattern
tkMessageBox.showinfo('Ferdig',
'Ordene som passer er:\n'+ordene.encode('utf-8'))
except Exception as err:
# There has been a mistake .. Check your word
print(repr(err))
tkMessageBox.showerror('ERROR','Det har skjedd en feil.. Sjekk ordet ditt.')
def finnord(streng,alfa):
liste = set()
for substitution in itertools.permutations(alfa,streng.count(u'*')):
s = streng
for ch in substitution:
s = s.replace(u'*',ch,1)
if s in ordliste:
liste.add(s)
liste = [streng]+list(liste)
return u','.join(liste)+u'.'
def lagtabell():
tinbox = get_unicode(bokstinn)
if not tinbox.isalpha():
alfa = alfabetet
else:
alfa = tinbox.lower()
return alfa
root = tk.Tk()
root.title('FeudHjelper av Martin Skow Røed')
root.geometry('400x250+450+200')
# root.iconbitmap('data/ikon.ico')
skrift1 = tk.Label(root,
text = '''\
Velkommen til FeudHjelper. Skriv inn de bokstavene du har, og erstatt ukjente med *.
F. eks: sl**ge
Det er kun lov til å bruke tre stjerner, altså tre ukjente bokstaver.''',
font = ('Verdana',8), wraplength=350)
skrift1.pack(pady = 5)
ordinn = tk.StringVar(None)
tekstboks = tk.Entry(root, textvariable = ordinn)
tekstboks.pack(pady = 5)
# What letters do you have? Eg "ahneki". Leave blank here if you want all the words.
skrift2 = tk.Label(root, text = '''Hvilke bokstaver har du? F. eks "ahneki". La det være blankt her hvis du vil ha alle ordene.''',
font = ('Verdana',8), wraplength=350)
skrift2.pack(pady = 10)
bokstinn = tk.StringVar(None)
tekstboks2 = tk.Entry(root, textvariable = bokstinn)
tekstboks2.pack()
knapp = tk.Button(text = 'Finn ord!', command = siord)
knapp.pack(pady = 10)
root.mainloop()

Categories