reading # char in python - python

Can someone help me with reading the "#" character in Python? I can't seem to get it from the file. Because this file is output from the Stanford POS tagger (http://nlp.stanford.edu/software/tagger.shtml), are there any scripts available to convert Stanford POS tagger output to the CWB format (http://cogsci.uni-osnabrueck.de/~korpora/ws/CWBdoc/CWB_Encoding_Tutorial/node3.html)?
so this is the utf-8 txt file that i'm trying to read:
如果#CS 您#PN 在#P 新加坡#NR 只#AD 能#VV 前往#VV 一#CD 间#M 俱乐部#NN ,#PU 祖卡#NN 酒吧#NN 必然#AD 是#VC 您#PN 的#DEG 不二#JJ 选择#NN 。#PU
作为#P 或许#AD 是#VC 新加坡#NR 唯一#JJ 一#CD 家#M 国际#NN 知名#VA 的#DEC 夜店#NN ,#PU 祖卡#NN 既#CC 是#VC 一#CD 个#M 公共#JJ 机构#NN ,#PU
With this code, I'm not detecting the # character in the UTF-8 text file:
#!/usr/bin/python # -*- coding: utf-8 -*-
'''
stanford POS tagger to CWB format
'''
import codecs
import nltk
import os, sys, re, glob
reload(sys)
sys.setdefaultencoding('utf-8')
cwd = './path/to/file.txt' #os.getcwd()
for infile in glob.glob(os.path.join(cwd, 'zouk.txt')):
print infile
(PATH, FILENAME) = os.path.split(infile)
reader = codecs.open(infile, 'r', 'utf-8')
for line in reader:
for word in line:
if word == '\#':
print 'hex is here'

if word == '\#':
This probably doesn't do what you think it does. (Hint: print "\#")

If Python does not recognize an escape sequence then it will include the backslash in the string.
>>> '\#' == '\\#'
True

Related

Python CSV write to file unreadable in Excel (Chinese characters)

I am trying to perform text analysis on Chinese texts. The program is provided below. The result contains unreadable characters such as 浜烘皯鏃ユ姤绀捐. If I change the output file from result.csv to result.txt, the characters are correct, e.g. 人民日报社论. What is wrong here? I cannot figure it out. I have tried several approaches, including adding a decoder and an encoder.
# -*- coding: utf-8 -*-
import os
import glob
import jieba
import jieba.analyse
import csv
import codecs
segList = []
raw_data_path = 'monthly_raw_data/'
file_name = ["201010", "201011", "201012", "201101", "201103", "201105", "201107", "201109", "201110", "201111", "201112", "201201", "201202", "201203", "201205", "201206", "201208", "201210", "201211"]
jieba.load_userdict("customized_dict.txt")
for name in file_name:
all_text = ""
multi_line_text = ""
with open(raw_data_path + name + ".txt", "r") as file:
for line in file:
if line != '\n':
multi_line_text += line
templist = multi_line_text.split('\n')
for text in templist:
all_text += text
seg_list = jieba.cut(all_text,cut_all=False)
temp_text = []
for item in seg_list:
temp_text.append(item.encode('utf-8'))
stop_list = []
with open("stopwords.txt", "r") as stoplistfile:
for item in stoplistfile:
stop_list.append(item.rstrip('\r\n'))
text_without_stopwords = []
for word in temp_text:
if word not in stop_list:
text_without_stopwords.append(word)
segList.append(text_without_stopwords)
with open("results/result.csv", 'wb') as f:
writer = csv.writer(f)
writer.writerows(segList)
For UTF-8 encoding, Excel requires a BOM (byte order mark) codepoint written at the start of the file or it will assume an ANSI encoding, which is locale-dependent. U+FEFF is the Unicode BOM. Here's an example that will open in Excel correctly:
#!python2
#coding:utf8
import csv
data = [[u'American', u'美国人'],
[u'Chinese', u'中国人']]
with open('results.csv','wb') as f:
f.write(u'\ufeff'.encode('utf8'))
w = csv.writer(f)
for row in data:
w.writerow([item.encode('utf8') for item in row])
Python 3 makes this easier. Instead of 'wb', open the file with the parameters 'w', newline='', encoding='utf-8-sig'; the file then accepts Unicode strings directly and writes a BOM automatically:
#!python3
#coding:utf8
import csv
data = [['American', '美国人'],
['Chinese', '中国人']]
with open('results.csv', 'w', newline='', encoding='utf-8-sig') as f:
w = csv.writer(f)
w.writerows(data)
There is also a 3rd–party unicodecsv module that makes Python 2 easier to use as well:
#!python2
#coding:utf8
import unicodecsv
data = [[u'American', u'美国人'],
[u'Chinese', u'中国人']]
with open('results.csv', 'wb') as f:
w = unicodecsv.writer(f ,encoding='utf-8-sig')
w.writerows(data)
Here is another, somewhat tricky, way:
#!python2
#coding:utf8
import csv
data = [[u'American',u'美国人'],
[u'Chinese',u'中国人']]
with open('results.csv','wb') as f:
f.write(u'\ufeff'.encode('utf8'))
w = csv.writer(f)
for row in data:
w.writerow([item.encode('utf8') for item in row])
This code block generates a CSV file encoded in UTF-8.
Open the file with Notepad++ (or another editor with an encoding-conversion feature).
Choose Encoding -> Convert to ANSI.
Save the file.
Open the file with Excel; it now displays correctly.

Python unidecode function opening lists/documents

How can I open a document as Unicode?
I have a txt file which contains foreign characters. I need to process it word by word using the unidecode function.
I am getting an error saying: TypeError: 'module' object is not callable
import os
import re
import unidecode
def splitToWords(stringOfWords):
    """Split *stringOfWords* into lower-cased words.

    The text is cut on semicolons, commas, asterisks, newlines, full stops,
    hyphens, spaces and double quotes. Empty fragments, which appear when two
    delimiters are adjacent or the text starts/ends with one, are discarded.

    Returns a list of the remaining fragments, lower-cased, in input order.
    """
    # Raw string so the backslashes reach the regex engine untouched.
    fragments = re.split(r'; |;|, |,|\*|\n|\. |\.|-| |"', stringOfWords)
    # Lower-casing has to happen in the returned list; a bare comprehension
    # whose result is thrown away leaves the words unchanged.
    return [fragment.lower() for fragment in fragments if fragment]
....
with open(file,"r") as f:
file_content = f.read()
file_content = splitToWords(file_content)
for word in file_content
word = unidecode.unidecode(word)
f.close()
Hi, please check the code below. Is this what you wanted?
unicodestring = "u there"
utf8tostring = unicodestring.encode("utf-16")
print utf8tostring
Code taken from the following website: https://www.safaribooksonline.com/library/view/python-cookbook-2nd/0596007973/ch01s22.html
You can try something like this:
# you have to import unidecode function first
from unidecode import unidecode
with open(file) as f:
for line in f:
# this will split a line to words and decode them.
# you don't have to close() the file, "with open()" does that for you.
decoded_words = [unidecode(word) for word in line.split()]

Prepend information in base64 encoding

Here is an answer which gives some information on how to base64 encode a file. However, I also want to pass in the file type and MIME type as part of the information in the base64-encoded string.
So far I have for my base64 string:
x=base64.b64encode(open('/Users/user/Desktop/img.PNG').read())
What is the correct information to prepend, and how would I do this?
It seems like the following is how I would get the base64 file information to pass to the server:
file = '/Users/user/Desktop/img.PNG'
prepend_info = 'data:%s;base64' % mimetypes.guess_type(file)[0]
base_64_data = open(file).read().encode('base64')
image_data_base64 = '%s,%s' % (prepend_info, base_64_data)
This then gives me:
...
Perhaps something along these lines:
from __future__ import print_function
import base64
import binascii
import os
def base64_encode_file(filename):
    """Base64-encode a file together with its name and type.

    The payload that gets encoded is ``<filename>,<filetype>,<file bytes>``,
    where the file type is the filename extension without its leading dot.

    Returns an ``(encoded, data)`` tuple: the base64-encoded payload and the
    raw bytes read from the file.
    """
    filetype = os.path.splitext(filename)[1][1:]  # remove leading '.' from ext
    # Binary mode: text mode may translate line endings (and, on Python 3,
    # tries to decode the bytes), which corrupts image data.
    with open(filename, 'rb') as handle:
        data = handle.read()
    # Build the payload as bytes so the text header fields and the binary
    # file content can be joined; values that are already bytes pass through.
    header = [part if isinstance(part, bytes) else part.encode('utf-8')
              for part in (filename, filetype)]
    return base64.b64encode(b','.join(header + [data])), data
filename = 'C:/Users/martin/Desktop/img.PNG'
#filename = '/Users/user/Desktop/img.PNG'
encoded, data = base64_encode_file(filename)
print('encoded: {} (hex file data: {})'.format(encoded, binascii.hexlify(data)))
decoded = base64.b64decode(encoded).split(',', 2)
print('decoded:', decoded[0], decoded[1], binascii.hexlify(decoded[2]))
Output:
encoded: QzovVXNlcnMvbWFydGluL0Rlc2t0b3AvaW1nLlBORyxQTkcsiVBORwo=
(hex file data: 89504e470a)
decoded: C:/Users/martin/Desktop/img.PNG PNG 89504e470a

Tagging spanish text with Unicode characters not possible with NLTK?

I'm trying to parse some spanish sentences that contain non-ascii characters (mostly accents in words...for instance: película (film), atención (attention), etc).
I'm reading the lines from a file encoded with utf-8. Here is a sample of my script:
# -*- coding: utf-8 -*-
import nltk
import sys
from nltk.corpus import cess_esp as cess
from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt
f = codecs.open('spanish_sentences', encoding='utf-8')
results_file = codecs.open('tagging_results', encoding='utf-8', mode='w+')
for line in iter(f):
output_line = "Current line contents before tagging->" + str(line.decode('utf-8', 'replace'))
print output_line
results_file.write(output_line.encode('utf8'))
output_line = "Unigram tagger->"
print output_line
results_file.write(output_line)
s = line.decode('utf-8', 'replace')
output_line = tagger.uni.tag(s.split())
print output_line
results_file.write(str(output_line).encode('utf8'))
f.close()
results_file.close()
At this line:
output_line = tagger.uni.tag(s.split())
I'm getting this error:
/usr/local/lib/python2.7/dist-packages/nltk-2.0.4-py2.7.egg/nltk/tag/sequential.py:138: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
return self._context_to_tag.get(context)
Here is some output for a simple sentence:
Current line contents before tagging->tengo una queja y cada que hablo a atención me dejan en la linea media hora y cortan la llamada!!
Unigram tagger->
[(u'tengo', 'vmip1s0'), (u'una', 'di0fs0'), (u'queja', 'ncfs000'), (u'y', 'cc'), (u'cada', 'di0cs0'), (u'que', 'pr0cn000'), (u'hablo', 'vmip1s0'), (u'a', 'sps00'), (u'atenci\xf3n', None), (u'me', 'pp1cs000'), (u'dejan', 'vmip3p0'), (u'en', 'sps00'), (u'la', 'da0fs0'), (u'linea', None), (u'media', 'dn0fs0'), (u'hora', 'ncfs000'), (u'y', 'cc'), (u'cortan', None), (u'la', 'da0fs0'), (u'llamada!!', None)]
If I understood correctly from this chapter...the process is correct...I decode the line from utf-8 to Unicode, tag, and then encode from Unicode to utf-8 again...I don't understand this error
Any idea what I'm doing wrong?
Thanks,
Alejandro
EDIT: found the problem...basically the spanish cess_esp corpus is encoded with Latin-2 encoding. See the code below to see how to be able to train the tagger correctly.
# Decode every corpus word before training so the tagger's vocabulary is
# Unicode, like the text that will be tagged.
# NOTE(review): 'Latin2' is the encoding reported for cess_esp here; confirm
# against the corpus reader.
tagged_sents = (
    [(word.decode('Latin2'), tag) for (word, tag) in sent]
    for sent in cess.tagged_sents()
)
# UnigramTagger is imported under the alias `ut`; Python names are
# case-sensitive, so `UT` would raise NameError.
tagger = ut(tagged_sents)  # training a tagger
A better way would be to use the CorpusReader class to ask for the corpus encoding, thus you don't need to know it before-hand.
Possibly something is wrong with your tagger object or how your file is read. I re-wrote part of your code and it runs without error:
# -*- coding: utf-8 -*-
import urllib2, codecs
from nltk.corpus import cess_esp as cess
from nltk import word_tokenize
from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt
tagger = ut(cess.tagged_sents())
url = 'https://db.tt/42Lt5M5K'
fin = urllib2.urlopen(url).read().strip().decode('utf8')
fout = codecs.open('tagger.out', 'w', 'utf8')
for line in fin.split('\n'):
print>>fout, "Current line contents before tagging->", line
print>>fout, "Unigram tagger->",
print>>fout, tagger.tag(word_tokenize(line))
print>>fout, ""
[out]:
http://pastebin.com/n0NK574a

How to fix this UnicodeDecodeError that appears when I try to remove accents in Python strings?

I'm trying to use this function:
import unicodedata
def remove_accents(input_str):
nkfd_form = unicodedata.normalize('NFKD', unicode(input_str))
return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])
in the code below (which unzips and reads files with non-ASCII strings). But I'm getting this error, (from this library file C:\Python27\Lib\encodings\utf_8.py):
Message File Name Line Position
Traceback
<module> C:\Users\CG\Desktop\Google Drive\Sci&Tech\projects\naivebayes\USSSALoader.py 64
getNameList C:\Users\CG\Desktop\Google Drive\Sci&Tech\projects\naivebayes\USSSALoader.py 26
remove_accents C:\Users\CG\Desktop\Google Drive\Sci&Tech\projects\naivebayes\USSSALoader.py 17
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe1 in position 3: ordinal not in range(128)
Why am I getting this error? How to avoid it and make remove_accents work?
Thanks for any help!
Here's the entire code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
import os
import re
from zipfile import ZipFile
import csv
##def strip_accents(s):
## return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))
import unicodedata
def remove_accents(input_str):
nkfd_form = unicodedata.normalize('NFKD', unicode(input_str))
return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])
def getNameList():
    """Split the names from extractNamesDict() into male and female lists.

    Every entry is a ``(name, male_count, female_count)`` tuple whose name has
    been passed through remove_accents(). A name goes to the male list when
    its male count is larger, to the female list when its female count is
    larger, and is left out when both counts are equal.

    Returns a ``(maleNames, femaleNames)`` tuple of lists.
    """
    namesDict = extractNamesDict()
    maleNames = list()
    femaleNames = list()
    for name in namesDict:
        print(name)
        # Read the counts with the original key *before* normalising the
        # name: once accents are stripped the name may no longer be a key of
        # namesDict and the lookup would raise KeyError.
        counts = namesDict[name]
        name = remove_accents(name)
        entry = (name, counts[0], counts[1])  # avoid shadowing builtin tuple
        if counts[0] > counts[1]:
            maleNames.append(entry)
        elif counts[1] > counts[0]:
            femaleNames.append(entry)
    names = (maleNames, femaleNames)
    return names
def extractNamesDict():
zf=ZipFile('names.zip', 'r')
filenames=zf.namelist()
names=dict()
genderMap={'M':0,'F':1}
for filename in filenames:
file=zf.open(filename,'r')
rows=csv.reader(file, delimiter=',')
for row in rows:
#name=row[0].upper().decode('latin1')
name=row[0].upper()
gender=genderMap[row[1]]
count=int(row[2])
if not names.has_key(name):
names[name]=[0,0]
names[name][gender]=names[name][gender]+count
file.close()
# print '\tImported %s'%filename
# print names
return names
if __name__ == "__main__":
getNameList()
Best practice is to decode to Unicode when the data comes into your program:
for row in rows:
name=row[0].upper().decode('utf8') # or whatever...you DO need to know the encoding.
Then remove_accents can just be:
def remove_accents(input_str):
    """Return *input_str* with combining marks (accents) removed.

    *input_str* must already be a Unicode string; byte strings should be
    decoded where the data enters the program. The text is decomposed with
    NFKD, so an accented letter becomes a base letter followed by combining
    marks, and the combining marks are then dropped.
    """
    nkfd_form = unicodedata.normalize('NFKD', input_str)
    return u''.join(c for c in nkfd_form if not unicodedata.combining(c))
Encode data when leaving your program such as writing to a file, database, terminal, etc.
Why remove accents in the first place?
If you want to robustly convert unicode characters to ascii in a string, you should use the awesome unidecode module:
>>> import unidecode
>>> unidecode.unidecode(u'Björk')
'Bjork'
>>> unidecode.unidecode(u'András Sütő')
'Andras Suto'
>>> unidecode.unidecode(u'Ελλάς')
'Ellas'
You get it because you are decoding from a bytestring without specifying a codec:
unicode(input_str)
Add a codec there (here I assume your data is encoded in utf-8, 0xe1 would be the first of a 3-byte character):
unicode(input_str, 'utf8')

Categories