reading # char in python - python

Can someone help me with reading the "#" character in Python? I can't seem to find it in the file. Also, since this file is output from the Stanford POS tagger, are there any scripts available to convert Stanford POS tagger output to CWB format?
so this is the utf-8 txt file that i'm trying to read:
如果#CS 您#PN 在#P 新加坡#NR 只#AD 能#VV 前往#VV 一#CD 间#M 俱乐部#NN ,#PU 祖卡#NN 酒吧#NN 必然#AD 是#VC 您#PN 的#DEG 不二#JJ 选择#NN 。#PU
作为#P 或许#AD 是#VC 新加坡#NR 唯一#JJ 一#CD 家#M 国际#NN 知名#VA 的#DEC 夜店#NN ,#PU 祖卡#NN 既#CC 是#VC 一#CD 个#M 公共#JJ 机构#NN ,#PU
So with this code I'm not reading the # character in the UTF-8 txt file:
#!/usr/bin/python # -*- coding: utf-8 -*-
stanford POS tagger to CWB format
import codecs
import nltk
import os, sys, re, glob
cwd = './path/to/file.txt' #os.getcwd()
for infile in glob.glob(os.path.join(cwd, 'zouk.txt')):
print infile
(PATH, FILENAME) = os.path.split(infile)
reader =, 'r', 'utf-8')
for line in reader:
for word in line:
if word == '\#':
print 'hex is here'

if word == '\#':
This probably doesn't do what you think it does. (Hint: print "\#")

If Python does not recognize an escape sequence then it will include the backslash in the string.
>>> '\#' == '\\#'


Python CSV write to file unreadable in Excel (Chinese characters)

I am trying to perform text analysis on Chinese texts. The program is provided below. The output contains unreadable characters such as 浜烘皯鏃ユ姤绀捐. If I change the output file from result.csv to result.txt, the characters are correct, e.g. 人民日报社论. What is wrong here? I cannot figure it out. I have tried several approaches, including adding a decoder and an encoder.
# -*- coding: utf-8 -*-
import os
import glob
import jieba
import jieba.analyse
import csv
import codecs
segList = []
raw_data_path = 'monthly_raw_data/'
file_name = ["201010", "201011", "201012", "201101", "201103", "201105", "201107", "201109", "201110", "201111", "201112", "201201", "201202", "201203", "201205", "201206", "201208", "201210", "201211"]
for name in file_name:
all_text = ""
multi_line_text = ""
with open(raw_data_path + name + ".txt", "r") as file:
for line in file:
if line != '\n':
multi_line_text += line
templist = multi_line_text.split('\n')
for text in templist:
all_text += text
seg_list = jieba.cut(all_text,cut_all=False)
temp_text = []
for item in seg_list:
stop_list = []
with open("stopwords.txt", "r") as stoplistfile:
for item in stoplistfile:
text_without_stopwords = []
for word in temp_text:
if word not in stop_list:
with open("results/result.csv", 'wb') as f:
writer = csv.writer(f)
For UTF-8 encoding, Excel requires a BOM (byte order mark) codepoint written at the start of the file or it will assume an ANSI encoding, which is locale-dependent. U+FEFF is the Unicode BOM. Here's an example that will open in Excel correctly:
import csv

data = [[u'American', u'美国人'],
        [u'Chinese', u'中国人']]

# Python 2: the csv module works on byte strings, so the file is opened in
# binary mode and every cell is encoded to UTF-8 explicitly.
with open('results.csv', 'wb') as f:
    # Excel only recognises UTF-8 when the file starts with a BOM (U+FEFF);
    # without it the bytes are interpreted in the locale's ANSI code page.
    f.write(u'\ufeff'.encode('utf8'))
    w = csv.writer(f)
    for row in data:
        w.writerow([item.encode('utf8') for item in row])
Python 3 makes this easier. Instead of 'wb', open the file with the parameters 'w', newline='', encoding='utf-8-sig'; the file then accepts Unicode strings directly and a BOM is written automatically:
import csv
data = [['American', '美国人'],
['Chinese', '中国人']]
with open('results.csv', 'w', newline='', encoding='utf-8-sig') as f:
w = csv.writer(f)
There is also a 3rd–party unicodecsv module that makes Python 2 easier to use as well:
import unicodecsv
data = [[u'American', u'美国人'],
[u'Chinese', u'中国人']]
with open('results.csv', 'wb') as f:
w = unicodecsv.writer(f ,encoding='utf-8-sig')
Here is another way, although it is a bit tricky:
import csv
data = [[u'American',u'美国人'],
with open('results.csv','wb') as f:
w = csv.writer(f)
for row in data:
w.writerow([item.encode('utf8') for item in row])
This code block generates a CSV file encoded in UTF-8.
Open the file with Notepad++ (or another editor with an encoding feature).
Choose Encoding -> Convert to ANSI.
Open the file with Excel; it now displays correctly.

Python unidecode function opening lists/documents

How can I open a document as Unicode?
I have a txt file which contains foreign characters. I need to process it word by word using the unidecode function.
I am getting an error saying: TypeError: 'module' object is not callable
import os
import re
import unidecode
def splitToWords(stringOfWords):
retVal = re.split('; |;|, |,|\*|\n|\. |\.|-| |\"',stringOfWords)
while '' in retVal:
[val.lower() for val in retVal]
return retVal
with open(file,"r") as f:
file_content =
file_content = splitToWords(file_content)
for word in file_content
word = unidecode.unidecode(word)
Hi, please check the code below; is this what you wanted?
unicodestring = "u there"
utf8tostring = unicodestring.encode("utf-16")
print utf8tostring
The code is taken from the following website:
You can try something like this:
# you have to import unidecode function first
from unidecode import unidecode
with open(file) as f:
for line in f:
# this will split a line to words and decode them.
# you don't have to close() the file, "with open()" does that for you.
decoded_words = [unidecode(word) for word in line.split()]

Prepend information in base64 encoding

Here is an answer which gives some information on how to base64-encode a file. However, I also want to pass in the file type and MIME type as part of the information in the base64-encoded string.
So far I have for my base64 string:
What is the correct information to prepend, and how would I do this?
It seems like the following is how I would get the base64 file information to pass to the server:
file = '/Users/user/Desktop/img.PNG'
prepend_info = 'data:%s;base64' % mimetypes.guess_type(file)[0]
base_64_data = open(file).read().encode('base64')
image_data_base64 = '%s,%s' % (prepend_info, base_64_data)
This then gives me:
Perhaps something along these lines:
from __future__ import print_function
import base64
import binascii
import os
def base64_encode_file(filename):
filetype = os.path.splitext(filename)[1][1:] # remove leading '.' from ext
with open(filename) as file:
data =
return base64.b64encode(','.join((filename, filetype, data))), data
filename = 'C:/Users/martin/Desktop/img.PNG'
#filename = '/Users/user/Desktop/img.PNG'
encoded, data = base64_encode_file(filename)
print('encoded: {} (hex file data: {})'.format(encoded, binascii.hexlify(data)))
decoded = base64.b64decode(encoded).split(',', 2)
print('decoded:', decoded[0], decoded[1], binascii.hexlify(decoded[2]))
encoded: QzovVXNlcnMvbWFydGluL0Rlc2t0b3AvaW1nLlBORyxQTkcsiVBORwo=
(hex file data: 89504e470a)
decoded: C:/Users/martin/Desktop/img.PNG PNG 89504e470a

Tagging spanish text with Unicode characters not possible with NLTK?

I'm trying to parse some spanish sentences that contain non-ascii characters (mostly accents in words...for instance: película (film), atención (attention), etc).
I'm reading the lines from a file encoded with utf-8. Here is a sample of my script:
# -*- coding: utf-8 -*-
import nltk
import sys
from nltk.corpus import cess_esp as cess
from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt
f ='spanish_sentences', encoding='utf-8')
results_file ='tagging_results', encoding='utf-8', mode='w+')
for line in iter(f):
output_line = "Current line contents before tagging->" + str(line.decode('utf-8', 'replace'))
print output_line
output_line = "Unigram tagger->"
print output_line
s = line.decode('utf-8', 'replace')
output_line = tagger.uni.tag(s.split())
print output_line
At this line:
output_line = tagger.uni.tag(s.split())
I'm getting this error:
/usr/local/lib/python2.7/dist-packages/nltk-2.0.4-py2.7.egg/nltk/tag/ UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
return self._context_to_tag.get(context)
Here is some output for a simple sentence:
Current line contents before tagging->tengo una queja y cada que hablo a atención me dejan en la linea media hora y cortan la llamada!!
Unigram tagger->
[(u'tengo', 'vmip1s0'), (u'una', 'di0fs0'), (u'queja', 'ncfs000'), (u'y', 'cc'), (u'cada', 'di0cs0'), (u'que', 'pr0cn000'), (u'hablo', 'vmip1s0'), (u'a', 'sps00'), (u'atenci\xf3n', None), (u'me', 'pp1cs000'), (u'dejan', 'vmip3p0'), (u'en', 'sps00'), (u'la', 'da0fs0'), (u'linea', None), (u'media', 'dn0fs0'), (u'hora', 'ncfs000'), (u'y', 'cc'), (u'cortan', None), (u'la', 'da0fs0'), (u'llamada!!', None)]
If I understood correctly from this chapter...the process is correct...I decode the line from utf-8 to Unicode, tag, and then encode from Unicode to utf-8 again...I don't understand this error
Any idea what I'm doing wrong?
EDIT: found the problem...basically the spanish cess_esp corpus is encoded with Latin-2 encoding. See the code below to see how to be able to train the tagger correctly.
tagged_sents = (
[(word.decode('Latin2'), tag) for (word, tag) in sent]
for sent in cess.tagged_sents()
tagger = UT(tagged_sents) # training a tagger
A better way would be to use the CorpusReader class to ask for the corpus encoding, thus you don't need to know it before-hand.
Possibly something is wrong with your tagger object or how your file is read. I re-wrote part of your code and it runs without error:
# -*- coding: utf-8 -*-
import urllib2, codecs
from nltk.corpus import cess_esp as cess
from nltk import word_tokenize
from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt
tagger = ut(cess.tagged_sents())
url = ''
fin = urllib2.urlopen(url).read().strip().decode('utf8')
fout ='tagger.out', 'w', 'utf8')
for line in fin.split('\n'):
print>>fout, "Current line contents before tagging->", line
print>>fout, "Unigram tagger->",
print>>fout, tagger.tag(word_tokenize(line))
print>>fout, ""

How to fix this UnicodeDecodeError that appears when I try to remove accents in Python strings?

I'm trying to use this function:
import unicodedata
def remove_accents(input_str):
nkfd_form = unicodedata.normalize('NFKD', unicode(input_str))
return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])
in the code below (which unzips and reads files with non-ASCII strings). But I'm getting this error, (from this library file C:\Python27\Lib\encodings\
Message File Name Line Position
<module> C:\Users\CG\Desktop\Google Drive\Sci&Tech\projects\naivebayes\ 64
getNameList C:\Users\CG\Desktop\Google Drive\Sci&Tech\projects\naivebayes\ 26
remove_accents C:\Users\CG\Desktop\Google Drive\Sci&Tech\projects\naivebayes\ 17
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe1 in position 3: ordinal not in range(128)
Why am I getting this error? How to avoid it and make remove_accents work?
Thanks for any help!
Here's the entire code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
import os
import re
from zipfile import ZipFile
import csv
##def strip_accents(s):
## return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))
import unicodedata
def remove_accents(input_str):
nkfd_form = unicodedata.normalize('NFKD', unicode(input_str))
return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])
def getNameList():
for name in namesDict:
print name
# name = strip_accents(name)
name = remove_accents(name)
if counts[0]>counts[1]:
elif counts[1]>counts[0]:
# print maleNames
return names
def extractNamesDict():
zf=ZipFile('', 'r')
for filename in filenames:,'r')
rows=csv.reader(file, delimiter=',')
for row in rows:
if not names.has_key(name):
# print '\tImported %s'%filename
# print names
return names
if __name__ == "__main__":
Best practice is to decode to Unicode when the data comes into your program:
for row in rows:
name=row[0].upper().decode('utf8') # or whichever codec the data uses; you DO need to know the encoding.
Then remove_accents can just be:
def remove_accents(input_str):
    """Return ``input_str`` with accents (combining marks) removed.

    ``input_str`` must already be a Unicode string: no decoding happens
    here, so byte strings have to be decoded by the caller first.

    The text is decomposed with NFKD normalization, which splits an
    accented character into its base character followed by combining
    marks; the combining marks are then dropped.
    """
    nkfd_form = unicodedata.normalize('NFKD', input_str)
    # unicodedata.combining() returns a non-zero class for combining marks.
    return u''.join(c for c in nkfd_form if not unicodedata.combining(c))
Encode data when leaving your program such as writing to a file, database, terminal, etc.
Why remove accents in the first place?
If you want to robustly convert unicode characters to ascii in a string, you should use the awesome unidecode module:
>>> import unidecode
>>> unidecode.unidecode(u'Björk')
>>> unidecode.unidecode(u'András Sütő')
'Andras Suto'
>>> unidecode.unidecode(u'Ελλάς')
You get it because you are decoding from a bytestring without specifying a codec:
Add a codec there (here I assume your data is encoded in utf-8, 0xe1 would be the first of a 3-byte character):
unicode(input_str, 'utf8')
