I have to encode that json file by utf-8 and use a generator to get content. when I tried to run it, there is an AttributeError:
Traceback (most recent call last):
File "F:\Files\python\yiyouhome\WordSeg\json_load.py", line 25, in <module>
tags = jieba.analyse.extract_tags(content_seg,topK = top_K, withWeight = False, allowPOS = allow_pos)
File "C:\Users\ThinkPad\AppData\Local\Programs\Python\Python36\lib\site-packages\jieba\analyse\tfidf.py", line 94, in extract_tags
for w in words:
File "C:\Users\ThinkPad\AppData\Local\Programs\Python\Python36\lib\site-packages\jieba\posseg\__init__.py", line 249, in cut
for w in self.__cut_internal(sentence, HMM=HMM):
File "C:\Users\ThinkPad\AppData\Local\Programs\Python\Python36\lib\site-packages\jieba\posseg\__init__.py", line 217, in __cut_internal
sentence = strdecode(sentence)
File "C:\Users\ThinkPad\AppData\Local\Programs\Python\Python36\lib\site-packages\jieba\_compat.py", line 37, in strdecode
sentence = sentence.decode('utf-8')
AttributeError: 'generator' object has no attribute 'decode'
Why does this happen?
At first:
Traceback (most recent call last):
File "F:\Files\python\yiyouhome\WordSeg\json_load.py", line 10, in <module>
json_data = open('spider_raw.json',encoding = 'gbk').read() #,encoding = 'utf-8'
UnicodeDecodeError: 'gbk' codec can't decode byte 0xa3 in position 74: illegal multibyte sequence
So i add encoding = 'utf-8' to fix it.
Here is my code:
import json
import jieba.analyse
import jieba.posseg as pseg
json_data = open('spider_raw.json',encoding = 'utf-8').read()
data = json.loads(json_data)
top_K = 20
allow_pos = ('nr',)
def getcontent(spiderlist):
for k,v in spiderlist.items():
for item in v['talk_mutidetails']:
yield(item['cotent'])
#def getcontenttopic(spiderlist):
item = getcontent(data)
content_seg = pseg.cut(item)
tags = jieba.analyse.extract_tags(content_seg,topK = top_K, withWeight = False, allowPOS = allow_pos)
for t in tags:
print(t)
Related
i am making a discord bot heres the code and the error
f = open("rules.txt","r")
rules = f.readlines()
error:
Traceback (most recent call last):
File "C:\Users\Windows10\OneDrive\Desktop\YourBot\bot.py", line 8, in <module>
rules = f.readlines()
File "C:\Users\Windows10\AppData\Local\Programs\Python\Python39\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 7: character maps to <undefined>
please help me..
Kindly Try
First:
f = open('rules.txt', 'r', encoding='utf8')
rules = f.readlines()
Second:
f = open('rules.txt', 'r', errors = 'ignore')
rules = f.readlines()
I am trying to convert a JSON file into a CSV file. My code is down below. However, I keep getting this error:
Traceback (most recent call last):
File "C:\Users\...\PythonParse.py", line 42, in <module>
writer.writerow(data)
File "C:\Documents and Settings\...\Python37\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 38409-38412: character maps to <undefined>
import json
import gzip
import csv
outfile = open("VideoGamesMeta.csv","w")
writer = csv.writer(outfile)
data = []
items = []
names = []
checkItems = False;
checkUsers = False;
numItems = []
numUsers = []
for line in open("meta_Video_Games.json","r",encoding="utf-8"):
results = (json.loads(line))
if 'title' in results:
if 'asin' in results:
name = results['title']
item = results['asin']
data = [item,name]
writer.writerow(data)
items.append(item)
names.append(name)
I am importing csv file for cleaning purpose but pycharm showing me this error
I have tried encoding format but it didn't work
import csv
txt1 = ""
txt2 = ""
i = 0
with open('data.csv',encoding='cp1252') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
i += 10
print(i)
txt1 = str(row['posts'])
print(txt1)
#print(row['type'], row['posts'])
My Traceback:
> Traceback (most recent call last):
> File "C:/Users/Administrator/PycharmProjects/mosh/clean.py", line 7, in <module>
> for row in reader:
> File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\csv.py",
> line 112, in __next__
> row = next(self.reader)
> File `enter code here`"C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\encodings\cp1252.py",
> line 23, in decode
> return codecs.charmap_decode(input,self.errors,decoding_table)[0]
> UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 2409: character maps to <undefined>
>
> Process finished with exit code 1
Hi I'm working on Language translation using Keras. I have a text file with English text and a file with Hindi text.
I'm facing "UnicodeDecodeError:". And I believe maybe its because it is unable to convert non-unicode to unicode.
Please let me know how to go about it. The github link is below
https://github.com/shashankg7/Seq2Seq/tree/master/seq2seq
Code Snippet:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import codecs
import pdb
import numpy as np
#from utils import preprocess_text, text2seq_generator
def preprocess_text(file_path_src, file_path_tar, max_feats):
f_src = open(file_path_src)
f_tar = open(file_path_tar)
vocab = defaultdict(int)
freq_src = defaultdict(int)
freq_tar = defaultdict(int)
sents_src = [line.rstrip() for line in f_src.readlines()]
sents_tar = [line.rstrip() for line in f_tar.readlines()]
def preprocess(self):
# Preprocessing source and target text sequence files
self.vocab_src, self.vocab_tar, self.sents_src, self.sents_tar =
preprocess_text(self.path_src, self.path_tar, self.max_feat)
if __name__ == "__main__":
pre = preprocess('C:\\Users\\anagha\\Desktop\\Language-Translation\\Seq2Seq-master\\Seq2Seq-master\\seq2seq\\training.hi-en.hi', 'C:\\Users\\anagha\\Desktop\\Language-Translation\\Seq2Seq-master\\Seq2Seq-master\\seq2seq\\training.hi-en.en', 5500, 15)
pre.preprocess()
for e in range(1):
print("epoch no %d"%e)
for X,Y in pre.gen_batch():
print(X)
Error :
Using TensorFlow backend.
Traceback (most recent call last):
File "C:\Users\anagha\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2898, in run_code
self.showtraceback()
File "C:\Users\anagha\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 1807, in showtraceback
self.showsyntaxerror(filename)
File "C:\Users\anagha\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 1864, in showsyntaxerror
stb = self.SyntaxTB.structured_traceback(etype, value, [])
File "C:\Users\anagha\Anaconda3\lib\site-packages\IPython\core\ultratb.py", line 1441, in structured_traceback
newtext = ulinecache.getline(value.filename, value.lineno)
File "C:\Users\anagha\Anaconda3\lib\linecache.py", line 16, in getline
lines = getlines(filename, module_globals)
File "C:\Users\anagha\Anaconda3\lib\linecache.py", line 47, in getlines
return updatecache(filename, module_globals)
File "C:\Users\anagha\Anaconda3\lib\linecache.py", line 137, in updatecache
lines = fp.readlines()
File "C:\Users\anagha\Anaconda3\lib\codecs.py", line 321, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 7588: invalid start byte
I found an error in the logs of a website of mine, in the log i got the body of the request, so i tried to reproduce that
This is what i got.
>>> from mondishop.models import *
>>> from pyramid.request import *
>>> req = Request.blank('/')
>>> b = DBSession.query(Log).filter(Log.id == 503).one().payload.encode('utf-8')
>>> req.method = 'POST'
>>> req.body = b
>>> req.params
Traceback (most recent call last):
File "<input>", line 1, in <module>
File "/home/phas/virtualenv/mondishop/local/lib/python2.7/site-packages/webob/request.py", line 856, in params
params = NestedMultiDict(self.GET, self.POST)
File "/home/phas/virtualenv/mondishop/local/lib/python2.7/site-packages/webob/request.py", line 807, in POST
vars = MultiDict.from_fieldstorage(fs)
File "/home/phas/virtualenv/mondishop/local/lib/python2.7/site-packages/webob/multidict.py", line 92, in from_fieldstorage
obj.add(field.name, decode(value))
File "/home/phas/virtualenv/mondishop/local/lib/python2.7/site-packages/webob/multidict.py", line 78, in <lambda>
decode = lambda b: b.decode(charset)
File "/home/phas/virtualenv/mondishop/lib/python2.7/encodings/utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xb0 in position 52: invalid start byte
>>> req.POST
Traceback (most recent call last):
File "<input>", line 1, in <module>
File "/home/phas/virtualenv/mondishop/local/lib/python2.7/site-packages/webob/request.py", line 807, in POST
vars = MultiDict.from_fieldstorage(fs)
File "/home/phas/virtualenv/mondishop/local/lib/python2.7/site-packages/webob/multidict.py", line 92, in from_fieldstorage
obj.add(field.name, decode(value))
File "/home/phas/virtualenv/mondishop/local/lib/python2.7/site-packages/webob/multidict.py", line 78, in <lambda>
decode = lambda b: b.decode(charset)
File "/home/phas/virtualenv/mondishop/lib/python2.7/encodings/utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xb0 in position 52: invalid start byte
>>>
The error is the same as the one i got i my log, so apparently something goes bad try to decoding the original post.
What is weird is that i get an error trying to utf-8 decode something that i just utf-8 encoded.
I cannot provide the content of the original request body because it contains some sensitive data (it's a paypal IPN) and i don't really have any idea on how to start addressing this issue.