'charmap' codec can't encode characters - python

I'm using tweepy and get this error when printing tweet messages on the screen (Windows).
#!/usr/bin/env python
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import json
#consumer key, consumer secret, access token, access secret.
ckey = 'xyz'
csecret = 'xyz'
atoken = 'xyz'
asecret = 'xyz'
class Listener(StreamListener):
def on_data(self, data):
print json.loads(data)['text']
return True
def on_error(self, status):
print status
# Build the OAuth handler from the consumer key/secret and attach the access token.
auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
# Create the stream with a Listener instance and filter on two hashtags, English only.
twitterStream = Stream(auth, Listener())
twitterStream.filter(track=['#hash1', '#hash2'], languages=['en'])
> Traceback (most recent call last): File
> "C:....twitterSentiment.py",
> line 34, in <module>
> twitterStream.filter(track=['#hash1', '#hash2'], languages=['en']) File
> line 430, in filter
> self._start(async) File "C:......streaming.py",
> line 346, in _start
> self._run() File "C:.....streaming.py",
> line 286, in _run
> raise exception UnicodeEncodeError: 'charmap' codec can't encode characters in position 108-111: character maps to <undefined>
It is caused by Windows not supporting all characters. Is there a workaround for this?

You are getting this error because the console is not able to print the Unicode part of tweet.text. Encode it to UTF-8 (a Unicode encoding) before printing:
def on_data(self, data):
print json.loads(data)['text'].encode('utf-8')
return True

chcp 65001
This is the prescribed solution in multiple threads. I was using a symbol "∞" which was not getting printed. I ran the python code from cmd after running
chcp 65001
It worked like a charm. Hope it helps.
P.S. It only works in cmd, not in the Atom editor or via Cygwin.

Related

Not able to handle the exception for Connection Reset while executing twitter streaming in Python

I am trying to catch the exception that is raised when the connection is reset by the peer during real-time streaming of tweets, but it seems the try/except block is not catching the error, and the exception passes straight through it. Please advise whether the block is wrongly placed in the code or whether something else is wrong with the code.
I have created a script that streams tweets in real time to an Excel file. The stream has frequently been disconnected by an ECONNRESET error, i.e. connection reset by peer:
Exception in thread Thread-1:
Traceback (most recent call last):
File “/usr/lib/python2.7/threading.py”, line 801, in __bootstrap_inner
self.run()
File “/usr/lib/python2.7/threading.py”, line 754, in run
self.__target(*self.__args, **self.__kwargs)
File “/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py”, line 297, in _run
six.reraise(*exc_info)
File “/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py”, line 266, in _run
self._read_loop(resp)
File “/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py”, line 316, in _read_loop
line = buf.read_line().strip()
File “/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py”, line 181, in read_line
self._buffer += self._stream.read(self._chunk_size)
File “/usr/local/lib/python2.7/dist-packages/urllib3/response.py”, line 430, in read
raise IncompleteRead(self._fp_bytes_read, self.length_remaining)
File “/usr/lib/python2.7/contextlib.py”, line 35, in exit
self.gen.throw(type, value, traceback)
File “/usr/local/lib/python2.7/dist-packages/urllib3/response.py”, line 349, in _error_catcher
raise ProtocolError(‘Connection broken: %r’ % e, e)
ProtocolError: (‘Connection broken: error("(104, ‘ECONNRESET’)",)’, error("(104, ‘ECONNRESET’)",))
It's a protocol error, and I tried to catch it by importing the exception classes from the urllib3 library, but the try/except block is not able to suppress it and continue with the streaming.
import pandas as pd
import csv
from bs4 import BeautifulSoup
import re
import tweepy
import ast
from datetime import datetime
import time
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import json
from unidecode import unidecode
from urllib3.exceptions import ProtocolError
from urllib3.exceptions import IncompleteRead
import requests
consumer_key= 'xxxxxxxxx'
consumer_secret= 'xxxxxxxxx'
access_token= 'xxxxxxxxx'
access_token_secret= 'xxxxxxxxx'
# Start with an empty output file: opening in 'w' mode truncates it, and the
# with-block closes the handle on exit.
with open('TEST_FEB.csv', 'w'):
    pass
class listener(StreamListener):
    """Stream listener that appends one CSV row per tweet to TEST_FEB.csv."""

    def on_data(self, data):
        """Write (created_at, cleaned text, tweet URL) for one payload.

        ``data`` is the raw JSON string passed in by the stream. The text is
        transliterated with unidecode and passed through BeautifulSoup's
        get_text() before being written.
        """
        data1 = json.loads(data)
        # Named created_at so the local does not shadow the time module.
        created_at = data1["created_at"]

        # NOTE(review): data1 is a dict, so hasattr() with this name is never
        # true and this branch never runs. It is left unchanged because the
        # intended retweet handling cannot be determined from this snippet.
        if hasattr(data1, "retweeted_status:"):
            tweet = unidecode(data1["tweet"]["text"])

        # json.loads turns JSON true into Python True, which never equals the
        # string "true"; accept both so the full text is used when truncated.
        is_truncated = data1["truncated"] in (True, "true")
        if is_truncated and "extended_tweet" in data1:
            tweet = unidecode(data1["extended_tweet"]["full_text"])
        else:
            tweet = unidecode(data1["text"])

        tweet1 = BeautifulSoup(tweet, "lxml").get_text()
        url = "https://twitter.com/{}/status/{}".format(
            data1["user"]["screen_name"], data1["id_str"])

        # The with-block closes the file even if writing the row raises.
        with open('TEST_FEB.csv', 'a') as csv_file:
            csv.writer(csv_file).writerow([created_at, tweet1, url])

    def on_limit(self, track):
        """Always return True."""
        return True
auth = OAuthHandler(consumer_key,consumer_secret)
auth.set_access_token(access_token,access_token_secret)
while True:
try:
twitterStream = Stream(auth, listener(),
wait_on_rate_limit=True, retry_count=10, stall_warnings=True)
twitterStream.filter(track=["abcd"], async = True)
except ProtocolError as error:
print (str(error))
continue
except IncompleteRead as IR:
print (str(IR))
continue
The expected result is that whenever the connection is reset from the peer and the said error is raised, the code should suppress it and continue with the streaming. The code in the current form is not working that way.

TypeError: 'str' object is not callable when insert tweet data to mysql Python 3

This is my code to insert tweet data in MYSQL
import pymysql
import tweepy
import time
import json
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import pymysql.cursors
ckey= ''
csecret= ''
atoken=''
asecret=''
conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='admin1234', db='mysql')
cur = conn.cursor()
class listener(StreamListener):
    """Stream listener whose on_data tries to insert each tweet's text into the tweet table."""

    def on_data(self, data):
        """Parse the payload, run the INSERT through ``cur``, print the text."""
        all_data = json.loads(data)
        tweet = all_data["text"]
        a = 0
        # username = all_data["user"]["screen_name"]
        # NOTE(review): no comma separates the SQL string from (tweet), so
        # this expression calls the string object itself.
        cur.execute("INSERT INTO tweet (textt) VALUES (%s)" (tweet))
        print (tweet)
        return True

    def on_error(self, status):
        """Print the status passed to the error callback."""
        print (status)
# Build the OAuth handler, then start a stream filtered on the term "puasa".
auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
twitterStream = Stream(auth, listener())
twitterStream.filter(track = ["puasa"])
# Close the database cursor and connection once filter() has returned.
cur.close()
conn.close()
But I get the error TypeError: 'str' object is not callable.
traceback error
Traceback (most recent call last):
File "collect-sql.py", line 40, in <module>
twitterStream.filter(track = ["puasa"])
File "/Users/amzar/anaconda3/lib/python3.6/site-packages/tweepy/streaming.py", line 450, in filter
self._start(async)
File "/Users/amzar/anaconda3/lib/python3.6/site-packages/tweepy/streaming.py", line 364, in _start
self._run()
File "/Users/amzar/anaconda3/lib/python3.6/site-packages/tweepy/streaming.py", line 297, in _run
six.reraise(*exc_info)
File "/Users/amzar/anaconda3/lib/python3.6/site-packages/six.py", line 693, in reraise
raise value
File "/Users/amzar/anaconda3/lib/python3.6/site-packages/tweepy/streaming.py", line 266, in _run
self._read_loop(resp)
File "/Users/amzar/anaconda3/lib/python3.6/site-packages/tweepy/streaming.py", line 327, in _read_loop
self._data(next_status_obj)
File "/Users/amzar/anaconda3/lib/python3.6/site-packages/tweepy/streaming.py", line 300, in _data
if self.listener.on_data(data) is False:
File "collect-sql.py", line 30, in on_data
cur.execute("INSERT INTO tweet (textt) VALUES (%s)" (tweet))
TypeError: 'str' object is not callable
You need 2 extra commas:
cur.execute("INSERT INTO tweet (textt) VALUES (%s)", (tweet,))
The first separates the query string from the arguments, the second turns the value in brackets into the first element in a 1 element tuple (it actually would work if you just used a single string instead of a tuple, assuming you only have one argument, but this isn't officially supported from the look of things).
But this error that you mentioned in the comments:
UnicodeEncodeError: 'latin-1' codec can't encode character '\u201c' in position 97: ordinal not in range(256)
means you are trying to encode Unicode text containing a character outside the Latin-1 range (here U+201C) as latin-1.
If the field is already internally defined (in your mysql database) as unicode, you may need to specify the character set to use when connecting e.g.:
conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='admin1234', db='mysql', use_unicode=True, charset="utf8")
If the field in MySQL is not already something like UTF-8, then I recommend you alter or otherwise redefine the database to use a Unicode character set for this column.
https://dev.mysql.com/doc/refman/8.0/en/charset-mysql.html

python script execution failed due to tweepy error 401

I'm using the code below to stream tweets and analyse them for making decisions. While running it I got an error. The error occurs for Twitter users who have a friend list of more than 50.
import re
import tweepy
import sys
import time
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''
# Build the OAuth handler and the API client used by the cursors below.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
# Translation table mapping every code point above U+FFFF to U+FFFD.
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
# Item iterator over api.friends for the given screen name.
users = tweepy.Cursor(api.friends, screen_name='#myuser').items()
# Walk the friends iterator until it is exhausted.
while True:
    try:
        user = next(users)
    except tweepy.TweepError:
        # Wait 15 minutes, then try the iterator once more.
        time.sleep(60*15)
        user = next(users)
    except StopIteration:
        break
    # Print up to five timeline entries for this friend.
    timeline = tweepy.Cursor(
        api.user_timeline,
        screen_name=user.screen_name,
        result_type='recent'
    )
    for status in timeline.items(5):
        # Apply non_bmp_map to the text before printing it.
        text = status._json['text'].translate(non_bmp_map)
        print (user.screen_name + ' >>>>>> ' + text)
While executing this script I got the error below.
Traceback (most recent call last):
File "D:sensitive2demo.py", line 31, in <module>
for status in tweepy.Cursor(api.user_timeline,screen_name=user.screen_name,result_type='recent').items(5):
File "C:\Users\user\AppData\Local\Programs\Python\Python36\lib\site-packages\tweepy-3.6.0-py3.6.egg\tweepy\cursor.py", line 49, in __next__
return self.next()
File "C:\Users\user\AppData\Local\Programs\Python\Python36\lib\site-packages\tweepy-3.6.0-py3.6.egg\tweepy\cursor.py", line 197, in next
self.current_page = self.page_iterator.next()
File "C:\Users\user\AppData\Local\Programs\Python\Python36\lib\site-packages\tweepy-3.6.0-py3.6.egg\tweepy\cursor.py", line 108, in next
data = self.method(max_id=self.max_id, parser=RawParser(), *self.args, **self.kargs)
File "C:\Users\user\AppData\Local\Programs\Python\Python36\lib\site-packages\tweepy-3.6.0-py3.6.egg\tweepy\binder.py", line 250, in _call
return method.execute()
File "C:\Users\user\AppData\Local\Programs\Python\Python36\lib\site-packages\tweepy-3.6.0-py3.6.egg\tweepy\binder.py", line 234, in execute
raise TweepError(error_msg, resp, api_code=api_error_code)
tweepy.error.TweepError: Twitter error response: status code = 401
I have googled a lot, but nothing worked. Can somebody help me solve the problem?
401 is an http status code for 'Unauthorized'. I would suggest verifying your credentials.

Twitter Streaming in Python: cp949 codec

I am currently using tweepy to gather data using Streaming API.
Here is my code; I ran it in the Anaconda command prompt. When streaming starts, it returns tweets, and then after a few tweets it gives the following error:
Streaming Started ...
RT #ish10040: Crack Dealer Released Early From Prison By Obama Murders Woman And Her 2 Young Kids… Exception in thread Thread-1:
Traceback (most recent call last):
File "C:\Users\Jae Hee\Anaconda2\lib\threading.py", line 801, in __bootstrap_inner
self.run()
File "C:\Users\Jae Hee\Anaconda2\lib\threading.py", line 754, in run
self.__target(*self.__args, **self.__kwargs)
File "C:\Users\Jae Hee\Anaconda2\lib\site-packages\tweepy\streaming.py", line 294, in _run
raise exception
UnicodeEncodeError: 'cp949' codec can't encode character u'\xab' in position 31: illegal multibyte sequence
I believe that it has to do with encoding so I used chcp 65001 to deal with this issue but it does not give the solution!
Here is the code
# Build the OAuth handler and the API client whose auth is reused by the stream.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
class MyStreamListener(tweepy.StreamListener):
    """Listener that prints the text of every status it receives."""

    def on_status(self, status):
        """Print the text of the given status."""
        print(status.text)

    def on_error(self, status_code):
        """Return False for status code 420, otherwise None."""
        #returning False in on_data disconnects the stream
        if status_code != 420:
            return None
        return False
def main():
myStreamListener = MyStreamListener()
myStream = tweepy.Stream(auth = api.auth, listener = myStreamListener)
print "Streaming Started ..."
try:
myStream.filter(track=['Obama'], async = True)
except:
print "error!"
myStream.disconnect()
if __name__ == '__main__':
main()
All text produced and accepted through the twitter API should be encoded as UTF-8, so your code should be using that codec to decode what's coming back.
See here: https://dev.twitter.com/overview/api/counting-characters

Python encoding issue when trying to parse JSON tweets

I am trying to parse out the tweet and username sections of the JSON object returned from Twitter using the following code:
class listener(StreamListener):
    """Listener that inserts each tweet into the ``tweets`` table and prints it."""

    def on_data(self, data):
        """Store (time, username, tweet) through cursor ``c`` and print the pair."""
        payload = json.loads(data)
        tweet = payload["text"]
        username = payload["user"]["screen_name"]
        insert_sql = "INSERT INTO tweets (tweet_time, username, tweet) VALUES (%s,%s,%s)"
        c.execute(insert_sql, (time.time(), username, tweet))
        print (username, tweet)
        return True

    def on_error(self, status):
        """Print the status passed to the error callback."""
        print (status)
# Build the OAuth handler, then start a stream filtered on "LeBron James".
auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
twitterStream = Stream(auth, listener())
twitterStream.filter(track = ["LeBron James"])
But I get the following error. How can the code be adjusted to decode or encode the response properly?
Traceback (most recent call last):
File "C:/Users/sagars/PycharmProjects/YouTube NLP Lessons/Twitter Stream to DB.py", line 45, in <module>
twitterStream.filter(track = ["LeBron James"])
File "C:\Python34\lib\site-packages\tweepy\streaming.py", line 428, in filter
self._start(async)
File "C:\Python34\lib\site-packages\tweepy\streaming.py", line 346, in _start
self._run()
File "C:\Python34\lib\site-packages\tweepy\streaming.py", line 286, in _run
raise exception
File "C:\Python34\lib\site-packages\tweepy\streaming.py", line 255, in _run
self._read_loop(resp)
File "C:\Python34\lib\site-packages\tweepy\streaming.py", line 309, in _read_loop
self._data(next_status_obj)
File "C:\Python34\lib\site-packages\tweepy\streaming.py", line 289, in _data
if self.listener.on_data(data) is False:
File "C:/Users/sagars/PycharmProjects/YouTube NLP Lessons/Twitter Stream to DB.py", line 36, in on_data
print (username, tweet)
File "C:\Python34\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 0-8: character maps to <undefined>
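Unfortunately, the problem is that the text you get from Twitter contains characters that your console's code page (cp1252, as the traceback shows) cannot represent, which is what raises the charmap error when you print it. To work around that, you can encode the text to UTF-8 bytes before printing.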
tweet = all_data["text"].encode('utf-8')
username = all_data["user"]["screen_name"].encode('utf-8')
This will cause you to lose some of the emoji and special characters that show up in the tweet; they will be shown as escape sequences such as \x899. If you really need that information for sentiment analysis (I discard it myself), then you'll need to install a package with a pre-compiled list to convert them accordingly.

Categories