#!/usr/bin/env python
# -*- coding: utf-8 -*-
import tweepy
import json
import re
import sys
# NOTE(review): the setdefaultencoding hack below is widely discouraged -- the
# method only reappears after reload(sys), it can mask real encode/decode bugs,
# and it does not exist in Python 3. Also note the code below uses
# StreamListener/OAuthHandler/Stream and time without importing them; confirm
# the full file has `from tweepy import ...` and `import time`.
reload(sys)
sys.setdefaultencoding('utf-8')
# Tweepy stream listener that appends each incoming tweet's text to a CSV file.
# NOTE(review): indentation was lost in this paste; the bodies below appear
# flush left but belong inside the class/methods.
class listener(StreamListener):
# Called by tweepy with the raw JSON payload of each stream message.
def on_data(self, data):
try:
print data
# Extracts the "text" field by naive string splitting instead of json.loads();
# this breaks whenever ',"text":"' appears inside another field of the payload.
tweet = data.split(',"text":"')[1].split('","source')[0]
print tweet
# Timestamp + tweet, '::'-separated. `time` is used here but never imported above.
saveThis = str(time.time())+'::' + tweet
# The file is re-opened for every tweet; a `with` block would be safer.
saveFile = open("tweetDB3.csv", "a")
saveFile.write(saveThis)
saveFile.write("\n")
saveFile.close()
return True
# Python 2-only except syntax; BaseException also swallows KeyboardInterrupt.
except BaseException, e:
print "failed ondata,",str(e)
time.sleep(5)
# Called by tweepy on stream errors (e.g. HTTP 420 rate limiting).
def on_error(self, status):
print status
# Authenticate with the (externally defined) consumer/access credentials and
# start filtering the live stream on the Korean keyword '오늘' ("today").
# ckey/csecret/atoken/asecret must be defined elsewhere in the file.
auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
twitterStream = Stream(auth, listener())
twitterStream.filter(track = ['오늘'])
example of result:
1465042178.01::RT #BTS_twt: korea#\ud83c\uddf0\ud83c\uddf7 https://t.co/zwKaGo4Lcj
1465042181.76::RT #wdfrog: \ud5e4\ub7f4\ub4dc \uacbd\uc81c\uac00 \uc774\ubc88 \uc77c\ub85c \uc0ac\uacfc\ubb38\uc744
\uc62c\ub838\uc9c0\ub9cc \uc774\uc790\ub4e4\uc740 \ubd88\uacfc
3\uac1c\uc6d4 \uc804\uc778 3\uc6d4 4\uc77c\uc5d0\ub3c4
\uc55e\uc73c\ub85c \uc870\uc2ec\ud558\uaca0\ub2e4\ub294
\uc0ac\uacfc\ubb38\uc744 \uc62c\ub9b0 \ubc14 \uc788\ub2e4.
\uc77c\uc774 \ucee4\uc9c8\uae4c \uba74\ud53c\ud558\ub294
\uac83\uc774\ub2c8 \uc5b8\ub860\uc911\uc7ac\uc704\uc5d0 \ud55c\uce35
\uac00\uc5f4\ucc28\uac8c \ubbfc\uc6d0\uc744
\ub123\uc74d\uc2dc\ub2e4\nhttps://t.co/Wb\u2026
Question:
If I do a twitter API stream through the above code (using Korean characters)
the message above is what is being created in excel file which is shown as unicode.
These unicodes have corresponding Korean characters that can be found by print u'string'
But is it possible to have all these Unicode escapes automatically converted to Korean characters?
I've tried to fix python code and tried to solve within excel but no luck.
Despite the setdefaultencoding method, you can't change the default encoding in Python 2.7. You should use Python 3 (its default encoding is UTF-8, and it can be changed).
Related
I'm trying to get Arabic tweets by using the tweepy library in Python 3.6. With English it works perfectly, but when I try to get Arabic tweets I face many problems: the tweets in Arabic characters appear as "\u0635\u0648\u0651\u062a\u0648\u0627 "
I tried several solutions from the internet, but none of them solved my problem, because most of them only fetch the "text" of the tweet so they can fix the encoding problem on the text directly — but I want to get the whole info as JSON.
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import json
# Placeholder credentials -- replace with your own Twitter API keys.
access_token = '-'
access_token_secret = '-'
consumer_key = '-'
consumer_secret = '-'
class StdOutListener(StreamListener):
    """Stream listener that prints each tweet's full JSON payload.

    Fix: the original printed ``data.encode("UTF-8")`` which, on Python 3,
    prints a ``b'...'`` repr with every non-ASCII character left as a
    ``\\uXXXX`` escape.  Re-serialising the parsed JSON with
    ``ensure_ascii=False`` makes Arabic (or any non-ASCII) text appear as
    real characters while keeping the whole tweet object, not just "text".
    """

    def on_data(self, data):
        # data is the raw JSON string delivered by the stream.
        try:
            print(json.dumps(json.loads(data), ensure_ascii=False))
        except ValueError:
            # Keep-alive newlines / non-JSON payloads: fall back to raw output.
            print(data)
        return True

    def on_error(self, status):
        # Called by tweepy with the HTTP status code on stream errors.
        print(status)
# Script entry point: authenticate, attach the listener, and filter the live
# stream on the Arabic keyword. NOTE(review): indentation of the body was lost
# in this paste; the lines below belong inside the if-block.
if __name__ == '__main__':
l = StdOutListener()
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
stream = Stream(auth, l)
stream.filter( track=["عربي"])
> $ python file.py > file2.txt
the results in text file and in the terminal:
{"created_at":"Thu Jan 17 12:12:16 +0000 2019","id":1085872428432195585,"id_str":"1085872428432195585","text":"RT #MALHACHIMI: \u0642\u0627\u062f\u0629 \u062d\u0631\u0643\u0629 \u0627\u0644\u0646\u0647\u0636\u0629 \u0635\u0648\u0651\u062a\u0648\u0627 \u0636\u062f \u0627\u0639\u062a\....etc}
If I do this with the first example in your question:
>>> print( "\u0635\u0648\u0651\u062a\u0648\u0627 ")
صوّتوا
the Arabic appears. But if you display a dict at the console without specifying how you want it displayed, Python will just use a default representation that uses the ASCII character set, and anything not printable in that character set will be represented as escapes. This is because, if you wanted to code this string in a program, your IDE editor might have a problem coping with the Arabic: the switching between the left-to-right order of the Python code and the right-to-left order of the string is very hard to manage. The information hasn't been lost or mangled, just displayed in a lowest-common-denominator format.
I want to add a referer while retrieving data from the web but this is not working on my python2 referer request.add_header('Referer', 'https://www.python.org').
My Url.txt content
https://www.python.org/about/
https://stackoverflow.com/questions
https://docs.python.org/2.7/
These are my codes
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import urllib2
import threading
import time
import requests
# Cap on concurrently running fetch threads.
max_thread = 5
# NOTE(review): reads Url.txt at import time; each line keeps its trailing newline.
urllist = open("Url.txt").readlines()
def url_connect(url):
try :
request = urllib2.Request(url)
request.add_header('Referer', 'https://www.python.org')
request.add_header('User-agent', 'Mozilla/5.0')
goo = re.findall('<title>(.*?)</title>', urllib2.urlopen(url.replace(' ','')).read())[0]
print '\n' + goo.decode("utf-8")
with open('SaveMyDataFile.txt', 'ab') as f:
f.write(goo + "\n")
except Exception as Errors:
pass
# Spawn one worker thread per http(s) URL, throttling so that at most
# `max_thread` threads are active at any moment.
for raw_line in urllist:
    candidate = raw_line.strip()
    if not candidate.startswith("http"):
        continue
    # Busy-wait until a thread slot frees up.
    while threading.activeCount() >= max_thread:
        time.sleep(0.1)
    threading.Thread(target=url_connect, args=(candidate,)).start()
Looks to me the problem is in your call to urlopen. You call it with the url and not with the request.
From https://docs.python.org/2/library/urllib2.html#urllib2.urlopen
Open the URL url, which can be either a string or a Request object.
You need to pass urllib.urlopen() the Request object that you just built--you're currently not doing anything with that.
# Scrape every post of a Facebook page feed into <page_id>_facebook_statuses.csv.
# NOTE(review): indentation was lost in this paste. Relies on helpers defined
# elsewhere (getFacebookPageFeedData, processFacebookPageFeedStatus,
# request_until_succeed) and on csv/datetime/json imports.
def scrapeFacebookPageFeedStatus(page_id, access_token):
# -*- coding: utf-8 -*-
# NOTE(review): Python 2's csv.writer cannot write unicode rows; this is the
# likely origin of the reported UnicodeEncodeError on Arabic posts -- each
# text field would need .encode('utf-8') before writerow. Confirm in
# processFacebookPageFeedStatus.
with open('%s_facebook_statuses.csv' % page_id, 'wb') as file:
w = csv.writer(file)
w.writerow(["status_id", "status_message", "link_name", "status_type", "status_link",
"status_published", "num_likes", "num_comments", "num_shares"])
has_next_page = True
num_processed = 0 # keep a count on how many we've processed
scrape_starttime = datetime.datetime.now()
print "Scraping %s Facebook Page: %s\n" % (page_id, scrape_starttime)
# First batch of up to 100 statuses; later batches come from paging['next'].
statuses = getFacebookPageFeedData(page_id, access_token, 100)
while has_next_page:
for status in statuses['data']:
w.writerow(processFacebookPageFeedStatus(status))
# output progress occasionally to make sure code is not stalling
num_processed += 1
if num_processed % 1000 == 0:
print "%s Statuses Processed: %s" % (num_processed, datetime.datetime.now())
# if there is no next page, we're done.
if 'paging' in statuses.keys():
statuses = json.loads(request_until_succeed(statuses['paging']['next']))
else:
has_next_page = False
print "\nDone!\n%s Statuses Processed in %s" % (num_processed, datetime.datetime.now() - scrape_starttime)
# Top-level call; page_id and access_token must be defined elsewhere.
scrapeFacebookPageFeedStatus(page_id, access_token)
UnicodeEncodeError: 'ascii' codec can't encode characters in position 40-43: ordinal not in range(128)
I'm writing code to scrape through Facebook pages to gather all the posts in cvs file.
The code is working properly when there is only the English language, but
the error above appears when I try to scrape through pages that post in Arabic.
I know the solution is to use utf-8 but I don't know how to implement it on the code.
Your problem probably is not in this code; I suspect it is in your processFacebookPageFeedStatus function. When you format your fields, you'll want to make sure that any which may contain Unicode characters are all decoded (or encoded, as appropriate) in UTF-8.
import codecs
field_a = "some unicode text in here"
field_a.decode('utf-8') -----> a unicode object (u'...')
field_a.encode('utf-8') -----> back to UTF-8 encoded bytes
Your CSV probably doesn't support unicode, so you need to decode each field in your source data.
Debugging unicode is a pain, but there are a lot of SO posts about different problems related to encoding/decoding unicode
# Python 2-only hack: reload(sys) restores the otherwise-removed
# setdefaultencoding(), then forces the process-wide default codec to UTF-8.
# Discouraged -- it can mask real encode/decode bugs and does not exist in Python 3.
import sys
reload(sys).setdefaultencoding("utf-8")
I added this piece of code and it works fine when I open this file in pandas .
there are no other errors or what so ever for now
How can I get song name from internet radio stream?
Python: Get name of shoutcast/internet radio station from url I looked here, but there is only getting name of radio station. But how to get name of the playing song? Here is stream link from where I want to get name of song. http://pool.cdn.lagardere.cz/fm-evropa2-128
How should I do it? Can you help me please?
To get the stream title, you need to request metadata. See shoutcast/icecast protocol description:
#!/usr/bin/env python
# Read the currently playing song title from a SHOUTcast/Icecast stream by
# requesting in-band ICY metadata. NOTE(review): indentation was lost in this
# paste; the bodies below belong inside their try/for/if statements.
from __future__ import print_function
import re
import struct
import sys
# urllib2 was renamed in Python 3; alias the Python 3 module to the old name.
try:
import urllib2
except ImportError: # Python 3
import urllib.request as urllib2
url = 'http://pool.cdn.lagardere.cz/fm-evropa2-128' # radio stream
encoding = 'latin1' # default: iso-8859-1 for mp3 and utf-8 for ogg streams
# The Icy-MetaData header asks the server to interleave metadata into the stream.
request = urllib2.Request(url, headers={'Icy-MetaData': 1}) # request metadata
response = urllib2.urlopen(request)
print(response.headers, file=sys.stderr)
# icy-metaint = number of audio bytes between successive metadata chunks.
metaint = int(response.headers['icy-metaint'])
for _ in range(10): # title may be empty initially, try several times
response.read(metaint) # skip audio bytes up to the next metadata chunk
# A single length byte (counting 16-byte units) precedes the metadata itself.
metadata_length = struct.unpack('B', response.read(1))[0] * 16 # length byte
metadata = response.read(metadata_length).rstrip(b'\0')
print(metadata, file=sys.stderr)
# extract title from the metadata, e.g. StreamTitle='Artist - Song';
m = re.search(br"StreamTitle='([^']*)';", metadata)
if m:
title = m.group(1)
if title:
break
else:
sys.exit('no title found')
print(title.decode(encoding, errors='replace'))
The stream title is empty in this case.
I wrote a code to connect to imap and then parse the body information and insert into database. But I am having some problems with accents.
From email header I got this information:
Content-Type: text/html; charset=ISO-8859-1
But, I am not sure if I can trust in this information...
The email was wrote in portuguese, so we have a lot of words with accents. For example, I extract the following phrase from the email source code (using my browser):
"...instalação de eletrônicos..."
So, I connected to imap and fetched some emails:
... typ, data = M.fetch(num, '(RFC822)') ...
When I print the content, I get the following word:
print data[0][1]
instala+º+úo de eletr+¦nicos
I tried to use .decode('utf-8') but I had no success.
instalação de eletrônicos
How can I make it a human readable? My database is in utf-8.
The header says it is using "ISO-8859-1" charset. So you need to decode the string with that encoding.
Try this:
data[0][1].decode('iso-8859-1')
Specifying the source code encoding worked for me. It's the code at the top of my example code below. This should be defined at the top of your python file.
#!/usr/bin/python
# -*- coding: iso-8859-15 -*-
# Decode the ISO-8859-15 source bytes into a unicode object (Python 2).
value = """...instalação de eletrônicos...""".decode("iso-8859-15")
print value
# prints: ...instalação de eletrônicos...
import unicodedata
# NFKD splits accented letters into base letter + combining mark; encoding to
# ASCII with errors='ignore' then drops the marks, stripping the accents.
value = unicodedata.normalize('NFKD', value).encode('ascii','ignore')
print value
# prints: ...instalacao de eletronicos...
And now you can do str(value) without an exception as well.
See: http://docs.python.org/2/library/unicodedata.html
This seems to keep all accents:
#!/usr/bin/python
# -*- coding: iso-8859-15 -*-
import unicodedata
# Decode from the declared source charset to a unicode object (Python 2).
value = """...instalação de eletrônicos...""".decode("iso-8859-15")
# NFKC recomposes characters into precomposed forms, so the accents survive
# the round-trip to UTF-8 bytes.
value = unicodedata.normalize('NFKC', value).encode('utf-8')
print value
print str(value)
# prints (without exceptions/errors):
# ...instalação de eletrônicos...
# ...instalação de eletrônicos...
EDIT:
Do note that with the last version, even though the output looks the same, comparing the two values for equality does not return True. For example:
#!/usr/bin/python
# -*- coding: iso-8859-15 -*-
import unicodedata
inValue = """...instalação de eletrônicos...""".decode("iso-8859-15")
normalizedValue = unicodedata.normalize('NFKC', inValue).encode('utf-8')
# Comparing the unicode original against its UTF-8 byte encoding: in Python 2
# this prints False (and emits a UnicodeWarning).
# NOTE(review): UnicodeWarning is a warning, not an exception -- this except
# clause will not normally trigger unless warnings are configured as errors.
try:
print inValue == normalizedValue
except UnicodeWarning:
pass
# False
EDIT2:
This returns the same:
# unicode() applied to an already-decoded value is a no-op here; this simply
# round-trips the text from ISO-8859-15 to UTF-8 bytes without normalization.
normalizedValue = unicode("""...instalação de eletrônicos...""".decode("iso-8859-15")).encode('utf-8')
print normalizedValue
print str(normalizedValue )
# prints (without exceptions/errors):
# ...instalação de eletrônicos...
# ...instalação de eletrônicos...
Though I'm not sure this will actually be valid for a utf-8 encoded database. Probably not?
Thanks for Martijn Pieters. We figured out that the email had two different encode. I had to split this parts and treat individually.