I ran into an issue where a string encoded with "utf-8" doesn't print as expected. The string contains accented letters (á, é, ü, ñ, etc.) and is part of a JSON dict returned from the Wikipedia API.
Below is an example with the letter é:
== The complete code: ==
import urllib
import json
import re
def check(wikitext):
    redirect_title = re.findall('\[\[[\S ]+\]\]', str(wikitext))[0]
    redirect_title = redirect_title.strip('[]')
    redirect_title = redirect_title.decode('ISO-8859-1').encode('utf8')
    return redirect_title
serviceurl = 'https://en.wikipedia.org/w/api.php?'
action = 'parse'
formatjs = 'json'
prop = 'text|wikitext'
title = 'Jose Eduardo Agualusa'
url = serviceurl + urllib.urlencode({'action':action, 'page': title, 'format': formatjs, 'prop': prop})
uh = urllib.urlopen(url)
data = uh.read()
try:
    js = json.loads(data)
except:
    js = None
    print ' Page is not found'
wikitext = js["parse"]["wikitext"]
redirect_title = check(wikitext)
print 'redirect_title:',redirect_title
redirect_title2 = 'Jos\xe9 Eduardo Agualusa'
redirect_title2 = redirect_title2.decode('ISO-8859-1').encode('utf8')
print 'redirect_title2:', redirect_title2
The result is:
redirect_title: Jos\xe9 Eduardo Agualusa
redirect_title2: José Eduardo Agualusa
redirect_title is parsed from the Wikipedia API JSON. Before encoding, it prints as 'Jos\xe9 Eduardo Agualusa'. After encoding, it doesn't seem to change.
redirect_title2 is assigned directly from the string literal 'Jos\xe9 Eduardo Agualusa' and then encoded.
Why do I get different results for redirect_title and redirect_title2? How can I make redirect_title print as "José Eduardo Agualusa"?
Your check() routine does some very odd things, most notably running the regex over the string representation of an entire dictionary: str(wikitext) produces the dict's repr(), which renders the unicode value's é as the four literal characters \, x, e, 9 instead of a real character.
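Here is a minimal sketch of that effect in Python 2; the dict contents are an assumption based on what the Wikipedia API typically returns for a redirect page:

wikitext = {'*': u'#REDIRECT [[Jos\xe9 Eduardo Agualusa]]'}
# str() on the dict gives its repr(), so the unicode value appears with
# a literal backslash escape rather than an accented character
print str(wikitext)    # {'*': u'#REDIRECT [[Jos\xe9 Eduardo Agualusa]]'}
# the value itself still holds the real character
print wikitext['*']    # #REDIRECT [[José Eduardo Agualusa]]

Running your regex over str(wikitext) therefore captures the literal text Jos\xe9, and no amount of decoding or encoding will turn those four characters back into é.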
Try this instead:
def check(wikitext):
    for value in wikitext.values():
        result = re.findall(ur'\[\[.*?\]\]', value)
        if result:
            return result[0].strip(u'[]')
    return u''
Or this:
def check(wikitext):
    redirect_title = u''.join(wikitext.values())
    redirect_title = re.findall(u'\[\[[\S ]+\]\]', redirect_title)[0]
    redirect_title = redirect_title.strip(u'[]')
    return redirect_title
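Either way, the regex now runs over the unicode values themselves rather than over repr() output, so the é survives as a real character and prints correctly on a terminal that handles UTF-8.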
I am trying to gather a bunch of links using XPath, which then need to be scraped from the next page. However, I keep getting an error that etree can only parse strings. I tried looking at the type of lk and it was a string after I cast it. What seems to be wrong?
# imports needed by the snippets below (lxml assumed for etree.HTML)
import unicodedata
import urllib2
from lxml import etree

def unicode_to_string(types):
    try:
        types = unicodedata.normalize("NFKD", types).encode('ascii', 'ignore')
        return types
    except:
        return types
def getData():
    req = "http://analytical360.com/access-points"
    page = urllib2.urlopen(req)
    tree = etree.HTML(page.read())
    i = 0
    for lk in tree.xpath('//a[@class="sabai-file sabai-file-image sabai-file-type-jpg "]//@href'):
        print "Scraping Vendor #" + str(i)
        trees = etree.HTML(urllib2.urlopen(unicode_to_string(lk)))
        for ll in trees.xpath('//table[@id="archived"]//tr//td//a//@href'):
            final = etree.HTML(urllib2.urlopen(unicode_to_string(ll)))
You should pass strings to etree.HTML, not the file-like object returned by urllib2.urlopen.
Perhaps change the code like so:
trees = etree.HTML(urllib2.urlopen(unicode_to_string(lk)).read())
for i, ll in enumerate(trees.xpath('//table[@id="archived"]//tr//td//a//@href')):
    final = etree.HTML(urllib2.urlopen(unicode_to_string(ll)).read())
Also, you don't seem to increment i.
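If the counter is meant to track the outer vendor loop, the same enumerate() pattern can wrap the outer xpath call instead; either way it removes the need to increment i by hand.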
So the function below works for the print statement but not for the return statement. How can I get Count number of tweets using a return statement?
import requests
from requests_oauthlib import OAuth1
import urllib

# Credentials to fetch
consumer_key = '********'
consumer_secret = '******************'
url = 'https://api.twitter.com/1.1/search/tweets.json'

def get_keyword_tweets(keyword, Count):
    param = urllib.urlencode({'q': keyword, 'lang': 'en', 'result_type': 'recent', 'count': Count})
    url2 = url.endswith('?') and (url + param) or (url + '?' + param)
    auth = OAuth1(consumer_key, consumer_secret)
    r = requests.get(url2, auth=auth)
    tweets = r.json()
    keywordtweets = tweets['statuses']
    dict1 = keywordtweets[0:Count]
    #data = dict1['id_str'],dict1['text']
    for tweet in dict1:
        data = tweet['id_str'], tweet['text']
        print data
    return data
The output I am getting when I use the above function is:
In [1]: from http_twitter import get_keyword_tweets
In [2]: get_keyword_tweets("CWC15", Count=4)
(u'578172948231495680', u'RT @ICC: Fascinating stat from Pool Stages with the breakdown of wickets in the tournament, just wait for #AUSvPAK!! \n#cwc15 http://t.co/Jw\u2026')
(u'578172941977808896', u'RT @Surbhivb: Venkat: on the UAE cricket team, led by Khurram Khan, an airline purser. Only fully amateur team in the #CWC15. http://t.co/c\u2026')
(u'578172938467176448', u'RT @iTweety_19: "I am ready to go to the World Cup if the selectors pick me as the replacement for Mohammad Irfan" says @REALsaeedajmal \n#c\u2026')
(u'578172935115960320', u'Thank you @KumarSanga2 & @MahelaJay for all the epic partnerships! #ThankYouSanga #ThankYouMahela #SAvSL #CWC15 http://t.co/li0QgPniI0"')
(The above output is from print data; I got 4 tweets, as mentioned.)
Out[2]:
(u'578172935115960320',
 u'Thank you @KumarSanga2 & @MahelaJay for all the epic partnerships! #ThankYouSanga #ThankYouMahela #SAvSL #CWC15 http://t.co/li0QgPniI0"')
(The above output is from return data; I got only one tweet, but I need four.)
So how can I return Count number of tweets? Please help me.
You are returning only the last tweet's data at the end. You need to return a list (or the whole dict), as pointed out in a comment above.
Replace:
for tweet in dict1:
    data = tweet['id_str'], tweet['text']
    print data
return data
with:
data_list = []
for tweet in dict1:
    data = tweet['id_str'], tweet['text']
    data_list.append(data)
    print data
return data_list
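Hypothetical usage of the fixed function (the call matches your test above; the unpacking assumes each element stays an (id_str, text) tuple):

tweets = get_keyword_tweets("CWC15", Count=4)
for tweet_id, text in tweets:
    print tweet_id, text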
def dcrawl(link):
    #importing the req. libraries & modules
    from bs4 import BeautifulSoup
    import urllib

    #fetching the document
    op = urllib.FancyURLopener({})
    f = op.open(link)
    h_doc = f.read()

    #trimming for the base document
    idoc1 = BeautifulSoup(h_doc)
    idoc2 = str(idoc1.find(id = "bwStory"))
    bdoc = BeautifulSoup(idoc2)

    #extract the date as a string
    dat = str(bdoc.div.div.string)[0:13]
    date = dst(dat)

    #extract the title as a string
    title = str(bdoc.b.string)

    #extract the full report as a string
    freport = str(bdoc.find_all("p"))

    #extract the place as a string
    plc = bdoc.find(id = "bwStoryBody")
    puni = plc.p.string

    #encoding to ascii to eliminate discrepancies
    pasi = puni.encode('ascii', 'ignore')
    com = pasi.find("-")
    place = pasi[:com]
The same str() conversion works for "bdoc.b.string", and also here:
#extract the full report as a string
freport = str(bdoc.find_all("p"))
In the line:
plc = bdoc.find(id = "bwStoryBody")
plc returns some data, and plc.p returns the first <p>...</p>, but converting it to a string doesn't work.
Because puni returned a string object earlier, I stumbled upon unicode errors and so had to use encode to handle the pasi result.
.find() returns None when an object was not found. Evidently some pages do not have the elements that you are looking for.
Test for it explicitly if you want to prevent attribute errors:
plc = bdoc.find(id = "bwStoryBody")
if plc is not None:
    puni = plc.p.string
    #encoding to ascii to eliminate discrepancies
    #By default python processes in unicode
    pasi = puni.encode('ascii', 'ignore')
    com = pasi.find("-")
    place = pasi[:com]
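As a further defensive step (my own suggestion, not something the original answer requires): plc.p and plc.p.string can also come back as None, so you can guard those the same way:

plc = bdoc.find(id = "bwStoryBody")
# guard each lookup: a missing <p>, or a <p> with mixed content, yields None
if plc is not None and plc.p is not None:
    puni = plc.p.string
    if puni is not None:
        pasi = puni.encode('ascii', 'ignore')
        com = pasi.find("-")
        place = pasi[:com]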
For those familiar with imageboards: an OP post may or may not contain a 'subject' and a 'comment'.
I wrote this to search all pages of a given board for thread subjects and OP posts.
If my search term exists in one of them but the other key is nonexistent, the thread will not get appended to my res list.
So how do I search JSON keys where one key or the other may not exist?
import urllib, json, HTMLParser

def s4Chan(board, search):
    logo = '3::54chan'
    res = []
    p = HTMLParser.HTMLParser()
    catalog = json.load(urllib.urlopen('https://api.4chan.org/%s/catalog.json' % board))
    for i in catalog:
        for j in i['threads']:
            try:
                if search.lower() in j['sub'].lower() or search.lower() in j['com'].lower():
                    subject = j['sub']
                    post = p.unescape(str(j['com'])).replace('<br>', ' ')
                    if len(post) > 300:
                        post = post[0:300]
                        post = post + '...'
                    text = str('%s /%s/ %s | %s | %s (R:%s, I:%s)' % (logo, board, subject, post, 'https://4chan.org/%s/res/%s' % (board, j['no']), j['replies'], j['images']))
                    res.append(text)
            except KeyError:
                continue
    return res
json.load returns objects as Python dictionaries. You can, for example, use the get method of dict:
if search.lower() in j.get('sub', '').lower() or search.lower() in j.get('com', '').lower():
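A tiny illustration of the get behaviour, using a made-up thread dict: a missing key yields the supplied default instead of raising KeyError:

thread = {'sub': 'Lorem ipsum'}        # no 'com' key at all
print thread.get('sub', '').lower()    # lorem ipsum
print thread.get('com', '')            # empty string, no exception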
I have modified a Python babelizer to help me translate English to Chinese.
## {{{ http://code.activestate.com/recipes/64937/ (r4)
# babelizer.py - API for simple access to babelfish.altavista.com.
# Requires python 2.0 or better.
#
# See it in use at http://babel.MrFeinberg.com/
"""API for simple access to babelfish.altavista.com.
Summary:
import babelizer
print ' '.join(babelizer.available_languages)
print babelizer.translate( 'How much is that doggie in the window?',
                           'English', 'French' )

def babel_callback(phrase):
    print phrase
    sys.stdout.flush()

babelizer.babelize( 'I love a reigning knight.',
                    'English', 'German',
                    callback = babel_callback )
available_languages
A list of languages available for use with babelfish.
translate( phrase, from_lang, to_lang )
Uses babelfish to translate phrase from from_lang to to_lang.
babelize(phrase, from_lang, through_lang, limit = 12, callback = None)
Uses babelfish to translate back and forth between from_lang and
through_lang until either no more changes occur in translation or
limit iterations have been reached, whichever comes first. Takes
an optional callback function which should receive a single
parameter, being the next translation. Without the callback
returns a list of successive translations.
It's only guaranteed to work if 'english' is one of the two languages
given to either of the translation methods.
Both translation methods throw exceptions which are all subclasses of
BabelizerError. They include
LanguageNotAvailableError
Thrown on an attempt to use an unknown language.
BabelfishChangedError
Thrown when babelfish.altavista.com changes some detail of their
layout, and babelizer can no longer parse the results or submit
the correct form (a not infrequent occurrence).
BabelizerIOError
Thrown for various networking and IO errors.
Version: $Id: babelizer.py,v 1.4 2001/06/04 21:25:09 Administrator Exp $
Author: Jonathan Feinberg <jdf@pobox.com>
"""
import re, string, urllib
import httplib
import sys
"""
Various patterns I have encountered in looking for the babelfish result.
We try each of them in turn, based on the relative number of times I've
seen each of these patterns. $1.00 to anyone who can provide a heuristic
for knowing which one to use. This includes AltaVista employees.
"""
__where = [ re.compile(r'name=\"q\">([^<]*)'),
            re.compile(r'td bgcolor=white>([^<]*)'),
            re.compile(r'<\/strong><br>([^<]*)')
          ]

# <div id="result"><div style="padding:0.6em;">??</div></div>
__where = [ re.compile(r'<div id=\"result\"><div style=\"padding\:0\.6em\;\">(.*)<\/div><\/div>', re.U) ]
__languages = { 'english'   : 'en',
                'french'    : 'fr',
                'spanish'   : 'es',
                'german'    : 'de',
                'italian'   : 'it',
                'portugese' : 'pt',
                'chinese'   : 'zh'
              }
"""
All of the available language names.
"""
available_languages = [ x.title() for x in __languages.keys() ]
"""
Calling translate() or babelize() can raise a BabelizerError
"""
class BabelizerError(Exception):
    pass

class LanguageNotAvailableError(BabelizerError):
    pass

class BabelfishChangedError(BabelizerError):
    pass

class BabelizerIOError(BabelizerError):
    pass
def saveHTML(txt):
    f = open('page.html', 'wb')
    f.write(txt)
    f.close()
def clean(text):
    return ' '.join(string.replace(text.strip(), "\n", ' ').split())
def translate(phrase, from_lang, to_lang):
    phrase = clean(phrase)
    try:
        from_code = __languages[from_lang.lower()]
        to_code = __languages[to_lang.lower()]
    except KeyError, lang:
        raise LanguageNotAvailableError(lang)
    html = ""
    try:
        params = urllib.urlencode({'ei':'UTF-8', 'doit':'done', 'fr':'bf-res', 'intl':'1', 'tt':'urltext', 'trtext':phrase, 'lp': from_code + '_' + to_code, 'btnTrTxt':'Translate'})
        headers = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain"}
        conn = httplib.HTTPConnection("babelfish.yahoo.com")
        conn.request("POST", "http://babelfish.yahoo.com/translate_txt", params, headers)
        response = conn.getresponse()
        html = response.read()
        saveHTML(html)
        conn.close()
        #response = urllib.urlopen('http://babelfish.yahoo.com/translate_txt', params)
    except IOError, what:
        raise BabelizerIOError("Couldn't talk to server: %s" % what)
    #print html
    for regex in __where:
        match = regex.search(html)
        if match:
            break
    if not match:
        raise BabelfishChangedError("Can't recognize translated string.")
    return match.group(1)
    #return clean(match.group(1))
def babelize(phrase, from_language, through_language, limit = 12, callback = None):
    phrase = clean(phrase)
    seen = { phrase: 1 }
    if callback:
        callback(phrase)
    else:
        results = [ phrase ]
    flip = { from_language: through_language, through_language: from_language }
    next = from_language
    for i in range(limit):
        phrase = translate(phrase, next, flip[next])
        if seen.has_key(phrase): break
        seen[phrase] = 1
        if callback:
            callback(phrase)
        else:
            results.append(phrase)
        next = flip[next]
    if not callback: return results
if __name__ == '__main__':
    import sys

    def printer(x):
        print x
        sys.stdout.flush()

    babelize("I won't take that sort of treatment from you, or from your doggie!",
             'english', 'french', callback = printer)
## end of http://code.activestate.com/recipes/64937/ }}}
And the test code is:
import babelizer
print ' '.join(babelizer.available_languages)
result = babelizer.translate( 'How much is that dog in the window?', 'English', 'chinese' )
f = open('result.txt', 'wb')
f.write(result)
f.close()
print result
The result is expected inside a div block. I modified the script to save the HTML response. What I found is that all UTF-8 characters are turned to NUL. Do I need to take special care in treating the UTF-8 response?
I think you need to use codecs.open from the codecs module instead of plain open in your saveHTML method, to handle UTF-8 docs. See the Python Unicode HOWTO for a complete explanation.
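A minimal sketch of that change, assuming the Babelfish response body is UTF-8 encoded bytes (which the ei=UTF-8 parameter requests):

import codecs

def saveHTML(txt):
    # decode the raw response bytes, then write them out through a
    # codecs file object with an explicit encoding
    f = codecs.open('page.html', 'w', encoding='utf-8')
    f.write(txt.decode('utf-8'))
    f.close()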