Python Wikipedia library - python

I'm using the Python library Wikipedia to parse data. When it gets to the second part of the code, I'm getting page errors. Page Errors
import wikipedia
print ("1: Searching Wikipedia for 'List of Lexus vehicles'")
try:
print (wikipedia.page('List of Lexus'))
print ('-' * 60)
except wikipedia.exceptions.DisambiguationError as e:
print (str(e))
print ('+' * 60)
print ('DisambiguationError: The page name is ambiguous')
print
print ("2: Searching Wikipedia for 'List of Lexus (vehicles)'")
print (wikipedia.page('List of Lexus_(vehicles)'))
print
result = wikipedia.page('List of Lexus_(vehicles)').content.encode('UTF8')
print ("3: Result of searching Wikipedia for 'List of Lexus_(vehicles)':")
print (result)
print
lexus_count = result.count('ct','lfa','rx')
print
print ("The Wikipedia page for 'Lexus_(company)' has " + \
"{} occurrences of the word 'Lexus'".format(lexus_count))
print
Updated
I'm able to parse page data but getting Type error on count
23 print
24
25 lexus_count = result.count('ct','lfa','rx')
26 print
TypeError: slice indices must be integers or None or have an __index__ method

There were multiple issues with your program. Here is an updated program, with the errors fixed and marked.
import wikipedia
# Step 1: look up the bare title. 'Lexus' may be ambiguous, in which case the
# library raises DisambiguationError listing the candidate pages.
print("1: Searching Wikipedia for 'Lexus'")
try:
    print(wikipedia.page('Lexus'))
    print('-' * 60)
except wikipedia.exceptions.DisambiguationError as e:
    print(str(e))
    print('+' * 60)
    print('DisambiguationError: The page name is ambiguous')
# print('') emits a blank line on both Python 2 and 3; a bare `print` is a
# no-op expression on Python 3.
print('')

# Step 2: fetch the specific page.
# ERR: the page name is separated by a SPACE, not an underscore; with an
# underscore the page is not found and a PageError is raised.
print("2: Searching Wikipedia for 'Lexus (company)'")
result = wikipedia.page('Lexus (company)')
print(result)
print('')

# Step 3: work on the page text itself.
result = result.content
print("3: Result of searching Wikipedia for 'Lexus_(company)':")
print(result)
print('')

# str.count() takes ONE substring; any further positional arguments are the
# optional start/end slice indices. Passing several words, as in
# result.count('ct', 'lfa', 'rx'), therefore fails with
# "TypeError: slice indices must be integers ...".
# The match is also case sensitive, so count 'Lexus', not 'lexus'.
lexus_count = result.count('Lexus')
print('')
print("The Wikipedia page for 'Lexus_(company)' has "
      "{} occurrences of the word 'Lexus'".format(lexus_count))
print('')
Hope this helps.

Which page error exactly are you getting?
According to the wikipedia documentation: https://wikipedia.readthedocs.io/en/latest/quickstart.html#quickstart
But watch out - wikipedia.summary will raise a DisambiguationError if the page is a disambiguation page, or a PageError if the page doesn’t exist (although by default, it tries to find the page you meant with suggest and search.):

Related

How to write correctly a program which extract all links from a web page?

This is part of the Udacity course WEB SEARCH ENGINE. The goal of this quiz is to write a program which extracts all links from a web page. The program must output only the LINKS. But in my case the program returns all the links and "NONE" twice. I know that the error is in the second part of the program, after "WHILE" and after "ELSE", but I don't know what I must write there.
def get_next_target(page):
    """Return the first link target in *page* and where it ends.

    Looks for the first ``<a href=`` marker and extracts the text between
    the next pair of double quotes.

    Returns:
        ``(url, end_quote)`` where ``end_quote`` is the index of the closing
        quote in *page*, or ``(None, 0)`` when *page* contains no
        ``<a href=`` marker.
    """
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    return page[start_quote + 1:end_quote], end_quote
page = 'i know what you doing summer <a href="Udasity".i know what you doing summer <a href="Georgia" i know what you doing summer '
def ALLlink(page):
url = 1
while url != None:
url,endquo = get_next_target(page)
if url:
print url
page = page[endquo:]
else:
print ALLlink(page)
First, you can remove your else statement in your ALLlink() function since it's not doing anything.
Also, when comparing to None, you should use is not instead of !=:
while url != None: # bad
while url is not None # good
That said, I think your error is in your last line:
print ALLlink(page)
You basically have two print statements. The first is inside your function and the second is on the last line of your script. Really, you don't need the last print statement there because you're already printing in your ALLlink() function. So if you change the line to just ALLlink(page), I think it'll work.
If you do want to print there, you could modify your function to store the URLs in an array, and then print that array. Something like this:
def ALLlink(page):
    """Collect every link target found in *page*, in order of appearance.

    Repeatedly asks ``get_next_target`` for the next link and continues
    scanning from just after it. Returns a list of the extracted URLs
    (empty when *page* has no links).
    """
    urls = []
    while True:
        url, endquo = get_next_target(page)
        # Test against None explicitly: an empty target (href="") is falsy
        # but is still a link, and the scan position must advance past it
        # or the loop would never terminate.
        if url is None:
            break
        urls.append(url)
        page = page[endquo:]
    return urls
print ALLlink(page)

How to send scraped data through reddit bot

So I've got this bot that I want to use to reply with the box score of the mets game anytime someone says "mets score" on a specific subreddit. This is my first python project and I plan on using it on a dummy subreddit I created as a learning tool. I'm having trouble sending the scores from the website I scraped through the bot so it can appear in the reply to the "mets score" comments. Any suggestions?
import praw
import time
from lxml import html
import requests
from bs4 import BeautifulSoup
r = praw.Reddit(user_agent = 'my_first_bot')
r.login('user_name', 'password')
def scores():
soup = BeautifulSoup(requests.get("http://scores.nbcsports.com/mlb/scoreboard.asp?day=20160621&meta=true").content, "lxml")
table = soup.find("a",class_="teamName", text="NY Mets").find_previous("table")
a, b = [a.text for a in table.find_all("a",class_="teamName")]
inn, a_score, b_score = ([td.text for td in row.select("td.shsTotD")] for row in table.find_all("tr"))
print (" ".join(inn))
print ("{}: {}".format(a, " ".join(a_score)))
print ("{}: {}".format(b, " ".join(b_score)))
words_to_match = ['mets score']
cache = []
def run_bot():
print("Grabbing subreddit...")
subreddit = r.get_subreddit("random_subreddit")
print("Grabbing comments...")
comments = subreddit.get_comments(limit=40)
for comment in comments:
print(comment.id)
comment_text = comment.body.lower()
isMatch = any(string in comment_text for string in words_to_match)
if comment.id not in cache and isMatch:
print("match found!" + comment.id)
comment.reply('heres the score to last nights mets game...' scores())
print("reply successful")
cache.append(comment.id)
print("loop finished, goodnight")
while True:
run_bot()
time.sleep(120)
I think I'll just put you out of your misery ;). There are multiple issues with your code snippet:
comment.reply('heres the score to last nights mets game...' scores())
The .reply() method requires a string or an object that can have a good enough representation as a string. Assuming the method scores() returns a string, you should concatenate the two arguments, like this:
comment.reply('heres the score to last nights mets game...'+ scores())
It looks like your knowledge of basic python syntax and constructs is dusty. For a quick refresher see this.
Your method scores() doesn't return anything. It just prints out a bunch of lines (which I assume are for debugging purposes).
def scores():
soup = BeautifulSoup(requests.get("http://scores.nbcsports.com/mlb/scoreboard.asp?day=20160621&meta=true").content, "lxml")
.......
print (" ".join(inn))
print ("{}: {}".format(a, " ".join(a_score)))
print ("{}: {}".format(b, " ".join(b_score)))
Funnily enough you could use those exact strings for your return value (or maybe something else entirely, as suit your needs) like this:
def scores():
.......
inn_string = " ".join(inn)
a_string = "{}: {}".format(a, " ".join(a_score))
b_string = "{}: {}".format(b, " ".join(b_score))
return "\n".join([inn_string, a_string, b_string])
These should get you up and running.
More advice: Have you read the Reddit PRAW docs? You should. You should also probably use praw.helpers.comment_stream(). It's simple and easy to use and will handle retrieving new comments for you. Currently you try and fetch a maximum of 40 comments every 120 seconds. What happens when there are more than that many relevant comments in that 120 second span. You'll end up missing some of the comments you should've replied to. .comment_stream() will take care of rate limiting for you so that your bot can reply to each new comment which needs its attention at its own pace. Read more about this here.

Instagram API Tagging Error Using Python

I am trying to print out all the caption text that have 'Starhub' in
it. It works, but I only able to print out a total of 19 text.
Found this function: api.tag_recent_media(count, max_tag_id, tag_name)
- https://github.com/Instagram/python-instagram
So I make use of that function, but it failed. I type 50 counts
meaning I hope that 50 text would come out, but it only print out 19
instead.
My codes:
from instagram.client import InstagramAPI
from instagram.bind import InstagramAPIError
access_token = "<TOKEN>"
client_secret = "<SECRET>"
api = InstagramAPI(access_token=access_token, client_secret=client_secret)
recent_media, next_ = api.tag_recent_media(count=50, tag_name="Starhub")
count = 0
for media in recent_media: try:
print media.caption.text, "--->", media.user.username
print ""
count += 1
except UnicodeEncodeError:
pass
print count
This code print out only 19. So what I try to do instead is this.
I add this inside my code:
while next_:
more_media, next_ =api.tag_recent_media(with_next_url=next_)
recent_media.extend(more_media)
There is error: No paremeter value found for path variable; tag_name.
Any idea how can I print out all text that have the tag 'Starhub'?

else if condition with 3 checks

I am getting 10 result of google search.
My scenario is:
if any result(link) out of 10 belongs to wikipedia, consider that result
Else consider Google instant result (result which appear on top before links) if exist
Else consider description of all 10 link
Here is my code:
for contentIndex in self.search_response['links']:
domain = self.search_response['links'][contentIndex]['domain']
if "wikipedia.org" in domain:
google_query = ''
google_query = self.search_response['links'][contentIndex]['content']
print "wiki link"
break
elif google_instant:
google_query = ''
google_query = google_instant
print "\n \n Instant result : " + google_instant
break
else:
google_query += self.search_response['links'][contentIndex]['content']
But this condition does not work as intended. For example, if the first link is not a wiki link and an instant result is present, then it will not consider the wiki link, but the instant result instead.
You're breaking out of the loop on the google_instant condition. If this condition is met before you find a wikipedia link, then it will always use the google_instant link. What you actually need to do here is keep iterating through the results, then at the end check if there is a wikipedia or google instant link.
# Scan every link before deciding, so a Wikipedia hit later in the list can
# still take priority over the instant result.
search_results = ''
wikipedia_result = None
links = self.search_response['links']
for contentIndex in links:
    link = links[contentIndex]
    if "wikipedia.org" in link['domain']:
        # This assignment must target `wikipedia_result`; a misspelled name
        # here leaves it None and the Wikipedia content is never selected.
        wikipedia_result = link['content']
        print("wiki link")
    elif google_instant:
        # An instant result exists, so the plain descriptions are not needed.
        print("\n \n Instant result : " + google_instant)
    else:
        search_results += link['content']
# Priority: Wikipedia content, then the instant result, then the
# concatenated descriptions of all links.
google_query = wikipedia_result or google_instant or search_results

Need to handle keyerror exception python

I'm getting a keyerror exception when I input a player name here that is not in the records list. I can search it and get back any valid name, but if I input anything else, i get a keyerror. I'm not really sure how to go about handling this since it's kindof confusing already dealing with like 3 sets of data created from parsing my file.
I know this code is bad I'm new to python so please excuse the mess - also note that this is a sortof test file to get this functionality working, which I will then write into functions in my real main file. Kindof a testbed here, if that makes any sense.
This is what my data file, stats4.txt, has in it:
[00000] Cho'Gath - 12/16/3 - Loss - 2012-11-22
[00001] Fizz - 12/5/16 - Win - 2012-11-22
[00002] Caitlyn - 13/4/6 - Win - 2012-11-22
[00003] Sona - 4/5/9 - Loss - 2012-11-23
[00004] Sona - 2/1/20 - Win - 2012-11-23
[00005] Sona - 6/3/17 - Loss - 2012-11-23
[00006] Caitlyn - 14/2/16 - Win - 2012-11-24
[00007] Lux - 10/2/14 - Win - 2012-11-24
[00008] Sona - 8/1/22 - Win - 2012-11-27
Here's my code:
import re
info = {}
records = []
search = []
with open('stats4.txt') as data:
for line in data:
gameid = [item.strip('[') for item in line.split(']')]
del gameid[-1]
gameidstr = ''.join(gameid)
gameid = gameidstr
line = line[7:]
player, stats, outcome, date = [item.strip() for item in line.split('-', 3)]
stats = dict(zip(('kills', 'deaths', 'assists'), map(int, stats.split('/'))))
date = tuple(map(int, date.split('-')))
info[player] = dict(zip(('gameid', 'player', 'stats', 'outcome', 'date'), (gameid, player, stats, outcome, date)))
records.append(tuple((gameid, info[player])))
print "\n\n", info, "\n\n" #print the info dictionary just to see
champ = raw_input() #get champion name
#print info[champ].get('stats').get('kills'), "\n\n"
#print "[%s] %s - %s/%s/%s - %s-%s-%s" % (info[champ].get('gameid'), champ, info[champ].get('stats').get('kills'), info[champ].get('stats').get('deaths'), info[champ].get('stats').get('assists'), info[champ].get('date')[0], info[champ].get('date')[1], info[champ].get('date')[2])
#print "\n\n"
#print info[champ].values()
i = 0
for item in records: #this prints out all records
print "\n", "[%s] %s - %s/%s/%s - %s - %s-%s-%s" % (records[i][0], records[i][1]['player'], records[i][1]['stats']['kills'], records[i][1]['stats']['deaths'], records[i][1]['stats']['assists'], records[i][1]['outcome'], records[i][1]['date'][0], records[i][1]['date'][1], records[i][1]['date'][2])
i = i + 1
print "\n" + "*" * 50
i = 0
for item in records:
if champ in records[i][1]['player']:
search.append(records[i][1])
else:
pass
i = i + 1
s = 0
if not search:
print "no availble records" #how can I get this to print even if nothing is inputted in raw_input above for champ?
print "****"
for item in search:
print "\n[%s] %s - %s/%s/%s - %s - %s-%s-%s" % (search[s]['gameid'], search[s]['player'], search[s]['stats']['kills'], search[s]['stats']['deaths'], search[s]['stats']['assists'], search[s]['outcome'], search[s]['date'][0], search[s]['date'][1], search[s]['date'][2])
s = s + 1
I tried setting up a Try; Except sort of thing but I couldn't get any different result when entering an invalid player name. I think I could probably set something up with a function and returning different things if the name is present or not but I think I've just gotten myself a bit confused. Also notice that no match does indeed print for the 8 records that aren't matches, though thats not quite how I want it to work. Basically I need to get something like that for any invalid input name, not just a valid input that happens to not be in a record in the loop.
Valid input names for this data are:
Cho'Gath, Fizz, Caitlyn, Sona, or Lux - anything else gives a keyerror, thats what I need to handle so it doesn't raise an error and instead just prints something like "no records available for that champion" (and prints that only once, rather then 8 times)
Thanks for any help!
[edit] I was finally able to update this code in the post (thank you martineau for getting it added in; for some reason backticks aren't working to block code and it was showing up as bold normal text when I pasted). Anyway, look at "if not search": how can I get that to print even if nothing is entered at all? When I just press return on raw_input, it currently prints all records after **** even though I didn't give it any search champ.
where is your exact error occurring?
i'm just assuming it is when champ = raw_input() #get champion name
and then info[champ]
you can either check if the key exists first
if champ not in info:
print 'no records avaialble'
or use get
if info.get(champ)
or you can just try and access the key
try:
info[champ]
# do stuff
except KeyError:
print 'no records available'
the more specific you can be in your question the better, although you explained your problem you really didn't include any specifics Please always include a traceback if available, and post the relevant code IN your post not on a link.
Here's some modifications that I think address your problem. I also reformatted the code to make it a little more readable. In Python it's possible to continue long lines onto the next either by ending with a \ or just going to the next line if there's an unpaired '(' or '[' on the previous line.
Also, the way I put code in my questions or answer here is by cutting it out of my text editor and then pasting it into the edit window, after that I make sure it's all selected and then just use the {} tool at the top of edit window to format it all.
import re
from pprint import pprint
def parse_line(line):
    """Parse one line of stats4.txt into a ``(gameid, record)`` tuple.

    A line looks like ``[00003] Sona - 4/5/9 - Loss - 2012-11-23``.
    ``record`` is a dict with the keys 'gameid', 'player', 'stats'
    (a dict of int 'kills'/'deaths'/'assists'), 'outcome' and 'date'
    (a ``(year, month, day)`` tuple of ints).
    """
    gameid, rest = line.split(']', 1)
    gameid = gameid.strip().lstrip('[')
    # Split on ' - ' (with spaces) so the hyphens inside the date, or inside
    # a player name, are not mistaken for field separators.
    player, stats, outcome, date = [
        part.strip() for part in rest.split(' - ', 3)]
    stats = dict(zip(('kills', 'deaths', 'assists'),
                     map(int, stats.split('/'))))
    date = tuple(map(int, date.split('-')))
    record = dict(zip(('gameid', 'player', 'stats', 'outcome', 'date'),
                      (gameid, player, stats, outcome, date)))
    return gameid, record


def format_record(record):
    """Render *record* in the same layout as a stats4.txt line."""
    stats = record['stats']
    year, month, day = record['date']
    # %02d keeps the zero padding that int() dropped from month and day.
    return "[%s] %s - %s/%s/%s - %s - %d-%02d-%02d" % (
        record['gameid'], record['player'],
        stats['kills'], stats['deaths'], stats['assists'],
        record['outcome'], year, month, day)


def find_records(records, champ):
    """Return the records whose player name contains *champ*.

    *records* is a list of ``(gameid, record)`` tuples. Empty or blank
    input yields no matches: the empty string is a substring of every
    name, so without this guard pressing return would list everything.
    """
    champ = champ.strip()
    if not champ:
        return []
    return [record for _, record in records if champ in record['player']]


def main():
    """Load stats4.txt, list every record, then show the searched ones."""
    try:
        read_input = raw_input  # Python 2
    except NameError:
        read_input = input  # Python 3

    info = {}
    records = []
    with open('stats4.txt') as data:
        for line in data:
            if not line.strip():
                continue  # tolerate blank lines in the data file
            gameid, record = parse_line(line)
            # info keeps only the latest game per player; records keeps all.
            info[record['player']] = record
            records.append((gameid, record))

    pprint(info)
    champ = read_input("Champ's name: ")  # get champion name

    for _, record in records:  # this prints out all records
        print("\n" + format_record(record))
    print("\n" + "*" * 50)

    search = find_records(records, champ)
    if not search:
        print("no match")
        return
    for record in search:
        print("\n" + format_record(record))


if __name__ == '__main__':
    main()

Categories