I have a problem with passing a variable from a method to method within a given class.
The code is this (I am practicing beginner):
class Calendar():
def __init__(self,link):
self.link = link
self.request = requests.get(link)
self.request.encoding='UTF-8'
self.soup = BeautifulSoup(self.request.text,'lxml')
def DaysMonth(self):
Dates = []
tds = self.soup.findAll('td', {'class':'action'})
for td in tds:
check = (td.findAll('a')[0].text)
if "Víkendová odstávka" in check:
date = td.findAll('span')[0].text
Dates.append(date)
return Dates
def PrintCal(self):
return ['Víkendová odstávka serverů nastane ' + date + '. den v měsíci.' for date in Dates]
def main(self):
PrintCal(DaysMonth())
I would like to pass the list Dates from the method DaysMonth to the method PrintCal. When I initiate the class, i.e. cal = Calendar('link'), and run cal.PrinCal(), I get that the name Dates has not been defined. If I run cal.DaysMonth(), the output is as expected.
What is the issue here? Thank you!
Dates is a local variable in the DaysMonth method, and is therefore not visible anywhere else. Fortunately, DaysMonth does return Dates, so it's easy to get the value you want. Simply add the following line to your PrintCal method (before the return statement):
Dates = self.DaysMonth()
You are trying to do too much in the Calendar object, particularly in the init method. You don't want to combine the scraping and parsing of a website at the time the object is instantiated. I would use the Calendar object to store and display the results of the scraping/parsing. If you need everything to be object-oriented, than create a separate Scraper/Parser class that handles that part of the logic.
class Calendar():
def __init__(self, dates):
self.dates = dates
def display_dates(self):
return ['Víkendová odstávka serverů nastane ' + date + '. den v měsíci.'
for date in self.dates]
r = requests.get(link, encoding='UTF-8')
soup = BeautifulSoup(r.text,'lxml')
dates = []
for td in soup.findAll('td', {'class':'action'}):
check = (td.findAll('a')[0].text)
if "Víkendová odstávka" in check:
dates.append(td.findAll('span')[0].text)
c = Calendar(dates=dates)
print(c.display_dates)
is expected the below is supposed to run without issues.
Solution to Reddit data:
import requests
import re
import praw
from datetime import date
import csv
import pandas as pd
import time
import sys
class Crawler(object):
'''
basic_url is the reddit site.
headers is for requests.get method
REX is to find submission ids.
'''
def __init__(self, subreddit="apple"):
'''
Initialize a Crawler object.
subreddit is the topic you want to parse. default is r"apple"
basic_url is the reddit site.
headers is for requests.get method
REX is to find submission ids.
submission_ids save all the ids of submission you will parse.
reddit is an object created using praw API. Please check it before you use.
'''
self.basic_url = "https://www.reddit.com"
self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
self.REX = re.compile(r"<div class=\" thing id-t3_[\w]+")
self.subreddit = subreddit
self.submission_ids = []
self.reddit = praw.Reddit(client_id="your_id", client_secret="your_secret", user_agent="subreddit_comments_crawler")
def get_submission_ids(self, pages=2):
'''
Collect all ids of submissions..
One page has 25 submissions.
page url: https://www.reddit.com/r/subreddit/?count25&after=t3_id
id(after) is the last submission from last page.
'''
# This is page url.
url = self.basic_url + "/r/" + self.subreddit
if pages <= 0:
return []
text = requests.get(url, headers=self.headers).text
ids = self.REX.findall(text)
ids = list(map(lambda x: x[-6:], ids))
if pages == 1:
self.submission_ids = ids
return ids
count = 0
after = ids[-1]
for i in range(1, pages):
count += 25
temp_url = self.basic_url + "/r/" + self.subreddit + "?count=" + str(count) + "&after=t3_" + ids[-1]
text = requests.get(temp_url, headers=self.headers).text
temp_list = self.REX.findall(text)
temp_list = list(map(lambda x: x[-6:], temp_list))
ids += temp_list
if count % 100 == 0:
time.sleep(60)
self.submission_ids = ids
return ids
def get_comments(self, submission):
'''
Submission is an object created using praw API.
'''
# Remove all "more comments".
submission.comments.replace_more(limit=None)
comments = []
for each in submission.comments.list():
try:
comments.append((each.id, each.link_id[3:], each.author.name, date.fromtimestamp(each.created_utc).isoformat(), each.score, each.body) )
except AttributeError as e: # Some comments are deleted, we cannot access them.
# print(each.link_id, e)
continue
return comments
def save_comments_submissions(self, pages):
'''
1. Save all the ids of submissions.
2. For each submission, save information of this submission. (submission_id, #comments, score, subreddit, date, title, body_text)
3. Save comments in this submission. (comment_id, submission_id, author, date, score, body_text)
4. Separately, save them to two csv file.
Note: You can link them with submission_id.
Warning: According to the rule of Reddit API, the get action should not be too frequent. Safely, use the defalut time span in this crawler.
'''
print("Start to collect all submission ids...")
self.get_submission_ids(pages)
print("Start to collect comments...This may cost a long time depending on # of pages.")
submission_url = self.basic_url + "/r/" + self.subreddit + "/comments/"
comments = []
submissions = []
count = 0
for idx in self.submission_ids:
temp_url = submission_url + idx
submission = self.reddit.submission(url=temp_url)
submissions.append((submission.name[3:], submission.num_comments, submission.score, submission.subreddit_name_prefixed, date.fromtimestamp(submission.created_utc).isoformat(), submission.title, submission.selftext))
temp_comments = self.get_comments(submission)
comments += temp_comments
count += 1
print(str(count) + " submissions have got...")
if count % 50 == 0:
time.sleep(60)
comments_fieldnames = ["comment_id", "submission_id", "author_name", "post_time", "comment_score", "text"]
df_comments = pd.DataFrame(comments, columns=comments_fieldnames)
df_comments.to_csv("comments.csv")
submissions_fieldnames = ["submission_id", "num_of_comments", "submission_score", "submission_subreddit", "post_date", "submission_title", "text"]
df_submission = pd.DataFrame(submissions, columns=submissions_fieldnames)
df_submission.to_csv("submissions.csv")
return df_comments
if __name__ == "__main__":
args = sys.argv[1:]
if len(args) != 2:
print("Wrong number of args...")
exit()
subreddit, pages = args
c = Crawler(subreddit)
c.save_comments_submissions(int(pages))
but I got:
(base) UserAir:scrape_reddit user$ python reddit_crawler.py apple 2
Start to collect all submission ids...
Traceback (most recent call last):
File "reddit_crawler.py", line 127, in
c.save_comments_submissions(int(pages))
File "reddit_crawler.py", line 94, in save_comments_submissions
self.get_submission_ids(pages)
File "reddit_crawler.py", line 54, in get_submission_ids
after = ids[-1]
IndexError: list index out of range
Erik's answer diagnoses the specific cause of this error, but more broadly I think this is caused by you not using PRAW to its fullest potential. Your script imports requests and performs a lot of manual requests that PRAW has methods for already. The whole point of PRAW is to prevent you from having to write these requests that do things such as paginate a listing, so I recommend you take advantage of that.
As an example, your get_submission_ids function (which scrapes the web version of Reddit and handles paginating) could be replaced by just
def get_submission_ids(self, pages=2):
return [
submission.id
for submission in self.reddit.subreddit(self.subreddit).hot(
limit=25 * pages
)
]
because the .hot() function does everything you tried to do by hand.
I'm going to go one step further here and have the function just return a list of Submission objects, because the rest of your code ends up doing things that would better by done by interacting with the PRAW Submission object. Here's that code (I renamed the function to reflect its updated purpose):
def get_submissions(self, pages=2):
return list(self.reddit.subreddit(self.subreddit).hot(limit=25 * pages))
(I've updated this function to just return its result, as your version both returns the value and sets it as self.submission_ids, unless pages is 0. That felt quite inconsistent, so I made it just return the value.)
Your get_comments function looks good.
The save_comments_submissions function, like get_submission_ids, does a lot of manual work that PRAW can handle. You construct a temp_url that has the full URL of a post, and then use that to make a PRAW Submission object, but we can replace that with directly using the one returned by get_submissions. You also have some calls to time.sleep() which I removed because PRAW will automatically sleep the appropriate amount for you. Lastly, I removed the return value of this function because the point of the function is to save data to disk, not to return it to anywhere else, and the rest of your script doesn't use the return value. Here's the updated version of that function:
def save_comments_submissions(self, pages):
"""
1. Save all the ids of submissions.
2. For each submission, save information of this submission. (submission_id, #comments, score, subreddit, date, title, body_text)
3. Save comments in this submission. (comment_id, submission_id, author, date, score, body_text)
4. Separately, save them to two csv file.
Note: You can link them with submission_id.
Warning: According to the rule of Reddit API, the get action should not be too frequent. Safely, use the defalut time span in this crawler.
"""
print("Start to collect all submission ids...")
submissions = self.get_submissions(pages)
print(
"Start to collect comments...This may cost a long time depending on # of pages."
)
comments = []
pandas_submissions = []
for count, submission in enumerate(submissions):
pandas_submissions.append(
(
submission.name[3:],
submission.num_comments,
submission.score,
submission.subreddit_name_prefixed,
date.fromtimestamp(submission.created_utc).isoformat(),
submission.title,
submission.selftext,
)
)
temp_comments = self.get_comments(submission)
comments += temp_comments
print(str(count) + " submissions have got...")
comments_fieldnames = [
"comment_id",
"submission_id",
"author_name",
"post_time",
"comment_score",
"text",
]
df_comments = pd.DataFrame(comments, columns=comments_fieldnames)
df_comments.to_csv("comments.csv")
submissions_fieldnames = [
"submission_id",
"num_of_comments",
"submission_score",
"submission_subreddit",
"post_date",
"submission_title",
"text",
]
df_submission = pd.DataFrame(pandas_submissions, columns=submissions_fieldnames)
df_submission.to_csv("submissions.csv")
Here's an updated version of the whole script that uses PRAW fully:
from datetime import date
import sys
import pandas as pd
import praw
class Crawler:
"""
basic_url is the reddit site.
headers is for requests.get method
REX is to find submission ids.
"""
def __init__(self, subreddit="apple"):
"""
Initialize a Crawler object.
subreddit is the topic you want to parse. default is r"apple"
basic_url is the reddit site.
headers is for requests.get method
REX is to find submission ids.
submission_ids save all the ids of submission you will parse.
reddit is an object created using praw API. Please check it before you use.
"""
self.subreddit = subreddit
self.submission_ids = []
self.reddit = praw.Reddit(
client_id="your_id",
client_secret="your_secret",
user_agent="subreddit_comments_crawler",
)
def get_submissions(self, pages=2):
"""
Collect all submissions..
One page has 25 submissions.
page url: https://www.reddit.com/r/subreddit/?count25&after=t3_id
id(after) is the last submission from last page.
"""
return list(self.reddit.subreddit(self.subreddit).hot(limit=25 * pages))
def get_comments(self, submission):
"""
Submission is an object created using praw API.
"""
# Remove all "more comments".
submission.comments.replace_more(limit=None)
comments = []
for each in submission.comments.list():
try:
comments.append(
(
each.id,
each.link_id[3:],
each.author.name,
date.fromtimestamp(each.created_utc).isoformat(),
each.score,
each.body,
)
)
except AttributeError as e: # Some comments are deleted, we cannot access them.
# print(each.link_id, e)
continue
return comments
def save_comments_submissions(self, pages):
"""
1. Save all the ids of submissions.
2. For each submission, save information of this submission. (submission_id, #comments, score, subreddit, date, title, body_text)
3. Save comments in this submission. (comment_id, submission_id, author, date, score, body_text)
4. Separately, save them to two csv file.
Note: You can link them with submission_id.
Warning: According to the rule of Reddit API, the get action should not be too frequent. Safely, use the defalut time span in this crawler.
"""
print("Start to collect all submission ids...")
submissions = self.get_submissions(pages)
print(
"Start to collect comments...This may cost a long time depending on # of pages."
)
comments = []
pandas_submissions = []
for count, submission in enumerate(submissions):
pandas_submissions.append(
(
submission.name[3:],
submission.num_comments,
submission.score,
submission.subreddit_name_prefixed,
date.fromtimestamp(submission.created_utc).isoformat(),
submission.title,
submission.selftext,
)
)
temp_comments = self.get_comments(submission)
comments += temp_comments
print(str(count) + " submissions have got...")
comments_fieldnames = [
"comment_id",
"submission_id",
"author_name",
"post_time",
"comment_score",
"text",
]
df_comments = pd.DataFrame(comments, columns=comments_fieldnames)
df_comments.to_csv("comments.csv")
submissions_fieldnames = [
"submission_id",
"num_of_comments",
"submission_score",
"submission_subreddit",
"post_date",
"submission_title",
"text",
]
df_submission = pd.DataFrame(pandas_submissions, columns=submissions_fieldnames)
df_submission.to_csv("submissions.csv")
if __name__ == "__main__":
args = sys.argv[1:]
if len(args) != 2:
print("Wrong number of args...")
exit()
subreddit, pages = args
c = Crawler(subreddit)
c.save_comments_submissions(int(pages))
I realize that my answer here gets into Code Review territory, but I hope that this answer is helpful for understanding some of the things PRAW can do. Your "list index out of range" error would have been avoided by using the pre-existing library code, so I do consider this to be a solution to your problem.
When my_list[-1] throws an IndexError, it means that my_list is empty:
>>> ids = []
>>> ids[-1]
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
IndexError: list index out of range
>>> ids = ['1']
>>> ids[-1]
'1'
I'm trying to parse a JSON of a sites stock.
The JSON: https://www.ssense.com/en-us/men/sneakers.json
So I want to take some keywords from the user. Then I want to parse the JSON using these keywords to find the name of the item and (in this specific case) return the ID, SKU and the URL.
So for example:
If I inputted "Black Fennec" I want to parse the JSON and find the ID,SKU, and URL of Black Fennec Sneakers (that have an ID of 3297299, a SKU of 191422M237006, and a url of /men/product/ps-paul-smith/black-fennec-sneakers/3297299 )
I have never attempted doing anything like this. Based on some guides that show how to parse a JSON I started out with this:
r = requests.Session()
stock = r.get("https://www.ssense.com/en-us/men/sneakers.json",headers = headers)
obj json_data = json.loads(stock.text)
However I am now confused. How do I find the product based off the keywords and how do I get the ID,Url and the SKU or it?
Theres a number of ways to handle the output. not sure what you want to do with it. But this should get you going.
EDIT 1:
import requests
r = requests.Session()
obj_json_data = r.get("https://www.ssense.com/en-us/men/sneakers.json").json()
products = obj_json_data['products']
keyword = input('Enter a keyword: ')
for product in products:
if keyword.upper() in product['name'].upper():
name = product['name']
id_var = product['id']
sku = product['sku']
url = product['url']
print ('Product: %s\nID: %s\nSKU: %s\nURL: %s' %(name, id_var, sku, url))
# if you only want to return the first match, uncomment next line
#break
I also have it setup to store it into a dataframe, and or a list too. Just to give some options of where to go with it.
import requests
import pandas as pd
r = requests.Session()
obj_json_data = r.get("https://www.ssense.com/en-us/men/sneakers.json").json()
products = obj_json_data['products']
keyword = input('Enter a keyword: ')
products_found = []
results = pd.DataFrame()
for product in products:
if keyword.upper() in product['name'].upper():
name = product['name']
id_var = product['id']
sku = product['sku']
url = product['url']
temp_df = pd.DataFrame([[name, id_var, sku, url]], columns=['name','id','sku','url'])
results = results.append(temp_df)
products_found = products_found.append(name)
print ('Product: %s\nID: %s\nSKU: %s\nURL: %s' %(name, id_var, sku, url))
if products_found == []:
print ('Nothing found')
EDIT 2: Here is another way to do it by converting the json to a dataframe, then filtering by those rows that have the keyword in the name (this is actually a better solution in my opinion)
import requests
import pandas as pd
from pandas.io.json import json_normalize
r = requests.Session()
obj_json_data = r.get("https://www.ssense.com/en-us/men/sneakers.json").json()
products = obj_json_data['products']
products_df = json_normalize(products)
keyword = input('Enter a keyword: ')
products_found = []
results = pd.DataFrame()
results = products_df[products_df['name'].str.contains(keyword, case = False)]
#print (results[['name', 'id', 'sku', 'url']])
products_found = list(results['name'])
if products_found == []:
print ('Nothing found')
else:
print ('Found: '+ str(products_found))
So sorry about this vague and confusing title. But there is no really better way for me to summarize my problem in one sentence.
I was trying to get the student and grade information from a french website. The link is this (http://www.bankexam.fr/resultat/2014/BACCALAUREAT/AMIENS?filiere=BACS)
My code is as follows:
import time
import urllib2
from bs4 import BeautifulSoup
regions = {'R\xc3\xa9sultats Bac Amiens 2014':'/resultat/2014/BACCALAUREAT/AMIENS'}
base_url = 'http://www.bankexam.fr'
tests = {'es':'?filiere=BACES','s':'?filiere=BACS','l':'?filiere=BACL'}
for i in regions:
for x in tests:
# create the output file
output_file = open('/Users/student project/'+ i + '_' + x + '.txt','a')
time.sleep(2) #compassionate scraping
section_url = base_url + regions[i] + tests[x] #now goes to the x test page of region i
request = urllib2.Request(section_url)
response = urllib2.urlopen(request)
soup = BeautifulSoup(response,'html.parser')
content = soup.find('div',id='zone_res')
for row in content.find_all('tr'):
if row.td:
student = row.find_all('td')
name = student[0].strong.string.encode('utf8').strip()
try:
school = student[1].strong.string.encode('utf8')
except AttributeError:
school = 'NA'
result = student[2].span.string.encode('utf8')
output_file.write ('%s|%s|%s\n' % (name,school,result))
# Find the maximum pages to go through
if soup.find('div','pagination'):
import re
page_info = soup.find('div','pagination')
pages = []
for i in page_info.find_all('a',re.compile('elt')):
try:
pages.append(int(i.string.encode('utf8')))
except ValueError:
continue
max_page = max(pages)
# Now goes through page 2 to max page
for i in range(1,max_page):
page_url = '&p='+str(i)+'#anchor'
section2_url = section_url+page_url
request = urllib2.Request(section2_url)
response = urllib2.urlopen(request)
soup = BeautifulSoup(response,'html.parser')
content = soup.find('div',id='zone_res')
for row in content.find_all('tr'):
if row.td:
student = row.find_all('td')
name = student[0].strong.string.encode('utf8').strip()
try:
school = student[1].strong.string.encode('utf8')
except AttributeError:
school = 'NA'
result = student[2].span.string.encode('utf8')
output_file.write ('%s|%s|%s\n' % (name,school,result))
A little more description about the code:
I created a 'regions' dictionary and 'tests' dictionary because there are 30 other regions I need to collect and I just include one here for showcase. And I'm just interested in the test results of three tests (ES, S, L) and so I created this 'tests' dictionary.
Two errors keep showing up,
one is
KeyError: 2
and the error is linked to line 12,
section_url = base_url + regions[i] + tests[x]
The other is
TypeError: cannot concatenate 'str' and 'int' objects
and this is linked to line 10.
I know there is a lot of information here and I'm probably not listing the most important info for you to help me. But let me know how I can do to fix this!
Thanks
The issue is that you're using the variable i in more than one place.
Near the top of the file, you do:
for i in regions:
So, in some places i is expected to be a key into the regions dictionary.
The trouble comes when you use it again later. You do so in two places:
for i in page_info.find_all('a',re.compile('elt')):
And:
for i in range(1,max_page):
The second of these is what is causing your exceptions, as the integer values that get assigned to i don't appear in the regions dict (nor can an integer be added to a string).
I suggest renaming some or all of those variables. Give them meaningful names, if possible (i is perhaps acceptable for an "index" variable, but I'd avoid using it for anything else unless you're code golfing).