Get all comments from a specific Reddit thread in Python

The official way,
r = praw.Reddit('Comment Scraper 1.0 by u/_Daimon_ see '
                'https://praw.readthedocs.org/en/latest/'
                'pages/comment_parsing.html')
submission = r.get_submission(submission_id='11v36o')
submission.replace_more_comments(limit=None, threshold=0)
is extremely slow. Is there a way to speed this up? There are people who have extracted every Reddit comment into a database, so there must be some way to do this quicker.

Edit: the new praw API (6.0.0) has list(), which makes the job easier.
This also handles the AttributeError that might occur due to more_comments, through the use of replace_more(limit=None):
submissionList = []
submission.comments.replace_more(limit=None)
for comment in submission.comments.list():
    submissionList.append(comment)
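Put together, a minimal end-to-end sketch of that approach (the credentials are placeholders and the submission id is the example one used below; adjust both to your own app):
import praw

# Placeholder credentials; see the praw.Reddit(...) setup shown later in this answer.
reddit = praw.Reddit(user_agent="MyAppName/0.1 by myusernamehere",
                     client_id="myClientId",
                     client_secret="myClientSecret")

submission = reddit.submission(id="6rjwo1")
submission.comments.replace_more(limit=None)  # expand every MoreComments placeholder
allComments = submission.comments.list()      # flat list of every Comment object
print(len(allComments))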
Edit: The new praw api (5.0.1) is magical and makes this much easier. Here is how to do it now:
def getSubComments(comment, allComments, verbose=True):
    allComments.append(comment)
    if not hasattr(comment, "replies"):
        replies = comment.comments()
        if verbose: print("fetching (" + str(len(allComments)) + " comments fetched total)")
    else:
        replies = comment.replies
    for child in replies:
        getSubComments(child, allComments, verbose=verbose)

def getAll(r, submissionId, verbose=True):
    submission = r.submission(submissionId)
    comments = submission.comments
    commentsList = []
    for comment in comments:
        getSubComments(comment, commentsList, verbose=verbose)
    return commentsList
Example usage:
res = getAll(r, "6rjwo1")
# res = getAll(r, "6rjwo1", verbose=False)  # pass verbose=False if you want it to be silent (default is verbose=True)
Where r is
username = 'myusernamehere'
userAgent = "MyAppName/0.1 by " + username
clientId = 'myClientId'
clientSecret = "myClientSecret"
password = "passwordformyusernamehere"
r = praw.Reddit(user_agent=userAgent, client_id=clientId, client_secret=clientSecret)
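Note that in this snippet the username and password defined above are never actually passed to praw.Reddit; for read-only scraping of public threads, client_id/client_secret/user_agent are enough. As a hedged variant, in case your setup does need script-type (password) authorization, the same variables can be passed through:
r = praw.Reddit(user_agent=userAgent,
                client_id=clientId,
                client_secret=clientSecret,
                username=username,    # only needed for script-type (password) auth
                password=password)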
Previous stuff (outdated now):
Okay, I wrote code that can reliably pull every comment from a thread; it takes about 10 seconds for 500 comments and about a minute for 4000 comments. I named it redApi.py. Here it is:
import time
import requests
import requests.auth
import praw

username = 'myusernamehere'
userAgent = "MyAppName/0.1 by " + username
clientId = 'myClientId'
clientSecret = "myClientSecret"
password = "passwordformyusernamehere"

def getPraw():
    return praw.Reddit(user_agent=userAgent, client_id=clientId, client_secret=clientSecret)

global accessToken
accessToken = None

def getAccessToken():
    client_auth = requests.auth.HTTPBasicAuth(clientId, clientSecret)
    post_data = {"grant_type": "password", "username": username, "password": password}
    headers = {"User-Agent": userAgent}
    response = requests.post("https://www.reddit.com/api/v1/access_token", auth=client_auth, data=post_data, headers=headers)
    return response.json()

def makeRequest(apiUrl, useGet=True):
    global accessToken
    if accessToken is None:
        accessToken = getAccessToken()
    headers = {"Authorization": "bearer " + accessToken['access_token'], "User-Agent": userAgent}
    if useGet:
        response = requests.get(apiUrl, headers=headers)
    else:
        response = requests.post(apiUrl, headers=headers)
    time.sleep(1.1)
    responseJson = response.json()
    if 'error' in responseJson:
        if responseJson['error'] == 401:
            print "Refreshing access token"
            time.sleep(1.1)
            accessToken = getAccessToken()
            headers = {"Authorization": "bearer " + accessToken['access_token'], "User-Agent": userAgent}
            time.sleep(1.1)
            response = requests.get(apiUrl, headers=headers)
            responseJson = response.json()
    return responseJson

global prawReddit
prawReddit = praw.Reddit(user_agent=userAgent, client_id=clientId, client_secret=clientSecret)

# Gets any number of posts
def getPosts(subredditName, numPosts=1000):
    global prawReddit
    subreddit = prawReddit.get_subreddit(subredditName)
    postGetter = praw.helpers.submissions_between(prawReddit, subreddit)
    postArray = []
    numGotten = 0
    while numGotten < numPosts:
        postArray.append(postGetter.next())
        numGotten += 1
    return postArray

# Get all comments from a post
# Submission is a praw submission, obtained via:
#   r = redApi.getPraw()
#   submission = r.get_submission(submission_id='2zysz7') # (or some other submission id, found via https://www.reddit.com/r/test/comments/2zysz7/ayy/ - the thing after /comments/)
#   comments = redApi.getComments(submission)
def getComments(submission):
    requestUrl = 'https://oauth.reddit.com/' + submission.subreddit.url + 'comments/article?&limit=1000&showmore=true&article=' + submission.id
    allData = makeRequest(requestUrl)
    articleData = allData[0]
    comments = allData[1]
    curComments = comments['data']['children']
    resultComments = getCommentsHelper(curComments, submission.name, submission)
    return resultComments

# Print out the tree of comments
def printTree(comments):
    return printTreeHelper(comments, "")

def printTreeHelper(comments, curIndentation):
    resultString = ""
    for comment in comments:
        resultString += curIndentation + comment['data']['body'].replace("\n", "\n" + curIndentation) + "\n"
        if not comment['data']['replies'] == "":
            resultString += printTreeHelper(comment['data']['replies']['data']['children'], curIndentation + " ")
    return resultString

# Get all comments as a single array
def flattenTree(comments):
    allComments = []
    for comment in comments:
        allComments.append(comment)
        if not comment['data']['replies'] == "":
            allComments += flattenTree(comment['data']['replies']['data']['children'])
    return allComments

# Utility functions for getComments
def expandCommentList(commentList, submission):
    curComments = commentList
    allComments = {}
    while True:
        thingsToExpand = []
        nextComments = []
        allParents = {}
        for comment in curComments:
            if comment['kind'] == "more":
                thingsToExpand += comment['data']['children']
            else:
                if comment['data']['body'][:len("If they are shipping")] == "If they are shipping":
                    print comment
                allComments[comment['data']['name']] = comment
        if len(thingsToExpand) == 0:
            curComments = []
            break
        curComments = []
        if not len(thingsToExpand) == 0:
            print "total things to expand: " + str(len(thingsToExpand))
            for i in range(0, len(thingsToExpand)/100+1):
                curCommentIds = thingsToExpand[i*100:min((i+1)*100, len(thingsToExpand))]
                requestUrl = 'https://oauth.reddit.com/api/morechildren.json?api_type=json&link_id=' + submission.name + '&limit=1000&showmore=true&children=' + ",".join(curCommentIds)
                curData = makeRequest(requestUrl)
                if 'json' in curData and 'data' in curData['json']:
                    curComments += curData['json']['data']['things']
                print (i+1)*100
        for comment in curComments:
            allComments[comment['data']['name']] = comment
    return allComments.values()

def lookForMore(comment):
    if comment['kind'] == "more":
        return True
    if not comment['data']['replies'] == "":
        for reply in comment['data']['replies']['data']['children']:
            if lookForMore(reply):
                return True
    return False

def getCommentsHelper(curComments, rootId, submission):
    allComments = expandCommentList(curComments, submission)
    commentMap = {}
    for comment in allComments:
        commentMap[comment['data']['name']] = comment
    allRootComments = []
    for comment in allComments:
        if comment['data']['parent_id'] == rootId:
            allRootComments.append(comment)
        elif comment['data']['parent_id'] in commentMap:
            parentComment = commentMap[comment['data']['parent_id']]
            if parentComment['data']['replies'] == "":
                parentComment['data']['replies'] = {'data': {'children': []}}
            alreadyChild = False
            for childComment in parentComment['data']['replies']['data']['children']:
                if childComment['data']['name'] == comment['data']['name']:
                    alreadyChild = True
                    break
            if not alreadyChild:
                parentComment['data']['replies']['data']['children'].append(comment)
        else:
            print "pls halp"
    completedComments = []
    needMoreComments = []
    for comment in allRootComments:
        if not comment['data']['replies'] == "" or comment['kind'] == 'more':
            hasMore = lookForMore(comment)
            if hasMore:
                needMoreComments.append(comment)
            else:
                replyComments = getCommentsHelper(comment['data']['replies']['data']['children'], comment['data']['name'], submission)
                comment['data']['replies']['data']['children'] = replyComments
                completedComments.append(comment)
        else:
            completedComments.append(comment)
    for comment in needMoreComments:
        requestUrl = 'https://oauth.reddit.com/' + submission.subreddit.url + 'comments/article?&limit=1000&showmore=true&article=' + submission.id + "&comment=" + comment['data']['id']
        allData = makeRequest(requestUrl)
        articleData = allData[0]
        comment = allData[1]['data']['children'][0]
        if comment['data']['replies'] == "":
            completedComments.append(comment)
        else:
            comments = comment['data']['replies']['data']['children']
            actualComments = getCommentsHelper(comments, comment['data']['name'], submission)
            comment['data']['replies']['data']['children'] = actualComments
            completedComments.append(comment)
    return completedComments
To use this script, in a python prompt, type the following:
# Get all comments from a post
# Submission is a praw submission, obtained via:
r = redApi.getPraw()
submission = r.get_submission(submission_id='2zysz7') # (or some other submission id, found via https://www.reddit.com/r/test/comments/2zysz7/ayy/ - the thing after /comments/)
comments = redApi.getComments(submission)

Looks like praw has been updated? In 4.5.1 it looks more like:
#!/usr/local/bin/python
import praw

reddit = praw.Reddit(client_id='<client_id>',
                     client_secret='<client_secret>',
                     user_agent='davehodg/0.1')

submission = reddit.submission(id='<submission_id>')
submission = reddit.submission(url='<submission_url>')  # in case you don't have the submission id
for comment in submission.comments.list():
    print(comment.body)
Edit: seems like the most I can get back is 1000 comments?
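If you are hitting that cap, a hedged workaround (using the same replace_more call shown in the 5.0.1/6.0.0 edits above) is to expand the MoreComments placeholders before listing; this is slower, but it should return the whole tree rather than just the first batch:
submission = reddit.submission(id='<submission_id>')
submission.comments.replace_more(limit=None)  # keep fetching "load more comments" batches
for comment in submission.comments.list():
    print(comment.body)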

I'm adding a bunch of prints and debugging, but right now, @danielle, your script does nothing. It just returns back to the prompt.

Related

asynchronous aiohttp requests send nothing, but synchronous requests succeed

I'm trying to get a coin's value on Paraswap, but the async way does not work while the sync one does.
I've tried removing SSL, waiting between requests, and limiting TCP connections, but in the end nothing worked correctly with async/aiohttp ...
I'm working on an old Raspberry Pi, if that's useful.
Here's my code:
async def get_price_as():
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False, limit=8)) as session:
        req_url = "https://apiv5.paraswap.io/prices/"
        param = {}
        param["srcDecimals"] = str(18)
        param["destDecimals"] = str(18)
        param["amount"] = str(10**18)
        for res in os.listdir(wu.chemin_cles):
            W3[res] = Web3(Web3.HTTPProvider(wu.url_provider[res]["rpc"]))
            clef_privee[res] = wu.cle_privee(res)
            param["network"] = wu.chain_ID(res)
            for a in adresses[res]:
                for b in adresses[res]:
                    if b != a:
                        param["srcToken"] = a
                        param["destToken"] = b
                        async with session.get(req_url, params=param) as resp:
                            response = await resp.json(content_type=None)
                            if response.get('priceRoute') == None:
                                print(res, " ", a, "/", b, " : ", response['error'])
                                c_1 += 1
                                r = paraswap.get_price(adresses[res][a], "18", adresses[res][b], "18", wu.chain_ID(res), str(10**18))
                                if r != 1:
                                    c_2 += 1
                                    print(float(r["priceRoute"]["destAmount"])/10**18, "sync ...")
                            else:
                                response = float(response["priceRoute"]["destAmount"])/10**18
                                print(res, " ", a, "/", b, " : ", response)

asyncio.run(get_price_as())
# paraswap
def get_price(srcToken, srcDecimals, destToken, destDecimals, chainID, amount):
    req_url = "https://apiv5.paraswap.io/prices/"
    param = {}
    param["srcToken"] = srcToken
    param["srcDecimals"] = srcDecimals
    param["destToken"] = destToken
    param["destDecimals"] = destDecimals
    param["network"] = chainID
    param["amount"] = amount
    response = requests.get(req_url, params=param)
    if response.status_code == 200:
        return response.json()
    else:
        return 1
It's not pretty yet, but I just want something that works for now ... thanks
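For reference, here is a stripped-down sketch of just the aiohttp part of that request, with the wallet and loop bookkeeping removed; the token addresses and network id are placeholders, and the endpoint is the one used above. Getting a plain GET like this working first can help isolate whether the problem is in aiohttp itself or in the surrounding logic:
import asyncio
import aiohttp

async def fetch_price(session, params):
    # Single GET against the paraswap prices endpoint used in the question.
    async with session.get("https://apiv5.paraswap.io/prices/", params=params) as resp:
        return await resp.json(content_type=None)

async def main():
    params = {
        "srcToken": "0x...",    # placeholder token address
        "destToken": "0x...",   # placeholder token address
        "srcDecimals": "18",
        "destDecimals": "18",
        "amount": str(10**18),
        "network": "137",       # placeholder chain id
    }
    async with aiohttp.ClientSession() as session:
        data = await fetch_price(session, params)
        print(data)

asyncio.run(main())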

How to create test cases for these methods

I have two methods, and below is the test I have written for feedback_selected_material_to_unsplash. I need to convert this test to use mock, and I'm also not sure how to properly write a test for download_image_material. I appreciate any help.
Here is the test case:
def test_feedback_selected_material_to_unsplash_download_location_not_null(self, mock_post):
    URL = 'https://material.richka.co/track/unsplash'
    data = {
        'download_location': self.download_location
    }
    resp = requests.post("www.someurl.com", data=json.dumps(data), headers={'Content-Type': 'application/json'})
    mock_post.assert_called_with(URL, data=json.dumps(data), headers={'Content-Type': 'application/json'})

def feedback_selected_material_to_unsplash(download_location):
    if not download_location == 'null':
        URL = 'https://test/track/unsplash'
        data = {
            'download_location': download_location
        }
        try:
            response = requests.post(URL, data)
            logger.info(response.json())
            logger.info('Done the feedback to unsplash.com')
        except:
            pass
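As a rough illustration of the mock-based version, here is a minimal sketch using unittest.mock.patch. The module path myapp.views is a placeholder for wherever feedback_selected_material_to_unsplash actually lives, and the asserted URL/data simply mirror what the function posts:
from unittest import TestCase, mock

# Placeholder import path; point it at the module that defines the function.
from myapp.views import feedback_selected_material_to_unsplash

class FeedbackToUnsplashTest(TestCase):
    @mock.patch('myapp.views.requests.post')
    def test_posts_feedback_when_location_is_set(self, mock_post):
        feedback_selected_material_to_unsplash('some-download-location')
        # The function calls requests.post(URL, data) positionally.
        mock_post.assert_called_once_with(
            'https://test/track/unsplash',
            {'download_location': 'some-download-location'},
        )

    @mock.patch('myapp.views.requests.post')
    def test_skips_feedback_when_location_is_null(self, mock_post):
        feedback_selected_material_to_unsplash('null')
        mock_post.assert_not_called()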
I don't know how to write a test case for this method:
def download_image_material(request, data, video, keyword, keyword_gui, material_type):
    dname_tmp = settings.BASE_DIR + '/web/static'
    hashedDname = get_path_stock_box()
    saveFolder = gen_hash(gen_fname())
    path = '%s/%s/%s' % (dname_tmp, hashedDname, saveFolder)
    if not os.path.exists(path):
        os.makedirs(path)
    objs = []
    material = Material(generated_video_data=video, keyword=keyword)
    material.is_video = material_type
    material.save()
    for index, datum in enumerate(data):
        if index == 50:
            break
        obj = {}
        obj['word'] = datum['word']
        obj['link'] = datum['link']
        obj['user'] = datum['user']
        obj['download_location'] = datum['download_location'] if 'unsplash.com' in datum['link'] else None
        imgUrl = datum['small']
        try:
            headers = {}
            response, imgRaw = http_get(imgUrl, headers=headers)
        except urllib.error.HTTPError as ex:
            continue
        except:
            continue
        imgUrlWithoutQuery = re.sub(r'\?.*', '', imgUrl)
        suffix = imgUrlWithoutQuery[imgUrlWithoutQuery.rfind('.') + 1:]
        suffix = suffix.lower()
        if suffix in settings.IMG_EXTENSIONS or suffix in settings.VIDEO_EXTENSIONS or suffix.lower() == 'mp4':
            pass
        else:
            mime = response.info()['Content-Type']
            suffix = _mime2suffix(mime)
            if suffix not in settings.IMG_EXTENSIONS and suffix not in settings.VIDEO_EXTENSIONS or suffix.lower() == 'mp4':
                continue
        imgFname = '%s.%s' % (gen_hash(gen_fname()), suffix)
        imgPathFname = '%s/%s/%s/%s' % (dname_tmp, hashedDname, saveFolder, imgFname)
        imgPathFnameSaved = '%s/%s/%s' % (hashedDname, saveFolder, imgFname)
        fout = open(imgPathFname, 'wb')
        fout.write(imgRaw)
        fout.close()
        # process file
        obj['media'] = imgPathFnameSaved
        if suffix in settings.IMG_EXTENSIONS:
            save_image_with_resize_and_rotate(imgPathFname)
            obj['media'] = imgPathFnameSaved
        elif suffix.lower() in settings.VIDEO_EXTENSIONS:
            # convert videos to mp4 and delete original files
            upload.conv2mp4(path)
            os.remove(path)
            hashed_name = imgPathFnameSaved[:imgPathFnameSaved.rfind('.')] + '.mp4'
            obj['media'] = hashed_name
        if suffix == 'mp4':
            obj['video'] = {}
            obj['video']['duration'] = gen.get_video_duration(settings.BASE_DIR + '/web/static/' + obj['media'])
            gen_thumbnail(settings.BASE_DIR + '/web/static/' + obj['media'])
            fname_mp4 = obj['media']
            obj['media'] = fname_mp4[:fname_mp4.rfind('.')] + '.png'
        # process service name
        url = urlparse(datum['link'])
        obj['service'] = {}
        obj['service']['url'] = f'{url.scheme}://{url.netloc}'
        obj['service']['hostname'] = url.netloc
        # get json from database
        material = Material.objects.get(id=material.pk)
        objDb = material.get_json_data()
        objs = []
        if objDb:
            for objOld in objDb:
                objs.append(objOld)
        objs.append(obj)
        material.set_json_data(objs)
        material.save()
        res_json = {'result': True, 'data': obj, 'keyword': keyword_gui, 'pk': material.pk}
        yield json.dumps(res_json) + '\n'
    material.set_json_data(objs)
    material.save()
    yield json.dumps({'result': True, 'end': True, 'pk': material.pk}) + '\n'

Getting ValueError: unknown url type: ' '

I have the code below that iterates through some tracks. Then, for each track, I want to use the Musixmatch API to get and print the lyrics of the track based on the artist name and track name.
Code that iterates through the tracks and prints the lyrics:
for i, v in tracks.items():
    artist = tracks[i]['artist'].replace(" ", "+")
    title = tracks[i]['title'].replace(" ", "+")
    print(tracks)
    print(song_lyric(title, artist))
The print(tracks) output is in this format:
{12: {'trackID': 12, 'title': 'Achtung Baby', 'number': '1', 'artist': 'U2', 'album': 'Achtung Baby', 'albumID': 2, 'duration': '291'}}
When the code is executed, the lyrics for the first tracks are printed, but then an error appears:
Traceback (most recent call last):
  File "C:/Users/Ozzy/PycharmProjects/getData/getData.py", line 239, in <module>
    print(song_lyric(title, artist))
  File "C:/Users/Ozzy/PycharmProjects/getData/getData.py", line 72, in song_lyric
    lyrics_tracking(tracking_url)
  File "C:/Users/Ozzy/PycharmProjects/getData/getData.py", line 79, in lyrics_tracking
    request = urllib.request.Request(querystring)
  File "C:\Users\Ozzy\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 329, in __init__
    self.full_url = url
  File "C:\Users\Ozzy\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 355, in full_url
    self._parse()
  File "C:\Users\Ozzy\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 384, in _parse
    raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: ''
Do you know why this error is appearing?
The methods to get the lyrics from Musixmatch are publicly available:
def song_lyric(song_name, artist_name):
    while True:
        querystring = apiurl_musixmatch + "matcher.lyrics.get?q_track=" + urllib.parse.quote(
            song_name) + "&q_artist=" + urllib.parse.quote(
            artist_name) + "&apikey=" + apikey_musixmatch + "&format=json&f_has_lyrics=1"
        # matcher.lyrics.get?q_track=sexy%20and%20i%20know%20it&q_artist=lmfao
        request = urllib.request.Request(querystring)
        # request.add_header("Authorization", "Bearer " + client_access_token)
        request.add_header("User-Agent",
                           "curl/7.9.8 (i686-pc-linux-gnu) libcurl 7.9.8 (OpenSSL 0.9.6b) (ipv6 enabled)")  # Must include user agent of some sort, otherwise 403 returned
        while True:
            try:
                response = urllib.request.urlopen(request,
                                                  timeout=4)  # timeout set to 4 seconds; automatically retries if times out
                raw = response.read()
            except socket.timeout:
                print("Timeout raised and caught")
                continue
            break
        json_obj = json.loads(raw.decode('utf-8'))
        body = json_obj["message"]["body"]["lyrics"]["lyrics_body"]
        copyright = json_obj["message"]["body"]["lyrics"]["lyrics_copyright"]
        tracking_url = json_obj["message"]["body"]["lyrics"]["html_tracking_url"]
        # print(tracking_url)
        lyrics_tracking(tracking_url)
        return (body + "\n\n" + copyright)

def lyrics_tracking(tracking_url):
    while True:
        querystring = tracking_url
        request = urllib.request.Request(querystring)
        # request.add_header("Authorization", "Bearer " + client_access_token)
        request.add_header("User-Agent",
                           "curl/7.9.8 (i686-pc-linux-gnu) libcurl 7.9.8 (OpenSSL 0.9.6b) (ipv6 enabled)")  # Must include user agent of some sort, otherwise 403 returned
        try:
            response = urllib.request.urlopen(request,
                                              timeout=4)  # timeout set to 4 seconds; automatically retries if times out
            raw = response.read()
        except socket.timeout:
            print("Timeout raised and caught")
            continue
        break
    print(raw)
Full working example that reproduces the error:
import requests
import json
import urllib.request, urllib.error, urllib.parse
import socket

apikey_musixmatch = '0b4a363bbd71974c2634837d5b5d1d9a'  # generated for the example
apiurl_musixmatch = 'http://api.musixmatch.com/ws/1.1/'
api_key = "b088cbedecd40b35dd89e90f55227ac2"  # generated for the example

def song_lyric(song_name, artist_name):
    while True:
        querystring = apiurl_musixmatch + "matcher.lyrics.get?q_track=" + urllib.parse.quote(
            song_name) + "&q_artist=" + urllib.parse.quote(
            artist_name) + "&apikey=" + apikey_musixmatch + "&format=json&f_has_lyrics=1"
        # matcher.lyrics.get?q_track=sexy%20and%20i%20know%20it&q_artist=lmfao
        request = urllib.request.Request(querystring)
        # request.add_header("Authorization", "Bearer " + client_access_token)
        request.add_header("User-Agent",
                           "curl/7.9.8 (i686-pc-linux-gnu) libcurl 7.9.8 (OpenSSL 0.9.6b) (ipv6 enabled)")  # Must include user agent of some sort, otherwise 403 returned
        while True:
            try:
                response = urllib.request.urlopen(request,
                                                  timeout=4)  # timeout set to 4 seconds; automatically retries if times out
                raw = response.read()
            except socket.timeout:
                print("Timeout raised and caught")
                continue
            break
        json_obj = json.loads(raw.decode('utf-8'))
        body = json_obj["message"]["body"]["lyrics"]["lyrics_body"]
        copyright = json_obj["message"]["body"]["lyrics"]["lyrics_copyright"]
        tracking_url = json_obj["message"]["body"]["lyrics"]["html_tracking_url"]
        print("Tracking_url====================" + tracking_url + "==================================")
        lyrics_tracking(tracking_url)
        return (body + "\n\n" + copyright)

def lyrics_tracking(tracking_url):
    while True:
        querystring = tracking_url
        request = urllib.request.Request(querystring)
        # request.add_header("Authorization", "Bearer " + client_access_token)
        request.add_header("User-Agent",
                           "curl/7.9.8 (i686-pc-linux-gnu) libcurl 7.9.8 (OpenSSL 0.9.6b) (ipv6 enabled)")  # Must include user agent of some sort, otherwise 403 returned
        try:
            response = urllib.request.urlopen(request,
                                              timeout=4)  # timeout set to 4 seconds; automatically retries if times out
            raw = response.read()
        except socket.timeout:
            print("Timeout raised and caught")
            continue
        break
    print(raw)

ID = 0
# get top artists from country
artists = {}
for i in range(2, 3):
    artists_response = requests.get(
        'http://ws.audioscrobbler.com/2.0/?method=geo.gettopartists&country=spain&format=json&page=' + str(i) + '&api_key=' + api_key)
    artists_data = artists_response.json()
    for artist in artists_data["topartists"]["artist"]:
        name = artist["name"]
        url = artist["url"]
        if ID > 1: continue
        artists[ID] = {}
        artists[ID]['ID'] = ID
        artists[ID]['name'] = name
        ID += 1

for i, v in artists.items():
    chosen = artists[i]['name'].replace(" ", "+")
    artist_response = requests.get(
        'http://ws.audioscrobbler.com/2.0/?method=artist.getinfo&format=json&artist=' + chosen + '&api_key=' + api_key)
    artist_data = artist_response.json()

# get top albums of the artists
albums = {}
for i, v in artists.items():
    chosen = artists[i]['name'].replace(" ", "+")
    topalbums_response = requests.get(
        'http://ws.audioscrobbler.com/2.0/?method=artist.gettopalbums&format=json&artist=' + chosen + '&api_key=' + api_key + '&limit=5')
    albums_data = topalbums_response.json()
    for album in albums_data['topalbums']['album']:
        name = album["name"]
        url = album["url"]
        albums[ID] = {}
        albums[ID]['ID'] = ID
        albums[ID]['artist'] = artists[i]['name']
        albums[ID]['artistID'] = artists[i]['ID']
        albums[ID]['name'] = name
        ID += 1

# Get tracks of the album
tracks = {}
for i, v in albums.items():
    artist = albums[i]['artist'].replace(" ", "+")
    name = albums[i]['name'].replace(" ", "+")
    album_response_data = requests.get(
        'http://ws.audioscrobbler.com/2.0/?method=album.getinfo&format=json&api_key=' + api_key + '&artist=' + artist + '&album=' + name)
    album_response = album_response_data.json()
    for album in album_response['album']['tracks']['track']:
        title = album['name']
        tracks[ID] = {}
        tracks[ID]['trackID'] = ID
        tracks[ID]['title'] = title
        tracks[ID]['artist'] = albums[i]['artist']
        tracks[ID]['album'] = albums[i]['name']
        tracks[ID]['albumID'] = albums[i]['ID']
        ID += 1

for i, v in tracks.items():
    artist = tracks[i]['artist'].replace(" ", "+")
    title = tracks[i]['title'].replace(" ", "+")
    # print the lyric of each track
    print(song_lyric(title, artist))
It seems like the URL is not correct. It happens here:
tracking_url = json_obj["message"]["body"]["lyrics"]["html_tracking_url"]
If you can run that API locally and inspect what is returned in tracking_url, you can find out what is wrong with it.
UPDATE:
I reproduced it: urllib.request cannot process an empty-string URL (""). That is why you need to check that tracking_url != "", and only request the tracking URL when it is not an empty string or None.
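In practice that guard might look like the following sketch, dropped into song_lyric right where tracking_url is read (the fallback message is just illustrative):
tracking_url = json_obj["message"]["body"]["lyrics"]["html_tracking_url"]
if tracking_url:  # skips both "" and None
    lyrics_tracking(tracking_url)
else:
    print("No tracking URL returned for this track; skipping.")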

headers = {"Authorization": "bearer " + accessToken['access_token'], "User-Agent": userAgent} KeyError: 'access_token'

I found this code in this SO answer; however, when I run it I get this error:
In redcom.py I have:
import time
import requests
import requests.auth
import praw

username = 'myusername'
userAgent = "reddit natural language processing " + username
clientId = 'myclientid'
clientSecret = "myclientsecret"
password = "mypasswd"
# app_uri = http://127.0.0.1:65010/authorize_callback

def getPraw():
    return praw.Reddit(user_agent=userAgent, client_id=clientId, client_secret=clientSecret)

global accessToken
accessToken = None

def getAccessToken():
    client_auth = requests.auth.HTTPBasicAuth(clientId, clientSecret)
    post_data = {"grant_type": "password", "username": username, "password": password}
    headers = {"User-Agent": userAgent}
    response = requests.post("https://www.reddit.com/api/v1/access_token", auth=client_auth, data=post_data, headers=headers)
    return response.json()

def makeRequest(apiUrl, useGet=True):
    global accessToken
    if accessToken is None:
        accessToken = getAccessToken()
    headers = {"Authorization": "bearer " + accessToken['access_token'], "User-Agent": userAgent}
    if useGet:
        response = requests.get(apiUrl, headers=headers)
    else:
        response = requests.post(apiUrl, headers=headers)
    time.sleep(1.1)
    responseJson = response.json()
    if 'error' in responseJson:
        if responseJson['error'] == 401:
            print "Refreshing access token"
            time.sleep(1.1)
            accessToken = getAccessToken()
            headers = {"Authorization": "bearer " + accessToken['access_token'], "User-Agent": userAgent}
            time.sleep(1.1)
            response = requests.get(apiUrl, headers=headers)
            responseJson = response.json()
    return responseJson

global prawReddit
prawReddit = praw.Reddit(user_agent=userAgent, client_id=clientId, client_secret=clientSecret)

# Gets any number of posts
def getPosts(subredditName, numPosts=1000):
    global prawReddit
    subreddit = prawReddit.get_subreddit(subredditName)
    postGetter = praw.helpers.submissions_between(prawReddit, subreddit)
    postArray = []
    numGotten = 0
    while numGotten < numPosts:
        postArray.append(postGetter.next())
        numGotten += 1
    return postArray

# Get all comments from a post
# Submission is a praw submission, obtained via:
#   r = redApi.getPraw()
#   submission = r.get_submission(submission_id='2zysz7') # (or some other submission id, found via https://www.reddit.com/r/test/comments/2zysz7/ayy/ - the thing after /comments/)
#   comments = redApi.getComments(submission)
def getComments(submission):
    requestUrl = 'https://oauth.reddit.com/' + submission.subreddit.url + 'comments/article?&limit=1000&showmore=true&article=' + submission.id
    allData = makeRequest(requestUrl)
    articleData = allData[0]
    comments = allData[1]
    curComments = comments['data']['children']
    resultComments = getCommentsHelper(curComments, submission.name, submission)
    return resultComments

# Print out the tree of comments
def printTree(comments):
    return printTreeHelper(comments, "")

def printTreeHelper(comments, curIndentation):
    resultString = ""
    for comment in comments:
        resultString += curIndentation + comment['data']['body'].replace("\n", "\n" + curIndentation) + "\n"
        if not comment['data']['replies'] == "":
            resultString += printTreeHelper(comment['data']['replies']['data']['children'], curIndentation + " ")
    return resultString

# Get all comments as a single array
def flattenTree(comments):
    allComments = []
    for comment in comments:
        allComments.append(comment)
        if not comment['data']['replies'] == "":
            allComments += flattenTree(comment['data']['replies']['data']['children'])
    return allComments

# Utility functions for getComments
def expandCommentList(commentList, submission):
    curComments = commentList
    allComments = {}
    while True:
        thingsToExpand = []
        nextComments = []
        allParents = {}
        for comment in curComments:
            if comment['kind'] == "more":
                thingsToExpand += comment['data']['children']
            else:
                if comment['data']['body'][:len("If they are shipping")] == "If they are shipping":
                    print comment
                allComments[comment['data']['name']] = comment
        if len(thingsToExpand) == 0:
            curComments = []
            break
        curComments = []
        if not len(thingsToExpand) == 0:
            print "total things to expand: " + str(len(thingsToExpand))
            for i in range(0, len(thingsToExpand)/100+1):
                curCommentIds = thingsToExpand[i*100:min((i+1)*100, len(thingsToExpand))]
                requestUrl = 'https://oauth.reddit.com/api/morechildren.json?api_type=json&link_id=' + submission.name + '&limit=1000&showmore=true&children=' + ",".join(curCommentIds)
                curData = makeRequest(requestUrl)
                if 'json' in curData and 'data' in curData['json']:
                    curComments += curData['json']['data']['things']
                print (i+1)*100
        for comment in curComments:
            allComments[comment['data']['name']] = comment
    return allComments.values()

def lookForMore(comment):
    if comment['kind'] == "more":
        return True
    if not comment['data']['replies'] == "":
        for reply in comment['data']['replies']['data']['children']:
            if lookForMore(reply):
                return True
    return False

def getCommentsHelper(curComments, rootId, submission):
    allComments = expandCommentList(curComments, submission)
    commentMap = {}
    for comment in allComments:
        commentMap[comment['data']['name']] = comment
    allRootComments = []
    for comment in allComments:
        if comment['data']['parent_id'] == rootId:
            allRootComments.append(comment)
        elif comment['data']['parent_id'] in commentMap:
            parentComment = commentMap[comment['data']['parent_id']]
            if parentComment['data']['replies'] == "":
                parentComment['data']['replies'] = {'data': {'children': []}}
            alreadyChild = False
            for childComment in parentComment['data']['replies']['data']['children']:
                if childComment['data']['name'] == comment['data']['name']:
                    alreadyChild = True
                    break
            if not alreadyChild:
                parentComment['data']['replies']['data']['children'].append(comment)
        else:
            print "pls halp"
    completedComments = []
    needMoreComments = []
    for comment in allRootComments:
        if not comment['data']['replies'] == "" or comment['kind'] == 'more':
            hasMore = lookForMore(comment)
            if hasMore:
                needMoreComments.append(comment)
            else:
                replyComments = getCommentsHelper(comment['data']['replies']['data']['children'], comment['data']['name'], submission)
                comment['data']['replies']['data']['children'] = replyComments
                completedComments.append(comment)
        else:
            completedComments.append(comment)
    for comment in needMoreComments:
        requestUrl = 'https://oauth.reddit.com/' + submission.subreddit.url + 'comments/article?&limit=1000&showmore=true&article=' + submission.id + "&comment=" + comment['data']['id']
        allData = makeRequest(requestUrl)
        articleData = allData[0]
        comment = allData[1]['data']['children'][0]
        if comment['data']['replies'] == "":
            completedComments.append(comment)
        else:
            comments = comment['data']['replies']['data']['children']
            actualComments = getCommentsHelper(comments, comment['data']['name'], submission)
            comment['data']['replies']['data']['children'] = actualComments
            completedComments.append(comment)
    return completedComments
and in tester.py I have:
import redcom
r = redcom.getPraw()
submission = r.get_submission(submission_id='z1c9z')
comments = redcom.getComments(submission)
The error is:
$ python tester.py
Traceback (most recent call last):
  File "tester.py", line 4, in <module>
    comments = redcom.getComments(submission)
  File "/home/jalal/computer_vision/reddit/redcom.py", line 80, in getComments
    allData = makeRequest(requestUrl)
  File "/home/jalal/computer_vision/reddit/redcom.py", line 31, in makeRequest
    headers = {"Authorization": "bearer " + accessToken['access_token'], "User-Agent": userAgent}
KeyError: 'access_token'
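A KeyError here simply means the JSON returned by the token endpoint did not contain an 'access_token' field (typically because Reddit rejected the credentials and returned an error payload instead). A minimal diagnostic sketch, reusing the getAccessToken defined above and a hypothetical tokenResponse local, is to inspect the raw response before indexing it:
tokenResponse = getAccessToken()
if 'access_token' not in tokenResponse:
    print(tokenResponse)  # inspect what Reddit actually returned, e.g. an error field
else:
    accessToken = tokenResponse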

Twitter Search API using urllib2

I am a beginner at API calls using Python (or even just API calls in general). I am trying a basic call with the Twitter API.
My code for generating oauth_signature is as follows:
def getSignature(query):
    key_dict['q'] = urllib.quote(query, '')
    finKey = ""
    for key in sorted(key_dict.keys()):
        finKey += key + "=" + key_dict[key] + "&"
    finKey = finKey[:-1]
    finKey = HTTP_METHOD + "&" + urllib.quote(BASE_URL, '') + "&" + urllib.quote(finKey, '')
    key = urllib.quote(CONSUMER_SECRET_KEY, '') + "&" + urllib.quote(ACCESS_TOKEN_SECRET, '')
    hashed = hmac.new(key, finKey, sha1)
    finKey = binascii.b2a_base64(hashed.digest())
    key_dict['oauth_signature'] = urllib.quote(finKey, '')
where key_dict stores all the keys:
key_dict = dict()
key_dict['oauth_consumer_key'] = urllib.quote(CONSUMER_KEY, '')
key_dict['oauth_nonce'] = urllib.quote("9ab59691142584g739134971f75aa986", '')
key_dict['oauth_signature_method'] = urllib.quote("HMAC-SHA1", '')
key_dict['oauth_timestamp'] = urllib.quote(str(int(time.time())), '')
key_dict['oauth_token'] = urllib.quote(ACCESS_TOKEN, '')
key_dict['oauth_version'] = urllib.quote(OAUTH_VERSION, '')
BASE_URL = "https://api.twitter.com/1.1/search/tweets.json?" + urllib.quote("q=delhi+elections", '')
I generate the base header string using the following:
def getHeaderString():
    ret = "OAuth "
    key_list = ['oauth_consumer_key', 'oauth_nonce', 'oauth_signature', 'oauth_signature_method', 'oauth_timestamp', 'oauth_token', 'oauth_version']
    for key in key_list:
        ret = ret + key + "=\"" + key_dict[key] + "\", "
    ret = ret[:-2]
    return ret
However, when I make the call, I get:
urllib2.HTTPError: HTTP Error 401: Unauthorized
OR
urllib2.URLError: <urlopen error [Errno 60] Operation timed out>
My final request is made using the following:
getSignature("delhi+elections")
headers = { 'Authorization' : getHeaderString()}
req = urllib2.Request(BASE_URL, headers= headers)
response = urllib2.urlopen(req)
Where am I going wrong?
A few points that should have been mentioned somewhere:
The method binascii.b2a_base64(hashed.digest()) appends a newline feed at the end of the string. This causes the oauth_signature to fail authentication.
The delhi+elections is actually supposed to be delhi elections. This mismatch again caused the SHA1 hash values not to match.
Removing both of them solved the problem.
The final Code :
key_dict = dict()
key_dict['oauth_consumer_key'] = urllib.quote(CONSUMER_KEY, '')
key_dict['oauth_nonce'] = urllib.quote("9aa39691142584s7df134971375aa986", '')
key_dict['oauth_signature_method'] = urllib.quote("HMAC-SHA1", '')
key_dict['oauth_timestamp'] = urllib.quote(str(int(time.time())), '')
key_dict['oauth_token'] = urllib.quote(ACCESS_TOKEN, '')
key_dict['oauth_version'] = urllib.quote(OAUTH_VERSION, '')
BASE_URL = "https://api.twitter.com/1.1/search/tweets.json"
def getSignature(query):
    key_dict['q'] = urllib.quote(query, '')
    finKey = ""
    for key in sorted(key_dict.keys()):
        finKey += key + "=" + key_dict[key] + "&"
    finKey = finKey[:-1]
    finKey = HTTP_METHOD + "&" + urllib.quote(BASE_URL, '') + "&" + urllib.quote(finKey, '')
    key = urllib.quote(CONSUMER_SECRET_KEY, '') + "&" + urllib.quote(ACCESS_TOKEN_SECRET, '')
    hashed = hmac.new(key, finKey, sha1)
    finKey = binascii.b2a_base64(hashed.digest())[:-1]
    key_dict['oauth_signature'] = urllib.quote(finKey, '')

def getHeaderString():
    ret = "OAuth "
    key_list = ['oauth_consumer_key', 'oauth_nonce', 'oauth_signature', 'oauth_signature_method', 'oauth_timestamp', 'oauth_token', 'oauth_version']
    for key in key_list:
        ret = ret + key + "=\"" + key_dict[key] + "\", "
    ret = ret[:-2]
    return ret
url = BASE_URL
getSignature("delhi elections")
headers = { 'Authorization' : getHeaderString()}
values = {'q':'delhi elections'}
data = urllib.urlencode(values)
req = urllib2.Request(url+"?"+data, headers= headers)
response = urllib2.urlopen(req)
the_page = response.read()
print the_page
Instead of coding your own client, have you tried using tweepy? For a reference implementation using this library, you can check the twitCheck client.
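As a rough sketch of what that would look like with tweepy (assuming the same credential variables as in the question: CONSUMER_KEY, CONSUMER_SECRET_KEY, ACCESS_TOKEN and ACCESS_TOKEN_SECRET; note the search method is named search in tweepy 3.x and search_tweets in 4.x):
import tweepy

# Credential variables are placeholders following the question's naming.
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET_KEY)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth)

# In tweepy 4.x this call is api.search_tweets(...) instead.
for tweet in api.search(q="delhi elections", count=10):
    print(tweet.text)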
