I have been trying to make a bot that searches for a specific keyword in a Reddit post title and, if the keyword matches, comments something in that thread. Everything works fine, except for one problem: after around 4 hours of running it keeps searching but stops commenting, and I have no idea why. When I restart it, it works again.
It seems to happen around 3pm PST every day: it keeps printing that it is searching, but it just won't comment even when there are posts that contain the keywords. Is this something Reddit does to stop bots, or is something wrong with my code?
Before, I had the praw.Reddit statement outside of my 3 subreddit functions, but I wanted to test whether reconnecting to PRAW after every search would stop the issue.
In short, my Reddit bot stops commenting after a certain point. Is there any way I can fix this, or is it permanent?
#!/usr/bin/python
import praw
import pdb
import re
import os
import threading
import time

sub1_array = ['Title']
sub1_link = ['Comment']

def sub1():
    reddit = praw.Reddit('bot1')
    if not os.path.isfile("posts_replied_to.txt"):
        posts_replied_to = []
    else:
        with open("posts_replied_to.txt", "r") as f:
            posts_replied_to = f.read()
            posts_replied_to = posts_replied_to.split("\n")
            posts_replied_to = list(filter(None, posts_replied_to))
    subreddit = reddit.subreddit('sub1')
    print("Checking sub1")
    for submission in subreddit.new(limit=20):
        i = 0
        while i <= (len(sub1_array) - 1):
            # If we haven't replied to this post before
            if submission.id not in posts_replied_to:
                # Do a case insensitive search
                if re.search(sub1_array[i], submission.title, re.IGNORECASE):
                    # Reply to the post
                    submission.reply(sub1_link[i])
                    print("Bot replying to match: ", submission.title)
                    del sub1_array[i]
                    del sub1_link[i]
                    posts_replied_to.append(submission.id)
                    time.sleep(100)
                else:
                    i += 1
            else:
                i += 1
    with open("posts_replied_to.txt", "w") as f:
        for post_id in posts_replied_to:
            f.write(post_id + "\n")
sub2_array = ['Title']
sub2_link = ['Comment']

def sub2():
    reddit = praw.Reddit('bot1')
    if not os.path.isfile("posts_replied_to.txt"):
        posts_replied_to = []
    else:
        with open("posts_replied_to.txt", "r") as f:
            posts_replied_to = f.read()
            posts_replied_to = posts_replied_to.split("\n")
            posts_replied_to = list(filter(None, posts_replied_to))
    subreddit = reddit.subreddit('sub2')
    print("Checking sub2")
    for submission in subreddit.new(limit=20):
        #print(submission.title)
        i = 0
        while i <= (len(sub2_array) - 1):
            # If we haven't replied to this post before
            if submission.id not in posts_replied_to:
                # Do a case insensitive search
                if re.search(sub2_array[i], submission.title, re.IGNORECASE):
                    # Reply to the post
                    submission.reply(sub2_link[i])
                    print("Bot replying to match: ", submission.title)
                    del sub2_array[i]
                    del sub2_link[i]
                    posts_replied_to.append(submission.id)
                    time.sleep(100)
                else:
                    i += 1
            else:
                i += 1
    with open("posts_replied_to.txt", "w") as f:
        for post_id in posts_replied_to:
            f.write(post_id + "\n")
sub3_array = ['Title']
sub3_link = ['Comment']

def sub3():
    reddit = praw.Reddit('bot1')
    if not os.path.isfile("posts_replied_to.txt"):
        posts_replied_to = []
    else:
        with open("posts_replied_to.txt", "r") as f:
            posts_replied_to = f.read()
            posts_replied_to = posts_replied_to.split("\n")
            posts_replied_to = list(filter(None, posts_replied_to))
    subreddit = reddit.subreddit('sub3')
    print("Checking sub3")
    for submission in subreddit.new(limit=20):
        #print(submission.title)
        i = 0
        while i <= (len(sub3_array) - 1):
            # If we haven't replied to this post before
            if submission.id not in posts_replied_to:
                # Do a case insensitive search
                if re.search(sub3_array[i], submission.title, re.IGNORECASE):
                    # Reply to the post
                    submission.reply(sub3_link[i])
                    print("Bot replying to match: ", submission.title)
                    del sub3_array[i]
                    del sub3_link[i]
                    posts_replied_to.append(submission.id)
                    time.sleep(100)
                else:
                    i += 1
            else:
                i += 1
    with open("posts_replied_to.txt", "w") as f:
        for post_id in posts_replied_to:
            f.write(post_id + "\n")
def should_reset_timer():
    pass

def main():
    sub1()
    sub2()
    sub3()
    timer = 0
    while True:
        time.sleep(1)
        timer += 1
        if should_reset_timer():
            timer = 0
        if timer == 1*30:
            sub1()
            sub2()
            sub3()
            timer = 0

# Store the current id into our list
# Write our updated list back to the file
main()
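In case it helps with debugging, here is a minimal sketch (not part of the original script) of how the reply call could be wrapped so that PRAW's API errors, such as rate-limit responses, get logged rather than passing unnoticed. The helper name try_reply is made up for illustration, and the exception classes are the ones recent PRAW versions (7.x) expose in praw.exceptions:

import praw

def try_reply(submission, text):
    # Attempt to reply; report the reason if Reddit refuses (illustrative helper).
    try:
        submission.reply(text)
        return True
    except praw.exceptions.RedditAPIException as e:
        # Reddit rejected the comment, e.g. a RATELIMIT response
        print("Reddit rejected the reply:", e)
    except praw.exceptions.PRAWException as e:
        # Any other PRAW-level problem
        print("PRAW error while replying:", e)
    return False

If Reddit is refusing the comments, for example with a RATELIMIT message, the reason would show up in this output.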
I am trying to run a multithreaded email checker to see if the emails are valid Office 365 accounts.
Looking over and over my code, I cannot seem to find the reason it's not working correctly.
It should be appending the email to a GOOD or BAD list.
Instead, it's not appending anything!
This is my code:
...
currentDirectory = os.getcwd() # set the current directory - /new/

# Locations
location_emails_goods = currentDirectory + '/contacts/goods/'
location_emails_bads = currentDirectory + '/contacts/bads/'
location_emails = currentDirectory + '/contacts/contacts.txt'

now = datetime.now()
todayString = now.strftime('%d-%m-%Y-%H-%M-%S')

FILE_NAME_DATE_GOODS = None
FILE_NAME_DATE_BADS = None

# Get all emails
def get_contacts(filename):
    emails = []
    with open(filename, mode='r', encoding='utf-8') as contacts_file:
        for a_contact in contacts_file:
            emails.append(a_contact.strip())
    return emails

ALL_EMAILS = get_contacts(location_emails)
url = 'https://login.microsoftonline.com/common/GetCredentialType'

def saveLogs():
    global GOOD_EMAILS_ARRAY, BAD_EMAILS_ARRAY, file_bads, file_goods, FILE_NAME_DATE_GOODS, FILE_NAME_DATE_BADS
    #print(GOOD_EMAILS_ARRAY)
    for good in GOOD_EMAILS_ARRAY:
        file_goods.write(good + '\n')
    file_goods.close()
    for bad in BAD_EMAILS_ARRAY:
        file_bads.write(bad + '\n')
    file_bads.close()

def newChecker(email):
    global url, GOOD_EMAILS_ARRAY, BAD_EMAILS_ARRAY
    s = req.session()
    body = '{"Username":"%s"}' % email
    request = req.post(url, data=body)
    response = request.text
    valid = re.search('"IfExistsResult":0,', response)
    invalid = re.search('"IfExistsResult":1,', response)
    if invalid:
        BAD_EMAILS_ARRAY.append(email)
        if valid:
            GOOD_EMAILS_ARRAY.append(email)
    else:
        if valid:
            GOOD_EMAILS_ARRAY.append(email)
        else:
            BAD_EMAILS_ARRAY.append(email)
    # The following shows an empty array even though I have defined GOOD_EMAILS_ARRAY globally, so it should be updating
    print(GOOD_EMAILS_ARRAY)

def mp_handler(p):
    global ALL_EMAILS
    p.map(newChecker, ALL_EMAILS)

if __name__ == '__main__':
    # For each email, parse it into our checker
    # Define a filename to save to
    FILE_NAME_DATE_GOODS = '{}{}{}'.format(location_emails_goods, todayString, '.txt')
    FILE_NAME_DATE_BADS = '{}{}{}'.format(location_emails_bads, todayString, '.txt')

    file_bads = open(FILE_NAME_DATE_BADS, 'a')
    file_goods = open(FILE_NAME_DATE_GOODS, 'a')

    p = multiprocessing.Pool(500)
    mp_handler(p)
    saveLogs()
    p.close()
As you can see, I am trying to append an email to either GOOD_EMAILS_ARRAY or BAD_EMAILS_ARRAY.
GOOD_EMAILS_ARRAY and BAD_EMAILS_ARRAY are global variables, but for some reason nothing gets appended to them.
In case it matters, I am running this through multiprocessing.
Any ideas, or errors you can see in my code?
Processes do not share memory; a global variable with the same name in two processes refers to two different objects.
If you need to share state between processes, see this:
https://docs.python.org/3/library/multiprocessing.html#sharing-state-between-processes
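A minimal sketch of that behaviour (the names worker and results below are only illustrative, not from the question's code): each Pool worker runs in its own process, so appending to a plain module-level list only changes that worker's private copy, while return values are sent back to the parent.

import multiprocessing

results = []  # plain global: every worker process gets its own copy

def worker(n):
    results.append(n)   # mutates the copy inside the child process only
    return n * n        # return values, by contrast, are pickled back to the parent

if __name__ == '__main__':
    with multiprocessing.Pool(4) as pool:
        squares = pool.map(worker, range(5))
    print(results)  # [] in the parent, even though every worker appended
    print(squares)  # [0, 1, 4, 9, 16]

A multiprocessing.Manager list, as in the accepted fix below, is the other way to make the appends visible across processes.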
Okay so it turns out that I just needed to use the Manager from multiprocessing:
from multiprocessing import Manager, Pool
then I could use a normal array through the manager such as:
# Set empty arrays using manager so we can carry it over
manager = Manager()
bad_list = manager.list()
good_list = manager.list()
This allowed me to use my script as it was, just with these new Manager-backed lists, which works just how I wanted :)
...
FILE_NAME_DATE_GOODS = None
FILE_NAME_DATE_BADS = None

# Set empty arrays using manager so we can carry it over
manager = Manager()
bad_list = manager.list()
good_list = manager.list()

# Get all emails
def get_contacts(filename):
    emails = []
    with open(filename, mode='r', encoding='utf-8') as contacts_file:
        for a_contact in contacts_file:
            emails.append(a_contact.strip())
    return emails

ALL_EMAILS = get_contacts(location_emails)
url = 'https://login.microsoftonline.com/common/GetCredentialType'

def saveLogs():
    global file_bads, file_goods, FILE_NAME_DATE_GOODS, FILE_NAME_DATE_BADS, good_list, bad_list
    for good in good_list:
        file_goods.write(good + '\n')
    file_goods.close()
    for bad in bad_list:
        file_bads.write(bad + '\n')
    file_bads.close()
    print('{} => Fully completed email scanning'.format(Fore.CYAN))
    print('{} => Good emails [{}] || Bad emails [{}]'.format(Fore.GREEN, FILE_NAME_DATE_GOODS, FILE_NAME_DATE_BADS))

def newChecker(email):
    global url, good_list, bad_list
    s = req.session()
    body = '{"Username":"%s"}' % email
    request = req.post(url, data=body)
    response = request.text
    valid = re.search('"IfExistsResult":0,', response)
    invalid = re.search('"IfExistsResult":1,', response)
    if invalid:
        bad_list.append(email)
        if valid:
            good_list.append(email)
    else:
        if valid:
            good_list.append(email)
        else:
            bad_list.append(email)

def mp_handler(p):
    global ALL_EMAILS
    p.map(newChecker, ALL_EMAILS)

if __name__ == '__main__':
    # For each email, parse it into our checker
    # Define a filename to save to
    FILE_NAME_DATE_GOODS = '{}{}{}'.format(location_emails_goods, todayString, '.txt')
    FILE_NAME_DATE_BADS = '{}{}{}'.format(location_emails_bads, todayString, '.txt')

    file_bads = open(FILE_NAME_DATE_BADS, 'a')
    file_goods = open(FILE_NAME_DATE_GOODS, 'a')

    p = multiprocessing.Pool(500)
    mp_handler(p)
    saveLogs()
    p.close()
I created this Python script to add 100 members, but it only adds 3 to 5 (sometimes just 1) and then stops automatically. I don't know what the problem is, but I think it may be in the timing.
from telethon.sync import TelegramClient
from telethon.tl.functions.messages import GetDialogsRequest
from telethon.tl.types import InputPeerEmpty, InputPeerChannel, InputPeerUser, PeerUser
from telethon.errors.rpcerrorlist import PeerFloodError, UserPrivacyRestrictedError, ChatWriteForbiddenError, UserAlreadyParticipantError
from telethon.tl.functions.channels import InviteToChannelRequest
from telethon.tl.functions.channels import GetFullChannelRequest, JoinChannelRequest
from telethon import types, utils, errors
import configparser
import sys
import csv
from csv import reader
import traceback
import time
import random
from telethon.sessions import StringSession
print(" ")
print(" " )
print(" ")
print(" ")
delta = 1

with open('../phone.csv', 'r') as read_obj:
    csv_reader = reader(read_obj)
    list_of_rows = list(csv_reader)
    row_number = delta
    col_number = 1
    value = list_of_rows[row_number - 1][col_number - 1]

with open('../api.csv', 'r') as api_obj_id:
    csv_reader = reader(api_obj_id)
    list_of_rows = list(csv_reader)
    row_number = delta
    col_number = 1
    deltaop = list_of_rows[row_number - 1][col_number - 1]

with open('../api.csv', 'r') as hash_obj:
    csv_reader = reader(hash_obj)
    list_of_rows = list(csv_reader)
    row_number = delta
    col_number = 2
    deltaxd = list_of_rows[row_number - 1][col_number - 1]

api_id = int(deltaop)
api_hash = str(deltaxd)
pphone = value

config = configparser.ConfigParser()
config.read("../config.ini")
to_group = config['Telegram']['to_channel']
def autos():
    channel_username = to_group
    phone = utils.parse_phone(pphone)
    client = TelegramClient(f"../sessions/{phone}", api_id, api_hash)
    client.connect()
    if not client.is_user_authorized():
        print('some thing has changed')
        client.send_code_request(phone)
        client.sign_in(phone, input('Enter the code: '))

    input_file = '../data.csv'
    users = []
    with open(input_file, encoding='UTF-8') as f:
        rows = csv.reader(f, delimiter=",", lineterminator="\n")
        next(rows, None)
        for row in rows:
            user = {}
            user['srno'] = row[0]
            user['username'] = row[1]
            user['id'] = int(row[2])
            #user['access_hash'] = int(row[2])
            user['name'] = row[3]
            users.append(user)

    startfrom = int(input("Start From = "))
    endto = int(input("End To = "))

    for user in users:
        if (int(startfrom) <= int(user['srno'])) and (int(user['srno']) <= int(endto)):
            try:
                status = 'delta'
                if user['username'] == "":
                    print("no username, moving to next")
                    continue
                client(InviteToChannelRequest(channel_username, [user['username']]))
                status = 'DONE'
                #print("Waiting for 60-180 Seconds...")
                time.sleep(random.randrange(0, 5))
            except UserPrivacyRestrictedError:
                status = 'PrivacyRestrictedError'
            except UserAlreadyParticipantError:
                status = 'ALREADY'
            except PeerFloodError as g:
                status = 'PeerFloodError :('
                print('Script Is Stopping Now')
                time.sleep(86400)
            except ChatWriteForbiddenError as cwfe:
                client(JoinChannelRequest(channel_username))
                continue
            except errors.RPCError as e:
                status = e.__class__.__name__
            except Exception as d:
                status = d
            except:
                traceback.print_exc()
                print("Unexpected Error")
                continue
            channel_connect = client.get_entity(channel_username)
            channel_full_info = client(GetFullChannelRequest(channel=channel_connect))
            countt = int(channel_full_info.full_chat.participants_count)
            print(f"ADDING {user['name']} TO {channel_username} TOTAL: {countt} - {status}")
        elif int(user['srno']) > int(endto):
            print("Members Added Successfully!")
            stat = input('Done!\nChoose From Below:\n\n1 - Repeat The Script\nOR Just Hit Enter To Quit\n\nEnter: ')
            if stat == '1':
                autos()
            else:
                quit()

autos()
The console is giving a syntax error on line 2, at the end of the word print, on the 't'.
The line by itself functions perfectly. Some of the code is left out because it contains account details, but none of it is relevant to the problem.
def run_bot(posts_replied_to):
    print("Searching last 5 posts...")
    for submission in praw.Reddit(username = "Sir_Hanush_of_Leipa",
                                  password = "********",
                                  client_id = "kbe2veBF1yE9mA",
                                  client_secret = "*************",
                                  user_agent = "******".subreddit("kingdomcome").new(limit=5)
        if (submission.id) not in posts_replied_to:
            print('New post found: ' + submission.id)
            submission.reply(random.choice(kingdom_quotes) + "\n\n___\n\nHalt! I am ...")
            print("Replied to post " + submission.id)
            posts_replied_to.append(submission.id)
            with open("posts_replied_to.txt", "a") as f:
                f.write(submission.id + "\n")
            # Pause for 1 minute
            print("Sleeping for 10 minutes...")
            time.sleep(600)
    print("Search Completed.")
    print(posts_replied_to)
    print("Sleeping for 10 minutes...")
    time.sleep(600)
def get_saved_posts():
    if not os.path.isfile("posts_replied_to.txt"):
        posts_replied_to = []
    else:
        with open("posts_replied_to.txt", "r") as f:
            posts_replied_to = f.read()
            posts_replied_to = posts_replied_to.split("\n")
            posts_replied_to = list(filter(None, posts_replied_to))
            f.close()
    return posts_replied_to

def login_ritual():
    posts_replied_to = get_saved_posts()
    print(posts_replied_to)
    return posts_replied_to

while True:
    posts_replied_to = login_ritual()
    try:
        run_bot(posts_replied_to)
    except:
        print(datetime.datetime.now())
        print("Unable to execute. Trying again...")
It should use the Reddit API to get the post id and check it to see if the bot has already responded. But it gives a syntax error on the print function.
Edit: This is the error the command prompt gives:
File "Kingdombot.py", line 30
print ('New post found: ' + submission.id)
^
SyntaxError: invalid syntax
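A perfectly valid line can be flagged like this when a call on an earlier line is never closed: Python keeps reading and only reports the SyntaxError at the next statement it cannot fit into the still-open expression. A minimal, self-contained illustration (a made-up snippet, not the poster's code):

# Two lines of source: the first has an unclosed call, the second is fine on its own.
snippet = 'config = dict(user_agent="example"\nprint("hello")\n'

try:
    compile(snippet, "<demo>", "exec")
except SyntaxError as err:
    # Python versions before 3.10 blame line 2 with "invalid syntax",
    # as in the traceback above; 3.10+ points at the unclosed '(' on line 1.
    print(err.lineno, err.msg)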
Here is my code:
import urllib
import webbrowser
from bs4 import BeautifulSoup
import requests
import re
address = 'https://google.com/search?q='
# Default Google search address start
file = open( "OCR.txt", "rt" )
# Open text document that contains the question
word = file.read()
file.close()
myList = [item for item in word.split('\n')]
newString = ' '.join(myList)
# The question is on multiple lines so this joins them together with proper spacing
qstr = urllib.parse.quote_plus(newString)
# Encode the string
newWord = address + qstr
# Combine the base and the encoded query
response = requests.get(newWord)
#with open('output.html', 'wb') as f:
# f.write(response.content)
#webbrowser.open('output.html')
answers = open("ocr2.txt", "rt")
ansTable = answers.read()
answers.close()
ans = ansTable.splitlines()
ans1 = str(ans[0])
ans2 = str(ans[2])
ans3 = str(ans[4])
ans1Score = 0
ans2Score = 0
ans3Score = 0
links = []
soup = BeautifulSoup(response.text, 'lxml')
for r in soup.find_all(class_='r'):
    linkRaw = str(r)
    link = re.search("(?P<url>https?://[^\s]+)", linkRaw).group("url")
    if '&' in link:
        finalLink = link.split('&')
        link = str(finalLink[0])
    links.append(link)
#print(links)
#print(' ')
for g in soup.find_all(class_='g'):
    webBlock = str(g)
    ans1Tally = webBlock.count(ans1)
    ans2Tally = webBlock.count(ans2)
    ans3Tally = webBlock.count(ans3)
    if ans1 in webBlock:
        ans1Score += ans1Tally
    else:
        ans1Found = False
    if ans2 in webBlock:
        ans2Score += ans2Tally
    else:
        ans2Found = False
    if ans3 in webBlock:
        ans3Score += ans3Tally
    else:
        ans3Found = False
    if ans1Found and ans2Found and ans3Found is False:
        searchLink = str(links[0])
        if searchLink.endswith('pdf'):
            pass
        else:
            response2 = requests.get(searchLink)
            soup2 = BeautifulSoup(response2.text, 'lxml')
            for p in soup2.find_all('p'):
                extraBlock = str(p)
                extraAns1Tally = extraBlock.count(ans1)
                extraAns2tally = extraBlock.count(ans2)
                extraAns3Tally = extraBlock.count(ans3)
                if ans1 in extraBlock:
                    ans1Score += extraAns1Tally
                if ans2 in extraBlock:
                    ans2Score += extraAns2Tally
                if ans3 in extraBlock:
                    ans3Score += extraAns3Tally
    with open("Results.txt", "w") as results:
        results.write(newString + '\n\n')
        results.write(ans1+": "+str(ans1Score)+'\n')
        results.write(ans2+": "+str(ans2Score)+'\n')
        results.write(ans3+": "+str(ans3Score))
    links.pop(0)
    print(' ')
    print('-----')
    print(ans1+": "+str(ans1Score))
    print(ans2+": "+str(ans2Score))
    print(ans3+": "+str(ans3Score))
    print('-----')
Basically, right now it is scraping each "g" one at a time, when this program could benefit massively from scraping all of the links at the same time. I want them all scraping concurrently instead of each one waiting until the one before it is done. Sorry if this is a simple question, but I have little experience with asyncio, so if anyone could help that would be massively appreciated. Thanks!
To write an async program you need to:
- define functions with async def
- call them with await
- create an event loop and run some function in it
- run requests concurrently using asyncio.gather
Everything else is almost the same as usual. Instead of the blocking requests module you should use an async one, for example aiohttp:
python -m pip install aiohttp
And use it like this:
async def get(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            return await resp.text()
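As a quick usage check (this snippet is not part of the original answer; it assumes Python 3.7+ for asyncio.run), the coroutine can be driven like this:

import asyncio
import aiohttp

async def get(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            return await resp.text()

html = asyncio.run(get('https://example.com'))  # fetch one page asynchronously
print(len(html))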
Here's the code with some of the changes I started. I didn't check whether it actually works, since I don't have the files you use. You should also move the logic inside for g in soup.find_all(class_='g'): into a separate function and run multiple of these functions with asyncio.gather to get the benefit of asyncio.
import asyncio
import aiohttp
import urllib
import webbrowser
from bs4 import BeautifulSoup
import re

async def get(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            return await resp.text()

async def main():
    address = 'https://google.com/search?q='
    # Default Google search address start
    file = open("OCR.txt", "rt")
    # Open text document that contains the question
    word = file.read()
    file.close()
    myList = [item for item in word.split('\n')]
    newString = ' '.join(myList)
    # The question is on multiple lines so this joins them together with proper spacing
    qstr = urllib.parse.quote_plus(newString)
    # Encode the string
    newWord = address + qstr
    # Combine the base and the encoded query
    text = await get(newWord)
    #with open('output.html', 'wb') as f:
    #    f.write(response.content)
    #webbrowser.open('output.html')
    answers = open("ocr2.txt", "rt")
    ansTable = answers.read()
    answers.close()
    ans = ansTable.splitlines()
    ans1 = str(ans[0])
    ans2 = str(ans[2])
    ans3 = str(ans[4])
    ans1Score = 0
    ans2Score = 0
    ans3Score = 0
    links = []
    soup = BeautifulSoup(text, 'lxml')
    for r in soup.find_all(class_='r'):
        linkRaw = str(r)
        link = re.search("(?P<url>https?://[^\s]+)", linkRaw).group("url")
        if '&' in link:
            finalLink = link.split('&')
            link = str(finalLink[0])
        links.append(link)
    #print(links)
    #print(' ')
    for g in soup.find_all(class_='g'):
        webBlock = str(g)
        ans1Tally = webBlock.count(ans1)
        ans2Tally = webBlock.count(ans2)
        ans3Tally = webBlock.count(ans3)
        if ans1 in webBlock:
            ans1Score += ans1Tally
        else:
            ans1Found = False
        if ans2 in webBlock:
            ans2Score += ans2Tally
        else:
            ans2Found = False
        if ans3 in webBlock:
            ans3Score += ans3Tally
        else:
            ans3Found = False
        if ans1Found and ans2Found and ans3Found is False:
            searchLink = str(links[0])
            if searchLink.endswith('pdf'):
                pass
            else:
                text2 = await get(searchLink)
                soup2 = BeautifulSoup(text2, 'lxml')
                for p in soup2.find_all('p'):
                    extraBlock = str(p)
                    extraAns1Tally = extraBlock.count(ans1)
                    extraAns2tally = extraBlock.count(ans2)
                    extraAns3Tally = extraBlock.count(ans3)
                    if ans1 in extraBlock:
                        ans1Score += extraAns1Tally
                    if ans2 in extraBlock:
                        ans2Score += extraAns2Tally
                    if ans3 in extraBlock:
                        ans3Score += extraAns3Tally
        with open("Results.txt", "w") as results:
            results.write(newString + '\n\n')
            results.write(ans1+": "+str(ans1Score)+'\n')
            results.write(ans2+": "+str(ans2Score)+'\n')
            results.write(ans3+": "+str(ans3Score))
        links.pop(0)
        print(' ')
        print('-----')
        print(ans1+": "+str(ans1Score))
        print(ans2+": "+str(ans2Score))
        print(ans3+": "+str(ans3Score))
        print('-----')

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(main())
    finally:
        loop.run_until_complete(loop.shutdown_asyncgens())
        loop.close()
Upd:
The main idea is to move the logic inside the loop that does the request into a separate coroutine and pass multiple of these coroutines to asyncio.gather. It will parallelize your requests.
async def main():
    # Here do everything that comes before the loop.
    coros = [
        process_single_g(g)
        for g
        in soup.find_all(class_='g')
    ]
    results = await asyncio.gather(*coros)  # this runs multiple tasks concurrently
                                            # and returns all results together
    for res in results:
        ans1Score, ans2Score, ans3Score = res
        print(' ')
        print('-----')
        print(ans1+": "+str(ans1Score))
        print(ans2+": "+str(ans2Score))
        print(ans3+": "+str(ans3Score))
        print('-----')

async def process_single_g(g):
    # Here do everything you did inside the loop for a single g.
    text2 = await get(searchLink)
    # ...
    return ans1Score, ans2Score, ans3Score
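To make the pattern concrete, here is a small self-contained sketch of that idea, independent of the question's files (the URLs and the keyword counting are placeholders, not the original scoring logic): each page is fetched and processed in its own coroutine, and asyncio.gather runs them concurrently.

import asyncio
import aiohttp
from bs4 import BeautifulSoup

async def fetch(session, url):
    # One request per coroutine; all coroutines share a single session.
    async with session.get(url) as resp:
        return await resp.text()

async def process_page(session, url, keyword):
    # Fetch a page and count how often the keyword appears in its <p> tags.
    html = await fetch(session, url)
    soup = BeautifulSoup(html, 'html.parser')  # built-in parser; the answer above uses lxml
    text = ' '.join(p.get_text() for p in soup.find_all('p'))
    return url, text.lower().count(keyword.lower())

async def main():
    urls = ['https://example.com', 'https://example.org']  # placeholder links
    async with aiohttp.ClientSession() as session:
        coros = [process_page(session, url, 'example') for url in urls]
        for url, count in await asyncio.gather(*coros):  # all pages are fetched concurrently
            print(url, count)

asyncio.run(main())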
The official way,
r = praw.Reddit('Comment Scraper 1.0 by u/_Daimon_ see '
                'https://praw.readthedocs.org/en/latest/'
                'pages/comment_parsing.html')
submission = r.get_submission(submission_id='11v36o')
submission.replace_more_comments(limit=None, threshold=0)
is extremely slow. Is there a way to speed this up? There are people that have extracted every reddit comment into a database, so there must be some way to do this quicker.
Edit: the new praw api (6.0.0) has lists(), which makes the job easier:
This also handles the AttributeError that might occur due to more_comments, through the use of replace_more(limit=None):
submissionList = []
submission.comments.replace_more(limit=None)
for comment in submission.comments.list():
    submissionList.append(comment)
Edit: The new praw api (5.0.1) is magical and makes this much easier. Here is how to do it now:
def getSubComments(comment, allComments, verbose=True):
    allComments.append(comment)
    if not hasattr(comment, "replies"):
        replies = comment.comments()
        if verbose: print("fetching (" + str(len(allComments)) + " comments fetched total)")
    else:
        replies = comment.replies
    for child in replies:
        getSubComments(child, allComments, verbose=verbose)

def getAll(r, submissionId, verbose=True):
    submission = r.submission(submissionId)
    comments = submission.comments
    commentsList = []
    for comment in comments:
        getSubComments(comment, commentsList, verbose=verbose)
    return commentsList
Example usage:
res = getAll(r, "6rjwo1")
#res = getAll(r, "6rjwo1", verbose=False) # This won't print out progress if you want it to be silent. Default is verbose=True
Where r is
username = 'myusernamehere'
userAgent = "MyAppName/0.1 by " + username
clientId = 'myClientId'
clientSecret = "myClientSecret"
password = "passwordformyusernamehere"
r = praw.Reddit(user_agent=userAgent, client_id=clientId, client_secret=clientSecret)
Previous stuff (outdated now):
Okay, I wrote code that can reliably pull every comment from a thread; it takes about 10 seconds for 500 comments and about a minute for 4000 comments. I named it redApi.py. Here it is:
import time
import requests
import requests.auth
import praw

username = 'myusernamehere'
userAgent = "MyAppName/0.1 by " + username
clientId = 'myClientId'
clientSecret = "myClientSecret"
password = "passwordformyusernamehere"

def getPraw():
    return praw.Reddit(user_agent=userAgent, client_id=clientId, client_secret=clientSecret)

global accessToken
accessToken = None

def getAccessToken():
    client_auth = requests.auth.HTTPBasicAuth(clientId, clientSecret)
    post_data = {"grant_type": "password", "username": username, "password": password}
    headers = {"User-Agent": userAgent}
    response = requests.post("https://www.reddit.com/api/v1/access_token", auth=client_auth, data=post_data, headers=headers)
    return response.json()

def makeRequest(apiUrl, useGet=True):
    global accessToken
    if accessToken is None:
        accessToken = getAccessToken()
    headers = {"Authorization": "bearer " + accessToken['access_token'], "User-Agent": userAgent}
    if useGet:
        response = requests.get(apiUrl, headers=headers)
    else:
        response = requests.post(apiUrl, headers=headers)
    time.sleep(1.1)
    responseJson = response.json()
    if 'error' in responseJson:
        if responseJson['error'] == 401:
            print "Refreshing access token"
            time.sleep(1.1)
            accessToken = getAccessToken()
            headers = {"Authorization": "bearer " + accessToken['access_token'], "User-Agent": userAgent}
            time.sleep(1.1)
            response = requests.get(apiUrl, headers=headers)
            responseJson = response.json()
    return responseJson

global prawReddit
prawReddit = praw.Reddit(user_agent=userAgent, client_id=clientId, client_secret=clientSecret)
# Gets any number of posts
def getPosts(subredditName, numPosts=1000):
    global prawReddit
    subreddit = prawReddit.get_subreddit(subredditName)
    postGetter = praw.helpers.submissions_between(prawReddit, subreddit)
    postArray = []
    numGotten = 0
    while numGotten < numPosts:
        postArray.append(postGetter.next())
        numGotten += 1
    return postArray
# Get all comments from a post
# Submission is a praw submission, obtained via:
#   r = redApi.getPraw()
#   submission = r.get_submission(submission_id='2zysz7') # (or some other submission id, found via https://www.reddit.com/r/test/comments/2zysz7/ayy/ - the thing after /comments/)
#   comments = redApi.getComments(submission)
def getComments(submission):
    requestUrl = 'https://oauth.reddit.com/' + submission.subreddit.url + 'comments/article?&limit=1000&showmore=true&article=' + submission.id
    allData = makeRequest(requestUrl)
    articleData = allData[0]
    comments = allData[1]
    curComments = comments['data']['children']
    resultComments = getCommentsHelper(curComments, submission.name, submission)
    return resultComments
# Print out the tree of comments
def printTree(comments):
    return printTreeHelper(comments, "")

def printTreeHelper(comments, curIndentation):
    resultString = ""
    for comment in comments:
        resultString += curIndentation + comment['data']['body'].replace("\n", "\n" + curIndentation) + "\n"
        if not comment['data']['replies'] == "":
            resultString += printTreeHelper(comment['data']['replies']['data']['children'], curIndentation + "  ")
    return resultString

# Get all comments as a single array
def flattenTree(comments):
    allComments = []
    for comment in comments:
        allComments.append(comment)
        if not comment['data']['replies'] == "":
            allComments += flattenTree(comment['data']['replies']['data']['children'])
    return allComments
# Utility functions for getComments
def expandCommentList(commentList, submission):
    curComments = commentList
    allComments = {}
    while True:
        thingsToExpand = []
        nextComments = []
        allParents = {}
        for comment in curComments:
            if comment['kind'] == "more":
                thingsToExpand += comment['data']['children']
            else:
                if comment['data']['body'][:len("If they are shipping")] == "If they are shipping":
                    print comment
                allComments[comment['data']['name']] = comment
        if len(thingsToExpand) == 0:
            curComments = []
            break
        curComments = []
        if not len(thingsToExpand) == 0:
            print "total things to expand: " + str(len(thingsToExpand))
            for i in range(0, len(thingsToExpand)/100+1):
                curCommentIds = thingsToExpand[i*100:min((i+1)*100, len(thingsToExpand))]
                requestUrl = 'https://oauth.reddit.com/api/morechildren.json?api_type=json&link_id=' + submission.name + '&limit=1000&showmore=true&children=' + ",".join(curCommentIds)
                curData = makeRequest(requestUrl)
                if 'json' in curData and 'data' in curData['json']:
                    curComments += curData['json']['data']['things']
                print (i+1)*100
    for comment in curComments:
        allComments[comment['data']['name']] = comment
    return allComments.values()

def lookForMore(comment):
    if comment['kind'] == "more":
        return True
    if not comment['data']['replies'] == "":
        for reply in comment['data']['replies']['data']['children']:
            if lookForMore(reply):
                return True
    return False
def getCommentsHelper(curComments, rootId, submission):
    allComments = expandCommentList(curComments, submission)
    commentMap = {}
    for comment in allComments:
        commentMap[comment['data']['name']] = comment
    allRootComments = []
    for comment in allComments:
        if comment['data']['parent_id'] == rootId:
            allRootComments.append(comment)
        elif comment['data']['parent_id'] in commentMap:
            parentComment = commentMap[comment['data']['parent_id']]
            if parentComment['data']['replies'] == "":
                parentComment['data']['replies'] = {'data': {'children': []}}
            alreadyChild = False
            for childComment in parentComment['data']['replies']['data']['children']:
                if childComment['data']['name'] == comment['data']['name']:
                    alreadyChild = True
                    break
            if not alreadyChild:
                parentComment['data']['replies']['data']['children'].append(comment)
        else:
            print "pls halp"
    completedComments = []
    needMoreComments = []
    for comment in allRootComments:
        if not comment['data']['replies'] == "" or comment['kind'] == 'more':
            hasMore = lookForMore(comment)
            if hasMore:
                needMoreComments.append(comment)
            else:
                replyComments = getCommentsHelper(comment['data']['replies']['data']['children'], comment['data']['name'], submission)
                comment['data']['replies']['data']['children'] = replyComments
                completedComments.append(comment)
        else:
            completedComments.append(comment)
    for comment in needMoreComments:
        requestUrl = 'https://oauth.reddit.com/' + submission.subreddit.url + 'comments/article?&limit=1000&showmore=true&article=' + submission.id + "&comment=" + comment['data']['id']
        allData = makeRequest(requestUrl)
        articleData = allData[0]
        comment = allData[1]['data']['children'][0]
        if comment['data']['replies'] == "":
            completedComments.append(comment)
        else:
            comments = comment['data']['replies']['data']['children']
            actualComments = getCommentsHelper(comments, comment['data']['name'], submission)
            comment['data']['replies']['data']['children'] = actualComments
            completedComments.append(comment)
    return completedComments
To use this script, in a python prompt, type the following:
# Get all comments from a post
# Submission is a praw submission, obtained via:
import redApi
r = redApi.getPraw()
submission = r.get_submission(submission_id='2zysz7') # (or some other submission id, found via https://www.reddit.com/r/test/comments/2zysz7/ayy/ - the thing after /comments/)
comments = redApi.getComments(submission)
Looks like praw has been updated? In 4.5.1 it looks more like:
#!/usr/local/bin/python
import praw

reddit = praw.Reddit(
    client_id='<client_id>',
    client_secret='<client_secret>',
    user_agent='davehodg/0.1')

submission = reddit.submission(id='<submission_id>')
submission = reddit.submission(url='<submission_url>')  # in case you don't have the submission id

for comment in submission.comments.list():
    print(comment.body)
Edit: seems like the most I can get back is 1000 comments?
I'm adding a bunch of prints and debugging, but right now, #danielle, your script does nothing. It just returns to the prompt.