I am working on a project where I have to scrape a subreddit using PRAW. I have to set a limit so that it only scrapes that many posts. For example, if I want to scrape the gaming subreddit (https://www.reddit.com/r/gaming/) I have to give a limit of 100, so it scrapes the first 100 posts. Instead, I want to first get the total number of posts in the gaming subreddit, and then set that value as the limit to extract all the posts. I have searched the internet about the Pushshift API, but I don't know how to do that. Any help is appreciated!
Here is my code:
import praw
import pandas as pd
import os
from dotenv import load_dotenv, find_dotenv
from psaw import PushshiftAPI
load_dotenv(find_dotenv())
#Creating a dataframe
df = pd.DataFrame(columns=['Title', 'Number of comments', 'Comments'])
#Instance of subreddit to be web scraped
reddit_read_only = praw.Reddit(client_id = os.environ.get("client_id"),
client_secret = os.environ.get("client_secret"),
user_agent = os.environ.get("user_agent"))
def main(name, value):
    i = 0
    subreddit = reddit_read_only.subreddit(name)
    print(subreddit.created)
    while i < value:
        #Limits the scraping to value number of posts
        for submission in subreddit.hot(limit=value):
            submission.comments.replace_more(limit=(value*30))
            lst = []
            #If there are any comments, they will be saved in the dataframe
            if submission.num_comments != 0:
                for comment in submission.comments.list():
                    lst.append(comment.body)
                df.loc[i] = [submission.title, submission.num_comments, lst]
            #If there are no comments in a post, then 'No comments' will be stored
            elif submission.num_comments == 0:
                df.loc[i] = [submission.title, submission.num_comments, ['No comments']]
            i += 1
    # print(df)
    name = 'Reddit_web_scrap_'+str(name) #save the file with a certain name
    # df.to_csv(name + str('.csv'), index=False)
    return name
if __name__ == "__main__":
    print('#####################################################################')
    print('################ Reddit Web Scraping Started ########################')
    print('#####################################################################')
    print()
    name = main('gaming', 50)
    print()
    print('Created {}.csv file!'.format(name))
    print()
    print('#####################################################################')
    print('################## Reddit Web Scraping Ended ########################')
    print('#####################################################################')
I have put the limit to 50, which will scrape the first 50 posts. But I want to scrape all the posts available in gaming. If I set limit = "None", it throws an error:
TypeError: '<' not supported between instances of 'int' and 'str'
And this is logical as well, since the while loop ends up comparing an int with the string "None". So I guess I won't be able to use limit = "None".
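As a side note, PRAW itself accepts limit=None (Python's None, not the string "None"): the listing then pages through as many posts as Reddit exposes, which is capped at roughly 1000 per listing. A minimal sketch, reusing the reddit_read_only instance from the snippet above and dropping the while counter so no int/str comparison happens:

    # Sketch: iterate the listing directly; limit=None (not the string "None")
    # lets PRAW fetch as many posts as the listing exposes (Reddit caps
    # listings at roughly 1000 entries).
    subreddit = reddit_read_only.subreddit("gaming")
    for i, submission in enumerate(subreddit.hot(limit=None)):
        print(i, submission.title)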
I have created a function total_posts() with the help of the Pushshift API that will give me the total number of posts available for a particular subreddit.
#Importing Dependencies
import praw
import pandas as pd
import os
from dotenv import load_dotenv, find_dotenv
from pmaw import PushshiftAPI
load_dotenv(find_dotenv())
#Creating a dataframe
df = pd.DataFrame(columns=['Title', 'Number of comments', 'Comments'])
#Instance of subreddit to be web scraped
reddit_read_only = praw.Reddit(client_id = os.environ.get("client_id"),
client_secret = os.environ.get("client_secret"),
user_agent = os.environ.get("user_agent"))
def total_posts(name):
    print("Calculating total number of posts")
    print()
    api = PushshiftAPI()
    api_request_generator = api.search_submissions(subreddit=name, score=">=0")
    aita_submissions = pd.DataFrame([submission for submission in api_request_generator])
    print("Total number of posts in subreddit {} are {}".format(name, aita_submissions.shape[0]))
    return aita_submissions.shape[0]
def main(name, value):
    print('Creating dataframe')
    print()
    i = 0
    subreddit = reddit_read_only.subreddit(name)
    while i < value:
        #Limits the scraping to value number of posts
        for submission in subreddit.hot(limit=value):
            submission.comments.replace_more(limit=(value*30))
            lst = []
            #If there are any comments, they will be saved in the dataframe
            if submission.num_comments != 0:
                for comment in submission.comments.list():
                    lst.append(comment.body)
                df.loc[i] = [submission.title, submission.num_comments, lst]
            #If there are no comments in a post, then 'No comments' will be stored
            elif submission.num_comments == 0:
                df.loc[i] = [submission.title, submission.num_comments, ['No comments']]
            i += 1
    print(df)
    name = 'Reddit_web_scrap_'+str(name) #save the file with a certain name
    df.to_csv(name + str('.csv'), index=False)
if __name__ == "__main__":
    subreddit_name = 'gaming'
    print('#####################################################################')
    print('######### Reddit Web Scraping Started for {} #########'.format(subreddit_name))
    print('#####################################################################')
    print()
    posts_number = total_posts(subreddit_name)
    print()
    main(subreddit_name, posts_number)
    print()
    print('Created {}.csv file!'.format(subreddit_name))
    print()
    print('#####################################################################')
    print('################## Reddit Web Scraping Ended ########################')
    print('#####################################################################')
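If the goal is only the total count, Pushshift could historically return it without downloading every submission: its submission-search endpoint accepted metadata=true with size=0 and reported the count under metadata.total_results. Treat the following as a sketch under that assumption, since Pushshift's availability and parameters have changed over time:

    import requests

    def total_posts_via_metadata(name):
        # Assumption: the endpoint still honours metadata=true / size=0 and
        # returns a metadata.total_results field; Pushshift access has
        # changed over time, so verify before relying on it.
        url = "https://api.pushshift.io/reddit/search/submission/"
        resp = requests.get(url, params={"subreddit": name, "metadata": "true", "size": 0}, timeout=30)
        resp.raise_for_status()
        return resp.json()["metadata"]["total_results"]

    print(total_posts_via_metadata("gaming"))

Keep in mind that even with the exact total, PRAW's subreddit.hot() goes through Reddit's own listing API, which only exposes on the order of 1000 posts, so the count alone cannot be used to pull the full history through PRAW.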
I'm trying to get all the tracks from 2 playlists into a CSV file. However, even though I increase the offset parameter by 100 in each query, only the first 100 songs of each playlist are returned; the page never changes. What could be the problem?
import spotipy, json, csv
from spotipy.oauth2 import SpotifyClientCredentials
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
data_file = open('data.csv', 'w')
writer = csv.writer(data_file)
writer.writerow(['track_num', 'track_id', 'track_name', 'first_artist'] + ['liked'])
playlist_ids = [
    'xxxxxxxxxxxxxxxxxxxxxxx',  # playlist 1
    'yyyyyyyyyyyyyyyyyyyyyyy'   # playlist 2
]

for playlist_id in playlist_ids:
    offset_n = 0
    total = 100
    while offset_n < total:
        tracks_response = sp.playlist_tracks(playlist_id, offset=offset_n)
        tracks_json = json.dumps(tracks_response)
        tracks_data = json.loads(tracks_json)
        if offset_n == 0:
            total = tracks_data['tracks']['total']
        for track in tracks_data['tracks']['items']:
            track_id = track['track']['id']
            track_name = track['track']['name']
            first_artist = track['track']['artists'][0]['name']
            if playlist_id == playlist_ids[0]:
                writer.writerow([row_num, track_id, track_name, first_artist] + [1])
            else:
                writer.writerow([row_num, track_id, track_name, first_artist] + [0])
        offset_n += 100

data_file.close()
The playlist_tracks method returns a paginated result with details of the tracks of a playlist.
So you need to iterate over all pages to get the full data.
You can use this example as a reference:
def get_all_tracks_from_playlist(playlist_id):
    tracks_response = sp.playlist_tracks(playlist_id)
    tracks = tracks_response["items"]
    while tracks_response["next"]:
        tracks_response = sp.next(tracks_response)
        tracks.extend(tracks_response["items"])
    return tracks
Regarding the ReadTimeout exception you have mentioned in the comments:
The Spotify client accepts requests_timeout and retries as arguments; according to the documentation, the default values are requests_timeout=5 and retries=3.
You can increase them to reduce the chance of getting the ReadTimeout exception.
As a start, you can double the request timeout to 10 seconds and raise the retries to 5:
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager, requests_timeout=10, retries=5)
I am new to Redis and RediSearch.
I want to create an autocomplete using Redis in a Flask app.
Below is what I have tried so far:
autocomplete.py:
import redis
import redisearch
from flask import Flask, request, jsonify, render_template

app = Flask("autocomplete")

#creating a redis connection
r = redis.Redis(host='localhost', port=6379, db=0)

@app.route('/')
def home():
    return "This is Home Page"

#route to add a value to the autocomplete list
@app.route('/add')
def addValue():
    try:
        name = request.args.get('name')
        n = name.strip()
        for l in range(1, len(n)):
            prefix = n[0:l]
            r.zadd('compl', {prefix: 0})
        r.zadd('compl', {n + "*": 0})
        return "Success"
    except:
        return "Failed"

#route to get the autocomplete
@app.route('/autocomplete')
def autocomplete():
    prefix = request.args.get('prefix')
    results = []
    rangelen = 50
    count = 5
    start = r.zrank('compl', prefix)
    if not start:
        return []
    while len(results) != count:
        range = r.zrange('compl', start, start + rangelen - 1)
        start += rangelen
        if not range or len(range) == 0:
            break
        for entry in range:
            entry = entry.decode('utf-8')
            minlen = min(len(entry), len(prefix))
            if entry[0:minlen] != prefix[0:minlen]:
                count = len(results)
                break
            if entry[-1] == "*" and len(results) != count:
                results.append(entry[0:-1])
    return jsonify(results)
Currently, the values for @app.route('/add') and the prefixes for @app.route('/autocomplete') are fetched through the URL itself.
However, I want the prefixes/text for @app.route('/autocomplete') to be fetched through an input textbox to create a dynamic autocomplete.
I would be really grateful if anyone could guide me in implementing the same.
This is a sample output:
[screenshot: autocomplete suggestions]
I have also referred to https://redis.com/ebook/part-2-core-concepts/chapter-6-application-components-in-redis/6-1-autocomplete/ but was unable to understand how to implement it.
EDIT: I found a solution for this at https://github.com/RediSearch/redisearch-py/blob/master/redisearch/auto_complete.py
You can use RediSearch's autocompleter.
An example using Flask is available on GitHub: https://github.com/Redislabs-Solution-Architects/redisearch_demo_and_preso
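For reference, a minimal sketch of what that can look like with redisearch-py, assuming the RediSearch module is loaded in the Redis server and the AutoCompleter/Suggestion API of that package is available (the key name 'autocomplete' is arbitrary):

    from flask import Flask, request, jsonify
    from redisearch import AutoCompleter, Suggestion

    app = Flask("autocomplete")
    # The autocompleter keeps all of its suggestions under a single Redis key.
    ac = AutoCompleter('autocomplete', host='localhost', port=6379)

    @app.route('/add')
    def add_value():
        name = request.args.get('name', '').strip()
        ac.add_suggestions(Suggestion(name, 1.0))   # the score controls ranking
        return "Success"

    @app.route('/autocomplete')
    def autocomplete():
        prefix = request.args.get('prefix', '')
        suggestions = ac.get_suggestions(prefix, num=5)
        return jsonify([s.string for s in suggestions])

The prefix handling, ranking and cleanup that the sorted-set version implements by hand are then handled by RediSearch itself.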
I have created a small script that scrapes a webpage, pulling each item's name, link, image and price from a product table.
I am currently facing a problem where I am not able to store multiple dataclasses. I want to first check whether a new URL has appeared on the webpage, and if there is a change, print out the name, image and price of the new URL that has been found.
import time
from typing import Optional
import attr
import requests
from selectolax.parser import HTMLParser
@attr.dataclass
class Info:
    store: str = attr.ib(factory=str)
    link: str = attr.ib(factory=str)
    name: Optional[str] = attr.ib(factory=str)
    price: Optional[str] = attr.ib(factory=str)
    image: Optional[str] = attr.ib(factory=str)

# -------------------------------------------------------------------------
# Get all latest products found in the webpage
# -------------------------------------------------------------------------
def from_page():
    with requests.get("https://www.footish.se/sneakers", timeout=5) as rep:
        if rep.status_code in (200, 404):
            doc = HTMLParser(rep.text)
            for product in doc.css('article.product-wrapper'):
                name = product.css_first('div.product-image > a').attributes.get('title')
                link = product.css_first('div.product-image > a').attributes.get('href')
                image = product.css_first('div.product-image > a > img').attributes.get('data-original')
                price = product.css_first('span.price-amount')
                return Info(
                    store="Footish",
                    link=link,
                    name=name,
                    image=image,
                    price=price
                )

if __name__ == '__main__':
    all_found_products = set()
    while True:
        get_all_products: Info = from_page()
        diff = set(get_all_products.link) - all_found_products
        for new_urls in diff:
            print(f"Found new url! {new_urls}")
            print(f"Name: {get_all_products.name}")
            print(f"image: {get_all_products.image}")
            print(f"price: {get_all_products.price}")
        print("Sleeping 120 sec")
        time.sleep(120)
My problem is that I don't know how to return dataclasses from the for product in doc.css('article.product-wrapper'): loop, since there are multiple products on the webpage. I want to store all found products, then compare them to see whether a new URL has appeared, and if so print out the name, price and image of that new URL.
You should use a list to store the multiple Info instances, then return them all:
def from_page():
    with requests.get("https://www.footish.se/sneakers", timeout=5) as rep:
        if rep.status_code in (200, 404):
            doc = HTMLParser(rep.text)
            infos = []
            for product in doc.css('article.product-wrapper'):
                name = product.css_first('div.product-image > a').attributes.get('title')
                link = product.css_first('div.product-image > a').attributes.get('href')
                image = product.css_first('div.product-image > a > img').attributes.get('data-original')
                price = product.css_first('span.price-amount')
                infos.append(Info(store="Footish", link=link, name=name,
                                  image=image, price=price))
            return infos
And the main loop would look more like this:
all_found_urls = set()
while True:
    get_all_products = from_page()
    for info in get_all_products:
        if info.link not in all_found_urls:
            print(f"Found new url! {info.link}")
            print(f"Name: {info.name}")
            print(f"image: {info.image}")
            print(f"price: {info.price}")
            all_found_urls.add(info.link)
    print("Sleeping 120 sec")
    time.sleep(120)
I'm putting together a Python script to make trades on Poloniex with the API. So far I've got it to place trades when certain conditions are met, but I still need it to NOT place any more trades for the rest of that day (I have the entire script looping every 60 seconds).
So far I have this script:
import requests
import urllib.request
import urllib.parse
import http.client
import hashlib
import hmac
import time
import json
from urllib.request import urlopen
The_Currency_Pair = input('Which Currency Pair?\nPAIRS TO CHOOSE FROM:\nUSDT_BTC\nUSDT_XRP\nUSDT_ETH\nUSDT_BCH\nUSDT_STR\nUSDT_LTC\nUSDT_ETC\nUSDT_XMR\n')
api = 'https://poloniex.com/tradingApi'
key = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
secret = 'XXXXXXXXXXXXXXXXXXXXXXXXX'
def main():
    poloniexPrices = urlopen('https://poloniex.com/public?command=returnTicker').read()
    poloniexjson = json.loads(poloniexPrices)
    poloniexlastP = poloniexjson[The_Currency_Pair]['last']
    poloniexOCHL = urlopen('https://poloniex.com/public?command=returnChartData&currencyPair=USDT_BCH&start=1538352000&period=86400').read()
    poloniexOCHLjson = json.loads(poloniexOCHL)
    poloniexlasthigh = poloniexOCHLjson[-2]['high']

    print('Last Price')
    print(poloniexlastP)
    print('----------------------------------------')
    print('Last Day High')
    print(poloniexlasthigh)
    print('----------------------------------------')

    data = {
        'command': 'returnBalances',
        'nonce': int(time.time() * 1000)
    }
    data = urllib.parse.urlencode(data).encode()
    signature = hmac.new(secret.encode(), data, hashlib.sha512)
    headers = {
        'Key': key,
        'Sign': signature.hexdigest()
    }
    request = urllib.request.Request(
        url=api, data=data, headers=headers, method='POST'
    )
    text = urllib.request.urlopen(request).read().decode()

    print('MY ACCOUNT BALANCE')
    try:
        print(json.loads(text)['USDT'])
    except:
        print(text)
    print('-----------------------------------------')

    if float(poloniexlastP) > 0:
        print('PLACING TRADE')
        print('-----------------------------------------------')

        parms = {"command": "buy",
                 "currencyPair": The_Currency_Pair,
                 "rate": 100,
                 "immediateOrCancel": 1,
                 "amount": 0.01,
                 "nonce": int(time.time() * 1000)}
        parms = urllib.parse.urlencode(parms).encode()
        signature = hmac.new(secret.encode(), parms, hashlib.sha512)
        headers = {'Key': key,
                   'Sign': signature.hexdigest()}
        request = urllib.request.Request(
            url=api, data=parms, headers=headers, method='POST')
        text = urllib.request.urlopen(request).read().decode()
        ordernumber = (json.loads(text)['orderNumber'])
        print('Order Number:')
        print(ordernumber)


while True:
    main()
    time.sleep(60)
Anyway, after a trade has been placed, I need to make sure that after the 60 second sleep it doesn't make a second trade unless it is a new day, i.e. the day after the trade was made. (Could I use Poloniex server time for this?)
So, if it has got as far as print(ordernumber), that means it has placed a trade. But how do I mark that a trade has been placed for the day, and use that in the if float(poloniexlastP) > 0: check on the next loop so it doesn't place another one?
Maybe you can use Python to get the date and keep it in a global variable: after the print statement, set the variable to the current date, and the code will check whether a trade has already been sent that day, so it doesn't execute more than once per day.
import datetime

# Day of the month when the last trade was sent (0 means no trade sent yet)
dateNumberTradeSent = 0

def try_trade():
    global dateNumberTradeSent
    # This gets the day of the month
    todaysDateNumber = int(datetime.datetime.now().strftime("%d"))
    if todaysDateNumber == dateNumberTradeSent:
        print("The API has already been used once today, try again tomorrow!")
        return
    # Call your trade sending code here
    # After the print statement that shows the trade was sent:
    dateNumberTradeSent = todaysDateNumber
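One way to fold this into the existing while True loop, as a sketch: place_trade() below is a hypothetical helper standing in for the order-placing block of main(), and the date comparison uses the local clock.

    import datetime
    import time

    last_trade_date = None   # date of the most recent trade; None until one is placed

    while True:
        today = datetime.date.today()
        if last_trade_date != today:
            order_number = place_trade()   # hypothetical helper wrapping the buy request in main()
            if order_number is not None:
                last_trade_date = today    # blocks further trades until the date changes
        else:
            print("Already traded today, waiting for a new day")
        time.sleep(60)

If you would rather key this off the exchange's clock than the local machine, the candle timestamps that returnChartData already returns could be used for the same comparison.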
I'm struggling to get a Lambda function working. I have a Python script that accesses the Twitter API, pulls information, and exports it to an Excel sheet. I'm trying to move that script over to AWS Lambda and I'm having a lot of trouble.
What I've done so far: created an AWS account, set up an S3 bucket, and poked around trying to get things to work.
I think the main area I'm struggling with is how to go from a Python script that I execute via the local CLI to lambda-capable code. I'm not sure I understand how the lambda_handler function works, what the event or context arguments actually mean (despite watching a half dozen different tutorial videos), or how to integrate my existing functions into Lambda in the context of the lambda_handler. I'm just very confused and hoping someone might be able to help me get some clarity!
Code that I'm using to pull twitter data (just a sample):
import time
import datetime
import keys
import pandas as pd
from twython import Twython, TwythonError
import pymysql
def lambda_handler(event, context):
    def oauth_authenticate():
        twitter_oauth = Twython(keys.APP_KEY, keys.APP_SECRET, oauth_version=2)
        ACCESS_TOKEN = twitter_oauth.obtain_access_token()
        twitter = Twython(keys.APP_KEY, access_token=ACCESS_TOKEN)
        return twitter

    def get_username():
        """
        Prompts for the screen name of targetted account
        """
        username = input("Enter the Twitter screenname you'd like information on. Do not include '#':")
        return username

    def get_user_followers(username):
        """
        Returns data on all accounts following the targetted user.
        WARNING: The number of followers can be huge, and the data isn't very valuable
        """
        #username = get_username()
        #import pdb; pdb.set_trace()
        twitter = oauth_authenticate()
        datestamp = str(datetime.datetime.now().strftime("%Y-%m-%d"))
        target = twitter.lookup_user(screen_name=username)
        for y in target:
            target_id = y['id_str']
        next_cursor = -1
        index = 0
        followersdata = {}
        while next_cursor:
            try:
                get_followers = twitter.get_followers_list(screen_name=username,
                                                           count=200,
                                                           cursor=next_cursor)
                for x in get_followers['users']:
                    followersdata[index] = {}
                    followersdata[index]['screen_name'] = x['screen_name']
                    followersdata[index]['id_str'] = x['id_str']
                    followersdata[index]['name'] = x['name']
                    followersdata[index]['description'] = x['description']
                    followersdata[index]['date_checked'] = datestamp
                    followersdata[index]['targeted_account_id'] = target_id
                    index = index + 1
                next_cursor = get_followers["next_cursor"]
            except TwythonError as e:
                print(e)
                remainder = (float(twitter.get_lastfunction_header(header='x-rate-limit-reset'))
                             - time.time()) + 1
                print("Rate limit exceeded. Waiting for:", remainder/60, "minutes")
                print("Current Time is:", time.strftime("%I:%M:%S"))
                del twitter
                time.sleep(remainder)
                twitter = oauth_authenticate()
                continue
        followersDF = pd.DataFrame.from_dict(followersdata, orient="index")
        followersDF.to_excel("%s-%s-follower list.xlsx" % (username, datestamp),
                             index=False, encoding='utf-8')
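For orientation, here is a minimal, hedged sketch of the shape Lambda expects, assuming the helpers above are moved to module level (outside lambda_handler), that the screen name arrives in the event payload (the key name is an assumption), and that the result is written to /tmp and then copied to the S3 bucket created earlier (the bucket name is a placeholder):

    import os
    import datetime
    import boto3

    # Assumes oauth_authenticate() and get_user_followers() from the script
    # above are defined at module level so the handler can call them.

    s3 = boto3.client("s3")
    BUCKET = "my-twitter-data-bucket"   # placeholder: replace with your bucket name

    def lambda_handler(event, context):
        # `event` is the JSON payload from whatever triggered the function
        # (test console, API Gateway, a schedule, ...); `context` carries
        # runtime metadata such as the remaining execution time.
        username = event.get("username", "twitter")   # key name is an assumption

        # Lambda has no interactive stdin, so input() cannot be used; take the
        # screen name from the event instead. /tmp is the only writable path.
        os.chdir("/tmp")
        get_user_followers(username)   # writes "<username>-<date>-follower list.xlsx" into /tmp

        datestamp = datetime.datetime.now().strftime("%Y-%m-%d")
        filename = "%s-%s-follower list.xlsx" % (username, datestamp)

        # /tmp is wiped between invocations, so persist the result to S3.
        s3.upload_file("/tmp/" + filename, BUCKET, filename)

        return {"statusCode": 200, "file": filename}

Note that pandas, twython and an Excel writer such as openpyxl are not in the default Lambda runtime, so they have to be packaged with the function (as a deployment zip or a layer).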