How to get all the tracks of a Spotify playlist - python

I'm trying to get all the tracks from 2 playlists into a CSV file. However, in both playlists, even though I increase the offset parameter by 100 in each query, the first 100 songs of both playlists are returned. So the page is never changed. What could be the problem?
import spotipy, json, csv
from spotipy.oauth2 import SpotifyClientCredentials
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
data_file = open('data.csv', 'w')
writer = csv.writer(data_file)
writer.writerow(['track_num', 'track_id', 'track_name', 'first_artist'] + ['liked'])
playlist_ids = [
'xxxxxxxxxxxxxxxxxxxxxxx', # playlist 1
'yyyyyyyyyyyyyyyyyyyyyyy' # playlist 2
]
for playlist_id in playlist_ids:
offset_n = 0
total = 100
while offset_n < total:
tracks_response = sp.playlist_tracks(playlist_id, offset=offset_n)
tracks_json = json.dumps(tracks_response)
tracks_data = json.loads(tracks_json)
if offset_n == 0:
total = tracks_data['tracks']['total']
for track in tracks_data['tracks']['items']:
track_id = track['track']['id']
track_name = track['track']['name']
first_artist = track['track']['artists'][0]['name']
if playlist_id == playlist_ids[0]:
writer.writerow([row_num, track_id, track_name, first_artist] + [1])
else:
writer.writerow([row_num, track_id, track_name, first_artist] + [0])
offset_n += 100
data_file.close()

The playlist_tracks method returns a paginated result with details of the tracks of a playlist.
So you need to iterate over all pages to get the full data.
You can use this example as a reference:
def get_all_tracks_from_playlist(playlist_id)
tracks_response = sp.playlist_tracks(playlist_id)
tracks = tracks_response["items"]
while tracks_response["next"]:
tracks_response = sp.next(tracks_response)
tracks.extend(tracks_response["items"])
return tracks
Regarding the ReadTimeout exception you have mentioned in the comments:
Spotify client accepts requests_timeout and retries as arguments, according to the documentation the default values are requests_timeout=5, and retries=3
You can extend them as you wish to decrease the chance you will get the ReadTimeout exception.
As a start you can double the request timeout to 10 seconds, and change the retries to 5:
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager, requests_timeout=10, retries=5)

Related

How to get total number of posts of a subreddit using Python?

I am working on a project where I have to scrap subreddit using PRAW. But I have to put limit so that it will scrap only that many posts. For example, if I want to scrap a subreddit gaming (https://www.reddit.com/r/gaming/) I have to give limit 100 so it scrap for first 100 posts. But instead, I want first the total number of posts in gaming subreddit and then that value I can set as a limit to extract all the posts. I have searched on internet about Pushshift API, but don't know how to do that. Any help is appreciated!
Following code:
import praw
import pandas as pd
import os
from dotenv import load_dotenv, find_dotenv
from psaw import PushshiftAPI
load_dotenv(find_dotenv())
#Creating a dataframe
df = pd.DataFrame(columns=['Title', 'Number of comments', 'Comments'])
#Instance of subreddit to be web scraped
reddit_read_only = praw.Reddit(client_id = os.environ.get("client_id"),
client_secret = os.environ.get("client_secret"),
user_agent = os.environ.get("user_agent"))
def main(name, value):
i = 0
subreddit = reddit_read_only.subreddit(name)
print(subreddit.created)
while i < value:
#Limits the scrapping for value number of posts
for submission in subreddit.hot(limit=value):
submission.comments.replace_more(limit=(value*30))
lst = []
#If there are any comments, then it will be saved in dataframe
if submission.num_comments != 0:
for comment in submission.comments.list():
lst.append(comment.body)
df.loc[i] = [submission.title, submission.num_comments, lst]
#If there are no comments in a post, then No comments will be stored
elif submission.num_comments == 0:
df.loc[i] = [submission.title, submission.num_comments, ['No comments']]
i += 1
# print(df)
name = 'Reddit_web_scrap_'+str(name) #save the file with certain name
# df.to_csv(name + str('.csv'), index=False)
return name
if __name__ == "__main__":
print('#####################################################################')
print('############### Reddit Web Scrapping Started ########################')
print('#####################################################################')
print()
name = main('gaming', 50)
print()
print('Created {}.csv file!'.format(name))
print()
print('#####################################################################')
print('################# Reddit Web Scrapping Ended ########################')
print('#####################################################################')
I have put limit to 50 which will scrap first 50 posts. But I want to scrap all the posts that is available in gaming. If I put limit = "None", then it will throw me an error:
TypeError: '<' not supported between instances of 'int' and 'str'
And this is logical as well. So, I guess I won't be able to use limit = "None".
I have created a function total_posts() with the help of Pushshift API, that will give me total number of posts avaialble for a particular subreddit.
#Importing Dependencies
import praw
import pandas as pd
import os
from dotenv import load_dotenv, find_dotenv
from pmaw import PushshiftAPI
load_dotenv(find_dotenv())
#Creating a dataframe
df = pd.DataFrame(columns=['Title', 'Number of comments', 'Comments'])
#Instance of subreddit to be web scraped
reddit_read_only = praw.Reddit(client_id = os.environ.get("client_id"),
client_secret = os.environ.get("client_secret"),
user_agent = os.environ.get("user_agent"))
def total_posts(name):
print("Calculating total number of posts")
print()
api = PushshiftAPI()
api_request_generator = api.search_submissions(subreddit='ChatGPT', score = ">=0")
aita_submissions = pd.DataFrame([submission for submission in api_request_generator])
print("Total number of posts in subreddit {} are {}".format(name, aita_submissions.shape[0]))
return aita_submissions.shape[0]
def main(name, value):
print('Creating dataframe')
print()
i = 0
subreddit = reddit_read_only.subreddit(name)
while i < value:
#Limits the scrapping for value number of posts
for submission in subreddit.hot(limit=value):
submission.comments.replace_more(limit=(value*30))
lst = []
#If there are any comments, then it will be saved in dataframe
if submission.num_comments != 0:
for comment in submission.comments.list():
lst.append(comment.body)
df.loc[i] = [submission.title, submission.num_comments, lst]
#If there are no comments in a post, then No comments will be stored
elif submission.num_comments == 0:
df.loc[i] = [submission.title, submission.num_comments, ['No comments']]
i += 1
print(df)
name = 'Reddit_web_scrap_'+str(name) #save the file with certain name
df.to_csv(name + str('.csv'), index=False)
if __name__ == "__main__":
subreddit_name = 'gaming'
print('#####################################################################')
print('#### Reddit Web Scrapping Started for {}'.format(subreddit_name) + '####')
print('#####################################################################')
print()
posts_number = total_posts(subreddit_name)
print()
main(subreddit_name, posts_number)
print()
print('Created {}.csv file!'.format(subreddit_name))
print()
print('#####################################################################')
print('################# Reddit Web Scrapping Ended ########################')
print('#####################################################################')

Giphy-API not giving any results for normal users?

I'm trying to get the latest 100 posts from my giphy user.
It works for accounts like "giphy" and "spongebob"
But not for "jack0_o"
import requests
def get_user_gifs(username):
api_key = "API_KEY"
limit = 25 # The number of GIFs to retrieve per request (max 25)
offset = 0
# Set a flag to indicate when all GIFs have been retrieved
done = False
# Keep making requests until all GIFs have been retrieved
while not done:
# Make the request to the Giphy API
endpoint = f"https://api.giphy.com/v1/gifs/search?api_key={api_key}&q={username}&limit={limit}&offset={offset}&sort=recent"
response = requests.get(endpoint)
data = response.json()
# Extract the GIF URLs from the data and print them one per line
for gif in data["data"]:
print(gif["url"])
# Update the starting index for the next batch of GIFs
offset += limit
# Check if there are more GIFs to retrieve
if len(data["data"]) < limit or offset >= 100:
done = True
get_user_gifs("spongebob") #WORKS
get_user_gifs("jack0_o") #does not work
Already tried adding ratings with "pg", "r", "g"

How can I determine number of jobs for a given technology from a list I have obtained in .json format from GitHub job opening API in python?

My task is to Write a function to get the number of jobs for the given technology.
Note: The API gives a maximum of 50 jobs per page.
If you get 50 jobs per page, it means there could be some more job listings available.
So if you get 50 jobs per page you should make another API call for next page to check for more jobs.
If you get less than 50 jobs per page, you can take it as the final count.
Following is my code
baseurl = "https://jobs.github.com/positions.json"
def get_number_of_jobs(technology):
number_of_jobs = 0
tech = technology
page= 0
PARAMS = {'technology':tech , 'page': page}
jobs=requests.get(url=baseurl,params = PARAMS )
if jobs.ok:
listings = jobs.json()
number_of_jobs=len(listings)
if number_of_jobs==50:
page= page+1
PARAMS = {'technology':tech , 'page': page}
jobs=requests.get(url=baseurl,params = PARAMS )
if jobs.ok:
listings2 = jobs.json()
number_of_jobs= number_of_jobs + len(listings2)
return technology,number_of_jobs
Now I can not figure out how to do the pagination in this function? Meaning how to check if there are more than 50 job posting for a specific technology or not and if it is then run the code again and get those postings as well?
I print the output as
print(get_number_of_jobs('python'))
('python', 100)
Can someone please help?
Many thanks in advance!
Please let me know if should work
import requests
baseurl = 'https://jobs.github.com/positions.json'
total_job = 0
def get_number_of_jobs(technology, page):
global total_job
PARAMS = {'technology':technology , 'page': page}
jobs=requests.get(url=baseurl,params = PARAMS )
total_job += len(jobs.json()) if jobs.ok else 0
return len(jobs.json()) if jobs.ok else 0
def get_jobs(technology):
page = 0
while get_number_of_jobs(technology, page) >= 50:page+=1
return total_job
print(get_jobs('python'))
baseurl = 'https://jobs.github.com/positions.json'
def get_number_of_jobs(technology):
number_of_jobs = 0
page = 0
while True:
payload = {"description":technology,"page":page}
r = requests.get(baseurl,params=payload)
if r.ok:
data = r.json()
number_of_jobs = len(data)
if number_of_jobs >= 50:
page += 1
continue
else:
break
return technology,number_of_jobs

how to take all tweets in a hashtag with tweepy?

I'm trying to take every open tweets in a hashtag but my code does not go further than 299 tweets.
I also trying to take tweets from a specific time line like tweets only in May 2015 and July 2016. Are there any way to do it in the main process or should I write a little code for it?
Here is my code:
# if this is the first time, creates a new array which
# will store max id of the tweets for each keyword
if not os.path.isfile("max_ids.npy"):
max_ids = np.empty(len(keywords))
# every value is initialized as -1 in order to start from the beginning the first time program run
max_ids.fill(-1)
else:
max_ids = np.load("max_ids.npy") # loads the previous max ids
# if there is any new keywords added, extends the max_ids array in order to correspond every keyword
if len(keywords) > len(max_ids):
new_indexes = np.empty(len(keywords) - len(max_ids))
new_indexes.fill(-1)
max_ids = np.append(arr=max_ids, values=new_indexes)
count = 0
for i in range(len(keywords)):
since_date="2015-01-01"
sinceId = None
tweetCount = 0
maxTweets = 5000000000000000000000 # maximum tweets to find per keyword
tweetsPerQry = 100
searchQuery = "#{0}".format(keywords[i])
while tweetCount < maxTweets:
if max_ids[i] < 0:
if (not sinceId):
new_tweets = api.search(q=searchQuery, count=tweetsPerQry)
else:
new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
since_id=sinceId)
else:
if (not sinceId):
new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
max_id=str(max_ids - 1))
else:
new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
max_id=str(max_ids - 1),
since_id=sinceId)
if not new_tweets:
print("Keyword: {0} No more tweets found".format(searchQuery))
break
for tweet in new_tweets:
count += 1
print(count)
file_write.write(
.
.
.
)
item = {
.
.
.
.
.
}
# instead of using mongo's id for _id, using tweet's id
raw_data = tweet._json
raw_data["_id"] = tweet.id
raw_data.pop("id", None)
try:
db["Tweets"].insert_one(item)
except pymongo.errors.DuplicateKeyError as e:
print("Already exists in 'Tweets' collection.")
try:
db["RawTweets"].insert_one(raw_data)
except pymongo.errors.DuplicateKeyError as e:
print("Already exists in 'RawTweets' collection.")
tweetCount += len(new_tweets)
print("Downloaded {0} tweets".format(tweetCount))
max_ids[i] = new_tweets[-1].id
np.save(arr=max_ids, file="max_ids.npy") # saving in order to continue mining from where left next time program run
Have a look at this: https://tweepy.readthedocs.io/en/v3.5.0/cursor_tutorial.html
And try this:
import tweepy
auth = tweepy.OAuthHandler(CONSUMER_TOKEN, CONSUMER_SECRET)
api = tweepy.API(auth)
for tweet in tweepy.Cursor(api.search, q='#python', rpp=100).items():
# Do something
pass
In your case you have a max number of tweets to get, so as per the linked tutorial you could do:
import tweepy
MAX_TWEETS = 5000000000000000000000
auth = tweepy.OAuthHandler(CONSUMER_TOKEN, CONSUMER_SECRET)
api = tweepy.API(auth)
for tweet in tweepy.Cursor(api.search, q='#python', rpp=100).items(MAX_TWEETS):
# Do something
pass
If you want tweets after a given ID, you can also pass that argument.
Sorry, I can't answer in comment, too long. :)
Sure :) Check this example:
Advanced searched for #data keyword 2015 may - 2016 july
Got this url: https://twitter.com/search?l=&q=%23data%20since%3A2015-05-01%20until%3A2016-07-31&src=typd
session = requests.session()
keyword = 'data'
date1 = '2015-05-01'
date2 = '2016-07-31'
session.get('https://twitter.com/search?l=&q=%23+keyword+%20since%3A+date1+%20until%3A+date2&src=typd', streaming = True)
Now we have all the requested tweets,
Probably you could have problems with 'pagination'
Pagination url ->
https://twitter.com/i/search/timeline?vertical=news&q=%23data%20since%3A2015-05-01%20until%3A2016-07-31&src=typd&include_available_features=1&include_entities=1&max_position=TWEET-759522481271078912-759538448860581892-BD1UO2FFu9QAAAAAAAAETAAAAAcAAAASAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA&reset_error_state=false
Probably you could put a random tweet id, or you can parse first, or requests some data from twitter. It can be done.
Use Chrome's networking tab to find all the requested information :)
This code worked for me.
import tweepy
import pandas as pd
import os
#Twitter Access
auth = tweepy.OAuthHandler( 'xxx','xxx')
auth.set_access_token('xxx-xxx','xxx')
api = tweepy.API(auth,wait_on_rate_limit = True)
df = pd.DataFrame(columns=['text', 'source', 'url'])
msgs = []
msg =[]
for tweet in tweepy.Cursor(api.search, q='#bmw', rpp=100).items(10):
msg = [tweet.text, tweet.source, tweet.source_url]
msg = tuple(msg)
msgs.append(msg)
df = pd.DataFrame(msgs)
Check twitter api documentation, probably it allows just 300 tweets to parse.
I would recommend to forget api, make it with requests with streaming. The api is an implementation of requests with limitations.

Twython rate limit

I am getting a 'TwythonRateLimitError' and want to be sure that I don't screw up my account. I am new to working with the Twitter API. How can I check to make sure that I am not going over my query limit? I read that it is 150 queries/hour... What happens if I do? Am I at a risk of this in my code or is it only for particular commands?
I am not building an app, I am just trying to get a specific sample for twitter (random set of users with similar following bases (7500 to 10000 followers). My code so far is below. I will be saving the successful hits to a file but I am waiting to be sure that is necessary.
from twython import Twython, TwythonError, TwythonRateLimitError
from random import randint
APP_KEY = 'redacted'
APP_SECRET = 'redacted'
ACCESS_TOKEN = 'redacted'
twitter = Twython(APP_KEY, APP_SECRET, oauth_version=2)
ACCESS_TOKEN = twitter.obtain_access_token()
twitter = Twython(APP_KEY,access_token=ACCESS_TOKEN)
print "hello twitterQuery\n"
count = 0
step = 0
isError = 0
try:
#new account i made today to set upper bound on userID
maxID = twitter.show_user(screen_name="query_test")['id']
except TwythonRateLimitError:
isError = 1
ids = [0,0,0,0,0,0,0,0,0,0]
if isError == 0 and step <= 150:
while count < 10:
step = step +1
randomID = randint(1,maxID)
isMissing = 0
print str(step) + " " + str(randomID)
try:
randomUserData = twitter.show_user(user_id=randomID)
except TwythonError:
isMissing = 1;
if isMissing == 0:
followers = randomUserData['followers_count']
if followers >= 7500 and followers <= 10000:
print "ID: " + str(randomID) +", followers: "+ str(followers)
ids[count] = randomID
count = count+1
print "\ndone"
for each id in ids:
print id
to see your current rate limit status, pass in your app token and send a GET request to
https://api.twitter.com/1.1/account/rate_limit_status.json
and query the response.
See this page for further context

Categories