How to query arXiv for a specific year? - python

I'm using the code shown below in order to retrieve papers from arXiv. I want to retrieve papers that have words "machine" and "learning" in the title. The number of papers is large, therefore I want to implement a slicing by year (published).
How can I request records of 2020 and 2019 in search_query? Please notice that I'm not interested in post-filtering.
import urllib.request
import time
import feedparser
# arXiv API endpoint; the query string is appended right after the '?'
base_url = 'http://export.arxiv.org/api/query?'

# Search term, URL-encoded for use inside the query string
search_query = urllib.parse.quote("ti:machine learning")

# Paging setup: fetch [start, total_results) in pages of results_per_iteration,
# pausing wait_time seconds between requests
start = 0
total_results = 5000
results_per_iteration = 1000
wait_time = 3

papers = []

print('Searching arXiv for %s' % search_query)

offset = start
while offset < total_results:
    print("Results %i - %i" % (offset, offset + results_per_iteration))
    params = 'search_query=%s&start=%i&max_results=%i' % (
        search_query, offset, results_per_iteration)

    # Fetch one page and let feedparser parse it
    raw_feed = urllib.request.urlopen(base_url + params).read()
    parsed = feedparser.parse(raw_feed)

    # Keep the fields of interest from every returned entry
    for item in parsed.entries:
        papers.append({
            "date": item.published,
            "title": item.title,
            # feedparser v4.1 only grabs the first author
            "first_author": item.author,
            "summary": item.summary,
        })

    print('Bulk: %i' % 1)
    # Sleep a bit before calling the API again
    time.sleep(wait_time)
    offset += results_per_iteration

According to the arXiv documentation, there is no published or date field available.
What you can do is to sort the results by date (by adding &sortBy=submittedDate&sortOrder=descending to your query parameters) and stop making requests when you reach 2018.
Basically your code should be modified like this:
import urllib.request
import time
import feedparser
# Base api query url
base_url = 'http://export.arxiv.org/api/query?'
# Search parameters
search_query = urllib.parse.quote("ti:machine learning")
i = 0
results_per_iteration = 1000
wait_time = 3
papers = []
year = ""
# Results are requested newest-first, so paging can stop at the first entry
# published in this year or earlier. Four-digit years compare correctly as
# strings.
stop_year = "2018"
reached_stop_year = False
print('Searching arXiv for %s' % search_query)
while not reached_stop_year:
    print("Results %i - %i" % (i, i + results_per_iteration))
    query = ('search_query=%s&start=%i&max_results=%i'
             '&sortBy=submittedDate&sortOrder=descending'
             % (search_query, i, results_per_iteration))
    # perform a GET request using the base_url and query
    response = urllib.request.urlopen(base_url + query).read()
    # parse the response using feedparser
    feed = feedparser.parse(response)
    if not feed.entries:
        # An empty page means there is nothing left to fetch; without this
        # check the loop would keep requesting empty pages forever.
        break
    # Run through each entry and keep it until the stop year is reached
    for entry in feed.entries:
        year = entry.published[0:4]
        if year <= stop_year:
            # Everything from here on is too old; do not collect it.
            reached_stop_year = True
            break
        papers.append({
            "date": entry.published,
            "title": entry.title,
            # feedparser v4.1 only grabs the first author
            "first_author": entry.author,
            "summary": entry.summary,
        })
    print('Bulk: %i' % 1)
    i += results_per_iteration
    if not reached_stop_year:
        # Sleep a bit before calling the API again
        time.sleep(wait_time)
For the "post-filtering" approach, once enough results are collected, I'd do something like this:
# Keep only the collected papers whose date string begins with the year 2019
papers2019 = [paper for paper in papers if paper["date"].startswith("2019")]

Related

How to get all the tracks of a Spotify playlist

I'm trying to get all the tracks from 2 playlists into a CSV file. However, in both playlists, even though I increase the offset parameter by 100 in each query, the first 100 songs of both playlists are returned. So the page is never changed. What could be the problem?
import spotipy, json, csv
from spotipy.oauth2 import SpotifyClientCredentials
# NOTE(review): client_id and client_secret are not defined in this excerpt;
# presumably they are set earlier in the asker's script.
client_credentials_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Output CSV: one header row, then one row per track
data_file = open('data.csv', 'w')
writer = csv.writer(data_file)
writer.writerow(['track_num', 'track_id', 'track_name', 'first_artist'] + ['liked'])

playlist_ids = [
    'xxxxxxxxxxxxxxxxxxxxxxx',  # playlist 1
    'yyyyyyyyyyyyyyyyyyyyyyy'  # playlist 2
]

for playlist_id in playlist_ids:
    offset_n = 0
    total = 100  # provisional value, replaced after the first response
    while offset_n < total:
        tracks_response = sp.playlist_tracks(playlist_id, offset=offset_n)
        # Round-trip through JSON, as in the original snippet
        tracks_data = json.loads(json.dumps(tracks_response))
        if offset_n == 0:
            total = tracks_data['tracks']['total']
        # Tracks of the first playlist are flagged 1, all others 0
        liked = 1 if playlist_id == playlist_ids[0] else 0
        for entry in tracks_data['tracks']['items']:
            info = entry['track']
            track_id = info['id']
            track_name = info['name']
            first_artist = info['artists'][0]['name']
            # NOTE(review): row_num is not defined anywhere in this excerpt.
            writer.writerow([row_num, track_id, track_name, first_artist] + [liked])
        offset_n += 100

data_file.close()
The playlist_tracks method returns a paginated result with details of the tracks of a playlist.
So you need to iterate over all pages to get the full data.
You can use this example as a reference:
def get_all_tracks_from_playlist(playlist_id, client=None):
    """Return the track items of a playlist, following every result page.

    Args:
        playlist_id: Identifier passed to ``playlist_tracks``.
        client: Object providing ``playlist_tracks(playlist_id)`` and
            ``next(page)``. Defaults to the module-level ``sp`` client.

    Returns:
        A new list holding the ``"items"`` of every page, in page order.
    """
    if client is None:
        client = sp
    page = client.playlist_tracks(playlist_id)
    # Copy so that extending the result does not mutate the first response.
    tracks = list(page["items"])
    # A falsy "next" value marks the last page.
    while page["next"]:
        page = client.next(page)
        tracks.extend(page["items"])
    return tracks
Regarding the ReadTimeout exception you have mentioned in the comments:
Spotify client accepts requests_timeout and retries as arguments, according to the documentation the default values are requests_timeout=5, and retries=3
You can extend them as you wish to decrease the chance you will get the ReadTimeout exception.
As a start you can double the request timeout to 10 seconds, and change the retries to 5:
# Same client as before, with a 10 second request timeout and 5 retries
sp = spotipy.Spotify(
    client_credentials_manager=client_credentials_manager,
    requests_timeout=10,
    retries=5,
)

python - YouTube API search().list_next() not working well

I'm using this code. But I get 50 results only. It does not show the next 50 results. I mean, it does not get the nextPageToken. Am I doing something wrong? Or search().list_next() doesn't work?
def youtube_search(options):
    """Print id, title and publish date of every video result, page by page.

    `options.q` supplies the search term. Paging continues for as long as
    `list_next` hands back a truthy follow-up request.
    """
    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                    developerKey=DEVELOPER_KEY)
    search = youtube.search()
    request = search.list(
        q=options.q,
        part="id,snippet",
        maxResults=50,
        channelId="my_channel_id",
        order="date",
        type="video",
    )
    while request:
        response = request.execute()
        for result in response["items"]:
            # Skip anything that is not a video result
            if result["id"]["kind"] != "youtube#video":
                continue
            snippet = result["snippet"]
            video_id = result["id"]["videoId"]
            video_title = snippet["title"]
            video_date = snippet["publishedAt"]
            print("%s # %s # %s" % (video_id, video_title, video_date))
        # Build the request for the following page from the current pair
        request = youtube.search().list_next(request, response)
I'm searching in my channel using channelId, and I know I have more than 50 results.
The maxResults, is the results per page. (value by default is 50)
q=options.q is the query, a word to base the search results ('dog' for example)

Tweepy Twitter API Rate Limit Exceeded

I'm experimenting with a Python script (taken from here) that traces the retweet path of a given tweetID.
I'm aware of the very restrictive rate limits on the Twitter API, but I'm hitting the following error every time I execute the script:
Caught TweepError: [{u'message': u'Rate limit exceeded', u'code': 88}]
The script I'm using is as follows:
#!/usr/bin/python -u
#
# Usage: ./trace.py <tweetId>
#
import sys
import tweepy
import Queue
import time
import json
import redis
# Twitter API credentials ('x' values are placeholders to be filled in)
CONSUMER_KEY = 'x'
CONSUMER_SECRET = 'x'
ACCESS_KEY = 'x'
ACCESS_SECRET = 'x'
# Template for the Redis key of a user's follower hash; filled with a screen name
REDIS_FOLLOWERS_KEY = "followers:%s"
# Retweeters who have not yet been connected to the social graph
# (the main script stores them as screen_name -> user object)
unconnected = {}
# Retweeters connected to the social graph...become seeds for deeper search
connected = Queue.Queue()
# Social graph: `nodes` holds one dict per user; `links` holds
# {'source': index, 'target': index} dicts whose indices point into `nodes`
links = []
nodes = []
#----------------------------------------
def addUserToSocialGraph (parent, child):
    """Add `child` to the graph nodes and, when `parent` is given, link them.

    parent: tweepy.models.User (or None for a node without an incoming link)
    child: tweepy.models.User
    """
    global links
    if not child:
        return

    node = {'id': child.id,
            'screen_name': child.screen_name,
            'followers_count': child.followers_count,
            'profile_image_url': child.profile_image_url}
    nodes.append (node)

    # TODO: Find child and parent indices in nodes in order to create the links
    if not parent:
        return

    print (nodes)
    print ("Adding to socialgraph: %s ==> %s" % (parent.screen_name, child.screen_name))
    source_index = getNodeIndex (parent)
    target_index = getNodeIndex (child)
    links.append ({'source': source_index, 'target': target_index})
#----------------------------------------
def getNodeIndex (user):
    """Return the position in `nodes` of the entry whose id equals `user.id`.

    user: tweepy.models.User
    Returns -1 when no entry matches.
    """
    global nodes
    for position, node in enumerate(nodes):
        if (user.id == node["id"]):
            return position
    return -1
#----------------------------------------
def isFollower (parent, child):
    """Report whether `child` appears in the cached follower hash of `parent`.

    parent: tweepy.models.User
    child: tweepy.models.User
    The follower hash is (re)crawled first when it is missing, or when
    `parent.followers_count` exceeds the cached count by more than 10%.
    """
    global red
    followers_key = REDIS_FOLLOWERS_KEY % parent.screen_name

    # Fetch data from Twitter if we dont have it
    have_cache = red.exists (followers_key)
    if not have_cache:
        print ("No follower data for user %s" % parent.screen_name)
        crawlFollowers (parent)

    # Incomplete follower data (beyond a 10% margin for error) triggers a re-crawl
    cached_total = red.hlen (followers_key)
    tolerated_total = cached_total * 1.1
    if parent.followers_count > tolerated_total:
        crawlFollowers (parent)

    return red.hexists (followers_key, child.screen_name)
#----------------------------------------
def crawlFollowers (user):
    """Store every follower of `user` in the Redis follower hash.

    user: tweepy.models.User

    Each follower is written as screen_name -> followers_count under the key
    built from REDIS_FOLLOWERS_KEY. A lookup that fails with a TweepError is
    retried after sleeping, except for "Not authorized", which ends the crawl.
    """
    print ("Retrieving followers for %s (%d)" % (user.screen_name, user.followers_count))
    count = 0
    follower_cursors = tweepy.Cursor (api.followers, id = user.id, count = 15)
    followers_iter = follower_cursors.items()
    # Same key format that isFollower reads from
    followers_key = REDIS_FOLLOWERS_KEY % user.screen_name
    follower = None
    while True:
        try:
            # We may have to retry a failed follower lookup, so only advance
            # the iterator when the previous follower has been stored.
            if follower is None:
                # Built-in next() works on both Python 2.6+ and Python 3
                follower = next(followers_iter)
            # Add link to Redis
            red.hset (followers_key, follower.screen_name, follower.followers_count)
            follower = None
            count += 1
        except StopIteration:
            break
        except tweepy.error.TweepError as err:
            print ("Caught TweepError: %s" % (err))
            if (err.reason == "Not authorized" ):
                print ("Not authorized to see users followers. Skipping.")
                break
            limit = api.rate_limit_status()
            if (limit['remaining_hits'] == 0):
                # Clamp at zero: a reset time already in the past must not
                # produce a negative sleep duration.
                seconds_until_reset = max(0, int (limit['reset_time_in_seconds'] - time.time()))
                print ("API request limit reached. Sleeping for %s seconds" % seconds_until_reset)
                time.sleep (seconds_until_reset + 5)
            else:
                print ("Sleeping a few seconds and then retrying")
                time.sleep (5)
    print ("Added %d followers of user %s" % (count, user.screen_name))
#----------------------------------------
# Main
#----------------------------------------
# Traces how the tweet given on the command line spread through retweets and
# writes the resulting graph to "<tweet id>.json".
tweetId = sys.argv[1]
# Connect to Redis
red = redis.Redis(unix_socket_path="/tmp/redis.sock")
# Connect to Twitter
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_KEY, ACCESS_SECRET)
api = tweepy.API(auth)
print (api.rate_limit_status())
# Get original Tweet details
status = api.get_status (tweetId)
connected.put(status.user)
addUserToSocialGraph (None, status.user)
retweets = api.retweets (status.id)
print ("Tweet %s, originally posted by %s, was retweeted by..." % (status.id, status.user.screen_name))
for retweet in retweets:
    print (retweet.user.screen_name)
    unconnected[retweet.user.screen_name] = retweet.user
# Pivot
while not (connected.empty() or len(unconnected)==0):
    # Get next user
    pivot = connected.get()
    # Check followers of this user against unconnected retweeters
    print ("Looking through followers of %s" % pivot.screen_name)
    # Iterate over a snapshot: entries are deleted from `unconnected` inside
    # the loop, which a live dict view does not allow on Python 3.
    for (screen_name, retweeter) in list(unconnected.items()):
        if (isFollower(pivot, retweeter)):
            print ("%s <=== %s" % (pivot.screen_name, retweeter.screen_name))
            connected.put (retweeter)
            addUserToSocialGraph (pivot, retweeter)
            del unconnected[retweeter.screen_name]
        else:
            print ("%s <=X= %s" % (pivot.screen_name, retweeter.screen_name))
# Add unconnected nodes to social graph
for (screen_name, user) in unconnected.items():
    addUserToSocialGraph (None, user)
# Encode data as JSON
filename = "%s.json" % status.id
print ("\n\nWriting JSON to %s" % filename)
tweet = {'id':status.id,
         'retweet_count':status.retweet_count,
         'text':status.text,
         'author':status.user.id}
# The with-block closes the file; the original `f.close` lacked the call
# parentheses and therefore never closed it.
with open (filename, 'w') as f:
    f.write (json.dumps({'tweet':tweet, 'nodes':nodes, 'links':links}, indent=2))
sys.exit()
I'm sensing that I'm making a mistake in the crawlFollowers function.
Is there a way to somehow stagger the crawler to stay within the rate limit or conform to the rate limit?
Try running with the wait_on_rate_limit flag set to True in Tweepy API:
# Let Tweepy wait whenever the rate limit is hit, and print a notice when it does
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)

Getting an HTML element and sending new JSON requests in Python

I try to crawl this link by sending json requests. My first request would be :
# Endpoint that returns the headlines as JSON
firstUrl = "http://www.marketwatch.com/news/headline/getheadlines"

# Query parameters of the first request
parameters1 = {
    'ticker': 'XOM',
    'countryCode': 'US',
    'dateTime': '',
    'docId': '1222737422 ',
    'docType': '806',
    'sequence': 'e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2',
    'messageNumber': '',
    'count': '10',
    'channelName': '/news/latest/company/us/xom',
    'topic': '',
    '_': '',
}

# NOTE(review): `header` is not defined in this excerpt.
html1 = requests.get(firstUrl, params=parameters1, headers=header)
html_json1 = json.loads(html1.text)
To send the next requests, I have to extract the docId from the corresponding HTML and add it to the new parameters. I don't know how to do that. Do you have any idea how to get the new HTML file after sending the JSON requests?
import requests
import json
from bs4 import BeautifulSoup
def main():
    """Page interactively through the MarketWatch headlines for XOM.

    The stock page is fetched first to read the `data-uniqueid` of its last
    news item. Headlines are then requested in batches of 10 from the JSON
    endpoint, and the paging parameters are refreshed from the last item of
    each batch. The loop ends when a batch comes back empty.

    Raises:
        Exception: if an HTTP request does not return status 200, or if the
            stock page contains no headline carrying `data-uniqueid`.
    """
    html_url = 'http://www.marketwatch.com/investing/stock/xom'
    resp = requests.get(html_url)
    if resp.status_code != 200:
        raise Exception("http request failed: %s" % resp)
    soup = BeautifulSoup(resp.text, 'lxml')
    # get value of `data-uniqueid` from last news node of 'MarketWatch News on XOM'
    news_nodes = soup.select("#mwheadlines > div.headlinewrapper > ol > li[data-uniqueid]")
    if not news_nodes:
        # Indexing an empty selection would otherwise raise a bare IndexError.
        raise Exception("no headline with data-uniqueid found at %s" % html_url)
    li_node = news_nodes[-1]
    unique_id = li_node['data-uniqueid']
    print('got unique_id=%r, from %r' % (unique_id, li_node.text.replace('\n', ' ').strip()))
    baseUrl = 'http://www.marketwatch.com/news/headline/getheadlines'
    parameters = {
        'ticker':'XOM',
        'countryCode':'US',
        'docType':'806',
        'docId': '', # (Optional) initial value extract from HTML page
        'sequence':'e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2', # initial value extract from HTML page
        'messageNumber':'8589', # initial value extract from HTML page
        'count':'10',
        'channelName': '/news/latest/company/us/xom',
    }
    parameters.update(extract_page_params(unique_id))
    while True:
        resp = requests.get(baseUrl, params = parameters)
        if resp.status_code != 200:
            raise Exception("http request failed: %s" % resp)
        data = json.loads(resp.text) # array of up to 10 items
        if not data:
            # Nothing older is left; stop instead of indexing an empty list.
            print("\nno more data, url: %s" % resp.url)
            break
        first = data[0] # get first item of array
        last = data[-1] # get last item of array
        print("\ngot %d data, url: %s" % (len(data), resp.url))
        print("\tfirst: %-42s, %s" % (first['UniqueId'], first['SeoHeadlineFragment']))
        print("\t last: %-42s, %s" % (last['UniqueId'], last['SeoHeadlineFragment']))
        print("")
        uid = last['UniqueId'] # get value of UniqueId from dict object `last`
        parameters.update(extract_page_params(uid))
        input("press <enter> to get next")
def extract_page_params(uid):
    """Turn a headline unique id into the paging parameters it encodes.

    A uid containing ':' (for example
    `e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2:8499`) is split at its LAST colon
    into `sequence` and `messageNumber`. Any other uid is used as `docId`.

    Returns:
        A dict with the keys 'sequence', 'messageNumber' and 'docId'; the
        values that do not apply are empty strings.
    """
    sequence = ''
    messageNumber = ''
    docId = ''
    if ':' in uid:
        # Split once from the right so that a uid with more than one colon
        # does not break the two-value unpacking.
        sequence, messageNumber = uid.rsplit(':', 1)
    else:
        docId = uid
    return {
        'sequence': sequence,
        'messageNumber': messageNumber,
        'docId': docId,
    }
# Run main() only when this file is executed as a script
if __name__ == "__main__":
    main()
This is my code to solve your problem.
Since you are new to programming, i have added some comments.
You can copy it directly and run it with Python 3 (Python 2 should work as well).
You can use Beautiful Soup to extract data from HTML. It is a Python library for extracting data from HTML.

gae python datastore query

I'm new to GAE and to Python too. I'm trying to build a simple app using the datastore; the script is as follows:
from google.appengine.ext import db
from google.appengine.ext import webapp
from google.appengine.ext.webapp import util
class Pincodes(db.Model):
    """Datastore model with two string properties: `city` and `code`."""

    city = db.StringProperty()
    code = db.StringProperty()
class MainHandler(webapp.RequestHandler):
    """Request handler that writes the city and code of one Pincodes entity."""

    def get(self):
        # Query for a single entity whose city equals "some_city"
        q = Pincodes.all().filter("city =", "some_city")
        p = q.get()
        # NOTE(review): `pincode` is not defined in this handler; the fetched
        # entity is bound to `p`.
        self.response.out.write(
            'city: %s code: %s' % (pincode.city, pincode.code))
My script also contains the usual def main() and if __name__ block. I'm developing it step by step from the simple hello-world app shown in the code.google docs, and that worked fine. I have uploaded sample pincode data containing 10 records to the local datastore, and that is fine too, but I'm not able to query it and display it on the web page. I tried self.response.out.write and the output is "city: code: " and not "city: mumbai code:400001". What's wrong with my script?
use
# get() returns a single entity; format it and write it to the response
entity = q.get()
self.response.out.write('city: %s code: %s' % (entity.city, entity.code))
instead of the print
edit:
def get(self):
    """Write the city and code of one entity whose city is "some_city"."""
    matching = Pincodes.all().filter("city =", "some_city")
    # get() returns a single entity
    entity = matching.get()
    self.response.out.write('city: %s code: %s' % (entity.city, entity.code))
edit2:
def get(self):
    """Write one matching entity's city and code, or a not-found message."""
    matching = Pincodes.all().filter("city =", "some_city")
    # get() returns a single entity
    entity = matching.get()
    if not entity:
        self.response.out.write('sorry no entities found')
        return
    self.response.out.write('city: %s code: %s' % (entity.city, entity.code))

Categories