loop through paginated API using python - python

I am trying to retrieve all the devices from the REST API below, but the API's JSON response is limited to 10k rows. The code below only returns 40 rows, and I am not sure why. How do I send multiple requests to retrieve all the devices and get past the 10k-row limit?
https://developer.carbonblack.com/reference/carbon-black-cloud/platform/latest/devices-api/
n is the "num_found" from the json response which is retrieved from another python function and it's the total number of devices registered.
def getdevices(n, page_size=10000):
    """Fetch selected fields of every ACTIVE device, one page per request.

    Args:
        n: Total number of matching devices (the ``num_found`` value from the
            API response); used as the upper bound for the paging offsets.
        page_size: Number of rows requested per call.

    Returns:
        A list of dicts, one per device, containing only the keys listed in
        ``wanted_fields``.

    Exits the process with status 1 if any request does not return HTTP 200.
    """
    wanted_fields = (
        "id", "registered_time", "last_contact_time",
        "last_external_ip_address", "last_internal_ip_address",
        "last_location", "login_user_name", "name",
    )
    criteria = {
        "status": ["ACTIVE"],
        "target_priority": ["LOW", "MEDIUM", "HIGH", "MISSION_CRITICAL"],
    }
    all_endpoints = []
    # Step by exactly one page so that no row is skipped between pages.
    for start in range(0, n, page_size):
        # The paging window goes in the JSON search body; when it is only put
        # in the URL query string the service falls back to its default page.
        # NOTE(review): the API is reported to cap results at 10k rows, so
        # offsets beyond that may be rejected -- confirm against the Devices
        # API docs and the vendor's pagination article.
        body = {"criteria": criteria, "start": start, "rows": page_size}
        r = requests.post(
            hostname + '/appservices/v6/orgs/abcdefg/devices/_search',
            data=json.dumps(body),
            headers=headers,
        )
        if r.status_code != 200:
            logging.error('status code ' + str(r.status_code))
            logging.error('unable to get devices! ' + r.text)
            sys.exit(1)
        for d in r.json()['results']:
            all_endpoints.append({k: d[k] for k in wanted_fields})
    # Return only after every page has been collected.
    return all_endpoints
Another example is the CURL command on the webpage below, how would I use a similar approach with python for the code above to retrieve all the devices?
https://community.carbonblack.com/t5/Knowledge-Base/Carbon-Black-Cloud-How-To-Use-API-Pagination/ta-p/40205
Thanks

Related

How to paginate api in python

I'm using the API to get all the data, so I need to paginate: I need to loop through the pages to get all the data I want. Is there any way to extract this automatically?
Alternatively, should I somehow use a while loop to get all this data? What is the best way? Any thoughts?
import json
import os
from http import HTTPStatus
from urllib.parse import urlencode

import requests

# Gateway location and client credentials are handed over through the
# environment.
client_id = ""
client_secret = ""
os.environ["DX_GATEWAY"] = "http://api.com"
os.environ["DX_CLIENT_ID"] = client_id
os.environ["DX_CLIENT_SECRET"] = client_secret

# NOTE(review): requests.Request() is a request-description object and does
# not provide a .get() method; the call below presumably expects a different
# client object -- confirm which client is intended here.
dx_request = requests.Request()

path = "/path/to/api"
params = {
    "Type": "abc",
    "Id": "def",
    "limit": 999,
    "Category": "abc",
}
# urlencode escapes keys and values, unlike a hand-rolled "k=v" join.
params_str = urlencode(params)
url = "?".join([path, params_str])

vulns = dx_request.get(  ##also tried dx_request.args.get(
    url,
    version=1,
)
if vulns.status_code != int(HTTPStatus.OK):
    raise RuntimeError("API call did not return expected response: " + str(vulns))
response_data = vulns.json()
print(json.dumps(response_data))
Why not paginate with a simple loop using the Requests library that you have already imported in the code? (Requests has no built-in automatic pagination, so the loop advances the offset itself.)
e.g
import requests
url = 'https://api.example.com/items'
page_size = 100  # single source of truth for the page size used below
params = {'limit': page_size, 'offset': 0}  # initial parameters for the first page of results
while True:
    response = requests.get(url, params=params)
    # Fail loudly on an HTTP error instead of trying to parse an error body.
    response.raise_for_status()
    data = response.json()
    items = data['items']
    # do something with items here...
    if len(items) < page_size:
        break  # a short page means we've reached the last page
    params['offset'] += page_size  # advance the offset to retrieve the next page

Sample of filtered twitter api stream

I need to get a filtered sample of twitter stream
I'm using tweepy
I checked the functions of the Stream class for getting a sample stream and for filtering,
but I didn't catch how I should set up the class.
should it be
stream.filter(track=['']).sample()
stream.sample().filter(track=[''])
or each one in a line or what
And if you have another idea how to get a sample stream based on keyword filters please help
Thanks in advance
Twitter v2 APIs include an endpoint for random sampling and endpoint for filtered tweets.
The latter allows for specifying a random sample percentage in a query (for example, sample:10 will return a random 10% sample).
Note that v2 APIs are still new and at the moment have a cap of 500k tweets per month.
As an example for the latter, the following code (modified version of this, see this doc) will collect streaming data with cat or dog tags and store it in a json file for every 100 tweets. (Note: this does not include the random sampling query.)
import requests
import os
import json
import pandas as pd
# To set your environment variable, run the following line in your terminal:
# export 'BEARER_TOKEN'='<your_bearer_token>'
data = []  # tweets buffered by get_stream until the next file is written
counter = 0  # index used by get_stream in the next output file name


def create_headers(bearer_token):
    """Return the Authorization header dict for the given bearer token."""
    return {"Authorization": f"Bearer {bearer_token}"}
def get_rules(headers, bearer_token):
    """Fetch the rules currently registered for the filtered stream.

    ``bearer_token`` is accepted but not used by this function.

    Returns the decoded JSON response, which is also printed.
    Raises Exception if the endpoint does not answer with HTTP 200.
    """
    rules_response = requests.get(
        "https://api.twitter.com/2/tweets/search/stream/rules", headers=headers
    )
    status = rules_response.status_code
    if status != 200:
        raise Exception(
            "Cannot get rules (HTTP {}): {}".format(status, rules_response.text)
        )
    rules = rules_response.json()
    print(json.dumps(rules))
    return rules
def delete_all_rules(headers, bearer_token, rules):
    """Delete every rule listed under ``rules["data"]``.

    Returns None immediately when ``rules`` is None or has no "data" key.
    ``bearer_token`` is accepted but not used by this function.

    Raises Exception if the delete request does not return HTTP 200.
    """
    if rules is None:
        return None
    if "data" not in rules:
        return None

    rule_ids = [rule["id"] for rule in rules["data"]]
    delete_response = requests.post(
        "https://api.twitter.com/2/tweets/search/stream/rules",
        headers=headers,
        json={"delete": {"ids": rule_ids}},
    )
    if delete_response.status_code != 200:
        raise Exception(
            "Cannot delete rules (HTTP {}): {}".format(
                delete_response.status_code, delete_response.text
            )
        )
    print(json.dumps(delete_response.json()))
def set_rules(headers, delete, bearer_token):
    """Register the sample dog/cat image rules on the filtered stream.

    ``delete`` and ``bearer_token`` are accepted but not used by this function.

    Raises Exception if the rules endpoint does not return HTTP 201.
    """
    # You can adjust the rules if needed
    dog_rule = {"value": "dog has:images", "tag": "dog pictures"}
    cat_rule = {"value": "cat has:images -grumpy", "tag": "cat pictures"}
    add_response = requests.post(
        "https://api.twitter.com/2/tweets/search/stream/rules",
        headers=headers,
        json={"add": [dog_rule, cat_rule]},
    )
    if add_response.status_code != 201:
        raise Exception(
            "Cannot add rules (HTTP {}): {}".format(
                add_response.status_code, add_response.text
            )
        )
    print(json.dumps(add_response.json()))
def get_stream(headers, set, bearer_token):
    """Read the filtered stream and write every 100 tweets to a JSON file.

    Each received payload is printed. Its ``data`` member is appended to the
    module-level ``data`` buffer; whenever the buffer holds 100 entries it is
    written to ``tw_example_<counter>.json``, the buffer is emptied and the
    module-level ``counter`` is incremented.

    ``set`` and ``bearer_token`` are accepted but not used by this function.

    Raises Exception if the stream endpoint does not return HTTP 200.
    """
    import io  # local import: only needed to hand the JSON text to pandas

    global data, counter
    response = requests.get(
        "https://api.twitter.com/2/tweets/search/stream", headers=headers, stream=True,
    )
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(
            "Cannot get stream (HTTP {}): {}".format(
                response.status_code, response.text
            )
        )
    for response_line in response.iter_lines():
        if not response_line:
            # Skip empty lines (presumably keep-alive heartbeats).
            continue
        json_response = json.loads(response_line)
        print(json.dumps(json_response, indent=4, sort_keys=True))
        if 'data' not in json_response:
            # A payload without 'data' has no tweet to store; it has already
            # been printed above, so just move on instead of raising KeyError.
            continue
        data.append(json_response['data'])
        if len(data) % 100 == 0:
            print('storing data')
            # Wrap the JSON text in a file-like object: passing a literal JSON
            # string to read_json is deprecated in recent pandas versions.
            pd.read_json(io.StringIO(json.dumps(data)), orient='records').to_json(
                f'tw_example_{counter}.json', orient='records'
            )
            data = []
            counter += 1
def main():
    """Reset the stream rules, install the sample rules and start streaming.

    The bearer token is read from the BEARER_TOKEN environment variable.
    """
    token = os.environ.get("BEARER_TOKEN")
    auth_headers = create_headers(token)
    existing_rules = get_rules(auth_headers, token)
    deleted = delete_all_rules(auth_headers, token, existing_rules)
    added = set_rules(auth_headers, deleted, token)
    get_stream(auth_headers, added, token)


if __name__ == "__main__":
    main()
Then load the data into a pandas DataFrame, for example (the files are named tw_example_0.json, tw_example_1.json, ...):
df = pd.read_json('tw_example_0.json', orient='records')
I'd suggest reading the api documentation for tweepy. Here you can see how to filter the stream like you want to.
From reading other code snippets, I believe it should be done like this:
stream.filter(track=['Keyword'])
print(stream.sample())
As I understand, tweepy uses twitter v1.1 APIs, which has separate APIs for sampling and filtering tweets in real time.
Twitter API references.
v1 sample-realtime
v1 filter-realtime
Approach 1: one can get filtered stream data using stream.filter(track=['Keyword1', 'keyword2']) etc. and then sample records from the collected data.
class StreamListener(tweepy.StreamListener):
    """Skeleton listener; put per-status processing in ``on_status``."""

    def on_status(self, status):
        # NOTE(review): presumably invoked by tweepy for each incoming status
        # -- confirm against the tweepy streaming documentation.
        # do data processing and storing here
        pass  # placeholder so the method body is valid Python
See examples like https://www.storybench.org/how-to-collect-tweets-from-the-twitter-streaming-api-using-python/ and "Ignoring Retweets When Streaming Twitter Tweets".
Approach 2: one can write a program that starts and stops streaming at random time intervals (for example, randomly sampling a 3-minute interval in every 15 minutes).
Approach 3: one can instead use the sampling API to collect data and then filter with keyword to store relevant data.

Scraping Pricing off a search Bar - reached server limit

With help on Stackoverflow, I was able to come up with the scraper. The code returns a list of part numbers and its corresponding prices.
part1 price1
part2 price2
...
...
partn pricen
However, the website seems to allow at most 200 results per request - when I raise the limit above 200 I get the error: "raise JSONDecodeError("Expecting value", s, err.value) from None JSONDecodeError: Expecting value".
I just want to know if there's a way to avoid this error? If not I can raise start:0 by 200 each time, but since I would have 100k+ items easily it won't be very efficient..is there a way I can loop the limit and the start function?
Please see the codes below, any help appreciated!
import requests
# import pprint # to format data on screen `pprint.pprint()
import pandas as pd
# --- functions ---
def get_data(query, start=0, limit=200):
    """Get one page of search results from the server.

    Args:
        query: Search text sent as the ``query`` field of the payload.
        start: Offset of the first result to return.
        limit: Maximum number of results to request for this page.

    Returns:
        The decoded JSON response.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            the failure is reported before any attempt to decode the body.
    """
    payload = {
        # "facets":[{
        # "name":"OEM",
        # "value":"GE%20Healthcare"
        # }],
        "facets": [],
        "facilityId": 38451,
        "id_ins": "a2a3d332-73a7-4194-ad87-fe7412388916",
        "limit": limit,
        "query": query,
        "referer": "/catalog/Service",
        "start": start,
        # "urlParams":[{
        # "name": "OEM",
        # "value": "GE Healthcare"
        # }],
        "urlParams": [],
    }
    r = requests.post('https://prodasf-vip.partsfinder.com/Orion/CatalogService/api/v1/search', json=payload)
    # An error response has no JSON body to decode; surface the HTTP error
    # rather than a confusing JSONDecodeError.
    r.raise_for_status()
    return r.json()
# For each query, collect part numbers and option prices and write them to CSV.
all_queries = ['GE Healthcare']
for query in all_queries:
    #print('\n--- QUERY:', query, '---\n')
    data = get_data(query)
    Part_Num, Vendor_Item_Num, price = [], [], []
    for item in data['products']:
        options = item['options']
        if options:
            option_prices = [opt['price'] for opt in options]
            # NOTE(review): this also reads 'price'; a vendor item number
            # field was presumably intended -- confirm the correct key.
            option_vendors = [opt['price'] for opt in options]
            part_number = item['partNumber']
            Part_Num.append(part_number)
            Vendor_Item_Num.append(option_vendors)
            price.append(option_prices)
        else:
            # Products without options get empty placeholders in every list.
            Part_Num.append([])
            Vendor_Item_Num.append([])
            price.append([])
    # Part numbers and prices are placed side by side as columns.
    list_of_dataframes = [pd.DataFrame(Part_Num), pd.DataFrame(price)]
    pd.concat(list_of_dataframes, axis=1).to_csv(r'C:\Users\212677036\Documents\output7.csv')
You should always check the status_code to confirm that your request was successful. The API returns HTTP 500 when limit is > 200. You need to study the documentation of the API. Many APIs limit requests per second and maximum request size so they can maintain a reliable service.
The json() method will fail if the HTTP request was not successful.
You can get the data in batches. In the sample code below I stop at 2000 rows because I don't want to stay in the loop for 500+ iterations... You could consider using threading so it's not so sequential.
All of this is covered in SO prodasf-vip
import pandas as pd
import requests

SEARCH_URL = 'https://prodasf-vip.partsfinder.com/Orion/CatalogService/api/v1/search'
PAGE_SIZE = 200  # the server answers HTTP 500 for larger limits
MAX_ROWS = 2000  # deliberate cap so the demo does not loop hundreds of times

query = 'GE Healthcare'
payload = {
    "facets": [],
    "facilityId": 38451,
    "id_ins": "a2a3d332-73a7-4194-ad87-fe7412388916",
    "limit": PAGE_SIZE,
    "query": query,
    "referer": "/catalog/Service",
    "start": 0,
    "urlParams": [],
}

r = requests.post(SEARCH_URL, json=payload)
# Stop here on failure: everything below needs the first page's JSON.
r.raise_for_status()
js = r.json()

# Collect the pages in a list and concatenate once at the end.
frames = [pd.json_normalize(js["products"])]
fetched = len(frames[0])
while fetched < js["totalResults"] and fetched < MAX_ROWS:
    payload["start"] += PAGE_SIZE
    r = requests.post(SEARCH_URL, json=payload)
    if r.status_code != 200:
        break
    page = pd.json_normalize(r.json()["products"])
    if page.empty:
        # No more rows even though totalResults says otherwise; without this
        # check the loop would never terminate.
        break
    frames.append(page)
    fetched += len(page)

df = pd.concat(frames, ignore_index=True)
print(f"want: {js['totalResults']} got: {len(df)}")
df  # displays the frame when run in a notebook; no effect in a script

How to return unique values from Google Places API?

I'm making a request from Google Places API with a query "Hotels in Brazil", with the following code:
import time

# Text Search endpoint of the Places API
url = "https://maps.googleapis.com/maps/api/place/textsearch/json?"
# The text string on which to search
query = input('Search query: ')
params = {
    'query': query,
    'key': api_key,
    'maxResults': '1000',
}
# First page of results
r = requests.get(url, params=params)
x = r.json()
# store the value of result key in variable y
y = x['results']
# Start with the first page so its places are not lost.
places = list(y)
# Follow "next_page_token" until the service stops returning one.
while "next_page_token" in x:
    # A freshly issued token is not valid immediately; wait briefly.
    time.sleep(2)
    # Plain string (no trailing comma, which would make it a tuple) under the
    # parameter name the Text Search endpoint expects.
    params['pagetoken'] = x['next_page_token']
    res = requests.get(url, params=params)
    # Rebind x to the new page: otherwise the loop condition never changes
    # and the same page is fetched forever, producing duplicates.
    x = res.json()
    places.extend(x.get('results', []))
print(places)
But it's taking too long to return the results (more than 1 day). So I stopped the execution, and when I printed the results I got many repeated names; for example, I got 16K places but only 26 unique places.
Is it possible to return unique values from Google Places API?

Pagination in Python for API from Solr

I am using Python to extract data from a Solr API, like so:
import requests
# Credentials and endpoint for the Solr API
user = 'my_username'
password = 'my password'
url = 'my_url'

print("Accessing API..")
# Basic-auth GET against the endpoint
req = requests.get(url, auth=(user, password))
print("Accessed!")
out = req.json()  # decoded JSON body of the response
#print(out)
However, it looks like in some of the API URLs: the output is fairly "large" (many of the columns are lists of dictionaries), and so it doesn't return all rows, which are necessary.
From looking around, it looks like I should use pagination to bring in results in specified increments. Something like this:
url = 'url?start=0&rows=1000'
Then,
url = 'url?start=1000&rows=1000'
and so on, until no results are returned.
The way I am thinking about it is write a loop, and append result to output with every loop. However, I am not sure how to do that.
Would someone be able to help please?
Thank you in advance!
Did you look at the output? In my experience, a Solr response usually includes a 'numFound' in its result. On an (old) Solr instance I have locally, a random query gives this result:
{
"responseHeader": {
"status": 0,
"QTime": 1,
"params": {
"q": "*:*",
"indent": "true",
"start": "0",
"rows": "10",
"wt": "json",
"_": "1509460751164"
}
},
"response": {
"numFound": 7023,
"start": 0,
"docs": [.. 10 docs]
}
}
While working out this code example, I realized you don't really need numFound. Solr will just return an empty list for docs if there are no further results, which makes the loop easier to write.
import requests
user = 'my_username'
password = 'my password'
# Starting values
start = 0
rows = 1000  # static, but easier to manipulate if it's a variable
# Query parameters are separated by '&'; only the first one follows '?'.
base_url = 'my_url?rows={0}&start={1}'

all_results = []
total_found = 0
while True:
    # Rebuild the url based on the current start offset.
    url = base_url.format(rows, start)
    req = requests.get(url=url, auth=(user, password))
    out = req.json()
    response = out.get('response', {})
    if start == 0:
        # Total reported by the first page; informational only.
        total_found = response.get('numFound', 0)
    results = response.get('docs', [])
    # An empty docs list means there are no further results.
    if not results:
        break
    all_results += results
    # Advance by one page so the next request fetches the next `rows` docs.
    start += rows

# all_results now contains the 'docs' of every request.
print(all_results)
Mind you.. those docs will be dict like, so more parsing will be needed.

Categories