Python - Mocking an API request

What am I doing wrong? I have an extractor that works great but writing the test is stumping me and it's failing. Can anyone help me figure out where I'm going wrong?
from unittest.mock import MagicMock, patch

import pandas as pd
import requests

from my_project.task import extractor
from my_project.tests import utils
from prefect.logging import disable_run_logger

CONTACT_RECORD = utils.TEST_CONTACT_RECORD
PAGED_CONTACT_RECORD = utils.TEST_PAGED_CONTACT_RECORD
EXPECTED_CONTACT_RECORD = utils.EXPECTED_CONTACT_RECORD


@patch("requests.get")
def test_contact_extractor(get: MagicMock):
    """
    Should call "requests.get" once and return a json
    containing contact data.
    """
    get.return_value.json.return_value = CONTACT_RECORD
    with disable_run_logger():
        result = extractor.get_contacts()
    assert get.call_count == 1
    assert result == pd.DataFrame(EXPECTED_CONTACT_RECORD)


@patch("my_project.extractor.get_contacts")
def test_get_paged_contacts(get_contacts: MagicMock):
    """
    Should run "requests.get" until ['has-more'] is False
    and there is no offset value.
    """
    get_contacts.return_value.json.side_effect = [
        PAGED_CONTACT_RECORD,
        PAGED_CONTACT_RECORD,
        PAGED_CONTACT_RECORD,
        CONTACT_RECORD,
    ]
    with disable_run_logger():
        data = extractor.get_paged_contacts(
            endpoint=MagicMock, query_string=MagicMock, df=MagicMock
        )
    assert get_contacts.call_count == 4
    assert data == pd.DataFrame(EXPECTED_CONTACT_RECORD)

Some errors I'm getting are:

requests imported but not used
Callable[[Union[str, bytes], ...], Response] has no attribute "return_value"

EDIT:
No longer getting the second error because I realized I had a typo, but currently I'm getting:

AttributeError: 'NoneType' object has no attribute 'client'
Edit:
Here is my get_paged_contacts() function:
def get_paged_contacts(
    endpoint: str, query_string: typing.Dict[str, typing.Any], df: pd.DataFrame
) -> pd.DataFrame:
    """
    Return the results of the get request.

    Loops over the api response and appends the results of a while loop for pagination,
    then merges the results with the previously extracted dataframe.
    """
    url = endpoint
    contacts = []
    response = requests.request("GET", url, headers=header, params=query_string).json()
    has_more = response["has-more"]
    offset = response["vid-offset"]
    while has_more is True:
        querystring = {"limit": "100", "archived": "false", "offset": offset}
        try:
            response = requests.request(
                "GET", url, headers=header, params=querystring
            ).json()
            time.sleep(10)
        except (requests.exceptions.ConnectionError, json.decoder.JSONDecodeError) as j:
            logger.error(f"Error occurred: {j}.")
            break
        for x in range(len(response["contacts"])):
            contacts.append(response["contacts"][x])
    contacts = json_normalize(contacts)
    merged = pd.concat([df, contacts])
    return merged

After checking the edited question, here is a possible approach. The code under test could be the following:
def get_paged_contacts(endpoint: str,
                       query_string: typing.Dict[str, typing.Any],
                       df: pd.DataFrame) -> pd.DataFrame:
    """
    Return the results of the get request.

    Loops over the api response and appends the results of a while loop
    for pagination, then merges the results with the previously
    extracted dataframe.
    """
    url = endpoint
    contacts = []
    response = requests.request("GET", url,
                                headers=header,
                                params=query_string).json()
    has_more = response["has-more"]
    offset = response["vid-offset"]
    # Get the contacts coming from the first response
    contacts.extend(response['contacts'])
    while has_more:
        querystring = {"limit": "100",
                       "archived": "false", "offset": offset}
        try:
            response = requests.request("GET", url,
                                        headers=header,
                                        params=querystring).json()
            # Update the looping condition and the offset in every response
            has_more = response["has-more"]
            offset = response["vid-offset"]
            contacts.extend(response['contacts'])
            time.sleep(10)
        except (requests.exceptions.ConnectionError, json.decoder.JSONDecodeError) as j:
            logger.error(f"Error occurred: {j}.")
            break
    contacts = pd.json_normalize(contacts)
    merged = pd.concat([df, contacts])
    # Reset the dataframe index after concatenating
    merged.reset_index(drop=True, inplace=True)
    return merged
It can be refactored by having all requests inside the while loop, to avoid duplication, but it is not clear how you want to handle the query_string parameter, so I left it as it is. Then, the test code could be something like this:
@patch('my_project.task.extractor.requests.request')
def test_get_paged_contacts(request_mock):
    request_mock.return_value.json.side_effect = [
        PAGED_CONTACT_RECORD,
        PAGED_CONTACT_RECORD,
        PAGED_CONTACT_RECORD,
        CONTACT_RECORD,
    ]
    expected_df = pd.DataFrame(EXPECTED_CONTACT_RECORD)
    input_df = pd.DataFrame()
    res = get_paged_contacts('dummy_endpoint', None, input_df)
    assert request_mock.call_count == 4
    assert_frame_equal(res, expected_df)
The assert_frame_equal function is a utility provided by pandas to check two dataframes for equality, and it is particularly useful for unit testing with pandas dataframes; see the pandas.testing documentation for details. Of course, you need to import it with from pandas.testing import assert_frame_equal.
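As a quick illustration of how it behaves, here is a minimal, self-contained sketch (the toy frames below are made up for illustration, not your actual records):

import pandas as pd
from pandas.testing import assert_frame_equal

expected = pd.DataFrame({"vid": [1, 2], "email": ["a@x.com", "b@x.com"]})
actual = pd.DataFrame({"vid": [1, 2], "email": ["a@x.com", "b@x.com"]})

# Passes silently when the frames match; raises an AssertionError with a
# detailed diff otherwise. check_dtype=False relaxes the dtype comparison,
# which can help when json_normalize infers slightly different dtypes.
assert_frame_equal(actual, expected, check_dtype=False)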

Related

How to paginate an API in Python

I'm using the API to get all the data, so I need to paginate and loop through the pages to get everything I want. Is there any way to extract this automatically?
Alternatively, should I somehow use a while loop to get all this data? What is the best way? Any thoughts?
import json
import os
import requests
from http import HTTPStatus

client_id = ""
client_secret = ""

os.environ["DX_GATEWAY"] = "http://api.com"
os.environ["DX_CLIENT_ID"] = client_id
os.environ["DX_CLIENT_SECRET"] = client_secret

dx_request = requests.Request()

path = "/path/to/api"
params = {
    "Type": "abc",
    "Id": "def",
    "limit": 999,
    "Category": "abc"
}
params_str = "&".join([f"{k}={v}" for k, v in params.items()])
url = "?".join([path, params_str])

vulns = dx_request.get(  # also tried dx_request.args.get(
    url,
    version=1,
)
if vulns.status_code != int(HTTPStatus.OK):
    raise RuntimeError("API call did not return expected response: " + str(vulns))

response_data = vulns.json()
print(json.dumps(response_data))
Why not paginate with a simple loop using the Requests library that you already have loaded in the code?
e.g.
import requests

url = 'https://api.example.com/items'
params = {'limit': 100, 'offset': 0}  # set initial parameters for first page of results

while True:
    response = requests.get(url, params=params)
    data = response.json()
    items = data['items']
    # do something with items here...
    if len(items) < 100:
        break  # if fewer than 100 items were returned, we've reached the last page
    params['offset'] += 100  # increment offset to retrieve the next page of results
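The same pattern can also be wrapped in a generator so callers just iterate over items. This is only a sketch under assumptions: the parameter names (limit, offset, items) mirror the example above rather than your real API, and authentication is omitted:

import requests

def paged_items(url, page_size=100, **extra_params):
    """Yield items page by page until a short page signals the last one."""
    offset = 0
    while True:
        params = {**extra_params, "limit": page_size, "offset": offset}
        resp = requests.get(url, params=params)
        resp.raise_for_status()  # fail loudly on non-2xx responses
        items = resp.json().get("items", [])
        yield from items
        if len(items) < page_size:
            break  # a short page means there is no next page
        offset += page_size

# usage: collect every item into one list
# all_items = list(paged_items("https://api.example.com/items", Type="abc"))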

I need the best way to transform a JSON response in Python

I'm trying to get a dataframe from an API response.
For optimization I run parallel threads, but the runtime is still really high.
A code example:
def parall_func(tuple):
    output = pd.DataFrame()
    list_caracts = list(map(str, tuple[2]))
    item = [(tuple[1])]
    q = len(list_caracts)
    headers = {
        'Content-Type': 'application/json'
    }
    raw_data = json.dumps(
        {"item": item, "list_caracts": list_caracts, "sizePage": q, "numberPage": 1}
    )
    try:
        url = "https://thisisaurl.com/rep/store"
        response = requests.get(url, headers=headers, data=raw_data)
        resp_to_json = json.loads(response.text)
        for i in resp_to_json['tag']:
            output = output.append([i])
    except:
        print("Error: ", sys.exc_info()[0])
        raise
    return output

pool = Threads(cpu_count())
df_parall = list(pool.imap(parall_func, df_queries.itertuples(name=None)))
pool.close()

Final = pd.concat(df_parall, ignore_index=True)
Can you help me correct this, or suggest another logic or structure different from pandas?
The final response has about 3 million records.
Once I get the structure I need, I do some calculations and then connect to a database with pyodbc to save the data.
The two things I would try are:
Create a requests.Session instance and use that to issue your GET requests (a short sketch of this on its own follows these two points). According to the documentation for this:
The Session object allows you to persist certain parameters across requests. It also persists cookies across all requests made from the Session instance, and will use urllib3’s connection pooling. So if you’re making several requests to the same host, the underlying TCP connection will be reused, which can result in a significant performance increase (see HTTP persistent connection).
Since you are using multithreading, limiting yourself to a number of threads equal to the number of cores will limit performance, because the threads spend most of their time waiting on network I/O. Try creating 500 threads. The only concern is whether the website will complain that too many requests per second are being made.
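A minimal sketch of the Session idea in isolation (the URL and header are the placeholders from your code; the full example below uses the same approach):

import requests

with requests.Session() as session:
    session.headers = {"Content-Type": "application/json"}
    # All calls through this session share headers and reuse the same
    # underlying connection pool.
    for page in (1, 2, 3):
        resp = session.get("https://thisisaurl.com/rep/store",
                           params={"numberPage": page})
        resp.raise_for_status()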
By the way, your source had an indentation error. I have supplied the missing import statements as I suppose they should be, and I have renamed the argument tuple to tpl, since tuple is a built-in type and you should not redefine built-in names without a good reason.
from multiprocessing.pool import ThreadPool as Threads
from requests import Session
from functools import partial
import json
import pandas as pd
import sys

def parall_func(session, tpl):
    output = pd.DataFrame()
    list_caracts = list(map(str, tpl[2]))
    item = [(tpl[1])]
    q = len(list_caracts)
    raw_data = json.dumps(
        {"item": item, "list_caracts": list_caracts, "sizePage": q, "numberPage": 1}
    )
    try:
        url = "https://thisisaurl.com/rep/store"
        response = session.get(url, data=raw_data)
        resp_to_json = json.loads(response.text)
        for i in resp_to_json['tag']:
            output = output.append([i])
    except:
        print("Error: ", sys.exc_info()[0])
        raise
    return output

with Session() as session:
    headers = {
        'Content-Type': 'application/json'
    }
    session.headers = headers
    pool = Threads(500)
    df_parall = list(pool.imap(partial(parall_func, session), df_queries.itertuples(name=None)))
    pool.close()

Final = pd.concat(df_parall, ignore_index=True)
Update
One additional thing you can try is to replace building the output variable with multiple append operations by a single concat:
def parall_func(session, tpl):
    list_caracts = list(map(str, tpl[2]))
    item = [(tpl[1])]
    q = len(list_caracts)
    raw_data = json.dumps(
        {"item": item, "list_caracts": list_caracts, "sizePage": q, "numberPage": 1}
    )
    try:
        url = "https://thisisaurl.com/rep/store"
        response = session.get(url, data=raw_data)
        resp_to_json = json.loads(response.text)
        dataframes = [pd.DataFrame([i]) for i in resp_to_json['tag']]
        output = pd.concat(dataframes)
    except:
        print("Error: ", sys.exc_info()[0])
        raise
    return output
If the above doesn't improve performance, one last thing to try is to have the creation of the dataframes done using multiprocessing:
from multiprocessing.pool import ThreadPool as Threads, Pool as MultiProcessingPool
from requests import Session
from functools import partial
import json
import pandas as pd
import sys

def create_data_frames(response):
    resp_to_json = json.loads(response.text)
    dataframes = [pd.DataFrame([i]) for i in resp_to_json['tag']]
    # Perhaps you might want to specify ignore_index=True on the following:
    output = pd.concat(dataframes)
    return output

def parall_func(session, multiprocessing_pool, tpl):
    list_caracts = list(map(str, tpl[2]))
    item = [(tpl[1])]
    q = len(list_caracts)
    raw_data = json.dumps(
        {"item": item, "list_caracts": list_caracts, "sizePage": q, "numberPage": 1}
    )
    try:
        url = "https://thisisaurl.com/rep/store"
        response = session.get(url, data=raw_data)
        output = multiprocessing_pool.apply(create_data_frames, args=(response,))
    except:
        print("Error: ", sys.exc_info()[0])
        raise
    return output

with Session() as session:
    headers = {
        'Content-Type': 'application/json'
    }
    session.headers = headers
    multiprocessing_pool = MultiProcessingPool()
    pool = Threads(500)
    df_parall = list(pool.imap(partial(parall_func, session, multiprocessing_pool), df_queries.itertuples(name=None)))
    multiprocessing_pool.close()
    multiprocessing_pool.join()
    pool.close()
    pool.join()

Final = pd.concat(df_parall, ignore_index=True)

Scraping pricing off a search bar - reached server limit

With help on Stack Overflow, I was able to come up with this scraper. The code returns a list of part numbers and their corresponding prices:

part1 price1
part2 price2
...
...
partn pricen

However, the website seems to only allow 200 requests. When I raise the limit above 200, I get the error: "raise JSONDecodeError("Expecting value", s, err.value) from None JSONDecodeError: Expecting value".
I just want to know if there's a way to avoid this error. If not, I can raise start: 0 by 200 each time, but since I easily have 100k+ items that won't be very efficient. Is there a way I can loop over the limit and the start parameters?
Please see the code below, any help appreciated!
import requests
# import pprint  # to format data on screen `pprint.pprint()`
import pandas as pd

# --- functions ---

def get_data(query):
    """Get data from server"""
    payload = {
        # "facets": [{
        #     "name": "OEM",
        #     "value": "GE%20Healthcare"
        # }],
        "facets": [],
        "facilityId": 38451,
        "id_ins": "a2a3d332-73a7-4194-ad87-fe7412388916",
        "limit": 200,
        "query": query,
        "referer": "/catalog/Service",
        "start": 0,
        # "urlParams": [{
        #     "name": "OEM",
        #     "value": "GE Healthcare"
        # }],
        "urlParams": []
    }
    r = requests.post('https://prodasf-vip.partsfinder.com/Orion/CatalogService/api/v1/search', json=payload)
    data = r.json()
    return data

all_queries = ['GE Healthcare']

for query in all_queries:
    # print('\n--- QUERY:', query, '---\n')
    data = get_data(query)
    Part_Num = []
    Vendor_Item_Num = []
    price = []
    for item in data['products']:
        if not item['options']:
            Part_Num.append([])
            Vendor_Item_Num.append([])
            price.append([])
        else:
            all_prices = [option['price'] for option in item['options']]
            all_vendor = [option['price'] for option in item['options']]
            all_part_num = item['partNumber']
            Part_Num.append(all_part_num)
            Vendor_Item_Num.append(all_vendor)
            price.append(all_prices)
    list_of_dataframes = [pd.DataFrame(Part_Num), pd.DataFrame(price)]
    pd.concat(list_of_dataframes, axis=1).to_csv(r'C:\Users\212677036\Documents\output7.csv')
You should always check the status_code to confirm that your request was successful. The API returns HTTP 500 when limit is greater than 200 (see the list of HTTP status codes). You need to study the documentation of the API: many APIs limit requests per second and maximum request size so they can maintain a reliable service.
The json() method will fail if the HTTP request was not successful.
You can get the data in batches. In the sample code below I stop early because I don't want to stay in the loop for 500+ iterations. You could also consider using threading so it's not so sequential; a sketch of that follows the sequential example below.
import requests
import pandas as pd

query = 'GE Healthcare'
payload = {
    "facets": [],
    "facilityId": 38451,
    "id_ins": "a2a3d332-73a7-4194-ad87-fe7412388916",
    "limit": 200,
    "query": query,
    "referer": "/catalog/Service",
    "start": 0,
    "urlParams": []
}

r = requests.post('https://prodasf-vip.partsfinder.com/Orion/CatalogService/api/v1/search', json=payload)
if r.status_code == 200:
    js = r.json()
    df = pd.json_normalize(js["products"])
    while len(df) < js["totalResults"] and len(df) < 2000:
        payload["start"] += 200
        r = requests.post('https://prodasf-vip.partsfinder.com/Orion/CatalogService/api/v1/search', json=payload)
        if r.status_code == 200:
            df = pd.concat([df, pd.json_normalize(r.json()["products"])])
        else:
            break

    print(f"want: {js['totalResults']} got: {len(df)}")
    df
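As mentioned above, the paging could also be parallelised. Here is a hedged sketch under the same assumptions as the sequential example (same endpoint and payload, capped at 2000 rows); it is untested against the real API and the worker count is illustrative:

from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import requests

URL = 'https://prodasf-vip.partsfinder.com/Orion/CatalogService/api/v1/search'

def fetch_page(start):
    body = dict(payload, start=start)  # payload as defined above
    r = requests.post(URL, json=body)
    r.raise_for_status()  # raise instead of silently continuing on errors
    return pd.json_normalize(r.json()["products"])

first = requests.post(URL, json=payload)
first.raise_for_status()
total = first.json()["totalResults"]

# Fetch the remaining pages concurrently (capped like the sequential example).
starts = range(200, min(total, 2000), 200)
with ThreadPoolExecutor(max_workers=8) as executor:
    pages = list(executor.map(fetch_page, starts))

df = pd.concat([pd.json_normalize(first.json()["products"]), *pages],
               ignore_index=True)
print(f"want: {total} got: {len(df)}")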

Why is my JSON output so small?

This output should be way longer than it is here.
I start with a GET request, parse a JSON list and extract the id, which I then use to call the second function; that gives me a second ID, which I use to call the third function. But I am only getting one entry, whereas I should be getting way more entries.
The code is the following:
from requests.auth import HTTPBasicAuth
import requests
import json
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def countries():
    data = requests.get("https://localhost:8543/api/netim/v1/countries/", verify=False, auth=HTTPBasicAuth("admin", "admin"))
    rep = data.json()
    return [elem.get("id", "") for elem in rep['items']]

def regions():
    for c in countries():
        url = requests.get("https://localhost:8543/api/netim/v1/countries/{}/regions".format(c), verify=False, auth=HTTPBasicAuth("admin", "admin"))
        response = url.json()
        return [cid.get("id", "") for cid in response['items']]

def city():
    for r in regions():
        api = requests.get("https://localhost:8543/api/netim/v1/regions/{}/cities".format(r), verify=False, auth=HTTPBasicAuth("admin", "admin"))
        resolt = api.json()
        return json.dumps([{"name": r.get("name", ""), "id": r.get("id", "")} for r in resolt['items']], indent=4)

city()
print(city())
The output is the following:

[
    {
        "name": "Herat",
        "id": "AF~HER~Herat"
    }
]

I should have a huge list, so I am not sure what I am missing.
You need to go through all the iterations of your loop and collect the results, then jsonify them and return them.
data = []
for r in regions():
    api = requests.get("https://localhost:8543/api/netim/v1/regions/{}/cities".format(r), verify=False, auth=HTTPBasicAuth("admin", "admin"))
    resolt = api.json()
    data.extend([{"name": r.get("name", ""), "id": r.get("id", "")} for r in resolt['items']])
return json.dumps(data, indent=4)
This would be a fix for city(), but you have the same problem in all your functions. return immediately exits the function and does not do anything else; effectively, all your for loops are doing only one iteration.
I'll update my example here to give you a better idea of what's occurring.
Your functions are basically this:
def test_fn():
    for i in [1, 2, 3, 4]:
        return i

# output:
1
# We never see 2 or 3 or 4 because we return before looping on them.

What you want:

def test_fn():
    results = []
    for i in [1, 2, 3, 4]:
        results.append(i)
    return results

# output
[1, 2, 3, 4]
It seems like you understand that the for loop is going to take some action once for each element in the list. What you're not understanding is that return ends the function NOW: no more for loop, no more actions. In your code you return inside the for loop, stopping any further action.
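To make the "same problem in all your functions" point concrete, here is a sketch of regions() rewritten to collect results from every country before returning (based on the code in the question; city() needs the equivalent change):

def regions():
    ids = []
    for c in countries():
        url = requests.get(
            "https://localhost:8543/api/netim/v1/countries/{}/regions".format(c),
            verify=False, auth=HTTPBasicAuth("admin", "admin"))
        response = url.json()
        # Accumulate ids from every country instead of returning on the first one
        ids.extend(cid.get("id", "") for cid in response['items'])
    return ids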

How to fake my response in pytest with requests-mock

My function that I am trying to test returns a list of strings:

def listForumsIds(url):
    response = requests.get(url)
    forums = response.json().get('forums')
    forumsIds = [forum['documentId'] for forum in forums]
    # return like: ['id1', 'id2', 'id3', ...]
    return forumsIds
My test function:
@requests_mock.mock()
def test_forms(self, m):
    # I also used json='response'
    m.get('valid url', text="response", status_code=200)
    resp = listForumsIds('valid url')
    # ERROR !!!!
    assert resp == "response"
I am getting errors like json.decoder.JSONDecodeError or 'str' object has no attribute 'get'.
How do I fake my response so that it matches the return value of my function?
You have to pass the desired payload in the json field of the mocked response. Example, adapted to your code:
import unittest

import requests_mock

# listForumsIds is imported from your own module

class MyTests(unittest.TestCase):
    @requests_mock.mock()
    def test_forms(self, m):
        payload = {"forums": [{"documentId": "id1"}]}
        m.register_uri("GET", "https://www.example.com", json=payload)
        ids = listForumsIds('https://www.example.com')
        assert ids == ['id1']
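Equivalently, requests_mock's shorthand methods accept the json payload directly, so the register_uri call can be written as a one-liner (a sketch using the same placeholder URL and an illustrative test name):

# Inside the same MyTests(unittest.TestCase) class as above:
@requests_mock.mock()
def test_forms_shorthand(self, m):
    # json= makes the mocked response's .json() return this payload
    m.get("https://www.example.com", json={"forums": [{"documentId": "id1"}]})
    assert listForumsIds("https://www.example.com") == ["id1"]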
