I'm trying to build a dataframe from an API response.
To speed things up I run parallel threads, but the runtime is still really high.
A code example:
def parall_func(tuple):
    output = pd.DataFrame()
    list_caracts = list(map(str,tuple[2]))
    item = [(tuple[1])]
    q = len(list_caracts)
    headers = {
        'Content-Type':'application/json'
    }
    raw_data = json.dumps(
        {"item": item,"list_caracts": list_caracts, "sizePage":q, "numberPage":1}
    )
    try:
        url = "https://thisisaurl.com/rep/store"
        response = requests.get(url,headers=headers,data=raw_data)
        resp_to_json = json.loads(response.text)
        for i in resp_to_json['tag']:
            output = output.append([i])
    except:
        print("Error: ", sys.exc_info()[0])
        raise
    return output
pool = Threads(cpu_count())
df_parall=list(pool.imap(parall_func, df_queries.itertuples(name=None)))
pool.close()
Final=pd.concat(df_parall, ignore_index=True)
Can you help me correct this, or suggest different logic or a structure other than pandas?
The final response has about 3 million records.
Once I have the structure I need, I do some calculations and then connect to a database with pyodbc to save the data.
The two things I would try are:
Create a requests.Session instance and use that to issue your GET requests. According to the documentation for this:
The Session object allows you to persist certain parameters across requests. It also persists cookies across all requests made from the Session instance, and will use urllib3’s connection pooling. So if you’re making several requests to the same host, the underlying TCP connection will be reused, which can result in a significant performance increase (see HTTP persistent connection).
Since you are using multithreading, limiting yourself to a number of threads equal to the number of cores you have will result in underperformance. Try creating 500 threads. The only issue is whether the website will complain that too many requests per second are being made.
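If the website does push back, a crude way to cap the request rate across all threads is a shared limiter that enforces a minimum interval between calls. This is only a rough sketch; the 50-requests-per-second figure below is purely illustrative:

import threading
import time

class RateLimiter:
    """Crude limiter: enforce a minimum interval between requests across all threads."""
    def __init__(self, min_interval):
        self.lock = threading.Lock()
        self.min_interval = min_interval
        self.last_call = 0.0

    def wait(self):
        with self.lock:
            elapsed = time.monotonic() - self.last_call
            if elapsed < self.min_interval:
                time.sleep(self.min_interval - elapsed)
            self.last_call = time.monotonic()

limiter = RateLimiter(min_interval=0.02)  # at most ~50 requests per second (illustrative)

# Inside the worker function, call limiter.wait() just before session.get(...)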
By the way, your source had an indentation error. I have supplied the missing import statements as I suppose they should be, and I have renamed the argument tuple to tpl since tuple is a built-in type and you should not shadow built-in names without a good reason.
from multiprocessing.pool import ThreadPool as Threads
from requests import Session
from functools import partial
import pandas as pd
import json
import sys

def parall_func(session, tpl):
    output = pd.DataFrame()
    list_caracts = list(map(str,tpl[2]))
    item = [(tpl[1])]
    q = len(list_caracts)
    raw_data = json.dumps(
        {"item": item,"list_caracts": list_caracts, "sizePage":q, "numberPage":1}
    )
    try:
        url = "https://thisisaurl.com/rep/store"
        response = session.get(url, data=raw_data)
        resp_to_json = json.loads(response.text)
        for i in resp_to_json['tag']:
            output = output.append([i])
    except:
        print("Error: ", sys.exc_info()[0])
        raise
    return output

with Session() as session:
    headers = {
        'Content-Type':'application/json'
    }
    session.headers = headers
    pool = Threads(500)
    df_parall = list(pool.imap(partial(parall_func, session), df_queries.itertuples(name=None)))
    pool.close()
    Final = pd.concat(df_parall, ignore_index=True)
Update
One additional thing you can try is to replace building the output variable through repeated append operations with a single concat:
def parall_func(session, tpl):
    list_caracts = list(map(str,tpl[2]))
    item = [(tpl[1])]
    q = len(list_caracts)
    raw_data = json.dumps(
        {"item": item,"list_caracts": list_caracts, "sizePage":q, "numberPage":1}
    )
    try:
        url = "https://thisisaurl.com/rep/store"
        response = session.get(url, data=raw_data)
        resp_to_json = json.loads(response.text)
        dataframes = [pd.DataFrame([i]) for i in resp_to_json['tag']]
        output = pd.concat(dataframes)
    except:
        print("Error: ", sys.exc_info()[0])
        raise
    return output
If the above doesn't improve performance, one last thing to try is to have the creation of the dataframes done using multiprocessing:
from multiprocessing.pool import ThreadPool as Threads, Pool as MultiProcessingPool
from requests import Session
from functools import partial
import pandas as pd
import json
import sys

def create_data_frames(response):
    resp_to_json = json.loads(response.text)
    dataframes = [pd.DataFrame([i]) for i in resp_to_json['tag']]
    # Perhaps you might want to specify ignore_index=True on the following:
    output = pd.concat(dataframes)
    return output

def parall_func(session, multiprocessing_pool, tpl):
    list_caracts = list(map(str,tpl[2]))
    item = [(tpl[1])]
    q = len(list_caracts)
    raw_data = json.dumps(
        {"item": item,"list_caracts": list_caracts, "sizePage":q, "numberPage":1}
    )
    try:
        url = "https://thisisaurl.com/rep/store"
        response = session.get(url, data=raw_data)
        output = multiprocessing_pool.apply(create_data_frames, args=(response,))
    except:
        print("Error: ", sys.exc_info()[0])
        raise
    return output

with Session() as session:
    headers = {
        'Content-Type':'application/json'
    }
    session.headers = headers
    multiprocessing_pool = MultiProcessingPool()
    pool = Threads(500)
    df_parall = list(pool.imap(partial(parall_func, session, multiprocessing_pool), df_queries.itertuples(name=None)))
    multiprocessing_pool.close()
    multiprocessing_pool.join()
    pool.close()
    pool.join()
    Final = pd.concat(df_parall, ignore_index=True)
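One more design note, offered as a sketch rather than a drop-in change: if resp_to_json['tag'] is a flat list of dicts (an assumption about your API's response shape), each page's dataframe can be built in a single call instead of one tiny dataframe per record:

import json
import pandas as pd

def create_data_frames(response):
    # Assumes resp_to_json['tag'] is a list of flat dicts
    records = json.loads(response.text)['tag']
    # Build the whole page in one constructor call
    return pd.DataFrame.from_records(records)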
Related
I have an extractor that works great, but writing the test is stumping me and it's failing. Can anyone help me figure out where I'm going wrong?
from unittest.mock import MagicMock, patch

import pandas as pd
import requests

from my_project.task import extractor
from my_project.tests import utils
from prefect.logging import disable_run_logger

CONTACT_RECORD = utils.TEST_CONTACT_RECORD
PAGED_CONTACT_RECORD = utils.TEST_PAGED_CONTACT_RECORD
EXPECTED_CONTACT_RECORD = utils.EXPECTED_CONTACT_RECORD

@patch("requests.get")
def test_contact_extractor(get: MagicMock):
    """
    Should call "requests.get" once and return a json
    containing contact data.
    """
    get.return_value.json.return_value = CONTACT_RECORD
    with disable_run_logger():
        result = extractor.get_contacts()
    assert get.call_count == 1
    assert result == pd.DataFrame(EXPECTED_CONTACT_RECORD)

@patch("my_project.extractor.get_contacts")
def test_get_paged_contacts(get_contacts: MagicMock):
    """
    Should run "requests.get" until ['has-more'] is False
    and there is no offset value.
    """
    get_contacts.return_value.json.side_effect = [
        PAGED_CONTACT_RECORD,
        PAGED_CONTACT_RECORD,
        PAGED_CONTACT_RECORD,
        CONTACT_RECORD,
    ]
    with disable_run_logger():
        data = extractor.get_paged_contacts(
            endpoint=MagicMock, query_string=MagicMock, df=MagicMock
        )
    assert get_contacts.call_count == 4
    assert data == pd.DataFrame(EXPECTED_CONTACT_RECORD)
Some errors I'm getting are:
requests imported but not used
callable[[Union[str,bytes],....], Response] has no attribute "return_value"
EDIT:
No longer getting the second error because I realized I had a typo, but currently getting:
AttributeError: 'NoneType' object has no attribute 'client'
Edit:
Here is my get_paged_contacts() function:
def get_paged_contacts(
    endpoint: str, query_string: typing.Dict[str, typing.Any], df: pd.DataFrame
) -> pd.DataFrame:
    """
    Return the results of the get request.
    Loops over api response and appends the results of a while loop for pagination, then
    merges the results with the previously extracted dataframe.
    """
    url = endpoint
    contacts = []
    response = requests.request("GET", url, headers=header, params=query_string).json()
    has_more = response["has-more"]
    offset = response["vid-offset"]
    while has_more is True:
        querystring = {"limit": "100", "archived": "false", "offset": offset}
        try:
            response = requests.request(
                "GET", url, headers=header, params=querystring
            ).json()
            time.sleep(10)
        except (requests.exceptions.ConnectionError, json.decoder.JSONDecodeError) as j:
            logger.error(f"Error occurred: {j}.")
            break
        for x in range(len(response["contacts"])):
            contacts.append(response["contacts"][x])
    contacts = json_normalize(contacts)
    merged = pd.concat([df, contacts])
    return merged
After checking the edited question, here is a possible approach. The code under test could be the following:
def get_paged_contacts(endpoint: str,
                       query_string: typing.Dict[str, typing.Any],
                       df: pd.DataFrame) -> pd.DataFrame:
    """
    Return the results of the get request.
    Loops over api response and appends the results of a while loop
    for pagination, then merges the results with the previously
    extracted dataframe.
    """
    url = endpoint
    contacts = []
    response = requests.request("GET", url,
                                headers=header,
                                params=query_string).json()
    has_more = response["has-more"]
    offset = response["vid-offset"]
    # Get the contacts coming from the first response
    contacts.extend(response['contacts'])
    while has_more:
        querystring = {"limit": "100",
                       "archived": "false", "offset": offset}
        try:
            response = requests.request("GET", url,
                                        headers=header,
                                        params=querystring).json()
            # Update the looping condition in every response
            has_more = response["has-more"]
            contacts.extend(response['contacts'])
            time.sleep(10)
        except (requests.exceptions.ConnectionError, json.decoder.JSONDecodeError) as j:
            logger.error(f"Error occurred: {j}.")
            break
    contacts = pd.json_normalize(contacts)
    merged = pd.concat([df, contacts])
    # Reset the dataframe index after concatenating
    merged.reset_index(drop=True, inplace=True)
    return merged
It can be refactored by having all requests inside the while loop, to avoid duplication, but it is not clear how you want to handle the query_string parameter, so I left it as it is. Then, the test code could be something like this:
@patch('my_project.task.extractor.requests.request')
def test_get_paged_contacts(request_mock):
    request_mock.return_value.json.side_effect = [
        PAGED_CONTACT_RECORD,
        PAGED_CONTACT_RECORD,
        PAGED_CONTACT_RECORD,
        CONTACT_RECORD,
    ]
    expected_df = pd.DataFrame(EXPECTED_CONTACT_RECORD)
    input_df = pd.DataFrame()
    res = get_paged_contacts('dummy_endpoint', None, input_df)
    assert request_mock.call_count == 4
    assert_frame_equal(res, expected_df)
The assert_frame_equal function is a utility provided by pandas to check two dataframes for equality. It is particularly useful for unit testing with pandas dataframes. You can check it in the pandas documentation. Of course, you need to import it with from pandas.testing import assert_frame_equal
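As a small illustrative aside (not part of the original test): assert_frame_equal also takes keyword options such as check_dtype, which helps when values parsed from JSON come back with different dtypes than the expected frame:

from pandas.testing import assert_frame_equal
import pandas as pd

left = pd.DataFrame({"vid": [1, 2]})
right = pd.DataFrame({"vid": [1.0, 2.0]})
# Ignore the int64 vs float64 difference that often appears after a JSON round-trip
assert_frame_equal(left, right, check_dtype=False)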
I am making a Python script using the API of a free test automation website called TestProject.
Link to their API: https://api.testproject.io/docs/v2/
Basically what I want to do is grab the PDF reports of all tests and save them somewhere.
But to make the GET request for that I first need a projectID and jobID, for which I already wrote functions that fetch them and save them in lists.
But now I have a problem: it loops through both lists without matching the correct projectID to the correct jobID, and it throws errors because that combination does not exist.
So what I need is a way to know which jobIDs belong to which projectID, so that I can make a GET request to get all the executionIDs and then the PDF of the report.
I am kinda new to programming, so I would love any help I can get. If anyone has any better solutions please feel free to let me know.
My script:
import requests
import json
import csv
from datetime import datetime
from jsonpath_ng import jsonpath, parse

API_key = 'api_key'
headers = {'Authorization':'{}'.format(API_key)}

list_projectId = []
list_jobId = []
list_executionId = []

ParseData_projectId = parse('$..id')
ParseData_jobId = parse('$..id')
ParseData_executionId = parse('$..id')

def parsing(response, ParseData, list_data):
    # parses data and appends it to the list
    Data = json.loads(response)
    Parsaj = ParseData
    Podatki = Parsaj.find(Data)
    for i in range(0, len(Podatki)):
        vrednost = Podatki[i].value
        list_data.append(vrednost)

def projectId():
    # gets all projectId's and saves them in list_projectId
    url = 'https://api.testproject.io/v2/projects?_start=0'
    response = requests.get(url, headers=headers)
    response_json = response.json()
    converted = json.dumps(response_json)
    parsing(converted, ParseData_projectId, list_projectId)

def jobId():
    # gets all jobId's and saves them in list_jobId
    for i in range(0, len(list_projectId)):
        id = list_projectId[i]
        url = 'https://api.testproject.io/v2/projects/{}'.format(id) + '/jobs?onlyScheduled=false&_start=0'
        response = requests.get(url, headers=headers)
        response_json = response.json()
        converted = json.dumps(response_json)
        parsing(converted, ParseData_jobId, list_jobId)

def executionId():
    # Their API link:
    # https://api.testproject.io/v2/projects/{projectId}/jobs/{jobId}/reports?_start=0
    # the for loop below does not work; here is where i need the help:
    for i in range(0, len(list_projectId)):
        project_id = list_projectId[i]
        job_id = list_jobId[i]
        url = 'https://api.testproject.io/v2/projects/{}'.format(project_id) + '/jobs/{}'.format(job_id) + '/reports?_start=0'
        response = requests.get(url, headers=headers)
        response_json = response.json()
        converted = json.dumps(response_json)
        parsing(converted, ParseData_executionId, list_executionId)

projectId()
print("----------LIST PROJECT ID: ----------")
print(list_projectId)
print("")
jobId()
print("----------LIST JOB ID: ----------")
print(list_jobId)
executionId()
print("----------LIST EXECUTION ID: ----------")
print(list_executionId)
You have to use the 'in' operator to check whether a value exists in a list data structure.
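Beyond that membership check, one way to keep each jobID tied to the projectID it came from is to collect them into a dict while fetching. This is only a rough sketch built on your own parsing() helper and URL patterns; the jobs_by_project name is made up for illustration, and it replaces the list_jobId pairing logic:

# Hypothetical restructuring: map each projectId to the jobIds found for it,
# so executionId() only iterates over valid (project, job) pairs.
jobs_by_project = {}

def jobId():
    for project_id in list_projectId:
        url = ('https://api.testproject.io/v2/projects/{}'.format(project_id)
               + '/jobs?onlyScheduled=false&_start=0')
        response = requests.get(url, headers=headers)
        converted = json.dumps(response.json())
        found_jobs = []
        parsing(converted, ParseData_jobId, found_jobs)
        jobs_by_project[project_id] = found_jobs

def executionId():
    for project_id, job_ids in jobs_by_project.items():
        for job_id in job_ids:
            url = ('https://api.testproject.io/v2/projects/{}'.format(project_id)
                   + '/jobs/{}'.format(job_id) + '/reports?_start=0')
            response = requests.get(url, headers=headers)
            converted = json.dumps(response.json())
            parsing(converted, ParseData_executionId, list_executionId)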
The situation is that sometimes a request does not load or gets stuck in Python. If that happens, or any other error occurs, I would like to retry it n times, waiting up to a maximum of 3 seconds for each attempt; if the attempts run out, print a message like f"Could not process {type_1} and {type_2}". Everything runs in parallel with concurrent.futures. Could you help me with that?
import requests
import concurrent.futures
import json

data = [['PEN','USD'],['USD','EUR']]

def currency(element):
    type_1 = element[0]
    type_2 = element[1]
    s = requests.Session()
    url = f'https://usa.visa.com/cmsapi/fx/rates?amount=1&fee=0&utcConvertedDate=07%2F26%2F2022&exchangedate=07%2F26%2F2022&fromCurr={type_1}&toCurr={type_2}'
    a = s.get(url)
    response = json.loads(a.text)
    value = response["convertedAmount"]
    return value

with concurrent.futures.ProcessPoolExecutor() as executor:
    results = executor.map(currency, data)
    for value in results:
        print(value)
Your code is almost there. Here, I modified a few things:
from concurrent.futures import ThreadPoolExecutor
import time

import requests

def convert_currency(tup):
    from_currency, to_currency = tup
    url = (
        "https://usa.visa.com/cmsapi/fx/rates?amount=1&fee=0"
        "&utcConvertedDate=07%2F26%2F2022&exchangedate=07%2F26%2F2022&"
        f"fromCurr={from_currency}&toCurr={to_currency}"
    )
    session = requests.Session()
    for _ in range(3):
        try:
            response = session.get(url, timeout=3)
            if response.ok:
                return response.json()["convertedAmount"]
        except requests.exceptions.ConnectTimeout:
            time.sleep(3)
    return f"Could not process {from_currency} and {to_currency}"

data = [["VND", "XYZ"], ['PEN','USD'], ["ABC", "XYZ"], ['USD','EUR'], ["USD", "XXX"]]

with ThreadPoolExecutor() as executor:
    results = executor.map(convert_currency, data)
    for value in results:
        print(value)
Notes
I retry 3 times (see the for loop); an alternative using requests' built-in retry support is sketched after these notes
Use timeout= to specify the timeout (in seconds)
The .ok attribute will tell if the call was successful
No need to import json as the response object can JSON decode with the .json() method
You might experiment between ThreadPoolExecutor and ProcessPoolExecutor to see which one performs better
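If you would rather not hand-roll the retry loop, requests sessions can also retry at the transport level through urllib3's Retry. A minimal sketch, where the retry count, backoff, and status list are illustrative choices rather than anything from the question:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# Retry up to 3 times on connection errors and 5xx responses, with a short backoff
retries = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retries))

response = session.get(
    "https://usa.visa.com/cmsapi/fx/rates?amount=1&fee=0"
    "&utcConvertedDate=07%2F26%2F2022&exchangedate=07%2F26%2F2022"
    "&fromCurr=PEN&toCurr=USD",
    timeout=3,
)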
The code below is a sample from my complete program; I tried to make it understandable.
It sends requests to a REST API. It starts with a URL and the number of pages for this specific search, and tries to fetch the content of each page.
Each page has several results. Each result becomes a FinalObject.
Because there are as many API requests as there are pages, I decided to use multi-threading and the concurrent.futures module.
=> It works, but as I'm new to coding and Python, I still have these 2 questions:
How do I use ThreadPoolExecutor sequentially in this case? (See the sketch after the code below.)
Is there a better way to handle multi-threading in this case?
from concurrent.futures import ThreadPoolExecutor
from requests import get as re_get

def main_function(global_page_number, headers, url_request):
    # create a list of page numbers
    pages_numbers_list = [i for i in range(global_page_number)]
    # for each page, call the page_handler (multithreading)
    with ThreadPoolExecutor(max_workers=10) as executor:
        for item in pages_numbers_list:
            executor.submit(
                page_handler,
                item,
                url_request,
                headers
            )

def page_handler(page_number, url_request, headers):
    # we change the page number in the url request
    url_request = change_page(url_request, page_number)
    # new request with the new url
    result = re_get(url_request, headers=headers)
    result = result.json()
    # in the result, we find the list of dicts needed to create the
    # final objects
    final_object_creation(result['results_list'])

def change_page(url_request, new_page_number):
    "to increment the value of the 'page=' attribute in the url"
    current_nb_page = ''
    start_nb = url_request.find("page=") + len('page=')
    while 1:
        if start_nb < len(url_request) and url_request[start_nb].isdigit():
            current_nb_page += url_request[start_nb]
            start_nb += 1
        else:
            break
    new_url_request = url_request.replace("page=" + current_nb_page,
                                          "page=" + str(new_page_number))
    return new_url_request

def final_object_creation(results_list):
    'thanks to the object from requests.get(), it builds the final objects'
    global current_id_decision, dict_decisions
    # each item in the results list should become an instance of the final object
    for item in results_list:
        # set the identifier of the new Decision object
        current_id_decision += 1
        new_id = current_id_decision
        # create the Decision object and add it to the decisions dict
        dict_decisions[new_id] = FinalObject(item)

class FinalObject:
    def __init__(self, content):
        self.content = content

current_id_decision = 0
dict_decisions = {}

main_function(1000, "headers", "https://api/v1.0/search?page=0&query=test")
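As a hedged sketch of one reading of "using ThreadPoolExecutor sequentially": executor.map yields results in the same order as its input, so if page_handler were changed to return result['results_list'] instead of calling final_object_creation itself (an assumption, not the code above), the main thread could consume the pages in page order:

from concurrent.futures import ThreadPoolExecutor
from functools import partial

def main_function_ordered(global_page_number, headers, url_request):
    pages = range(global_page_number)
    with ThreadPoolExecutor(max_workers=10) as executor:
        # map() yields results in the same order as `pages`,
        # assuming page_handler now returns result['results_list']
        for results_list in executor.map(
                partial(page_handler, url_request=url_request, headers=headers),
                pages):
            final_object_creation(results_list)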
What would be the fastest way to load Device IDs from an Excel sheet that contains 800+ Device IDs and pass them to an HTTP GET request?
I'm fetching the Device IDs from the Excel sheet, making an HTTP GET request to get the relevant data, dumping it into a list and then saving it to an Excel file using:
if __name__ == '__main__':
    excel_file = openpyxl.load_workbook("D:\mypath\Book1.xlsx")
    active_sheet = excel_file.get_sheet_by_name("Sheet4")

    def iter_rows(active_sheet):
        for row in active_sheet.iter_rows():
            yield [cell.value for cell in row]

    res = iter_rows(active_sheet)
    keys = next(res)
    final_data_to_dump = []
    failed_data_dump = []
    for new in res:
        inventory_data = dict(zip(keys, new))
        if None in inventory_data.values():
            pass
        else:
            url_get_event = 'https://some_url&source={}'.format(inventory_data['DeviceID'])
            header_events = {
                'Authorization': 'Basic authkey_here'}
            print(inventory_data['DeviceID'])
            try:
                r3 = requests.get(url_get_event, headers=header_events)
                r3_json = json.loads(r3.content)
                if r3_json['events']:
                    for object in r3_json['events']:
                        dict_excel_data = {
                            "DeviceID":object['source']['id'],
                            "Device Name":object['source']['name'],
                            "Start 1":object['Start1'],
                            "Start 2":object['Start2'],
                            "Watering Mode":object['WateringMode'],
                            "Duration":object['ActuationDetails']['Duration'],
                            "Type":object['type'],
                            "Creation Time":object['creationTime']
                        }
                        final_data_to_dump.append(dict_excel_data)
                else:
                    no_dict_excel_data = {
                        "DeviceID":inventory_data["DeviceID"],
                        "Device Name":inventory_data["DeviceName"],
                        "Start 1":"",
                        "Start 2":"",
                        "Watering Mode":"",
                        "Duration":"",
                        "Type":"",
                        "Creation Time":""
                    }
                    final_data_to_dump.append(no_dict_excel_data)
            except requests.ConnectionError:
                failed_dict_excel_data = {
                    "DeviceID":inventory_data['DeviceID'],
                    "Device Name":inventory_data["DeviceName"],
                    "Status":"Connection Error"
                }
                failed_data_dump.append(failed_dict_excel_data)

    df = pd.DataFrame.from_dict(final_data_to_dump)
    df2 = pd.DataFrame.from_dict(failed_data_dump)
    df.to_excel('D:\mypath\ReportReceived_10Apr.xlsx', sheet_name='Sheet1', index=False)
    df2.to_excel('D:\mypath\Failed_ReportReceived_10Apr.xlsx', sheet_name='Sheet1', index=False)
But this can take upwards of 10-15 mins as there are 800+ devices in the Book1 sheet and it's likely to increase. How can I make this process faster?
You can use an async library, but the easiest solution here would be to do something like:
from concurrent.futures import ThreadPoolExecutor

def get(device_id):
    url_get_event = 'https://some_url&source={}'.format(device_id)
    return requests.get(url_get_event)

with ThreadPoolExecutor() as exc:
    responses = exc.map(get, device_ids)
If the other part of your code is small you may want to submit the functions to the executor and use as_completed to handle them in the main thread while waiting for other requests to run too.
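For example, a rough sketch of that submit/as_completed pattern, assuming device_ids has already been collected from the sheet as in the original loop, and reusing the question's auth header and JSON field names:

from concurrent.futures import ThreadPoolExecutor, as_completed

import requests

def get(device_id):
    # One GET per device; auth header taken from the original script
    url_get_event = 'https://some_url&source={}'.format(device_id)
    headers = {'Authorization': 'Basic authkey_here'}
    return device_id, requests.get(url_get_event, headers=headers)

final_data_to_dump = []
with ThreadPoolExecutor(max_workers=20) as exc:  # worker count is illustrative
    futures = [exc.submit(get, device_id) for device_id in device_ids]
    for future in as_completed(futures):
        device_id, response = future.result()
        # Build the per-device rows here, as in the original sequential loop
        for event in response.json().get('events', []):
            final_data_to_dump.append({"DeviceID": event['source']['id']})
            # ...remaining fields as in the original dict_excel_data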