My code works as I want, but it is very slow when it runs this line:

newdf['Login'] = newdf['Site'].apply(lambda x : "yes" if get(x).status_code == 200 else "no")

After commenting it out, the code runs fast.
How can I change this line so it still adds a "yes" or "no" to the Login column but stays fast?
And if I can improve the rest of the code, I would appreciate any suggestions.
I hope I made myself understood.
Thank you!
import pandas as pd
import requests
from requests import get
from requests.exceptions import HTTPError

lista = pd.read_csv('sites4.csv', sep=',')
df = pd.DataFrame(lista, columns=['Site', 'Login'])
newdf = df.assign(Site=df['Site'].map(str) + 'Login')
headers = {'Content-Type': 'application/json'}

for i in newdf['Site']:
    try:
        result = get(i, headers=headers, timeout=5)
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except Exception as err:
        print(f'Other error occurred: {err}')
    else:
        if 'application/json' in result.headers.get('Content-Type') or result.status_code == 406 or result.status_code == 403:
            newdf['Login'] = newdf['Site'].apply(lambda x : "yes" if get(x).status_code == 200 else "no")
            print(i + ' é Login')
            print(result)
This should be a much faster implementation of your functions:
from typing import Optional, Coroutine, List
import aiohttp
from pandas import DataFrame
from pandas.errors import EmptyDataError
import pandas as pd
import asyncio


def create_df_form_file() -> Optional[DataFrame]:
    try:
        site_list = pd.read_csv('sites4.csv', sep=',')
        df = pd.DataFrame(site_list, columns=['Site', 'Login'])
        return df.assign(Site=df['Site'].map(str) + 'Login')
    except EmptyDataError as e:
        print(f'File Error: {e}')
        return None


new_df: Optional[DataFrame] = create_df_form_file()
if not isinstance(new_df, DataFrame):
    print("empty data goodbye")
    exit(1)
# NOTE: async GET request; True when the response status is OK (2xx)
async def get_request(x_url: str) -> bool:
    async with aiohttp.ClientSession() as session:
        async with session.get(url=x_url) as result:
            return result.ok


# helpers to test whether a site needs a login, whether the response is JSON,
# and to map a URL to 'yes' or 'no'
def needs_login(result): return result.status == 406 or result.status == 403
def is_json(result): return result.headers.get('Content-Type') == 'application/json'
async def yes_no(x): return 'yes' if await get_request(x) else 'no'


async def _do_work(site_url: str, _headers: dict) -> str:
    async with aiohttp.ClientSession() as session:
        async with session.get(site_url, headers=_headers) as result:
            if is_json(result) or needs_login(result):
                print(site_url + ' é Login')
                print(result)
                # this site looks like a login endpoint, so record yes/no
                return await yes_no(site_url)
            return 'no'
def get_results():
    global new_df
    headers = {'Content-Type': 'application/json'}
    try:
        _coro: List[Coroutine] = [_do_work(site_url, _headers=headers) for site_url in new_df['Site']]
    except KeyError:
        print("please ensure your input file is accurate")
        exit(1)
    try:
        event_loop = asyncio.get_event_loop()
    except RuntimeError:
        asyncio.set_event_loop(asyncio.new_event_loop())
        event_loop = asyncio.get_event_loop()
    # noinspection PyUnboundLocalVariable
    results = event_loop.run_until_complete(asyncio.gather(*_coro))
    # write the yes/no answers back into the Login column
    new_df['Login'] = results
    print(results)


if __name__ == '__main__':
    get_results()
It is slow because get is a function from requests.
Since requests is not an asynchronous package, execution blocks until each request completes.
If you use asyncio and aiohttp, you can improve it.
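For example, here is a minimal sketch of checking every site concurrently with aiohttp and writing the result back into the Login column. It assumes the sites4.csv layout from the question; check_login and check_all are illustrative names, not part of either answer.

import asyncio
import aiohttp
import pandas as pd


async def check_login(session: aiohttp.ClientSession, url: str) -> str:
    # one GET per site; network errors or timeouts simply count as "no"
    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=5)) as resp:
            return 'yes' if resp.status == 200 else 'no'
    except (aiohttp.ClientError, asyncio.TimeoutError):
        return 'no'


async def check_all(urls):
    async with aiohttp.ClientSession() as session:
        # all requests run concurrently instead of one after another
        return await asyncio.gather(*(check_login(session, u) for u in urls))


newdf = pd.read_csv('sites4.csv', sep=',')
newdf['Site'] = newdf['Site'].map(str) + 'Login'
newdf['Login'] = asyncio.run(check_all(newdf['Site']))
print(newdf)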
Related
I want to know how I can add simple threading to my code. At the moment it checks the sites one by one, and if a site isn't reachable it waits for the timeout before continuing with the next one, which slows everything down.
import requests
import sys
import time
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

with open("websites.txt", 'r') as websites:
    websites = websites.read().splitlines()
with open("para1.txt", 'r') as para1:
    para1 = para1.read().splitlines()
with open("para2.txt", 'r') as para2:
    para2 = para2.read().splitlines()


def main():
    for i in para1:
        for j in para2:
            for m in websites:
                try:
                    res = requests.get(m + i + j, verify=False, timeout=10)
                    print(m + i + j)
                    if res.status_code == 200:
                        print('Yes')
                    else:
                        print('No')
                except Exception as e:
                    print(e)
                except KeyboardInterrupt:
                    sys.exit()
                finally:
                    res.close()
                    time.sleep(1)


if __name__ == '__main__':
    main()
You can use a ThreadPoolExecutor by moving the part of the code that performs the requests into a separate function and submitting it to the executor:
import urllib3
import requests
from concurrent.futures import ThreadPoolExecutor, CancelledError, as_completed

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def check_func(url):
    response = requests.get(url, verify=False, timeout=10)
    return response.status_code == 200


def main():
    with open("websites.txt") as website_f, open("para1.txt") as para1_f, \
            open("para2.txt", 'r') as para2_f, ThreadPoolExecutor(max_workers=4) as executor:
        # the inner loops run repeatedly, so the parameter files are read into lists once
        para1_list = para1_f.read().splitlines()
        para2_list = para2_f.read().splitlines()
        tasks = {}
        for website in website_f:
            for para1 in para1_list:
                for para2 in para2_list:
                    url = website.rstrip() + para1.rstrip() + para2.rstrip()
                    tasks[executor.submit(check_func, url)] = url

        for task in as_completed(tasks):
            url = tasks[task]
            try:
                result = task.result()
            except KeyboardInterrupt:  # handling Ctrl + C
                for task in tasks:
                    task.cancel()  # won't cancel already finished or running futures
            except CancelledError:  # will never happen (normally)
                pass
            except Exception as e:
                print(url, "-", "ERROR", e)
            else:
                print(url, "-", "GOOD" if result else "BAD")


if __name__ == "__main__":
    main()
P.S. I haven't tested the entire code, so if there are any problems with it, write in the comments.
I have X initial URLs that are paginated; in order to get the next set of data, I have to grab the next URL from the response header until there is no next URL. I am having trouble getting this working right. I'm trying a queue approach that I found here.
import asyncio
from aiohttp import ClientSession, TCPConnector


async def get(session, url):
    headers = {
        'Authorization': 'Bearer KEY',
    }
    async with session.get(url, headers=headers) as response:
        json = await response.json()
        return json, response


async def process(session, url, q):
    try:
        try:
            views, response = await get(session, url)
            scode = response.status
            if scode == 404:
                return
        except Exception as e:
            print(e)
            return
        try:
            await q.put(str(response.links["next"]["url"]))
        except:
            pass
        <do something with views>
    except Exception as e:
        print(e)


async def fetch_worker(session, q):
    while True:
        url = await q.get()
        try:
            await process(session, url, q)
        except Exception as e:
            print(e)
        finally:
            q.task_done()
async def d():
    <code to query and put data into stdrows>
    tasks = []
    url_queue = asyncio.Queue()
    connector = TCPConnector(limit=500)
    async with ClientSession(connector=connector) as session:
        url = '<some base url>'
        for i in range(500):
            tasks.append(asyncio.create_task(fetch_worker(session, url_queue)))
        for row in stdrows:
            await url_queue.put(url.format(row[1]))
        await asyncio.gather(*tasks)
        await url_queue.join()


asyncio.run(d())
This appears not to be running at 500 tasks/sec. Is it even possible to get to this rate without knowing all the URLs ahead of time? I am hoping to fetch the next URL from whatever initial URL (or from its paginated URL) while I work with views.
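For reference, here is a minimal sketch of the usual queue-based termination pattern, where a worker re-queues any "next" link it discovers so pages can be fetched without knowing every URL in advance. It is not the code above; fetch_page, worker, and the example URL are placeholder names.

import asyncio
from aiohttp import ClientSession


async def fetch_page(session: ClientSession, url: str, queue: asyncio.Queue):
    async with session.get(url) as response:
        data = await response.json()
        # if the server advertises another page, put it back on the queue
        next_link = response.links.get("next", {}).get("url")
        if next_link:
            await queue.put(str(next_link))
        return data


async def worker(session: ClientSession, queue: asyncio.Queue):
    while True:
        url = await queue.get()
        try:
            await fetch_page(session, url, queue)  # <process the returned data here>
        except Exception as e:
            print(e)
        finally:
            queue.task_done()


async def main(start_urls, n_workers=50):
    queue = asyncio.Queue()
    for url in start_urls:
        await queue.put(url)
    async with ClientSession() as session:
        workers = [asyncio.create_task(worker(session, queue)) for _ in range(n_workers)]
        await queue.join()  # waits until every queued URL, including re-queued ones, is done
        for w in workers:
            w.cancel()  # workers loop forever, so cancel them once the queue is drained

# asyncio.run(main(["https://example.com/api?page=1"]))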
I'm new to Python and I want this code to run only once and stop, not every 30 seconds,
because I want to run several scripts like this with different access tokens every 5 seconds from the command line.
When I tried this code, it never moves on to the second script because of the while True loop:
import requests
import time

api_url = "https://graph.facebook.com/v2.9/"
access_token = "access token"
graph_url = "site url"
post_data = {'id': graph_url, 'scrape': True, 'access_token': access_token}

# Beware of rate limiting if trying to increase frequency.
refresh_rate = 30  # refresh rate in seconds

while True:
    try:
        resp = requests.post(api_url, data=post_data)
        if resp.status_code == 200:
            contents = resp.json()
            print(contents['title'])
        else:
            error = "Warning: Status Code {}\n{}\n".format(
                resp.status_code, resp.content)
            print(error)
            raise RuntimeWarning(error)
    except Exception as e:
        f = open("open_graph_refresher.log", "a")
        f.write("{} : {}".format(type(e), e))
        f.close()
        print(e)
    time.sleep(refresh_rate)
From what I understood, you're trying to execute the same piece of code for multiple access tokens. To make your job simple, keep all your access tokens in a list and use the following code. It assumes that you know all your access tokens in advance.
import requests
import time


def scrape_facebook(api_url, access_token, graph_url):
    """Scrapes the given access token."""
    post_data = {'id': graph_url, 'scrape': True, 'access_token': access_token}
    try:
        resp = requests.post(api_url, data=post_data)
        if resp.status_code == 200:
            contents = resp.json()
            print(contents['title'])
        else:
            error = "Warning: Status Code {}\n{}\n".format(
                resp.status_code, resp.content)
            print(error)
            raise RuntimeWarning(error)
    except Exception as e:
        f = open(access_token + "_" + "open_graph_refresher.log", "a")
        f.write("{} : {}".format(type(e), e))
        f.close()
        print(e)


access_token = ['a', 'b', 'c']
graph_url = ['sss', 'xxx', 'ppp']
api_url = "https://graph.facebook.com/v2.9/"

for n in range(len(graph_url)):
    scrape_facebook(api_url, access_token[n], graph_url[n])
    time.sleep(5)
I am writing a simple producer/consumer app to call multiple URLs asynchronously.
In the following code, if I set conn_count=1 and add 2 items to the queue it works fine, as only one consumer is created. But if I set conn_count=2 and add 4 items to the queue, only 3 requests are made. The other request fails with ClientConnectorError.
Can you please help me debug the reason for the failure with multiple consumers? Thank you.
I am using an echo server I created.
Server:
import os
import logging.config

import yaml
from aiohttp import web
import json


def start():
    setup_logging()
    app = web.Application()
    app.router.add_get('/', do_get)
    app.router.add_post('/', do_post)
    web.run_app(app)


async def do_get(request):
    return web.Response(text='hello')


async def do_post(request):
    data = await request.json()
    return web.Response(text=json.dumps(data))


def setup_logging(
        default_path='logging.yaml',
        default_level=logging.INFO,
        env_key='LOG_CFG'
):
    path = default_path
    value = os.getenv(env_key, None)
    if value:
        path = value
    if os.path.exists(path):
        with open(path, 'rt') as f:
            config = yaml.safe_load(f.read())
        logging.config.dictConfig(config)
    else:
        logging.basicConfig(level=default_level)


if __name__ == '__main__':
    start()
Client:
import asyncio
import collections
import json
import sys

import async_timeout
from aiohttp import ClientSession, TCPConnector

MAX_CONNECTIONS = 100
URL = 'http://localhost:8080'

InventoryAccount = collections.namedtuple("InventoryAccount", "op_co customer_id")


async def produce(queue, num_consumers):
    for i in range(num_consumers * 2):
        await queue.put(InventoryAccount(op_co=i, customer_id=i * 100))
    for j in range(num_consumers):
        await queue.put(None)


async def consumer(n, queue, session, responses):
    print('consumer {}: starting'.format(n))
    while True:
        try:
            account = await queue.get()
            if account is None:
                queue.task_done()
                break
            else:
                print(f"Consumer {n}, Updating cloud prices for account: opCo = {account.op_co!s}, customerId = {account.customer_id!s}")
                params = {'opCo': account.op_co, 'customerId': account.customer_id}
                headers = {'content-type': 'application/json'}
                with async_timeout.timeout(10):
                    print(f"Consumer {n}, session state " + str(session.closed))
                    async with session.post(URL,
                                            headers=headers,
                                            data=json.dumps(params)) as response:
                        assert response.status == 200
                        responses.append(await response.text())
                        queue.task_done()
        except:
            e = sys.exc_info()[0]
            print(f"Consumer {n}, Error updating cloud prices for account: opCo = {account.op_co!s}, customerId = {account.customer_id!s}. {e}")
            queue.task_done()
    print('consumer {}: ending'.format(n))


async def start(loop, session, num_consumers):
    queue = asyncio.Queue(maxsize=num_consumers)
    responses = []
    consumers = [asyncio.ensure_future(loop=loop, coro_or_future=consumer(i, queue, session, responses)) for i in range(num_consumers)]
    await produce(queue, num_consumers)
    await queue.join()
    for consumer_future in consumers:
        consumer_future.cancel()
    return responses


async def run(loop, conn_count):
    async with ClientSession(loop=loop, connector=TCPConnector(verify_ssl=False, limit=conn_count)) as session:
        result = await start(loop, session, conn_count)
        print("Result: " + str(result))


if __name__ == '__main__':
    conn_count = 2
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(run(loop, conn_count))
    finally:
        loop.close()
Reference:
https://pymotw.com/3/asyncio/synchronization.html
https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html
https://hackernoon.com/asyncio-for-the-working-python-developer-5c468e6e2e8e
def do_request(url, token, json_data=None,
               mode="get", work_around_for_image_custom_list=False):
    """Uploads a file. """
    header_collection = {"X-Auth-Token": token}
    if json_data is not None:
        header_collection['Content-Type'] = 'application/json'
    try:
        if mode == "delete":
            # this looks ugly, but there is absolutely no way to
            # get requests to do DELETE when there is a blank JSON
            # included
            r = requests.delete(url, headers=header_collection, timeout=10)
        else:
            r = getattr(requests, mode)(url, data=json.dumps(json_data),
                                        headers=header_collection, timeout=10)
        if r.status_code == 200:
            # This looks ugly also, but has to be for a particular function that calls it
            if work_around_for_image_custom_list:
                return r
            else:
                http_info = (json.dumps(r.json(), indent=2), r.status_code)
        else:
            http_info = (r.text, r.status_code)
        return http_info
    except requests.exceptions.ConnectionError:
        print "Connection Error! Http status Code {}".format(r.status_code)
        sys.exit()
    except (requests.exceptions.RequestException,
            requests.exceptions.HTTPError):
        print "Ambiguous Error! Http status Code {}".format(r.status_code)
        sys.exit()
Using Python 2.7 and the requests module, I have this function that I call from several other functions to make API calls. However, I have to make an exception for one particular function and return the request object itself when work_around_for_image_custom_list is True. This seems like an ugly hack/workaround, and I am wondering how I could rewrite it to handle work_around_for_image_custom_list more cleanly. For instance, would it be better to make this a class and have each function create an object to use it? If so, how would I override the if r.status_code == 200: branch?
Expanding on the comment I made:
def do_raw_request(url, token, json_data=None, mode="get"):
    """Uploads a file. """
    header_collection = {"X-Auth-Token": token}
    if json_data is not None:
        header_collection['Content-Type'] = 'application/json'
    try:
        if mode == "delete":
            # this looks ugly, but there is absolutely no way to
            # get requests to do DELETE when there is a blank JSON
            # included
            r = requests.delete(url, headers=header_collection, timeout=10)
        else:
            r = getattr(requests, mode)(url, data=json.dumps(json_data),
                                        headers=header_collection, timeout=10)
        if r.status_code == 200:
            return r, r.status_code
        return r.text, r.status_code
    except requests.exceptions.ConnectionError:
        print "Connection Error! Http status Code {}".format(r.status_code)
        sys.exit()
    except (requests.exceptions.RequestException,
            requests.exceptions.HTTPError):
        print "Ambiguous Error! Http status Code {}".format(r.status_code)
        sys.exit()
Then:
def do_request(url, token, json_data=None, mode="get"):
    res, code = do_raw_request(url, token, json_data, mode)
    if code == 200:
        return json.dumps(res.json(), indent=2), code
    return res, code
and now you call either do_raw_request or do_request as appropriate.
Note that I changed the return so it always returns a tuple; otherwise you would have to start checking types to know whether you have status text or a response object.
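For example, a hypothetical call site might look like this (the URL and token are placeholders, not from the original post):

# Hypothetical usage; the URL and token are placeholders.
token = "example-token"

# Normal callers get (pretty-printed JSON string, status code) or (error text, status code)
body, status = do_request("https://api.example.com/v1/servers", token)
if status == 200:
    print(body)

# The image-custom-list caller uses do_raw_request and works with the Response object itself
response, status = do_raw_request("https://api.example.com/v1/images", token)
if status == 200:
    image_list = response.json()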