PermissionError Multiprocessing argument pyppeteer.Page
successful but inefficient
import asyncio
from pyppeteer import launch
from multiprocessing import Process
async def f(x):
print("async def f(x,page):",x)
browser = await launch(headless=False, autoClose=False)
page = (await browser.pages())[0]
await page.goto('https://example.com')
h1 = await page.querySelector("body > div > h1")
await page.evaluate(f'(element) => element.textContent="{x}"', h1)
def p(x):
print("def p(x,page):",x)
asyncio.run(f(x))
async def main():
pro = Process(target=p, args=("1111",))
pro.start()
pro = Process(target=p, args=("2222",))
pro.start()
if __name__ =="__main__":
asyncio.get_event_loop().run_until_complete(main())
In order to process a lot, it is burdensome to create multiple browsers.
So, I try to create a lot of tabs.
This is the code I want, but I get an PermissionError
How can I solve this?
import asyncio
from pyppeteer import launch
from multiprocessing import Process
async def f(x,page):
print("async def f(x,page):",x)
await page.goto('https://example.com')
h1 = await page.querySelector("body > div > h1")
await page.evaluate(f'(element) => element.textContent="{x}"', h1)
def p(x,page):
print("def p(x,page):",x)
asyncio.run(f(x,page))
async def main():
browser = await launch(headless=False, autoClose=False)
page = (await browser.pages())[0]
pro = Process(target=p, args=("1111",page))
pro.start()
if __name__ =="__main__":
asyncio.get_event_loop().run_until_complete(main())
error message
c:\Users\mimmi\python\ttttt.py:24: DeprecationWarning: There is no current event loop
asyncio.get_event_loop().run_until_complete(main())
Traceback (most recent call last):
File "c:\Users\mimmi\python\ttttt.py", line 24, in <module>
asyncio.get_event_loop().run_until_complete(main())
File "C:\python\python311\Lib\asyncio\base_events.py", line 650, in run_until_complete
return future.result()
^^^^^^^^^^^^^^^
File "c:\Users\mimmi\python\ttttt.py", line 21, in main
pro.start()
^^^^^^^^^^^
File "C:\python\python311\Lib\multiprocessing\process.py", line 121, in start
self._popen = self._Popen(self)
^^^^^^^^^^^^^^^^^
File "C:\python\python311\Lib\multiprocessing\context.py", line 224, in _Popen
return _default_context.get_context().Process._Popen(process_obj)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\python\python311\Lib\multiprocessing\context.py", line 336, in _Popen
return Popen(process_obj)
^^^^^^^^^^^^^^^^^^
File "C:\python\python311\Lib\multiprocessing\popen_spawn_win32.py", line 94, in __init__
reduction.dump(process_obj, to_child)
File "C:\python\python311\Lib\multiprocessing\reduction.py", line 60, in dump
ForkingPickler(file, protocol).dump(obj)
TypeError: cannot pickle '_thread.lock' object
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "C:\python\python311\Lib\multiprocessing\spawn.py", line 111, in spawn_main
new_handle = reduction.duplicate(pipe_handle,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\python\python311\Lib\multiprocessing\reduction.py", line 79, in duplicate
return _winapi.DuplicateHandle(
^^^^^^^^^^^^^^^^^^^^^^^^
PermissionError: [WinError 5] Access is denied
my environment
windows11
python3.11
pyppeteer1.0.2
I got the desired result with this code.
queue = asyncio.Queue()
browser = await launch(headless=False, autoClose=False)
for i in range(MAX_TASK_COUNT-1):
await browser.newPage()
pages = await browser.pages()
for page in pages:
asyncio.create_task(crawlingTask(queue, page))
await asyncio.create_task(queuePutter(queue, session, appendList))
await queue.join()
Related
I made some pretty simple script which pulls data from clicky.com api but for some reason it does not work as expected from time to time.
Sometimes it gets results but another time I am getting the following errors which I cant debug. I am fairly new to asyncio and aiohttp
Traceback (most recent call last):
File "/Users/almeco/Downloads/python/_projekty/projekt_baza_review/1_stable/asy/usable/baza_goals.py", line 118, in <module>
goals_results_last_week = asyncio.run(goals_clicky_results_last_week())
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/runners.py", line 44, in run
return loop.run_until_complete(main)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/base_events.py", line 641, in run_until_complete
return future.result()
File "/Users/almeco/Downloads/python/_projekty/projekt_baza_review/1_stable/asy/usable/baza_goals.py", line 82, in goals_clicky_results_last_week
responses_clicky_goals = await asyncio.gather(*tasks_goals)
File "/Users/almeco/Downloads/python/_projekty/projekt_baza_review/venv/lib/python3.10/site-packages/aiohttp/client.py", line 1122, in send
return self._coro.send(arg)
File "/Users/almeco/Downloads/python/_projekty/projekt_baza_review/venv/lib/python3.10/site-packages/aiohttp/client.py", line 535, in _request
conn = await self._connector.connect(
File "/Users/almeco/Downloads/python/_projekty/projekt_baza_review/venv/lib/python3.10/site-packages/aiohttp/connector.py", line 542, in connect
proto = await self._create_connection(req, traces, timeout)
File "/Users/almeco/Downloads/python/_projekty/projekt_baza_review/venv/lib/python3.10/site-packages/aiohttp/connector.py", line 907, in _create_connection
_, proto = await self._create_direct_connection(req, traces, timeout)
File "/Users/almeco/Downloads/python/_projekty/projekt_baza_review/venv/lib/python3.10/site-packages/aiohttp/connector.py", line 1175, in _create_direct_connection
transp, proto = await self._wrap_create_connection(
File "/Users/almeco/Downloads/python/_projekty/projekt_baza_review/venv/lib/python3.10/site-packages/aiohttp/connector.py", line 986, in _wrap_create_connection
return await self._loop.create_connection(*args, **kwargs) # type: ignore[return-value] # noqa
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/base_events.py", line 1040, in create_connection
sock = await self._connect_sock(
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/base_events.py", line 954, in _connect_sock
await self.sock_connect(sock, address)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/selector_events.py", line 502, in sock_connect
return await fut
RuntimeError: await wasn't used with future
How to debug this? Whats the problem here?
edited:
Here is my code for you to test:
import asyncio
import datetime
import aiohttp
import requests
start_operacji = datetime.datetime.now()
print('start', start_operacji)
date_filename = datetime.datetime.now().strftime('%d-%m_%H-%M')
def goals_clicky_tasks_last_week(session):
tasks_clicky_goals = []
# todo to mozna by jeszcze dać do asyncio
clicky_auth = requests.get(
'https://api.clicky.com/api/account/sites?username=meeffe&password=hAs!23$5cy&output=json')
auth_jsonised = clicky_auth.json()
list_site_id_sitekey_dict = []
for k in auth_jsonised:
site_id_sitekey_dict = {'site_id': k['site_id'], 'sitekey': k['sitekey']}
list_site_id_sitekey_dict.append(site_id_sitekey_dict)
for auth_item in list_site_id_sitekey_dict:
goal_url = f"https://api.clicky.com/api/stats/4?site_id={auth_item['site_id']}&sitekey={auth_item['sitekey']}&type=goals&date=today&limit=1000&output=json"
tasks_clicky_goals.append(asyncio.ensure_future(session.get(goal_url, ssl=False)))
return tasks_clicky_goals
async def goals_clicky_results_last_week():
list_final_goals = []
async with aiohttp.ClientSession() as session:
tasks_goals = goals_clicky_tasks_last_week(session)
responses_clicky_goals = await asyncio.gather(*tasks_goals)
for response_clicky_goal in responses_clicky_goals:
clicky_data = await response_clicky_goal.json(content_type=None)
goals_list = []
for url_item_goal in clicky_data[0]['dates'][0]['items']:
if url_item_goal['conversion'] != '':
if url_item_goal['title'].startswith(
'http'): # nie bierze pod uwagę goalsów które zawierają U - https:// etc
goals_dict = {'url': url_item_goal['title'].replace('http://', 'https://'),
'goals': url_item_goal['value'],
'ad_ctr': url_item_goal['conversion']
}
goals_list.append(goals_dict)
else:
continue
else:
continue
list_final_goals.append(goals_list)
flattened_list_final_goals = [val for sublist in list_final_goals for val in sublist]
return flattened_list_final_goals
print(asyncio.run(goals_clicky_results_last_week()), 'goals_clicky_results_last_week')
goals_results_last_week = asyncio.run(goals_clicky_results_last_week())
######################################################################
end = datetime.datetime.now() - start_operacji
print('Ready:)!')
print('It took: ', end)
I actually found a solution by myself.
Instead of aiohttp I used httpx
I used timeout with every request
I removed unnecessary await
Changes in an original code below. Now the script run 100% stable. To be frank I am not sure which of these changes had the biggest impact but it works as expected.
timeout = httpx.Timeout(60.0, connect=60.0)
async with httpx.AsyncClient(verify=False, timeout=timeout) as session:
tasks_goals = goals_clicky_tasks_last_week(session)
responses_clicky_goals = await asyncio.gather(*tasks_goals)
for response_clicky_goal in responses_clicky_goals:
clicky_data = response_clicky_goal.json()
...
I need help figure out what I did wrong and guidance to debug issues.
The program keep throwing one of these 3 errors. I'm using python 3.6 on window 10.
The error starts with crawling only 200 urls and I need to verify up to 20000 urls
OSError: [WinError 10038] An operation was attempted on something that is not a socket
aiohttp.client_exceptions.ServerTimeoutError: Timeout on reading data from socket
Traceback (most recent call last):
File "test_asyncio.py", line 91, in <module>
def func():
File "E:\git\AzureRepo\SharePoint\applitools\timer.py", line 8, in wrapper
runs = timeit.repeat(func, number=number,
File "C:\Users\xxxxx\AppData\Local\Programs\Python\Python38\lib\timeit.py", line 238, in repeat
return Timer(stmt, setup, timer, globals).repeat(repeat, number)
File "C:\Users\xxxxx\AppData\Local\Programs\Python\Python38\lib\timeit.py", line 205, in repeat
t = self.timeit(number)
File "C:\Users\xxxxx\AppData\Local\Programs\Python\Python38\lib\timeit.py", line 177, in timeit
timing = self.inner(it, self.timer)
File "<timeit-src>", line 6, in inner
File "test_asyncio.py", line 110, in func
res = loop.run_until_complete(du_main(all_urls, all_site))
File "C:\Users\xxxxx\AppData\Local\Programs\Python\Python38\lib\asyncio\base_events.py", line 616, in run_until_complete
return future.result()
File "test_asyncio.py", line 82, in du_main
result = await asyncio.gather(*tasks)
File "test_asyncio.py", line 43, in du_fetch
async with session.get(url=url, allow_redirects=False,
File "E:\git\AzureRepo\SharePoint\applitools\.venv\lib\site-packages\aiohttp\client.py", line 1117, in __aenter__
self._resp = await self._coro
File "E:\git\AzureRepo\SharePoint\applitools\.venv\lib\site-packages\aiohttp\client.py", line 544, in _request
await resp.start(conn)
File "E:\git\AzureRepo\SharePoint\applitools\.venv\lib\site-packages\aiohttp\client_reqrep.py", line 890, in start
message, payload = await self._protocol.read() # type: ignore
File "E:\git\AzureRepo\SharePoint\applitools\.venv\lib\site-packages\aiohttp\streams.py", line 604, in read
await self._waiter
aiohttp.client_exceptions.ServerTimeoutError: Timeout on reading data from socket
ERROR - Unclosed connection
async def du_fetch(session, site, url):
# try:
t0 = datetime.datetime.now()
async with session.get(url=url, allow_redirects=False,
timeout=ClientTimeout(total=1200, # 1200 works
sock_connect=5, # 600 works
sock_read=5)) as resp:
redirect_url = resp.headers.get('Location', None)
t1 = datetime.datetime.now()
time_diff = round((t1 - t0).total_seconds(), 3)
# resp.close()
# logging.debug(f'du_fetch {site} {url.encode("utf-8")}')
'''
except Exception as e:
t1 = datetime.datetime.now()
time_diff = round((t1 - t0).total_seconds(), 3)
logging.debug(e)
return (site, 999,
t0, time_diff, url, e.__class__.__name__)
'''
return (site, resp.status,
t0, time_diff, url, redirect_url)
async def du_main(urls, sites):
client_timeout = aiohttp.ClientTimeout(
total=120) # total 300 seconds (5min) timeout # 12000 works
connector = aiohttp.TCPConnector(limit=100, # default is 100 # 600 works
# limit_per_host=0,
ssl=False)
dummy_jar = aiohttp.DummyCookieJar()
tasks = []
async with aiohttp.ClientSession(connector=connector,
timeout=client_timeout,
cookie_jar=dummy_jar) as session:
tasks = [du_fetch(session, sites[i], url)
for i, url in enumerate(urls)]
result = await asyncio.gather(*tasks)
logging.error(f'du_main {session}')
await session.close()
return result
if __name__ == '__main__':
loop = asyncio.get_event_loop()
res = loop.run_until_complete(du_main(all_urls, all_site))
loop.close()
I try to run this code:
import asyncio
async def eva(code):
exec("async def ex(): return {}".format(code))
return await asyncio.wait_for(locals()["ex"](), timeout=1.0)
async def main():
while True:
code = input()
x = await asyncio.wait_for(eva(code), timeout=1.0)
print(x)
asyncio.run(main())
And getting following error:
<module>
asyncio.run(main()) File "C:\Users\\{user}\AppData\Local\Programs\Python\Python37\lib\asyncio\runners.py",
line 43, in run
return loop.run_until_complete(main) File "C:\Users\\{user}\AppData\Local\Programs\Python\Python37\lib\asyncio\base_events.py",
line 584, in run_until_complete
return future.result()
File "eval.py", line 10, in main
x = await asyncio.wait_for(eva(code), timeout=1.0) File "C:\Users\\{user}\AppData\Local\Programs\Python\Python37\lib\asyncio\tasks.py",
line 416, in wait_for
return fut.result()
File "eval.py", line 5, in eva
return await asyncio.wait_for(locals()["ex"](), timeout=1.0)
File "C:\Users\\
{user}\AppData\Local\Programs\Python\Python37\lib\asyncio\tasks.py",
line 416, in wait_for
return fut.result()
File "<string>", line 1, in ex TypeError: 'int' object is not
iterable ```
Can you help me to understand what exactly happens?
From traceback you can see that error happend at function ex:
File "<string>", line 1, in ex
And shortly before it this line was executed:
File "eval.py", line 5, in eva
return await asyncio.wait_for(locals()["ex"](), timeout=1.0)
In other words exception was raised somewhere inside coroutine ex you got from locals()["ex"].
Exception message is:
TypeError: 'int' object is not iterable
You can google it to understand typical situation when it can happen, but it's not hard to assume either: something inside ex tried to iterate object of type int.
Something like this happened:
import asyncio
async def ex():
for i in 123:
pass
async def main():
return await asyncio.wait_for(ex(), timeout=1.0)
asyncio.run(main())
Run and you'll see similar:
File "...\main.py", line 15, in main
return await asyncio.wait_for(ex(), timeout=1.0)
File "...\python37\lib\asyncio\tasks.py", line 416, in wait_for
return fut.result()
File "...\main.py", line 11, in ex
for i in 123:
TypeError: 'int' object is not iterable
I'm having an issue where i try to print a invite link into console, but instead it prints out
<generator object Client.create_invite at 0x000001D310A183B8>
<generator object Client.create_invite at 0x000001D310A65410>
i am using this code:
#client.event
async def on_ready():
#await client.change_presence(game=Game(name="with humans"))
print("Logged in as " + client.user.name)
await asyncio.sleep(3)
for server in client.servers:
for channel in server.channels:
channel_type = channel.type
if str(channel_type) == 'text':
invitelinknew = client.create_invite(destination=channel, xkcd=True, max_uses=100)
print(str(invitelinknew))
break
i tried changing print(invitelinknew) to print(str(invitelinknew)), but it didn't change the outcome
EDIT:
New errors when consuming the genrator with invitelinknew2 = list(invitelinknew)
and print(invitelinknew2):
Traceback (most recent call last):
File "C:\Program Files\Python36\lib\site-packages\discord\client.py", line 307, in _run_event
yield from getattr(self, event)(*args, **kwargs)
File "C:/Users/Rasmus/Python/discordbot/botnoggi2.py", line 128, in on_ready
invitelinknew2 = list(invitelinknew)
File "C:\Program Files\Python36\lib\site-packages\discord\client.py", line 2628, in create_invite
data = yield from self.http.create_invite(destination.id, **options)
File "C:\Program Files\Python36\lib\site-packages\discord\http.py", line 137, in request
r = yield from self.session.request(method, url, **kwargs)
File "C:\Program Files\Python36\lib\site-packages\aiohttp\client.py", line 555, in __iter__
resp = yield from self._coro
File "C:\Program Files\Python36\lib\site-packages\aiohttp\client.py", line 202, in _request
yield from resp.start(conn, read_until_eof)
File "C:\Program Files\Python36\lib\site-packages\aiohttp\client_reqrep.py", line 640, in start
message = yield from httpstream.read()
File "C:\Program Files\Python36\lib\site-packages\aiohttp\streams.py", line 641, in read
result = yield from super().read()
File "C:\Program Files\Python36\lib\site-packages\aiohttp\streams.py", line 476, in read
yield from self._waiter
AssertionError: yield from wasn't used with future
Future exception was never retrieved
future: <Future finished exception=ServerDisconnectedError()>
aiohttp.errors.ServerDisconnectedError
According to the documentation create_invite is a coroutine and requires the await keyword
Change your code to include it such as
if channel.type == discord.ChannelType.text:
invitelinknew = await client.create_invite(destination=channel, xkcd=True, max_uses=100)
print(invitelinknew.url)
I wrote a program that would post events using asyncio and aiohttp. This program works when I run it locally. I can post 10k events no problem. However, I SCPed the whole codebase to a remote machine and within that machine I can't post more than 15 events without getting this error:
RuntimeError: Event loop is closed
Exception ignored in: <coroutine object Poster.async_post_event at 0x7f4a53989410>
Traceback (most recent call last):
File "/home/bli1/qe-trinity/tracer/utils/poster.py", line 63, in async_post_event
File "/home/bli1/py/python3.5/lib/python3.5/site-packages/aiohttp/client.py", line 565, in __aenter__
File "/home/bli1/py/python3.5/lib/python3.5/site-packages/aiohttp/client.py", line 198, in _request
File "/home/bli1/py/python3.5/lib/python3.5/site-packages/aiohttp/connector.py", line 316, in connect
File "/home/bli1/py/python3.5/lib/python3.5/site-packages/aiohttp/connector.py", line 349, in _release_waiter
File "/home/bli1/py/python3.5/lib/python3.5/asyncio/futures.py", line 332, in set_result
File "/home/bli1/py/python3.5/lib/python3.5/asyncio/futures.py", line 242, in _schedule_callbacks
File "/home/bli1/py/python3.5/lib/python3.5/asyncio/base_events.py", line 447, in call_soon
File "/home/bli1/py/python3.5/lib/python3.5/asyncio/base_events.py", line 456, in _call_soon
File "/home/bli1/py/python3.5/lib/python3.5/asyncio/base_events.py", line 284, in _check_closed
RuntimeError: Event loop is closed
Exception ignored in: <coroutine object Poster.async_post_event at 0x7f4a5397ffc0>
Traceback (most recent call last):
File "/home/bli1/qe-trinity/tracer/utils/poster.py", line 63, in async_post_event
File "/home/bli1/py/python3.5/lib/python3.5/site-packages/aiohttp/client.py", line 565, in __aenter__
File "/home/bli1/py/python3.5/lib/python3.5/site-packages/aiohttp/client.py", line 198, in _request
File "/home/bli1/py/python3.5/lib/python3.5/site-packages/aiohttp/connector.py", line 316, in connect
File "/home/bli1/py/python3.5/lib/python3.5/site-packages/aiohttp/connector.py", line 349, in _release_waiter
File "/home/bli1/py/python3.5/lib/python3.5/asyncio/futures.py", line 332, in set_result
File "/home/bli1/py/python3.5/lib/python3.5/asyncio/futures.py", line 242, in _schedule_callbacks
File "/home/bli1/py/python3.5/lib/python3.5/asyncio/base_events.py", line 447, in call_soon
File "/home/bli1/py/python3.5/lib/python3.5/asyncio/base_events.py", line 456, in _call_soon
File "/home/bli1/py/python3.5/lib/python3.5/asyncio/base_events.py", line 284, in _check_closed
RuntimeError: Event loop is closed
How can I debug this or find out the source of this problem?
Here is the class that I created and I use the method post() to run:
import uuid
import os
import asyncio
import time
import random
import json
import aiohttp
from tracer.utils.phase import Phase
class Poster(Phase):
def __init__(self, log, endpoint, num_post, topic, datafile, timeout, oracles, secure=False, thru_proxy=True):
Phase.__init__(self, log, "post", oracles, secure, thru_proxy)
self.log = log
self.num_post = int(num_post)
self.datafile = datafile.readlines()
self.topic = topic
self.endpoint = self.set_endpoint(endpoint, self.topic)
self.response = None
self.timeout = timeout
def random_line(self):
""" Returns random line from file and converts it to JSON """
return json.loads(random.choice(self.datafile))
#staticmethod
def change_uuid(event):
""" Creates new UUID for event_id """
new_uuid = str(uuid.uuid4())
event["event_header"]["event_id"] = new_uuid
return event
#staticmethod
def wrapevent(event):
""" Wrap event with metadata for analysis later on """
return {
"tracer": {
"post": {
"statusCode": None,
"timestamp": None,
},
"awsKafkaTimestamp": None,
"qdcKakfaTimestamp": None,
"hdfsTimestamp": None
},
"event": event
}
def gen_random_event(self):
random_event = self.random_line()
event = self.change_uuid(random_event)
dataspec = self.wrapevent(event)
return dataspec
async def async_post_event(self, event, session):
async with session.post(self.endpoint, data=event, proxy=self.proxy) as resp:
event["tracer"]["post"]["timestamp"] = time.time() * 1000.0
event["tracer"]["post"]["statusCode"] = resp.status
unique_id = event["event"]["event_header"]["event_id"]
oracle_endpoint = os.path.join(self.oracle, unique_id)
async with session.put(oracle_endpoint, data=json.dumps(event), proxy=self.proxy) as resp:
if resp.status != 200:
self.log.debug("Post to ElasticSearch not 200")
self.log.debug(event["event"]["event_header"]["event_id"])
self.log.debug("Status code: " + str(resp.status))
return event["event"]["event_header"]["event_id"], resp.status
async def async_post_events(self, events):
coros = []
conn = aiohttp.TCPConnector(verify_ssl=self.secure)
async with aiohttp.ClientSession(connector=conn) as session:
for event in events:
coros.append(self.async_post_event(event, session))
return await asyncio.gather(*coros)
def post(self):
event_loop = asyncio.get_event_loop()
try:
events = [self.gen_random_event() for i in range(self.num_post)]
start_time = time.time()
results = event_loop.run_until_complete(self.async_post_events(events))
print("Time taken: " + str(time.time() - start_time))
finally:
event_loop.close()
You cannot re-use a loop once it's closed. From AbstractEventLoop.close documentation:
This is idempotent and irreversible. No other methods should be called after this one.
Either remove the loop.close call or create a new loop for each post.
My advice would be to avoid those problems by running everything inside the loop and awaiting async_post_events when needed.