I'm trying to make requests with headless Chrome using pyppeteer, but I keep getting "OSError: [Errno 24] Too many open files" after a certain number of requests. I checked the open resources of the Python process with lsof and found that every new request adds a line like the following:
python3 14840 root 11r FIFO 0,8 0t0 64208510 pipe
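As an aside, a rough way to watch that growth from inside the Python process itself is to count the entries in /proc/self/fd on Linux, which is the same information lsof reads; a minimal sketch:

import os

def open_fd_count():
    # Each entry in /proc/self/fd is one open file descriptor (regular files, sockets, pipes).
    return len(os.listdir('/proc/self/fd'))

print("open file descriptors:", open_fd_count())

Calling this before and after each request makes the leak visible without leaving the script.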
Can someone tell me which resources aren't being closed? The code that produces this error is below:
def search(self, search_path):
    async def main(url):
        browser = await launch(args=['--no-sandbox'], handleSIGINT=False, handleSIGTERM=False, handleSIGHUP=False)
        page = await browser.newPage()
        await page.setJavaScriptEnabled(False)
        try:
            response = await page.goto(url, options={"timeout": 50000})
        except pyppeteer.errors.TimeoutError:
            pass
        src = await page.content()
        await browser.close()
        return src

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    url = "https://www.example.com" + search_path
    val = asyncio.get_event_loop().run_until_complete(main(url))
    loop.close()
EDIT
I managed to close the open pipes by calling
browser.process.communicate()
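For reference, here is a minimal sketch of where that call can sit in the coroutine above. It assumes browser.process is the subprocess.Popen handle that pyppeteer's launch() keeps for the spawned Chromium, so communicate() reaps the child and releases its pipes after the browser has been closed:

import asyncio
import pyppeteer.errors
from pyppeteer import launch

async def main(url):
    browser = await launch(args=['--no-sandbox'], handleSIGINT=False, handleSIGTERM=False, handleSIGHUP=False)
    try:
        page = await browser.newPage()
        await page.setJavaScriptEnabled(False)
        try:
            await page.goto(url, options={"timeout": 50000})
        except pyppeteer.errors.TimeoutError:
            pass
        return await page.content()
    finally:
        await browser.close()
        # Reap the Chromium child process so its stdout/stderr pipes are released.
        browser.process.communicate()

src = asyncio.get_event_loop().run_until_complete(main("https://www.example.com"))

The try/finally also guarantees the cleanup runs even when the page raises, which matters when the leak only shows up after many requests.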
Related
I've been struggling for a while to download a PDF in Python Playwright that is rendered from a PHP web page (not included in this example because it contains sensitive code; instead I've included a link to a PDF).
Here's my code so far, using the JavaScript code from https://github.com/microsoft/playwright/issues/3509 as an example:
from playwright.async_api import Playwright, async_playwright, expect
import asyncio
import os
import json

tmp_dir = './pwtest/'
user_dir = os.path.join(os.getcwd(), "pwtest", "user_dir")
print("User dir: ", user_dir)
downloads_path = os.path.join(os.getcwd(), "pwtest", "downloads")
print("Downloads path: ", downloads_path)
storage_state_path = "./pwtest/"

default_preferences = {
    "plugins": {
        "always_open_pdf_externally": True
    }
}

# Making directories can likely be done more efficiently... please comment if you know how, then I'll edit the post.
try:
    os.mkdir(os.path.join(os.getcwd(), "pwtest"))
except:
    print("Unable to create folder... Likely it already exists.")
try:
    os.mkdir(os.path.join(os.getcwd(), "pwtest", "downloads"))
except:
    print("Unable to create folder... Likely it already exists.")
try:
    os.mkdir(os.path.join(os.getcwd(), "pwtest", "user_dir"))
except:
    print("Unable to create folder... Likely it already exists.")
try:
    os.mkdir(os.path.join(os.getcwd(), "pwtest", "user_dir", "Default"))
except:
    print("Unable to create folder... Likely it already exists.")

with open(os.path.join(user_dir, "Default", "Preferences"), "w") as f:
    f.write(json.dumps(default_preferences))
async def run(playwright: Playwright) -> None:
    browser = await playwright.chromium.launch_persistent_context(user_dir, accept_downloads=True, headless=False, slow_mo=1000)
    browser.set_default_timeout(10000)
    page = await browser.new_page()
    # Start waiting for the download
    file_name = "test_d.pdf"
    async with page.expect_download() as download_info:
        await page.goto("https://www.africau.edu/images/default/sample.pdf", timeout=5000)
        await page.wait_for_timeout(200)
    print("Saving file to ", downloads_path, file_name)
    # Wait for the download to start
    download = await download_info.value
    # Wait for the download process to complete
    print(await download.path())
    # Save downloaded file somewhere
    await download.save_as(os.path.join(downloads_path, file_name))
    await browser.close()

async def main() -> None:
    async with async_playwright() as playwright:
        await run(playwright)

asyncio.run(main())
Help will be appreciated.
I keep getting the following error after trying sync, async, etc. variants of the code. Another alternative would probably be to intercept the blob transfer, but I don't know how that's done. Please advise.
playwright._impl._api_types.Error: net::ERR_ABORTED at https://www.africau.edu/images/default/sample.pdf
=========================== logs ===========================
navigating to "https://www.africau.edu/images/default/sample.pdf", waiting until "load"
============================================================
The page is taking too long to load all of its JS content. Change the default timeout to zero, so that there is no timeout.
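For illustration, here is a small sketch of what that suggestion looks like with Playwright's async API as used in the question; a timeout value of 0 disables the limit (timeouts are otherwise in milliseconds), and it can be set per page or per call:

import asyncio
from playwright.async_api import async_playwright

async def main() -> None:
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=True)
        page = await browser.new_page()
        page.set_default_timeout(0)  # 0 disables the default timeout for every page operation
        # A per-call override works too; 0 again means "wait indefinitely".
        await page.goto("https://example.com", timeout=0)
        print(await page.title())
        await browser.close()

asyncio.run(main())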
This downloads updated FASTA files (protein sequences) from a database. I've gotten it to work faster using asyncio compared to requests, but I'm not convinced the downloads are actually happening asynchronously.
import os
import aiohttp
import aiofiles
import asyncio

folder = '~/base/fastas/proteomes/'

upos = {'UP000005640': 'Human_Homo_sapien',
        'UP000002254': 'Dog_Boxer_Canis_Lupus_familiaris',
        'UP000002311': 'Yeast_Saccharomyces_cerevisiae',
        'UP000000589': 'Mouse_Mus_musculus',
        'UP000006718': 'Monkey_Rhesus_macaque_Macaca_mulatta',
        'UP000009130': 'Monkey_Cynomolgus_Macaca_fascicularis',
        'UP000002494': 'Rat_Rattus_norvegicus',
        'UP000000625': 'Escherichia_coli',
        }

# https://www.uniprot.org/uniprot/?query=proteome:UP000005640&format=fasta  Example link
startline = r'https://www.uniprot.org/uniprot/?query=proteome:'
endline = r'&format=fasta&include=False'  # include is true to include isoforms, make false for only canonical sequences

async def fetch(session, link, folderlocation, name):
    async with session.get(link, timeout=0) as response:
        try:
            file = await aiofiles.open(folderlocation, mode='w')
            file.write(await response.text())
            await file.close()
            print(name, 'ended')
        except FileNotFoundError:
            loc = ''.join((r'/'.join((folderlocation.split('/')[:-1])), '/'))
            command = ' '.join(('mkdir -p', loc))
            os.system(command)
            file = await aiofiles.open(folderlocation, mode='w')
            file.write(await response.text())
            await file.close()
            print(name, 'ended')

async def rfunc():
    async with aiohttp.ClientSession() as session:
        for upo, name in upos.items():
            print(name, 'started')
            link = ''.join((startline, upo, endline))
            folderlocation = ''.join((folder, name, '.fasta'))
            await fetch(session, link, folderlocation, name)

loop = asyncio.get_event_loop()
loop.run_until_complete(rfunc())
My output from running this:
In [5]: runfile('~/base/Fasta Proteome Updater.py')
Human_Homo_sapien started
Human_Homo_sapien ended
Dog_Boxer_Canis_Lupus_familiaris started
Dog_Boxer_Canis_Lupus_familiaris ended
Yeast_Saccharomyces_cerevisiae started
Yeast_Saccharomyces_cerevisiae ended
Mouse_Mus_musculus started
Mouse_Mus_musculus ended
Monkey_Rhesus_macaque_Macaca_mulatta started
Monkey_Rhesus_macaque_Macaca_mulatta ended
Monkey_Cynomolgus_Macaca_fascicularis started
Monkey_Cynomolgus_Macaca_fascicularis ended
Rat_Rattus_norvegicus started
Rat_Rattus_norvegicus ended
Escherichia_coli started
Escherichia_coli ended
The printed output seems to indicate that the downloads are happening one at a time. Is there something wrong here?
You are looping over the items to download and waiting (await) for each one to finish before starting the next. To make them all run at the same time, you need to schedule all downloads for execution at once, e.g. using asyncio.gather.
Then your code could look like this:
async def rfunc():
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(
            *[
                fetch(
                    session,
                    ''.join((startline, upo, endline)),
                    ''.join((folder, name, '.fasta')),
                    name,
                ) for upo, name in upos.items()
            ]
        )

loop = asyncio.get_event_loop()
loop.run_until_complete(rfunc())
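On Python 3.7 or newer, the last two lines can also be written as the single call below; either form runs the coroutine to completion:

asyncio.run(rfunc())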
Within this online script (not included here), I am trying to find out the length of the video/audio before it is downloaded. Notes are included within the script.
My question: How do I find the length of a video before it downloads as audio?
Please do not use or mention any part of repl.it. I use IDLE with Python 3.7.3, and this portion lives in a script on my PC that runs my bot.
Sorry if I could not word this any better.
You can extract the video's data first, then download only if the video length is less than or equal to a certain value:
class AudioTooLongError(commands.CommandError): pass

@classmethod
async def from_url(cls, url, *, loop=None, stream=False):
    loop = loop or asyncio.get_event_loop()
    to_run = functools.partial(ytdl.extract_info, url=url, download=False)
    data = await loop.run_in_executor(None, to_run)
    if 'entries' in data:
        # take first item from a playlist
        data = data['entries'][0]
    if data['duration'] > 3600:
        # Forbid downloading of sources more than 1 hour long.
        raise AudioTooLongError
    if not stream:
        try: data = ytdl.process_ie_result(data)
        except PermissionError: pass
        source = ytdl.prepare_filename(data)
        return cls(discord.FFmpegPCMAudio(source), data=data, requester=ctx.author)
    return data['url']
Try this!
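Stripped of the Discord-specific plumbing, the core idea above is that youtube-dl can extract metadata without downloading anything, and the returned dict carries the duration in seconds. A rough standalone sketch of just that check (the URL is a placeholder, and the 3600-second cutoff mirrors the one used above):

import youtube_dl

ydl = youtube_dl.YoutubeDL({'format': 'bestaudio/best', 'quiet': True})

def duration_seconds(url):
    # download=False fetches only the metadata, so nothing is written to disk yet.
    info = ydl.extract_info(url, download=False)
    if 'entries' in info:
        # For playlists, look at the first entry, as the answer above does.
        info = info['entries'][0]
    return info.get('duration')

url = 'https://www.youtube.com/watch?v=XXXXXXXXXXX'  # placeholder: the requested video
length = duration_seconds(url)
if length is not None and length <= 3600:
    ydl.extract_info(url, download=True)  # short enough, go ahead and download it
else:
    print('Refusing to download: longer than an hour or no duration reported.')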
I've created a script in Python using pyppeteer to collect the links of different posts from a webpage and then parse the title of each post by visiting its target page, reusing the collected links. Although the content is static, I'd like to know how pyppeteer works in such cases.
I tried to supply the browser variable from the main() function to the fetch() and browse_all_links() functions so that I can reuse the same browser over and over again.
My current approach:
import asyncio
from pyppeteer import launch

url = "https://stackoverflow.com/questions/tagged/web-scraping"

async def fetch(page, url):
    await page.goto(url)
    linkstorage = []
    await page.waitForSelector('.summary .question-hyperlink')
    elements = await page.querySelectorAll('.summary .question-hyperlink')
    for element in elements:
        linkstorage.append(await page.evaluate('(element) => element.href', element))
    return linkstorage

async def browse_all_links(page, link):
    await page.goto(link)
    await page.waitForSelector('h1 > a')
    title = await page.querySelectorEval('h1 > a', '(e => e.innerText)')
    print(title)

async def main():
    browser = await launch(headless=False, autoClose=False)
    [page] = await browser.pages()
    links = await fetch(page, url)
    tasks = [await browse_all_links(page, url) for url in links]
    await asyncio.gather(*tasks)

if __name__ == '__main__':
    asyncio.run(main())
The above script fetches some titles but spits out the following error at some point during execution:
Possible to select <a> with specific text within the quotes?
Crawler Runs Too Slow
How do I loop a list of ticker to scrape balance sheet info?
How to retrive the url of searched video from youtbe using python
VBA-JSON to import data from all pages in one table
Is there an algorithm that detects semantic visual blocks in a webpage?
find_all only scrape the last value
#ERROR STARTS
Future exception was never retrieved
future: <Future finished exception=NetworkError('Protocol error (Runtime.releaseObject): Cannot find context with specified id')>
pyppeteer.errors.NetworkError: Protocol error (Runtime.releaseObject): Cannot find context with specified id
Future exception was never retrieved
As it's been two days since this question was posted and no one has answered yet, I will take this opportunity to address the issue in a way that I think might be helpful to you.
There are 15 links but you are getting only 7; this is probably because websockets is losing the connection and the page is no longer reachable.
List comprehension
tasks = [await browse_all_links(page,url) for url in links] What do you expect this list to be? If it succeeds, it will be a list of None elements, so your next line of code will throw an error!
Solution
downgrade websockets 7.0 to websockets 6.0
remove this line of code: await asyncio.gather(*tasks)
I am using Python 3.6, so I had to change the last line of code. You don't need to change it if you are using Python 3.7, which I think you are.
import asyncio
from pyppeteer import launch

url = "https://stackoverflow.com/questions/tagged/web-scraping"

async def fetch(page, url):
    await page.goto(url)
    linkstorage = []
    await page.waitForSelector('.summary .question-hyperlink')
    elements = await page.querySelectorAll('.summary .question-hyperlink')
    for element in elements:
        linkstorage.append(await page.evaluate('(element) => element.href', element))
    return linkstorage

async def browse_all_links(page, link):
    await page.goto(link)
    await page.waitForSelector('h1 > a')
    title = await page.querySelectorEval('h1 > a', '(e => e.innerText)')
    print(title)

async def main():
    browser = await launch(headless=False, autoClose=False)
    [page] = await browser.pages()
    links = await fetch(page, url)
    tasks = [await browse_all_links(page, url) for url in links]
    # await asyncio.gather(*tasks)
    await browser.close()

if __name__ == '__main__':
    # asyncio.run(main())
    asyncio.get_event_loop().run_until_complete(main())
Output
(testenv) C:\Py\pypuppeteer1>python stack3.py
Scrapy Shell response.css returns an empty array
Scrapy real-time spider
Why do I get KeyError while reading data with get request?
Scrapy spider can't redefine custom_settings according to args
Custom JS Script using Lua in Splash UI
Can someone explain why and how this piece of code works [on hold]
How can I extract required data from a list of strings?
Scrapy CrawlSpider rules for crawling single page
how to scrape a web-page with search bar results, when the search query does not appear in the url
Nested for loop keeps repeating
Get all tags except a list of tags BeautifulSoup
Get current URL using Python and webbot
How to login to site and send data
Unable to append value to colums. Getting error IndexError: list index out of range
NextSibling.Innertext not working. “Object doesn't support this property”
I'm currently building a Discord bot that uploads a file to Google Drive when a command is used. However, the command methods are asynchronous while the files().create() method is synchronous, and calling it simply causes the bot to get stuck.
@bot.command(pass_context=True)
@commands.has_role(name='Archivist')
async def archivechannel(ctx, channel: discord.Channel, filename):
    await bot.say("Archiving....")
    try:
        with open("{}.txt".format(filename), "w") as openfile:
            lines = []
            async for message in bot.logs_from(channel, limit=500, reverse=True):
                if not (message.author.bot or message.content.startswith("]")):
                    print("<{}> {}#{}: {}".format(message.timestamp, message.author.name, message.author.discriminator, message.content))
                    lines.append("<{}> {}#{}: {}\n".format(message.timestamp, message.author.name, message.author.discriminator, message.content))
            openfile.writelines(lines)
        await bot.say("Archive Complete!")
    except IOError:
        await bot.say("Error: IOException")
    await bot.say("Uploading....")
    metadata = {'name': "{}.txt".format(filename), 'mimetype': 'application/vnd.google.apps.document', 'parents': folderID}
    media = MediaFileUpload('{}.txt'.format(filename), mimetype='text/plain')
    res = service.files().create(body=metadata, media_body=media).execute()
    print(res)
The line causing the problem is:
res = service.files().create(body=metadata, media_body=media).execute()
The bot just gets stuck after saying "Uploading...." and doesn't upload anything.
Does anyone know how I can fix this?
Edit: Neither using a ThreadPoolExecutor nor the default executor has worked, nor has setting up a synchronous function that runs the create and execute methods, taking in the metadata and media parameters.
Edit 2: After doing some more screwing around, it turns out the problem is now in the following line:
media = MediaFileUpload('{}.txt'.format(filename), mimetype='text/plain')
However, from my testing, Patrick's answer is correct for the question I asked, so I have marked the question as answered.
You can run your blocking operation in another thread, while your asynchronous code waits for it to complete without blocking the event loop.
We'll create a new ThreadPoolExecutor, then use run_in_executor to run the blocking task in it.
from concurrent.futures import ThreadPoolExecutor

def upload_file(metadata, media):
    return service.files().create(body=metadata, media_body=media).execute()

@bot.command(pass_context=True)
@commands.has_role(name='Archivist')
async def archivechannel(ctx, channel: discord.Channel, filename):
    await bot.say("Archiving....")
    try:
        with open("{}.txt".format(filename), "w") as openfile:
            lines = []
            async for message in bot.logs_from(channel, limit=500, reverse=True):
                if not (message.author.bot or message.content.startswith("]")):
                    print("<{}> {}#{}: {}".format(message.timestamp, message.author.name, message.author.discriminator, message.content))
                    lines.append("<{}> {}#{}: {}\n".format(message.timestamp, message.author.name, message.author.discriminator, message.content))
            openfile.writelines(lines)
        await bot.say("Archive Complete!")
    except IOError:
        await bot.say("Error: IOException")
    await bot.say("Uploading....")
    metadata = {'name': "{}.txt".format(filename), 'mimetype': 'application/vnd.google.apps.document', 'parents': folderID}
    media = MediaFileUpload('{}.txt'.format(filename), mimetype='text/plain')
    with ThreadPoolExecutor() as pool:
        res = await bot.loop.run_in_executor(
            pool, upload_file, metadata, media
        )
    print(res)
You may also be able to use the default executor by removing the context manager and passing None instead of pool. I'm having trouble finding information about the default executor though, so you may want to experiment.
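For what it's worth, here is a sketch of that variant; it assumes the same upload_file helper defined above and simply replaces the with-block at the end of the command, since passing None as the first argument makes run_in_executor use the loop's default thread pool:

    await bot.say("Uploading....")
    metadata = {'name': "{}.txt".format(filename), 'mimetype': 'application/vnd.google.apps.document', 'parents': folderID}
    media = MediaFileUpload('{}.txt'.format(filename), mimetype='text/plain')
    # None selects the event loop's default executor (a shared thread pool),
    # so no ThreadPoolExecutor needs to be created or cleaned up here.
    res = await bot.loop.run_in_executor(None, upload_file, metadata, media)
    print(res)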