I am trying to set up a pool with a maximum of 10 concurrent downloads. The function should download the base URL, then parse all URLs on that page and download each of them, but the OVERALL number of concurrent downloads should not exceed 10.
# Patch the standard library before importing anything that pulls in
# socket/ssl (requests does, via urllib3). Patching afterwards leaves those
# modules holding references to the blocking originals.
from gevent import monkey
monkey.patch_all()

from lxml import etree
import gevent
from gevent import pool
import requests

urls = [
    'http://www.google.com',
    'http://www.yandex.ru',
    'http://www.python.org',
    'http://stackoverflow.com',
    # ... another 100 urls
]
# Result lines ('<link>: <n> bytes: <status>') appended by parse_urls().
LINKS_ON_PAGE = []
# Shared greenlet pool capped at 10 concurrently running greenlets.
POOL = pool.Pool(10)
def parse_urls(page):
    """Download every absolute link found on *page* and record its size.

    page: HTML of an already-downloaded document, as accepted by
        ``etree.HTML``.

    For each ``<a href>`` value containing ``'http'`` the target is fetched
    with ``requests.get`` and a ``'<link>: <n> bytes: <status>'`` line is
    appended to the module-level ``LINKS_ON_PAGE`` list. Returns None.
    """
    html = etree.HTML(page)
    # Compare with None explicitly: the parser may hand back no tree at all,
    # and lxml elements are falsy when they merely have no children.
    if html is None:
        return
    # '@href' selects the attribute; '#href' is not valid XPath.
    links = [link for link in html.xpath("//a/@href") if 'http' in link]
    # Download each url that appears in the main URL
    for link in links:
        data = requests.get(link)
        LINKS_ON_PAGE.append(
            '%s: %s bytes: %r' % (link, len(data.content), data.status_code)
        )
def get_base_urls(url):
    """Fetch *url* and pass the raw response body on to ``parse_urls``."""
    # Download the main page, then let parse_urls follow its links.
    response = requests.get(url)
    body = response.content
    parse_urls(body)
How can I organize it to go concurrent way, but to keep the general global Pool limit for ALL web requests?
I think the following should get you what you want. I'm using BeautifulSoup in my example instead of the link-stripping code you had.
# Patch the standard library before importing requests, which pulls in
# socket/ssl through urllib3; patching afterwards is too late for modules
# that already hold references to the blocking originals.
from gevent import monkey
monkey.patch_all()

from bs4 import BeautifulSoup
import requests
import gevent
from gevent import pool

# Greenlets created by the spawn loop, kept so they can be joined.
jobs = []
# Every <a> tag collected by get_links().
links = []
# At most 10 greenlets run at the same time.
p = pool.Pool(10)
urls = [
    'http://www.google.com',
    # ... another 100 urls
]
def get_links(url):
    """Fetch *url* and add every ``<a>`` tag on the page to ``links``.

    url: address of the page to download.

    Responses with a status other than 200 are ignored. Returns None.
    """
    r = requests.get(url)
    if r.status_code == 200:
        # Name the parser explicitly; without it BeautifulSoup picks whichever
        # parser is installed (and warns), so results vary between machines.
        soup = BeautifulSoup(r.text, "html.parser")
        links.extend(soup.find_all('a'))
# Queue one greenlet per URL on the shared pool, then wait for all of them.
jobs.extend(p.spawn(get_links, url) for url in urls)
gevent.joinall(jobs)
gevent.pool will limit the concurrent greenlets, not the connections.
You should use session with HTTPAdapter
connection_limit = 10
# pool_block=True makes a request wait for a free pooled connection; without
# it, extra connections are opened once pool_maxsize is reached and the limit
# is not actually enforced. Note the limit applies per host.
adapter = requests.adapters.HTTPAdapter(pool_connections=connection_limit,
                                        pool_maxsize=connection_limit,
                                        pool_block=True)
session = requests.Session()
# Mount for both schemes; https:// URLs would otherwise keep using the
# session's default, unlimited adapter.
session.mount('http://', adapter)
session.mount('https://', adapter)
session.get('some url')  # placeholder URL from the original example
# or do your work with gevent
from gevent.pool import Pool
# The greenlet pool should be bigger than the connection limit when
# processing the data takes longer than downloading it, so that
# processing gets a chance to run.
pool_size = 15
pool = Pool(pool_size)
for url in urls:
    pool.spawn(session.get, url)
# Wait for every spawned request before the script moves on or exits.
pool.join()
You should use gevent.queue to do it in the right way.
Also this(eventlet examples) will be helpful for you to understand the basic idea.
Gevent solution is similar to the eventlet.
Keep in mind that you will need somewhere to store the visited URLs so that you do not end up crawling in cycles, and that you need to introduce some limits so that you do not run out of memory.
Related
I am working on a web scraping project willing to take prices from a website using different urls. I have run the following code but it takes so long to print the price number. I am using PyCharm on a MacBook Pro 13'' i5 (2020) 1.4 GHz and 8GB RAM, if this can help.
import ssl
import bs4
from urllib.request import Request, urlopen
import json
# SSL context with hostname checking and certificate verification disabled.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Product pages to monitor.
urls = [
    'https://www.tiffany.co.uk/jewelry/necklaces-pendants/tiffany-hardwear-graduated-link-necklace-63008966/',
    'https://www.tiffany.co.uk/jewelry/necklaces-pendants/tiffany-t-smile-pendant-35189459/',
]

for i in urls:
    # Send a browser-like User-Agent so the site does not block the request.
    req = Request(url=i, headers={'User-Agent': 'Mozilla/5.0'})
    # Fetch the page and parse its HTML.
    webpage = urlopen(req, context=ctx).read()
    soup = bs4.BeautifulSoup(webpage, "html.parser")
    # The price is read from the last JSON-LD <script> element on the page.
    ld_json_scripts = soup.find_all('script', {'type': 'application/ld+json'})
    data = json.loads(ld_json_scripts[-1].get_text())
    price = int(data['offers']['price'])
    print(price)
Using only one url, the code works, but adding other urls and a simple for loop, it takes a while. How could I speed up the process? Thanks a lot!
You can speed up the processing using multi-threading or multi-processing. This example will use multiprocessing module (with Pool of 4 processes) to obtain the prices:
import json
from bs4 import BeautifulSoup
import requests
from multiprocessing import Pool
def get_price(url):
    """Return ``(url, price)`` for one product page.

    The price is taken from ``offers.price`` in the last JSON-LD
    ``<script>`` element of the page and converted to ``int``.
    """
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(response.content, "html.parser")
    ld_scripts = soup.find_all('script', {'type': 'application/ld+json'})
    data = json.loads(ld_scripts[-1].get_text())
    return url, int(data['offers']['price'])
if __name__ == '__main__':
    urls = [
        'https://www.tiffany.co.uk/jewelry/necklaces-pendants/tiffany-hardwear-graduated-link-necklace-63008966/',
        'https://www.tiffany.co.uk/jewelry/necklaces-pendants/tiffany-t-smile-pendant-35189459/',
    ]
    with Pool(processes=4) as pool:
        # Results arrive as workers finish, so the order may differ from urls.
        for page_url, amount in pool.imap_unordered(get_price, urls):
            print(page_url, amount)
Prints (for example, the order could vary):
https://www.tiffany.co.uk/jewelry/necklaces-pendants/tiffany-t-smile-pendant-35189459/ 920
https://www.tiffany.co.uk/jewelry/necklaces-pendants/tiffany-hardwear-graduated-link-necklace-63008966/ 13900
I am trying to scrape a website using Python requests. The site can only be scraped through proxies, so I implemented that. However, it is banning all my requests even when I am using proxies, so I used https://api.ipify.org/?format=json to check whether the proxies are working properly. It shows my original IP even while using proxies. The code is below:
from concurrent.futures import ThreadPoolExecutor
import string, random
import requests
import sys
# Proxy addresses, one per line, read from the file named by the first
# command-line argument (presumably "host:port" entries -- confirm).
http = []
with open(sys.argv[1], "r", encoding="utf-8") as data:
    for i in data:
        # strip() rather than i[:-1]: a last line without a trailing newline
        # would otherwise lose its final character. Blank lines are skipped.
        entry = i.strip()
        if entry:
            http.append(entry)
# The with-block closes the file; no explicit close() is needed.
url = "https://api.ipify.org/?format=json"
def fetch(session, url):
    """Request *url* through a randomly chosen proxy, with up to 5 attempts.

    session: ``requests.Session`` used to send the request.
    url: address to fetch.

    On the first successful response the proxy mapping and the response
    text are printed and the function returns. A failed attempt is
    reported and retried with a newly chosen proxy. Returns None.
    """
    for _ in range(5):
        proxy_url = 'http://' + random.choice(http)
        # requests selects the proxy by the scheme of the *target* URL, so a
        # mapping with only an 'http' key is ignored for https:// addresses.
        proxy = {'http': proxy_url, 'https': proxy_url}
        try:
            with session.get(url, proxies=proxy, allow_redirects=False) as response:
                print("Proxy : ", proxy, " | Response : ", response.text)
            break
        except requests.RequestException as exc:
            # Report the failure instead of swallowing it, then try again.
            print("Proxy : ", proxy, " | Failed : ", exc)
# #timer(1, 5)
if __name__ == '__main__':
    # max_workers=1 runs the 100 fetches one after another.
    with ThreadPoolExecutor(max_workers=1) as executor:
        with requests.Session() as session:
            # executor.map() returns immediately with a lazy iterator.
            # Draining it here keeps the session open until every fetch has
            # finished and re-raises any exception from a worker. The
            # executor's own with-block then performs the shutdown.
            list(executor.map(fetch, [session] * 100, [url] * 100))
I tried a lot but didn't understand how my ip address is getting shown instead of the proxy ipv4. You will find output of the code here https://imgur.com/a/z02uSvi
The problem is that you have set a proxy only for http, while sending the request to a website that uses https. The solution is simple:
# Use the same randomly chosen proxy for every scheme.
proxy_url = 'http://' + random.choice(http)
proxies = {scheme: proxy_url for scheme in ('http', 'https', 'ftp')}
# Option 1: set the proxies once on the session
session.proxies.update(proxies)
response = session.get(url)
# Option 2: pass the proxies with the individual request
response = session.get(url, proxies=proxies)
This question already has answers here:
Asynchronous Requests with Python requests
(15 answers)
Closed 3 years ago.
Hello I am using the requests module and I would like to improve the speed because I have many urls so I suppose I can use threading to have a better speed. Here is my code :
import requests
urls = [
    "http://www.google.com",
    "http://www.apple.com",
    "http://www.microsoft.com",
    "http://www.amazon.com",
    "http://www.facebook.com",
]
# Fetch the URLs one after another and decode each body as JSON.
for url in urls:
    response = requests.get(url)
    value = response.json()
But I don't know how to use requests with threading ...
Could you help me please ?
Thank you !
Just to add from bashrc, you can also use it with requests.
You don't need to use urllib.request method.
it would be something like :
from concurrent import futures

import requests  # used below; the snippet did not import it

URLS = ['http://www.foxnews.com/',
        'http://www.cnn.com/',
        'http://europe.wsj.com/',
        'http://www.bbc.co.uk/',
        'http://some-made-up-domain.com/']
# Raising max_workers raises the number of threads created.
with futures.ThreadPoolExecutor(max_workers=5) as executor:
    res = executor.map(requests.get, URLS)
    # map() returns a lazy iterator, not a list. Materialising it waits for
    # every request and re-raises the first exception raised by a worker.
    responses = list(res)
What I like to do however, it is to create a function that returns directly the json from the response (or the text if you want to scrape).
And use that function in the threadpool
import requests
from concurrent import futures
# Pages fetched by the thread-pool example; the last host is presumably
# a deliberately unresolvable one -- confirm.
URLS = ['http://www.foxnews.com/',
'http://www.cnn.com/',
'http://europe.wsj.com/',
'http://www.bbc.co.uk/',
'http://some-made-up-domain.com/']
def getData(url):
    """Fetch *url* and return its decoded JSON, or the text if it is not JSON.

    url: address to request with ``requests.get``.

    Returns whatever ``Response.json()`` produces, falling back to
    ``Response.text`` when the body cannot be decoded as JSON.
    """
    res = requests.get(url)
    try:
        return res.json()
    except ValueError:
        # JSON decoding errors are ValueError subclasses; a bare except would
        # also hide unrelated failures such as KeyboardInterrupt.
        return res.text
with futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Each element is already what getData returned for the matching URL.
    responses = list(executor.map(getData, URLS))
You can use concurrent module.
import concurrent.futures

# ThreadPoolExecutor is the public name for the same class. Using it as a
# context manager joins the worker threads when the block ends.
with concurrent.futures.ThreadPoolExecutor(max_workers=DEFAULT_NUMBER_OF_THREADS) as pool:
    # map() is lazy: keep the results, otherwise the responses (and any
    # exception raised in a worker) are silently dropped.
    responses = list(pool.map(requests.get, urls))
This allows controlled concurrency.
This is a direct example from the threadpool documentation
import concurrent.futures
import urllib.request
# Pages fetched concurrently by the executor loop; the last host is
# presumably a deliberately unresolvable one -- confirm.
URLS = ['http://www.foxnews.com/',
'http://www.cnn.com/',
'http://europe.wsj.com/',
'http://www.bbc.co.uk/',
'http://some-made-up-domain.com/']
# Fetch one page and hand back its raw body
def load_url(url, timeout):
    """Open *url* with the given *timeout* and return the bytes read from it."""
    with urllib.request.urlopen(url, timeout=timeout) as response:
        body = response.read()
    return body
# The with statement makes sure the worker threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Submit one load per URL and remember which future belongs to which URL
    future_to_url = {}
    for url in URLS:
        future_to_url[executor.submit(load_url, url, 60)] = url
    # Handle the futures in completion order
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
            continue
        print('%r page is %d bytes' % (url, len(data)))
I've written a script in python using proxies to scrape the links of different posts traversing different pages of a webpage. I've tried to make use of proxies from a list. The script is supposed to take random proxies from the list and send request to that website and finally parse the items. However, if any proxy is not working then it should be kicked out from the list.
I thought the way I've used number of proxies and list of urls within ThreadPool(10).starmap(make_requests, zip(proxyVault,lead_url)) is accurate but it doesn't produce any results; rather, the script gets stuck.
How can I pass the proxies and the links to the ThreadPool in order for the script to produce results?
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from multiprocessing.pool import ThreadPool
from itertools import cycle
import random
# Base against which the question links found on a listing page are resolved.
base_url = 'https://stackoverflow.com/questions/tagged/web-scraping'
# Listing pages 1 to 5, newest first, 15 questions per page.
lead_url = ["https://stackoverflow.com/questions/tagged/web-scraping?sort=newest&page={}&pagesize=15".format(page) for page in range(1,6)]
# Candidate proxies as 'host:port' strings.
proxyVault = ['104.248.159.145:8888', '113.53.83.252:54356', '206.189.236.200:80', '218.48.229.173:808', '119.15.90.38:60622', '186.250.176.156:42575']
def make_requests(proxyVault, lead_url):
    """Scrape one listing page through a working proxy and print its titles.

    proxyVault: shared, mutable list of ``'host:port'`` proxy addresses;
        entries whose request fails are removed from it in place.
    lead_url: URL of the listing page to fetch.

    Proxies are tried in random order until one request succeeds or the
    list is empty. Returns None in both cases.
    """
    while True:
        # Work on a snapshot: other pool threads remove entries concurrently.
        candidates = list(proxyVault)
        if not candidates:
            print('No working proxy left for {}'.format(lead_url))
            return
        proxy_url = random.choice(candidates)
        proxy = {'https': 'http://{}'.format(proxy_url)}
        try:
            res = requests.get(lead_url, proxies=proxy)
        except requests.exceptions.RequestException:
            # Drop the failing proxy; another thread may have removed it first.
            try:
                proxyVault.remove(proxy_url)
            except ValueError:
                pass
            continue
        soup = BeautifulSoup(res.text, "lxml")
        for item in soup.select(".summary .question-hyperlink"):
            try:
                get_title(proxy, urljoin(base_url, item.get("href")))
            except requests.exceptions.RequestException as exc:
                # One unreachable question page should not abort the listing.
                print('Skipped {}: {}'.format(item.get("href"), exc))
        return


def get_title(proxy, itemlink):
    """Fetch a question page through *proxy* and print its title.

    proxy: ``proxies`` mapping handed to ``requests.get``.
    itemlink: absolute URL of the question page.

    Prints a notice instead when the title element is missing.
    Returns None.
    """
    res = requests.get(itemlink, proxies=proxy)
    soup = BeautifulSoup(res.text, "lxml")
    title = soup.select_one("h1[itemprop='name'] a")
    # select_one() returns None when nothing matches.
    if title is None:
        print('No title found at {}'.format(itemlink))
        return
    print(title.text)


if __name__ == '__main__':
    # Every call gets the whole proxy list plus one page URL.
    # zip(proxyVault, lead_url) would pair a single proxy *string* with each
    # URL, which is what made the original hang.
    with ThreadPool(10) as pool:
        pool.starmap(make_requests, [(proxyVault, url) for url in lead_url])
Btw, the proxies used above are just placeholders.
The problem with your code was that it created a lot of endless loops in the threads. Also, the way you handled the proxies was a bit strange to me, so I changed it.
I also think you had misunderstood how data is sent to the threads: each one gets a single element of the iterable, not the whole thing. So I changed some names to reflect that.
The way it works now is that each thread gets their own url from lead_url, then they choose a random proxy from the proxyVault.
They fetch the webpage and parse it and calls get_title on each of the parsed links.
If the request fails because of the proxy, that proxy is removed from the list so its not used again and make_requests is called again, which will randomly choose a new proxy from the ones that are still available.
I did not change the actual parsing, because I can't judge if it's what you want or not.
Runnable code:
https://repl.it/@zlim00/unable-to-pass-proxies-and-links-to-the-threadpool-to-get-re
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool
from random import choice
import requests
from urllib.parse import urljoin
# Base against which the question links found on a listing page are resolved.
base_url = 'https://stackoverflow.com/questions/tagged/web-scraping'
# Listing pages 1 to 5, newest first, 15 questions per page.
lead_url = [f'https://stackoverflow.com/questions/tagged/web-scraping?sort='
f'newest&page={page}&pagesize=15' for page in range(1, 6)]
# Shared list of 'host:port' proxies; make_requests() removes the ones that
# raise ProxyError.
proxyVault = ['36.67.57.45:53367', '5.202.150.233:42895',
'85.187.184.129:8080', '109.195.23.223:45947']
def make_requests(url):
    """Fetch one listing page through a random proxy and print its titles.

    url: listing page to scrape.

    A proxy that raises ``ProxyError`` is removed from the shared
    ``proxyVault`` and another one is tried. Returns None, also when no
    proxy is left to try.
    """
    while True:
        # Snapshot the list: other pool threads remove entries concurrently,
        # and choice() on an empty list would raise IndexError.
        candidates = list(proxyVault)
        if not candidates:
            print(f'No working proxy left for {url}')
            return
        proxy_url = choice(candidates)
        proxy = {'https': f'http://{proxy_url}'}
        try:
            res = requests.get(url, proxies=proxy)
            soup = BeautifulSoup(res.text, "lxml")
            for item in soup.select(".summary .question-hyperlink"):
                get_title(proxy, urljoin(base_url, item.get("href")))
            return
        except requests.exceptions.ProxyError:
            # Remove-and-catch instead of check-then-remove: another thread
            # may drop the same proxy between the check and the removal.
            try:
                proxyVault.remove(proxy_url)
            except ValueError:
                pass
            else:
                print(f'Removed bad proxy: {proxy_url}')
def get_title(proxy, itemlink):
    """Fetch a question page through *proxy* and print its title.

    proxy: ``proxies`` mapping handed to ``requests.get``.
    itemlink: absolute URL of the question page.

    Prints a notice instead when the title element is missing.
    Returns None.
    """
    res = requests.get(itemlink, proxies=proxy)
    soup = BeautifulSoup(res.text, "lxml")
    title = soup.select_one("h1[itemprop='name'] a")
    # select_one() returns None when nothing matches; reading .text on it
    # would raise AttributeError and abort the whole listing page.
    if title is None:
        print(f'No title found at {itemlink}')
        return
    print(title.text)
if __name__ == '__main__':
    # map() blocks until every page is processed; the context manager then
    # tears the worker threads down instead of leaving the pool open.
    with ThreadPool(10) as pool:
        pool.map(make_requests, lead_url)
Maybe you can use another approach to get proxies like this
def get_proxy():
    """Return a random proxy from free-proxy-list.net as ``'<cell0>:<cell1>'``.

    A random row of the ``proxylisttable`` table body is picked and the text
    of its first two cells is joined with a colon (presumably IP address and
    port -- confirm against the page).
    """
    page = requests.get('https://free-proxy-list.net/anonymous-proxy.html')
    soup = BeautifulSoup(page.text, 'lxml')
    rows = (
        soup.find('table', attrs={'id': 'proxylisttable'})
        .find('tbody')
        .find_all('tr')
    )
    cells = random.choice(rows).find_all('td')
    return cells[0].text + ':' + cells[1].text
I'm trying to download all the PGNs from this site.
I think I have to use urlopen to open each url and then use urlretrieve to download each pgn by accessing it from the download button near the bottom of each game. Do I have to create a new BeautifulSoup object for each game? I'm also unsure of how urlretrieve works.
import urllib
from urllib.parse import urljoin
from urllib.request import urlopen, urlretrieve, quote
from bs4 import BeautifulSoup

url = 'http://www.chessgames.com/perl/chesscollection?cid=1014492'
# Close the index response as soon as its body has been read.
with urlopen(url) as u:
    html = u.read().decode('utf-8')
soup = BeautifulSoup(html, "html.parser")
# href=True skips anchors without an href; for those link.get('href') is
# None and the string concatenation would raise TypeError.
for link in soup.find_all('a', href=True):
    # urljoin copes with relative and absolute hrefs alike. The response is
    # opened and closed again; nothing is saved yet.
    with urlopen(urljoin('http://chessgames.com', link['href'])):
        pass
There is no short answer to your question. I will show you a complete solution and comment this code.
First, import necessary modules:
from bs4 import BeautifulSoup
import requests
import re
Next, get index page and create BeautifulSoup object:
# Download the collection index page and parse it with the lxml parser.
req = requests.get("http://www.chessgames.com/perl/chesscollection?cid=1014492")
soup = BeautifulSoup(req.text, "lxml")
I strongly advise using the lxml parser rather than the default html.parser.
After that, you should prepare game's links list:
# Raw string: '\?' in a plain literal is an invalid escape sequence that
# newer Python versions warn about. find_all is the current name of findAll.
pages = soup.find_all('a', href=re.compile(r'.*chessgame\?.*'))
You can do it by searching links containing 'chessgame' word in it.
Now, you should prepare function which will download files for you:
def download_file(url):
    """Stream *url* into a file in the current directory.

    url: address of the file. The local file name is the last path segment
        of the URL with any query string removed.

    Nothing is written when the response status is not 200. Returns None.
    """
    path = url.split('/')[-1].split('?')[0]
    # With stream=True the connection stays checked out until the response is
    # closed, so close it deterministically with a context manager.
    with requests.get(url, stream=True) as r:
        if r.status_code == 200:
            with open(path, 'wb') as f:
                # Iterating the response directly yields 128-byte pieces;
                # ask for larger chunks explicitly.
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
And final magic is to repeat all previous steps preparing links for file downloader:
host = 'http://www.chessgames.com'
for page in pages:
    # Open the game page and look for the link whose text contains 'download'.
    url = host + page.get('href')
    req = requests.get(url)
    soup = BeautifulSoup(req.text, "lxml")
    file_link = soup.find('a', text=re.compile('.*download.*'))
    # find() returns None when the page has no such link; skip that game
    # instead of crashing the whole run with AttributeError.
    if file_link is None:
        continue
    file_url = host + file_link.get('href')
    download_file(file_url)
(first you search links containing text 'download' in their description, then construct full url - concatenate hostname and path, and finally download file)
I hope you can use this code without correction!
The accepted answer is fantastic but the task is embarrassingly parallel; there's no need to retrieve these sub-pages and files one at a time. This answer shows how to speed things up.
The first step is to use requests.Session() when sending multiple requests to a single host. Quoting Advanced Usage: Session Objects from the requests docs:
The Session object allows you to persist certain parameters across requests. It also persists cookies across all requests made from the Session instance, and will use urllib3's connection pooling. So if you're making several requests to the same host, the underlying TCP connection will be reused, which can result in a significant performance increase (see HTTP persistent connection).
Next, asyncio, multiprocessing or multithreading are available to parallelize the workload. Each has tradeoffs respective to the task at hand and which you choose is likely best determined by benchmarking and profiling. This page offers great examples of all three.
For the purposes of this post, I'll show multithreading. The impact of the GIL shouldn't be too much of a bottleneck because the tasks are mostly IO-bound, consisting of babysitting requests on the air to wait for the response. When a thread is blocked on IO, it can yield to a thread parsing HTML or doing other CPU-bound work.
Here's the code:
import os
import re
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
def download_pgn(task):
    """Download the PGN file linked from one game page.

    task: ``(session, host, page, destination_path)`` tuple -- the session
        used for both requests, the site root, the game page path and the
        directory the file is written to.

    Raises whatever ``raise_for_status`` raises for a failed request.
    Returns None.
    """
    session, host, page, destination_path = task

    # Locate the anchor whose text is exactly "download" on the game page.
    game_page = session.get(host + page)
    game_page.raise_for_status()
    download_anchor = BeautifulSoup(game_page.text, "lxml").find("a", text="download")
    game_url = host + download_anchor.get("href")

    # The file name is the first '<word>.pgn' match in the download URL.
    filename = re.search(r"\w+\.pgn", game_url).group()
    path = os.path.join(destination_path, filename)

    pgn_response = session.get(game_url, stream=True)
    pgn_response.raise_for_status()
    with open(path, "wb") as f:
        # filter(None, ...) drops empty chunks before they are written.
        for chunk in filter(None, pgn_response.iter_content(chunk_size=1024)):
            f.write(chunk)
def main():
    """Download every game of the collection into the ``pgns`` directory.

    The collection index is fetched once, then each linked game page is
    handled by ``download_pgn`` on a pool of 8 threads sharing one session.
    Downloads that fail are reported on stdout. Returns None.
    """
    host = "http://www.chessgames.com"
    url_to_scrape = host + "/perl/chesscollection?cid=1014492"
    destination_path = "pgns"
    max_workers = 8

    # exist_ok replaces the check-then-create pair.
    os.makedirs(destination_path, exist_ok=True)

    with requests.Session() as session:
        session.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
        response = session.get(url_to_scrape)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")
        pages = soup.find_all("a", href=re.compile(r".*chessgame\?.*"))
        tasks = [
            (session, host, page.get("href"), destination_path)
            for page in pages
        ]

        # Leaving the executor block waits for every task, so the session is
        # still open while the workers use it.
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = [pool.submit(download_pgn, task) for task in tasks]

        # An exception raised in a worker is stored on its future and stays
        # invisible unless asked for; report each failure.
        for task, future in zip(tasks, futures):
            error = future.exception()
            if error is not None:
                print(f"Failed to download {task[2]}: {error}")


if __name__ == "__main__":
    main()
I used response.iter_content here which is unnecessary on such tiny text files, but is a generalization so the code will handle larger files in a memory-friendly way.
Results from a rough benchmark (the first request is a bottleneck):
| max workers | session? | seconds |
|-------------|----------|---------|
| 1           | no       | 126     |
| 1           | yes      | 111     |
| 8           | no       | 24      |
| 8           | yes      | 22      |
| 32          | yes      | 16      |