How can I use threading with requests? [duplicate]

This question already has answers here:
Asynchronous Requests with Python requests
(15 answers)
Closed 3 years ago.
Hello, I am using the requests module and I would like to improve the speed, because I have many urls, so I suppose I can use threading to get better speed. Here is my code:
import requests

urls = ["http://www.google.com", "http://www.apple.com", "http://www.microsoft.com", "http://www.amazon.com", "http://www.facebook.com"]

for url in urls:
    response = requests.get(url)
    value = response.json()
But I don't know how to use requests with threading. Could you help me please?
Thank you!

Just to add to bashrc's answer: you can also use this with requests; you don't need the urllib.request method. It would be something like this:
import requests
from concurrent import futures

URLS = ['http://www.foxnews.com/',
        'http://www.cnn.com/',
        'http://europe.wsj.com/',
        'http://www.bbc.co.uk/',
        'http://some-made-up-domain.com/']

with futures.ThreadPoolExecutor(max_workers=5) as executor:  ## you can increase max_workers; it increases the number of threads created
    res = executor.map(requests.get, URLS)
responses = list(res)  ## map() returns a generator, so you may want to turn it into a list
What I like to do, however, is to create a function that returns the JSON directly from the response (or the text if you want to scrape), and to use that function in the thread pool:
import requests
from concurrent import futures

URLS = ['http://www.foxnews.com/',
        'http://www.cnn.com/',
        'http://europe.wsj.com/',
        'http://www.bbc.co.uk/',
        'http://some-made-up-domain.com/']

def getData(url):
    res = requests.get(url)
    try:
        return res.json()
    except ValueError:  ## the body was not valid JSON, fall back to raw text
        return res.text

with futures.ThreadPoolExecutor(max_workers=5) as executor:
    res = executor.map(getData, URLS)
responses = list(res)  ## your list will already be pre-formatted
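If you also need to know which url produced which result, one option (my addition, not part of the original answer) is to zip the inputs back with the outputs, since executor.map returns results in the order of the inputs:

import requests
from concurrent import futures

URLS = ['http://www.bbc.co.uk/', 'http://www.cnn.com/']

with futures.ThreadPoolExecutor(max_workers=5) as executor:
    responses = list(executor.map(requests.get, URLS))

# map() preserves input order, so zip pairs each url with its own response
results = dict(zip(URLS, responses))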

You can use the concurrent.futures module.

import concurrent.futures

# DEFAULT_NUMBER_OF_THREADS is a placeholder for your chosen worker count
pool = concurrent.futures.ThreadPoolExecutor(max_workers=DEFAULT_NUMBER_OF_THREADS)
pool.map(requests.get, urls)

This allows controlled concurrency.
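Note that Executor.map returns a lazy iterator, so nothing is guaranteed to have completed until you consume it; a with block also shuts the pool down cleanly when you are done. A minimal sketch, assuming urls is the list from the question:

import concurrent.futures
import requests

urls = ["http://www.google.com", "http://www.apple.com"]

# the with block joins the worker threads on exit;
# list() forces the lazy map so all requests actually run
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
    responses = list(pool.map(requests.get, urls))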
This is a direct example from the ThreadPoolExecutor documentation:
import concurrent.futures
import urllib.request

URLS = ['http://www.foxnews.com/',
        'http://www.cnn.com/',
        'http://europe.wsj.com/',
        'http://www.bbc.co.uk/',
        'http://some-made-up-domain.com/']

# Retrieve a single page and report the URL and contents
def load_url(url, timeout):
    with urllib.request.urlopen(url, timeout=timeout) as conn:
        return conn.read()

# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Start the load operations and mark each future with its URL
    future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
        else:
            print('%r page is %d bytes' % (url, len(data)))
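The same pattern works with requests instead of urllib.request (my adaptation, not part of the docs example):

import concurrent.futures
import requests

URLS = ['http://www.foxnews.com/', 'http://www.cnn.com/']

# requests equivalent of load_url above
def load_url(url, timeout):
    return requests.get(url, timeout=timeout).content

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
        else:
            print('%r page is %d bytes' % (url, len(data)))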

Related

ThreadPoolExecutor not Working Properly with Python Requests

When I request a long list of urls with ThreadPoolExecutor, I run into a problem where it seems like the requests aren't being sent. I am expecting json responses from the urls.
urls = ["example1.com", "example2.com", "example3.com"] #My list is about 1,500 urls long
def load_url(url):
print("here")
r = requests.get(url)
print(r.url)
print(r.json())
return r
def main():
urls = ["example1.com", "example2.com", "example3.com"] #My list is about 1,500 urls long
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
future_to_url = {executor.submit(
load_url, url): url for url in urls}
for future in concurrent.futures.as_completed(future_to_url):
print("right here")
url = future_to_url[future]
try:
response = future.result()
except (HTTPError, ConnectionError, ConnectTimeout):
print("fail")
else:
print(response.json())
main()
When I execute the code above, only the first url's request seems to be sent and received by the as_completed loop. The rest, despite printing "here", the correct urls, and the correct json responses (from the load_url function), never reach the as_completed loop. So it keeps printing "here", but not "right here".
My python version is 3.9.1

Python Multithreading no current event loop

My threading works fine only when fetch() is not called from validate(). But in the scenario below it returns
RuntimeError: There is no current event loop in thread
in all the threads 0-99. What am I doing wrong here?
from threading import Thread
import requests
from bs4 import BeautifulSoup
from requests_html import HTMLSession

def fetch():
    # fetch data from another site
    session = HTMLSession()
    url = 'http://url'
    data = session.get(url)
    data.html.render()
    content = data.html.html
    soup = BeautifulSoup(content, "html.parser")
    iban = soup.find('p', {"id": "demo"})
    return iban.text

def validate():
    url = "https://url"
    payload = {
        "data": fetch(),
        "veto": "fi"
    }
    response = requests.post(url, data=payload)
    soup = BeautifulSoup(response.text, "html.parser")
    data = soup.body.find(text='contact')
    if data:
        print(data)
    else:
        print("no data")

if __name__ == "__main__":
    threads = []
    for i in range(100):
        # We start one thread per url present.
        process = Thread(target=validate)
        process.start()
        threads.append(process)
From a quick search of the error, I found this GitHub issue that seems to show your problem and its solution.
It looks like you need to use asyncio and, at the beginning of each running thread, call asyncio.set_event_loop(asyncio.new_event_loop()).
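Applied to the code above, that would mean making the event-loop setup the first thing each thread runs (a minimal sketch; the rest of validate() stays as it was):

import asyncio
from threading import Thread

def validate():
    # requests_html's render() needs an event loop, and threads other
    # than the main one don't get one automatically
    asyncio.set_event_loop(asyncio.new_event_loop())
    # ... rest of validate() as before

if __name__ == "__main__":
    threads = []
    for i in range(100):
        process = Thread(target=validate)
        process.start()
        threads.append(process)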

How to get the thread used by a python function?

I'm trying a code sample for multithreading in python, which I found in this doc page.
The actual code is the following:
import concurrent.futures
import urllib.request

URLS = ['http://www.foxnews.com/',
        'http://www.cnn.com/',
        'http://europe.wsj.com/',
        'http://www.bbc.co.uk/',
        'http://some-made-up-domain.com/']

# Retrieve a single page and report the URL and contents
def load_url(url, timeout):
    with urllib.request.urlopen(url, timeout=timeout) as conn:
        return conn.read()

# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Start the load operations and mark each future with its URL
    future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
        else:
            print('%r page is %d bytes' % (url, len(data)))
I would like the load_url function to print out the id of the thread it is currently using, so I can check whether multithreading is actually working. Of course, if you have better ways to achieve this same goal, please let me know.
Thanks
Edit
I think I just bumped into the answer; this seems to work:

# Retrieve a single page and report the URL and contents
import threading
import urllib.request

def load_url(url, timeout):
    print('Using thread {}, looking for url {}'.format(threading.get_ident(), url))
    with urllib.request.urlopen(url, timeout=timeout) as conn:
        return conn.read()

Any feedback on preferred methods is welcome, however.
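An alternative (my suggestion, not from the original post) is threading.current_thread().name, since ThreadPoolExecutor gives its worker threads readable names such as ThreadPoolExecutor-0_1:

import threading
import urllib.request

def load_url(url, timeout):
    # current_thread().name shows the executor-assigned worker name
    print('Using thread {}, looking for url {}'.format(threading.current_thread().name, url))
    with urllib.request.urlopen(url, timeout=timeout) as conn:
        return conn.read()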

Unable to pass proxies and links to the threadpool to get results

I've written a script in python that uses proxies while scraping the links of different posts, traversing different pages of a webpage. I've tried to make use of proxies from a list. The script is supposed to take a random proxy from the list, send a request to the website, and finally parse the items. However, if any proxy is not working, it should be kicked out of the list.
I thought the way I used the proxies and the list of urls within ThreadPool(10).starmap(make_requests, zip(proxyVault, lead_url)) was accurate, but it doesn't produce any results; rather, the script gets stuck.
How can I pass the proxies and the links to the ThreadPool so that the script produces results?
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from multiprocessing.pool import ThreadPool
from itertools import cycle
import random

base_url = 'https://stackoverflow.com/questions/tagged/web-scraping'
lead_url = ["https://stackoverflow.com/questions/tagged/web-scraping?sort=newest&page={}&pagesize=15".format(page) for page in range(1, 6)]

proxyVault = ['104.248.159.145:8888', '113.53.83.252:54356', '206.189.236.200:80', '218.48.229.173:808', '119.15.90.38:60622', '186.250.176.156:42575']

def make_requests(proxyVault, lead_url):
    while True:
        random.shuffle(proxyVault)
        global pitem
        pitem = cycle(proxyVault)
        proxy = {'https': 'http://{}'.format(next(pitem))}
        try:
            res = requests.get(lead_url, proxies=proxy)
            soup = BeautifulSoup(res.text, "lxml")
            [get_title(proxy, urljoin(base_url, item.get("href"))) for item in soup.select(".summary .question-hyperlink")]
        except Exception:
            try:
                proxyVault.pop(0)
                make_requests(proxyVault, lead_url)
            except Exception:
                pass

def get_title(proxy, itemlink):
    res = requests.get(itemlink, proxies=proxy)
    soup = BeautifulSoup(res.text, "lxml")
    print(soup.select_one("h1[itemprop='name'] a").text)

if __name__ == '__main__':
    ThreadPool(10).starmap(make_requests, zip(proxyVault, lead_url))
Btw, the proxies used above are just placeholders.
The problem with your code was that it created a lot of endless loops in the threads. Also, the way you handled the proxies was a bit strange to me, so I changed it.
I also think you had misunderstood how data is sent to the threads: each one gets one element of the iterable, not the whole thing. So I changed some names to reflect that.
The way it works now is that each thread gets its own url from lead_url, and then chooses a random proxy from proxyVault.
It fetches the webpage, parses it, and calls get_title on each of the parsed links.
If the request fails because of the proxy, that proxy is removed from the list so it's not used again, and make_requests is called again, which will randomly choose a new proxy from the ones that are still available.
I did not change the actual parsing, because I can't judge if it's what you want or not.
Runnable code:
https://repl.it/#zlim00/unable-to-pass-proxies-and-links-to-the-threadpool-to-get-re
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool
from random import choice
import requests
from urllib.parse import urljoin

base_url = 'https://stackoverflow.com/questions/tagged/web-scraping'
lead_url = [f'https://stackoverflow.com/questions/tagged/web-scraping?sort='
            f'newest&page={page}&pagesize=15' for page in range(1, 6)]

proxyVault = ['36.67.57.45:53367', '5.202.150.233:42895',
              '85.187.184.129:8080', '109.195.23.223:45947']

def make_requests(url):
    proxy_url = choice(proxyVault)
    proxy = {'https': f'http://{proxy_url}'}
    try:
        res = requests.get(url, proxies=proxy)
        soup = BeautifulSoup(res.text, "lxml")
        [get_title(proxy, urljoin(base_url, item.get("href")))
         for item in soup.select(".summary .question-hyperlink")]
    except requests.exceptions.ProxyError:
        # Check that the bad proxy was not already removed by another thread
        if proxy_url in proxyVault:
            proxyVault.remove(proxy_url)
            print(f'Removed bad proxy: {proxy_url}')
        return make_requests(url)

def get_title(proxy, itemlink):
    res = requests.get(itemlink, proxies=proxy)
    soup = BeautifulSoup(res.text, "lxml")
    print(soup.select_one("h1[itemprop='name'] a").text)

if __name__ == '__main__':
    ThreadPool(10).map(make_requests, lead_url)
Maybe you can use another approach to get proxies, like this:
import random
import requests
from bs4 import BeautifulSoup

def get_proxy():
    url = 'https://free-proxy-list.net/anonymous-proxy.html'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    table = soup.find('table', attrs={'id': 'proxylisttable'})
    table_body = table.find('tbody')
    proxies = table_body.find_all('tr')
    proxy_row = random.choice(proxies).find_all('td')
    return proxy_row[0].text + ':' + proxy_row[1].text
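To plug that into the answer above (my assumption, not something this answer shows), you could call get_proxy() inside make_requests instead of picking from the hard-coded proxyVault; this sketch assumes get_proxy() and lead_url from the snippets above are in scope:

import requests
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool

def make_requests(url):
    # fetch a fresh random proxy per request instead of using proxyVault
    proxy = {'https': 'http://{}'.format(get_proxy())}
    res = requests.get(url, proxies=proxy)
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select(".summary .question-hyperlink"):
        print(item.text)

if __name__ == '__main__':
    ThreadPool(10).map(make_requests, lead_url)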

Gevent pool with nested web requests

I am trying to organize a pool with a maximum of 10 concurrent downloads. The function should download the base url, then parse all the urls on that page and download each of them, but the OVERALL number of concurrent downloads should not exceed 10.
from lxml import etree
import gevent
from gevent import monkey, pool
import requests

monkey.patch_all()

urls = [
    'http://www.google.com',
    'http://www.yandex.ru',
    'http://www.python.org',
    'http://stackoverflow.com',
    # ... another 100 urls
]

LINKS_ON_PAGE = []
POOL = pool.Pool(10)

def parse_urls(page):
    html = etree.HTML(page)
    if html is not None:
        links = [link for link in html.xpath("//a/@href") if 'http' in link]
        # Download each url that appears in the main URL
        for link in links:
            data = requests.get(link)
            LINKS_ON_PAGE.append('%s: %s bytes: %r' % (link, len(data.content), data.status_code))

def get_base_urls(url):
    # Download the main URL
    data = requests.get(url)
    parse_urls(data.content)
How can I organize it to run concurrently, while keeping one global Pool limit for ALL web requests?
I think the following should get you what you want. I'm using BeautifulSoup in my example instead of the link-stripping stuff you had.
from bs4 import BeautifulSoup
import requests
import gevent
from gevent import monkey, pool

monkey.patch_all()

jobs = []
links = []
p = pool.Pool(10)

urls = [
    'http://www.google.com',
    # ... another 100 urls
]

def get_links(url):
    r = requests.get(url)
    if r.status_code == 200:
        soup = BeautifulSoup(r.text, 'html.parser')
        links.extend(soup.find_all('a'))

for url in urls:
    jobs.append(p.spawn(get_links, url))
gevent.joinall(jobs)
gevent.pool will limit the concurrent greenlets, not the connections.
You should use a session with an HTTPAdapter:

import requests

connection_limit = 10
adapter = requests.adapters.HTTPAdapter(pool_connections=connection_limit,
                                        pool_maxsize=connection_limit)
session = requests.session()
session.mount('http://', adapter)  # mount for 'https://' as well if you fetch https urls
session.get('some url')

# or do your work with gevent
from gevent.pool import Pool

# The pool size should be bigger than the connection limit if processing the
# data takes longer than downloading it, to give the processing a chance to run.
pool_size = 15
pool = Pool(pool_size)
for url in urls:
    pool.spawn(session.get, url)
You should use gevent.queue to do it the right way.
Also, this (the eventlet examples) will be helpful for you to understand the basic idea; the gevent solution is similar to the eventlet one.
Keep in mind that you will need somewhere to store visited URLs so as not to get cycling, and, so that you don't run out of memory, you need to introduce some restrictions.
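A minimal sketch of that idea (the worker function, queue setup, and limits here are my own illustration, not from this answer): a fixed number of greenlets pull urls from a shared gevent queue, and a visited set plus a hard cap keep the crawl from cycling or growing without bound.

import gevent
from gevent import monkey
monkey.patch_all()

from gevent.queue import Queue, Empty
import requests

tasks = Queue()
visited = set()   # remember fetched urls so the crawl cannot cycle
MAX_URLS = 100    # hard cap so the queue and the visited set stay bounded

def worker():
    while True:
        try:
            url = tasks.get(timeout=2)  # give up once the queue stays empty
        except Empty:
            return
        resp = requests.get(url)
        # parse resp here and, for every newly discovered link:
        #   if link not in visited and len(visited) < MAX_URLS:
        #       visited.add(link)
        #       tasks.put(link)

for url in ['http://www.google.com', 'http://www.python.org']:
    visited.add(url)
    tasks.put(url)

# exactly 10 workers, so at most 10 concurrent downloads overall
gevent.joinall([gevent.spawn(worker) for _ in range(10)])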
