Python returning values from infinite loop thread

For my program I need to check a client on my local network which has a Flask server running. This Flask server returns a number that can change over time.
To retrieve that value I use the requests library and BeautifulSoup. I want to use the retrieved value in another part of my script (while continuously checking the other client). For this I thought I could use the threading module. The problem, however, is that the thread only returns its values when it is done with the loop, but the loop needs to be infinite. This is what I have so far:
import threading
import requests
from bs4 import BeautifulSoup

def checkClient():
    while True:
        page = requests.get('http://192.168.1.25/8080')
        soup = BeautifulSoup(page.text, 'html.parser')
        value = soup.find('div', class_='valueDecibel')
        print(value)

t1 = threading.Thread(target=checkClient, name=checkClient)
t1.start()
Does anyone know how to return the printed values to another function here? Of course you can replace the requests.get url with some kind of API where the values change a lot.

You need a Queue and something listening on the queue
import queue
import threading
import requests
from bs4 import BeautifulSoup

def checkClient(q):
    while True:
        page = requests.get('http://192.168.1.25/8080')
        soup = BeautifulSoup(page.text, 'html.parser')
        value = soup.find('div', class_='valueDecibel')
        q.put(value)

q = queue.Queue()
t1 = threading.Thread(target=checkClient, name=checkClient, args=(q,))
t1.start()

while True:
    value = q.get()
    print(value)
The Queue is thread safe and allows you to pass values back and forth between threads. In your case they are only being sent from the worker thread to a receiver.
See: https://docs.python.org/3/library/queue.html
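If you would rather hand each value to another function than print it in the main loop, a minimal sketch could look like the following. It reuses the queue q and the thread from the code above; handle_value is a hypothetical stand-in for whatever your own function does with the number.

def handle_value(value):
    # hypothetical: replace with whatever your script needs to do with the value
    print('got', value)

def consumer(q):
    while True:
        value = q.get()       # blocks until the worker thread puts something
        handle_value(value)

# run the consumer in the main thread (or in its own thread if you prefer)
consumer(q)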

Related

Web scraping of hyperlinks going so slow

I am using the following function to scrape the Twitter URLs from a list of websites.
import httplib2
import bs4 as bs
from bs4 import BeautifulSoup, SoupStrainer
from urllib.parse import urlparse
import pandas as pd
import swifter

def twitter_url(website):  # website address is given to the function as a string
    try:
        http = httplib2.Http()
        status, response = http.request(str('https://') + website)
        url = 'https://twitter.com'
        search_domain = urlparse(url).hostname
        l = []
        for link in bs.BeautifulSoup(response, 'html.parser',
                                     parseOnlyThese=SoupStrainer('a')):
            if link.has_attr('href'):
                if search_domain in link['href']:
                    l.append(link['href'])
        return list(set(l))
    except:
        ConnectionRefusedError
and then I apply the function to the dataframe, which contains the URL addresses:
df['twitter_id'] = df.swifter.apply(lambda x: twitter_url(x['Website address']), axis=1)
The dataframe has about 100,000 website addresses. Even when I run the code on a sample of 10,000, it is very slow. Is there any way to run this faster?
The issue is most likely the time taken to retrieve the HTML for each of the websites.
Since the URLs are processed one after the other, even if each one took only 100 ms, 10,000 of them would still take about 1,000 s (~16 minutes) to finish.
If you instead process the URLs in multiple threads, that should significantly cut down the time taken.
You can check out the threading library (or a thread pool, as sketched below) to accomplish that.
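A minimal sketch of that idea, assuming the twitter_url function and the dataframe from the question (the column name 'Website address' is taken from the question), using concurrent.futures instead of raw threads:

from concurrent.futures import ThreadPoolExecutor

# Map twitter_url over all website addresses using a pool of worker threads.
# executor.map preserves input order, so the results line up with the rows.
# max_workers is a tuning knob; too many concurrent requests may get you blocked.
with ThreadPoolExecutor(max_workers=20) as executor:
    results = list(executor.map(twitter_url, df['Website address']))

df['twitter_id'] = results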

Python Multithreading no current event loop

My threading works fine only when fetch() is not called from validate(). But in the scenario below it returns
RuntimeError: There is no current event loop in thread
in all of the threads 0-99. What am I doing wrong here?
from threading import Thread
import requests
from bs4 import BeautifulSoup
from requests_html import HTMLSession

def fetch():
    # fetch data from another site
    session = HTMLSession()
    url = 'http://url'
    data = session.get(url)
    data.html.render()
    content = data.html.html
    soup = BeautifulSoup(content, "html.parser")
    iban = soup.find('p', {"id": "demo"})
    return result.text

def validate():
    url = "https://url"
    payload = {
        "data": fetch(),
        "veto": "fi"
    }
    response = requests.post(url, data=payload)
    soup = BeautifulSoup(response.text, "html.parser")
    data = soup.body.find(text='contact')
    if data:
        print(data)
    else:
        print("no data")

if __name__ == "__main__":
    threads = []
    for i in range(100):
        # We start one thread per url present.
        process = Thread(target=validate)
        process.start()
        threads.append(process)
From a quick search of the error, I found this Github issue that seems to show your problem and its solution.
It looks like you need to use asyncio and, at the beginning of each running thread, call asyncio.set_event_loop(asyncio.new_event_loop()).
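A minimal sketch of that suggestion, applied to the validate() function from the question (everything else left as it is):

import asyncio

def validate():
    # requests_html renders pages via an asyncio event loop under the hood,
    # and worker threads don't get an event loop by default, so create one
    # at the start of each thread before fetch() is called.
    asyncio.set_event_loop(asyncio.new_event_loop())
    # ... rest of validate() exactly as in the question, including the call to fetch()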

Web scrape an updating value

I am new to Python and I am trying to scrape a value that constantly updates. When you first enter the site, the value says "laddar temperatur" ("loading temperature") and then it goes on to show the actual temperature after a while. When I run my script, the only thing I get is the "loading temperature" value. I am guessing this has to do with the fact that the script reloads the site every time I run it. How do I make it "stay" on the site and collect the information that appears after "loading temperature"?
Site: http://s-websrv02.lulea.se/ormberget/
from bs4 import BeautifulSoup
import requests
import time

r = requests.get("http://s-websrv02.lulea.se/ormberget/")
soup = BeautifulSoup(r.text, "html.parser")
match = soup.find("div", id="ReloadThis").text
for item in match:
    print(match)
    time.sleep(20)
The temperature is fetched with an XHR call: the page loads first and JavaScript fills the value in afterwards, which is why requests only ever sees the placeholder.
The code below should return the temperature:
import requests

r = requests.get('http://s-websrv02.lulea.se/ormberget/Orm_Stadium.php')
print(r.text.strip())
If you would like to get the temperature value periodically, do something like:
import time
import requests

collected_data = []
SLEEP_TIME = 5

while True:
    r = requests.get('http://s-websrv02.lulea.se/ormberget/Orm_Stadium.php')
    value = r.text.strip() if r.status_code == 200 else '-1000'
    collected_data.append({'time': time.time(), 'value': value})
    time.sleep(SLEEP_TIME)

Unable to pass proxies and links to the threadpool to get results

I've written a script in Python that uses proxies to scrape the links of different posts while traversing different pages of a webpage. I've tried to make use of proxies from a list. The script is supposed to take a random proxy from the list, send a request to the website and finally parse the items. However, if any proxy is not working it should be kicked out of the list.
I thought the way I've used the number of proxies and the list of urls within ThreadPool(10).starmap(make_requests, zip(proxyVault, lead_url)) was accurate, but it doesn't produce any results; rather, the script gets stuck.
How can I pass the proxies and the links to the ThreadPool in order for the script to produce results?
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from multiprocessing.pool import ThreadPool
from itertools import cycle
import random

base_url = 'https://stackoverflow.com/questions/tagged/web-scraping'
lead_url = ["https://stackoverflow.com/questions/tagged/web-scraping?sort=newest&page={}&pagesize=15".format(page) for page in range(1, 6)]

proxyVault = ['104.248.159.145:8888', '113.53.83.252:54356', '206.189.236.200:80', '218.48.229.173:808', '119.15.90.38:60622', '186.250.176.156:42575']

def make_requests(proxyVault, lead_url):
    while True:
        random.shuffle(proxyVault)
        global pitem
        pitem = cycle(proxyVault)
        proxy = {'https': 'http://{}'.format(next(pitem))}
        try:
            res = requests.get(lead_url, proxies=proxy)
            soup = BeautifulSoup(res.text, "lxml")
            [get_title(proxy, urljoin(base_url, item.get("href"))) for item in soup.select(".summary .question-hyperlink")]
        except Exception:
            try:
                proxyVault.pop(0)
                make_requests(proxyVault, lead_url)
            except Exception:
                pass

def get_title(proxy, itemlink):
    res = requests.get(itemlink, proxies=proxy)
    soup = BeautifulSoup(res.text, "lxml")
    print(soup.select_one("h1[itemprop='name'] a").text)

if __name__ == '__main__':
    ThreadPool(10).starmap(make_requests, zip(proxyVault, lead_url))
Btw, the proxies used above are just placeholders.
The problem with your code is that it creates a lot of endless loops in the threads. Also, the way you handled the proxies seemed a bit strange to me, so I changed it.
I also think you had misunderstood how data is sent to the threads: each thread gets one element of the iterable, not the whole thing. So I changed some names to reflect that.
The way it works now is that each thread gets its own url from lead_url and then chooses a random proxy from proxyVault.
It fetches the webpage, parses it and calls get_title on each of the parsed links.
If the request fails because of the proxy, that proxy is removed from the list so it's not used again, and make_requests is called again, which will randomly choose a new proxy from the ones that are still available.
I did not change the actual parsing, because I can't judge whether it's what you want or not.
Runnable code:
https://repl.it/#zlim00/unable-to-pass-proxies-and-links-to-the-threadpool-to-get-re
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool
from random import choice
import requests
from urllib.parse import urljoin

base_url = 'https://stackoverflow.com/questions/tagged/web-scraping'
lead_url = [f'https://stackoverflow.com/questions/tagged/web-scraping?sort='
            f'newest&page={page}&pagesize=15' for page in range(1, 6)]

proxyVault = ['36.67.57.45:53367', '5.202.150.233:42895',
              '85.187.184.129:8080', '109.195.23.223:45947']

def make_requests(url):
    proxy_url = choice(proxyVault)
    proxy = {'https': f'http://{proxy_url}'}
    try:
        res = requests.get(url, proxies=proxy)
        soup = BeautifulSoup(res.text, "lxml")
        [get_title(proxy, urljoin(base_url, item.get("href")))
         for item in soup.select(".summary .question-hyperlink")]
    except requests.exceptions.ProxyError:
        # Check so that the bad proxy was not removed by another thread
        if proxy_url in proxyVault:
            proxyVault.remove(proxy_url)
            print(f'Removed bad proxy: {proxy_url}')
        return make_requests(url)

def get_title(proxy, itemlink):
    res = requests.get(itemlink, proxies=proxy)
    soup = BeautifulSoup(res.text, "lxml")
    print(soup.select_one("h1[itemprop='name'] a").text)

if __name__ == '__main__':
    ThreadPool(10).map(make_requests, lead_url)
Maybe you can use another approach to get proxies, like this:
import random
import requests
from bs4 import BeautifulSoup

def get_proxy():
    url = 'https://free-proxy-list.net/anonymous-proxy.html'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    table = soup.find('table', attrs={'id': 'proxylisttable'})
    table_body = table.find('tbody')
    proxies = table_body.find_all('tr')
    proxy_row = random.choice(proxies).find_all('td')
    return proxy_row[0].text + ':' + proxy_row[1].text
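For example, make_requests from the answer above could call get_proxy() instead of picking from the hard-coded proxyVault. This is a hypothetical adaptation, not part of either original answer; it reuses base_url, get_title and urljoin from the code above.

def make_requests(url):
    # Hypothetical adaptation: pull a fresh proxy from free-proxy-list.net
    # for every request instead of choosing from the hard-coded proxyVault.
    proxy_url = get_proxy()
    proxy = {'https': f'http://{proxy_url}'}
    try:
        res = requests.get(url, proxies=proxy)
        soup = BeautifulSoup(res.text, "lxml")
        [get_title(proxy, urljoin(base_url, item.get("href")))
         for item in soup.select(".summary .question-hyperlink")]
    except requests.exceptions.ProxyError:
        # Free proxies fail often; just retry with another one.
        return make_requests(url)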

Is this actually using threading to scrape for urls?

Thanks in advance for your help. I'm new to Python and trying to figure out how to use the threading module to scrape the NY Daily News site for urls. I put the following together and the script is scraping, but it doesn't seem to be any faster than it was before, so I'm not sure the threading is actually happening. Can you let me know if it is? Can I write in anything so that I can tell? And do you have any other tips about threading?
Thank you.
from bs4 import BeautifulSoup, SoupStrainer
import urllib2
import os
import io
import threading

def fetch_url():
    for i in xrange(15500, 6100, -1):
        page = urllib2.urlopen("http://www.nydailynews.com/search-results/search-results-7.113?kw=&tfq=&afq=&page={}&sortOrder=Relevance&selecturl=site&q=the&sfq=&dtfq=seven_years".format(i))
        soup = BeautifulSoup(page.read())
        snippet = soup.find_all('h2')
        for h2 in snippet:
            for link in h2.find_all('a'):
                logfile.write("http://www.nydailynews.com" + link.get('href') + "\n")
        print "finished another url from page {}".format(i)

with open("dailynewsurls.txt", 'a') as logfile:
    threads = threading.Thread(target=fetch_url())
    threads.start()
The below is a naive implementation (which will very quickly get you blacklisted from nydailynews.com):
def fetch_url(i, logfile):
    page = urllib2.urlopen("http://www.nydailynews.com/search-results/search-results-7.113?kw=&tfq=&afq=&page={}&sortOrder=Relevance&selecturl=site&q=the&sfq=&dtfq=seven_years".format(i))
    soup = BeautifulSoup(page.read())
    snippet = soup.find_all('h2')
    for h2 in snippet:
        for link in h2.find_all('a'):
            logfile.write("http://www.nydailynews.com" + link.get('href') + "\n")
    print "finished another url from page {}".format(i)

with open("dailynewsurls.txt", 'a') as logfile:
    threads = []
    for i in xrange(15500, 6100, -1):
        t = threading.Thread(target=fetch_url, args=(i, logfile))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
Note that fetch_url takes the number to substitute in the URL as an argument, and each possible value for that argument is started in its own, separate thread.
I would strongly suggest dividing the job into smaller batches, and running one batch at a time.
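For example, a minimal sketch of that batching idea, building on the fetch_url(i, logfile) version above (the batch size of 50 is arbitrary):

BATCH_SIZE = 50
page_numbers = range(15500, 6100, -1)

with open("dailynewsurls.txt", 'a') as logfile:
    for start in range(0, len(page_numbers), BATCH_SIZE):
        batch = page_numbers[start:start + BATCH_SIZE]
        threads = []
        for i in batch:
            t = threading.Thread(target=fetch_url, args=(i, logfile))
            t.start()
            threads.append(t)
        # Wait for the whole batch to finish before starting the next one,
        # so at most BATCH_SIZE requests are in flight at any time.
        for t in threads:
            t.join()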
No, you're not using threads. threads = threading.Thread(target=fetch_url()) calls fetch_url() in your main thread, waits for it to complete and passes its return value (None) to the threading.Thread constructor.
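To make the distinction concrete (a minimal illustration, not from the original answer):

# Wrong: calls fetch_url() right here in the main thread,
# then passes its return value (None) as the thread's target.
t = threading.Thread(target=fetch_url())

# Right: passes the function object itself, so the new thread calls it.
t = threading.Thread(target=fetch_url)
t.start()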
