Python Multithreading no current event loop

My threading works fine only when fetch() is not called from validate(). But in the scenario below it returns
RuntimeError: There is no current event loop in thread
in all of the threads 0-99. What am I doing wrong here?
from threading import Thread
import requests
from bs4 import BeautifulSoup
from requests_html import HTMLSession

def fetch():
    # fetch data from another site
    session = HTMLSession()
    url = 'http://url'
    data = session.get(url)
    data.html.render()
    content = data.html.html
    soup = BeautifulSoup(content, "html.parser")
    iban = soup.find('p', {"id": "demo"})
    return iban.text

def validate():
    url = "https://url"
    payload = {
        "data": fetch(),
        "veto": "fi"
    }
    response = requests.post(url, data=payload)
    soup = BeautifulSoup(response.text, "html.parser")
    data = soup.body.find(text='contact')
    if data:
        print(data)
    else:
        print("no data")

if __name__ == "__main__":
    threads = []
    for i in range(100):
        # We start one thread per url present.
        process = Thread(target=validate)
        process.start()
        threads.append(process)

From a quick search of the error, I found this GitHub issue that seems to show your problem and its solution.
It looks like you need to use asyncio and, at the beginning of each running thread, call asyncio.set_event_loop(asyncio.new_event_loop()), for example:
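Here is a minimal sketch of that change applied to the code above (it reuses fetch(), requests and BeautifulSoup from the question; only the first line of validate() is new):

import asyncio
from threading import Thread

def validate():
    # Give this thread its own event loop before fetch() triggers
    # requests_html's render(), which is what raises the RuntimeError.
    asyncio.set_event_loop(asyncio.new_event_loop())
    payload = {"data": fetch(), "veto": "fi"}
    response = requests.post("https://url", data=payload)
    soup = BeautifulSoup(response.text, "html.parser")
    data = soup.body.find(text='contact')
    print(data if data else "no data")

if __name__ == "__main__":
    threads = [Thread(target=validate) for _ in range(100)]
    for t in threads:
        t.start()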

Related

Looping URLs for scraping on BeautifulSoup

My script currently looks at a list of 5 URLs; once it reaches the end of the list it stops scraping. I want it to loop back to the first URL after it completes the last one. How would I achieve that?
The reason I want it to loop is to monitor for any changes in the products, such as the price.
I tried a few methods I found online but couldn't figure it out, as I am new to this. Hope you can help!
import requests
import lxml.html
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from dhooks import Webhook, Embed
import random

ua = UserAgent()
header = {'User-Agent': ua.chrome}

# Proxies
proxy_list = []
for line in open('proxies.txt', 'r'):
    line = line.replace('\n', '')
    proxy_list.append(line)

def get_proxy():
    proxy = random.choice(proxy_list)
    proxies = {
        "http": f'{str(proxy)}',
        "https": f'{str(proxy)}'
    }
    return proxies

# Opening URL file
with open('urls.txt', 'r') as file:
    for url in file.readlines():
        proxies = get_proxy()
        result = requests.get(url.strip(), headers=header, timeout=4, proxies=proxies)
        #src = result.content
        soup = BeautifulSoup(result.content, 'lxml')
You can store the urls in a list and do a while loop over it; the basic logic will be:
with open('urls.txt', 'r') as file:
    url_list = file.readlines()

pos = 0
while True:
    if pos >= len(url_list):
        pos = 0
    url = url_list[pos]
    pos += 1
    *** rest of your logic ***
You can add a while True: loop outside and above your main with statement and for loop (and add one level of indent to every line inside). This way the program will keep running until terminated by the user.
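A minimal sketch of that change, reusing get_proxy(), header and urls.txt from the question:

while True:
    # Opening URL file; re-read it on every pass so the loop never ends
    with open('urls.txt', 'r') as file:
        for url in file.readlines():
            proxies = get_proxy()
            result = requests.get(url.strip(), headers=header, timeout=4, proxies=proxies)
            soup = BeautifulSoup(result.content, 'lxml')
            # ... price checks and notifications go here ...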

How to forcefully execute or render a script in browser from python in scraping?

I am working on data scraping and machine learning. I am new to both Python and scraping, and I am trying to scrape this particular site:
https://www.space-track.org/
From what I have observed, they execute several scripts between the login and the next page, and that is how they get the table data. I am able to log in successfully and then, with the session, get the data from the next page as well; what I am missing is the data they get from the scripts executed in between. I need the data from the table
satcat
and to achieve pagination. Following is my code:
import requests
from bs4 import BeautifulSoup
import urllib
from urllib.request import urlopen
import html2text
import time
from requests_html import HTMLSession
from requests_html import AsyncHTMLSession

with requests.Session() as s:
    #s = requests.Session()
    session = HTMLSession()
    url = 'https://www.space-track.org/'
    headers = {'User-Agent': 'Mozilla/5.0(X11; Ubuntu; Linux x86_64; rv:66.0)Gecko/20100101 Firefox/66.0'}
    login_data = {"identity": "",
                  "password": "",
                  "btnLogin": "LOGIN"
                  }
    login_data_extra = {"identity": "", "password": ""}

    preLogin = session.get(url + 'auth/login', headers=headers)
    time.sleep(3)
    print('*******************************')
    print('\n')
    print('data to retrive csrf cookie')
    #print(preLogin.text)
    #soup = BeautifulSoup(preLogin.content,'html.parser')
    #afterpretty = soup.prettify()
    #login_data['spacetrack_csrf_token'] = soup.find('input',attrs={'name':'spacetrack_csrf_token'})['value']
    csrf = dict(session.cookies)['spacetrack_csrf_cookie']
    #csrf = p.headers['Set-Cookie'].split(";")[0].split("=")[-1]
    login_data['spacetrack_csrf_token'] = csrf
    #print(login_data)
    # html = open(p.content).read()
    # print (html2text.html2text(p.text))
    #login_data['spacetrack_csrf_token'] = soup.find('spacetrack_csrf_token"')
    #print(login_data)

    login = session.post(url + 'auth/login', data=login_data, headers=headers, allow_redirects=True)
    time.sleep(1)
    print('****************************************')
    print('\n')
    print('login api status code')
    print(login.url)
    #print(r.url)
    #print(r.content)
    print('******************************')
    print(' ')
    print(' ')
    print('\n')
    print('data post login')
    #async def get_pyclock():
    #    r = await session.get(url)
    #    await r.html.arender()
    #    return r
    #postLogin = session.run(get_pyclock)
    time.sleep(3)
    postLogin = session.get(url)
    postLogin.html.render(sleep=5, keep_page=True)
As you can see, I have used the requests_html library to render the HTML, but I have been unsuccessful in getting the data. This is the URL executed internally in JS which gets my data:
https://www.space-track.org/master/loadSatCatData
Can anyone help me with how to scrape that data or that JavaScript?
Thank you :)
You can go for Selenium. It has a method, browser.execute_script(), which lets you execute a script in the browser. Hope this helps :)
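For example, a minimal sketch with Selenium (this assumes a working Chrome/chromedriver setup; the JavaScript snippet is just a placeholder to show the call, not space-track's actual script):

from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://www.space-track.org/')
# Run arbitrary JavaScript inside the browser and read back its return value
title = driver.execute_script("return document.title")
print(title)
# Once the page's own scripts have run, the fully rendered HTML is available too
html = driver.page_source
driver.quit()

From there you could log in through the rendered form, let the site's own JavaScript populate the satcat table, and then read it out of driver.page_source.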

Unable to pass proxies and links to the threadpool to get results

I've written a script in Python that uses proxies while scraping the links of different posts across different pages of a webpage. The script is supposed to take a random proxy from the list, send a request to the website, and finally parse the items. However, if any proxy doesn't work, it should be kicked out of the list.
I thought the way I used the proxies and the list of urls within ThreadPool(10).starmap(make_requests, zip(proxyVault, lead_url)) was accurate, but it doesn't produce any results; rather, the script gets stuck.
How can I pass the proxies and the links to the ThreadPool so that the script produces results?
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from multiprocessing.pool import ThreadPool
from itertools import cycle
import random

base_url = 'https://stackoverflow.com/questions/tagged/web-scraping'
lead_url = ["https://stackoverflow.com/questions/tagged/web-scraping?sort=newest&page={}&pagesize=15".format(page) for page in range(1, 6)]

proxyVault = ['104.248.159.145:8888', '113.53.83.252:54356', '206.189.236.200:80', '218.48.229.173:808', '119.15.90.38:60622', '186.250.176.156:42575']

def make_requests(proxyVault, lead_url):
    while True:
        random.shuffle(proxyVault)
        global pitem
        pitem = cycle(proxyVault)
        proxy = {'https': 'http://{}'.format(next(pitem))}
        try:
            res = requests.get(lead_url, proxies=proxy)
            soup = BeautifulSoup(res.text, "lxml")
            [get_title(proxy, urljoin(base_url, item.get("href"))) for item in soup.select(".summary .question-hyperlink")]
        except Exception:
            try:
                proxyVault.pop(0)
                make_requests(proxyVault, lead_url)
            except Exception:
                pass

def get_title(proxy, itemlink):
    res = requests.get(itemlink, proxies=proxy)
    soup = BeautifulSoup(res.text, "lxml")
    print(soup.select_one("h1[itemprop='name'] a").text)

if __name__ == '__main__':
    ThreadPool(10).starmap(make_requests, zip(proxyVault, lead_url))
Btw, the proxies used above are just placeholders.
The problem with your code was that it was creating a lot of endless loops in the threads. Also, the way you handled the proxies was a bit strange to me, so I changed it.
I also think you had misunderstood how data is sent to the threads: each thread gets one element of the iterable, not the whole thing. So I changed some names to reflect that.
The way it works now is that each thread gets its own url from lead_url, and then it chooses a random proxy from proxyVault.
It fetches the webpage, parses it, and calls get_title on each of the parsed links.
If the request fails because of the proxy, that proxy is removed from the list so it's not used again, and make_requests is called again, which will randomly choose a new proxy from the ones that are still available.
I did not change the actual parsing, because I can't judge if it's what you want or not.
Runnable code:
https://repl.it/#zlim00/unable-to-pass-proxies-and-links-to-the-threadpool-to-get-re
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool
from random import choice
import requests
from urllib.parse import urljoin

base_url = 'https://stackoverflow.com/questions/tagged/web-scraping'
lead_url = [f'https://stackoverflow.com/questions/tagged/web-scraping?sort='
            f'newest&page={page}&pagesize=15' for page in range(1, 6)]

proxyVault = ['36.67.57.45:53367', '5.202.150.233:42895',
              '85.187.184.129:8080', '109.195.23.223:45947']

def make_requests(url):
    proxy_url = choice(proxyVault)
    proxy = {'https': f'http://{proxy_url}'}
    try:
        res = requests.get(url, proxies=proxy)
        soup = BeautifulSoup(res.text, "lxml")
        [get_title(proxy, urljoin(base_url, item.get("href")))
         for item in soup.select(".summary .question-hyperlink")]
    except requests.exceptions.ProxyError:
        # Check so that the bad proxy was not removed by another thread
        if proxy_url in proxyVault:
            proxyVault.remove(proxy_url)
            print(f'Removed bad proxy: {proxy_url}')
        return make_requests(url)

def get_title(proxy, itemlink):
    res = requests.get(itemlink, proxies=proxy)
    soup = BeautifulSoup(res.text, "lxml")
    print(soup.select_one("h1[itemprop='name'] a").text)

if __name__ == '__main__':
    ThreadPool(10).map(make_requests, lead_url)
Maybe you can use another approach to get proxies, like this:
import random

import requests
from bs4 import BeautifulSoup

def get_proxy():
    url = 'https://free-proxy-list.net/anonymous-proxy.html'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    table = soup.find('table', attrs={'id': 'proxylisttable'})
    table_body = table.find('tbody')
    proxies = table_body.find_all('tr')
    proxy_row = random.choice(proxies).find_all('td')
    return proxy_row[0].text + ':' + proxy_row[1].text
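A hypothetical usage sketch of that helper, assuming the free-proxy-list markup above still holds and the scraped proxy is reachable:

proxy_url = get_proxy()
proxy = {'https': f'http://{proxy_url}'}
res = requests.get('https://stackoverflow.com/questions/tagged/web-scraping',
                   proxies=proxy, timeout=10)
print(res.status_code)

In make_requests() above, you could then use proxy_url = get_proxy() instead of choosing from proxyVault.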

Python returning values from infinite loop thread

For my program I need to check a client on my local network which has a Flask server running. This Flask server returns a number that can change.
To retrieve that value, I use the requests library and BeautifulSoup. I want to use the retrieved value in another part of my script (while continuously checking the other client). For this I thought I could use the threading module. The problem, however, is that the thread only returns its values when it's done with the loop, but the loop needs to be infinite. This is what I have so far:
import threading
import requests
from bs4 import BeautifulSoup

def checkClient():
    while True:
        page = requests.get('http://192.168.1.25/8080')
        soup = BeautifulSoup(page.text, 'html.parser')
        value = soup.find('div', class_='valueDecibel')
        print(value)

t1 = threading.Thread(target=checkClient, name=checkClient)
t1.start()
Does anyone know how to return the printed values to another function here? Of course you can replace the requests.get url with some kind of API where the values change a lot.
You need a Queue and something listening on the queue:
import queue
import threading
import requests
from bs4 import BeautifulSoup

def checkClient(q):
    while True:
        page = requests.get('http://192.168.1.25/8080')
        soup = BeautifulSoup(page.text, 'html.parser')
        value = soup.find('div', class_='valueDecibel')
        q.put(value)

q = queue.Queue()
t1 = threading.Thread(target=checkClient, name=checkClient, args=(q,))
t1.start()

while True:
    value = q.get()
    print(value)
The Queue is thread safe and allows you to pass values back and forth. In your case they are only being sent from the thread to a receiver.
See: https://docs.python.org/3/library/queue.html
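Since the goal was to use the value in another part of the script, a small sketch of handing each value to another function instead of printing it directly (handle_value is a hypothetical name standing in for your own processing):

def handle_value(value):
    # hypothetical consumer; replace with whatever the rest of your script needs
    print('received:', value)

while True:
    handle_value(q.get())  # q.get() blocks until checkClient() puts a new value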

Python Multi Threading using Requests and BeautifulSoup

I'm writing a web scraper. I could have just used scrapy but decided to write it from scratch so I can practice.
I've created a scraper that works successfully using requests and BeautifulSoup. It navigates through about 135 pages with 12 items on each, grabs the links and then grabs the information from each link's destination. At the end it writes everything into a CSV file. It only grabs strings and doesn't download any images or anything like that... for now.
The problem? It's quite slow. It takes about 5 seconds to grab everything from the contents of one page, so that times 135 is about 11 minutes.
So my question is: how do I implement threading in my code so it gets data way faster?
Here's the code:
import requests
from bs4 import BeautifulSoup
import re
import csv

def get_actor_dict_from_html(url, html):
    soup = BeautifulSoup(html, "html.parser")

    #There must be a better way to handle this, but let's assign a NULL value to all upcoming variables.
    profileName = profileImage = profileHeight = profileWeight = 'NULL'

    #Let's get the name and image..
    profileName = str.strip(soup.find('h1').get_text())
    profileImage = "http://images.host.com/actors/" + re.findall(r'\d+', url)[0] + "/actor-large.jpg"

    #Now the rest of the stuff..
    try:
        profileHeight = soup.find('a', {"title": "Height"}).get_text()
    except:
        pass
    try:
        profileWeight = soup.find('a', {"title": "Weight"}).get_text()
    except:
        pass

    return {
        'Name': profileName,
        'ImageUrl': profileImage,
        'Height': profileHeight,
        'Weight': profileWeight,
    }

def lotta_downloads():
    output = open("/tmp/export.csv", 'w', newline='')
    wr = csv.DictWriter(output, ['Name', 'ImageUrl', 'Height', 'Weight'], delimiter=',')
    wr.writeheader()

    for i in range(135):
        url = "http://www.host.com/actors/all-actors/name/{}/".format(i)
        response = requests.get(url)
        html = response.content
        soup = BeautifulSoup(html, "html.parser")
        links = soup.find_all("div", {"class": "card-image"})
        for a in links:
            for url in a.find_all('a'):
                url = "http://www.host.com" + url['href']
                print(url)
                response = requests.get(url)
                html = response.content
                actor_dict = get_actor_dict_from_html(url, html)
                wr.writerow(actor_dict)
    print('All Done!')

if __name__ == "__main__":
    lotta_downloads()
Thanks!
Why don't you try the gevent library?
gevent has a monkey patch that turns blocking functions into non-blocking ones.
Maybe the wait time of the requests is too long and that is what makes it slow, so I think making the requests non-blocking will make your program faster.
On Python 2.7.10, for example:
import csv

import gevent
from gevent import monkey; monkey.patch_all()  # make blocking calls cooperative; keep this before importing requests
import requests
from bs4 import BeautifulSoup

actor_dict_list = []

def worker(url):
    content = requests.get(url).content
    soup = BeautifulSoup(content, "html.parser")
    links = soup.find_all('div', {'class': 'card-image'})
    for a in links:
        for url in a.find_all('a'):
            url = "http://www.host.com" + url['href']
            response = requests.get(url)  # You can also use the gevent spawn function on this line
            html = response.content
            # get_actor_dict_from_html() is the function from your question;
            # appending to a list here prevents a race condition on the csv writer
            actor_dict_list.append(get_actor_dict_from_html(url, html))

output = open("/tmp/export.csv", "w", newline='')
wr = csv.DictWriter(output, ['Name', 'ImageUrl', 'Height', 'Weight'], delimiter=',')
wr.writeheader()

urls = ["http://www.host.com/actors/all-actors/name/{}/".format(i) for i in range(135)]
jobs = [gevent.spawn(worker, url) for url in urls]
gevent.joinall(jobs)

for actor_dict in actor_dict_list:
    wr.writerow(actor_dict)
Public gevent documentation: doc
P.S.
If you are on Ubuntu, you must install python-gevent:
sudo apt-get install python-gevent
