I want to know how I can add simple threading to my code. At the moment it checks the sites one by one, and if a site isn't reachable it waits for the timeout before continuing with the next one, which slows everything down.
import requests
import sys
import time
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

with open("websites.txt", 'r') as websites:
    websites = websites.read().splitlines()

with open("para1.txt", 'r') as para1:
    para1 = para1.read().splitlines()

with open("para2.txt", 'r') as para2:
    para2 = para2.read().splitlines()

def main():
    for i in para1:
        for j in para2:
            for m in websites:
                try:
                    res = requests.get(m + i + j, verify=False, timeout=10)
                    print(m + i + j)
                    if res.status_code == 200:
                        print('Yes')
                    else:
                        print('No')
                except Exception as e:
                    print(e)
                except KeyboardInterrupt:
                    sys.exit()
                finally:
                    res.close()
                    time.sleep(1)

if __name__ == '__main__':
    main()
You can apply ThreadPoolExecutor by moving the part of the code that performs the request into a separate function and passing the URL to it as an argument:
import urllib3
import requests
from concurrent.futures import ThreadPoolExecutor, CancelledError, as_completed

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def check_func(url):
    response = requests.get(url, verify=False, timeout=10)
    return response.status_code == 200

def main():
    # read the three input files up front so the nested loops can reuse them
    with open("websites.txt") as website_f, open("para1.txt") as para1_f, \
            open("para2.txt") as para2_f:
        websites = website_f.read().splitlines()
        paras1 = para1_f.read().splitlines()
        paras2 = para2_f.read().splitlines()

    with ThreadPoolExecutor(max_workers=4) as executor:
        tasks = {}
        for website in websites:
            for para1 in paras1:
                for para2 in paras2:
                    url = website.rstrip() + para1.rstrip() + para2.rstrip()
                    tasks[executor.submit(check_func, url)] = url

        for task in as_completed(tasks):
            url = tasks[task]
            try:
                result = task.result()
            except KeyboardInterrupt:  # handling Ctrl + C
                for pending in tasks:
                    pending.cancel()  # won't cancel already running or finished futures
            except CancelledError:  # will never happen (normally)
                pass
            except Exception as e:
                print(url, "-", "ERROR", e)
            else:
                print(url, "-", "GOOD" if result else "BAD")

if __name__ == "__main__":
    main()
P.S. I haven't tested the entire code, so if there are any problems with it, write in the comments.
My code works as I want, but it is very slow when I run this line:
newdf['Login'] = newdf['Site'].apply(lambda x : "yes" if get(x).status_code == 200 else "no")
After commenting out that line, the code runs fast.
How can I change this line so it still writes Yes or No into the Login column but stays fast?
Any other improvements to the code would also be appreciated.
I hope I have made myself understood.
Thank you!
import pandas as pd
import requests
from requests import get
from requests.exceptions import HTTPError

lista = pd.read_csv('sites4.csv', sep=',')
df = pd.DataFrame(lista, columns=['Site', 'Login'])
newdf = df.assign(Site=df['Site'].map(str) + 'Login')
headers = {'Content-Type': 'application/json'}

for i in newdf['Site']:
    try:
        result = get(i, headers=headers, timeout=5)
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except Exception as err:
        print(f'Other error occurred: {err}')
    else:
        if 'application/json' in result.headers.get('Content-Type') or result.status_code == 406 or result.status_code == 403:
            newdf['Login'] = newdf['Site'].apply(lambda x : "yes" if get(x).status_code == 200 else "no")
            print(i + ' é Login')
            print(result)
This should be a much faster implementation of your functions:
from typing import Optional, Coroutine, List
import aiohttp
from pandas import DataFrame
from pandas.errors import EmptyDataError
import pandas as pd
import asyncio

def create_df_form_file() -> Optional[DataFrame]:
    try:
        site_list = pd.read_csv('sites4.csv', sep=',')
        df = pd.DataFrame(site_list, columns=['Site', 'Login'])
        return df.assign(Site=df['Site'].map(str) + 'Login')
    except EmptyDataError as e:
        print(f'File Error: {e}')
        return None

new_df: Optional[DataFrame] = create_df_form_file()
if not isinstance(new_df, DataFrame):
    print("empty data, goodbye")
    exit(1)

# NOTE: async GET request that only reports whether the response was OK
async def get_request(x_url: str) -> bool:
    async with aiohttp.ClientSession() as session:
        async with session.get(url=x_url) as result:
            return result.ok

# helpers to test if a site needs a login, if the response is JSON, and to map that to yes/no
def needs_login(result): return result.status == 406 or result.status == 403
def is_json(result): return 'application/json' in result.headers.get('Content-Type', '')
async def yes_no(x): return 'yes' if await get_request(x) else 'no'

async def _do_work(site: str, _headers: dict) -> Optional[str]:
    async with aiohttp.ClientSession() as session:
        async with session.get(site, headers=_headers) as result:
            if is_json(result) or needs_login(result):
                # record yes/no for this site
                answer = await yes_no(site)
                print(site + ' é Login')
                print(result)
                return answer
            return None

def get_results():
    global new_df
    headers = {'Content-Type': 'application/json'}
    try:
        _coro: List[Coroutine] = [_do_work(site, _headers=headers) for site in new_df['Site']]
    except KeyError:
        print("please ensure your input file is accurate")
        exit(1)
    try:
        event_loop = asyncio.get_event_loop()
    except RuntimeError:
        asyncio.set_event_loop(asyncio.new_event_loop())
        event_loop = asyncio.get_event_loop()
    # gather the coroutines so they all run concurrently on one event loop
    results = event_loop.run_until_complete(asyncio.gather(*_coro))
    new_df['Login'] = results
    print(new_df)

if __name__ == '__main__':
    get_results()
For more information on Python programming, please visit my tutorial site here.
This is slow because get is a function from the requests package, and requests is not an asynchronous package: each call blocks all other code until the request completes.
If you use asyncio and aiohttp, you can improve it.
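As a rough sketch of that suggestion, keeping the sites4.csv layout and the "200 means yes" rule from the question (the 10-second timeout and error handling are assumptions, so treat this as untested):
import asyncio
import aiohttp
import pandas as pd

async def check_login(session, url):
    # "yes" if the login URL answers 200, "no" on any other status or on error
    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as resp:
            return "yes" if resp.status == 200 else "no"
    except (aiohttp.ClientError, asyncio.TimeoutError):
        return "no"

async def fill_login_column(df):
    async with aiohttp.ClientSession() as session:
        # fire all the requests concurrently instead of one row at a time
        tasks = [check_login(session, url) for url in df['Site']]
        df['Login'] = await asyncio.gather(*tasks)
    return df

if __name__ == '__main__':
    lista = pd.read_csv('sites4.csv', sep=',')
    newdf = pd.DataFrame(lista, columns=['Site', 'Login'])
    newdf = newdf.assign(Site=newdf['Site'].map(str) + 'Login')
    print(asyncio.run(fill_login_column(newdf)))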
import requests
import json
import threading

data = {
    "amount": 2
}

def foo(data):
    try:
        r = requests.post(url = "www.mysite.com", data = data)
        j = json.loads(r.text)
        print(j)
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)

threading.Timer(1, foo, [data]).start()
I want to run this HTTP request every second using a thread in my program. However, the program only runs the HTTP request once and exits. How do I fix this?
You need to restart the timer after each request:
def foo(data):
    try:
        r = requests.post(url = "www.mysite.com", data = data)
        j = json.loads(r.text)
        print(j)
        threading.Timer(1, foo, [data]).start()  # New Line Added
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)
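As a side note, the same run-every-second behaviour can also be had from a single long-lived thread with a loop, instead of scheduling a fresh Timer on every call. A minimal sketch reusing the question's placeholder URL and payload (requests needs a scheme on the URL, so one is added here):
import json
import threading
import time
import requests

data = {
    "amount": 2
}

def poll(data, interval=1):
    # one thread, one loop: post, print, sleep, repeat
    while True:
        try:
            r = requests.post(url="http://www.mysite.com", data=data)
            print(json.loads(r.text))
        except requests.exceptions.RequestException as e:
            print("request failed:", e)
            break
        time.sleep(interval)

threading.Thread(target=poll, args=(data,)).start()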
I'm currently trying to make my requests faster by multithreading them, but I'm not sure how to do it the way I want. I know about grequests, but it seems to require a list of URLs. My code builds each URL from a starting number, and I would like all threads to stop once one of them gets a status_code of 200.
I have tried to accomplish this with grequests but couldn't make it work. I also tried threading, but I don't know how to stop all threads after the working URL has been found.
import requests
import webbrowser

def url_request(number):
    url = "http://website.com/download/" + str(number) + ".zip"
    r = requests.head(url)
    if r.status_code == 404:
        print(url + " - 404 Not Found!")
        number += 1
        url_request(number)
    elif r.status_code == 200:
        webbrowser.open(url)
        print(url + " - 200 Found!")

if __name__ == "__main__":
    url_request(int(input("Starting number: ")))
What I want the code to do is execute multiple requests.head calls at once, counting up from the starting number, and stop after one of the threads finds a URL with a status_code of 200.
Ok, figured it out. Thanks for your advice.
Here's the code:
from gevent import monkey
monkey.patch_all()
import grequests
import webbrowser

def url_request_threaded(startnumber, stopnumber):
    urls = []
    for i in range(startnumber, stopnumber):
        urls.append("http://website.com/download/" + str(i) + ".zip")
    gr = (grequests.head(url, stream=False) for url in urls)
    gresponses = grequests.imap(gr, size=10)
    try:
        for response in gresponses:
            if response.status_code == 404:
                print(response.url + " - 404 Not Found!")
            elif response.status_code == 200:
                webbrowser.open(response.url)
                print(response.url + " - 200 Found!")
                raise SystemExit
    except SystemExit:
        pass

if __name__ == "__main__":
    while True:
        try:
            startn = input("Starting number: ")
            startn = int(startn)
            stopn = input("End number: ")
            stopn = int(stopn)
            url_request_threaded(startn, stopn)
        except ValueError:
            print("Must be a number!")
            continue
        break
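If you would rather stay with the standard library instead of grequests, a rough sketch of the same idea with concurrent.futures is below; cancel_futures needs Python 3.9+, website.com is the question's placeholder host, and the timeout and worker count are assumptions, so treat it as untested:
import webbrowser
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests

def head_request(number):
    url = "http://website.com/download/" + str(number) + ".zip"
    return url, requests.head(url, timeout=10).status_code

def find_first_existing(startnumber, stopnumber, workers=10):
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(head_request, n) for n in range(startnumber, stopnumber)]
        for future in as_completed(futures):
            try:
                url, status = future.result()
            except requests.RequestException as e:
                print("request failed:", e)
                continue
            if status == 200:
                webbrowser.open(url)
                print(url + " - 200 Found!")
                # stop handing out queued work; requests already in flight still finish
                executor.shutdown(wait=False, cancel_futures=True)
                return url
            print(url + " - " + str(status) + " Not Found!")
    return None

if __name__ == "__main__":
    find_first_existing(int(input("Starting number: ")), int(input("End number: ")))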
I get this error: (3, 'Illegal characters found in URL')
My URL contains special characters like [AVC_(1)_(P1)_0].
I can't get this to work; I tried encoding, but that gives me "Could not resolve host: https%3A".
Please advise.
import sys
import Queue
import threading
import pycurl
import os
import urllib
from StringIO import StringIO

num_conn = 1

# Make a queue with (url, filename) tuples
queue = Queue.Queue()
with open('list.txt') as f:
    for line in f:
        print line
        queue.put((line, 'test.mp4'))
        if 'str' in line:
            break

# Check args
assert queue.queue, "no URLs given"
num_urls = len(queue.queue)
num_conn = min(num_conn, num_urls)
assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
print "PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)
print "----- Getting", num_urls, "URLs using", num_conn, "connections -----"

class WorkerThread(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while 1:
            try:
                url, filename = self.queue.get_nowait()
            except Queue.Empty:
                raise SystemExit
            #dirname = os.path.dirname(filename)
            #fp = open(dirname, "wb")\
            #url = urllib.quote(url.encode('utf-8'))
            fp = open(os.getcwd()+'/'+filename, "wb")
            curl = pycurl.Curl()
            curl.setopt(pycurl.URL, url)
            curl.setopt(pycurl.FOLLOWLOCATION, 1)
            curl.setopt(pycurl.MAXREDIRS, 5)
            curl.setopt(pycurl.CONNECTTIMEOUT, 30)
            curl.setopt(pycurl.TIMEOUT, 300)
            curl.setopt(pycurl.NOSIGNAL, 1)
            curl.setopt(pycurl.WRITEDATA, fp)
            try:
                curl.perform()
            except:
                import traceback
                traceback.print_exc(file=sys.stderr)
                sys.stderr.flush()
            curl.close()
            fp.close()
            sys.stdout.write(".")
            sys.stdout.flush()

# Start a bunch of threads
threads = []
for dummy in range(num_conn):
    t = WorkerThread(queue)
    t.start()
    threads.append(t)

# Wait for all threads to finish
for thread in threads:
    thread.join()
Why not use requests in lieu of pycurl, which would make your run method:
def run(self):
    while True:
        try:
            url, filename = self.queue.get_nowait()
        except Queue.Empty:
            raise SystemExit
        with open(os.getcwd()+'/'+filename, "wb") as fp:
            #fp.write(requests.get(url).content)
            fp.write(requests.get(url, headers={'user-agent': 'CodeGuru'}).content)
I made a few other stylistic changes.
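Separately, whichever HTTP client you use, note that iterating over list.txt leaves a trailing newline on every line, and that newline (rather than the brackets) is usually what triggers pycurl's "Illegal characters found in URL" error. A small sketch of cleaning each URL before queuing it, written with the same Python 2 urllib as the question; the safe character set is an assumption chosen so the scheme and path separators stay unencoded, which avoids the "Could not resolve host: https%3A" problem seen when the whole string is encoded:
import urllib

def clean_url(raw_line):
    # strip the trailing newline/whitespace that file iteration leaves on each line
    url = raw_line.strip()
    # quote only what needs quoting; keeping ":" and "/" safe leaves the scheme intact
    return urllib.quote(url, safe=":/?&=()[]")

# in the reading loop:
#     queue.put((clean_url(line), 'test.mp4'))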
I am trying to implement a multithreaded crawler that takes an initial URL, searches for links within that page, and displays each link while at the same time looking for links within each of those links.
This is my code:
import urllib.request, re, threading, csv
from queue import Queue
from bs4 import BeautifulSoup
from sys import exit

class a3_6:

    __url_q = Queue(100)
    __html_q = Queue()
    __data_q = Queue()
    __visited_urls = []

    def __init__(self, start_url, max_threads):
        self.__url_q.put(start_url)
        self.max_threads = max_threads

    def gethtml(self,url):
        try:
            req=urllib.request.Request(url)
            html=urllib.request.urlopen(req).read()
            self.__html_q.put(html)
        except urllib.error.URLError as e:
            print(e.reason)
        except:
            print("invalid: " + url)
        self.__visited_urls.append(url)

    def mine_thread(self):
        while True:
            if not self.__html_q.empty():
                soup = BeautifulSoup(self.__html_q.get(),"html.parser")
                for a in soup.find_all('a', href=True):
                    if a not in self.__visited_urls:
                        link='https://en.wikipedia.org'+a.get('href')
                        self.__url_q.put(link)
                        self.__data_q.put(link)
            else:
                break

    def store(self):
        while True:
            if not self.__data_q.empty():
                print (self.__data_q.get())

    def download_thread(self):
        while True:
            if not self.__url_q.empty():
                self.gethtml(self.__url_q.get())
            else:
                break

    def run(self):
        self.download_thread()
        self.mine_thread()
        self.store()

    def op(self):
        for x in range(self.max_threads):
            t = threading.Thread(target=self.run)
            t.daemon = True
            t.start()
        self.store()

if __name__ == '__main__':
    a=a3_6('https://en.wikipedia.org/wiki/Main_Page', 5)
    a.op()
EDIT: I edited the code and now I am getting proper results, but the program still does not end.
I arrived at the solution with James Harrison's help. I don't know why he deleted his original solution, but here it is:
import urllib.request, threading
from queue import Queue
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from sys import exit
from a3_3 import store_to_db

class a3_5:

    __url_q = Queue(100)
    __html_q = Queue()
    __data_q = Queue()
    __visited_urls=[]

    def gethtml(self,url):
        try:
            req=urllib.request.Request(url)
            html=urllib.request.urlopen(req).read()
            self.__html_q.put(html)
            pars=urlparse(url)
        except urllib.error.URLError as e:
            print(e.reason+':'+url)
        except:
            print("invalid: " + url)

    def mine_thread(self):
        while True:
            if not self.__html_q.empty():
                soup = BeautifulSoup(self.__html_q.get(),"html.parser")
                for a in soup.find_all('a', href=True):
                    link=a.get('href')
                    """if not link.startswith('www'):
                        link=self.__prfx+link"""
                    if link not in self.__visited_urls:
                        self.__url_q.put(link)
                        self.__data_q.put(link)
            else:
                break

    def store(self):
        while True:
            if not self.__data_q.empty():
                cont=self.__data_q.get()
                print (cont)
            else:
                break

    def download_thread(self):
        while True:
            if not self.__url_q.empty():
                self.gethtml(self.__url_q.get())
                self.__url_q.task_done()

    def op(self,*urls):
        for x in range(25):
            d = threading.Thread(target=self.download_thread)
            d.setDaemon(True)
            d.start()
        for url in urls:
            self.__url_q.put(url)
        self.__url_q.join()
        self.mine_thread()
        self.store()

if __name__ == '__main__':
    urls=['https://en.wikipedia.org/wiki/Bajirao']#,'https://en.wikipedia.org/wiki/Malharrao_Holkar','https://en.wikipedia.org/wiki/Ranoji_Scindia']
    a=a3_5()
    a.op(*urls)
Essentially, I had to set up worker threads that consume the URL queue, and the mine_thread and store methods needed to start only after the download work had completed (hence the join() on the queue), because otherwise the values would not get stored.
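Stripped of the crawler specifics, the pattern that fix relies on looks roughly like this (a generic sketch, not the assignment code): workers consume from a Queue, every item is acknowledged with task_done(), and join() blocks until everything queued has been processed, which is why the mining/storing steps can only run afterwards.
import threading
from queue import Queue

url_q = Queue()
result_q = Queue()

def worker():
    while True:
        url = url_q.get()                  # blocks until an item is available
        result_q.put("fetched " + url)     # stand-in for the real download/parse work
        url_q.task_done()                  # tells join() this item is fully handled

# start the workers first, as daemons so they die with the main thread
for _ in range(4):
    threading.Thread(target=worker, daemon=True).start()

# then feed the queue and wait until every queued item has been acknowledged
for url in ['https://en.wikipedia.org/wiki/Bajirao']:
    url_q.put(url)
url_q.join()

# only now is it safe to drain the results, mirroring mine_thread/store above
while not result_q.empty():
    print(result_q.get())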