Multi-threaded crawler in Python

I am trying to implement a multithreaded crawler that takes an initial URL, finds the links within that page, prints each link, and at the same time looks for links within each of those links.
This is my code:
import urllib.request, re, threading, csv
from queue import Queue
from bs4 import BeautifulSoup
from sys import exit

class a3_6:
    __url_q = Queue(100)
    __html_q = Queue()
    __data_q = Queue()
    __visited_urls = []

    def __init__(self, start_url, max_threads):
        self.__url_q.put(start_url)
        self.max_threads = max_threads

    def gethtml(self,url):
        try:
            req=urllib.request.Request(url)
            html=urllib.request.urlopen(req).read()
            self.__html_q.put(html)
        except urllib.error.URLError as e:
            print(e.reason)
        except:
            print("invalid: " + url)
        self.__visited_urls.append(url)

    def mine_thread(self):
        while True:
            if not self.__html_q.empty():
                soup = BeautifulSoup(self.__html_q.get(),"html.parser")
                for a in soup.find_all('a', href=True):
                    if a not in self.__visited_urls:
                        link='https://en.wikipedia.org'+a.get('href')
                        self.__url_q.put(link)
                        self.__data_q.put(link)
            else:
                break

    def store(self):
        while True:
            if not self.__data_q.empty():
                print (self.__data_q.get())

    def download_thread(self):
        while True:
            if not self.__url_q.empty():
                self.gethtml(self.__url_q.get())
            else:
                break

    def run(self):
        self.download_thread()
        self.mine_thread()
        self.store()

    def op(self):
        for x in range(self.max_threads):
            t = threading.Thread(target=self.run)
            t.daemon = True
            t.start()
        self.store()

if __name__ == '__main__':
    a=a3_6('https://en.wikipedia.org/wiki/Main_Page', 5)
    a.op()
EDIT: I edited the code and now I am getting proper results, but the program still does not end.

I arrived at the solution with James Harrison's help. I don't know why he deleted his original answer, but here it is:
import urllib.request, threading
from urllib.parse import urlparse   # needed for the urlparse() call in gethtml
from queue import Queue
from bs4 import BeautifulSoup
from sys import exit
from a3_3 import store_to_db

class a3_5:
    __url_q = Queue(100)
    __html_q = Queue()
    __data_q = Queue()
    __visited_urls=[]

    def gethtml(self,url):
        try:
            req=urllib.request.Request(url)
            html=urllib.request.urlopen(req).read()
            self.__html_q.put(html)
            pars=urlparse(url)
        except urllib.error.URLError as e:
            print(e.reason+':'+url)
        except:
            print("invalid: " + url)

    def mine_thread(self):
        while True:
            if not self.__html_q.empty():
                soup = BeautifulSoup(self.__html_q.get(),"html.parser")
                for a in soup.find_all('a', href=True):
                    link=a.get('href')
                    """if not link.startswith('www'):
                        link=self.__prfx+link"""
                    if link not in self.__visited_urls:
                        self.__url_q.put(link)
                        self.__data_q.put(link)
            else:
                break

    def store(self):
        while True:
            if not self.__data_q.empty():
                cont=self.__data_q.get()
                print (cont)
            else:
                break

    def download_thread(self):
        while True:
            if not self.__url_q.empty():
                self.gethtml(self.__url_q.get())
                self.__url_q.task_done()

    def op(self,*urls):
        for x in range(25):
            d = threading.Thread(target=self.download_thread)
            d.setDaemon(True)
            d.start()
        for url in urls:
            self.__url_q.put(url)
        self.__url_q.join()
        self.mine_thread()
        self.store()

if __name__ == '__main__':
    urls=['https://en.wikipedia.org/wiki/Bajirao']#,'https://en.wikipedia.org/wiki/Malharrao_Holkar','https://en.wikipedia.org/wiki/Ranoji_Scindia']
    a=a3_5()
    a.op(*urls)
Essentially I had to arrange the queue so that the worker threads are started first and then fed the URLs, with __url_q.join() blocking until the downloads finish. Also, the mine_thread and store methods needed to run only after the download work completed, because otherwise the values wouldn't get stored. A minimal sketch of that worker/queue pattern is shown below.
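For reference, here is a stripped-down sketch of that pattern (daemon workers pulling from a Queue, the main thread feeding URLs and blocking on join()). The fetch logic is only a stand-in, not the crawler code above:
import threading
from queue import Queue
from urllib.request import urlopen

url_q = Queue()
results = []

def worker():
    # each worker loops forever, pulling URLs off the shared queue
    while True:
        url = url_q.get()
        try:
            results.append((url, len(urlopen(url).read())))
        except Exception as e:
            print("failed:", url, e)
        finally:
            url_q.task_done()   # always mark the item done so join() can return

# start the daemon workers first, then feed the queue
for _ in range(5):
    threading.Thread(target=worker, daemon=True).start()

for url in ['https://en.wikipedia.org/wiki/Bajirao']:
    url_q.put(url)

url_q.join()   # blocks until every queued URL has been processed
print(results)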

Related

Why is my code still slow after threading for only 15k records, and how do I fix this?

I have a script that takes links from a file, visits them, gets the redirected links, and stores them back. But it works too slowly on a file with 15k records. How can I make it faster? I have already used threading.
Please help me fix it. I've tried multiple approaches with threading, but I cannot make it fast. Is there any solution to my problem?
import concurrent.futures
import sys
import pandas as pd
import requests
from threading import Thread
from queue import Queue

out_put_file=""
linkes = None
out = []
urls = []
old = []
file_name =None
concurrent = 10000
q = None
count=0
df =None

def do_work():
    while True:
        global q
        url = q.get()
        res = get_status(url)
        q.task_done()

def get_status(o_url):
    try:
        res = requests.get(o_url)
        if res:
            out.append(res.url)
            old.append(o_url)
            print(count)
            count=count+1
            return [res.status_code,res.url ,o_url]
    except:
        pass
    return [ans.status_code,ans.url,url]

def process_data():
    global q
    global file_name
    global linkes
    global df
    file_name = input("Enter file name : ")
    file_name = file_name.strip()
    print("Generating .......")
    df = pd.read_csv(file_name+".csv")
    old_links =df["shopify"]
    for i in old_links:
        if type(i)!=str:
            urls.append(i)
            continue
        if not i.startswith("http"):
            linkes = "http://"+i
            urls.append(linkes)
        else:
            urls.append(i)
    df["shopify"]=urls
    q = Queue(concurrent * 2)
    for i in range(concurrent):
        t = Thread(target=do_work)
        t.daemon = True
        t.start()
    try:
        for url in urls:
            if type(url)!=str:
                continue
            q.put(url.strip())
        q.join()
    except KeyboardInterrupt:
        sys.exit(1)

process_data()
for i in range (len(df['shopify'])):
    for j in range(len(old)):
        if df['shopify'][i]==old[j]:
            df['shopify'][i]=out[j]
df = df[~df['shopify'].astype(str).str.startswith('http:')]
df = df.dropna()
df.to_csv(file_name+"-new.csv",index=False)
A sample of the input CSV:
Email,shopify,Proofy_Status_Name
hello#knobblystudio.com,http://puravidabracelets.myshopify.com,Deliverable
service#cafe-select.co.uk,cafe-select.co.uk,Deliverable
mtafich#gmail.com,,Deliverable
whoopies#stevessnacks.com,stevessnacks.com,Deliverable
customerservice#runwayriches.com,runwayriches.com,Deliverable
shop#blackdogride.com.au,blackdogride.com.au,Deliverable
anavasconcelos.nica#gmail.com,grass4you.com,Deliverable
info#prideandprestigehair.com,prideandprestigehair.com,Deliverable
info#dancinwoofs.com,dancinwoofs.com,Deliverable
Threads in Python do not run in parallel on multiple CPU cores because of the Global Interpreter Lock. You might want to use the multiprocessing module instead, or ProcessPoolExecutor() from concurrent.futures. If you decide to use ProcessPoolExecutor, pass the URLs to the callback and have the callback return the old and redirected URL, which you then retrieve through the result() method of the future returned by executor.submit(). Keep in mind that when using processes, global variables are not shared, unlike with threads.
There have been attempts to remove the global interpreter lock, but if I remember correctly, Python didn't run quite as fast without the GIL.
Something like the following might work. I renamed the concurrent variable because it would shadow the concurrent module and probably cause an error. This code is untested because I don't have the csv file to test with.
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor
import sys
import pandas as pd
import requests
import numpy as np
from threading import Thread
from queue import Queue

out_put_file=""
linkes = None
out = []
urls = []
old = []
futures = []
file_name =None
concurrent_ = 10000
q = None
count=0
df =None

def do_work(urls):
    results = []
    for url in urls:
        res = get_status(url)
        if res:
            results.append((res[2], res[1]))
        else:
            results.append((url, url))
    return results

def get_status(o_url):
    try:
        res = requests.get(o_url)
        if res:
            out.append(res.url)
            old.append(o_url)
            #print(count)
            #count=count+1
            return [res.status_code,res.url ,o_url]
    except:
        pass

def load_url(url, timeout):
    ans = requests.get(url, timeout=timeout)
    return [ans.status_code,ans.url,url]

def process_data():
    global q
    global file_name
    global linkes
    global df
    global urls
    file_name = input("Enter file name : ")
    file_name = file_name.strip()
    print("Generating .......")
    df = pd.read_csv(file_name+".csv")
    old_links =df["shopify"]
    for i in old_links:
        if type(i)!=str:
            urls.append(i)
            continue
        if not i.startswith("http"):
            linkes = "http://"+i
            urls.append(linkes)
        else:
            urls.append(i)
    df["shopify"]=urls
    workers = 50
    with ProcessPoolExecutor(max_workers=workers) as executor:
        url_arrays = np.array_split(urls, workers)
        for urls in url_arrays:
            f = executor.submit(do_work, urls)
            futures.append(f)

process_data()
df['shopify'] = [res[1] for f in concurrent.futures.as_completed(futures) for res in f.result()]
df = df[~df['shopify'].astype(str).str.startswith('http:')]
df = df.dropna()
df.to_csv(file_name+"-new.csv",index=False)
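One caveat on the code above (my observation; the answer is explicitly untested): as_completed() yields futures in whatever order they happen to finish, while the df['shopify'] assignment needs the results in row order. Iterating the futures list in submission order keeps the results aligned with the split input, for example:
# collect results in submission order so they stay aligned with the dataframe rows
ordered = []
for f in futures:   # submission order, not completion order
    ordered.extend(res[1] for res in f.result())
df['shopify'] = ordered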

I'm trying to cancel tasks in a running event loop using Python asyncio

Hi, I'm stuck on a problem with asyncio. I'm using Python 3.7.3.
(Sorry, I'm not a native English speaker.)
I'm writing a script to get lyrics from Genius.
This is my script.
It makes up to 6 requests in total if it can't get a result, issued 2 at a time, so essentially 2 * 3.
It checks the result, and once I have the lyric I want to stop the other tasks,
to make fewer requests.
So I used cancel() and tried to raise asyncio.exceptions.CancelledError once I got the lyric, but it doesn't work well.
It shows RuntimeError: Event loop is closed and I don't know why it doesn't work.
Please, could someone familiar with this situation help me?
import asyncio
import requests
from bs4 import BeautifulSoup, Comment

#Lyrics__Container-sc-1ynbvzw-2
class Lyric():
    def __init__(self, artist, song_name):
        self.artist = artist
        self.song_name = song_name
        self.__gtask = []
        self.__canceled = False
        self.__lyric = ''
        self.genius_url = self.make_genius_url(artist, song_name)
        lyric = self.lyric_from_genius(self.genius_url)
        print(lyric)

    def make_genius_url(self, artist, song_name):
        search_song = f'{artist} {song_name}'
        search_song = search_song.replace(' ', '-')
        print(search_song)
        return f'https://genius.com/{search_song}-lyrics'

    def get_soup(self, url):
        r = requests.get(url)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, 'lxml')
            return soup
        else:
            return False

    def scrape_genius(self, url):
        soup = self.get_soup(url)
        if soup and not self.__canceled:
            lyric_soup = soup.select('.song_body-lyrics .lyrics p')
            if lyric_soup:
                self.__canceled = True
                tags = lyric_soup[0].find_all(['a', 'i'])
                for tag in tags:
                    tag.unwrap()
                print('ここから歌詞です。')  # "The lyrics start here."
                print(lyric_soup[0].text)
                self.__lyric = lyric_soup[0].text
                self.__gtask.cancel()
            else:
                print('歌詞情報を取得出来なかった。')  # "Couldn't get the lyric data."
        else:
            if self.__canceled:
                print('歌詞取得した')  # "Already got the lyrics."
            else:
                print('歌詞情報がない')  # "There is no lyric data."
                self.__gtask.cancel()

    def lyric_from_genius(self, url):
        async def main_loop(url):
            sem = asyncio.Semaphore(2)
            async def get_lyric_soup(url):
                async with sem:
                    await self.loop.run_in_executor(None, self.scrape_genius, url)
            # main_loop body: queue up six attempts and gather them
            for _ in range(6):
                self.__gtask += [get_lyric_soup(url)]
            return await asyncio.gather(*self.__gtask)

        try:
            self.loop = asyncio.new_event_loop()
            self.loop.run_until_complete(main_loop(url))
        except asyncio.exceptions.CancelledError as e:
            print("*** CancelledError ***", e)
        finally:
            if self.__lyric:
                return self.__lyric
            else:
                print('5回のリクエストで曲情報が取れなかった。')  # "Couldn't get the song data in 5 requests."

Lyric = Lyric('kamal', 'blue')
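For what it's worth, a minimal sketch of the cancellation pattern the question is aiming for (the first task to finish wins and the rest are cancelled) could look like the following. It uses plain asyncio tasks and a dummy fetch coroutine rather than the Genius scraping code above; note also that cancelling a task does not interrupt work already running inside an executor thread:
import asyncio
import random

async def fetch(attempt):
    # stand-in for one scraping attempt; replace with a real request
    await asyncio.sleep(random.uniform(0.1, 1.0))
    return f"lyric from attempt {attempt}"

async def main():
    tasks = [asyncio.ensure_future(fetch(i)) for i in range(6)]
    # wait for the first task to finish, then cancel the rest
    done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
    for task in pending:
        task.cancel()
    # let the cancelled tasks unwind; swallow their CancelledError
    await asyncio.gather(*pending, return_exceptions=True)
    return next(iter(done)).result()

if __name__ == '__main__':
    print(asyncio.run(main()))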

How to get simple threading to work in Python

I want to know how I can add simple threading to my code. At the moment it checks the sites one by one, and if a site isn't reachable it waits for the timeout before continuing with the next one, which slows everything down.
import requests
import sys
import time
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

with open("websites.txt", 'r') as websites:
    websites = websites.read().splitlines()
with open("para1.txt", 'r') as para1:
    para1 = para1.read().splitlines()
with open("para2.txt", 'r') as para2:
    para2 = para2.read().splitlines()

def main():
    for i in para1:
        for j in para2:
            for m in websites:
                try:
                    res = requests.get(m + i + j, verify=False, timeout=10)
                    print(m + i + j)
                    if res.status_code == 200:
                        print('Yes')
                    else:
                        print('No')
                except Exception as e:
                    print(e)
                except KeyboardInterrupt:
                    sys.exit()
                finally:
                    res.close()
                    time.sleep(1)

if __name__ == '__main__':
    main()
You can use ThreadPoolExecutor: move the part of the code that performs the request into a separate function and submit it to the executor:
import urllib3
import requests
from concurrent.futures import ThreadPoolExecutor, CancelledError, as_completed

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def check_func(url):
    response = requests.get(url, verify=False, timeout=10)
    return response.status_code == 200

def main():
    with open("websites.txt") as website_f, open("para1.txt") as para1_f, \
            open("para2.txt", 'r') as para2_f, ThreadPoolExecutor(max_workers=4) as executor:
        # read the files up front so the nested loops can iterate them repeatedly
        websites = website_f.read().splitlines()
        para1_lines = para1_f.read().splitlines()
        para2_lines = para2_f.read().splitlines()
        tasks = {}
        for website in websites:
            for para1 in para1_lines:
                for para2 in para2_lines:
                    url = website.rstrip() + para1.rstrip() + para2.rstrip()
                    tasks[executor.submit(check_func, url)] = url
        for task in as_completed(tasks):
            url = tasks[task]
            try:
                result = task.result()
            except KeyboardInterrupt:  # handling Ctrl + C
                for task in tasks:
                    task.cancel()  # won't cancel already finished or running futures
            except CancelledError:  # will never happen (normally)
                pass
            except Exception as e:
                print(url, "-", "ERROR", e)
            else:
                print(url, "-", "GOOD" if result else "BAD")

if __name__ == "__main__":
    main()
P.S. I haven't tested the entire code, so if there are any problems with it, write in the comments.
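A side note, not part of the original answer: if you don't need the per-URL exception handling in the main loop, executor.map() gives a slightly simpler shape for the same idea (the URLs here are placeholders; build them from your files as above):
from concurrent.futures import ThreadPoolExecutor
import requests

def check(url):
    try:
        return url, requests.get(url, verify=False, timeout=10).status_code == 200
    except Exception as e:
        return url, e

urls = ["https://example.com", "https://example.org"]  # placeholder list

with ThreadPoolExecutor(max_workers=4) as executor:
    # map() yields results in input order
    for url, result in executor.map(check, urls):
        print(url, "-", result)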

Python Curl - Illegal Characters Found in URL

error: (3, 'Illegal characters found in URL')
My URL contains special characters like [AVC_(1)_(P1)_0].
I can't get this to work. I tried encoding the URL, but that gives me "Could not resolve host: https%3A".
Please advise.
import sys
import Queue
import threading
import pycurl
import os
import urllib
from StringIO import StringIO

num_conn = 1

# Make a queue with (url, filename) tuples
queue = Queue.Queue()
with open('list.txt') as f:
    for line in f:
        print line
        queue.put((line, 'test.mp4'))
        if 'str' in line:
            break

# Check args
assert queue.queue, "no URLs given"
num_urls = len(queue.queue)
num_conn = min(num_conn, num_urls)
assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
print "PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)
print "----- Getting", num_urls, "URLs using", num_conn, "connections -----"

class WorkerThread(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while 1:
            try:
                url, filename = self.queue.get_nowait()
            except Queue.Empty:
                raise SystemExit
            #dirname = os.path.dirname(filename)
            #fp = open(dirname, "wb")\
            #url = urllib.quote(url.encode('utf-8'))
            fp = open(os.getcwd()+'/'+filename, "wb")
            curl = pycurl.Curl()
            curl.setopt(pycurl.URL, url)
            curl.setopt(pycurl.FOLLOWLOCATION, 1)
            curl.setopt(pycurl.MAXREDIRS, 5)
            curl.setopt(pycurl.CONNECTTIMEOUT, 30)
            curl.setopt(pycurl.TIMEOUT, 300)
            curl.setopt(pycurl.NOSIGNAL, 1)
            curl.setopt(pycurl.WRITEDATA, fp)
            try:
                curl.perform()
            except:
                import traceback
                traceback.print_exc(file=sys.stderr)
                sys.stderr.flush()
            curl.close()
            fp.close()
            sys.stdout.write(".")
            sys.stdout.flush()

# Start a bunch of threads
threads = []
for dummy in range(num_conn):
    t = WorkerThread(queue)
    t.start()
    threads.append(t)

# Wait for all threads to finish
for thread in threads:
    thread.join()
Why not use requests in lieu of pycurl, which would make your run method:
def run(self):
    # note: this needs `import requests` at the top of the script
    while True:
        try:
            url, filename = self.queue.get_nowait()
        except Queue.Empty:
            raise SystemExit
        with open(os.getcwd()+'/'+filename, "wb") as fp:
            #fp.write(requests.get(url).content)
            fp.write(requests.get(url, headers={'user-agent': 'CodeGuru'}).content)
I made a few other, stylistic changes.
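Also worth checking, as my own observation rather than part of the answer above: the 'Illegal characters found in URL' error is often caused by the trailing newline that each line read from list.txt still carries, and encoding the whole URL including the scheme is what produces "Could not resolve host: https%3A". A small sketch of stripping the line and percent-encoding only the path (clean_url and the sample URL are made up for illustration):
# sanitize a URL read from a file before handing it to pycurl
try:
    from urllib.parse import urlsplit, urlunsplit, quote   # Python 3
except ImportError:
    from urlparse import urlsplit, urlunsplit               # Python 2, as in the question
    from urllib import quote

def clean_url(raw_line):
    url = raw_line.strip()   # drop the trailing newline read from the file
    parts = urlsplit(url)
    # percent-encode special characters in the path only; leave scheme and host alone
    path = quote(parts.path, safe="/")
    return urlunsplit((parts.scheme, parts.netloc, path, parts.query, parts.fragment))

print(clean_url("https://example.com/video/[AVC_(1)_(P1)_0].mp4\n"))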

Python: Create new thread or process when button is clicked

In Python 3.6.4, what is a good approach for starting a new separate process or thread on every click of a button? I have written this code but it's not working the way I want it to.
from multiprocessing import process
import requests
import threading
from tkinter import *

def download():
    name=entry2.get()
    url=entry1.get()
    r = requests.head(url)
    if name:
        file_name = name
    else:
        file_name = url.split('/')[-1]
    try:
        file_size = int(r.headers['content-length'])
        part=file_size/4
        start=0
        end=part
    except:
        print ("Invalid URL")
        return
    print ('%s downloaded' % file_name)

def thread(url):
    file_name=entry2.get()
    r=requests.get(url)
    data=r.content
    with open('file_name','rb+')as fp:
        data1=fp.read()
    with open('file_name',"wb+") as fp:
        data1=fp.write(data)
    print("its working3")

if __name__=='__main__':
    p=process(target=download,args=())
    p.start()
    p.join()

root=Tk()
frame=Frame(root,width=500,height=450,bg="lightpink")
url1=Label(frame,text="enter url here")
name=Label(frame,text="enter the name of the file")
url1.grid(row=0,sticky=E)
name.grid(row=1,sticky=E)
entry1=Entry(frame)
entry2=Entry(frame)
entry1.grid(row=0,column=1)
entry2.grid(row=1,column=1)
button1=Button(frame,text="download" ,command=download)
button1.grid(row=2,column=0)
button3=Button(frame,text="quit",command=frame.quit)
button3.grid(row=2,column=1)
frame.grid()
print("its working4")
root.mainloop()
Does this do the job? It uses the threading module rather than multiprocessing:
#from multiprocessing import process
from threading import Thread as process
import requests
import threading
from tkinter import *

def download():
    name=entry2.get()
    url=entry1.get()
    r = requests.head(url)
    if name:
        file_name = name
    else:
        file_name = url.split('/')[-1]
    try:
        file_size = int(r.headers['content-length'])
        part=file_size/4
        start=0
        end=part
    except:
        print ("Invalid URL")
        return
    print ('%s downloaded' % file_name)

def thread(url):
    file_name=entry2.get()
    r=requests.get(url)
    data=r.content
    with open('file_name','rb+')as fp:
        data1=fp.read()
    with open('file_name',"wb+") as fp:
        data1=fp.write(data)
    print("its working3")

root=Tk()
frame=Frame(root,width=500,height=450,bg="lightpink")
url1=Label(frame,text="enter url here")
name=Label(frame,text="enter the name of the file")
url1.grid(row=0,sticky=E)
name.grid(row=1,sticky=E)
entry1=Entry(frame)
entry2=Entry(frame)
entry1.grid(row=0,column=1)
entry2.grid(row=1,column=1)
button1=Button(frame,text="download" ,command=lambda: process (target = download).start ())
button1.grid(row=2,column=0)
button3=Button(frame,text="quit",command=root.destroy)
button3.grid(row=2,column=1)
frame.grid()
print("its working4")
root.mainloop()
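A brief note on the design, as an addition rather than part of the answer: the lambda starts a fresh Thread on every click, so the callback returns immediately and mainloop() keeps running, whereas the original p.join() before mainloop() would block the GUI. If the worker has to update a widget when it finishes, scheduling that update through root.after keeps all Tk calls on the main thread. A small illustrative sketch (the widget names here are made up):
import threading
import tkinter as tk

root = tk.Tk()
status = tk.Label(root, text="idle")
status.pack()

def work():
    # pretend to download something, then hand the GUI update back to the main thread
    import time; time.sleep(2)
    root.after(0, lambda: status.config(text="done"))

tk.Button(root, text="download",
          command=lambda: threading.Thread(target=work, daemon=True).start()).pack()
root.mainloop()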
