I need to extract all the URLs from an IP list.
I wrote this Python script, but I have an issue: the same IP gets extracted multiple times (several threads end up working on the same IP).
Could anyone improve on my multithreading solution?
Sorry for my English.
Thanks all
import urllib2, os, re, sys, os, time, httplib, thread, argparse, random

try:
    ListaIP = open(sys.argv[1], "r").readlines()
except(IOError):
    print "Error: Check your IP list path\n"
    sys.exit(1)

def getIP():
    if len(ListaIP) != 0:
        value = random.sample(ListaIP, 1)
        ListaIP.remove(value[0])
        return value
    else:
        print "\nThe IP list is finished\n"
        sys.exit(1)

def extractURL(ip):
    print ip + '\n'
    page = urllib2.urlopen('http://sameip.org/ip/' + ip)
    html = page.read()
    links = re.findall(r'href=[\'"]?([^\'" >]+)', html)
    outfile = open('2.log', 'a')
    outfile.write("\n".join(links))
    outfile.close()

def start():
    while True:
        if len(ListaIP) != 0:
            test = getIP()
            IP = ''.join(test).replace('\n', '')
            extractURL(IP)
        else:
            break

for x in range(0, 10):
    thread.start_new_thread(start, ())

while 1:
    pass
Use a threading.Lock. The lock should be global and created at the beginning, when you create the IP list. Call lock.acquire() at the start of getIP() and release it before you leave the method.
What you are seeing is: thread 1 executes value = random.sample(...), and then thread 2 also executes value = random.sample(...) before thread 1 gets to the remove(). The item is therefore still in the list when thread 2 samples it, so both threads have a chance of getting the same IP.
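A minimal sketch of that fix applied to the question's getIP() (the lock name is my own choice; ListaIP is the global list loaded at the top of the script):

import threading
import random
import sys

lock = threading.Lock()  # global, created right after ListaIP is loaded

def getIP():
    lock.acquire()
    try:
        if len(ListaIP) != 0:
            value = random.sample(ListaIP, 1)
            ListaIP.remove(value[0])  # sample + remove now happen atomically
            return value
        else:
            print "\nThe IP list is finished\n"
            sys.exit(1)
    finally:
        lock.release()  # released on every path out of the method

The same thing can be written more compactly as a with lock: block, which acquires and releases automatically.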
Related
I have the following code:
#!/usr/bin/env python
# coding=utf-8
import threading
import requests
import Queue
import sys
import re

# ip to num
def ip2num(ip):
    ip = [int(x) for x in ip.split('.')]
    return ip[0] << 24 | ip[1] << 16 | ip[2] << 8 | ip[3]

# num to ip
def num2ip(num):
    return '%s.%s.%s.%s' % ((num & 0xff000000) >> 24,
                            (num & 0x00ff0000) >> 16,
                            (num & 0x0000ff00) >> 8,
                            num & 0x000000ff)

def ip_range(start, end):
    return [num2ip(num) for num in range(ip2num(start), ip2num(end) + 1) if num & 0xff]

def bThread(iplist):
    threadl = []
    queue = Queue.Queue()
    for host in iplist:
        queue.put(host)
    for x in xrange(0, int(SETTHREAD)):
        threadl.append(tThread(queue))
    for t in threadl:
        t.start()
    for t in threadl:
        t.join()

# create thread
class tThread(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while not self.queue.empty():
            host = self.queue.get()
            try:
                checkServer(host)
            except:
                continue

def checkServer(host):
    ports = [80]
    for k in ports:
        try:
            aimurl = "http://" + host + ":" + str(k)
            response = requests.get(url=aimurl, timeout=3)
            serverText = response.headers['server']
            if (response.status_code) == 403:
                print "-" * 50 + "\n" + aimurl + " Server: " + serverText
        except:
            pass

if __name__ == '__main__':
    print '\n############# CDN IP #############'
    print ' '
    print '################################################\n'
    global SETTHREAD
    try:
        SETTHREAD = sys.argv[2]
        iplist = []
        file = open(sys.argv[1], 'r')
        tmpIpList = file.readlines()
        for ip in tmpIpList:
            iplist.append(ip.rstrip("\n"))
        print '\nScanning ' + str(len(iplist)) + " IP's...\n"
        bThread(iplist)
    except KeyboardInterrupt:
        print 'Keyboard Interrupt!'
        sys.exit()
This script works as follows: a range of IPs is entered:
python2 script.py 104.0.0.0-104.0.1.255 100 (100 is the number of threads)
I want to add support so that it can also read the IPs from a file, while keeping the range syntax working:
python2 script.py ips.txt 100
I tried this:
file = open(sys.argv[1], 'r')
iplist = file.readlines()
But it does not work.
Edit 1: I added the file-reading code recommended by user Syed Hasan; the problem now seems to be in the bThread(iplist) function.
I assume you're attempting to use iplist the same way your CLI input was being parsed. However, readlines simply reads the entire file at once and keeps the trailing newline (\n) on each line (provided the IPs in the file are separated by newlines).
So you are currently getting a list of IPs, each with a trailing newline character. Try stripping it from the right end using rstrip:
file = open(sys.argv[1], 'r')
tmpIpList = file.readlines()
for ip in tmpIpList:
    iplist.append(ip.rstrip("\n"))
How you switch between the two modes (range vs. file) is a challenge you should attempt to solve. Perhaps use command-line parameters to identify the mode of operation (look into the argparse library); a sketch of one way to do it follows.
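This is only a minimal sketch, not the library's prescribed way: it assumes a range always contains a '-' and anything else is treated as a file path, and it reuses ip_range() and bThread() from your script (load_targets is a name I made up):

import os
import sys

def load_targets(arg):
    """Return a list of IPs from either a 'start-end' range or a file path."""
    if '-' in arg and not os.path.isfile(arg):
        start, end = arg.split('-', 1)
        return ip_range(start, end)      # ip_range() from the question's script
    with open(arg) as f:
        return [line.strip() for line in f if line.strip()]

if __name__ == '__main__':
    SETTHREAD = sys.argv[2]
    iplist = load_targets(sys.argv[1])
    print '\nScanning ' + str(len(iplist)) + " IP's...\n"
    bThread(iplist)                      # bThread() from the question's script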
I am making a program that generates a .onion URL and then tests the URL to see if it exists. The issue is that I have set it up to use multiprocessing, yet it only uses one core, even when multiple processes are running.
My code is:
import requests
import time
import sys
import threading
import random
import string
import multiprocessing

proxies = {
    'http': 'socks5h://127.0.0.1:9150',
    'https': 'socks5h://127.0.0.1:9150'
}

try:
    print("Your hidden ip is: " + requests.get("https://api.ipify.org/?", proxies=proxies, timeout=10).text)
    print("Checks at intervals to check if the connection is still established (pings google.com)")
except Exception as e:
    print("Make sure tor is running or check that a socks proxy is running at 127.0.0.1:9150!")
    sys.exit(0)

def checkSite(site):
    if random.randint(0, 100) == 5:
        site = "https://google.com"
    try:
        data = requests.get(site, proxies=proxies, timeout=10)
        if data.status_code == 200:
            print('Website exists {}'.format(site))
            sys.exit(0)
            print("hello")
        else:
            return
            #print('Website does not exist {}'.format(site))
    except Exception:
        #print('Website does not exist {}'.format(site))
        return

def genSite():
    site = ""
    x = string.ascii_lowercase + "234562"
    for i in range(0, 16):
        site = site + x[random.randint(0, len(x) - 1)]
    return ("http://" + site + ".onion")

def findSite():
    while True:
        checkSite(genSite())

if __name__ == "__main__":
    processes = []
    for _ in range(int(input("Enter number of processes: "))):
        p = multiprocessing.Process(target=findSite)  # create a new Process
        processes.append(p)
        p.start()
    for process in processes:
        process.join()
I have tried running a factorial function instead, and that utilizes all cores; however, only one core is used when running the findSite function. I don't know whether this is a networking issue and threads should be used instead, or whether my code is wrong. I have also tried joining the processes directly after they are started, and putting the join statement in different places, but this has not worked.
So I have been playing around with multiprocessing, and I wanted to take it a step further: process 1 should read the first line from the text file, process 2 the second line, and so on...
txt file:
helloworld#world.com
helloworld2#world.com
helloworld3#world.com
helloworld4#world.com
helloworld5#world.com
and this is how the code looks:
def info(thread):
    global prod
    prod = int(thread) + 1
    runit()

def runit():
    log("Profile-" + str(prod) + Fore.GREEN + ' - ' + email)
    # From here I can then use the email for each worker. Or that's the plan at least:
    # every worker should have its own email that can be used in here.
    sys.exit()

def main():
    user_input = 0
    while True:
        try:
            user_input = int(input(Fore.WHITE + 'How many tasks do you wanna run? [NUMBERS] \n' + Fore.RESET))
        except ValueError:
            print(Fore.RED + "Stop being stupid" + Fore.RESET)
            continue
        else:
            with open('email.txt') as f:
                content = f.readlines()
            content = [x.strip('\n') for x in content]
            try:
                for i, email in enumerate(content):
                    print(email)
            except ValueError as e:
                print(e)
            HowManyThread = user_input
            i = 0
            jobs = []
            for i in range(HowManyThread):
                p = multiprocessing.Process(target=info, args=(str(i),))
                jobs.append(p)
                time.sleep(.5)
                p.start()
            for p in jobs:
                p.join()
            sys.exit()
log just writes a log message, nothing special.
Fore.COLOR <-- Colorama
However, I have no idea what I should do to actually make each process take its own email row. So basically:
Process-1 to take helloworld#world.com
Process-2 to take helloworld2#world.com
Process-3 to take helloworld3#world.com
Process-4 to take helloworld4#world.com
Process-5 to take helloworld5#world.com
Any suggestions on how I can do this? I'm completely stuck and have no idea how to move forward.
Update
from multiprocessing import pool, Process, Queue
from tqdm import tqdm

with open('email.txt') as f:
    content = f.readlines()

global email_list
email_list = [x.strip('\n') for x in content]

def info(thread):
    global prod
    prod = int(thread) + 1
    runit()

def runit(email_index):
    email = email_list[email_index]
    log("Profile-" + str(prod) + Fore.GREEN + ' - ' + email)
    sys.exit()

def main():
    wipe()
    text()
    Infotext = "First name : Last name : Email: : Random char + Street"
    with open('data.json', 'w') as f:
        json.dump(Infotext, f)
        f.write("\n")
    with Pool(8) as pool:
        result_list = list(tqdm(pool.imap_unordered(, range(len(email_list)), chunksize=5), total=len(email_list))))

if __name__ == '__main__':
    try:
        main()
    except Exception as e:
        print(e)
        print(traceback.print_exc())
        print(traceback)
The following approach delegates the multiprocessing to a pool of workers, each of which receives a chunk of indices and processes these indices one line at a time (the choice of poolsize=8 and chunksize=5 here is arbitrary and can be tuned to your requirements).
The results of all workers are then collected into a final list. Note that imap_unordered is only appropriate if you don't care about the order in which the lines are processed (i.e. result_list does not maintain the original order of content).
from multiprocessing import Pool
# progress bar to track your multiproc
from tqdm import tqdm

with open('email.txt') as f:
    content = f.readlines()

# this list will be accessed by each worker
global email_list
email_list = [x.strip('\n') for x in content]

# define the function that a worker will apply to each email:
# it gets sent an index into the list of emails,
# accesses the email at that index, performs its work and returns
def runit(email_index):
    email = email_list[email_index]
    # do the stuff you're interested in for a single email

# run the multiprocessing to get your results:
# this sends the indices for the emails out to the workers
# and collects the results of runit into result_list
with Pool(8) as pool:
    result_list = list(tqdm(pool.imap_unordered(runit,
                                                range(len(email_list)),
                                                chunksize=5),
                            total=len(email_list)))
What you need is a pool of worker processes, although for your use case I really wonder whether threads (or multiprocessing.dummy) would not be enough.
A pool starts the requested number of worker processes, and you can submit asynchronous tasks to the pool, each of which will be handled by the first free worker.
A stripped-down version of your example (no fancy printing, no unnecessary reading of a sequential file into a list) could be:
import multiprocessing
import time

def runit(prod, email):
    print("Profile-" + str(prod) + ' - ' + email)
    # From here I can then use the email for each worker.
    # sys.exit()  # NEVER CALL sys.exit() EXPLICITLY IN A WORKER PROCESS
    time.sleep(1)  # to add a delay inside each task

def main():
    while True:
        try:
            HowManyThread = int(input(
                'How many tasks do you wanna run? [NUMBERS] \n'))
        except ValueError:
            print("Stop being stupid")
            continue
        if HowManyThread == 0: break
        pool = multiprocessing.Pool(HowManyThread)
        with open('email.txt') as f:
            for i, email in enumerate(f):
                email = email.strip()
                # runit will be run by a worker process
                pool.apply_async(runit, (i, email))
        pool.close()  # no more tasks to add
        pool.join()   # wait for the last worker to end

if __name__ == "__main__":
    main()
So I've started learning Python now, and I'm absolutely in love with it.
I'm building a small-scale Facebook data scraper. Basically, it uses the Graph API and scrapes the first names of a specified number of users. It works fine in a single thread (or with no threads, I guess).
I used online tutorials to come up with the following multithreaded version (updated code):
import requests
import json
import time
import threading
import Queue

GraphURL = 'http://graph.facebook.com/'
first_names = {}  # will store first names and their counts
queue = Queue.Queue()

def getOneUser(url):
    http_response = requests.get(url)  # open the request URL
    if http_response.status_code == 200:
        data = http_response.text.encode('utf-8', 'ignore')  # Get the text of response, and encode it
        json_obj = json.loads(data)  # load it as a json object
        # name = json_obj['name']
        return json_obj['first_name']
        # last = json_obj['last_name']
    return None

class ThreadGet(threading.Thread):
    """ Threaded name scraper """
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            #print 'thread started\n'
            url = GraphURL + str(self.queue.get())
            first = getOneUser(url)  # get one user's first name
            if first is not None:
                if first_names.has_key(first):  # if name has been encountered before
                    first_names[first] = first_names[first] + 1  # increment the count
                else:
                    first_names[first] = 1  # add the new name
            self.queue.task_done()
            #print 'thread ended\n'

def main():
    start = time.time()
    for i in range(6):
        t = ThreadGet(queue)
        t.setDaemon(True)
        t.start()
    for i in range(100):
        queue.put(i)
    queue.join()
    for name in first_names.keys():
        print name + ': ' + str(first_names[name])
    print '----------------------------------------------------------------'
    print '================================================================'
    # Print top first names
    for key in first_names.keys():
        if first_names[key] > 2:
            print key + ': ' + str(first_names[key])
    print 'It took ' + str(time.time() - start) + 's'

main()
To be honest, I don't understand some parts of the code, but I get the main idea. The output is nothing; I mean the shell has nothing in it, so I believe it keeps on running.
So what I am doing is filling the queue with integers that are user IDs on Facebook. Each ID is then used to build the API call URL, and getOneUser returns the first name of one user at a time. That task (ID) is marked as done and it moves on.
What is wrong with the code above?
Your usage of first_names is not thread-safe; you could add a lock to protect the increment. Otherwise the code should work. You might also be hitting some Facebook API limit, i.e., you should limit your request rate.
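For example, a minimal sketch of guarding the counter with a lock (the lock name and helper function are my own, not part of your code):

import threading

first_names = {}                     # first names and their counts
first_names_lock = threading.Lock()

def count_name(first):
    # serialize check-and-increment so two threads can't race on the same key
    with first_names_lock:
        first_names[first] = first_names.get(first, 0) + 1

Each worker would then call count_name(first) instead of touching the dict directly.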
You could simplify your code by using a thread pool and counting the names in the main thread:
#!/usr/bin/env python
import json
import urllib2
from collections import Counter
from multiprocessing.dummy import Pool  # use threads

def get_name(url):
    try:
        return json.load(urllib2.urlopen(url))['first_name']
    except Exception:
        return None  # error

urls = ('http://graph.facebook.com/%d' % i for i in xrange(100))
p = Pool(5)  # 5 concurrent connections
first_names = Counter(p.imap_unordered(get_name, urls))
print first_names.most_common()
To see what errors you get, you could add logging:
#!/usr/bin/env python
import json
import logging
import urllib2
from collections import Counter
from multiprocessing.dummy import Pool  # use threads

logging.basicConfig(level=logging.DEBUG,
                    format="%(asctime)s %(threadName)s %(message)s")

def get_name(url):
    try:
        name = json.load(urllib2.urlopen(url))['first_name']
    except Exception as e:
        logging.debug('error: %s url: %s', e, url)
        return None  # error
    else:
        logging.debug('done url: %s', url)
        return name

urls = ('http://graph.facebook.com/%d' % i for i in xrange(100))
p = Pool(5)  # 5 concurrent connections
first_names = Counter(p.imap_unordered(get_name, urls))
print first_names.most_common()
A simple way to limit the number of requests in a given time period is to use a semaphore:
#!/usr/bin/env python
import json
import logging
import time
import urllib2
from collections import Counter
from multiprocessing.dummy import Pool  # use threads
from threading import _BoundedSemaphore as BoundedSemaphore, Timer

logging.basicConfig(level=logging.DEBUG,
                    format="%(asctime)s %(threadName)s %(message)s")

class RatedSemaphore(BoundedSemaphore):
    """Limit to 1 request per `period / value` seconds (over long run)."""
    def __init__(self, value=1, period=1):
        BoundedSemaphore.__init__(self, value)
        t = Timer(period, self._add_token_loop,
                  kwargs=dict(time_delta=float(period) / value))
        t.daemon = True
        t.start()

    def _add_token_loop(self, time_delta):
        """Add token every time_delta seconds."""
        while True:
            try:
                BoundedSemaphore.release(self)
            except ValueError:  # ignore if already max possible value
                pass
            time.sleep(time_delta)  # ignore EINTR

    def release(self):
        pass  # do nothing (only time-based release() is allowed)

def get_name(gid, rate_limit=RatedSemaphore(value=100, period=600)):
    url = 'http://graph.facebook.com/%d' % gid
    try:
        with rate_limit:
            name = json.load(urllib2.urlopen(url))['first_name']
    except Exception as e:
        logging.debug('error: %s url: %s', e, url)
        return None  # error
    else:
        logging.debug('done url: %s', url)
        return name

p = Pool(5)  # 5 concurrent connections
first_names = Counter(p.imap_unordered(get_name, xrange(200)))
print first_names.most_common()
After the initial burst, it should make a single request every 6 seconds.
Consider using batch requests.
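As a rough sketch only: at the time, the Graph API exposed a batch call that accepted a JSON array of sub-requests in a single POST (it required an access token, and both the token and the exact request format below are assumptions you would need to check against the current documentation):

import json
import urllib
import urllib2

ACCESS_TOKEN = 'YOUR_TOKEN_HERE'  # assumption: you have a valid token
batch = [{"method": "GET", "relative_url": "%d?fields=first_name" % i}
         for i in xrange(1, 51)]  # batches were capped at 50 sub-requests

post_data = urllib.urlencode({"batch": json.dumps(batch),
                              "access_token": ACCESS_TOKEN})
responses = json.load(urllib2.urlopen("https://graph.facebook.com", post_data))

first_names = [json.loads(r["body"]).get("first_name")
               for r in responses if r and r.get("code") == 200]
print first_names

One batch call replaces up to 50 individual requests, which also helps with the rate limit discussed above.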
Your original run function only processed one item from the queue; in all, you only removed five items from the queue.
Usually run functions look like:
def run(self):
    while True:
        doUsefulWork()
i.e. they have a loop which causes the recurring work to be done.
[Edit] OP edited code to include this change.
Some other useful things to try:
Add a print statement to the run function: you'll find that it is only called five times.
Remove the queue.join() call; this is what is causing the module to block, and you will then be able to probe the state of the queue.
Put the entire body of run into a function. Verify that you can use that function in a single-threaded manner to get the desired results, then try it with just a single worker thread, and finally go for multiple worker threads (see the sketch below).
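A minimal sketch of those last steps, reusing the question's GraphURL, getOneUser, first_names and queue (process_one is a name I made up):

def process_one(user_id):
    """The body of run() for a single queue item; callable without any threads."""
    url = GraphURL + str(user_id)
    first = getOneUser(url)  # get one user's first name
    if first is not None:
        first_names[first] = first_names.get(first, 0) + 1

# 1. single-threaded sanity check
for i in range(5):
    process_one(i)
print first_names

# 2. once that works, run() inside ThreadGet reduces to a loop over the queue
def run(self):
    while True:
        process_one(self.queue.get())
        self.queue.task_done()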
I'm obviously missing something here. This is the same project I've been working on for a number of days. Stepping through it bit by bit, it seemed to be working fine. Then I added in the portion of the main() function that actually creates the comparison lists, and suddenly it starts throwing a "cannot pop from empty list" error at me, even though a print call I've placed ahead of the pop() clearly shows that the list is not empty. Any ideas what I'm doing wrong? And is this monstrosity actually going to work the way I intend? This is my first time working with threads and all. Here is the code in its entirety:
import urllib
import urllib2
import sys
from lxml.html import parse, tostring, fromstring
from urlparse import urlparse
import threading

class Crawler(threading.Thread):
    def __init__(self):
        self.links = []
        self.queue = []
        self.mal_list = []
        self.count = 0
        self.mal_set = set(self.mal_list)
        self.crawled = []
        self.crawled_set = set(self.crawled)
        self.links_set = set(self.links)
        self.queue.append(sys.argv[1])
        self.queue_set = set(self.queue)

    def run(self, max_depth):
        print(self.queue)
        while self.count < max_depth:
            tgt = self.queue.pop(0)
            if tgt not in self.mal_set:
                self.crawl(tgt)
            else:
                print("Malicious Link Found: {0}".format(tgt))
                continue
        sys.exit("Finished!")

    def crawl(self, tgt):
        url = urlparse(tgt)
        self.crawled.append(tgt)
        try:
            print("Crawling {0}".format(tgt))
            request = urllib2.Request(tgt)
            request.add_header("User-Agent", "Mozilla/5,0")
            opener = urllib2.build_opener()
            data = opener.open(request)
            self.count += 1
        except:
            return
        doc = parse(data).getroot()
        for tag in doc.xpath("//a[@href]"):
            old = tag.get('href')
            fixed = urllib.unquote(old)
            self.links.append(fixed)
            self.queue_links(self.links_set, url)

    def queue_links(self, links, url):
        for link in links:
            if link.startswith('/'):
                link = "http://" + url.netloc + "/" + link
            elif link.startswith('#'):
                continue
            elif link.startswith('http'):
                link = 'http://' + url.netloc + '/' + link
            if link.decode('utf-8') not in self.crawled_set:
                self.queue.append(link)

    def make_mal_list(self):
        """
        Open various malware and phishing related blacklists and create a list
        of URLS from which to compare to the crawled links
        """
        hosts1 = "hosts.txt"
        hosts2 = "MH-sitelist.txt"
        hosts3 = "urls.txt"
        with open(hosts1) as first:
            for line1 in first.readlines():
                link = "http://" + line1.strip()
                self.mal_list.append(link)
        with open(hosts2) as second:
            for line2 in second.readlines():
                link = "http://" + line2.strip()
                self.mal_list.append(link)
        with open(hosts3) as third:
            for line3 in third.readlines():
                link = "http://" + line3.strip()
                self.mal_list.append(link)

def main():
    crawler = Crawler()
    crawler.make_mal_list()
    crawler.run(25)

if __name__ == "__main__":
    main()
First of all, I got a bit lost while reading your code, so let me give you a few remarks first, if I may:
Too many instance variables: you don't have to create a new instance variable just to hold a set() of another variable, as in self.mal_set = set(self.mal_list), and you are repeating the same pattern many times.
If you want to use threading, then really use it: in your code you are creating only one thread. You should create something like 10 threads, each dealing with a bunch of URLs it should fetch, and don't forget to put the work in a Queue.Queue to synchronize between the threads.
EDIT: Ah, I forgot: indent your code :)
Now about your problem:
Where do you fill self.queue? I don't see it. You are just calling the make_mal_list() method, which initializes only self.mal_list, so when you later run your own thread it's no surprise that self.queue is empty and you can't pop(), right?
EDIT 2:
I think your real example is more complicated (using blacklists and all this stuff...), but you can start with something like this:
import threading
import Queue
import sys
import urllib
import urllib2
from lxml.html import parse
from urlparse import urlparse

THREAD_NUMBER = 10

class Crawler(threading.Thread):

    def __init__(self, queue, mal_urls):
        self.queue = queue
        self.mal_list = mal_urls
        self.crawled_set = set()  # keep track of already-crawled links
        threading.Thread.__init__(self)  # i forgot, thanks seriyPS :)

    def run(self):
        while True:
            # Grab a url to fetch from the queue.
            url = self.queue.get()
            if url not in self.mal_list:
                self.crawl(url)
            else:
                print "Malicious Link Found: {0}".format(url)
            # Signal to the queue that the job is done.
            self.queue.task_done()

    def crawl(self, tgt):
        try:
            url = urlparse(tgt)
            print("Crawling {0}".format(tgt))
            request = urllib2.Request(tgt)
            request.add_header("User-Agent", "Mozilla/5,0")
            opener = urllib2.build_opener()
            data = opener.open(request)
        except:  # TODO: catch explicit exceptions: URLError, ValueError ...
            return
        doc = parse(data).getroot()
        for tag in doc.xpath("//a[@href]"):
            old = tag.get('href')
            fixed = urllib.unquote(old)
            # I don't think you need this, but maybe I'm mistaken.
            # self.links.append(fixed)
            # Add more URLs to the queue.
            self.queue_links(fixed, url)

    def queue_links(self, link, url):
        """I guess this method allows recursive download of urls that will
        be fetched from the web pages ????
        """
        # for link in links:  # I changed the argument, so link is now just one url.
        if link.startswith('/'):
            link = "http://" + url.netloc + "/" + link
        elif link.startswith('#'):
            return
        elif link.startswith('http'):
            link = 'http://' + url.netloc + '/' + link
        # Add urls extracted from the HTML text to the queue so they get fetched.
        if link.decode('utf-8') not in self.crawled_set:
            self.queue.put(link)

def get_make_mal_list():
    """Open various malware and phishing related blacklists and create a list
    of URLS from which to compare to the crawled links
    """
    hosts1 = "hosts.txt"
    hosts2 = "MH-sitelist.txt"
    hosts3 = "urls.txt"
    mal_list = []
    with open(hosts1) as first:
        for line1 in first:
            link = "http://" + line1.strip()
            mal_list.append(link)
    with open(hosts2) as second:
        for line2 in second:
            link = "http://" + line2.strip()
            mal_list.append(link)
    with open(hosts3) as third:
        for line3 in third:
            link = "http://" + line3.strip()
            mal_list.append(link)
    return mal_list

def main():
    queue = Queue.Queue()
    # Get malicious URLs.
    mal_urls = set(get_make_mal_list())
    # Create THREAD_NUMBER threads and start them.
    for i in xrange(THREAD_NUMBER):
        cr = Crawler(queue, mal_urls)
        cr.start()
    # Get all the urls that you want to fetch and put them in the queue.
    for url in sys.argv[1:]:
        queue.put(url)
    # Wait on the queue until everything has been processed.
    queue.join()

if __name__ == '__main__':
    main()
Small offtopic:
class Crawler(threading.Thread):
    def __init__(self):
        # your code
        threading.Thread.__init__(self)  # !!!
Don't forget to call Thread.__init__(self) directly if you override the __init__ function.
And, of course, you must use the Queue class (http://docs.python.org/library/queue.html) to implement your job queue in a thread-safe way.
My primary language is C#, but the issue you are experiencing is caused by threading. In thread #1 you check that the list is not empty, while thread #2 empties that list, and thus you receive the exception.
A list is not thread-safe. If you need a thread-safe data structure, use Queue.Queue (Python 2.x) or queue.Queue (Python 3.x).
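A minimal sketch of the swap, assuming Python 2 and one URL per item as in your code (a blocking get() replaces the pop(0) that can raise on an empty list):

import Queue

queue = Queue.Queue()            # thread-safe replacement for the plain list
queue.put("http://example.com")  # producers call put() instead of append()

tgt = queue.get()                # consumers call get() instead of pop(0); blocks until an item exists
try:
    print "processing", tgt      # e.g. self.crawl(tgt) in your code
finally:
    queue.task_done()            # lets queue.join() know this item is finished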
Also, look at this fragment:
print(self.queue)
while self.count < max_depth:
    tgt = self.queue.pop(0)
You do print(self.queue) only once, before the first while iteration, so self.queue.pop() can run for many iterations (and fetch many links) and raise "cannot pop from empty list" only when the queue is really empty!
Try this:
while self.count < max_depth:
    print(self.queue)
    tgt = self.queue.pop(0)
to detect the moment when you get the exception.