How to set a timeout in a web crawler? - python

I am very new to Python and am trying to develop a very simple web crawler. My web crawler works well, but it gets stuck on one link for a long time. How can I set up a timeout?
How do I deal with urllib2.HTTPError? Is my except statement correct?
def get_link(page):
    start = page.find('<a href=')
    if start == -1:
        return None, 0
    startp = page.find('"', start)
    endp = page.find('"', startp + 1)
    url = page[startp + 1:endp]
    return url, endp

def get_all_link(page):
    allurl = []
    while True:
        url, endp = get_link(page)
        if url:
            page = page[endp:]
            allurl.append(url)
        else:
            return allurl

def get_page(page, tocrawl):
    import urllib2
    try:
        page_source = urllib2.urlopen(page)
        return page_source.read()
    except:
        page = tocrawl.pop()
        raise

def validate(page):
    valid = page.find('http')
    if valid == -1:
        return 0
    return 1

def crawler(seed):
    tocrawl = [seed]
    crawled = []
    i = 0
    while tocrawl:
        page = tocrawl.pop()
        valid = validate(page)
        if valid:
            if page not in crawled:
                tocrawl = set(tocrawl) | set(get_all_link(get_page(page, tocrawl)))
                crawled.append(page)
                i = i + 1
                f = open("crawled.txt", "a")
                f.write(repr(i) + " : " + repr(page) + "\n")
                f.close()
    return crawled

crawler("http://google.com")

Related

requests_html stop website from redirecting

I am trying to scrape the following link https://9anime.to/watch/one-piece-dub.34r/r2wjlq using python/requests_html.
My problem is that it gets auto-redirected to the default server tab instead of the mp4upload tab. I'm trying to find a fix for this but can't figure it out.
Below is the code:
import re
import requests
import cloudscraper
from urllib import parse
from bs4 import BeautifulSoup
from requests_html import HTMLSession

base_url = 'https://9anime.to'

class nine_scraper:
    def get_ep_links(url):
        html = nine_scraper.get_html(url, True)
        servers = html.find('div', id='servers-container')
        if servers:
            results = []
            mp4upload_results = []
            mp4upload = servers.find('div', attrs={'data-id': '35'})
            mp4upload_eps = mp4upload.find_all('a', href=True)
            for ep in mp4upload_eps:
                x = (ep.get('href'), ep.text)
                mp4upload_results.append(x)
            for result in mp4upload_results:
                results.append(base_url + result[0])
            return results
        else:
            print('No servers found!!')

    def get_series_info(url):
        return

    def get_servers(html):
        return

    def find_download(url):
        html = nine_scraper.get_html(url, True)

    def search(query):
        if '&page=' in query:
            query = query.split('&page=')
            search_url = base_url + '/search?keyword=' + parse.quote(query[0]) + '&page=' + query[1]
        else:
            search_url = base_url + '/search?keyword=' + parse.quote(query)
        html = nine_scraper.get_html(search_url, False)
        film_list = html.find('div', class_='film-list')
        if film_list:
            results = []
            prev_page = html.find('a', class_='pull-left')
            next_page = html.find('a', class_='pull-right')
            films = film_list.find_all('div', class_='inner')
            for film in films:
                results.append((film.find('a', class_='name').text.strip(), film.find('a', class_='name').get('href').strip()))
            if prev_page.get('href'):
                param = parse.urlsplit(base_url + '/' + prev_page.get('href')).query
                url = parse.unquote_plus(param.replace('keyword=', ''), encoding='utf-8')
                results.append(('Previous page', url))
            if next_page.get('href'):
                param = parse.urlsplit(base_url + '/' + next_page.get('href')).query
                url = parse.unquote_plus(param.replace('keyword=', ''), encoding='utf-8')
                results.append(('Next page', url))
            return results
        else:
            print('No results found!')

    def get_html(url, render_js=False):  # Load webpage and return its html
        try:
            if render_js:  # Check if page needs to render javascript, if so use 'requests_html'
                session = HTMLSession()  # Make a GET request to your webpage, using 'Requests'
                resp = session.get(url, timeout=10)
                resp.raise_for_status()  # Raise an exception if response doesn't come back 200-400
                resp.html.render(timeout=10)  # Render the javascript
                html = BeautifulSoup(resp.html.html, 'html.parser')  # Parse the html data we just got with 'BeautifulSoup4'
                return html  # Return the parsed html
            else:  # Use 'cloudscraper' since we don't need to load any javascript
                c_scraper = cloudscraper.create_scraper()  # Make a GET request to your webpage, using 'Requests'
                resp = c_scraper.get(url)
                resp.raise_for_status()  # Raise an exception if response doesn't come back 200-400
                html = BeautifulSoup(resp.content, 'html.parser')  # Parse the html data we just got with 'BeautifulSoup4'
                return html  # Return the parsed html
        except requests.HTTPError as e:
            print(f'HTTP error occurred: {e}')
        except requests.ConnectionError as e:
            print(f'Connection Error occurred: {e}')
        except requests.Timeout as e:
            print(f'Timeout Error occurred: {e}')
        except requests.RequestException as e:
            print(f'General Error occurred: {e}')
        except Exception as e:
            print(f'Other error occurred: {e}')
        except KeyboardInterrupt:
            print("Someone closed the program")
import sys
from os import system, name

from scrapers import nine_scraper

def screen_clear():
    # for mac and linux (os.name is 'posix')
    if name == 'nt':
        _ = system('cls')
    else:
        _ = system('clear')

def main_menu():
    while True:
        screen_clear()
        print('------9anime downloader------\n[1] Search \n[2] Download \n[3] Exit\n-----------------------------\n')
        main_choice = input('Enter your choice [1-3] >')
        if main_choice == '1':
            search_menu()
            break
        elif main_choice == '2':
            continue
        elif main_choice == '3':
            screen_clear()
            sys.exit()
        else:
            continue

def search_menu(query=False):
    screen_clear()
    print('--------------9anime downloader/search--------------\n')
    if query:
        search_results = nine_scraper.search(query)
        results_menu(search_results)
    else:
        query = input('Please enter the name of the anime >')
        if query:
            search_results = nine_scraper.search(query)
            results_menu(search_results)

def results_menu(results):
    for num, result in enumerate(results, 1):
        title = result[0]
        link = result[1]
        if 'Previous page' not in title:
            if 'Next page' in title:
                n = True
                print('[N] ' + title)
            else:
                print(f'[{num}] {title}')
        else:
            p = True
            print('[P] ' + title)
    print('[M] Main menu')
    titles, links = map(list, zip(*results))
    while True:
        search_choice = input('Enter choice >')
        try:
            search_choice = int(search_choice)
            if 1 <= search_choice <= len(results) + 1:
                print(links[search_choice - 1])
                print(titles[search_choice - 1])
                ep_links = nine_scraper.get_ep_links(links[search_choice - 1])
                for link in ep_links:
                    print(link)
                    nine_scraper.find_download(link)
                # series_menu(links[search_choice - 1])
                break
        except ValueError:
            if search_choice.lower() == 'm':
                main_menu()
                break
            elif search_choice.lower() == 'p':
                if p:
                    url = links[-2]
                    search_menu(url)
                    break
                continue
            elif search_choice.lower() == 'n':
                if n:
                    url = links.pop()
                    search_menu(url)
                    break
                continue

def series_menu(url):
    info = nine_scraper.get_series_info()

main_menu()
I know it has to be some JavaScript that is redirecting the page, but I can't figure out what I need to do in order to stop that. Any help would be very appreciated!
Using requests_html you can set allow_redirects=False like this:
r = session.get(url, allow_redirects=False)
Now your request should go only to the requested URL.
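As a quick sanity check (a sketch, not part of the answer above), the un-followed response carries the redirect status and target, so you can confirm whether the server, rather than JavaScript on the page, issued the redirect:

r = session.get(url, allow_redirects=False)
if 300 <= r.status_code < 400:
    # The Location header names where the server wanted to send you.
    print('Server redirect to:', r.headers.get('Location'))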

Creating Multiple Instances of a Selenium Scraper Class and Running Them in Parallel

So I have created a web scraper with Selenium that infinitely crawls a web page. I am trying to create two instances of this scraper and run them in parallel, so that two different portions of the site (or two different sites entirely) will be scraped at the same time. With my current code, both processes start and two Chrome instances launch, but only one actually starts scraping. The other just sits on the landing page and never moves. My current scraper class looks like this:
class clBot(Scraper):
    def __init__(self, light_or_dark):
        light_side_xpaths = ['//*[@id="hhh"]/h4/a', '//*[@id="sss"]/h4/a/', '//*[@id="jjj"]/h4/a',
                             '//*[@id="bbb"]/h4/a', '//*[@id="ggg"]/h4/a']
        dark_side_xpaths = ['//*[@id="ccc"]/h4/a', '//*[@id="ppp"]/h4', '//*[@id="forums"]/h4/a']
        if light_or_dark == "light":
            self.xpaths_to_scrape = light_side_xpaths
            self.csv_file = "lightside.csv"
        elif light_or_dark == "dark":
            self.xpaths_to_scrape = dark_side_xpaths
            self.csv_file = "darkside.csv"
        else:
            print('Incorrect variable entered. Please enter "light" or "dark" when initializing this class')
            quit()
        self.user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
        self.options = webdriver.ChromeOptions()
        # self.options.add_argument('--headless')
        self.options.add_argument(f'user-agent={self.user_agent}')
        self.current_region = ''
        self.driver = webdriver.Chrome(chrome_options=self.options)
        self.driver.get('https://craigslist.org')

    def run(self):
        self.navigate_pages()

    def identify_phone_number(self, string, phone_number_list):
        reg = re.findall(".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{4}).*?", string)
        if len(reg) > 0:
            for r in reg:
                if r.strip() not in phone_number_list:
                    with open(self.csv_file, 'a') as csv:
                        csv.write("{}\n".format(r.strip()))
                    print("Extracted {} from listing".format(r.strip()))
                else:
                    print('Phone number already in list.')

    def extract_phone_number(self):
        try:
            with open(self.csv_file, 'r') as csv:
                current_phone_numbers = csv.read()
            posting_body = self.driver.find_element_by_id('postingbody')
            self.scraper_wait_class_until_all(self.driver, 'showcontact', seconds_to_wait=5)
            contact_info = self.driver.find_element_by_class_name('showcontact')
            contact_info.click()
            time.sleep(1)
            self.identify_phone_number(posting_body.text, current_phone_numbers)
        except TimeoutException:
            self.identify_phone_number(posting_body.text, current_phone_numbers)
            print('There is no phone number in this listing.')

    def scrape_pages(self):
        i = 1
        while True:
            try:
                self.scraper_wait_class_until_all(self.driver, 'result-row')
                results = self.driver.find_elements_by_class_name('result-row')
                print("clicking result {}".format(i))
                results[i].find_element_by_class_name('result-title').click()
                self.extract_phone_number()
                self.driver.back()
                i += 1
            except IndexError:
                self.scraper_wait_xpath_until_any(self.driver, '//*[@id="searchform"]/div[5]/div[3]/span[2]/a[3]')
                next_button = self.driver.find_element_by_xpath('//*[@id="searchform"]/div[5]/div[3]/span[2]/a[3]')
                print('Navigating to next page.')
                next_button.click()
                i = 1

    def choose_xpath_to_scrape(self, list_of_xpaths):
        xpath_index = randint(0, len(list_of_xpaths) - 1)
        xpath = list_of_xpaths[xpath_index]
        return xpath

    def navigate_pages(self):
        try:
            while True:
                try:
                    self.scraper_wait_xpath_until_any(self.driver, '//*[@id="rightbar"]')
                    rightbar = self.driver.find_element_by_xpath('//*[@id="rightbar"]')
                    nearby_cl = rightbar.find_element_by_xpath('//*[@id="rightbar"]/ul/li[1]')
                    child_items = nearby_cl.find_elements_by_class_name('s')
                    random = randint(1, len(child_items) - 1)
                    time.sleep(3)
                    print("Clicking {}".format(child_items[random].text))
                    child_items[random].click()
                    for xpath in self.xpaths_to_scrape:
                        area_to_scrape = self.driver.find_element_by_xpath(self.choose_xpath_to_scrape(self.xpaths_to_scrape))
                        area_to_scrape.click()
                        self.scrape_pages()
                        self.driver.back()
                        time.sleep(1)
                except WebDriverException:
                    continue
        except Exception as e:
            print(e)
            return
        finally:
            self.driver.quit()
and the main.py file that opens the two processes and initializes them is as follows:
import scraper
from multiprocessing import Process, Manager

if __name__ == "__main__":
    manager = Manager()
    d = manager.dict()
    l = manager.list(range(10))
    darksideScraper = scraper.clBot('light')
    lightsideScraper = scraper.clBot('dark')
    darkside = Process(target=darksideScraper.navigate_pages())
    lightside = Process(target=lightsideScraper.navigate_pages())
    darkside.start()
    lightside.start()
    darkside.join()
    lightside.join()
Any help would be appreciated!
Try passing your target as a reference to your function instead of calling it, like this: Process(target=darksideScraper.navigate_pages). Also refer to this for another example of how to use multiprocessing.
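A minimal sketch of that pattern (clBot comes from the question above; run_bot is a hypothetical wrapper added so each child process constructs its own bot, and therefore its own WebDriver, instead of passing one across the process boundary):

from multiprocessing import Process

import scraper

def run_bot(side):
    # Build the scraper (and its Chrome instance) inside the child process.
    bot = scraper.clBot(side)
    bot.navigate_pages()

if __name__ == "__main__":
    light = Process(target=run_bot, args=('light',))  # note: no parentheses after run_bot
    dark = Process(target=run_bot, args=('dark',))
    light.start()
    dark.start()
    light.join()
    dark.join()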

Best way to make thousands of GET requests in Python

Right now I am working on a Python script which takes a list of URLs as an argument, performs a GET request on each URL, and then searches through the output with XPath to fingerprint the website. It seems to work like a charm when the list is around 50 sites long, but anything longer causes the program to slow down to the point where it stops (usually around 150 sites). Scroll down to where you see the main app logic; the relevant code is below. Right now I am just using 50 elements in the array and it works fine, but anything more makes the entire program stop. Any suggestions would be greatly appreciated!
#!/usr/bin/python
# Web Scraper
# 1.0

# Imports for file
from multiprocessing.dummy import Pool as ThreadPool
from threading import Thread
from Queue import Queue
from lxml import html
import requests
import time
import sys

# Get Raw HTML
def scrape(url):
    try:
        page = requests.get(url, timeout=2.0)
        if page.status_code == requests.codes.ok:
            html_page = html.fromstring(page.content)
            s = requests.session()
            s.close()
            return html_page
        else:
            s = requests.session()
            s.close()
            return False
    except:
        s = requests.session()
        s.close()
        return False

# Format URL
def format_url(url):
    if url.find("http://") == -1:
        url = "http://" + url
    if url[-1] == "/":
        url = url[:-1]
    return url

# Check if WordPress Site
def check_wordpress(tree):
    scripts = tree.xpath("//script[contains(@src,'wp-content')]")
    if len(scripts) > 0:
        return True
    return False

# Check WordPress Version
def wordpress_version(tree):
    type = tree.xpath("//meta[@name='generator']/@content")
    version = 0
    if len(type) > 0:
        details = type[0].split()
        if len(details) > 1 and details[0] == "WordPress":
            if len(details) > 1:
                version = details[1]
        else:
            version = type[0]
    return version

# Find Contact Page
def find_contact_page(tree):
    contact = tree.xpath("//a[contains(text(),'Contact')]/@href")
    try_xpath = 1
    while len(contact) == 0:
        if try_xpath == 1:
            contact = tree.xpath("//span[contains(text(),'Contact')]/../@href")
        elif try_xpath == 2:
            contact = tree.xpath("//p[contains(text(),'Contact')]/../@href")
        elif try_xpath == 3:
            break
        try_xpath += 1
    if len(contact) > 0:
        contact = contact[0]
        if contact.find('#') == -1:
            if contact[0] == '/':
                contact = url + "" + contact
            print contact

# Juicer method
def juice(url):
    url = format_url(url)
    string = url
    tree = scrape(url)
    if tree == False:
        return string + " \t\t\t No XML tree"
    elif check_wordpress(tree) == True:
        version = wordpress_version(tree)
        return string + " \t\t\t WordPress: " + str(version)
    else:
        return string + " \t\t\t Not WordPress"

# Main App Logic Below ------------------------------------->
# Open list of websites from given argument
list = open(sys.argv[1], 'r').read().split('\n')

# Juice url
def juice_url():
    while True:
        url = q.get()
        result = juice(url)
        print result
        q.task_done()

# Create concurrent queues
concurrent = 50
q = Queue(concurrent)
for i in range(concurrent):
    t = Thread(target=juice_url)
    t.daemon = True
    t.start()

# Add URL to Queue
time1 = time.time()
for url in list[0:50]:
    q.put(url)
q.join()

# Calculate total time
total = time.time() - time1
print "Total Time: %f" % total
print "Average Time: %f" % (total/50)

How do I implement a web crawler that scrapes ad links?

To get training data, I wrote a crawler to follow the top 500 websites on Alexa to a depth of 2 and write all links found to a file. Right now, it looks for all the links in the HTML and writes them to a file. The problem is that the crawler misses all links to ads, some of which are located in iframes or in CSS files. How can I change my web crawler so that it scrapes all links, including ads? The relevant code can be found below.
class Crawler(object):
    def __init__(self, root, depth, locked=True):
        self.root = root
        self.depth = depth
        self.locked = locked
        self.host = urlparse.urlparse(root)[1]
        self.urls = []
        self.links = 0
        self.followed = 0

    def crawl(self):
        #print " in crawl"
        page = Fetcher(self.root)
        q = Queue()
        #print "made fetcher"
        try:
            page.fetch()
            if page.urls == []:
                print "Error: could not fetch urls for %s" % (self.root)
                return
                #raise KeyboardInterrupt
            else:
                target = open("output.txt", 'w')
                for url in page.urls:
                    q.put(url)
                    target.write((url+'\n').encode('utf-8'))
                followed = [self.root]
                target.close()
        except Exception as e:
            print('Error: could not fetch urls')
            raise KeyboardInterrupt
        '''
        q = Queue()
        target = open("output.txt", 'w')
        for url in page.urls:
            q.put(url) f
            target.write((url+'\n').encode('utf-8'))
        followed = [self.root]
        target.close()
        #print followed
        '''
        n = 0
        while True:
            try:
                url = q.get()
            except QueueEmpty:
                break
            n += 1
            if url not in followed:
                try:
                    host = urlparse.urlparse(url)[1]
                    if self.locked and re.match(".*%s" % self.host, host):
                        followed.append(url)
                        #print url
                        self.followed += 1
                        page = Fetcher(url)
                        page.fetch()
                        for i, url in enumerate(page):
                            if url not in self.urls:
                                self.links += 1
                                q.put(url)
                                self.urls.append(url)
                                with open("data.out", 'w') as f:
                                    f.write(url)
                    if n > self.depth and self.depth > 0:
                        break
                except Exception, e:
                    print "ERROR: Can't process url '%s' (%s)" % (url, e)
                    print format_exc()

class Fetcher(object):
    def __init__(self, url):
        self.url = url
        self.urls = []

    def __getitem__(self, x):
        return self.urls[x]

    def _addHeaders(self, request):
        request.add_header("User-Agent", AGENT)

    def open(self):
        url = self.url
        try:
            request = urllib2.Request(url)
            handle = urllib2.build_opener()
        except IOError:
            return None
        return (request, handle)

    def fetch(self):
        request, handle = self.open()
        self._addHeaders(request)
        if handle:
            try:
                content = unicode(handle.open(request).read(), "utf-8",
                                  errors="replace")
                soup = BeautifulSoup(content)
                tags = soup('a')
            except urllib2.HTTPError, error:
                if error.code == 404:
                    print >> sys.stderr, "ERROR: %s -> %s" % (error, error.url)
                else:
                    print >> sys.stderr, "ERROR: %s" % error
                tags = []
            except urllib2.URLError, error:
                print >> sys.stderr, "ERROR: %s" % error
                tags = []
            for tag in tags:
                href = tag.get("href")
                if href is not None:
                    url = urlparse.urljoin(self.url, escape(href))
                    if url not in self:
                        self.urls.append(url)

def getLinks(url):
    page = Fetcher(url)
    page.fetch()
    for i, url in enumerate(page):
        print "%d. %s" % (i, url)
Static methods:
def main():
    depth = 2
    file_in = []
    reload(sys)
    sys.setdefaultencoding('utf-8')
    filename = "stuff.txt"
    text = open(filename)
    for line in text:
        file_in.append(line.rstrip())
    for i in file_in:
        print "Crawling %s (Max Depth: %d)" % (i, depth)
        crawler = Crawler(i, depth)
        crawler.crawl()
        print "\n".join(crawler.urls)
A lot of advertising is delivered via asynchronous JavaScript executed on the page. If you're just scraping the server's initial output, you won't be able to obtain those other links. One method would be to use a headless browser like PhantomJS to render the HTML to a file and then run your script on that. There are other possibilities as well.
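As a rough sketch of that render-then-parse approach (using headless Chrome through Selenium as a stand-in for PhantomJS, which is no longer maintained; rendered_links is a hypothetical helper, not part of the crawler above):

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def rendered_links(url):
    # Render the page in a headless browser so JS-injected ad links exist in the DOM.
    opts = Options()
    opts.add_argument('--headless')
    driver = webdriver.Chrome(options=opts)
    try:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        links = [a['href'] for a in soup.find_all('a', href=True)]
        # Many ad units live inside iframes; collect their src URLs so they can be crawled too.
        frames = [f['src'] for f in soup.find_all('iframe', src=True)]
        return links, frames
    finally:
        driver.quit()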

Python KeyError exception / exiting code without error

I have a rather weird problem. The following code in scrape1.py sometimes works as it should, but most of the time it just stops at line 24, where requests.get is used. I do, however, consistently get this KeyError exception:
Exception KeyError: KeyError(140186412830800,) in module <'threading' from '/usr/lib/python2.7/threading.pyc'> ignored
The exception is only thrown when I import the module proxyfetch.py, but as long as I don't actually execute the code in proxyfetch.py, scrape1.py doesn't break (the exception is thrown after nominal execution). proxyfetch.py is based on DanMcInerney's elite-proxy-finder on GitHub. I just edited it so I could use it as a module that returns a list of proxies instead of printing them.
So here are the two scripts:
scrape1.py:
# scrape1.py
from bs4 import BeautifulSoup
from proxyfetch import getprox
import requests

proxcount = 3
listz = getprox(proxcount)
proxfile = open("proxysave.txt", "w")
base_url = "http://google.com"

def pagefetch(url):
    print "Test"
    http_proxy = "http://"+listz[0]
    #http_proxy = "http://103.25.203.227:3127"
    print "Test2"
    proxydict = {
        "http": http_proxy
        #"https_proxy" : https_proxy
    }
    print "Test3"
    page = requests.get(url, proxies=proxydict)  # with proxy
    #page = requests.get(url)  # without proxy
    print "Test4"
    return page

page = pagefetch(base_url)
soup = BeautifulSoup(page.text)
links = soup.find_all("a")
if links:
    for n in links:
        print n
else:
    print "I got nuthin."
And proxyfetch.py
#!/usr/bin/env python2
# proxyfetch.py
'''Finds hundreds of elite anonymity (L1) HTTP proxies then tests them all in parallel printing the fastest ones first.
Checks headers to confirm eliteness, checks if compatible with opening HTTPS sites, and confirms the proxy is working
through multiple IP checking sites'''

# TO DO:
# -Add http://free-proxy-list.net/
# -Add hidemyass

#from IPython import embed

__author__ = 'Dan McInerney'
__contact__ = 'danhmcinerney gmail'

from gevent import monkey
monkey.patch_all()

import requests
import ast
import gevent
import sys, re, time, os, argparse
import socket
from bs4 import BeautifulSoup

listz = []

def getprox(amount):
    argz = [amount, False, True]
    try:
        P = find_http_proxy(argz)
        P.run()
    except BaseException, Err:
        return listz
    return listz

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--show', help='Show this number of results. Example: "-s 5" will show the 5 fastest proxies then stop')
    parser.add_argument('-a', '--all', help='Show all proxy results including the ones that failed 1 of the 3 tests', action='store_true')
    parser.add_argument('-q', '--quiet', help='Only print the IP:port of the fastest proxies that pass all the tests', action='store_true')
    return parser.parse_args()

class find_http_proxy():
    ''' Will only gather L1 (elite anonymity) proxies
    which should not give out your IP or advertise
    that you are using a proxy at all '''

    #argz = [arg1, False, True]
    def __init__(self, argz):
        self.proxy_list = []
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36'}
        self.show_num = argz[0]
        self.show_all = False
        self.quiet = True
        self.errors = []
        self.print_counter = 0
        self.externalip = self.external_ip()

    def external_ip(self):
        req = requests.get('http://myip.dnsdynamic.org/', headers=self.headers)
        ip = req.text
        return ip

    def run(self):
        ''' Gets raw high anonymity (L1) proxy data then calls make_proxy_list()
        Currently parses data from gatherproxy.com and letushide.com '''
        if not self.quiet:
            print '[*] Your accurate external IP: %s' % self.externalip
        letushide_list = self.letushide_req()
        if not self.quiet:
            print '[*] letushide.com: %s proxies' % str(len(letushide_list))
        # Has a login now :(
        gatherproxy_list = self.gatherproxy_req()
        if not self.quiet:
            print '[*] gatherproxy.com: %s proxies' % str(len(gatherproxy_list))
        checkerproxy_list = self.checkerproxy_req()
        if not self.quiet:
            print '[*] checkerproxy.net: %s proxies' % str(len(checkerproxy_list))
        self.proxy_list.append(letushide_list)
        self.proxy_list.append(gatherproxy_list)
        self.proxy_list.append(checkerproxy_list)
        # Flatten list of lists (1 master list containing 1 list of ips per proxy website)
        self.proxy_list = [ips for proxy_site in self.proxy_list for ips in proxy_site]
        self.proxy_list = list(set(self.proxy_list))  # Remove duplicates
        if not self.quiet:
            print '[*] %d unique high anonymity proxies found' % len(self.proxy_list)
            print '[*] Testing proxy speeds ...'
            print ''
            print ' Proxy | CC | Domain | Time/Errors'
        self.proxy_checker()
        return list_

    def checkerproxy_req(self):
        ''' Make the request to checkerproxy and create a master list from that site '''
        cp_ips = []
        try:
            url = 'http://checkerproxy.net/all_proxy'
            r = requests.get(url, headers=self.headers)
            html = r.text
        except Exception:
            print '[!] Failed to get reply from %s' % url
            checkerproxy_list = []
            return checkerproxy_list
        checkerproxy_list = self.parse_checkerproxy(html)
        return checkerproxy_list

    def parse_checkerproxy(self, html):
        ''' Only get elite proxies from checkerproxy '''
        ips = []
        soup = BeautifulSoup(html)
        for tr in soup.findAll('tr'):
            if len(tr) == 19:
                ip_found = False
                elite = False
                ip_port = None
                tds = tr.findAll('td')
                for td in tds:
                    if ':' in td.text:
                        ip_found = True
                        ip_port_re = re.match('(\d{1,3}\.){3}\d{1,3}:\d{1,5}', td.text)
                        if ip_port_re:
                            ip_port = ip_port_re.group()
                        if not ip_port:
                            ip_found = False
                    if 'Elite' in td.text:
                        elite = True
                    if ip_found == True and elite == True:
                        ips.append(str(ip_port))
                        break
        return ips

    def letushide_req(self):
        ''' Make the request to the proxy site and create a master list from that site '''
        letushide_ips = []
        for i in xrange(1, 20):  # can search maximum of 20 pages
            try:
                url = 'http://letushide.com/filter/http,hap,all/%s/list_of_free_HTTP_High_Anonymity_proxy_servers' % str(i)
                r = requests.get(url, headers=self.headers)
                html = r.text
                ips = self.parse_letushide(html)
                # Check html for a link to the next page
                if '/filter/http,hap,all/%s/list_of_free_HTTP_High_Anonymity_proxy_servers' % str(i+1) in html:
                    pass
                else:
                    letushide_ips.append(ips)
                    break
                letushide_ips.append(ips)
            except:
                print '[!] Failed get reply from %s' % url
                break
        # Flatten list of lists (1 list containing 1 list of ips for each page)
        letushide_list = [item for sublist in letushide_ips for item in sublist]
        return letushide_list

    def parse_letushide(self, html):
        ''' Parse out list of IP:port strings from the html '''
        # \d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} - matches IP addresses
        # </a></td><td> - is in between the IP and the port
        # .*?< - match all text (.) for as many characters as possible (*) but don't be greedy (?) and stop at the next greater than (<)
        raw_ips = re.findall('\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}</a></td><td>.*?<', html)
        ips = []
        for ip in raw_ips:
            ip = ip.replace('</a></td><td>', ':')
            ip = ip.strip('<')
            ips.append(ip)
        return ips

    def gatherproxy_req(self):
        url = 'http://gatherproxy.com/proxylist/anonymity/?t=Elite'
        try:
            r = requests.get(url, headers=self.headers)
            lines = r.text.splitlines()
        except:
            print '[!] Failed get reply from %s' % url
            gatherproxy_list = []
            return gatherproxy_list
        gatherproxy_list = self.parse_gp(lines)
        return gatherproxy_list

    def parse_gp(self, lines):
        ''' Parse the raw scraped data '''
        gatherproxy_list = []
        for l in lines:
            if 'proxy_ip' in l.lower():
                l = l.replace('gp.insertPrx(', '')
                l = l.replace(');', '')
                l = l.replace('null', 'None')
                l = l.strip()
                l = ast.literal_eval(l)
                proxy = '%s:%s' % (l["PROXY_IP"], l["PROXY_PORT"])
                gatherproxy_list.append(proxy)
                #ctry = l["PROXY_COUNTRY"]
        return gatherproxy_list

    def proxy_checker(self):
        ''' Concurrency stuff here '''
        jobs = [gevent.spawn(self.proxy_checker_req, proxy) for proxy in self.proxy_list]
        try:
            gevent.joinall(jobs)
        except KeyboardInterrupt:
            sys.exit('[-] Ctrl-C caught, exiting')

    def proxy_checker_req(self, proxy):
        ''' See how long each proxy takes to open each URL '''
        proxyip = str(proxy.split(':', 1)[0])
        # A lot of proxy checker sites give a different final octet for some reason
        #proxy_split = proxyip.split('.')
        #first_3_octets = '.'.join(proxy_split[:3])+'.'
        results = []
        urls = ['http://danmcinerney.org/ip.php', 'http://myip.dnsdynamic.org', 'https://www.astrill.com/what-is-my-ip-address.php', 'http://danmcinerney.org/headers.php']
        for url in urls:
            try:
                check = requests.get(url,
                                     headers=self.headers,
                                     proxies={'http': 'http://'+proxy,
                                              'https': 'http://'+proxy},
                                     timeout=15)
                time_or_error = str(check.elapsed)
                html = check.text
                time_or_error = self.html_handler(time_or_error, html, url)
                url = self.url_shortener(url)
                results.append((time_or_error, proxy, url))
            except Exception as e:
                time_or_error = self.error_handler(str(e))
                url = self.url_shortener(url)
                results.append((time_or_error, proxy, url))
        self.print_handler(results, proxyip)

    def html_handler(self, time_or_error, html, url):
        ''' Check the html for errors and if none are found return time to load page '''
        html_lines = html.splitlines()
        leng = len(html_lines)
        ipre = '(?:[0-9]{1,3}\.){3}[0-9]{1,3}'
        # Both of these urls just return the ip and nothing else
        if url in ['http://danmcinerney.org/ip.php', 'http://myip.dnsdynamic.org']:
            if leng == 1:  # Should return 1 line of html
                match = re.match(ipre, html)
                if match:
                    if self.externalip in html:
                        time_or_error = 'Err: Page loaded; proxy failed'
                else:
                    time_or_error = 'Err: Page loaded; proxy failed'
            else:
                time_or_error = 'Err: Page loaded; proxy failed'
            return time_or_error
        # This is the SSL page
        if 'astrill' in url:
            soup = BeautifulSoup(html)
            ip = soup.find("td", {"colspan": 2}).text  # the ip is the only one with colspan = 2
            match = re.match(ipre, ip)
            if match:
                if self.externalip in ip:
                    time_or_error = 'Err: Page loaded; proxy failed'
            else:
                time_or_error = 'Err: Page loaded; proxy failed'
            return time_or_error
        if '/headers' in url:
            # check for proxy headers
            proxy_headers = ['via: ', 'forwarded: ', 'x-forwarded-for', 'client-ip']
            if leng > 15:  # 15 is arbitrary, I just don't think you'll ever see more than 15 headers
                time_or_error = 'Err: headers not returned'
                return time_or_error
            for l in html_lines:
                for h in proxy_headers:
                    if h in l.lower():
                        time_or_error = 'Err: Proxy headers found'
                        return time_or_error
            time_or_error = 'Passed: elite proxy'
            return time_or_error

    def print_handler(self, results, proxyip):
        if self.show_all:
            country_code = self.get_country_code(proxyip)
            self.printer(results, country_code)
            self.print_counter += 1
        else:
            passed_all = self.passed_all_tests(results)
            if passed_all:
                country_code = self.get_country_code(proxyip)
                self.printer(results, country_code)
                self.print_counter += 1
        if self.show_num:
            self.limiter()

    def printer(self, results, country_code):
        ''' Creates the output '''
        counter = 0
        if not self.quiet:
            print '--------------------------------------------------------------------'
        for r in results:
            counter += 1
            time_or_error = r[0]
            proxy = r[1]
            url = r[2]
            if self.quiet:
                if counter % 4 == 0:  #################### THIS results is a list of 4 tuples each, so proxies will repeat 4 times
                    #print proxy
                    global listz
                    listz.append(proxy)
            else:
                # Only print the proxy once, on the second print job
                if counter == 1:
                    print '%s | %s | %s | %s' % (proxy.ljust(21), country_code.ljust(3), url.ljust(21), time_or_error)
                else:
                    print '%s | %s | %s | %s' % (' '.ljust(21), ' ', url.ljust(21), time_or_error)

    def get_country_code(self, proxyip):
        ''' Get the 3 letter country code of the proxy using geoiptool.com
        Would use the geoip library, but it requires a local DB and what
        is the point of that hassle other than marginal speed improvement '''
        cc_line_found = False
        cc = 'N/A'
        try:
            r = requests.get('http://www.geoiptool.com/en/?IP=%s' % proxyip, headers=self.headers)
            html = r.text
            html_lines = html.splitlines()
            for l in html_lines:
                if cc_line_found == True:
                    cc = l.split('(', 1)[1].split(')', 1)[0]
                    break
                if 'country code:' in l.lower():
                    cc_line_found = True
        except:
            pass
        return cc

    def error_handler(self, e):
        if 'Cannot connect' in e:
            time_or_error = 'Err: Cannot connect to proxy'
        elif 'timed out' in e.lower():
            time_or_error = 'Err: Timed out'
        elif 'retries exceeded' in e:
            time_or_error = 'Err: Max retries exceeded'
        elif 'Connection reset by peer' in e:
            time_or_error = 'Err: Connection reset by peer'
        elif 'readline() takes exactly 1 argument (2 given)' in e:
            time_or_error = 'Err: SSL error'
        else:
            time_or_error = 'Err: ' + e
        return time_or_error

    def url_shortener(self, url):
        if 'ip.php' in url:
            url = 'danmcinerney.org'
        elif 'headers.php' in url:
            url = 'Header check'
        elif 'dnsdynamic' in url:
            url = 'dnsdynamic.org'
        elif 'astrill' in url:
            url = 'https://astrill.com'
        return url

    def passed_all_tests(self, results):
        for r in results:
            time_or_error = r[0]
            if 'Err:' in time_or_error:
                global testx
                testx = 50
                return False
        return True

    def limiter(self):
        testx = 0
        ''' Kill the script if user supplied limit of successful proxy attempts (-s argument) is reached '''
        if self.print_counter >= int(self.show_num):
            sys.exit()
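The ignored KeyError in the threading module is commonly reported when gevent's monkey.patch_all() runs after threading has already been imported by another module. A sketch of the usual mitigation (an assumption here, not a verified fix for this exact setup) is to apply the patch at the very top of the entry script, before any other imports:

# scrape1.py -- apply gevent's monkey patching before anything else pulls in
# threading or socket (requests and bs4 both do), then import normally.
from gevent import monkey
monkey.patch_all()

from bs4 import BeautifulSoup
from proxyfetch import getprox
import requests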
