How do I implement a web crawler that scrapes ad links?

How do I implement a web crawler that scrapes ad links? - python

To get training data, I wrote a crawler to follow the top 500 websites on Alexa with a depth of 2 and write all links found to a file. Right now, it looks for all the links in the html and writes them to a file. The problem is, the crawler misses all links to ads, some of which are located in iframes or located in CSS files. How can I change my web crawler so that it scrapes all links, including ads? The relevant code can be found below.
class Crawler(object):
def __init__(self, root, depth, locked=True):
self.root = root
self.depth = depth
self.locked = locked
self.host = urlparse.urlparse(root)[1]
self.urls = []
self.links = 0
self.followed = 0
def crawl(self):
#print " in crawl"
page = Fetcher(self.root)
q = Queue()
#print "made fetcher"
try:
page.fetch()
if page.urls == []:
print "Error: could not fetch urls for %s" % (self.root)
return
#raise KeyboardInterrupt
else:
target = open("output.txt", 'w')
for url in page.urls:
q.put(url)
target.write((url+'\n').encode('utf-8'))
followed = [self.root]
target.close()
except Exception as e:
print('Error: could not fetch urls')
raise KeyboardInterrupt
'''
q = Queue()
target = open("output.txt", 'w')
for url in page.urls:
q.put(url) f
target.write((url+'\n').encode('utf-8'))
followed = [self.root]
target.close()
#print followed
'''
n = 0
while True:
try:
url = q.get()
except QueueEmpty:
break
n += 1
if url not in followed:
try:
host = urlparse.urlparse(url)[1]
if self.locked and re.match(".*%s" % self.host, host):
followed.append(url)
#print url
self.followed += 1
page = Fetcher(url)
page.fetch()
for i, url in enumerate(page):
if url not in self.urls:
self.links += 1
q.put(url)
self.urls.append(url)
with open("data.out", 'w') as f:
f.write(url)
if n > self.depth and self.depth > 0:
break
except Exception, e:
print "ERROR: Can't process url '%s' (%s)" % (url, e)
print format_exc()
class Fetcher(object):
def __init__(self, url):
self.url = url
self.urls = []
def __getitem__(self, x):
return self.urls[x]
def _addHeaders(self, request):
request.add_header("User-Agent", AGENT)
def open(self):
url = self.url
try:
request = urllib2.Request(url)
handle = urllib2.build_opener()
except IOError:
return None
return (request, handle)
def fetch(self):
request, handle = self.open()
self._addHeaders(request)
if handle:
try:
content = unicode(handle.open(request).read(), "utf-8",
errors="replace")
soup = BeautifulSoup(content)
tags = soup('a')
except urllib2.HTTPError, error:
if error.code == 404:
print >> sys.stderr, "ERROR: %s -> %s" % (error, error.url)
else:
print >> sys.stderr, "ERROR: %s" % error
tags = []
except urllib2.URLError, error:
print >> sys.stderr, "ERROR: %s" % error
tags = []
for tag in tags:
href = tag.get("href")
if href is not None:
url = urlparse.urljoin(self.url, escape(href))
if url not in self:
self.urls.append(url)
def getLinks(url):
page = Fetcher(url)
page.fetch()
for i, url in enumerate(page):
print "%d. %s" % (i, url)
Static methods:
def main():
depth =2
file_in = []
reload(sys)
sys.setdefaultencoding('utf-8')
filename = "stuff.txt"
text = open(filename)
for line in text:
file_in.append(line.rstrip())
for i in file_in:
print "Crawling %s (Max Depth: %d)" % (i, depth)
crawler = Crawler(i, depth)
crawler.crawl()
print "\n".join(crawler.urls)

A lot of advertising is delivered via asynchronous javascript executed on the page. If you're just scraping the server initial output you won't be able to obtain those others links. One method would be to use a headless browser like PhantomJS to render the html to a file then use your script on that. There are other possibilities as well.

Related

requests_html stop website from redirecting

I am trying to scrape the follow link https://9anime.to/watch/one-piece-dub.34r/r2wjlq using python/requests_html.
My problem is it gets auto redirected to the default server tab instead of the mp4upload tab, trying to find a fix for this but cant figure it out.
Below is the code
import re
import requests
import cloudscraper
from urllib import parse
from bs4 import BeautifulSoup
from requests_html import HTMLSession
base_url = 'https://9anime.to'
class nine_scraper:
def get_ep_links(url):
html = nine_scraper.get_html(url, True)
servers = html.find('div', id='servers-container')
if servers:
results = []
mp4upload_results = []
mp4upload = servers.find('div', attrs={'data-id': '35'})
mp4upload_eps = mp4upload.find_all('a', href=True)
for ep in mp4upload_eps:
x = (ep.get('href'), ep.text)
mp4upload_results.append(x)
for result in mp4upload_results:
results.append(base_url + result[0])
return results
else:
print('No servers found!!')
def get_series_info(url):
return
def get_servers(html):
return
def find_download(url):
html = nine_scraper.get_html(url, True)
def search(query):
if '&page=' in query:
query = query.split('&page=')
search_url = base_url + '/search?keyword=' + parse.quote(query[0]) + '&page=' + query[1]
else:
search_url = base_url + '/search?keyword=' + parse.quote(query)
html = nine_scraper.get_html(search_url, False)
film_list = html.find('div', class_='film-list')
if film_list:
results = []
prev_page = html.find('a', class_='pull-left')
next_page = html.find('a', class_='pull-right')
films = film_list.find_all('div', class_='inner')
for film in films:
results.append((film.find('a', class_='name').text.strip(), film.find('a', class_='name').get('href').strip()))
if prev_page.get('href'):
param = parse.urlsplit(base_url + '/' + prev_page.get('href')).query
url = parse.unquote_plus(param.replace('keyword=', ''), encoding='utf-8')
results.append(('Previous page', url))
if next_page.get('href'):
param = parse.urlsplit(base_url + '/' + next_page.get('href')).query
url = parse.unquote_plus(param.replace('keyword=', ''), encoding='utf-8')
results.append(('Next page', url))
return results
else:
print('No results found!')
def get_html(url, render_js=False): # Load webpage and return its html
try:
if render_js: # Check if page needs to render javascript, if so use 'requests_html'
session = HTMLSession() # Make a GET request to your webpage, using 'Requests'
resp = session.get(url, timeout=10)
resp.raise_for_status() # Raise an exception if respones doesnt come back 200-400
resp.html.render(timeout=10) # Render the javascript
html = BeautifulSoup(resp.html.html, 'html.parser') # Parse the html data we just got with 'BeautifulSoup4'
return html # Return the parsed html
else: # Use 'cloudscraper' since we dont need to load any javascript
c_scraper = cloudscraper.create_scraper() # Make a GET request to your webpage, using 'Requests'
resp = c_scraper.get(url)
resp.raise_for_status() # Raise an exception if respones doesnt come back 200-400
html = BeautifulSoup(resp.content, 'html.parser') # Parse the html data we just got with 'BeautifulSoup4'
return html # Return the parsed html
except requests.HTTPError as e:
print(f'HTTP error occurred: {e}')
except requests.ConnectionError as e:
print(f'Connection Error occurred: {e}')
except requests.Timeout as e:
print(f'Timeout Error occurred: {e}')
except requests.RequestException as e:
print(f'General Error occurred: {e}')
except Exception as e:
print(f'Other error occurred: {e}')
except KeyboardInterrupt:
print("Someone closed the program")
import sys
from os import system, name
from scrapers import nine_scraper
def screen_clear():
# for mac and linux(os.name is 'posix')
if name == 'nt':
_ = system('cls')
else:
_ = system('clear')
def main_menu():
while True:
screen_clear()
print('------9anime downloader------\n[1] Search \n[2] Download \n[3] Exit\n-----------------------------\n')
main_choice = input('Enter your choice [1-3] >')
if main_choice == '1':
search_menu()
break
elif main_choice == '2':
continue
elif main_choice == '3':
screen_clear()
sys.exit()
else:
continue
def search_menu(query=False):
screen_clear()
print('--------------9anime downloader/search--------------\n')
if query:
search_results = nine_scraper.search(query)
results_menu(search_results)
else:
query = input('Please enter the name of the anime >')
if query:
search_results = nine_scraper.search(query)
results_menu(search_results)
def results_menu(results):
for num, result in enumerate(results, 1):
title = result[0]
link = result[1]
if 'Previous page' not in title:
if 'Next page' in title:
n = True
print('[N] ' + title)
else:
print(f'[{num}] {title}')
else:
p = True
print('[P] ' + title)
print('[M] Main menu')
titles, links = map(list, zip(*results))
while True:
search_choice = input('Enter choice >')
try:
search_choice = int(search_choice)
if 1 <= search_choice <= len(results) + 1:
print(links[search_choice - 1])
print(titles[search_choice - 1])
ep_links = nine_scraper.get_ep_links(links[search_choice - 1])
for link in ep_links:
print(link)
nine_scraper.find_download(link)
# series_menu(links[search_choice - 1])
break
except ValueError:
if search_choice.lower() == 'm':
main_menu()
break
elif search_choice.lower() == 'p':
if p:
url = links[-2]
search_menu(url)
break
continue
elif search_choice.lower() == 'n':
if n:
url = links.pop()
search_menu(url)
break
continue
def series_menu(url):
info = nine_scraper.get_series_info()
main_menu()
I know it has to be some javascript that is redirecting the page but i cant figure out what i need to do in order to stop that, any help would be very appreciated!

Using requests_html you can set allow_redirects=False like this:
r = session.get(url,allow_redirects=False)
Now your request should go only to the requested URL.

Trying to scrape data off of a website using Python and Chromedriver, but it's returning a nonetype error for "find"

I am trying to scrape data off of WhoScored.com. I am not sure what is the best way to do it or if anyone is familiar with this particular website, but I have a Python script that is supposed to scrape the data.
Here is my code:
import time
import bs4
import selenium_func as sel
from helper_functions import read_from_file, append_to_file
TIERS_PATH = 'tiers_urls/tiers_urls.txt'
TEAMS_PATH = 'teams_urls/teams_urls.txt'
TEAMS_LOGS = 'teams_urls/teams_logs.txt'
"""
Functions
"""
def get_teams_urls(start_idx):
"""
Searches each tier and extracts all the teams' urls within that tier.
"""
server, driver = sel.start_server_and_driver()
tiers_urls = read_from_file(TIERS_PATH)
length = len(tiers_urls)
for tier in tiers_urls[start_idx:]:
error = False
teams_urls = []
try:
complete_url = sel.WHOSCORED_URL + tier
try:
driver.get(complete_url)
content = driver.page_source
soup = bs4.BeautifulSoup(''.join(content), 'lxml')
except Exception as e:
print('\n')
print("Problem accessing {}".format(tier))
print(str(e))
print('\n')
append_to_file("\nError accessing: " + tier + "\n", TEAMS_LOGS)
append_to_file("Index: " + str(tiers_urls.index(tier)), TEAMS_LOGS)
continue
stage = None
stages_div = soup.find('div', {'id':'sub-navigation'})
if stages_div != None:
stage_li = stages_div.find_all('li')[0]
if stage_li != None:
stage_href = stage_li.find('a', href=True)['href']
if stage_href != None:
stage = stage_href.split('/')[8]
if stage != None:
standings_table = soup.find('div', {'id':'standings-'+stage})
standings_tbody = standings_table.find(id='standings-'+stage+'-content')
teams_tr = standings_tbody.find_all('tr')
if len(teams_tr) > 0:
for tr in teams_tr:
team_td = tr.find_all('td')[1]
team_href = team_td.find('a', href=True)['href']
teams_urls.append(team_href)
except Exception as e:
print('\n')
print("Problem reading data from: {}".format(tier))
print(str(e))
print('\n')
append_to_file("\nError reading data from: " + tier + "\n", TEAMS_LOGS)
append_to_file("Index: " + str(tiers_urls.index(tier)), TEAMS_LOGS)
error = True
if error == False:
if len(teams_urls) > 0:
to_store = {tier:teams_urls}
append_to_file(str(to_store), TEAMS_PATH)
append_to_file("\nSuccessfully retrieved from: " + str(tiers_urls.index(tier)) + "/" + str(length), TEAMS_LOGS)
time.sleep(1)
sel.stop_server_and_driver(server, driver)
return
if __name__ == '__main__':
get_teams_urls(0)
I am trying to scrape data off of WhoScored.com and it opens up the website, but it returns this error:
'NoneType' object has no attribute 'find'
How do I fix this and successfully scrape the data ?

Sounds like you need some null/None-checks:
for tr in teams_tr:
team_td = tr.find_all('td')[1]
if team_td != None:
team_href = team_td.find('a', href=True)['href']
teams_urls.append(team_href)
You didn't check if team_td was None before calling find

Using proxy middleware to scrape Amazon

class HttpProxyMiddleware(object):
###never retry these errors
def __init__(self, settings):
socket.setdefaulttimeout(3)
self.proxies = []
self.proxy_index = 1
##plant proxies
self.proxy_list = settings.get('PROXY_LIST')
fin = open(self.proxy_list)
for line in fin.readlines():
line = line.rstrip()
line = 'http://%s' % line
self.proxies.append(line)
print self.proxies
fin.close()
#classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)
def process_request(self, request, spider):
if 'proxy' in request.meta:
return
if len(self.proxies) == 0:
raise ValueError('All proxies are unusable, cannot proceed')
proxy_address = self.proxies[self.proxy_index]
print proxy_address
request.meta['proxy'] = proxy_address
def process_exception(self, request, exception, spider):
print 'not working'
self.proxy_index += 1
new_request = request.copy()
new_request.dont_filter = True
return new_request
I have the complete proxy list from HMA and I am try to use this middleware to scrape amazon items. However, it turns out it hardly works. In 'process_request', after printing out the proxy address, it stops for a long time and eventually failed.
I also create the following program to test each proxy ip in my list. All proxy ip are working from here.
def main():
socket.setdefaulttimeout(3)
proxies = []
##plant proxies
proxy_list = '/users/zehuapan/desktop/amazon/amazon/proxy_list.txt'
fin = open(proxy_list)
for line in fin.readlines():
line = line.rstrip()
line = 'http://%s' % line
print line
if check_validity(line):
proxies.append(line)
fin.close()
print proxies
file = open('/users/zehuapan/desktop/amazon/amazon/valid_proxy_list.txt', 'w+')
for proxy in proxies:
file.write(proxy + '\n')
file.close()
def check_validity(proxy):
try:
proxy_handler = urllib2.ProxyHandler({'http': proxy})
opener = urllib2.build_opener(proxy_handler)
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib2.install_opener(opener)
req=urllib2.Request('https://www.amazon.com')
sock=urllib2.urlopen(req)
except urllib2.HTTPError, e:
print 'Error code: ', e.code
return e.code
except Exception, detail:
print "ERROR:", detail
return False
return True
if __name__ == '__main__':
main()

Zero response is unlikely to be given using a proxy. And a working proxy goes down very fast with Amazon after few hundreds of requests.
When a proxy still works you can get a captcha page with 200 code, when it is banned you get 503 code.
Also, the proxy might work for the main page and for the offers list, but fail with the product page.
Hope that helps.

How to set a time out in web crawler?

I am very new to python and trying to develop very simple web crawler. My web crawler works good but it stick to one link for long time. How can I set up a timeout function?
How to deal with the urllib2.HTTPError? Is my except statement correct?
def get_link(page):
start = page.find('<a href=')
if start==-1:
return None,0
startp=page.find('"',start)
endp=page.find('"',startp+1)
url=page[startp+1:endp]
return url,endp
def get_all_link(page):
allurl = []
while True:
url,endp=get_link(page)
if url:
page=page[endp:]
allurl.append(url)
else:
return allurl
break
def get_page(page, tocrawl):
import urllib2
try:
page_source = urllib2.urlopen(page)
return page_source.read()
except:
page = tocrawl.pop()
raise
def validate(page):
valid = page.find('http')
if valid == -1:
return 0
return 1
def crawler(seed):
tocrawl = [seed]
crawled = []
i=0
while tocrawl:
page=tocrawl.pop()
valid = validate(page)
if valid:
if page not in crawled:
tocrawl = set(tocrawl) | set(get_all_link(get_page(page,tocrawl)))
crawled.append(page)
i=i+1
f = open("crawled.txt","a")
f.write(repr(i)+" : "+repr(page)+"\n")
f.close()
return crawled
crawler("http://google.com")

Python KeyError exception / exiting code without error

I got a rather weird problem. The following code in scrape1 sometimes works as it should, but most of the time it just stops at line 24, where request.get is being used. I do however consistently get this keyerror exception:
Exception KeyError: KeyError(140186412830800,) in module <'threading' from '/usr/lib/python2.7/threading.pyc'> ignored
The exception is only thrown when I'm importing the module proxyfetch.py but as long as I don't actually execute the code in proxyfetch.py, scrape1.py doesn't break (exception is thrown after nominal execution). Proxyfetch is based on DanMcInerney elite-proxy-finder on github. I just edited it so I could use it as a module which returns a list of proxys instead of printing them.
So here are the 2 scripts:
scrape1.py:
#scrape1.py
from bs4 import BeautifulSoup
from proxyfetch import getprox
import requests
proxcount=3
listz = getprox(proxcount)
proxfile = open("proxysave.txt", "w")
base_url = "http://google.com"
def pagefetch(url):
print "Test"
http_proxy = "http://"+listz[0]
#http_proxy = "http://103.25.203.227:3127"
print "Test2"
proxydict = {
"http" : http_proxy
#"https_proxy" : https_proxy
}
print "Test3"
page = requests.get(url, proxies=proxydict) #with proxy
#page = requests.get(url) #without proxy
print "Test4"
return page
page = pagefetch(base_url)
soup = BeautifulSoup(page.text)
links = soup.find_all("a")
if links:
for n in links:
print n
else:
print "I got nuthin."
And proxyfetch.py
#!/usr/bin/env python2
#proxyfetch.py
'''Finds hundreds of elite anonymity (L1) HTTP proxies then tests them all in parallel printing the fastest ones first.
Checks headers to confirm eliteness, checks if compatible with opening HTTPS sites, and confirms the proxy is working
through multiple IP checking sites'''
# TO DO:
# -Add http://free-proxy-list.net/
# -Add hidemyass
#from IPython import embed
__author__ = 'Dan McInerney'
__contact__ = 'danhmcinerney gmail'
from gevent import monkey
monkey.patch_all()
import requests
import ast
import gevent
import sys, re, time, os, argparse
import socket
from bs4 import BeautifulSoup
listz =[]
def getprox(amount):
argz = [amount, False, True]
try:
P = find_http_proxy(argz)
P.run()
except BaseException,Err:
return listz
return listz
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('-s', '--show', help='Show this number of results. Example: "-s 5" will show the 5 fastest proxies then stop')
parser.add_argument('-a', '--all', help='Show all proxy results including the ones that failed 1 of the 3 tests', action='store_true')
parser.add_argument('-q', '--quiet', help='Only print the IP:port of the fastest proxies that pass all the tests', action='store_true')
return parser.parse_args()
class find_http_proxy():
''' Will only gather L1 (elite anonymity) proxies
which should not give out your IP or advertise
that you are using a proxy at all '''
#argz = [arg1, False, True]
def __init__(self, argz):
self.proxy_list = []
self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36'}
self.show_num = argz[0]
self.show_all = False
self.quiet = True
self.errors = []
self.print_counter = 0
self.externalip = self.external_ip()
def external_ip(self):
req = requests.get('http://myip.dnsdynamic.org/', headers=self.headers)
ip = req.text
return ip
def run(self):
''' Gets raw high anonymity (L1) proxy data then calls make_proxy_list()
Currently parses data from gatherproxy.com and letushide.com '''
if not self.quiet:
print '[*] Your accurate external IP: %s' % self.externalip
letushide_list = self.letushide_req()
if not self.quiet:
print '[*] letushide.com: %s proxies' % str(len(letushide_list))
# Has a login now :(
gatherproxy_list = self.gatherproxy_req()
if not self.quiet:
print '[*] gatherproxy.com: %s proxies' % str(len(gatherproxy_list))
checkerproxy_list = self.checkerproxy_req()
if not self.quiet:
print '[*] checkerproxy.net: %s proxies' % str(len(checkerproxy_list))
self.proxy_list.append(letushide_list)
self.proxy_list.append(gatherproxy_list)
self.proxy_list.append(checkerproxy_list)
# Flatten list of lists (1 master list containing 1 list of ips per proxy website)
self.proxy_list = [ips for proxy_site in self.proxy_list for ips in proxy_site]
self.proxy_list = list(set(self.proxy_list)) # Remove duplicates
if not self.quiet:
print '[*] %d unique high anonymity proxies found' % len(self.proxy_list)
print '[*] Testing proxy speeds ...'
print ''
print ' Proxy | CC | Domain | Time/Errors'
self.proxy_checker()
return list_
def checkerproxy_req(self):
''' Make the request to checkerproxy and create a master list from that site '''
cp_ips = []
try:
url = 'http://checkerproxy.net/all_proxy'
r = requests.get(url, headers=self.headers)
html = r.text
except Exception:
print '[!] Failed to get reply from %s' % url
checkerproxy_list = []
return checkerproxy_list
checkerproxy_list = self.parse_checkerproxy(html)
return checkerproxy_list
def parse_checkerproxy(self, html):
''' Only get elite proxies from checkerproxy '''
ips = []
soup = BeautifulSoup(html)
for tr in soup.findAll('tr'):
if len(tr) == 19:
ip_found = False
elite = False
ip_port = None
tds = tr.findAll('td')
for td in tds:
if ':' in td.text:
ip_found = True
ip_port_re = re.match('(\d{1,3}\.){3}\d{1,3}:\d{1,5}', td.text)
if ip_port_re:
ip_port = ip_port_re.group()
if not ip_port:
ip_found = False
if 'Elite' in td.text:
elite = True
if ip_found == True and elite == True:
ips.append(str(ip_port))
break
return ips
def letushide_req(self):
''' Make the request to the proxy site and create a master list from that site '''
letushide_ips = []
for i in xrange(1,20): # can search maximum of 20 pages
try:
url = 'http://letushide.com/filter/http,hap,all/%s/list_of_free_HTTP_High_Anonymity_proxy_servers' % str(i)
r = requests.get(url, headers=self.headers)
html = r.text
ips = self.parse_letushide(html)
# Check html for a link to the next page
if '/filter/http,hap,all/%s/list_of_free_HTTP_High_Anonymity_proxy_servers' % str(i+1) in html:
pass
else:
letushide_ips.append(ips)
break
letushide_ips.append(ips)
except:
print '[!] Failed get reply from %s' % url
break
# Flatten list of lists (1 list containing 1 list of ips for each page)
letushide_list = [item for sublist in letushide_ips for item in sublist]
return letushide_list
def parse_letushide(self, html):
''' Parse out list of IP:port strings from the html '''
# \d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} - matches IP addresses
# </a></td><td> - is in between the IP and the port
# .*?< - match all text (.) for as many characters as possible (*) but don't be greedy (?) and stop at the next greater than (<)
raw_ips = re.findall('\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}</a></td><td>.*?<', html)
ips = []
for ip in raw_ips:
ip = ip.replace('</a></td><td>', ':')
ip = ip.strip('<')
ips.append(ip)
return ips
def gatherproxy_req(self):
url = 'http://gatherproxy.com/proxylist/anonymity/?t=Elite'
try:
r = requests.get(url, headers = self.headers)
lines = r.text.splitlines()
except:
print '[!] Failed get reply from %s' % url
gatherproxy_list = []
return gatherproxy_list
gatherproxy_list = self.parse_gp(lines)
return gatherproxy_list
def parse_gp(self, lines):
''' Parse the raw scraped data '''
gatherproxy_list = []
for l in lines:
if 'proxy_ip' in l.lower():
l = l.replace('gp.insertPrx(', '')
l = l.replace(');', '')
l = l.replace('null', 'None')
l = l.strip()
l = ast.literal_eval(l)
proxy = '%s:%s' % (l["PROXY_IP"], l["PROXY_PORT"])
gatherproxy_list.append(proxy)
#ctry = l["PROXY_COUNTRY"]
return gatherproxy_list
def proxy_checker(self):
''' Concurrency stuff here '''
jobs = [gevent.spawn(self.proxy_checker_req, proxy) for proxy in self.proxy_list]
try:
gevent.joinall(jobs)
except KeyboardInterrupt:
sys.exit('[-] Ctrl-C caught, exiting')
def proxy_checker_req(self, proxy):
''' See how long each proxy takes to open each URL '''
proxyip = str(proxy.split(':', 1)[0])
# A lot of proxy checker sites give a different final octet for some reason
#proxy_split = proxyip.split('.')
#first_3_octets = '.'.join(proxy_split[:3])+'.'
results = []
urls = ['http://danmcinerney.org/ip.php', 'http://myip.dnsdynamic.org', 'https://www.astrill.com/what-is-my-ip-address.php', 'http://danmcinerney.org/headers.php']
for url in urls:
try:
check = requests.get(url,
headers = self.headers,
proxies = {'http':'http://'+proxy,
'https':'http://'+proxy},
timeout = 15)
time_or_error = str(check.elapsed)
html = check.text
time_or_error = self.html_handler(time_or_error, html, url)
url = self.url_shortener(url)
results.append((time_or_error, proxy, url))
except Exception as e:
time_or_error = self.error_handler(str(e))
url = self.url_shortener(url)
results.append((time_or_error, proxy, url))
self.print_handler(results, proxyip)
def html_handler(self, time_or_error, html, url):
''' Check the html for errors and if none are found return time to load page '''
html_lines = html.splitlines()
leng = len(html_lines)
ipre = '(?:[0-9]{1,3}\.){3}[0-9]{1,3}'
# Both of these urls just return the ip and nothing else
if url in ['http://danmcinerney.org/ip.php', 'http://myip.dnsdynamic.org']:
if leng == 1: # Should return 1 line of html
match = re.match(ipre, html)
if match:
if self.externalip in html:
time_or_error = 'Err: Page loaded; proxy failed'
else:
time_or_error = 'Err: Page loaded; proxy failed'
else:
time_or_error = 'Err: Page loaded; proxy failed'
return time_or_error
# This is the SSL page
if 'astrill' in url:
soup = BeautifulSoup(html)
ip = soup.find("td", { "colspan": 2 }).text # the ip is the only on with colspan = 2
match = re.match(ipre, ip)
if match:
if self.externalip in ip:
time_or_error = 'Err: Page loaded; proxy failed'
else:
time_or_error = 'Err: Page loaded; proxy failed'
return time_or_error
if '/headers' in url:
# check for proxy headers
proxy_headers = ['via: ', 'forwarded: ', 'x-forwarded-for', 'client-ip']
if leng > 15: # 15 is arbitrary, I just don't think you'll ever see more than 15 headers
time_or_error = 'Err: headers not returned'
return time_or_error
for l in html_lines:
for h in proxy_headers:
if h in l.lower():
time_or_error = 'Err: Proxy headers found'
return time_or_error
time_or_error = 'Passed: elite proxy'
return time_or_error
def print_handler(self, results, proxyip):
if self.show_all:
country_code = self.get_country_code(proxyip)
self.printer(results, country_code)
self.print_counter += 1
else:
passed_all = self.passed_all_tests(results)
if passed_all:
country_code = self.get_country_code(proxyip)
self.printer(results, country_code)
self.print_counter += 1
if self.show_num:
self.limiter()
def printer(self, results, country_code):
''' Creates the output '''
counter = 0
if not self.quiet:
print '--------------------------------------------------------------------'
for r in results:
counter += 1
time_or_error = r[0]
proxy = r[1]
url = r[2]
if self.quiet:
if counter % 4 == 0: #################### THIS results is a list of 4 tuples each, so proxies will repeat 4 times
#print proxy
global listz
listz.append(proxy)
else:
# Only print the proxy once, on the second print job
if counter == 1:
print '%s | %s | %s | %s' % (proxy.ljust(21), country_code.ljust(3), url.ljust(21), time_or_error)
else:
print '%s | %s | %s | %s' % (' '.ljust(21), ' ', url.ljust(21), time_or_error)
def get_country_code(self, proxyip):
''' Get the 3 letter country code of the proxy using geoiptool.com
Would use the geoip library, but it requires a local DB and what
is the point of that hassle other than marginal speed improvement '''
cc_line_found = False
cc = 'N/A'
try:
r = requests.get('http://www.geoiptool.com/en/?IP=%s' % proxyip, headers=self.headers)
html = r.text
html_lines = html.splitlines()
for l in html_lines:
if cc_line_found == True:
cc = l.split('(', 1)[1].split(')', 1)[0]
break
if 'country code:' in l.lower():
cc_line_found = True
except:
pass
return cc
def error_handler(self, e):
if 'Cannot connect' in e:
time_or_error = 'Err: Cannot connect to proxy'
elif 'timed out' in e.lower():
time_or_error = 'Err: Timed out'
elif 'retries exceeded' in e:
time_or_error = 'Err: Max retries exceeded'
elif 'Connection reset by peer' in e:
time_or_error = 'Err: Connection reset by peer'
elif 'readline() takes exactly 1 argument (2 given)' in e:
time_or_error = 'Err: SSL error'
else:
time_or_error = 'Err: ' + e
return time_or_error
def url_shortener(self, url):
if 'ip.php' in url:
url = 'danmcinerney.org'
elif 'headers.php' in url:
url = 'Header check'
elif 'dnsdynamic' in url:
url = 'dnsdynamic.org'
elif 'astrill' in url:
url = 'https://astrill.com'
return url
def passed_all_tests(self, results):
for r in results:
time_or_error= r[0]
if 'Err:' in time_or_error:
global testx
testx = 50
return False
return True
def limiter(self):
testx = 0
''' Kill the script if user supplied limit of successful proxy attempts (-s argument) is reached '''
if self.print_counter >= int(self.show_num):
sys.exit()

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

How do I implement a web crawler that scrapes ad links? - python

Related

requests_html stop website from redirecting

Trying to scrape data off of a website using Python and Chromedriver, but it's returning a nonetype error for "find"

Using proxy middleware to scrape Amazon

How to set a time out in web crawler?

Python KeyError exception / exiting code without error

Categories

Resources